# Explore the Data

In [1]:
# Imports

import pandas as pd

import numpy as np
import seaborn as sns
import os, time, pickle, re, json, requests

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# Helper function
def baseline_modeling(data, classifier):
    """A simple modeling of data using sklearn's classifier
    
    Arguments
    ---------
    data: Pandas Dataframe
        Dataframe containing observations and features (includes target)
    classifier: classifier object (Tree or Ensembles, as they have feature importance data)
        Classifier from sklearn librar
        
    Returns
    -------
    classifier: classifier object
        Classifier fit with the data
    feature_importance: Pandas Dataframe
        Lists feature importance of the model in decimals, listed in a pandas dataframe
    """
    
    y = data['fivestars']
    X = pd.get_dummies(data.drop('fivestars',axis=1))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_train = scale(X_train)
    X_test = scale(X_test)
    # X_train, X_test, y_train, y_test = train_test_split(scaled_data, y, test_size=0.2, random_state=42)
    
    classifier.fit(X_train, y_train)
    print(f'F1 score for training set: %f' % f1_score(classifier.predict(X_train),y_train))
    print(f'F1 score for test set: %f' % f1_score(classifier.predict(X_test), y_test))
    
    feature_importances = pd.DataFrame(classifier.feature_importances_,index = X.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
    
    return classifier, feature_importances

## Import and Fit for a Baseline Metric

I used the Gradient Boosting Classifier to establish a baseline. I want to use F1 score because business-wise, it's not feasible to favor precision over recall, or vice versa:

1. Low Recall/High Precision - Most of my 5-star predictions are indeed 5-star listings. However, I miss-predict a lot of other listings that are actually 5-star listings. This misclassification can cause the hosts to spend more money in order to take their listings to a 5-star level, when they were already hosting a 5-star listing.

2. High Recall/Low Precision - I am able to classify most of the actual 5-star listings as such. However, my predictions also include a lot of listings that aren't actually 5-stars. This might mean the listing may not actually achieve the 5-star ratings needed in order to become SuperHost, costing the host future extra earnings. 

In [3]:
data = pickle.load(open("../data/processed/data_clean_v4_all_everything.pickle","rb"))

Call the helper function to get a baseline F1 Score using Gradient Boosting Classifier:

### ----------- Checkpoint 1: Baseline ------------

In [4]:
gb = GradientBoostingClassifier()
gb, feature_importances = baseline_modeling(data, gb)



F1 score for training set: 0.770574
F1 score for test set: 0.672522


## Feature Engineering 1 - Amenities and others

We can look at some feature importance and determine what we can do.

In [5]:
feature_importances.head(10)

Unnamed: 0,importance
host_total_listings_count,0.240761
number_of_reviews,0.19531
price,0.05134
host_since,0.041943
reviews_per_month,0.031982
First aid kit,0.027106
Coffee maker,0.025717
number_of_reviews_ltm,0.025459
calendar_updated,0.023707
host_response_rate,0.020579


### Engineering Features: Price per room

I want to introduce some interaction terms that could prove helpful:

- Price/bedrooms = How much you are paying per bedroom/bathroom combo (assume 1 if the listing has no bedroom/bathroom, as a place should be considered as at least 1 room)

In [6]:
data["bed_bath"] = data.bedrooms + data.bathrooms

In [7]:
data['price_bed'] = np.where(data.bed_bath > 0, data.price/data.bed_bath, data.price)

In [8]:
data.drop(columns=["bed_bath"],inplace=True)

### ----------- Checkpoint 2: Engr 1 ------------

Now we can check if the engineered features helped or caused overfitting by running the Gradient Boosting Classifier again.

In [9]:
gb = GradientBoostingClassifier(learning_rate=0.2, n_estimators=150)
gb, feature_importances = baseline_modeling(data, gb)



F1 score for training set: 0.862703
F1 score for test set: 0.672578


This feature helped our score go up, so we will keep the feature in.

### Engineering Features 2: Minimum Cost

I want to introduce another interaction term that could prove helpful:

- Minimum cost = Price * Minimum Nights + cleaning fee = How much minimum you are expected to pay

In [10]:
data["minimum_cost"] = data.price * data.minimum_nights + data.cleaning_fee

In [11]:
# data.drop(columns=["price_bed"], inplace=True)

Again, check for overfitting:

### ----------- Checkpoint 3: Engr 2 ------------

In [12]:
gb = GradientBoostingClassifier(learning_rate=0.2, n_estimators=150)
gb, feature_importances = baseline_modeling(data, gb)



F1 score for training set: 0.856685
F1 score for test set: 0.678009


Again, the test shows overfitting.

In [13]:
feature_importances.head(10)

Unnamed: 0,importance
host_total_listings_count,0.173212
number_of_reviews,0.152072
host_since,0.054584
minimum_cost,0.046523
price_bed,0.042071
reviews_per_month,0.037398
bikescore,0.028945
number_of_reviews_ltm,0.028217
calendar_updated,0.02768
availability_90,0.027347


In [14]:
data.drop(columns=["minimum_cost"], inplace=True)

### Engineering Feature 3: Walkscore per Price

In [15]:
data["walk_price"] = data.walkscore * data.price

In [16]:
gb = GradientBoostingClassifier(learning_rate=0.2, n_estimators=150)
gb, feature_importances = baseline_modeling(data, gb)



F1 score for training set: 0.860645
F1 score for test set: 0.661597


In [17]:
data.drop(columns=["walk_price"],inplace=True)

In [18]:
data["calendar_availability"] = data.calendar_updated*data.availability_90

In [19]:
gb = GradientBoostingClassifier(learning_rate=0.2, n_estimators=150)
gb, feature_importances = baseline_modeling(data, gb)



F1 score for training set: 0.860963
F1 score for test set: 0.682411


In [20]:
pickle.dump(data,open("../data/processed/data_EDA_v4_all_everything.pickle",'wb'))