In [1]:
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn.
warnings.filterwarnings('ignore')

# Team data set, read from a path relative to the working directory.
# NOTE(review): an explicit dtype= mapping (e.g. for 'id' and 'NAME') was
# sketched here but never applied, so column dtypes are inferred by pandas.
df = pd.read_csv('Airbnb-CLEANED.csv')

In [3]:
# Keep only the columns used for modelling and discard listings that have no
# review rating. dropna() removes them in a single pass; dropping rows one at a
# time inside an iterrows() loop rebuilt the whole frame for every missing value.
df_subset = df[
    ['number of reviews', 'review rate number', 'price', 'Construction year']
].dropna(subset=['review rate number'])

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Rescale 'Construction year' with a MinMaxScaler (default settings) and store
# the result as a new column next to the raw value.
min_max_scaler = MinMaxScaler()

# scikit-learn expects a 2-D input, so view the column as a single-feature matrix.
year_matrix = df_subset['Construction year'].values.reshape(-1, 1)

# Fit on the full column, then transform that same column.
construction_year_scaler = min_max_scaler.fit(year_matrix)
construction_year_scaled = construction_year_scaler.transform(year_matrix)

df_subset["construction year scaled"] = construction_year_scaled

### Binning

In [5]:
# Divide into tertiles.
# `tertile` holds the 33rd / 66th percentile prices computed from the data; it
# is not read again in this cell.
tertile = df_subset['price'].quantile([.33, .66])
# Hard-coded cut points that get_tertile() actually compares against.
# NOTE(review): presumably rounded from `tertile` above — confirm they still
# match the data whenever the CSV changes.
tertiles = [432, 808]

def get_tertile(price, cutoffs=None):
    """Return the 1-based price bin that ``price`` falls into.

    Args:
        price: Listing price to classify.
        cutoffs: Ascending, exclusive upper bounds of every bin except the
            last one. When omitted, the notebook-level ``tertiles`` list is
            used, which reproduces the original three-way split.

    Returns:
        int: ``1`` when ``price`` is below the first cut-off, ``2`` when it is
        below the second, and so on; ``len(cutoffs) + 1`` when it is not below
        any of them. A NaN price fails every comparison and therefore lands in
        the last bin.
    """
    if cutoffs is None:
        cutoffs = tertiles
    # Walk the bounds in order; the first one the price stays under names its bin.
    for bin_number, upper_bound in enumerate(cutoffs, start=1):
        if price < upper_bound:
            return bin_number
    return len(cutoffs) + 1

# Label every listing with its price bin. Series.map applies get_tertile to
# each price in row order, replacing a much slower iterrows() loop.
tertiles_col = df_subset['price'].map(get_tertile).tolist()

df_subset['price_tertile'] = tertiles_col

In [6]:
# `quintile` holds the 20/40/60/80th percentile review counts computed from the
# data; it is not read again in this cell.
quintile = df_subset['number of reviews'].quantile([.2, .4, .6, .8])
# Hard-coded cut points that get_quintile() actually compares against.
# NOTE(review): presumably copied from `quintile` above — confirm they still
# match the data whenever the CSV changes.
quintiles = [1, 4, 13, 41]
def get_quintile(price, cutoffs=None):
    """Return the 1-based review-count bin that ``price`` falls into.

    Args:
        price: The value to classify. Despite the name, the notebook passes a
            listing's 'number of reviews' here; the parameter name is kept so
            existing calls continue to work.
        cutoffs: Ascending, inclusive upper bounds of every bin except the
            last one. When omitted, the notebook-level ``quintiles`` list is
            used, which reproduces the original five-way split.

    Returns:
        int: ``1`` when the value is at or below the first cut-off, ``2`` when
        it is at or below the second, and so on; ``len(cutoffs) + 1`` when it
        exceeds all of them. A NaN value fails every comparison and therefore
        lands in the last bin.
    """
    if cutoffs is None:
        cutoffs = quintiles
    # Walk the bounds in order; the first one the value does not exceed names its bin.
    for bin_number, upper_bound in enumerate(cutoffs, start=1):
        if price <= upper_bound:
            return bin_number
    return len(cutoffs) + 1
    
# Label every listing with its review-count bin. Series.map applies
# get_quintile to each count in row order, replacing a much slower iterrows() loop.
quintiles_col = df_subset['number of reviews'].map(get_quintile).tolist()

df_subset['reviews_quintile'] = quintiles_col

### Decision tree

In [7]:
from sklearn import tree
from sklearn.model_selection import train_test_split

# Features: binned review count, raw review rating and scaled construction year.
# Target: the price bin assigned earlier.
feature_columns = ['reviews_quintile', 'review rate number', 'construction year scaled']
decision_tree_x = df_subset[feature_columns].values
decision_tree_y = df_subset['price_tertile'].values

# Hold out 20% of the rows for evaluation; the split itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    decision_tree_x,
    decision_tree_y,
    test_size=0.2,
    random_state=0,
)

# Baseline tree with default hyper-parameters (no random_state is set on the tree).
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)

held_out_accuracy = clf.score(X_test, y_test)
print("Accuracy:", held_out_accuracy)

Accuracy: 0.35226928131001717


#### Attempted tuning.

In [11]:
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Exhaustive search over tree hyper-parameters:
# 2 criteria * 9 depths * 9 split sizes * 18 leaf sizes = 2916 candidates,
# each scored with 5-fold cross-validation.
tree_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': list(range(2, 20, 2)),
    'min_samples_split': list(range(2, 20, 2)),
    'min_samples_leaf': list(range(2, 20)),
}
# NOTE: `clf` is rebound from the baseline tree to the fitted search object.
clf = GridSearchCV(
    tree.DecisionTreeClassifier(),
    param_grid=tree_params,
    cv=5,
    n_jobs=10,
    scoring='f1_micro',
)
clf = clf.fit(X_train, y_train)

# Keep the results under their own name. Binding them to `df` replaced the
# Airbnb data set loaded at the top of the notebook, so re-running any earlier
# cell that reads `df` failed.
cv_results_df = pd.DataFrame(clf.cv_results_)

print("Results:", cv_results_df)

Results:       mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0          0.051133      0.001436         0.009811        0.000698   
1          0.051619      0.001138         0.010306        0.000406   
2          0.046007      0.001924         0.010542        0.000654   
3          0.042759      0.001241         0.009138        0.001128   
4          0.038657      0.007406         0.009766        0.001486   
...             ...           ...              ...             ...   
2911       0.141166      0.009385         0.019916        0.003539   
2912       0.142994      0.010928         0.015612        0.001053   
2913       0.121852      0.004598         0.015396        0.001182   
2914       0.119337      0.004920         0.014599        0.001633   
2915       0.103215      0.009092         0.012810        0.001549   

     param_criterion param_max_depth param_min_samples_leaf  \
0               gini               2                      2   
1               gini    

### Voting classifiers

In [17]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

# Number of voters in each ensemble and the tree settings shared by every
# member of the first one. Declared once instead of being repeated per voter.
n_voters = 11
voting_tree_params = {
    'criterion': 'entropy',
    'max_depth': 12,
    'min_samples_leaf': 3,
    'min_samples_split': 64,
    'max_features': 'sqrt',
}

# Ensemble 1: hard (majority) vote over identically configured decision trees.
tree_voters = [
    (f'decision tree {i}', tree.DecisionTreeClassifier(**voting_tree_params))
    for i in range(1, n_voters + 1)
]
eclf1 = VotingClassifier(estimators=tree_voters, voting='hard')

# Ensemble 2: hard (majority) vote over default random forests.
forest_voters = [
    (f'random forest {i}', RandomForestClassifier())
    for i in range(1, n_voters + 1)
]
eclf2 = VotingClassifier(estimators=forest_voters, voting='hard')

eclf1.fit(X_train, y_train)
print("Voting Classifier 1 results:", eclf1.score(X_test, y_test))

eclf2.fit(X_train, y_train)
print("\nVoting Classifier 2 results:", eclf2.score(X_test, y_test))

Voting Classifier 1 results: 0.35060143535833416

Voting Classifier 2 results: 0.3521176589507733


### Bagging classifier

In [30]:
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

# The base learner is passed positionally on purpose: its keyword was
# `base_estimator` in older scikit-learn, was renamed to `estimator` in 1.2, and
# the old spelling was removed in 1.4. The first positional slot works on both.
clf1 = tree.DecisionTreeClassifier()
eclf1 = BaggingClassifier(clf1)

# Candidate fractions 0.1 .. 0.9 for the bagger's own `max_features`.
param_combos = {'max_features': np.arange(0.1, 1.0, 0.1)}

# NOTE: `clf1` is rebound from the plain tree to the grid-search object. Its
# results are not read in this cell; inspect clf1.best_params_ to see them.
clf1 = GridSearchCV(eclf1, param_grid=param_combos, cv=5, n_jobs=10, scoring='f1_micro')
clf1.fit(X_train, y_train)

# NOTE(review): max_features=0.6 is set on the tree, whereas the search above
# varied the bagger's max_features — confirm which one was intended.
clf2 = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=12,
    min_samples_leaf=3,
    min_samples_split=64,
    max_features=0.6,
)
eclf2 = BaggingClassifier(clf2)
eclf2.fit(X_train, y_train)

print("\nBaggingClassifier accuracy:", eclf2.score(X_test, y_test))


BaggingClassifier accuracy: 0.354240371980188


### Packaged ensemble classifiers

In [26]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

# Off-the-shelf ensemble classifiers, all with default settings, keyed by a
# short label used in the printed report.
models = {
    'GBC': GradientBoostingClassifier(),
    'ABC': AdaBoostClassifier(),
    'RFC': RandomForestClassifier(),
    'ETC': ExtraTreesClassifier(),
    'HGBC': HistGradientBoostingClassifier(),
}

# Fit each model on the training split and report its held-out accuracy.
for label, model in models.items():
    model.fit(X_train, y_train)
    print(label, "Accuracy:", model.score(X_test, y_test))

GBC Accuracy: 0.3481754776104316
ABC Accuracy: 0.3411503082987971
RFC Accuracy: 0.35333063782472457
ETC Accuracy: 0.35226928131001717
HGBC Accuracy: 0.3534317193975538
