In [1]:
import sys
sys.path.append('/Users/tompease/Documents/Coding/airbnb')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
import utils.data_loader as utils
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

# Fetch the feature matrix and the 'Category' target through the project's
# loader, requesting the normalized variant of the data.
loader = utils.AirbnbLoader()
X, y = loader.load_airbnb('Category', normalized=True)

# Convert the target labels to integer codes. The fitted encoder is kept in
# scope so the codes can be mapped back to the original labels afterwards.
encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

# Five random 70/30 train/test shuffles, seeded so the splits are repeatable.
cv_split = ShuffleSplit(
  n_splits=5,
  train_size=.7,
  test_size=.3,
  random_state=42,
)

The code below evaluates a number of popular machine learning classifiers using cross-validation and returns a table ordered from best to worst by mean test accuracy.

In [2]:
# Seed for every estimator that has a stochastic component, so the comparison
# table is reproducible after a kernel restart (same value as the CV splitter).
SEED = 42

# Candidate classifiers, all with default hyperparameters apart from the seed
# and a larger iteration budget for logistic regression.
classification_algs = [
  LogisticRegression(max_iter=1000),
  RandomForestClassifier(random_state=SEED),
  GradientBoostingClassifier(random_state=SEED),
  SVC(),
  GaussianNB(),
  MultinomialNB(),
  SGDClassifier(random_state=SEED),
  KNeighborsClassifier(),
  DecisionTreeClassifier(random_state=SEED),
  XGBClassifier(random_state=SEED)
]

columns = ['MLA name', 'Parameters', 'Train accuracy', 'Test accuracy', 'Train f1 weighted', 'Test f1 weighted']

# Collect one record per algorithm and build the DataFrame once at the end.
# Filling an empty frame cell by cell with .loc leaves the score columns as
# object dtype; constructing from records keeps them as floats.
records = []

for alg in classification_algs:
  # cross_validate fits clones of `alg` on every split of cv_split and returns
  # per-split train and test scores for each requested metric.
  cv_results = cross_validate(alg, X, y, scoring=['f1_weighted', 'accuracy'], cv=cv_split, return_train_score=True)

  # Store the mean of each metric across the splits.
  records.append({
    'MLA name': alg.__class__.__name__,
    'Parameters': str(alg.get_params()),
    'Train accuracy': cv_results['train_accuracy'].mean(),
    'Test accuracy': cv_results['test_accuracy'].mean(),
    'Train f1 weighted': cv_results['train_f1_weighted'].mean(),
    'Test f1 weighted': cv_results['test_f1_weighted'].mean(),
  })

# Rank by held-out accuracy, best first. The index keeps each algorithm's
# position in classification_algs.
MLA_compare = pd.DataFrame(records, columns=columns).sort_values(by='Test accuracy', ascending=False)

MLA_compare

Unnamed: 0,MLA name,Parameters,Train accuracy,Test accuracy,Train f1 weighted,Test f1 weighted
2,GradientBoostingClassifier,"{'ccp_alpha': 0.0, 'criterion': 'friedman_mse'...",0.866093,0.359839,0.867292,0.356218
1,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.997246,0.359036,0.997246,0.354229
0,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.404819,0.351004,0.375499,0.324425
9,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.997246,0.340562,0.997246,0.337706
4,GaussianNB,"{'priors': None, 'var_smoothing': 1e-09}",0.345611,0.316466,0.289456,0.256276
8,DecisionTreeClassifier,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.997246,0.284337,0.997246,0.284598
5,MultinomialNB,"{'alpha': 1.0, 'class_prior': None, 'fit_prior...",0.330809,0.281124,0.268169,0.229931
7,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.51222,0.279518,0.500479,0.265075
3,SVC,"{'C': 1.0, 'break_ties': False, 'cache_size': ...",0.317728,0.274699,0.24863,0.214887
6,SGDClassifier,"{'alpha': 0.0001, 'average': False, 'class_wei...",0.273666,0.26988,0.182038,0.169124
