In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest, f_classif

from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')
from IPython.display import clear_output

In [2]:
# load data
df = pd.read_csv('output/data_cleaned_balanced.csv')

# split data into features and target
X = df.drop(columns=['seo class'])
y = df['seo class']

In [3]:
# dictionary of evaluation metrics
metrics = {'accuracy': 'accuracy',
           'precision': 'precision_macro', 
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

In [4]:
# create stratified split for cross validation
sss = StratifiedShuffleSplit(n_splits=5, test_size=.66, random_state=22)

In [5]:
# list of classifiers to compare
classifiers = {'AdaBoost': AdaBoostClassifier(),
               'BernoulliNB': BernoulliNB(),
               'DecisionTree': DecisionTreeClassifier(),
               'ExtraTrees': ExtraTreesClassifier(),
               'GaussianNB': GaussianNB(),
               'GradientBoosting': GradientBoostingClassifier(),
               'KNeighbors': KNeighborsClassifier(),
               'LinearSVC': LinearSVC(),
               'RadiusNeighbors': RadiusNeighborsClassifier(),
               'RandomForest': RandomForestClassifier(),
               'SGD': SGDClassifier(),
               'SVC': SVC()}

In [6]:
params = {'max_iter': 100,
          'max_depth' : 10,
          'penalty': 'l2',
          'n_neighbors': 4,
          'outlier_label': 5}

In [7]:
feature_counts = [6, 25, 29, 42, 49]

In [8]:
cv_results = {}

for i in feature_counts:
    for name, clf in classifiers.items():
        # display current classifier
        # to show progress while code is running
        clear_output()
        print('Current classifier: %s %d' % (name, i))

        # get parameter options for current classifier
        clf_params = clf.get_params()

        # select matching parameters for current classifier from params
        c_params = {}
        for p in params.keys():
            if p in clf_params.keys():
                c_params[p] = params[p]

        # set parameters
        if c_params:
            clf.set_params(**c_params)

        # cross validate classifier
        cv = cross_validate(clf, X, y, scoring=metrics, cv=sss)
        
        # save results of cross validation
        name_i = name + '_' + str(i)
        cv_results[name_i] = cv

Current classifier: SVC 49


In [9]:
data = []
for name, results in cv_results.items():
    row = name.split('_')
    for k, v in results.items():
        # add mean and standard deviation to data
        row.append(v.mean())
        row.append(v.std())
    data.append(row)

# column names for dataframe
columns = ['classifier', 'feature count']
for k in cv.keys():
    k = k.replace('test_', '')
    columns.append(k+'_mean')
    columns.append(k+'_std')

In [10]:
results = pd.DataFrame(data, columns=columns)
results.to_csv('output/benchmarking_results_3.csv')

In [11]:
# sorted by f1 mean
results.sort_values(by=['f1_mean'], ascending=False)

Unnamed: 0,classifier,feature count,fit_time_mean,fit_time_std,score_time_mean,score_time_std,accuracy_mean,accuracy_std,precision_mean,precision_std,recall_mean,recall_std,f1_mean,f1_std
50,DecisionTree,49,1.575643,0.029498,0.710227,0.010431,0.999874,1.1e-05,0.999877,1.1e-05,0.999871,1.2e-05,0.999874,1.1e-05
26,DecisionTree,29,1.545944,0.024523,0.71178,0.017926,0.999874,1.2e-05,0.999877,1.1e-05,0.99987,1.2e-05,0.999874,1.2e-05
2,DecisionTree,6,1.638292,0.078691,0.758403,0.041126,0.999873,1.1e-05,0.999876,1e-05,0.999869,1.1e-05,0.999872,1.1e-05
14,DecisionTree,25,1.579375,0.029146,0.714989,0.007936,0.999871,1.2e-05,0.999874,1.2e-05,0.999867,1.3e-05,0.999871,1.2e-05
38,DecisionTree,42,1.553759,0.022777,0.701578,0.008623,0.999871,1.2e-05,0.999874,1.2e-05,0.999867,1.3e-05,0.999871,1.2e-05
5,GradientBoosting,6,852.873263,40.220856,9.480216,0.243136,0.999846,1.2e-05,0.999848,1.3e-05,0.999844,1.1e-05,0.999846,1.2e-05
41,GradientBoosting,42,781.69522,36.340715,8.83822,0.39445,0.999844,1.7e-05,0.999845,1.8e-05,0.999842,1.7e-05,0.999844,1.7e-05
17,GradientBoosting,25,784.908568,39.639501,8.870903,0.283555,0.999842,1.5e-05,0.999844,1.6e-05,0.99984,1.5e-05,0.999842,1.5e-05
29,GradientBoosting,29,777.705243,38.363344,8.80027,0.303265,0.999841,1.6e-05,0.999843,1.7e-05,0.99984,1.5e-05,0.999841,1.6e-05
53,GradientBoosting,49,794.155102,31.288369,8.940997,0.197145,0.999841,1.9e-05,0.999843,2e-05,0.999839,1.9e-05,0.999841,2e-05


In [12]:
# get standard dev columns to remove from df display
std_c = [c for c in results.columns if '_std' in c]

# set filters to narrow down results
# Filters: F1 > 75% and Accuracy > 95%
filter_ = (results['f1_mean'] > 0.75) & (results['accuracy_mean'] > 0.95)

# filter results by f1 > 75% and accuracy > 95%, sort by fit time
results[filter_].sort_values(by=['fit_time_mean']).drop(columns=std_c)

Unnamed: 0,classifier,feature count,fit_time_mean,score_time_mean,accuracy_mean,precision_mean,recall_mean,f1_mean
25,BernoulliNB,29,0.718342,1.022885,0.959448,0.961539,0.95839,0.957928
37,BernoulliNB,42,0.722567,1.03515,0.959448,0.961539,0.95839,0.957928
49,BernoulliNB,49,0.727766,1.029764,0.959448,0.961539,0.95839,0.957928
13,BernoulliNB,25,0.727831,1.045464,0.959448,0.961539,0.95839,0.957928
40,GaussianNB,42,0.743666,1.728213,0.989987,0.989883,0.989842,0.989816
28,GaussianNB,29,0.745624,1.728758,0.989987,0.989883,0.989842,0.989816
52,GaussianNB,49,0.747855,1.740926,0.989987,0.989883,0.989842,0.989816
16,GaussianNB,25,0.753277,1.751793,0.989987,0.989883,0.989842,0.989816
4,GaussianNB,6,0.811275,1.949909,0.989987,0.989883,0.989842,0.989816
1,BernoulliNB,6,0.863923,1.182805,0.959448,0.961539,0.95839,0.957928
