In [1]:
import warnings

import numpy as np
import pandas as pd
from IPython.display import clear_output

from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import make_pipeline

from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned, class-balanced data set produced earlier in the project.
df = pd.read_csv('output/data_cleaned_balanced.csv')

# The 'seo class' column is the prediction target; every other column is a feature.
y = df['seo class']
X = df.drop('seo class', axis=1)

In [3]:
# Scorers passed to cross_validate: short label -> scikit-learn scoring name.
# The insertion order is kept, so it also determines the column order of the
# results table built later in the notebook.
metrics = dict(
    accuracy='accuracy',
    precision='precision_macro',
    recall='recall_macro',
    f1='f1_macro',
)

In [4]:
# Cross-validation splitter: 5 stratified random splits with a fixed seed.
# NOTE(review): test_size=0.66 holds out 66% of the samples in every split,
# i.e. each model is trained on the remaining 34% - confirm this is intended.
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.66,
    random_state=22,
)

In [5]:
# Classifiers to compare, keyed by the display name used in the results table.
#
# Every estimator that exposes a `random_state` parameter and draws random
# numbers during fitting is seeded, so that re-running the notebook reproduces
# the same scores. The seed matches the one used by the cross-validation
# splitter (22).
RANDOM_STATE = 22

classifiers = {'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE),
               'BernoulliNB': BernoulliNB(),
               'DecisionTree': DecisionTreeClassifier(random_state=RANDOM_STATE),
               'ExtraTrees': ExtraTreesClassifier(random_state=RANDOM_STATE),
               'GaussianNB': GaussianNB(),
               'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
               'KNeighbors': KNeighborsClassifier(),
               'LinearSVC': LinearSVC(random_state=RANDOM_STATE),
               'RadiusNeighbors': RadiusNeighborsClassifier(),
               'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
               'SGD': SGDClassifier(random_state=RANDOM_STATE),
               'SVC': SVC()}

In [6]:
# Shared hyper-parameter overrides. In the benchmarking loop each entry is
# applied to every classifier that exposes a parameter of the same name and
# is ignored by the others.
params = dict(
    max_iter=100,
    max_depth=10,
    penalty='l2',
    n_neighbors=4,
    outlier_label=5,
)

In [7]:
# Values of k tried for SelectKBest (number of top-scoring features to keep).
feature_counts = [
    6,
    25,
    29,
    42,
    49,
]

In [None]:
# Benchmark every classifier for every feature count.
#
# cv_results maps '<classifier name>_<feature count>' to the dictionary
# returned by cross_validate (fit_time, score_time and one test_<metric>
# array per entry in `metrics`).
cv_results = {}

for i in feature_counts:
    for name, clf in classifiers.items():
        # display current classifier
        # to show progress while code is running
        clear_output()
        print('Current classifier: %s %d' % (name, i))

        # keep only the shared overrides that this classifier actually accepts
        clf_params = clf.get_params()
        c_params = {p: v for p, v in params.items() if p in clf_params}

        # set parameters
        if c_params:
            clf.set_params(**c_params)

        # Feature selection is part of the model: wrapping it in a pipeline
        # makes cross_validate refit SelectKBest on the training portion of
        # every split. Fitting the selector on the full data set beforehand
        # would let the held-out labels influence which features are kept and
        # inflate the scores. As a consequence fit_time now includes the
        # selection step.
        model = make_pipeline(SelectKBest(f_classif, k=i), clf)

        # cross validate selector + classifier
        cv = cross_validate(model, X, y, scoring=metrics, cv=sss)

        # save results of cross validation
        name_i = name + '_' + str(i)
        cv_results[name_i] = cv

Current classifier: KNeighbors 6


In [None]:
# Flatten cv_results into one row per (classifier, feature count) run:
# [classifier, feature count, <key>_mean, <key>_std, ...] for every array
# returned by cross_validate.
data = []
for name, run_scores in cv_results.items():
    # Keys look like '<classifier>_<feature count>'. Split on the LAST
    # underscore only, so a classifier name containing '_' still yields
    # exactly two fields, and store the count as a number rather than text.
    clf_name, n_features = name.rsplit('_', 1)
    row = [clf_name, int(n_features)]
    for scores in run_scores.values():
        # add mean and standard deviation to data
        row.append(scores.mean())
        row.append(scores.std())
    data.append(row)

# column names for dataframe
# Taken from the stored results themselves instead of a variable left over
# from the benchmarking loop, so this cell depends only on cv_results.
columns = ['classifier', 'feature count']
first_run = next(iter(cv_results.values()), {})
for key in first_run:
    key = key.replace('test_', '')
    columns.append(key + '_mean')
    columns.append(key + '_std')

In [None]:
# Build the summary table (one row per benchmark run) and save it to disk.
results = pd.DataFrame(data=data, columns=columns)
results.to_csv(path_or_buf='output/benchmarking_results_3-1.csv')

In [None]:
# Show all runs, best mean macro-F1 first.
results.sort_values('f1_mean', ascending=False)

In [None]:
# Standard-deviation columns are hidden from the displayed table.
std_c = [col for col in results.columns if '_std' in col]

# Shortlist criteria: mean F1 above 75% and mean accuracy above 95%.
filter_ = results['f1_mean'].gt(0.75) & results['accuracy_mean'].gt(0.95)

# Show the shortlisted runs, fastest mean fit time first.
results.loc[filter_].drop(columns=std_c).sort_values(by='fit_time_mean')