In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate, StratifiedShuffleSplit
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

import warnings
warnings.filterwarnings('ignore')

from IPython.display import clear_output

# Baseline comparison: cross-validate every candidate classifier on the
# balanced, scaled data set and store the mean/std of each score as CSV.

# load data
df = pd.read_csv('output/data_balanced_scaled.csv')

# split data into features and target
X = df.drop(columns=['seo class'])
y = df['seo class']

# list of classifiers to compare
classifiers = {'AdaBoost': AdaBoostClassifier(),
               'BernoulliNB': BernoulliNB(),
               'DecisionTree': DecisionTreeClassifier(),
               'ExtraTrees': ExtraTreesClassifier(),
               'GaussianNB': GaussianNB(),
               'GradientBoosting': GradientBoostingClassifier(),
               'KNeighbors': KNeighborsClassifier(),
               'LinearSVC': LinearSVC(),
               'RadiusNeighbors': RadiusNeighborsClassifier(),
               'RandomForest': RandomForestClassifier(),
               'SGD': SGDClassifier(),
               'SVC': SVC()}

# dictionary of evaluation metrics (result label -> scikit-learn scorer name)
metrics = {'accuracy': 'accuracy',
           'precision': 'precision_macro',
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

# set minimal parameters to make sure algorithms function; an entry is only
# applied to classifiers that expose a parameter with that name
# NOTE(review): 'max_iter' also matches SVC, so its solver is presumably
# capped at 100 iterations; warnings are silenced in this cell, so a failure
# to converge would go unnoticed - confirm this cap is intended
params = {'max_iter': 100,
          'max_depth': 10,
          'penalty': 'l2',
          'n_neighbors': 4,
          'outlier_label': 5,
          # fixed seed so randomised classifiers score the same on every run
          'random_state': 22}

# create stratified split for cross validation
sss = StratifiedShuffleSplit(n_splits=5, test_size=.66, random_state=22)

# empty dictionary to store results
cv_results = {}

# iterate over classifiers to compare results
for name, clf in classifiers.items():
    clear_output()
    print('Current classifier: %s' % (name))

    # keep only the shared parameters this classifier actually supports
    supported = clf.get_params()
    c_params = {p: value for p, value in params.items() if p in supported}

    # set parameters
    if c_params:
        clf.set_params(**c_params)

    # cross validate classifier and save the per-split scores
    cv_results[name] = cross_validate(clf, X, y, scoring=metrics, cv=sss)

# format data for dataframe: one record per classifier. Column labels are
# created together with their values, so they cannot drift apart and do not
# depend on a variable left over from the loop above.
data = []
for name, scores in cv_results.items():
    record = {'classifier': name}
    for key, values in scores.items():
        label = key.replace('test_', '')
        # add mean and standard deviation to data
        record[label + '_mean'] = values.mean()
        record[label + '_std'] = values.std()
    data.append(record)

# create data frame to display cv results
results = pd.DataFrame(data)
# save data frame as csv file
results.to_csv('output/04_model_comparison.csv')

In [None]:
# names of the standard-deviation columns, hidden to keep the table compact
std_c = [column for column in results if '_std' in column]

# show the remaining columns, best mean F1 first (no threshold filter is applied)
results.drop(columns=std_c).sort_values(by='f1_mean', ascending=False)

Compare DecisionTree and GaussianNB with feature reduction, then test whether the difference between them is statistically significant.

In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate, StratifiedShuffleSplit

from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

import warnings
warnings.filterwarnings('ignore')

from IPython.display import clear_output

import json

# Comparison with feature selection: for several values of k, cross-validate
# every classifier on the k best features (ANOVA F-test) of the balanced data
# set and store the mean/std of each score as CSV.

# imported here so this cell stays runnable on its own
from sklearn.pipeline import make_pipeline

# load data
df = pd.read_csv('output/data_balanced.csv')

# split data into features and target
X = df.drop(columns=['seo class'])
y = df['seo class']

# list of classifiers to compare
classifiers = {'AdaBoost': AdaBoostClassifier(),
               'BernoulliNB': BernoulliNB(),
               'DecisionTree': DecisionTreeClassifier(),
               'ExtraTrees': ExtraTreesClassifier(),
               'GaussianNB': GaussianNB(),
               'GradientBoosting': GradientBoostingClassifier(),
               'KNeighbors': KNeighborsClassifier(),
               'LinearSVC': LinearSVC(),
               'RandomForest': RandomForestClassifier(),
               'SGD': SGDClassifier(),
               'SVC': SVC()}

# dictionary of evaluation metrics (result label -> scikit-learn scorer name)
metrics = {'accuracy': 'accuracy',
           'precision': 'precision_macro',
           'recall': 'recall_macro',
           'f1': 'f1_macro'}

# set minimal parameters to make sure algorithms function; an entry is only
# applied to classifiers that expose a parameter with that name
# NOTE(review): 'outlier_label' matches none of the classifiers listed in this
# cell, so it currently has no effect here
params = {'max_iter': 100,
          'max_depth': 10,
          'penalty': 'l2',
          'n_neighbors': 4,
          'outlier_label': 5,
          # fixed seed so randomised classifiers score the same on every run
          'random_state': 22}

# create stratified split for cross validation
sss = StratifiedShuffleSplit(n_splits=5, test_size=.66, random_state=22)

# empty dictionary to store results
cv_results = {}

# iterate over k for selectKbest
for i in [6, 25, 29, 42, 49]:
    # iterate over classifiers to compare results
    for name, clf in classifiers.items():
        clear_output()
        print('Current classifier: %s_%d' % (name, i))

        # keep only the shared parameters this classifier actually supports
        supported = clf.get_params()
        c_params = {p: value for p, value in params.items() if p in supported}

        # set parameters
        if c_params:
            clf.set_params(**c_params)

        # Feature selection and scaling are part of the cross-validated
        # estimator, so both are fitted on the training part of each split
        # only. Fitting them on the full data set beforehand would let the
        # test rows (and their labels) influence which features are kept.
        model = make_pipeline(SelectKBest(f_classif, k=i), StandardScaler(), clf)

        # cross validate the pipeline and save the per-split scores
        cv_results[name + '_' + str(i)] = cross_validate(model, X, y, scoring=metrics, cv=sss)

# format data for dataframe: one record per classifier/k combination. Column
# labels are created together with their values, so they cannot drift apart
# and do not depend on a variable left over from the loop above.
data = []
for name, scores in cv_results.items():
    record = {'classifier': name}
    for key, values in scores.items():
        label = key.replace('test_', '')
        # add mean and standard deviation to data
        record[label + '_mean'] = values.mean()
        record[label + '_std'] = values.std()
    data.append(record)

# create data frame to display cv results
results = pd.DataFrame(data)
# save data frame as csv file
results.to_csv('output/05_model_comparison_features.csv')

Current classifier: SVC_49


In [19]:
from sklearn.model_selection import train_test_split

# reload the balanced and scaled data set
df = pd.read_csv('output/data_balanced_scaled.csv')

# separate the target column from the feature columns
y = df['seo class']
X = df.drop(columns=['seo class'])

# single hold-out split: 66% of the rows are reserved for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.66,
    random_state=22,
)

In [20]:
# candidate classifiers keyed by display name
# NOTE(review): nothing in the visible cells below reads this dictionary - confirm it is still needed
classifiers = dict([
    ('AdaBoost', AdaBoostClassifier()),
    ('BernoulliNB', BernoulliNB()),
    ('DecisionTree', DecisionTreeClassifier()),
    ('ExtraTrees', ExtraTreesClassifier()),
    ('GaussianNB', GaussianNB()),
    ('GradientBoosting', GradientBoostingClassifier()),
    ('KNeighbors', KNeighborsClassifier()),
    ('LinearSVC', LinearSVC()),
    ('RandomForest', RandomForestClassifier()),
    ('SGD', SGDClassifier()),
    ('SVC', SVC()),
])

# fit one AdaBoost model with default settings on the training split
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

array([0.  , 0.02, 0.  , 0.  , 0.  , 0.  , 0.02, 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.12, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.12, 0.  , 0.1 , 0.12, 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  , 0.  ])

In [25]:
# pair every feature name with the importance reported by the fitted classifier
feature_importance = list(map(list, zip(X.columns, clf.feature_importances_)))

# one row per feature
df_imp = pd.DataFrame(feature_importance, columns=['Feature', 'Importance'])
df_imp

Unnamed: 0,Feature,Importance
0,position,0.0
1,speed,0.02
2,check canonical,0.0
3,check description,0.0
4,check external links,0.0
5,check h1,0.0
6,check https,0.02
7,check internal links,0.0
8,check kw_count,0.0
9,check kw_density,0.0


In [29]:
# TODO: check for unbalanced set
# TODO: check when rule-based features are removed
# TODO: check when external features are removed

# features with a non-zero importance, most important first
df_imp[df_imp['Importance'] > 0].sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
38,source not optimized,0.5
24,check og,0.12
34,source ads,0.12
37,source news,0.12
36,source known,0.1
1,speed,0.02
6,check https,0.02


In [31]:
from sklearn.inspection import permutation_importance

# permutation importance of the fitted classifier, measured on the training split
perm_importance = permutation_importance(
    clf,
    X_train,
    y_train,
    n_repeats=3,
    random_state=0,
)

# pair every feature name with its mean importance over the repeats
perm_imp = list(map(list, zip(X.columns, perm_importance.importances_mean)))
df_pimp = pd.DataFrame(perm_imp, columns=['Feature', 'Importance'])

# keep features with a positive mean importance, most important first
df_pimp.loc[df_pimp['Importance'] > 0].sort_values(by='Importance', ascending=False)

Unnamed: 0,Feature,Importance
38,source not optimized,0.375835
37,source news,0.211523
24,check og,0.189516
6,check https,0.18191
1,speed,0.115092
34,source ads,0.0531
36,source known,0.017898


In [32]:
# per-class probabilities of the fitted classifier for every test row
# (despite its name, y_pred holds probabilities rather than predicted labels)
y_pred = clf.predict_proba(X_test)

In [38]:
# predict_proba orders its columns like clf.classes_, so translate each true
# label into its column position instead of using the label itself as an
# index (the two only coincide when the labels are exactly 0..n_classes-1)
class_column = {label: column for column, label in enumerate(clf.classes_)}

# probability the classifier assigned to the true class of every test row
class_prediction = [prob[class_column[class_]] for prob, class_ in zip(y_pred, y_test)]
class_prediction = np.array(class_prediction)

# average probability given to the correct class
class_prediction.mean()

0.7491852489388195

In [56]:
# the two data set variants to compare
regular_df = pd.read_csv('output/data_cleaned.csv')
optimized_df = pd.read_csv('output/data_balanced_scaled.csv')

# data sets keyed by display label
data = dict([
    ('regular data', regular_df),
    ('optimized data', optimized_df),
])

# columns treated as "external": names starting with one of these prefixes
external_prefixes = ('micros', 'source', 'tools')
external_columns = [c for c in regular_df.columns if c.startswith(external_prefixes)]

# feature subsets keyed by display label; each value lists the columns to leave out
# NOTE(review): 'a' looks like a placeholder for the rule columns - confirm the real list
features = {
    'all': [],
    'no rule columns': ['a'],
    'no external columns': external_columns,
}

In [57]:
# walk through every (data set, feature subset) combination; for now this only
# prints the columns that would be dropped and the labels of the combination
for dataset_name, dataset in data.items():
    for subset_name, excluded in features.items():
        # the target column is always removed from the features
        to_drop = ['seo class', *excluded]
        print(to_drop)

        # draft of the per-combination analysis, still disabled:
        #X = dataset.drop(columns=to_drop)
        #y = dataset['seo class']
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.66, random_state=22)

        print(dataset_name, subset_name)
        #clf.fit(X_train, y_train)
        #feature_importance = [[n, i] for n, i in zip(X.columns, clf.feature_importances_)]

['seo class']
regular data all
['seo class', 'a']
regular data no rule columns
['seo class', 'micros counter', 'source ads', 'source company', 'source known', 'source news', 'source not optimized', 'source search engine', 'source shop', 'source top', 'tools ads count', 'tools analytics count', 'tools caching count', 'tools content count', 'tools seo count', 'tools social count']
regular data no external columns
['seo class']
optimized data all
['seo class', 'a']
optimized data no rule columns
['seo class', 'micros counter', 'source ads', 'source company', 'source known', 'source news', 'source not optimized', 'source search engine', 'source shop', 'source top', 'tools ads count', 'tools analytics count', 'tools caching count', 'tools content count', 'tools seo count', 'tools social count']
optimized data no external columns
