In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
header = ['letter', 'x-box', 'y-box', 'width', 'height', 'totpix', 
         'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar',
         'x-edge', 'x-edge-y', 'y-edge', 'y-edge-x']
letters = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data', names=header)
# letters = letters.set_index('letter') 
letters = letters.sort_values('letter') # order rows alphabetically
letters = letters.reset_index(drop=True)
letters.head()

Unnamed: 0,letter,x-box,y-box,width,height,totpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybar,xy2bar,x-edge,x-edge-y,y-edge,y-edge-x
0,A,4,9,6,6,2,9,5,3,1,8,1,8,2,7,2,8
1,A,3,6,5,4,1,7,5,3,1,6,1,8,2,7,2,7
2,A,5,9,6,7,7,8,8,8,4,6,6,8,3,8,8,4
3,A,4,10,7,7,5,7,5,2,3,5,2,6,3,7,4,4
4,A,4,9,7,7,5,8,5,2,4,6,1,5,3,5,4,5


In [3]:
# make Y 
ones_pos = [1]*9940 # 9940 is the number of datapoints with letters A-M
ones_neg = [-1]*(20000-9940) # the number of the rest of the datapoints with letters N-Z
Y = np.array(ones_pos + ones_neg)
print(Y.shape)
# make X_and_Y
letters_withY = letters
letters_withY['Y'] = Y
# letters_withY
X_and_Y = letters_withY.drop('letter',1)
X_and_Y.head()

(20000,)


Unnamed: 0,x-box,y-box,width,height,totpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybar,xy2bar,x-edge,x-edge-y,y-edge,y-edge-x,Y
0,4,9,6,6,2,9,5,3,1,8,1,8,2,7,2,8,1
1,3,6,5,4,1,7,5,3,1,6,1,8,2,7,2,7,1
2,5,9,6,7,7,8,8,8,4,6,6,8,3,8,8,4,1
3,4,10,7,7,5,7,5,2,3,5,2,6,3,7,4,4,1
4,4,9,7,7,5,8,5,2,4,6,1,5,3,5,4,5,1


## Create 5 trials, each with train and test set

In [4]:
from sklearn.model_selection import train_test_split

XY_train1, XY_test1 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train2, XY_test2 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train3, XY_test3 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train4, XY_test4 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train5, XY_test5 = train_test_split(X_and_Y, test_size=15000, shuffle=True)

## Perform GridSearchCV on classifier

In [6]:
%%time
#import warnings
# there are a lot of convergence warnings for some params, however be careful with this!!
# sometimes you need to see those wanrings, and now we've screwed tha tup for the whole notebook from here on!!
#warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor


# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', SVC())])

# Create search space of candidate learning algorithms and their hyperparameters
# search_space = [{'classifier': [SVC()],
#                  'classifier__kernel': ['rbf'],
#                  'classifier__gamma': [0.001, 0.01, 0.1, 1, 10],
#                  'classifier__C': np.logspace(-4, 4, 9)},
#                 {'classifier': [SVC()],
#                  'classifier__kernel': ['linear'],
#                  'classifier__C': np.logspace(-4, 4, 9)},
#                 {'classifier': [SVC()],
#                  'classifier__kernel': ['poly'],
#                  'classifier__degree': [2,3],
#                  'classifier__C': np.logspace(-4, 4, 9)}
#                 ]
search_space = [{'classifier': [SVC()],
                 'classifier__kernel': ['rbf'],
                 'classifier__gamma': [0.001, 0.1, 1, 1],
                 'classifier__C': np.logspace(-2, 2, 5)},
                {'classifier': [SVC()],
                 'classifier__kernel': ['linear'],
                 'classifier__C': np.logspace(-2, 2, 5)},
                {'classifier': [SVC()],
                 'classifier__kernel': ['poly'],
                 'classifier__degree': [2,3],
                 'classifier__C': np.logspace(-2, 2, 5)}
                ]

trialnum = 0
accuracy_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
roc_sum = 0 # sum of top roc score to later calculate the average of all 5 trials
f1_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
accuracy_scores = []
roc_scores = []
f1_scores = []
all_accuracy_models = []
all_roc_models = []
all_f1_models = []

# for every trial
for trial in [XY_train1, XY_train2, XY_train3, XY_train4, XY_train5]:
 
    trialnum = trialnum + 1

    X_l = trial.drop(['Y'],1)
    y_l = trial['Y']

    # Create grid search 
    clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc', 'f1_micro'], refit=False,
                       verbose=0)

    # Fit grid search
    best_model = clf.fit(X_l, y_l)
    
    print("------------------------------------------------------------------------------------")
    print("RESULTS FOR TRIAL:")
    print(trialnum)
    print("------------------------------------------------------------------------------------")
    
    # the detailed results of the whole model selection search...
#     print(best_model.cv_results_)

    print("---------------BEST MODEL FOR ACCURACY: ----------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    print("---WITH ACCURACY: ---")
    current_accuracy = best_model.cv_results_['mean_test_accuracy'][ np.argmax(best_model.cv_results_['mean_test_accuracy']) ]
    print(current_accuracy)
    accuracy_sum = accuracy_sum + current_accuracy
    accuracy_scores.append(current_accuracy)
    all_accuracy_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    
    print("---------------BEST MODEL FOR ROC: ---------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc']) ] )
    print("---WITH ROC: ---")
    current_roc = best_model.cv_results_['mean_test_roc_auc'][ np.argmax(best_model.cv_results_['mean_test_roc_auc']) ]
    print(current_roc)
    roc_sum = roc_sum + current_roc
    roc_scores.append(current_roc)
    all_roc_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc']) ] )

    
    print("---------------BEST MODEL FOR F1: ----------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )
    print("---WITH F1: ---")    
    current_f1 = best_model.cv_results_['mean_test_f1_micro'][ np.argmax(best_model.cv_results_['mean_test_f1_micro']) ]
    print(current_f1)
    f1_sum = f1_sum + current_f1
    f1_scores.append(current_f1)
    all_f1_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )

    
    # below (optional): check that the above outputs actually show best scores

#     print results just to check alignment with the above
#     results = pd.DataFrame( best_model.cv_results_['params'] ) # parameter settings for best model
#     grab the accuracy score resulting from those parameters
#     results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
#     results['score_roc'] = best_model.cv_results_['mean_test_roc_auc_ovr']
#     results['score_f1'] = best_model.cv_results_['mean_test_f1_micro']
#     get rid of classifier__XX in columns
#     cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
#     results.columns = cols
#     print(results)


------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
1
------------------------------------------------------------------------------------
---------------BEST MODEL FOR ACCURACY: ----------
{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
---WITH ACCURACY: ---
0.9490000000000001
---------------BEST MODEL FOR ROC: ---------------
{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
---WITH ROC: ---
0.9892935852095409
---------------BEST MODEL FOR F1: ----------------
{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
---WITH F1: ---
0.9490000000000001
------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
2
------------------------------------------------------------------------------------
---------------BEST MODEL FOR ACCURACY: ----------
{'classi

### Get Training Metrics

In [7]:
print("===== ACCURACY SCORES: =====")
print(accuracy_scores)
print("===== ROC SCORES: =====")
print(roc_scores)
print("===== F1 SCORES: =====")
print(f1_scores)

===== ACCURACY SCORES: =====
[0.9490000000000001, 0.9570000000000001, 0.9501999999999999, 0.9565999999999999, 0.9538]
===== ROC SCORES: =====
[0.9892935852095409, 0.9907812340114491, 0.9897851060968004, 0.9907151628606513, 0.9904123357804681]
===== F1 SCORES: =====
[0.9490000000000001, 0.9570000000000001, 0.9501999999999999, 0.9565999999999999, 0.9538]


In [8]:
print("================ BEST ACCURACY MODEL IN TRAINING: ==================")
print(all_accuracy_models[ np.argmax(accuracy_scores) ])
print("WITH ACCURACY:")
print(max(accuracy_scores))
print("================ BEST ROC MODEL IN TRAINING: ==================")
print(all_roc_models[ np.argmax(roc_scores) ])
print("WITH ROC SCORE:")
print(max(roc_scores))
print("================ BEST F1 MODEL IN TRAINING: ==================")
print(all_f1_models[ np.argmax(f1_scores) ])
print("WITH F1 SCORE:")
print(max(f1_scores))

{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
WITH ACCURACY:
0.9570000000000001
{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
WITH ROC SCORE:
0.9907812340114491
{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
WITH F1 SCORE:
0.9570000000000001


In [9]:
print("================ AVERAGE ACCURACY ON TRAIN SET: ==================")
print(accuracy_sum / 5)
print("================ AVERAGE ROC SCORE ON TRAIN SET: ==================")
print(roc_sum / 5)
print("================ AVERAGE F1 SCORE ON TRAIN SET: ==================")
print(f1_sum / 5)

0.9533200000000001
0.990197484791782
0.9533200000000001


### Get Test Metrics

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# test performance of model rated as best for accuracy and roc and f1 score on training set
performance_AccModel = pd.DataFrame(index=['acc', 'roc', 'f1'], columns=['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5'])
performance_AccModel

# test performance on trial 1 test set
clf = SVC(C=10.0, gamma=1.0, kernel='rbf').fit(XY_train1.drop(['Y'],1), XY_train1['Y'])
pred = clf.predict(XY_test1.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 1'] = accuracy_score(XY_test1['Y'], pred)
performance_AccModel.loc['roc', 'Trial 1'] = roc_auc_score(XY_test1['Y'], pred)
performance_AccModel.loc['f1', 'Trial 1'] = f1_score(XY_test1['Y'], pred)

# test performance on trial 2 test set
clf = SVC(C=10.0, gamma=1.0, kernel='rbf').fit(XY_train2.drop(['Y'],1), XY_train2['Y'])
pred = clf.predict(XY_test2.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 2'] = accuracy_score(XY_test2['Y'], pred)
performance_AccModel.loc['roc', 'Trial 2'] = roc_auc_score(XY_test2['Y'], pred)
performance_AccModel.loc['f1', 'Trial 2'] = f1_score(XY_test2['Y'], pred)

# test performance on trial 3 test set
clf = SVC(C=10.0, gamma=1.0, kernel='rbf').fit(XY_train3.drop(['Y'],1), XY_train3['Y'])
pred = clf.predict(XY_test3.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 3'] = accuracy_score(XY_test3['Y'], pred)
performance_AccModel.loc['roc', 'Trial 3'] = roc_auc_score(XY_test3['Y'], pred)
performance_AccModel.loc['f1', 'Trial 3'] = f1_score(XY_test3['Y'], pred)

# test performance on trial 4 test set
clf = SVC(C=10.0, gamma=1.0, kernel='rbf').fit(XY_train4.drop(['Y'],1), XY_train4['Y'])
pred = clf.predict(XY_test4.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 4'] = accuracy_score(XY_test4['Y'], pred)
performance_AccModel.loc['roc', 'Trial 4'] = roc_auc_score(XY_test4['Y'], pred)
performance_AccModel.loc['f1', 'Trial 4'] = f1_score(XY_test4['Y'], pred)

# test performance on trial 5 test set
clf = SVC(C=10.0, gamma=1.0, kernel='rbf').fit(XY_train5.drop(['Y'],1), XY_train5['Y'])
pred = clf.predict(XY_test5.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 5'] = accuracy_score(XY_test5['Y'], pred)
performance_AccModel.loc['roc', 'Trial 5'] = roc_auc_score(XY_test5['Y'], pred)
performance_AccModel.loc['f1', 'Trial 5'] = f1_score(XY_test5['Y'], pred)

performance_AccModel

Unnamed: 0,Trial 1,Trial 2,Trial 3,Trial 4,Trial 5
acc,0.8346,0.757267,0.687867,0.799067,0.795867
roc,0.8333,0.760039,0.687117,0.797344,0.797884
f1,0.802106,0.802238,0.544995,0.74702,0.828209


## Get final SVC results

In [16]:
print("================ BEST ACCURACY MODEL IN SVC: ==================")
print(all_accuracy_models[ np.argmax(np.array(performance_AccModel.iloc[0])) ])
print("WITH ACCURACY:")
print((np.array(performance_AccModel.iloc[0])).mean())
print("================ BEST ROC MODEL IN SVC: ==================")
print(all_roc_models[ np.argmax(np.array(performance_AccModel.iloc[1])) ])
print("WITH ROC SCORE:")
print((performance_AccModel.iloc[1]).mean())
print("================ BEST F1 MODEL IN SVC: ==================")
print(all_f1_models[ np.argmax(np.array(performance_AccModel.iloc[2])) ])
print("WITH F1 SCORE:")
print((np.array(performance_AccModel.iloc[2])).mean())

{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
WITH ACCURACY:
0.7749333333333334
{'classifier': SVC(), 'classifier__C': 10.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
WITH ROC SCORE:
0.7751366435681344
{'classifier': SVC(), 'classifier__C': 1.0, 'classifier__gamma': 1, 'classifier__kernel': 'rbf'}
WITH F1 SCORE:
0.744913633779794
