# Letter Recognition using KNN

sources:
https://medium.com/@erikgreenj/k-neighbors-classifier-with-gridsearchcv-basics-3c445ddeb657

## Format the Data

In [2]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
header = ['letter', 'x-box', 'y-box', 'width', 'height', 'totpix', 
         'x-bar', 'y-bar', 'x2bar', 'y2bar', 'xybar', 'x2ybar', 'xy2bar',
         'x-edge', 'x-edge-y', 'y-edge', 'y-edge-x']
letters = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/letter-recognition.data', names=header)
# letters = letters.set_index('letter') 
letters = letters.sort_values('letter') # order rows alphabetically
letters = letters.reset_index(drop=True)
letters.head()

Unnamed: 0,letter,x-box,y-box,width,height,totpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybar,xy2bar,x-edge,x-edge-y,y-edge,y-edge-x
0,A,4,9,6,6,2,9,5,3,1,8,1,8,2,7,2,8
1,A,3,6,5,4,1,7,5,3,1,6,1,8,2,7,2,7
2,A,5,9,6,7,7,8,8,8,4,6,6,8,3,8,8,4
3,A,4,10,7,7,5,7,5,2,3,5,2,6,3,7,4,4
4,A,4,9,7,7,5,8,5,2,4,6,1,5,3,5,4,5


In [4]:
# make Y 
ones_pos = [1]*9940 # 9940 is the number of datapoints with letters A-M
ones_neg = [-1]*(20000-9940) # the number of the rest of the datapoints with letters N-Z
Y = np.array(ones_pos + ones_neg)
print(Y.shape)
# make X_and_Y
letters_withY = letters
letters_withY['Y'] = Y
# letters_withY
X_and_Y = letters_withY.drop('letter',1)
X_and_Y.head()

(20000,)


Unnamed: 0,x-box,y-box,width,height,totpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybar,xy2bar,x-edge,x-edge-y,y-edge,y-edge-x,Y
0,4,9,6,6,2,9,5,3,1,8,1,8,2,7,2,8,1
1,3,6,5,4,1,7,5,3,1,6,1,8,2,7,2,7,1
2,5,9,6,7,7,8,8,8,4,6,6,8,3,8,8,4,1
3,4,10,7,7,5,7,5,2,3,5,2,6,3,7,4,4,1
4,4,9,7,7,5,8,5,2,4,6,1,5,3,5,4,5,1


## Create 5 trials, each with train and test set

In [5]:
from sklearn.model_selection import train_test_split

XY_train1, XY_test1 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train2, XY_test2 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train3, XY_test3 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train4, XY_test4 = train_test_split(X_and_Y, test_size=15000, shuffle=True)
XY_train5, XY_test5 = train_test_split(X_and_Y, test_size=15000, shuffle=True)

## Perform GridSearchCV on classifier

In [6]:
%%time
#import warnings
# there are a lot of convergence warnings for some params, however be careful with this!!
# sometimes you need to see those wanrings, and now we've screwed tha tup for the whole notebook from here on!!
#warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV

# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', KNeighborsClassifier())])

# Create search space of candidate learning algorithms and their hyperparameters
grid_params = {
    'n_neighbors': [3,5,11,19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

trialnum = 0
accuracy_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
roc_sum = 0 # sum of top roc score to later calculate the average of all 5 trials
f1_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
accuracy_scores = []
roc_scores = []
f1_scores = []
all_accuracy_models = []
all_roc_models = []
all_f1_models = []

# for every trial
for trial in [XY_train1, XY_train2, XY_train3, XY_train4, XY_train5]:
 
    trialnum = trialnum + 1
    X_l = trial.drop(['Y'],1)
    y_l = trial['Y']
    
    # Create grid search 
#     clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
#                        scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
#                        verbose=0)
    clf = GridSearchCV(KNeighborsClassifier(), param_grid=grid_params, cv=StratifiedKFold(n_splits=5), 
                      scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False, 
                       n_jobs=-1, verbose=-1)

    # Fit grid search
    best_model = clf.fit(X_l, y_l)
    
    print("------------------------------------------------------------------------------------")
    print("RESULTS FOR TRIAL:")
    print(trialnum)
    print("------------------------------------------------------------------------------------")
    
    # the detailed results of the whole model selection search...
#     print(best_model.cv_results_)

    print("---------------BEST MODEL FOR ACCURACY: ----------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    print("---WITH ACCURACY: ---")
    current_accuracy = best_model.cv_results_['mean_test_accuracy'][ np.argmax(best_model.cv_results_['mean_test_accuracy']) ]
    print(current_accuracy)
    accuracy_sum = accuracy_sum + current_accuracy
    accuracy_scores.append(current_accuracy)
    all_accuracy_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    
    print("---------------BEST MODEL FOR ROC: ---------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr']) ] )
    print("---WITH ROC: ---")
    current_roc = best_model.cv_results_['mean_test_roc_auc_ovr'][ np.argmax(best_model.cv_results_['mean_test_roc_auc_ovr']) ]
    print(current_roc)
    roc_sum = roc_sum + current_roc
    roc_scores.append(current_roc)
    all_roc_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr']) ] )

    
    print("---------------BEST MODEL FOR F1: ----------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )
    print("---WITH F1: ---")    
    current_f1 = best_model.cv_results_['mean_test_f1_micro'][ np.argmax(best_model.cv_results_['mean_test_f1_micro']) ]
    print(current_f1)
    f1_sum = f1_sum + current_f1
    f1_scores.append(current_f1)
    all_f1_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )

    
    # below (optional): check that the above outputs actually show best scores

#     print results just to check alignment with the above
#     results = pd.DataFrame( best_model.cv_results_['params'] ) # parameter settings for best model
#     grab the accuracy score resulting from those parameters
#     results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
#     results['score_roc'] = best_model.cv_results_['mean_test_roc_auc_ovr']
#     results['score_f1'] = best_model.cv_results_['mean_test_f1_micro']
#     get rid of classifier__XX in columns
#     cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
#     results.columns = cols
#     print(results)


------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
1
------------------------------------------------------------------------------------
---------------BEST MODEL FOR ACCURACY: ----------
{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
---WITH ACCURACY: ---
0.9461999999999999
---------------BEST MODEL FOR ROC: ---------------
{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
---WITH ROC: ---
0.9874878374135492
---------------BEST MODEL FOR F1: ----------------
{'metric': 'euclidean', 'n_neighbors': 3, 'weights': 'distance'}
---WITH F1: ---
0.9461999999999999
------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
2
------------------------------------------------------------------------------------
---------------BEST MODEL FOR ACCURACY: ----------
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
---WITH ACCURACY: ---
0.9521999999999998
-

### Get Train Metrics

In [7]:
print("===== ACCURACY SCORES: =====")
print(accuracy_scores)
print("===== ROC SCORES: =====")
print(roc_scores)
print("===== F1 SCORES: =====")
print(f1_scores)

===== ACCURACY SCORES: =====
[0.9461999999999999, 0.9521999999999998, 0.9488, 0.9456, 0.9475999999999999]
===== ROC SCORES: =====
[0.9874878374135492, 0.9885579215200169, 0.9867112980264757, 0.9881638728062573, 0.9893954494840684]
===== F1 SCORES: =====
[0.9461999999999999, 0.9522, 0.9488, 0.9456, 0.9475999999999999]


In [8]:
print("================ BEST ACCURACY MODEL IN TRAINING: ==================")
print(all_accuracy_models[ np.argmax(accuracy_scores) ])
print("WITH ACCURACY:")
print(max(accuracy_scores))
print("================ BEST ROC MODEL IN TRAINING: ==================")
print(all_roc_models[ np.argmax(roc_scores) ])
print("WITH ROC SCORE:")
print(max(roc_scores))
print("================ BEST F1 MODEL IN TRAINING: ==================")
print(all_f1_models[ np.argmax(f1_scores) ])
print("WITH F1 SCORE:")
print(max(f1_scores))

{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
WITH ACCURACY:
0.9521999999999998
{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
WITH ROC SCORE:
0.9893954494840684
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
WITH F1 SCORE:
0.9522


In [9]:
print("================ AVERAGE ACCURACY ON TRAIN SET: ==================")
print(accuracy_sum / 5)
print("================ AVERAGE ROC SCORE ON TRAIN SET: ==================")
print(roc_sum / 5)
print("================ AVERAGE F1 SCORE ON TRAIN SET: ==================")
print(f1_sum / 5)

0.9480799999999998
0.9880632758500735
0.9480799999999998


### Get Test Metrics

In [10]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# test performance of model rated as best for accuracy and f1 score on training set
performance_AccModel = pd.DataFrame(index=['acc', 'roc', 'f1'], columns=['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5'])
performance_AccModel

# test performance on trial 1 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance').fit(XY_train1.drop(['Y'],1), XY_train1['Y'])
pred = clf.predict(XY_test1.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 1'] = accuracy_score(XY_test1['Y'], pred)
performance_AccModel.loc['roc', 'Trial 1'] = roc_auc_score(XY_test1['Y'], pred)
performance_AccModel.loc['f1', 'Trial 1'] = f1_score(XY_test1['Y'], pred)

# test performance on trial 2 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance').fit(XY_train2.drop(['Y'],1), XY_train2['Y'])
pred = clf.predict(XY_test2.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 2'] = accuracy_score(XY_test2['Y'], pred)
performance_AccModel.loc['roc', 'Trial 2'] = roc_auc_score(XY_test2['Y'], pred)
performance_AccModel.loc['f1', 'Trial 2'] = f1_score(XY_test2['Y'], pred)

# test performance on trial 3 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance').fit(XY_train3.drop(['Y'],1), XY_train3['Y'])
pred = clf.predict(XY_test3.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 3'] = accuracy_score(XY_test3['Y'], pred)
performance_AccModel.loc['roc', 'Trial 3'] = roc_auc_score(XY_test3['Y'], pred)
performance_AccModel.loc['f1', 'Trial 3'] = f1_score(XY_test3['Y'], pred)

# test performance on trial 4 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance').fit(XY_train4.drop(['Y'],1), XY_train4['Y'])
pred = clf.predict(XY_test4.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 4'] = accuracy_score(XY_test4['Y'], pred)
performance_AccModel.loc['roc', 'Trial 4'] = roc_auc_score(XY_test4['Y'], pred)
performance_AccModel.loc['f1', 'Trial 4'] = f1_score(XY_test4['Y'], pred)

# test performance on trial 5 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=3, weights='distance').fit(XY_train5.drop(['Y'],1), XY_train5['Y'])
pred = clf.predict(XY_test5.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 5'] = accuracy_score(XY_test5['Y'], pred)
performance_AccModel.loc['roc', 'Trial 5'] = roc_auc_score(XY_test5['Y'], pred)
performance_AccModel.loc['f1', 'Trial 5'] = f1_score(XY_test5['Y'], pred)

performance_AccModel

Unnamed: 0,Trial 1,Trial 2,Trial 3,Trial 4,Trial 5
acc,0.953533,0.9508,0.953467,0.9526,0.954067
roc,0.95358,0.950841,0.953494,0.9526,0.954105
f1,0.953536,0.950437,0.95306,0.952691,0.954027


In [11]:
# test performance of model rated as best for roc on training set
performance_ROCModel = pd.DataFrame(index=['acc', 'roc', 'f1'], columns=['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5'])

# test performance on trial 1 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance').fit(XY_train1.drop(['Y'],1), XY_train1['Y'])
pred = clf.predict(XY_test1.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 1'] = accuracy_score(XY_test1['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 1'] = roc_auc_score(XY_test1['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 1'] = f1_score(XY_test1['Y'], pred)

# test performance on trial 2 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance').fit(XY_train2.drop(['Y'],1), XY_train2['Y'])
pred = clf.predict(XY_test2.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 2'] = accuracy_score(XY_test2['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 2'] = roc_auc_score(XY_test2['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 2'] = f1_score(XY_test2['Y'], pred)

# test performance on trial 3 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance').fit(XY_train3.drop(['Y'],1), XY_train3['Y'])
pred = clf.predict(XY_test3.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 3'] = accuracy_score(XY_test3['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 3'] = roc_auc_score(XY_test3['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 3'] = f1_score(XY_test3['Y'], pred)

# test performance on trial 4 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance').fit(XY_train4.drop(['Y'],1), XY_train4['Y'])
pred = clf.predict(XY_test4.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 4'] = accuracy_score(XY_test4['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 4'] = roc_auc_score(XY_test4['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 4'] = f1_score(XY_test4['Y'], pred)

# test performance on trial 5 test set
clf = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance').fit(XY_train5.drop(['Y'],1), XY_train5['Y'])
pred = clf.predict(XY_test5.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 5'] = accuracy_score(XY_test5['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 5'] = roc_auc_score(XY_test5['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 5'] = f1_score(XY_test5['Y'], pred)

performance_ROCModel

Unnamed: 0,Trial 1,Trial 2,Trial 3,Trial 4,Trial 5
acc,0.952133,0.949933,0.9518,0.9512,0.950867
roc,0.952201,0.950022,0.951873,0.9512,0.950922
f1,0.952286,0.949776,0.951571,0.951265,0.950948


## Get final KNN results

In [13]:
print("================ BEST ACCURACY MODEL IN KNN: ==================")
print(all_accuracy_models[ np.argmax(np.array(performance_AccModel.iloc[0])) ])
print("WITH ACCURACY:")
print((np.array(performance_AccModel.iloc[0])).mean())
print("================ BEST ROC MODEL IN KNN: ==================")
print(all_roc_models[ np.argmax(np.array(performance_ROCModel.iloc[1])) ])
print("WITH ROC SCORE:")
print((performance_ROCModel.iloc[1]).mean())
print("================ BEST F1 MODEL IN KNN: ==================")
print(all_f1_models[ np.argmax(np.array(performance_AccModel.iloc[2])) ])
print("WITH F1 SCORE:")
print((np.array(performance_AccModel.iloc[2])).mean())

{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
WITH ACCURACY:
0.9528933333333333
{'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'distance'}
WITH ROC SCORE:
0.9512436942161152
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}
WITH F1 SCORE:
0.9527502207921407
