In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Format data

In [18]:
header = ['wking_file', 'wking_rank', 'wrook_file', 'wrook_rank', 
          'bking_file', 'bking_rank', 'white_win_depth']
chess = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/chess/king-rook-vs-king/krkopt.data', names=header)

In [20]:
print(chess.shape)
print(chess['white_win_depth'].value_counts())
chess.head()

(28056, 7)
fourteen    4553
thirteen    4194
twelve      3597
eleven      2854
draw        2796
fifteen     2166
ten         1985
nine        1712
eight       1433
seven        683
six          592
five         471
sixteen      390
two          246
four         198
three         81
one           78
zero          27
Name: white_win_depth, dtype: int64


Unnamed: 0,wking_file,wking_rank,wrook_file,wrook_rank,bking_file,bking_rank,white_win_depth
0,a,1,b,3,c,2,draw
1,a,1,c,1,c,2,draw
2,a,1,c,1,d,1,draw
3,a,1,c,1,d,2,draw
4,a,1,c,2,c,1,draw


In [32]:
class1_vals = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
         'eight', 'nine', 'ten', 'eleven', 'twelve']
class0_vals = ['thirteen', 'fourteen', 'fifteen', 'sixteen', 'draw']
class1 = chess.loc[chess['white_win_depth'].isin(class1_vals)]
class0 = chess.loc[chess['white_win_depth'].isin(class0_vals)]
print(class1.shape)
print(class0.shape)
class1.head()

(13957, 7)
(14099, 7)


Unnamed: 0,wking_file,wking_rank,wrook_file,wrook_rank,bking_file,bking_rank,white_win_depth
2796,c,1,a,3,a,1,zero
2797,c,1,a,4,a,1,zero
2798,c,1,a,5,a,1,zero
2799,c,1,a,6,a,1,zero
2800,c,1,a,7,a,1,zero


In [39]:
class0.loc[:,'white_win_depth'] = 0
class1.loc[:,'white_win_depth'] = 1
# class1.loc['']
class0.head()

Unnamed: 0,wking_file,wking_rank,wrook_file,wrook_rank,bking_file,bking_rank,white_win_depth
0,a,1,b,3,c,2,0
1,a,1,c,1,c,2,0
2,a,1,c,1,d,1,0
3,a,1,c,1,d,2,0
4,a,1,c,2,c,1,0


In [42]:
chess = class0.append(class1)
chess['white_win_depth'].value_counts()

0    14099
1    13957
Name: white_win_depth, dtype: int64

### One-Hot-encode data

In [43]:
# set up new dataframes of each variable, 
# and one-hot-encode the necessary variables using dummy variables
wking_file_oh = pd.get_dummies(chess.wking_file, prefix='wking_file')
wking_rank_oh = pd.get_dummies(chess.wking_rank, prefix='wking_rank')
wrook_file_oh = pd.get_dummies(chess.wrook_file, prefix='wrook_file')
wrook_rank_oh = pd.get_dummies(chess.wrook_rank, prefix='wrook_rank')
bking_file_oh = pd.get_dummies(chess.bking_file, prefix='bking_file')
bking_rank_oh = pd.get_dummies(chess.bking_rank, prefix='bking_rank')
bking_rank_oh.head()

Unnamed: 0,bking_rank_1,bking_rank_2,bking_rank_3,bking_rank_4,bking_rank_5,bking_rank_6,bking_rank_7,bking_rank_8
0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0


In [46]:
var_list = [wking_file_oh, wking_rank_oh, wrook_file_oh, wrook_rank_oh, 
          bking_file_oh, bking_rank_oh, chess['white_win_depth']]
chess_oh = pd.concat(var_list, axis=1).reindex(wking_file_oh.index)
chess_oh = chess_oh.rename(columns={'white_win_depth': 'Y'})
print(chess_oh.shape)
chess_oh.head()

(28056, 41)


Unnamed: 0,wking_file_a,wking_file_b,wking_file_c,wking_file_d,wking_rank_1,wking_rank_2,wking_rank_3,wking_rank_4,wrook_file_a,wrook_file_b,...,bking_file_h,bking_rank_1,bking_rank_2,bking_rank_3,bking_rank_4,bking_rank_5,bking_rank_6,bking_rank_7,bking_rank_8,Y
0,1,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [47]:
chess_oh.to_csv('chess_oh.csv', index=False)

## Create 5 Trials - Training and Testing Sets

In [49]:
from sklearn.model_selection import train_test_split

XY_train1, XY_test1 = train_test_split(chess_oh, train_size=5000, shuffle=True)
XY_train2, XY_test2 = train_test_split(chess_oh, train_size=5000, shuffle=True)
XY_train3, XY_test3 = train_test_split(chess_oh, train_size=5000, shuffle=True)
XY_train4, XY_test4 = train_test_split(chess_oh, train_size=5000, shuffle=True)
XY_train5, XY_test5 = train_test_split(chess_oh, train_size=5000, shuffle=True)

In [50]:
print(XY_train1.shape)
print(XY_test1.shape)
XY_train1.head()

(5000, 41)
(23056, 41)


Unnamed: 0,wking_file_a,wking_file_b,wking_file_c,wking_file_d,wking_rank_1,wking_rank_2,wking_rank_3,wking_rank_4,wrook_file_a,wrook_file_b,...,bking_file_h,bking_rank_1,bking_rank_2,bking_rank_3,bking_rank_4,bking_rank_5,bking_rank_6,bking_rank_7,bking_rank_8,Y
26047,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
20555,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1526,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3422,0,0,0,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
26557,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


## Run GridSearchCV on classifier

In [51]:
%%time
#import warnings
# there are a lot of convergence warnings for some params, however be careful with this!!
# sometimes you need to see those wanrings, and now we've screwed tha tup for the whole notebook from here on!!
#warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression 
import numpy as np
from sklearn.model_selection import GridSearchCV

# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=5000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=5000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=5000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']}
                ]

trialnum = 0
accuracy_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
roc_sum = 0 # sum of top roc score to later calculate the average of all 5 trials
f1_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
accuracy_scores = []
roc_scores = []
f1_scores = []
all_accuracy_models = []
all_roc_models = []
all_f1_models = []

# for every trial
for trial in [XY_train1, XY_train2, XY_train3, XY_train4, XY_train5]:
 
    trialnum = trialnum + 1
    X_l = trial.drop(['Y'],1)
    y_l = trial['Y']
    
    # Create grid search 
    clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
                       verbose=0)

    # Fit grid search
    best_model = clf.fit(X_l, y_l)
    
    print("------------------------------------------------------------------------------------")
    print("RESULTS FOR TRIAL:")
    print(trialnum)
    print("------------------------------------------------------------------------------------")
    
    # the detailed results of the whole model selection search...
#     print(best_model.cv_results_)

    print("---------------BEST MODEL FOR ACCURACY: ----------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    print("---WITH ACCURACY: ---")
    current_accuracy = best_model.cv_results_['mean_test_accuracy'][ np.argmax(best_model.cv_results_['mean_test_accuracy']) ]
    print(current_accuracy)
    accuracy_sum = accuracy_sum + current_accuracy
    accuracy_scores.append(current_accuracy)
    all_accuracy_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    
    print("---------------BEST MODEL FOR ROC: ---------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr']) ] )
    print("---WITH ROC: ---")
    current_roc = best_model.cv_results_['mean_test_roc_auc_ovr'][ np.argmax(best_model.cv_results_['mean_test_roc_auc_ovr']) ]
    print(current_roc)
    roc_sum = roc_sum + current_roc
    roc_scores.append(current_roc)
    all_roc_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr']) ] )

    
    print("---------------BEST MODEL FOR F1: ----------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )
    print("---WITH F1: ---")    
    current_f1 = best_model.cv_results_['mean_test_f1_micro'][ np.argmax(best_model.cv_results_['mean_test_f1_micro']) ]
    print(current_f1)
    f1_sum = f1_sum + current_f1
    f1_scores.append(current_f1)
    all_f1_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )



------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
1
------------------------------------------------------------------------------------
---------------BEST MODEL FOR ACCURACY: ----------
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
---WITH ACCURACY: ---
0.8246
---------------BEST MODEL FOR ROC: ---------------
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
---WITH ROC: ---
0.908720940504683
---------------BEST MODEL FOR F1: ----------------
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
---WITH F1: ---
0.8245999999999999
------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
2
--------------------------------------------------------

### Get Train metrics

In [53]:
print("===== ACCURACY SCORES: =====")
print(accuracy_scores)
print("===== ROC SCORES: =====")
print(roc_scores)
print("===== F1 SCORES: =====")
print(f1_scores)

===== ACCURACY SCORES: =====
[0.8246, 0.8333999999999999, 0.8176, 0.8276, 0.8198000000000001]
===== ROC SCORES: =====
[0.908720940504683, 0.9116755161761161, 0.9017958394150039, 0.907312083922568, 0.8932204600212325]
===== F1 SCORES: =====
[0.8245999999999999, 0.8333999999999999, 0.8176, 0.8276, 0.8198000000000001]


In [54]:
print("================ BEST ACCURACY MODEL IN TRAINING: ==================")
print(all_accuracy_models[ np.argmax(accuracy_scores) ])
print("WITH ACCURACY:")
print(max(accuracy_scores))
print("================ BEST ROC MODEL IN TRAINING: ==================")
print(all_roc_models[ np.argmax(roc_scores) ])
print("WITH ROC SCORE:")
print(max(roc_scores))
print("================ BEST F1 MODEL IN TRAINING: ==================")
print(all_f1_models[ np.argmax(f1_scores) ])
print("WITH F1 SCORE:")
print(max(f1_scores))

{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
WITH ACCURACY:
0.8333999999999999
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 1.0, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH ROC SCORE:
0.9116755161761161
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
WITH F1 SCORE:
0.8333999999999999


In [55]:
print("================ AVERAGE ACCURACY ON TRAIN SET: ==================")
print(accuracy_sum / 5)
print("================ AVERAGE ROC SCORE ON TRAIN SET: ==================")
print(roc_sum / 5)
print("================ AVERAGE F1 SCORE ON TRAIN SET: ==================")
print(f1_sum / 5)


0.8246
0.9045449680079207
0.8246


### Get Test metrics

In [57]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# test performance of model rated as best for accuracy and f1 score on training set
performance_AccModel = pd.DataFrame(index=['acc', 'roc', 'f1'], columns=['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5'])
performance_AccModel

# test performance on trial 1 test set
clf = LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000).fit(XY_train1.drop(['Y'],1), XY_train1['Y'])
pred = clf.predict(XY_test1.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 1'] = accuracy_score(XY_test1['Y'], pred)
performance_AccModel.loc['roc', 'Trial 1'] = roc_auc_score(XY_test1['Y'], pred)
performance_AccModel.loc['f1', 'Trial 1'] = f1_score(XY_test1['Y'], pred)

# test performance on trial 2 test set
clf = LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000).fit(XY_train2.drop(['Y'],1), XY_train2['Y'])
pred = clf.predict(XY_test2.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 2'] = accuracy_score(XY_test2['Y'], pred)
performance_AccModel.loc['roc', 'Trial 2'] = roc_auc_score(XY_test2['Y'], pred)
performance_AccModel.loc['f1', 'Trial 2'] = f1_score(XY_test2['Y'], pred)

# test performance on trial 3 test set
clf = LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000).fit(XY_train3.drop(['Y'],1), XY_train3['Y'])
pred = clf.predict(XY_test3.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 3'] = accuracy_score(XY_test3['Y'], pred)
performance_AccModel.loc['roc', 'Trial 3'] = roc_auc_score(XY_test3['Y'], pred)
performance_AccModel.loc['f1', 'Trial 3'] = f1_score(XY_test3['Y'], pred)

# test performance on trial 4 test set
clf = LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000).fit(XY_train4.drop(['Y'],1), XY_train4['Y'])
pred = clf.predict(XY_test4.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 4'] = accuracy_score(XY_test4['Y'], pred)
performance_AccModel.loc['roc', 'Trial 4'] = roc_auc_score(XY_test4['Y'], pred)
performance_AccModel.loc['f1', 'Trial 4'] = f1_score(XY_test4['Y'], pred)

# test performance on trial 5 test set
clf = LogisticRegression(penalty='l2', C=0.1, solver='saga', max_iter=5000).fit(XY_train5.drop(['Y'],1), XY_train5['Y'])
pred = clf.predict(XY_test5.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 5'] = accuracy_score(XY_test5['Y'], pred)
performance_AccModel.loc['roc', 'Trial 5'] = roc_auc_score(XY_test5['Y'], pred)
performance_AccModel.loc['f1', 'Trial 5'] = f1_score(XY_test5['Y'], pred)

performance_AccModel

Unnamed: 0,Trial 1,Trial 2,Trial 3,Trial 4,Trial 5
acc,0.827594,0.824601,0.827811,0.82703,0.826336
roc,0.827491,0.824546,0.82773,0.827009,0.82626
f1,0.824029,0.822195,0.824025,0.824626,0.822581


In [58]:
# test performance of model rated as best for roc on training set
performance_ROCModel = pd.DataFrame(index=['acc', 'roc', 'f1'], columns=['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5'])

# test performance on trial 1 test set
clf = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=5000).fit(XY_train1.drop(['Y'],1), XY_train1['Y'])
pred = clf.predict(XY_test1.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 1'] = accuracy_score(XY_test1['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 1'] = roc_auc_score(XY_test1['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 1'] = f1_score(XY_test1['Y'], pred)

# test performance on trial 2 test set
clf = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=5000).fit(XY_train2.drop(['Y'],1), XY_train2['Y'])
pred = clf.predict(XY_test2.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 2'] = accuracy_score(XY_test2['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 2'] = roc_auc_score(XY_test2['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 2'] = f1_score(XY_test2['Y'], pred)

# test performance on trial 3 test set
clf = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=5000).fit(XY_train3.drop(['Y'],1), XY_train3['Y'])
pred = clf.predict(XY_test3.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 3'] = accuracy_score(XY_test3['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 3'] = roc_auc_score(XY_test3['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 3'] = f1_score(XY_test3['Y'], pred)

# test performance on trial 4 test set
clf = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=5000).fit(XY_train4.drop(['Y'],1), XY_train4['Y'])
pred = clf.predict(XY_test4.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 4'] = accuracy_score(XY_test4['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 4'] = roc_auc_score(XY_test4['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 4'] = f1_score(XY_test4['Y'], pred)

# test performance on trial 5 test set
clf = LogisticRegression(penalty='l1', C=1.0, solver='saga', max_iter=5000).fit(XY_train5.drop(['Y'],1), XY_train5['Y'])
pred = clf.predict(XY_test5.drop(['Y'],1))
performance_ROCModel.loc['acc', 'Trial 5'] = accuracy_score(XY_test5['Y'], pred)
performance_ROCModel.loc['roc', 'Trial 5'] = roc_auc_score(XY_test5['Y'], pred)
performance_ROCModel.loc['f1', 'Trial 5'] = f1_score(XY_test5['Y'], pred)

performance_ROCModel

Unnamed: 0,Trial 1,Trial 2,Trial 3,Trial 4,Trial 5
acc,0.826162,0.826423,0.828114,0.826986,0.826379
roc,0.826076,0.826397,0.828048,0.826971,0.82632
f1,0.822968,0.82475,0.824886,0.82522,0.823337


## Get final Logistic Regression results

In [59]:
print("================ BEST ACCURACY MODEL IN LOGISTIC REGRESSION: ==================")
print(all_accuracy_models[ np.argmax(np.array(performance_AccModel.iloc[0])) ])
print("WITH ACCURACY:")
print((np.array(performance_AccModel.iloc[0])).mean())
print("================ BEST ROC MODEL IN LOGISTIC REGRESSION: ==================")
print(all_roc_models[ np.argmax(np.array(performance_ROCModel.iloc[1])) ])
print("WITH ROC SCORE:")
print((performance_ROCModel.iloc[1]).mean())
print("================ BEST F1 MODEL IN LOGISTIC REGRESSION: ==================")
print(all_f1_models[ np.argmax(np.array(performance_AccModel.iloc[2])) ])
print("WITH F1 SCORE:")
print((np.array(performance_AccModel.iloc[2])).mean())

{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
WITH ACCURACY:
0.826674184594032
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH ROC SCORE:
0.8267623990592423
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.01, 'classifier__penalty': 'l2', 'classifier__solver': 'saga'}
WITH F1 SCORE:
0.8234911873193939
