## Exhaustive Feature Selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

## Read Data

In [2]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [3]:
data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

### Remove correlated features

The Exhaustive Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_test, 0.6)
print('correlated features: ', len(set(corr_features)) )

correlated features:  23


In [6]:
# removed correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((140272, 20), (35069, 20))

###  Exhaustive Feature Selection

In [7]:
# 1. the algorithm to create, in this case RandomForests
# 2. the number of minimum features we want our model to have
# 3. the number of maximum features we want our model to have 
# 4. the evaluation metric: in this case the roc_auc
# 5. the cross-validation

efs = EFS(RandomForestClassifier(n_estimators=5,
                                 n_jobs=4,
                                 random_state=0,
                                 max_depth=2),
          min_features=2,
          max_features=3,
          scoring='roc_auc',
          print_progress=True,
          cv=2)

efs = efs.fit(np.array(X_test), y_test)

Features: 1330/1330

The log above means that the search evaluated 3276 feature combinations!

In [8]:
efs.best_idx_

(7, 9, 13)

In [9]:
selected_feat = X_test.columns[list(efs.best_idx_)]
selected_feat

Index(['sttl', 'dload', 'smean'], dtype='object')

### Compare performance of feature subsets

In [10]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [11]:
# evaluate performance of classifier using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9686834938970512
Test set
Random Forests roc-auc: 0.9689594335346903


In [12]:
# and for comparison, we train random forests using
# all features (except the correlated ones, which we removed already)

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9753174567953901
Test set
Random Forests roc-auc: 0.9747859642795548


Even with 3 features, the performance is not super far off of that model built using 20 features.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((140272, 3), (35069, 3))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers




In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation



In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [20]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25).fit(X_train, y_train)

CPU times: user 11 ms, sys: 5.99 ms, total: 16.9 ms
Wall time: 695 ms


In [21]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.739542045681371
F1 Score: 0.7721285300868177
FPR: 0.06989151698381647
TPR: 0.649582336397599


### Naive Bayes




In [22]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-08).fit(X_train, y_train)

CPU times: user 20.2 ms, sys: 1.71 ms, total: 21.9 ms
Wall time: 20 ms


In [23]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.30941857480966095
F1 Score: 0.03828131204828846
FPR: 0.07798328294504712
TPR: 0.02023254837761827


### Random Forest




In [24]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000).fit(X_train, y_train)

CPU times: user 1min 3s, sys: 479 ms, total: 1min 4s
Wall time: 1min 4s


In [25]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN




In [26]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform').fit(X_train, y_train)

CPU times: user 3.9 s, sys: 23.1 ms, total: 3.93 s
Wall time: 3.91 s


In [27]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.7251988936097408
f1: 0.7640130274016211
fpr: 0.12573359416681487
tpr: 0.6548293665785165


### CatBoost





In [28]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6621724	total: 62ms	remaining: 3.04s
1:	learn: 0.6337545	total: 70.1ms	remaining: 1.68s
2:	learn: 0.6076672	total: 77.7ms	remaining: 1.22s
3:	learn: 0.5829667	total: 85.6ms	remaining: 984ms
4:	learn: 0.5598742	total: 93.6ms	remaining: 843ms
5:	learn: 0.5383965	total: 101ms	remaining: 742ms
6:	learn: 0.5184666	total: 109ms	remaining: 667ms
7:	learn: 0.4999077	total: 116ms	remaining: 609ms
8:	learn: 0.4823267	total: 124ms	remaining: 564ms
9:	learn: 0.4660265	total: 131ms	remaining: 524ms
10:	learn: 0.4505331	total: 139ms	remaining: 493ms
11:	learn: 0.4360135	total: 148ms	remaining: 469ms
12:	learn: 0.4221590	total: 156ms	remaining: 445ms
13:	learn: 0.4092269	total: 166ms	remaining: 426ms
14:	learn: 0.3972862	total: 175ms	remaining: 408ms
15:	learn: 0.3856464	total: 184ms	remaining: 390ms
16:	learn: 0.3747522	total: 192ms	remaining: 372ms
17:	learn: 0.3643412	total: 200ms	remaining: 355ms
18:	learn: 0.3544722	total: 207ms	remaining: 338ms
19:	learn: 0.3454472	total: 215ms	rema

In [29]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


## Model Evaluation




In [30]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../UNSW_Test.csv")
test_df.shape

(175341, 44)

In [31]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [32]:
X_eval = X_eval[selected_feat]

In [33]:
X_eval.shape

(175341, 3)

### Model Evaluation - Logistic Regression



In [34]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25)
modelLR.fit(X_train, y_train)

LogisticRegression(C=25, n_jobs=-1, random_state=42)

In [35]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [36]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.8627951408691684
Testing accuracy is  0.739542045681371


In [37]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.739542045681371
F1 Score: 0.7721285300868177
Precision Score: 0.9516634893302995
Recall Score: 0.649582336397599
Confusion Matrix:
 [[10460   786]
 [ 8348 15475]]


### Cross validation - Logistic Regression





In [38]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.87445 (+/- 0.17469)
F1 Score: 0.91578 (+/- 0.10102)
Precision: 0.88798 (+/- 0.20266)
Recall: 0.95471 (+/- 0.04723)


### Model Evaluation - Naive Bayes





In [39]:
modelNB = GaussianNB(var_smoothing=1e-08)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-08)

In [40]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [41]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.8572059997718718
Testing accuracy is  0.30941857480966095


In [42]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.30941857480966095
F1 Score: 0.03828131204828846
Precision Score: 0.35467255334805003
Recall Score: 0.02023254837761827
Confusion Matrix:
 [[10369   877]
 [23341   482]]


### Cross validation - Naive Bayes




In [43]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.82938 (+/- 0.17532)
F1 Score: 0.89003 (+/- 0.09636)
Precision: 0.81695 (+/- 0.16152)
Recall: 0.98364 (+/- 0.01598)


### Model Evaluation - Random Forest





In [44]:
modelRF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=0)

In [45]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [46]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  0.9833252537926315
Testing accuracy is  0.6793179161082438


In [47]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest





In [48]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.91406 (+/- 0.11375)
F1 Score: 0.93969 (+/- 0.07089)
Precision: 0.93720 (+/- 0.16841)
Recall: 0.94889 (+/- 0.06093)


### Model Evaluation - KNN

In [49]:
modelKNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=1)

In [50]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [51]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9509025322231094
Testing accuracy is  0.7251988936097408


In [52]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.7251988936097408
F1 Score: 0.7640130274016211
Precision Score: 0.9168919713177384
Recall Score: 0.6548293665785165
Confusion Matrix:
 [[ 9832  1414]
 [ 8223 15600]]


### Cross validation - KNN





In [53]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.86986 (+/- 0.11166)
F1 Score: 0.90762 (+/- 0.06506)
Precision: 0.90284 (+/- 0.15793)
Recall: 0.92009 (+/- 0.08658)


### Model Evaluation - CatBoost




In [54]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6621724	total: 8.01ms	remaining: 393ms
1:	learn: 0.6337545	total: 16.3ms	remaining: 392ms
2:	learn: 0.6076672	total: 24.5ms	remaining: 384ms
3:	learn: 0.5829667	total: 32.6ms	remaining: 375ms
4:	learn: 0.5598742	total: 40.9ms	remaining: 368ms
5:	learn: 0.5383965	total: 49.1ms	remaining: 360ms
6:	learn: 0.5184666	total: 57.6ms	remaining: 354ms
7:	learn: 0.4999077	total: 66ms	remaining: 347ms
8:	learn: 0.4823267	total: 74ms	remaining: 337ms
9:	learn: 0.4660265	total: 82ms	remaining: 328ms
10:	learn: 0.4505331	total: 90ms	remaining: 319ms
11:	learn: 0.4360135	total: 98.2ms	remaining: 311ms
12:	learn: 0.4221590	total: 106ms	remaining: 302ms
13:	learn: 0.4092269	total: 114ms	remaining: 293ms
14:	learn: 0.3972862	total: 122ms	remaining: 285ms
15:	learn: 0.3856464	total: 130ms	remaining: 277ms
16:	learn: 0.3747522	total: 138ms	remaining: 269ms
17:	learn: 0.3643412	total: 146ms	remaining: 260ms
18:	learn: 0.3544722	total: 155ms	remaining: 253ms
19:	learn: 0.3454472	total: 164ms	rem

<catboost.core.CatBoostClassifier at 0x7fa2c84d3fa0>

In [55]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [56]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9318253108246835
Testing accuracy is  0.6793179161082438


In [57]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - CatBoost




In [58]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6626683	total: 8.85ms	remaining: 434ms
1:	learn: 0.6347695	total: 17.8ms	remaining: 427ms
2:	learn: 0.6085419	total: 26.4ms	remaining: 414ms
3:	learn: 0.5841185	total: 34.9ms	remaining: 401ms
4:	learn: 0.5613876	total: 43.7ms	remaining: 393ms
5:	learn: 0.5402056	total: 52.2ms	remaining: 383ms
6:	learn: 0.5204676	total: 60.8ms	remaining: 374ms
7:	learn: 0.5020485	total: 69.8ms	remaining: 366ms
8:	learn: 0.4847949	total: 78.6ms	remaining: 358ms
9:	learn: 0.4687596	total: 87.2ms	remaining: 349ms
10:	learn: 0.4535197	total: 95.9ms	remaining: 340ms
11:	learn: 0.4392735	total: 105ms	remaining: 332ms
12:	learn: 0.4255397	total: 114ms	remaining: 324ms
13:	learn: 0.4128124	total: 122ms	remaining: 315ms
14:	learn: 0.4007888	total: 131ms	remaining: 306ms
15:	learn: 0.3892985	total: 140ms	remaining: 298ms
16:	learn: 0.3784652	total: 149ms	remaining: 289ms
17:	learn: 0.3682113	total: 158ms	remaining: 281ms
18:	learn: 0.3584747	total: 166ms	remaining: 272ms
19:	learn: 0.3494615	total: 17

13:	learn: 0.4134595	total: 124ms	remaining: 318ms
14:	learn: 0.4015059	total: 133ms	remaining: 310ms
15:	learn: 0.3900519	total: 142ms	remaining: 303ms
16:	learn: 0.3792888	total: 152ms	remaining: 295ms
17:	learn: 0.3690487	total: 161ms	remaining: 287ms
18:	learn: 0.3592813	total: 171ms	remaining: 279ms
19:	learn: 0.3501175	total: 180ms	remaining: 270ms
20:	learn: 0.3412764	total: 189ms	remaining: 261ms
21:	learn: 0.3331254	total: 198ms	remaining: 252ms
22:	learn: 0.3251787	total: 207ms	remaining: 243ms
23:	learn: 0.3178096	total: 216ms	remaining: 234ms
24:	learn: 0.3106986	total: 225ms	remaining: 225ms
25:	learn: 0.3038583	total: 234ms	remaining: 216ms
26:	learn: 0.2974556	total: 243ms	remaining: 207ms
27:	learn: 0.2913718	total: 252ms	remaining: 198ms
28:	learn: 0.2856392	total: 261ms	remaining: 189ms
29:	learn: 0.2800507	total: 271ms	remaining: 181ms
30:	learn: 0.2746658	total: 281ms	remaining: 172ms
31:	learn: 0.2694291	total: 290ms	remaining: 163ms
32:	learn: 0.2645145	total: 299

44:	learn: 0.2246437	total: 407ms	remaining: 45.2ms
45:	learn: 0.2222179	total: 416ms	remaining: 36.2ms
46:	learn: 0.2197529	total: 425ms	remaining: 27.2ms
47:	learn: 0.2174730	total: 435ms	remaining: 18.1ms
48:	learn: 0.2151337	total: 444ms	remaining: 9.06ms
49:	learn: 0.2129442	total: 453ms	remaining: 0us
0:	learn: 0.6608524	total: 8.64ms	remaining: 423ms
1:	learn: 0.6312620	total: 17.7ms	remaining: 425ms
2:	learn: 0.6038351	total: 26.8ms	remaining: 420ms
3:	learn: 0.5781213	total: 35.7ms	remaining: 411ms
4:	learn: 0.5541173	total: 44.7ms	remaining: 403ms
5:	learn: 0.5319179	total: 53.8ms	remaining: 395ms
6:	learn: 0.5110160	total: 62.9ms	remaining: 387ms
7:	learn: 0.4916583	total: 72.7ms	remaining: 382ms
8:	learn: 0.4734155	total: 81.9ms	remaining: 373ms
9:	learn: 0.4563257	total: 91.7ms	remaining: 367ms
10:	learn: 0.4402360	total: 101ms	remaining: 358ms
11:	learn: 0.4251930	total: 110ms	remaining: 349ms
12:	learn: 0.4109109	total: 120ms	remaining: 340ms
13:	learn: 0.3973587	total: 

12:	learn: 0.4255397	total: 113ms	remaining: 321ms
13:	learn: 0.4128124	total: 122ms	remaining: 314ms
14:	learn: 0.4007888	total: 132ms	remaining: 308ms
15:	learn: 0.3892985	total: 141ms	remaining: 301ms
16:	learn: 0.3784652	total: 150ms	remaining: 292ms
17:	learn: 0.3682113	total: 159ms	remaining: 283ms
18:	learn: 0.3584747	total: 168ms	remaining: 274ms
19:	learn: 0.3494615	total: 176ms	remaining: 265ms
20:	learn: 0.3406816	total: 186ms	remaining: 256ms
21:	learn: 0.3324791	total: 194ms	remaining: 248ms
22:	learn: 0.3245201	total: 203ms	remaining: 239ms
23:	learn: 0.3172089	total: 213ms	remaining: 230ms
24:	learn: 0.3100697	total: 222ms	remaining: 222ms
25:	learn: 0.3032365	total: 231ms	remaining: 213ms
26:	learn: 0.2968004	total: 240ms	remaining: 205ms
27:	learn: 0.2907907	total: 250ms	remaining: 196ms
28:	learn: 0.2849256	total: 259ms	remaining: 188ms
29:	learn: 0.2793387	total: 269ms	remaining: 179ms
30:	learn: 0.2740266	total: 278ms	remaining: 170ms
31:	learn: 0.2688801	total: 287

42:	learn: 0.2263899	total: 401ms	remaining: 65.2ms
43:	learn: 0.2233320	total: 410ms	remaining: 56ms
44:	learn: 0.2203962	total: 420ms	remaining: 46.6ms
45:	learn: 0.2179592	total: 430ms	remaining: 37.4ms
46:	learn: 0.2154120	total: 439ms	remaining: 28ms
47:	learn: 0.2130081	total: 448ms	remaining: 18.7ms
48:	learn: 0.2106230	total: 457ms	remaining: 9.33ms
49:	learn: 0.2083159	total: 466ms	remaining: 0us
0:	learn: 0.6627135	total: 9ms	remaining: 441ms
1:	learn: 0.6349155	total: 18.1ms	remaining: 435ms
2:	learn: 0.6087404	total: 26.9ms	remaining: 421ms
3:	learn: 0.5844462	total: 35.8ms	remaining: 411ms
4:	learn: 0.5617334	total: 44.5ms	remaining: 401ms
5:	learn: 0.5407032	total: 53.4ms	remaining: 392ms
6:	learn: 0.5209794	total: 62.2ms	remaining: 382ms
7:	learn: 0.5025802	total: 71.9ms	remaining: 378ms
8:	learn: 0.4852653	total: 81.5ms	remaining: 371ms
9:	learn: 0.4692210	total: 90.6ms	remaining: 362ms
10:	learn: 0.4539911	total: 99.5ms	remaining: 353ms
11:	learn: 0.4397695	total: 109m

21:	learn: 0.3130635	total: 207ms	remaining: 263ms
22:	learn: 0.3047549	total: 216ms	remaining: 254ms
23:	learn: 0.2973151	total: 226ms	remaining: 245ms
24:	learn: 0.2897820	total: 236ms	remaining: 236ms
25:	learn: 0.2825946	total: 246ms	remaining: 227ms
26:	learn: 0.2758431	total: 255ms	remaining: 218ms
27:	learn: 0.2694227	total: 265ms	remaining: 208ms
28:	learn: 0.2633216	total: 275ms	remaining: 199ms
29:	learn: 0.2575950	total: 284ms	remaining: 189ms
30:	learn: 0.2520491	total: 293ms	remaining: 180ms
31:	learn: 0.2467852	total: 303ms	remaining: 170ms
32:	learn: 0.2418776	total: 313ms	remaining: 161ms
33:	learn: 0.2370533	total: 323ms	remaining: 152ms
34:	learn: 0.2324422	total: 332ms	remaining: 142ms
35:	learn: 0.2280689	total: 341ms	remaining: 133ms
36:	learn: 0.2239362	total: 350ms	remaining: 123ms
37:	learn: 0.2198636	total: 359ms	remaining: 114ms
38:	learn: 0.2160571	total: 368ms	remaining: 104ms
39:	learn: 0.2125034	total: 377ms	remaining: 94.3ms
40:	learn: 0.2091721	total: 38

43:	learn: 0.2226750	total: 407ms	remaining: 55.5ms
44:	learn: 0.2197906	total: 416ms	remaining: 46.3ms
45:	learn: 0.2172964	total: 425ms	remaining: 37ms
46:	learn: 0.2145984	total: 435ms	remaining: 27.8ms
47:	learn: 0.2121441	total: 444ms	remaining: 18.5ms
48:	learn: 0.2097912	total: 453ms	remaining: 9.25ms
49:	learn: 0.2074758	total: 462ms	remaining: 0us
0:	learn: 0.6626588	total: 8.5ms	remaining: 417ms
1:	learn: 0.6346390	total: 16.9ms	remaining: 407ms
2:	learn: 0.6089787	total: 25.9ms	remaining: 407ms
3:	learn: 0.5845744	total: 34.7ms	remaining: 399ms
4:	learn: 0.5617911	total: 43.8ms	remaining: 395ms
5:	learn: 0.5406051	total: 52.5ms	remaining: 385ms
6:	learn: 0.5208216	total: 61.4ms	remaining: 377ms
7:	learn: 0.5022497	total: 69.9ms	remaining: 367ms
8:	learn: 0.4848507	total: 79.1ms	remaining: 361ms
9:	learn: 0.4687718	total: 88.4ms	remaining: 354ms
10:	learn: 0.4534689	total: 97.5ms	remaining: 346ms
11:	learn: 0.4391297	total: 107ms	remaining: 339ms
12:	learn: 0.4253816	total: 1

13:	learn: 0.4132682	total: 127ms	remaining: 328ms
14:	learn: 0.4012984	total: 137ms	remaining: 320ms
15:	learn: 0.3898243	total: 146ms	remaining: 311ms
16:	learn: 0.3790999	total: 156ms	remaining: 303ms
17:	learn: 0.3688995	total: 165ms	remaining: 293ms
18:	learn: 0.3591808	total: 174ms	remaining: 283ms
19:	learn: 0.3500322	total: 183ms	remaining: 274ms
20:	learn: 0.3411973	total: 193ms	remaining: 266ms
21:	learn: 0.3330977	total: 202ms	remaining: 257ms
22:	learn: 0.3251695	total: 211ms	remaining: 248ms
23:	learn: 0.3178576	total: 221ms	remaining: 239ms
24:	learn: 0.3107448	total: 230ms	remaining: 230ms
25:	learn: 0.3039471	total: 240ms	remaining: 221ms
26:	learn: 0.2974742	total: 250ms	remaining: 213ms
27:	learn: 0.2914630	total: 260ms	remaining: 204ms
28:	learn: 0.2856093	total: 269ms	remaining: 195ms
29:	learn: 0.2801855	total: 279ms	remaining: 186ms
30:	learn: 0.2748352	total: 289ms	remaining: 177ms
31:	learn: 0.2696796	total: 299ms	remaining: 168ms
32:	learn: 0.2648776	total: 309

44:	learn: 0.1966690	total: 409ms	remaining: 45.4ms
45:	learn: 0.1938564	total: 418ms	remaining: 36.4ms
46:	learn: 0.1910674	total: 428ms	remaining: 27.3ms
47:	learn: 0.1885005	total: 438ms	remaining: 18.2ms
48:	learn: 0.1859388	total: 448ms	remaining: 9.13ms
49:	learn: 0.1835664	total: 458ms	remaining: 0us
0:	learn: 0.6603454	total: 8.52ms	remaining: 418ms
1:	learn: 0.6300577	total: 17.7ms	remaining: 425ms
2:	learn: 0.6020536	total: 26.3ms	remaining: 411ms
3:	learn: 0.5759296	total: 34.7ms	remaining: 399ms
4:	learn: 0.5515923	total: 43.3ms	remaining: 390ms
5:	learn: 0.5288655	total: 52.3ms	remaining: 384ms
6:	learn: 0.5078090	total: 60.8ms	remaining: 373ms
7:	learn: 0.4879527	total: 69.7ms	remaining: 366ms
8:	learn: 0.4693924	total: 79.6ms	remaining: 363ms
9:	learn: 0.4519339	total: 89.3ms	remaining: 357ms
10:	learn: 0.4356305	total: 98.5ms	remaining: 349ms
11:	learn: 0.4206147	total: 108ms	remaining: 340ms
12:	learn: 0.4061128	total: 116ms	remaining: 331ms
13:	learn: 0.3924134	total:

12:	learn: 0.4253816	total: 122ms	remaining: 348ms
13:	learn: 0.4125066	total: 133ms	remaining: 342ms
14:	learn: 0.4005031	total: 143ms	remaining: 335ms
15:	learn: 0.3889728	total: 154ms	remaining: 327ms
16:	learn: 0.3781254	total: 163ms	remaining: 316ms
17:	learn: 0.3678450	total: 173ms	remaining: 307ms
18:	learn: 0.3580610	total: 183ms	remaining: 298ms
19:	learn: 0.3487575	total: 193ms	remaining: 289ms
20:	learn: 0.3398732	total: 202ms	remaining: 279ms
21:	learn: 0.3316291	total: 211ms	remaining: 269ms
22:	learn: 0.3236513	total: 221ms	remaining: 259ms
23:	learn: 0.3162681	total: 230ms	remaining: 249ms
24:	learn: 0.3091466	total: 240ms	remaining: 240ms
25:	learn: 0.3022905	total: 249ms	remaining: 230ms
26:	learn: 0.2958687	total: 259ms	remaining: 220ms
27:	learn: 0.2897845	total: 268ms	remaining: 211ms
28:	learn: 0.2838745	total: 278ms	remaining: 201ms
29:	learn: 0.2782733	total: 288ms	remaining: 192ms
30:	learn: 0.2729761	total: 298ms	remaining: 183ms
31:	learn: 0.2678278	total: 308

43:	learn: 0.2241054	total: 407ms	remaining: 55.4ms
44:	learn: 0.2212268	total: 416ms	remaining: 46.2ms
45:	learn: 0.2187701	total: 425ms	remaining: 37ms
46:	learn: 0.2162267	total: 435ms	remaining: 27.8ms
47:	learn: 0.2136747	total: 445ms	remaining: 18.5ms
48:	learn: 0.2113290	total: 454ms	remaining: 9.26ms
49:	learn: 0.2091243	total: 463ms	remaining: 0us
0:	learn: 0.6628735	total: 8.84ms	remaining: 433ms
1:	learn: 0.6352448	total: 17.6ms	remaining: 422ms
2:	learn: 0.6092093	total: 26.1ms	remaining: 409ms
3:	learn: 0.5850384	total: 35.1ms	remaining: 403ms
4:	learn: 0.5625888	total: 44ms	remaining: 396ms
5:	learn: 0.5416177	total: 53.7ms	remaining: 394ms
6:	learn: 0.5219951	total: 63.1ms	remaining: 388ms
7:	learn: 0.5037003	total: 73ms	remaining: 383ms
8:	learn: 0.4865121	total: 82.7ms	remaining: 377ms
9:	learn: 0.4705884	total: 92.6ms	remaining: 370ms
10:	learn: 0.4554564	total: 103ms	remaining: 364ms
11:	learn: 0.4412864	total: 112ms	remaining: 355ms
12:	learn: 0.4277104	total: 122ms

8:	learn: 0.4693924	total: 82.5ms	remaining: 376ms
9:	learn: 0.4519339	total: 92.4ms	remaining: 370ms
10:	learn: 0.4356305	total: 103ms	remaining: 364ms
11:	learn: 0.4206147	total: 112ms	remaining: 355ms
12:	learn: 0.4061128	total: 121ms	remaining: 345ms
13:	learn: 0.3924134	total: 130ms	remaining: 335ms
14:	learn: 0.3797550	total: 140ms	remaining: 326ms
15:	learn: 0.3676851	total: 149ms	remaining: 316ms
16:	learn: 0.3561403	total: 158ms	remaining: 307ms
17:	learn: 0.3452559	total: 167ms	remaining: 297ms
18:	learn: 0.3349388	total: 177ms	remaining: 288ms
19:	learn: 0.3251319	total: 186ms	remaining: 279ms
20:	learn: 0.3158177	total: 195ms	remaining: 269ms
21:	learn: 0.3071598	total: 204ms	remaining: 260ms
22:	learn: 0.2988130	total: 213ms	remaining: 250ms
23:	learn: 0.2909467	total: 223ms	remaining: 241ms
24:	learn: 0.2833050	total: 232ms	remaining: 232ms
25:	learn: 0.2761980	total: 241ms	remaining: 223ms
26:	learn: 0.2694818	total: 250ms	remaining: 213ms
27:	learn: 0.2631576	total: 260

In [59]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.91978 (+/- 0.18929)
F1 Score: 0.94797 (+/- 0.11628)
Precision: 0.91821 (+/- 0.21401)
Recall: 0.98806 (+/- 0.01947)
