## Exhaustive Feature Selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS

## Read Data

In [2]:
data = pd.read_csv('../Kyoto_Train.csv')
data.shape

(124055, 24)

In [3]:
data.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,2.863309,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,90847.0,14024.0,25836.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,218531.0,8968.0,45541.0,1.0,0.0
2,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,176665.0,15997.0,59860.0,1.0,0.0
3,0.0,0,0,0,0.0,0.0,0.67,49,100,0.02,...,6.0,6.0,0.0,0.0,0.0,52769.0,473.0,40649.0,1.0,0.0
4,0.0,0,0,1,1.0,0.0,0.36,0,2,0.0,...,6.0,0.0,0.0,0.0,0.0,65048.0,16609.0,39283.0,1.0,0.0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),
    data['Label_code'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((99244, 23), (24811, 23))

### Remove correlated features

The Exhaustive Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_test, 0.6)
print('correlated features: ', len(set(corr_features)) )

correlated features:  5


In [6]:
# removed correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((99244, 18), (24811, 18))

###  Exhaustive Feature Selection

In [7]:
# 1. the algorithm to create, in this case RandomForests
# 2. the number of minimum features we want our model to have
# 3. the number of maximum features we want our model to have 
# 4. the evaluation metric: in this case the roc_auc
# 5. the cross-validation

efs = EFS(RandomForestClassifier(n_estimators=5,
                                 n_jobs=4,
                                 random_state=0,
                                 max_depth=2),
          min_features=3,
          max_features=4,
          scoring='roc_auc',
          print_progress=True,
          cv=2)

efs = efs.fit(np.array(X_test), y_test)

Features: 3876/3876

The log above means that the search evaluated 3876 feature combinations!

In [8]:
efs.best_idx_

(2, 15, 16, 17)

In [9]:
selected_feat = X_test.columns[list(efs.best_idx_)]
selected_feat

Index(['Destination_bytes', 'Source_IP_Address_code',
       'Destination_IP_Address_code', 'Start_Time_code'],
      dtype='object')

### Compare performance of feature subsets

In [10]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [11]:
# evaluate performance of classifier using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9998130780656419
Test set
Random Forests roc-auc: 0.9997878869985705


In [12]:
# and for comparison, we train random forests using
# all features (except the correlated ones, which we removed already)

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9916585994842093
Test set
Random Forests roc-auc: 0.9914832971491265


Even with 4 features, the performance is not super far off of that model built using 18 features.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((99244, 4), (24811, 4))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers




In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation



In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 62 ms, sys: 229 ms, total: 291 ms
Wall time: 1.52 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.09447422514207408
F1 Score: 0.17208976673913845
FPR: 0.9995995728777363
TPR: 1.0


### Naive Bayes




In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 195 ms, sys: 11.1 ms, total: 206 ms
Wall time: 25.7 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.8986336705493531
F1 Score: 0.0055357848952155
FPR: 0.00831998576259121
TPR: 0.0029978586723768737


### Random Forest




In [22]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 4.97 s, sys: 52.9 ms, total: 5.02 s
Wall time: 4.39 s


In [23]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN




In [24]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 3.18 s, sys: 16.6 ms, total: 3.19 s
Wall time: 3.18 s


In [25]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.0
fpr: 1.0
tpr: 1.0


### CatBoost





In [26]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6571793	total: 66.8ms	remaining: 3.27s
1:	learn: 0.6216429	total: 73.6ms	remaining: 1.77s
2:	learn: 0.5884761	total: 80.3ms	remaining: 1.26s
3:	learn: 0.5577450	total: 87.2ms	remaining: 1s
4:	learn: 0.5291308	total: 94.4ms	remaining: 849ms
5:	learn: 0.5037519	total: 101ms	remaining: 740ms
6:	learn: 0.4786990	total: 108ms	remaining: 666ms
7:	learn: 0.4552436	total: 117ms	remaining: 612ms
8:	learn: 0.4344983	total: 123ms	remaining: 560ms
9:	learn: 0.4142154	total: 129ms	remaining: 517ms
10:	learn: 0.3946655	total: 136ms	remaining: 481ms
11:	learn: 0.3771641	total: 143ms	remaining: 454ms
12:	learn: 0.3598489	total: 150ms	remaining: 428ms
13:	learn: 0.3437185	total: 159ms	remaining: 408ms
14:	learn: 0.3285233	total: 165ms	remaining: 386ms
15:	learn: 0.3143187	total: 172ms	remaining: 365ms
16:	learn: 0.3000410	total: 179ms	remaining: 347ms
17:	learn: 0.2866818	total: 185ms	remaining: 329ms
18:	learn: 0.2745100	total: 191ms	remaining: 312ms
19:	learn: 0.2623236	total: 198ms	remai

In [27]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


## Model Evaluation




In [100]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [101]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code'])

In [102]:
X_eval = X_eval[selected_feat]

In [103]:
X_eval.shape

(62028, 4)

### Model Evaluation - Logistic Regression



In [104]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [105]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [106]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9054955463302568
Testing accuracy is  0.09447422514207408


In [125]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR, average='weighted', labels=np.unique(y_predLR), zero_division=1))
print('Precision Score:',precision_score(y_test, y_predLR, average='weighted', labels=np.unique(y_predLR), zero_division=1))
print('Recall Score:', recall_score(y_test, y_predLR, average='weighted', labels=np.unique(y_predLR), zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.09447422514207408
F1 Score: 0.01692081740085793
Precision Score: 0.9147487023508888
Recall Score: 0.09447422514207408
Confusion Matrix:
 [[    9 22467]
 [    0  2335]]


### Cross validation - Logistic Regression





In [131]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=2, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=2, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=2, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=2, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.90351 (+/- 0.00003)
F1 Score: 0.00000 (+/- 0.00000)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.00000 (+/- 0.00000)
Recall: 0.00000 (+/- 0.00000)


### Model Evaluation - Naive Bayes





In [37]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [38]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [39]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.39783765265406473
Testing accuracy is  0.8986336705493531


In [76]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB, average='weighted', zero_division=1))
print('Precision Score:',precision_score(y_test, y_predNB, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predNB, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.8986336705493531
F1 Score: 0.8580305538826403
Precision Score: 0.8236155078436386
Recall Score: 0.8986336705493531
Confusion Matrix:
 [[22289   187]
 [ 2328     7]]


### Cross validation - Naive Bayes




In [66]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.86297 (+/- 0.23000)
F1 Score: 0.02521 (+/- 0.15128)
Precision: 0.01482 (+/- 0.08894)
Recall: 0.08428 (+/- 0.50569)


### Model Evaluation - Random Forest





In [42]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [43]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [44]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [45]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=1))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest





In [46]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99994 (+/- 0.00030)
F1 Score: 0.99967 (+/- 0.00153)
Precision: 0.99933 (+/- 0.00305)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN

In [60]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [61]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [62]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9958889202369916
Testing accuracy is  0.9058885171899561


In [63]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Precision Score:', precision_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Recall Score:', recall_score(y_test, y_predKNN, average='weighted', zero_division=1))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.9147454883866614
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - KNN





In [64]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98967 (+/- 0.00206)
F1 Score: 0.94494 (+/- 0.01123)
Precision: 0.97209 (+/- 0.00919)
Recall: 0.91930 (+/- 0.01733)


### Model Evaluation - CatBoost




In [52]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6571793	total: 7.18ms	remaining: 352ms
1:	learn: 0.6216429	total: 14.1ms	remaining: 338ms
2:	learn: 0.5884761	total: 21ms	remaining: 330ms
3:	learn: 0.5577450	total: 27.9ms	remaining: 320ms
4:	learn: 0.5291308	total: 34.4ms	remaining: 310ms
5:	learn: 0.5037519	total: 41ms	remaining: 301ms
6:	learn: 0.4786990	total: 47.6ms	remaining: 292ms
7:	learn: 0.4552436	total: 54.8ms	remaining: 288ms
8:	learn: 0.4344983	total: 61.8ms	remaining: 282ms
9:	learn: 0.4142154	total: 68.8ms	remaining: 275ms
10:	learn: 0.3946655	total: 75.6ms	remaining: 268ms
11:	learn: 0.3771641	total: 82.3ms	remaining: 261ms
12:	learn: 0.3598489	total: 89.1ms	remaining: 254ms
13:	learn: 0.3437185	total: 96.1ms	remaining: 247ms
14:	learn: 0.3285233	total: 103ms	remaining: 239ms
15:	learn: 0.3143187	total: 109ms	remaining: 233ms
16:	learn: 0.3000410	total: 117ms	remaining: 226ms
17:	learn: 0.2866818	total: 123ms	remaining: 219ms
18:	learn: 0.2745100	total: 130ms	remaining: 212ms
19:	learn: 0.2623236	total: 137

<catboost.core.CatBoostClassifier at 0x7f8788683be0>

In [53]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [54]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.997531336907017
Testing accuracy is  0.9058885171899561


In [55]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - CatBoost




In [56]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6570021	total: 5.45ms	remaining: 267ms
1:	learn: 0.6215673	total: 12.8ms	remaining: 307ms
2:	learn: 0.5892114	total: 19.5ms	remaining: 306ms
3:	learn: 0.5587608	total: 24.7ms	remaining: 284ms
4:	learn: 0.5298643	total: 29.6ms	remaining: 266ms
5:	learn: 0.5044294	total: 34.9ms	remaining: 256ms
6:	learn: 0.4800640	total: 39.7ms	remaining: 244ms
7:	learn: 0.4564764	total: 44.8ms	remaining: 235ms
8:	learn: 0.4358920	total: 50ms	remaining: 228ms
9:	learn: 0.4156226	total: 55.3ms	remaining: 221ms
10:	learn: 0.3962649	total: 60.3ms	remaining: 214ms
11:	learn: 0.3789242	total: 65.4ms	remaining: 207ms
12:	learn: 0.3614667	total: 70.9ms	remaining: 202ms
13:	learn: 0.3454016	total: 76.3ms	remaining: 196ms
14:	learn: 0.3301660	total: 81.6ms	remaining: 190ms
15:	learn: 0.3159118	total: 87ms	remaining: 185ms
16:	learn: 0.3018089	total: 92.8ms	remaining: 180ms
17:	learn: 0.2886738	total: 98.3ms	remaining: 175ms
18:	learn: 0.2766493	total: 103ms	remaining: 169ms
19:	learn: 0.2642994	total:

19:	learn: 0.2646175	total: 100ms	remaining: 150ms
20:	learn: 0.2530819	total: 105ms	remaining: 145ms
21:	learn: 0.2418427	total: 110ms	remaining: 140ms
22:	learn: 0.2313991	total: 115ms	remaining: 135ms
23:	learn: 0.2218209	total: 119ms	remaining: 129ms
24:	learn: 0.2122652	total: 124ms	remaining: 124ms
25:	learn: 0.2032707	total: 129ms	remaining: 119ms
26:	learn: 0.1956173	total: 134ms	remaining: 114ms
27:	learn: 0.1879770	total: 139ms	remaining: 109ms
28:	learn: 0.1802489	total: 144ms	remaining: 104ms
29:	learn: 0.1728941	total: 148ms	remaining: 99ms
30:	learn: 0.1658124	total: 153ms	remaining: 94ms
31:	learn: 0.1595253	total: 158ms	remaining: 89ms
32:	learn: 0.1531668	total: 163ms	remaining: 84ms
33:	learn: 0.1472123	total: 168ms	remaining: 79ms
34:	learn: 0.1412887	total: 173ms	remaining: 74ms
35:	learn: 0.1356888	total: 177ms	remaining: 68.9ms
36:	learn: 0.1304703	total: 182ms	remaining: 64ms
37:	learn: 0.1251416	total: 187ms	remaining: 59ms
38:	learn: 0.1200562	total: 192ms	rema

43:	learn: 0.0994011	total: 224ms	remaining: 30.5ms
44:	learn: 0.0960145	total: 229ms	remaining: 25.4ms
45:	learn: 0.0925985	total: 234ms	remaining: 20.3ms
46:	learn: 0.0893731	total: 238ms	remaining: 15.2ms
47:	learn: 0.0864039	total: 243ms	remaining: 10.1ms
48:	learn: 0.0832029	total: 248ms	remaining: 5.07ms
49:	learn: 0.0798007	total: 254ms	remaining: 0us
0:	learn: 0.6570498	total: 5.78ms	remaining: 283ms
1:	learn: 0.6210722	total: 11.7ms	remaining: 280ms
2:	learn: 0.5885088	total: 17.1ms	remaining: 267ms
3:	learn: 0.5579559	total: 22.8ms	remaining: 262ms
4:	learn: 0.5300598	total: 28.4ms	remaining: 256ms
5:	learn: 0.5043477	total: 34.7ms	remaining: 254ms
6:	learn: 0.4803104	total: 40.4ms	remaining: 248ms
7:	learn: 0.4565615	total: 45.7ms	remaining: 240ms
8:	learn: 0.4348832	total: 50.7ms	remaining: 231ms
9:	learn: 0.4147904	total: 55.9ms	remaining: 224ms
10:	learn: 0.3964124	total: 61.1ms	remaining: 216ms
11:	learn: 0.3791560	total: 66.5ms	remaining: 211ms
12:	learn: 0.3615529	tota

19:	learn: 0.2642994	total: 105ms	remaining: 157ms
20:	learn: 0.2538749	total: 110ms	remaining: 153ms
21:	learn: 0.2429394	total: 116ms	remaining: 147ms
22:	learn: 0.2325566	total: 121ms	remaining: 142ms
23:	learn: 0.2236365	total: 126ms	remaining: 136ms
24:	learn: 0.2139226	total: 131ms	remaining: 131ms
25:	learn: 0.2048775	total: 136ms	remaining: 126ms
26:	learn: 0.1968582	total: 141ms	remaining: 120ms
27:	learn: 0.1894320	total: 146ms	remaining: 115ms
28:	learn: 0.1823072	total: 151ms	remaining: 110ms
29:	learn: 0.1751400	total: 156ms	remaining: 104ms
30:	learn: 0.1679383	total: 161ms	remaining: 99ms
31:	learn: 0.1616586	total: 167ms	remaining: 93.8ms
32:	learn: 0.1554048	total: 172ms	remaining: 88.7ms
33:	learn: 0.1495358	total: 178ms	remaining: 83.7ms
34:	learn: 0.1432280	total: 183ms	remaining: 78.4ms
35:	learn: 0.1377971	total: 188ms	remaining: 73.1ms
36:	learn: 0.1327471	total: 193ms	remaining: 67.9ms
37:	learn: 0.1271494	total: 199ms	remaining: 62.7ms
38:	learn: 0.1220659	tota

37:	learn: 0.1251416	total: 199ms	remaining: 62.8ms
38:	learn: 0.1200562	total: 204ms	remaining: 57.5ms
39:	learn: 0.1150805	total: 209ms	remaining: 52.3ms
40:	learn: 0.1104162	total: 214ms	remaining: 47ms
41:	learn: 0.1063421	total: 219ms	remaining: 41.7ms
42:	learn: 0.1022293	total: 224ms	remaining: 36.5ms
43:	learn: 0.0984506	total: 230ms	remaining: 31.3ms
44:	learn: 0.0950982	total: 235ms	remaining: 26.1ms
45:	learn: 0.0912542	total: 240ms	remaining: 20.9ms
46:	learn: 0.0879828	total: 245ms	remaining: 15.7ms
47:	learn: 0.0850154	total: 251ms	remaining: 10.5ms
48:	learn: 0.0815504	total: 256ms	remaining: 5.22ms
49:	learn: 0.0783667	total: 261ms	remaining: 0us
0:	learn: 0.6573022	total: 5.63ms	remaining: 276ms
1:	learn: 0.6217953	total: 11ms	remaining: 264ms
2:	learn: 0.5889102	total: 16.4ms	remaining: 257ms
3:	learn: 0.5585467	total: 21.8ms	remaining: 250ms
4:	learn: 0.5315661	total: 26.9ms	remaining: 242ms
5:	learn: 0.5061442	total: 32.6ms	remaining: 239ms
6:	learn: 0.4827555	total

2:	learn: 0.5885088	total: 15.2ms	remaining: 238ms
3:	learn: 0.5579559	total: 20.3ms	remaining: 234ms
4:	learn: 0.5300598	total: 25.3ms	remaining: 227ms
5:	learn: 0.5043477	total: 30.3ms	remaining: 222ms
6:	learn: 0.4803104	total: 35.3ms	remaining: 217ms
7:	learn: 0.4565615	total: 40.3ms	remaining: 212ms
8:	learn: 0.4348832	total: 45.3ms	remaining: 207ms
9:	learn: 0.4147904	total: 50ms	remaining: 200ms
10:	learn: 0.3964124	total: 55.2ms	remaining: 196ms
11:	learn: 0.3791560	total: 60ms	remaining: 190ms
12:	learn: 0.3615529	total: 65.1ms	remaining: 185ms
13:	learn: 0.3453082	total: 70.5ms	remaining: 181ms
14:	learn: 0.3301921	total: 75.4ms	remaining: 176ms
15:	learn: 0.3157929	total: 80.5ms	remaining: 171ms
16:	learn: 0.3014285	total: 85.8ms	remaining: 166ms
17:	learn: 0.2883843	total: 90.9ms	remaining: 162ms
18:	learn: 0.2764864	total: 95.8ms	remaining: 156ms
19:	learn: 0.2654412	total: 101ms	remaining: 151ms
20:	learn: 0.2538746	total: 106ms	remaining: 146ms
21:	learn: 0.2429486	total

22:	learn: 0.2325566	total: 112ms	remaining: 132ms
23:	learn: 0.2236365	total: 117ms	remaining: 127ms
24:	learn: 0.2139226	total: 122ms	remaining: 122ms
25:	learn: 0.2048775	total: 126ms	remaining: 117ms
26:	learn: 0.1968582	total: 131ms	remaining: 112ms
27:	learn: 0.1894320	total: 136ms	remaining: 107ms
28:	learn: 0.1823072	total: 141ms	remaining: 102ms
29:	learn: 0.1751400	total: 146ms	remaining: 97.2ms
30:	learn: 0.1679383	total: 151ms	remaining: 92.4ms
31:	learn: 0.1616586	total: 156ms	remaining: 87.6ms
32:	learn: 0.1554048	total: 161ms	remaining: 82.7ms
33:	learn: 0.1495358	total: 165ms	remaining: 77.8ms
34:	learn: 0.1432280	total: 170ms	remaining: 72.8ms
35:	learn: 0.1377971	total: 175ms	remaining: 68.1ms
36:	learn: 0.1327471	total: 180ms	remaining: 63.3ms
37:	learn: 0.1271494	total: 186ms	remaining: 58.6ms
38:	learn: 0.1220659	total: 190ms	remaining: 53.7ms
39:	learn: 0.1170519	total: 195ms	remaining: 48.8ms
40:	learn: 0.1124025	total: 200ms	remaining: 43.9ms
41:	learn: 0.108295

39:	learn: 0.1150805	total: 209ms	remaining: 52.3ms
40:	learn: 0.1104162	total: 215ms	remaining: 47.1ms
41:	learn: 0.1063421	total: 220ms	remaining: 41.9ms
42:	learn: 0.1022293	total: 225ms	remaining: 36.7ms
43:	learn: 0.0984506	total: 230ms	remaining: 31.4ms
44:	learn: 0.0950982	total: 236ms	remaining: 26.2ms
45:	learn: 0.0912542	total: 241ms	remaining: 21ms
46:	learn: 0.0879828	total: 246ms	remaining: 15.7ms
47:	learn: 0.0850154	total: 252ms	remaining: 10.5ms
48:	learn: 0.0815504	total: 257ms	remaining: 5.24ms
49:	learn: 0.0783667	total: 262ms	remaining: 0us
0:	learn: 0.6573022	total: 5.01ms	remaining: 245ms
1:	learn: 0.6217953	total: 10.6ms	remaining: 255ms
2:	learn: 0.5889102	total: 15.9ms	remaining: 249ms
3:	learn: 0.5585467	total: 21.1ms	remaining: 242ms
4:	learn: 0.5315661	total: 26.3ms	remaining: 237ms
5:	learn: 0.5061442	total: 31.3ms	remaining: 230ms
6:	learn: 0.4827555	total: 36.3ms	remaining: 223ms
7:	learn: 0.4590165	total: 41.3ms	remaining: 217ms
8:	learn: 0.4378194	total

0:	learn: 0.6570498	total: 5.81ms	remaining: 285ms
1:	learn: 0.6210722	total: 11ms	remaining: 264ms
2:	learn: 0.5885088	total: 15.9ms	remaining: 249ms
3:	learn: 0.5579559	total: 21ms	remaining: 242ms
4:	learn: 0.5300598	total: 26.7ms	remaining: 240ms
5:	learn: 0.5043477	total: 31.9ms	remaining: 234ms
6:	learn: 0.4803104	total: 37.5ms	remaining: 230ms
7:	learn: 0.4565615	total: 42.2ms	remaining: 222ms
8:	learn: 0.4348832	total: 48.5ms	remaining: 221ms
9:	learn: 0.4147904	total: 53.1ms	remaining: 212ms
10:	learn: 0.3964124	total: 57.8ms	remaining: 205ms
11:	learn: 0.3791560	total: 62.8ms	remaining: 199ms
12:	learn: 0.3615529	total: 67.7ms	remaining: 193ms
13:	learn: 0.3453082	total: 72.6ms	remaining: 187ms
14:	learn: 0.3301921	total: 77.9ms	remaining: 182ms
15:	learn: 0.3157929	total: 83.3ms	remaining: 177ms
16:	learn: 0.3014285	total: 88.4ms	remaining: 172ms
17:	learn: 0.2883843	total: 93.5ms	remaining: 166ms
18:	learn: 0.2764864	total: 98.7ms	remaining: 161ms
19:	learn: 0.2654412	total

20:	learn: 0.2538749	total: 105ms	remaining: 145ms
21:	learn: 0.2429394	total: 110ms	remaining: 140ms
22:	learn: 0.2325566	total: 115ms	remaining: 135ms
23:	learn: 0.2236365	total: 119ms	remaining: 129ms
24:	learn: 0.2139226	total: 124ms	remaining: 124ms
25:	learn: 0.2048775	total: 129ms	remaining: 119ms
26:	learn: 0.1968582	total: 134ms	remaining: 114ms
27:	learn: 0.1894320	total: 139ms	remaining: 110ms
28:	learn: 0.1823072	total: 144ms	remaining: 104ms
29:	learn: 0.1751400	total: 149ms	remaining: 99.5ms
30:	learn: 0.1679383	total: 154ms	remaining: 94.5ms
31:	learn: 0.1616586	total: 159ms	remaining: 89.5ms
32:	learn: 0.1554048	total: 164ms	remaining: 84.4ms
33:	learn: 0.1495358	total: 169ms	remaining: 79.6ms
34:	learn: 0.1432280	total: 174ms	remaining: 74.6ms
35:	learn: 0.1377971	total: 179ms	remaining: 69.6ms
36:	learn: 0.1327471	total: 184ms	remaining: 64.6ms
37:	learn: 0.1271494	total: 189ms	remaining: 59.6ms
38:	learn: 0.1220659	total: 194ms	remaining: 54.7ms
39:	learn: 0.1170519	

38:	learn: 0.1200562	total: 205ms	remaining: 57.7ms
39:	learn: 0.1150805	total: 210ms	remaining: 52.5ms
40:	learn: 0.1104162	total: 215ms	remaining: 47.3ms
41:	learn: 0.1063421	total: 221ms	remaining: 42.1ms
42:	learn: 0.1022293	total: 226ms	remaining: 36.8ms
43:	learn: 0.0984506	total: 231ms	remaining: 31.5ms
44:	learn: 0.0950982	total: 236ms	remaining: 26.3ms
45:	learn: 0.0912542	total: 242ms	remaining: 21ms
46:	learn: 0.0879828	total: 248ms	remaining: 15.8ms
47:	learn: 0.0850154	total: 253ms	remaining: 10.5ms
48:	learn: 0.0815504	total: 258ms	remaining: 5.27ms
49:	learn: 0.0783667	total: 264ms	remaining: 0us
0:	learn: 0.6573022	total: 5.25ms	remaining: 257ms
1:	learn: 0.6217953	total: 10.4ms	remaining: 249ms
2:	learn: 0.5889102	total: 15.8ms	remaining: 247ms
3:	learn: 0.5585467	total: 20.8ms	remaining: 240ms
4:	learn: 0.5315661	total: 25.9ms	remaining: 233ms
5:	learn: 0.5061442	total: 31.8ms	remaining: 233ms
6:	learn: 0.4827555	total: 37.1ms	remaining: 228ms
7:	learn: 0.4590165	tota

17:	learn: 0.2883843	total: 90.9ms	remaining: 162ms
18:	learn: 0.2764864	total: 95.8ms	remaining: 156ms
19:	learn: 0.2654412	total: 101ms	remaining: 151ms
20:	learn: 0.2538746	total: 106ms	remaining: 147ms
21:	learn: 0.2429486	total: 111ms	remaining: 141ms
22:	learn: 0.2324772	total: 116ms	remaining: 136ms
23:	learn: 0.2229069	total: 121ms	remaining: 131ms
24:	learn: 0.2133663	total: 127ms	remaining: 127ms
25:	learn: 0.2043438	total: 131ms	remaining: 121ms
26:	learn: 0.1966061	total: 137ms	remaining: 116ms
27:	learn: 0.1892058	total: 142ms	remaining: 111ms
28:	learn: 0.1821090	total: 146ms	remaining: 106ms
29:	learn: 0.1746902	total: 152ms	remaining: 101ms
30:	learn: 0.1672056	total: 157ms	remaining: 96.1ms
31:	learn: 0.1608900	total: 162ms	remaining: 91ms
32:	learn: 0.1540588	total: 167ms	remaining: 85.9ms
33:	learn: 0.1481170	total: 172ms	remaining: 80.9ms
34:	learn: 0.1419544	total: 177ms	remaining: 75.8ms
35:	learn: 0.1365230	total: 182ms	remaining: 70.8ms
36:	learn: 0.1312970	tota

In [57]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99797 (+/- 0.00099)
F1 Score: 0.98936 (+/- 0.00525)
Precision: 0.99983 (+/- 0.00102)
Recall: 0.97911 (+/- 0.01017)
