### Step forward feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Read Data

In [2]:
data = pd.read_csv('../Kyoto_Train.csv')
data.shape

(124055, 24)

In [3]:
data.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,2.863309,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,90847.0,14024.0,25836.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,218531.0,8968.0,45541.0,1.0,0.0
2,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,176665.0,15997.0,59860.0,1.0,0.0
3,0.0,0,0,0,0.0,0.0,0.67,49,100,0.02,...,6.0,6.0,0.0,0.0,0.0,52769.0,473.0,40649.0,1.0,0.0
4,0.0,0,0,1,1.0,0.0,0.36,0,2,0.0,...,6.0,0.0,0.0,0.0,0.0,65048.0,16609.0,39283.0,1.0,0.0


### Train - Test Split

In [4]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),
    data['Label_code'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((99244, 23), (24811, 23))

### Remove Correlated features

Step Forward Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
# remove correlated features to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  2


In [6]:
corr_features

{'Dst_host_srv_count', 'Dst_host_srv_serror_rate'}

In [7]:
# remove correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((99244, 21), (24811, 21))

### Step Forward Feature Selection

In [8]:
# 1. the algorithm used is RandomForests
# 2. the stopping criteria: 11 features 
# 3. to perform step forward or step backward
# 4. the evaluation metric: the roc_auc
# 5. and the cross-validation
# this is going to take a while

sfs = SFS(RandomForestClassifier(n_estimators=15, n_jobs=4, random_state=0), 
           k_features=15,  # the more features we want, the longer it will take to run
           forward=True, 
           floating=False, # see the docs for more details in this parameter
           verbose=2,      # this indicates how much to print out intermediate steps
           scoring='roc_auc',
           cv=2)

sfs = sfs.fit(np.array(X_train), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  21 out of  21 | elapsed:    7.7s finished

[2021-05-29 14:26:46] Features: 1/15 -- score: 0.9763744015609426[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    7.0s finished

[2021-05-29 14:26:53] Features: 2/15 -- score: 0.9999942392105073[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  19 out of  19 | elapsed:    5.0s finished

[2021-05-29 14:26:58] Features: 3/15 -- score: 0.9999998433757996[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

From the output above, we can see that after adding the 10º feature, the performance begins to plateau. Adding the 11th feature did not increase the performance.

In [9]:
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['Duration', 'Source', 'Count', 'Same_srv_rate', 'Serror_rate',
       'Dst_host_count', 'Dst_host_serror_rate', 'Destination_Port_Number',
       'IDS_detection_code', 'Malware_detection_code', 'Ashula_detection_code',
       'Source_IP_Address_code', 'Destination_IP_Address_code',
       'Start_Time_code', 'Protocol_code'],
      dtype='object')

### Compare performance of feature subsets

In [10]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [11]:
# evaluate performance of algorithm built using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9899091223702193
Test set
Random Forests roc-auc: 0.9901265894660705


In [12]:
# and for comparison, the result with train random forests using all features
# except the correlated ones, which was removed already

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9899771100538453
Test set
Random Forests roc-auc: 0.989399837580738


As you can see, in this dataset, with 15 characteristics, we get slightly better performance compared to all the variables in the dataset.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((99244, 15), (24811, 15))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers


In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 64.3 ms, sys: 181 ms, total: 245 ms
Wall time: 1.85 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.0
FPR: 1.0
TPR: 1.0


### Naive Bayes




In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 27.3 ms, sys: 3.71 ms, total: 31 ms
Wall time: 29.2 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9022610938696546
F1 Score: 0.0016467682173734047
FPR: 0.00409325502758498
TPR: 0.0008565310492505353


### Random Forest





In [24]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 4.85 s, sys: 37.5 ms, total: 4.88 s
Wall time: 4.89 s


In [61]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=1)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN




In [62]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 9.22 s, sys: 35.2 ms, total: 9.26 s
Wall time: 9.24 s


In [63]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN, average='weighted', zero_division=1)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.8611563563923045
fpr: 1.0
tpr: 1.0


### CatBoost




In [64]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6569262	total: 8.86ms	remaining: 434ms
1:	learn: 0.6218588	total: 16.8ms	remaining: 403ms
2:	learn: 0.5907602	total: 24.7ms	remaining: 386ms
3:	learn: 0.5601849	total: 32.5ms	remaining: 373ms
4:	learn: 0.5312364	total: 39.8ms	remaining: 359ms
5:	learn: 0.5045073	total: 46.9ms	remaining: 344ms
6:	learn: 0.4795468	total: 54.2ms	remaining: 333ms
7:	learn: 0.4575675	total: 61.2ms	remaining: 321ms
8:	learn: 0.4368898	total: 68.2ms	remaining: 311ms
9:	learn: 0.4161666	total: 75.4ms	remaining: 302ms
10:	learn: 0.3959893	total: 82.6ms	remaining: 293ms
11:	learn: 0.3777337	total: 90ms	remaining: 285ms
12:	learn: 0.3601769	total: 97.1ms	remaining: 276ms
13:	learn: 0.3449775	total: 104ms	remaining: 268ms
14:	learn: 0.3292640	total: 111ms	remaining: 259ms
15:	learn: 0.3144322	total: 118ms	remaining: 252ms
16:	learn: 0.3005974	total: 125ms	remaining: 243ms
17:	learn: 0.2884532	total: 132ms	remaining: 236ms
18:	learn: 0.2768883	total: 140ms	remaining: 228ms
19:	learn: 0.2647620	total: 14

In [65]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=1)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


## Model Evaluation





In [100]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [101]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code'])

In [102]:
X_eval = X_eval[selected_feat]

In [103]:
X_eval.shape

(62028, 15)

### Model Evaluation - Logistic Regression

In [104]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [105]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [106]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9124279553423884
Testing accuracy is  0.9058885171899561


In [109]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Cross validation - Logistic Regression





In [110]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.90330 (+/- 0.00064)
F1 Score: 0.00066 (+/- 0.00396)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.02857 (+/- 0.17143)
Recall: 0.00033 (+/- 0.00200)


### Model Evaluation - Naive Bayes




In [39]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [40]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [41]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.19440973761637984
Testing accuracy is  0.9022610938696546


In [42]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.9022610938696546
F1 Score: 0.0016467682173734047
Precision Score: 0.02127659574468085
Recall Score: 0.0008565310492505353
Confusion Matrix:
 [[22384    92]
 [ 2333     2]]


### Cross validation - Naive Bayes




In [43]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.39574 (+/- 0.31405)
F1 Score: 0.25525 (+/- 0.20058)
Precision: 0.15369 (+/- 0.17075)
Recall: 0.95941 (+/- 0.06388)


### Model Evaluation - Random Forest




In [44]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [45]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [46]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [47]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest




In [48]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99958 (+/- 0.00066)
F1 Score: 0.99782 (+/- 0.00345)
Precision: 0.99966 (+/- 0.00202)
Recall: 0.99599 (+/- 0.00564)


### Model Evaluation - KNN

In [49]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [50]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [51]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9958486155334327
Testing accuracy is  0.9058885171899561


In [52]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Cross validation - KNN



In [53]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98875 (+/- 0.00243)
F1 Score: 0.94018 (+/- 0.01319)
Precision: 0.96485 (+/- 0.01221)
Recall: 0.91679 (+/- 0.01888)


### Model Evaluation - CatBoost




In [54]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6569262	total: 7.96ms	remaining: 390ms
1:	learn: 0.6218588	total: 15.4ms	remaining: 368ms
2:	learn: 0.5907602	total: 22.8ms	remaining: 357ms
3:	learn: 0.5601849	total: 30.3ms	remaining: 348ms
4:	learn: 0.5312364	total: 37.8ms	remaining: 341ms
5:	learn: 0.5045073	total: 45.2ms	remaining: 331ms
6:	learn: 0.4795468	total: 52.5ms	remaining: 322ms
7:	learn: 0.4575675	total: 59.6ms	remaining: 313ms
8:	learn: 0.4368898	total: 66.7ms	remaining: 304ms
9:	learn: 0.4161666	total: 73.6ms	remaining: 294ms
10:	learn: 0.3959893	total: 81ms	remaining: 287ms
11:	learn: 0.3777337	total: 88.1ms	remaining: 279ms
12:	learn: 0.3601769	total: 95.3ms	remaining: 271ms
13:	learn: 0.3449775	total: 103ms	remaining: 264ms
14:	learn: 0.3292640	total: 110ms	remaining: 257ms
15:	learn: 0.3144322	total: 117ms	remaining: 249ms
16:	learn: 0.3005974	total: 125ms	remaining: 242ms
17:	learn: 0.2884532	total: 132ms	remaining: 234ms
18:	learn: 0.2768883	total: 139ms	remaining: 226ms
19:	learn: 0.2647620	total: 14

<catboost.core.CatBoostClassifier at 0x7ff181242bb0>

In [55]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [56]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9987908588932328
Testing accuracy is  0.9058885171899561


In [57]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - CatBoost


In [58]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6572943	total: 6.23ms	remaining: 305ms
1:	learn: 0.6223135	total: 12.1ms	remaining: 289ms
2:	learn: 0.5919396	total: 17.6ms	remaining: 275ms
3:	learn: 0.5615531	total: 23ms	remaining: 265ms
4:	learn: 0.5334193	total: 28.5ms	remaining: 256ms
5:	learn: 0.5067427	total: 34.1ms	remaining: 250ms
6:	learn: 0.4836411	total: 39.6ms	remaining: 243ms
7:	learn: 0.4617062	total: 45.4ms	remaining: 238ms
8:	learn: 0.4410377	total: 50.9ms	remaining: 232ms
9:	learn: 0.4212757	total: 56.5ms	remaining: 226ms
10:	learn: 0.4032465	total: 62.1ms	remaining: 220ms
11:	learn: 0.3843483	total: 67.7ms	remaining: 214ms
12:	learn: 0.3684605	total: 73.2ms	remaining: 208ms
13:	learn: 0.3532356	total: 78.8ms	remaining: 203ms
14:	learn: 0.3388118	total: 84.2ms	remaining: 196ms
15:	learn: 0.3239502	total: 89.8ms	remaining: 191ms
16:	learn: 0.3109295	total: 94.9ms	remaining: 184ms
17:	learn: 0.2972333	total: 101ms	remaining: 180ms
18:	learn: 0.2851867	total: 107ms	remaining: 175ms
19:	learn: 0.2742535	total

24:	learn: 0.2170783	total: 140ms	remaining: 140ms
25:	learn: 0.2081026	total: 146ms	remaining: 135ms
26:	learn: 0.1993637	total: 152ms	remaining: 130ms
27:	learn: 0.1920678	total: 158ms	remaining: 124ms
28:	learn: 0.1838750	total: 164ms	remaining: 119ms
29:	learn: 0.1772452	total: 170ms	remaining: 113ms
30:	learn: 0.1697676	total: 175ms	remaining: 107ms
31:	learn: 0.1626057	total: 180ms	remaining: 101ms
32:	learn: 0.1568420	total: 186ms	remaining: 95.7ms
33:	learn: 0.1502747	total: 191ms	remaining: 90.1ms
34:	learn: 0.1441809	total: 197ms	remaining: 84.5ms
35:	learn: 0.1391266	total: 203ms	remaining: 78.8ms
36:	learn: 0.1336066	total: 208ms	remaining: 73.1ms
37:	learn: 0.1282535	total: 213ms	remaining: 67.4ms
38:	learn: 0.1231734	total: 219ms	remaining: 61.9ms
39:	learn: 0.1191442	total: 225ms	remaining: 56.3ms
40:	learn: 0.1146548	total: 231ms	remaining: 50.7ms
41:	learn: 0.1099671	total: 237ms	remaining: 45.1ms
42:	learn: 0.1057696	total: 242ms	remaining: 39.4ms
43:	learn: 0.1014196

35:	learn: 0.1410268	total: 200ms	remaining: 77.9ms
36:	learn: 0.1351610	total: 206ms	remaining: 72.4ms
37:	learn: 0.1295477	total: 212ms	remaining: 66.9ms
38:	learn: 0.1251440	total: 218ms	remaining: 61.5ms
39:	learn: 0.1200336	total: 224ms	remaining: 56ms
40:	learn: 0.1152040	total: 230ms	remaining: 50.4ms
41:	learn: 0.1112460	total: 236ms	remaining: 45ms
42:	learn: 0.1075967	total: 242ms	remaining: 39.4ms
43:	learn: 0.1040322	total: 248ms	remaining: 33.9ms
44:	learn: 0.1001305	total: 255ms	remaining: 28.3ms
45:	learn: 0.0960919	total: 261ms	remaining: 22.7ms
46:	learn: 0.0922090	total: 268ms	remaining: 17.1ms
47:	learn: 0.0886847	total: 274ms	remaining: 11.4ms
48:	learn: 0.0859644	total: 281ms	remaining: 5.73ms
49:	learn: 0.0825772	total: 287ms	remaining: 0us
0:	learn: 0.6573190	total: 6.97ms	remaining: 342ms
1:	learn: 0.6218077	total: 13.4ms	remaining: 322ms
2:	learn: 0.5910629	total: 19.9ms	remaining: 312ms
3:	learn: 0.5608168	total: 26.1ms	remaining: 300ms
4:	learn: 0.5321122	tot

0:	learn: 0.6572943	total: 5.69ms	remaining: 279ms
1:	learn: 0.6223135	total: 11.5ms	remaining: 275ms
2:	learn: 0.5919396	total: 17.1ms	remaining: 268ms
3:	learn: 0.5615531	total: 22.9ms	remaining: 263ms
4:	learn: 0.5334193	total: 29.1ms	remaining: 262ms
5:	learn: 0.5067427	total: 35ms	remaining: 257ms
6:	learn: 0.4836411	total: 40.7ms	remaining: 250ms
7:	learn: 0.4617062	total: 46.8ms	remaining: 246ms
8:	learn: 0.4410377	total: 52.2ms	remaining: 238ms
9:	learn: 0.4212757	total: 58.1ms	remaining: 232ms
10:	learn: 0.4032465	total: 64ms	remaining: 227ms
11:	learn: 0.3843483	total: 69.7ms	remaining: 221ms
12:	learn: 0.3684605	total: 75.6ms	remaining: 215ms
13:	learn: 0.3532356	total: 81.3ms	remaining: 209ms
14:	learn: 0.3388118	total: 87.6ms	remaining: 205ms
15:	learn: 0.3239502	total: 94.2ms	remaining: 200ms
16:	learn: 0.3109295	total: 99.1ms	remaining: 192ms
17:	learn: 0.2972333	total: 105ms	remaining: 186ms
18:	learn: 0.2851867	total: 111ms	remaining: 180ms
19:	learn: 0.2742535	total: 

15:	learn: 0.3186304	total: 91ms	remaining: 193ms
16:	learn: 0.3042882	total: 96.8ms	remaining: 188ms
17:	learn: 0.2921243	total: 102ms	remaining: 182ms
18:	learn: 0.2792675	total: 108ms	remaining: 177ms
19:	learn: 0.2670043	total: 114ms	remaining: 171ms
20:	learn: 0.2565287	total: 119ms	remaining: 165ms
21:	learn: 0.2454981	total: 125ms	remaining: 159ms
22:	learn: 0.2362906	total: 130ms	remaining: 153ms
23:	learn: 0.2265495	total: 136ms	remaining: 147ms
24:	learn: 0.2170783	total: 142ms	remaining: 142ms
25:	learn: 0.2081026	total: 149ms	remaining: 137ms
26:	learn: 0.1993637	total: 155ms	remaining: 132ms
27:	learn: 0.1920678	total: 162ms	remaining: 127ms
28:	learn: 0.1838750	total: 168ms	remaining: 122ms
29:	learn: 0.1772452	total: 176ms	remaining: 117ms
30:	learn: 0.1697676	total: 182ms	remaining: 111ms
31:	learn: 0.1626057	total: 188ms	remaining: 106ms
32:	learn: 0.1568420	total: 194ms	remaining: 100ms
33:	learn: 0.1502747	total: 201ms	remaining: 94.4ms
34:	learn: 0.1441809	total: 20

37:	learn: 0.1295477	total: 216ms	remaining: 68.1ms
38:	learn: 0.1251440	total: 221ms	remaining: 62.4ms
39:	learn: 0.1200336	total: 227ms	remaining: 56.6ms
40:	learn: 0.1152040	total: 232ms	remaining: 50.9ms
41:	learn: 0.1112460	total: 238ms	remaining: 45.3ms
42:	learn: 0.1075967	total: 243ms	remaining: 39.5ms
43:	learn: 0.1040322	total: 249ms	remaining: 33.9ms
44:	learn: 0.1001305	total: 254ms	remaining: 28.3ms
45:	learn: 0.0960919	total: 260ms	remaining: 22.6ms
46:	learn: 0.0922090	total: 266ms	remaining: 16.9ms
47:	learn: 0.0886847	total: 271ms	remaining: 11.3ms
48:	learn: 0.0859644	total: 277ms	remaining: 5.65ms
49:	learn: 0.0825772	total: 283ms	remaining: 0us
0:	learn: 0.6573190	total: 5.78ms	remaining: 283ms
1:	learn: 0.6218077	total: 11.6ms	remaining: 278ms
2:	learn: 0.5910629	total: 17.3ms	remaining: 271ms
3:	learn: 0.5608168	total: 23ms	remaining: 264ms
4:	learn: 0.5321122	total: 28.6ms	remaining: 257ms
5:	learn: 0.5073604	total: 34.1ms	remaining: 250ms
6:	learn: 0.4840112	tot

47:	learn: 0.0892833	total: 270ms	remaining: 11.3ms
48:	learn: 0.0857121	total: 276ms	remaining: 5.63ms
49:	learn: 0.0822169	total: 282ms	remaining: 0us
0:	learn: 0.6572943	total: 6.05ms	remaining: 296ms
1:	learn: 0.6223135	total: 11.9ms	remaining: 287ms
2:	learn: 0.5919396	total: 17.5ms	remaining: 275ms
3:	learn: 0.5615531	total: 23.7ms	remaining: 272ms
4:	learn: 0.5334193	total: 29.3ms	remaining: 264ms
5:	learn: 0.5067427	total: 34.8ms	remaining: 255ms
6:	learn: 0.4836411	total: 40ms	remaining: 246ms
7:	learn: 0.4617062	total: 45.6ms	remaining: 239ms
8:	learn: 0.4410377	total: 51.1ms	remaining: 233ms
9:	learn: 0.4212757	total: 56.7ms	remaining: 227ms
10:	learn: 0.4032465	total: 62.2ms	remaining: 221ms
11:	learn: 0.3843483	total: 67.6ms	remaining: 214ms
12:	learn: 0.3684605	total: 73.4ms	remaining: 209ms
13:	learn: 0.3532356	total: 78.8ms	remaining: 203ms
14:	learn: 0.3388118	total: 84.4ms	remaining: 197ms
15:	learn: 0.3239502	total: 89.9ms	remaining: 191ms
16:	learn: 0.3109295	total:

35:	learn: 0.1391266	total: 201ms	remaining: 78.1ms
36:	learn: 0.1336066	total: 207ms	remaining: 72.7ms
37:	learn: 0.1282535	total: 213ms	remaining: 67.1ms
38:	learn: 0.1231734	total: 218ms	remaining: 61.6ms
39:	learn: 0.1191442	total: 224ms	remaining: 56ms
40:	learn: 0.1146548	total: 230ms	remaining: 50.4ms
41:	learn: 0.1099671	total: 235ms	remaining: 44.8ms
42:	learn: 0.1057696	total: 241ms	remaining: 39.2ms
43:	learn: 0.1014196	total: 246ms	remaining: 33.6ms
44:	learn: 0.0974077	total: 252ms	remaining: 28ms
45:	learn: 0.0934837	total: 258ms	remaining: 22.4ms
46:	learn: 0.0903626	total: 263ms	remaining: 16.8ms
47:	learn: 0.0873228	total: 269ms	remaining: 11.2ms
48:	learn: 0.0839263	total: 274ms	remaining: 5.6ms
49:	learn: 0.0805704	total: 280ms	remaining: 0us
0:	learn: 0.6571570	total: 5.59ms	remaining: 274ms
1:	learn: 0.6226507	total: 11.3ms	remaining: 271ms
2:	learn: 0.5916001	total: 17.2ms	remaining: 269ms
3:	learn: 0.5610481	total: 22.6ms	remaining: 260ms
4:	learn: 0.5323544	tota

14:	learn: 0.3330270	total: 83.1ms	remaining: 194ms
15:	learn: 0.3184317	total: 89ms	remaining: 189ms
16:	learn: 0.3052677	total: 94.7ms	remaining: 184ms
17:	learn: 0.2930370	total: 101ms	remaining: 179ms
18:	learn: 0.2814035	total: 107ms	remaining: 174ms
19:	learn: 0.2690495	total: 112ms	remaining: 169ms
20:	learn: 0.2587540	total: 117ms	remaining: 162ms
21:	learn: 0.2488681	total: 123ms	remaining: 156ms
22:	learn: 0.2392307	total: 129ms	remaining: 151ms
23:	learn: 0.2291792	total: 135ms	remaining: 146ms
24:	learn: 0.2205841	total: 140ms	remaining: 140ms
25:	learn: 0.2113220	total: 146ms	remaining: 135ms
26:	learn: 0.2035171	total: 151ms	remaining: 129ms
27:	learn: 0.1956836	total: 157ms	remaining: 123ms
28:	learn: 0.1873901	total: 162ms	remaining: 118ms
29:	learn: 0.1806415	total: 168ms	remaining: 112ms
30:	learn: 0.1734948	total: 174ms	remaining: 106ms
31:	learn: 0.1664340	total: 179ms	remaining: 101ms
32:	learn: 0.1593185	total: 185ms	remaining: 95.2ms
33:	learn: 0.1538715	total: 1

49:	learn: 0.0821109	total: 285ms	remaining: 0us
0:	learn: 0.6572572	total: 5.75ms	remaining: 282ms
1:	learn: 0.6215928	total: 11.5ms	remaining: 277ms
2:	learn: 0.5905717	total: 17.4ms	remaining: 273ms
3:	learn: 0.5601373	total: 23ms	remaining: 265ms
4:	learn: 0.5315776	total: 28.8ms	remaining: 259ms
5:	learn: 0.5069663	total: 34.8ms	remaining: 255ms
6:	learn: 0.4836482	total: 40.7ms	remaining: 250ms
7:	learn: 0.4601595	total: 46.9ms	remaining: 246ms
8:	learn: 0.4399165	total: 53.1ms	remaining: 242ms
9:	learn: 0.4190791	total: 59ms	remaining: 236ms
10:	learn: 0.3989426	total: 64.8ms	remaining: 230ms
11:	learn: 0.3806024	total: 70.9ms	remaining: 224ms
12:	learn: 0.3646616	total: 77.1ms	remaining: 220ms
13:	learn: 0.3493405	total: 83.3ms	remaining: 214ms
14:	learn: 0.3334215	total: 89.3ms	remaining: 208ms
15:	learn: 0.3186752	total: 96.2ms	remaining: 204ms
16:	learn: 0.3056206	total: 102ms	remaining: 198ms
17:	learn: 0.2935110	total: 108ms	remaining: 192ms
18:	learn: 0.2807802	total: 115

17:	learn: 0.2928841	total: 101ms	remaining: 179ms
18:	learn: 0.2800849	total: 107ms	remaining: 175ms
19:	learn: 0.2677940	total: 113ms	remaining: 170ms
20:	learn: 0.2575388	total: 119ms	remaining: 164ms
21:	learn: 0.2477478	total: 125ms	remaining: 159ms
22:	learn: 0.2383143	total: 131ms	remaining: 154ms
23:	learn: 0.2283696	total: 138ms	remaining: 149ms
24:	learn: 0.2189556	total: 144ms	remaining: 144ms
25:	learn: 0.2095762	total: 150ms	remaining: 139ms
26:	learn: 0.2016948	total: 157ms	remaining: 133ms
27:	learn: 0.1936001	total: 163ms	remaining: 128ms
28:	learn: 0.1854276	total: 170ms	remaining: 123ms
29:	learn: 0.1786691	total: 176ms	remaining: 118ms
30:	learn: 0.1714048	total: 183ms	remaining: 112ms
31:	learn: 0.1645560	total: 189ms	remaining: 107ms
32:	learn: 0.1575409	total: 196ms	remaining: 101ms
33:	learn: 0.1514943	total: 203ms	remaining: 95.6ms
34:	learn: 0.1462550	total: 210ms	remaining: 89.8ms
35:	learn: 0.1412179	total: 216ms	remaining: 84ms
36:	learn: 0.1352850	total: 22

33:	learn: 0.1538715	total: 203ms	remaining: 95.5ms
34:	learn: 0.1476537	total: 210ms	remaining: 90.1ms
35:	learn: 0.1425919	total: 218ms	remaining: 84.9ms
36:	learn: 0.1369786	total: 225ms	remaining: 78.9ms
37:	learn: 0.1313402	total: 231ms	remaining: 73ms
38:	learn: 0.1269161	total: 238ms	remaining: 67ms
39:	learn: 0.1219446	total: 244ms	remaining: 60.9ms
40:	learn: 0.1168805	total: 250ms	remaining: 54.8ms
41:	learn: 0.1130199	total: 256ms	remaining: 48.7ms
42:	learn: 0.1085764	total: 262ms	remaining: 42.7ms
43:	learn: 0.1043792	total: 268ms	remaining: 36.5ms
44:	learn: 0.1004339	total: 274ms	remaining: 30.4ms
45:	learn: 0.0963225	total: 280ms	remaining: 24.3ms
46:	learn: 0.0924895	total: 286ms	remaining: 18.2ms
47:	learn: 0.0888264	total: 292ms	remaining: 12.2ms
48:	learn: 0.0855619	total: 298ms	remaining: 6.07ms
49:	learn: 0.0826031	total: 303ms	remaining: 0us
0:	learn: 0.6571018	total: 6.1ms	remaining: 299ms
1:	learn: 0.6225985	total: 12.8ms	remaining: 307ms
2:	learn: 0.5915295	to

In [59]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99874 (+/- 0.00109)
F1 Score: 0.99344 (+/- 0.00570)
Precision: 0.99966 (+/- 0.00135)
Recall: 0.98730 (+/- 0.01159)
