### Step forward feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Read Data

In [2]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [3]:
data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


### Train - Test Split

In [4]:
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

### Remove Correlated features

Step Forward Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
# remove correlated features to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_train, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  16


In [6]:
corr_features

{'ackdat',
 'ct_dst_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_ftp_cmd',
 'ct_src_dport_ltm',
 'ct_src_ltm',
 'ct_srv_dst',
 'dbytes',
 'dloss',
 'dwin',
 'is_sm_ips_ports',
 'sbytes',
 'sloss',
 'synack',
 'tcprtt'}

In [7]:
# remove correlated features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((140272, 27), (35069, 27))

### Step Forward Feature Selection

In [8]:
# 1. the algorithm used is RandomForests
# 2. the stopping criteria: 11 features 
# 3. to perform step forward or step backward
# 4. the evaluation metric: the roc_auc
# 5. and the cross-validation
# this is going to take a while

sfs = SFS(RandomForestClassifier(n_estimators=15, n_jobs=4, random_state=0), 
           k_features=11,  # the more features we want, the longer it will take to run
           forward=True, 
           floating=False, # see the docs for more details in this parameter
           verbose=2,      # this indicates how much to print out intermediate steps
           scoring='roc_auc',
           cv=2)

sfs = sfs.fit(np.array(X_train), y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   36.1s finished

[2021-06-01 11:50:32] Features: 1/11 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:   17.0s finished

[2021-06-01 11:50:49] Features: 2/11 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   18.6s finished

[2021-06-01 11:51:07] Features: 3/11 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.9s remaining:  

From the output above, we can see that after adding the 10º feature, the performance begins to plateau. Adding the 11th feature did not increase the performance.

In [9]:
selected_feat = X_train.columns[list(sfs.k_feature_idx_)]
selected_feat

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'rate', 'sttl',
       'dttl', 'sload', 'attack'],
      dtype='object')

### Compare performance of feature subsets

In [10]:
# function to train random forests and evaluate the performance

def run_randomForests(X_train, X_test, y_train, y_test):
    
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [11]:
# evaluate performance of algorithm built using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 0.9999999999999999
Test set
Random Forests roc-auc: 1.0


In [12]:
# and for comparison, the result with train random forests using all features
# except the correlated ones, which was removed already

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 1.0


As you can see, in this dataset, with 11 characteristics, we get slightly better performance compared to all the variables in the dataset.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((140272, 11), (35069, 11))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers


In [17]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
#%from catboost import CatBoostClassifier

## Metrics Evaluation

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [20]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1).fit(X_train, y_train)

CPU times: user 63.7 ms, sys: 173 ms, total: 237 ms
Wall time: 1.71 s


In [21]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.8110736257193888
F1 Score: 0.8086355230361019
FPR: 0.23091744479027715
TPR: 0.8594991881035808


### Naive Bayes




In [22]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-09).fit(X_train, y_train)

CPU times: user 28.6 ms, sys: 5.71 ms, total: 34.3 ms
Wall time: 32.4 ms


In [23]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.10394919626910101
F1 Score: 0.14704548889224722
FPR: 0.9501259819178894
TPR: 0.1663105717460046


### Random Forest





In [19]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

Wall time: 5.45 s


In [20]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN




In [26]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance').fit(X_train, y_train)

CPU times: user 7.21 ms, sys: 1.36 ms, total: 8.58 ms
Wall time: 6.71 ms


In [27]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.2007938082952967
f1: 0.27812432781243274
fpr: 0.9125537275826293
tpr: 0.3315101273395436


### CatBoost




In [28]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6578789	total: 66.4ms	remaining: 3.25s
1:	learn: 0.6250148	total: 74ms	remaining: 1.78s
2:	learn: 0.5949116	total: 81.4ms	remaining: 1.27s
3:	learn: 0.5658162	total: 89.2ms	remaining: 1.02s
4:	learn: 0.5391590	total: 96.8ms	remaining: 872ms
5:	learn: 0.5144937	total: 104ms	remaining: 762ms
6:	learn: 0.4911437	total: 112ms	remaining: 687ms
7:	learn: 0.4687195	total: 119ms	remaining: 627ms
8:	learn: 0.4478822	total: 127ms	remaining: 579ms
9:	learn: 0.4276795	total: 135ms	remaining: 538ms
10:	learn: 0.4096242	total: 142ms	remaining: 504ms
11:	learn: 0.3921531	total: 150ms	remaining: 476ms
12:	learn: 0.3759627	total: 158ms	remaining: 450ms
13:	learn: 0.3603301	total: 167ms	remaining: 429ms
14:	learn: 0.3453111	total: 175ms	remaining: 409ms
15:	learn: 0.3313422	total: 183ms	remaining: 388ms
16:	learn: 0.3181049	total: 191ms	remaining: 370ms
17:	learn: 0.3055479	total: 198ms	remaining: 352ms
18:	learn: 0.2937139	total: 205ms	remaining: 335ms
19:	learn: 0.2820138	total: 213ms	rema

In [29]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
FPR: 1.0
TPR: 1.0


## Model Evaluation





In [22]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../UNSW_Test.csv")
test_df.shape

(175341, 44)

In [23]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion'])

In [24]:
X_eval = X_eval[selected_feat]

In [25]:
X_eval.shape

(175341, 11)

### Model Evaluation - Logistic Regression

In [34]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [35]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [36]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.92297845738611
Testing accuracy is  0.8110736257193888


In [37]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.8110736257193888
F1 Score: 0.8086355230361019
Precision Score: 0.7634555530251271
Recall Score: 0.8594991881035808
Confusion Matrix:
 [[10378  3116]
 [ 1644 10057]]


### Cross validation - Logistic Regression





In [38]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.83782 (+/- 0.01765)
F1 Score: 0.85305 (+/- 0.01792)
Precision: 0.88069 (+/- 0.02308)
Recall: 0.82746 (+/- 0.03689)


### Model Evaluation - Naive Bayes




In [39]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [40]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [41]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.7960050408327297
Testing accuracy is  0.10394919626910101


In [42]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.10394919626910101
F1 Score: 0.14704548889224722
Precision Score: 0.13178032098598225
Recall Score: 0.1663105717460046
Confusion Matrix:
 [[  673 12821]
 [ 9755  1946]]


### Cross validation - Naive Bayes




In [43]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.79581 (+/- 0.03577)
F1 Score: 0.80397 (+/- 0.04197)
Precision: 0.88439 (+/- 0.01750)
Recall: 0.73800 (+/- 0.07788)


### Model Evaluation - Random Forest




In [26]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [27]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [28]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [29]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest




In [30]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN

In [49]:
modelKNN = KNeighborsClassifier(algorithm='brute',leaf_size=1,n_neighbors=2,weights='distance')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=1, n_neighbors=2,
                     weights='distance')

In [50]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [51]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9999603084037032
Testing accuracy is  0.2007938082952967


In [52]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.2007938082952967
F1 Score: 0.27812432781243274
Precision Score: 0.23954795281911936
Recall Score: 0.3315101273395436
Confusion Matrix:
 [[ 1180 12314]
 [ 7822  3879]]


### Cross validation - KNN



In [53]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.97729 (+/- 0.00713)
F1 Score: 0.98003 (+/- 0.00623)
Precision: 0.98142 (+/- 0.00903)
Recall: 0.97865 (+/- 0.00536)


### Model Evaluation - CatBoost




In [54]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6578789	total: 7.98ms	remaining: 391ms
1:	learn: 0.6250148	total: 15.6ms	remaining: 373ms
2:	learn: 0.5949116	total: 22.4ms	remaining: 351ms
3:	learn: 0.5658162	total: 29.3ms	remaining: 337ms
4:	learn: 0.5391590	total: 36.6ms	remaining: 329ms
5:	learn: 0.5144937	total: 44ms	remaining: 323ms
6:	learn: 0.4911437	total: 51.1ms	remaining: 314ms
7:	learn: 0.4687195	total: 58.3ms	remaining: 306ms
8:	learn: 0.4478822	total: 65.4ms	remaining: 298ms
9:	learn: 0.4276795	total: 72.7ms	remaining: 291ms
10:	learn: 0.4096242	total: 79.9ms	remaining: 283ms
11:	learn: 0.3921531	total: 86.8ms	remaining: 275ms
12:	learn: 0.3759627	total: 94.2ms	remaining: 268ms
13:	learn: 0.3603301	total: 101ms	remaining: 260ms
14:	learn: 0.3453111	total: 108ms	remaining: 252ms
15:	learn: 0.3313422	total: 116ms	remaining: 246ms
16:	learn: 0.3181049	total: 123ms	remaining: 238ms
17:	learn: 0.3055479	total: 130ms	remaining: 230ms
18:	learn: 0.2937139	total: 136ms	remaining: 222ms
19:	learn: 0.2820138	total: 14

<catboost.core.CatBoostClassifier at 0x7fd8f12b1a30>

In [55]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [56]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  0.9893130376970936
Testing accuracy is  0.5355824568366739


In [57]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.5355824568366739
F1 Score: 0.3736022989766641
Precision Score: 0.28684856807120773
Recall Score: 0.5355824568366739
Confusion Matrix:
 [[13494     0]
 [11701     0]]


### Cross validation - CatBoost


In [58]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6604927	total: 4.31ms	remaining: 211ms
1:	learn: 0.6306060	total: 8.38ms	remaining: 201ms
2:	learn: 0.6033294	total: 12.4ms	remaining: 194ms
3:	learn: 0.5782618	total: 16.6ms	remaining: 191ms
4:	learn: 0.5553134	total: 20.8ms	remaining: 187ms
5:	learn: 0.5336507	total: 25ms	remaining: 183ms
6:	learn: 0.5115596	total: 29ms	remaining: 178ms
7:	learn: 0.4927171	total: 33.2ms	remaining: 174ms
8:	learn: 0.4741058	total: 37.3ms	remaining: 170ms
9:	learn: 0.4561951	total: 41.5ms	remaining: 166ms
10:	learn: 0.4392835	total: 45.6ms	remaining: 162ms
11:	learn: 0.4230371	total: 50.4ms	remaining: 160ms
12:	learn: 0.4085204	total: 54.6ms	remaining: 155ms
13:	learn: 0.3939271	total: 58.5ms	remaining: 150ms
14:	learn: 0.3800451	total: 62.8ms	remaining: 146ms
15:	learn: 0.3679132	total: 67.1ms	remaining: 143ms
16:	learn: 0.3558040	total: 71.3ms	remaining: 138ms
17:	learn: 0.3438310	total: 75.4ms	remaining: 134ms
18:	learn: 0.3329938	total: 79.5ms	remaining: 130ms
19:	learn: 0.3230030	total

22:	learn: 0.2934725	total: 96.7ms	remaining: 114ms
23:	learn: 0.2844101	total: 101ms	remaining: 109ms
24:	learn: 0.2764617	total: 105ms	remaining: 105ms
25:	learn: 0.2683684	total: 109ms	remaining: 101ms
26:	learn: 0.2610732	total: 113ms	remaining: 96.1ms
27:	learn: 0.2538513	total: 117ms	remaining: 91.8ms
28:	learn: 0.2469499	total: 121ms	remaining: 87.5ms
29:	learn: 0.2404377	total: 125ms	remaining: 83.4ms
30:	learn: 0.2336600	total: 129ms	remaining: 79.1ms
31:	learn: 0.2274621	total: 133ms	remaining: 75ms
32:	learn: 0.2213539	total: 138ms	remaining: 70.9ms
33:	learn: 0.2157253	total: 142ms	remaining: 66.9ms
34:	learn: 0.2104309	total: 146ms	remaining: 62.7ms
35:	learn: 0.2050871	total: 150ms	remaining: 58.4ms
36:	learn: 0.2001462	total: 154ms	remaining: 54.3ms
37:	learn: 0.1952712	total: 159ms	remaining: 50.1ms
38:	learn: 0.1908766	total: 163ms	remaining: 46ms
39:	learn: 0.1862284	total: 167ms	remaining: 41.8ms
40:	learn: 0.1820389	total: 172ms	remaining: 37.7ms
41:	learn: 0.178487

32:	learn: 0.2216443	total: 136ms	remaining: 69.8ms
33:	learn: 0.2160080	total: 140ms	remaining: 65.7ms
34:	learn: 0.2103541	total: 144ms	remaining: 61.5ms
35:	learn: 0.2055071	total: 147ms	remaining: 57.3ms
36:	learn: 0.2006450	total: 151ms	remaining: 53.2ms
37:	learn: 0.1959134	total: 155ms	remaining: 49.1ms
38:	learn: 0.1914532	total: 159ms	remaining: 45ms
39:	learn: 0.1868003	total: 163ms	remaining: 40.9ms
40:	learn: 0.1826576	total: 168ms	remaining: 36.8ms
41:	learn: 0.1791447	total: 172ms	remaining: 32.7ms
42:	learn: 0.1756304	total: 176ms	remaining: 28.6ms
43:	learn: 0.1713561	total: 180ms	remaining: 24.5ms
44:	learn: 0.1679911	total: 184ms	remaining: 20.4ms
45:	learn: 0.1645902	total: 188ms	remaining: 16.3ms
46:	learn: 0.1613190	total: 192ms	remaining: 12.3ms
47:	learn: 0.1583030	total: 197ms	remaining: 8.2ms
48:	learn: 0.1553426	total: 201ms	remaining: 4.09ms
49:	learn: 0.1525463	total: 205ms	remaining: 0us
0:	learn: 0.6606369	total: 4.24ms	remaining: 208ms
1:	learn: 0.6303965

1:	learn: 0.6306060	total: 8.69ms	remaining: 208ms
2:	learn: 0.6033294	total: 12.7ms	remaining: 199ms
3:	learn: 0.5782618	total: 16.9ms	remaining: 194ms
4:	learn: 0.5553134	total: 20.9ms	remaining: 188ms
5:	learn: 0.5336507	total: 24.9ms	remaining: 183ms
6:	learn: 0.5115596	total: 28.8ms	remaining: 177ms
7:	learn: 0.4927171	total: 33.1ms	remaining: 174ms
8:	learn: 0.4741058	total: 37.2ms	remaining: 170ms
9:	learn: 0.4561951	total: 41.3ms	remaining: 165ms
10:	learn: 0.4392835	total: 45.6ms	remaining: 162ms
11:	learn: 0.4230371	total: 49.8ms	remaining: 158ms
12:	learn: 0.4085204	total: 53.8ms	remaining: 153ms
13:	learn: 0.3939271	total: 57.8ms	remaining: 149ms
14:	learn: 0.3800451	total: 62.2ms	remaining: 145ms
15:	learn: 0.3679132	total: 66.3ms	remaining: 141ms
16:	learn: 0.3558040	total: 70.7ms	remaining: 137ms
17:	learn: 0.3438310	total: 74.7ms	remaining: 133ms
18:	learn: 0.3329938	total: 78.7ms	remaining: 128ms
19:	learn: 0.3230030	total: 83ms	remaining: 124ms
20:	learn: 0.3131713	to

22:	learn: 0.2934725	total: 94.6ms	remaining: 111ms
23:	learn: 0.2844101	total: 98.6ms	remaining: 107ms
24:	learn: 0.2764617	total: 102ms	remaining: 102ms
25:	learn: 0.2683684	total: 106ms	remaining: 98ms
26:	learn: 0.2610732	total: 110ms	remaining: 93.7ms
27:	learn: 0.2538513	total: 114ms	remaining: 89.6ms
28:	learn: 0.2469499	total: 118ms	remaining: 85.5ms
29:	learn: 0.2404377	total: 122ms	remaining: 81.4ms
30:	learn: 0.2336600	total: 126ms	remaining: 77.4ms
31:	learn: 0.2274621	total: 130ms	remaining: 73.3ms
32:	learn: 0.2213539	total: 134ms	remaining: 69.3ms
33:	learn: 0.2157253	total: 138ms	remaining: 65.2ms
34:	learn: 0.2104309	total: 143ms	remaining: 61.1ms
35:	learn: 0.2050871	total: 147ms	remaining: 57.1ms
36:	learn: 0.2001462	total: 151ms	remaining: 53.1ms
37:	learn: 0.1952712	total: 155ms	remaining: 49ms
38:	learn: 0.1908766	total: 159ms	remaining: 45ms
39:	learn: 0.1862284	total: 163ms	remaining: 40.8ms
40:	learn: 0.1820389	total: 167ms	remaining: 36.7ms
41:	learn: 0.178487

48:	learn: 0.1553426	total: 200ms	remaining: 4.09ms
49:	learn: 0.1525463	total: 204ms	remaining: 0us
0:	learn: 0.6606369	total: 4.28ms	remaining: 210ms
1:	learn: 0.6303965	total: 8.73ms	remaining: 210ms
2:	learn: 0.6030867	total: 12.8ms	remaining: 201ms
3:	learn: 0.5779209	total: 17.1ms	remaining: 196ms
4:	learn: 0.5549479	total: 21.2ms	remaining: 191ms
5:	learn: 0.5315644	total: 25.3ms	remaining: 186ms
6:	learn: 0.5107780	total: 29.7ms	remaining: 183ms
7:	learn: 0.4911859	total: 33.9ms	remaining: 178ms
8:	learn: 0.4725286	total: 38.1ms	remaining: 173ms
9:	learn: 0.4547193	total: 42.3ms	remaining: 169ms
10:	learn: 0.4385474	total: 46.3ms	remaining: 164ms
11:	learn: 0.4222202	total: 50.6ms	remaining: 160ms
12:	learn: 0.4075414	total: 54.7ms	remaining: 156ms
13:	learn: 0.3929699	total: 59ms	remaining: 152ms
14:	learn: 0.3792015	total: 63.1ms	remaining: 147ms
15:	learn: 0.3663698	total: 67ms	remaining: 142ms
16:	learn: 0.3540945	total: 71.3ms	remaining: 138ms
17:	learn: 0.3426592	total: 7

14:	learn: 0.3800451	total: 62.3ms	remaining: 145ms
15:	learn: 0.3679132	total: 66.3ms	remaining: 141ms
16:	learn: 0.3558040	total: 70.3ms	remaining: 137ms
17:	learn: 0.3438310	total: 74.3ms	remaining: 132ms
18:	learn: 0.3329938	total: 78.4ms	remaining: 128ms
19:	learn: 0.3230030	total: 82.5ms	remaining: 124ms
20:	learn: 0.3131713	total: 86.5ms	remaining: 119ms
21:	learn: 0.3036604	total: 90.6ms	remaining: 115ms
22:	learn: 0.2942729	total: 94.8ms	remaining: 111ms
23:	learn: 0.2857946	total: 99ms	remaining: 107ms
24:	learn: 0.2771158	total: 103ms	remaining: 103ms
25:	learn: 0.2689083	total: 107ms	remaining: 98.9ms
26:	learn: 0.2616010	total: 111ms	remaining: 94.8ms
27:	learn: 0.2547384	total: 115ms	remaining: 90.7ms
28:	learn: 0.2479248	total: 120ms	remaining: 86.7ms
29:	learn: 0.2411496	total: 124ms	remaining: 82.6ms
30:	learn: 0.2349344	total: 128ms	remaining: 78.5ms
31:	learn: 0.2286674	total: 132ms	remaining: 74.3ms
32:	learn: 0.2225910	total: 136ms	remaining: 70.1ms
33:	learn: 0.21

39:	learn: 0.1862284	total: 166ms	remaining: 41.5ms
40:	learn: 0.1820389	total: 170ms	remaining: 37.3ms
41:	learn: 0.1784871	total: 175ms	remaining: 33.2ms
42:	learn: 0.1748260	total: 178ms	remaining: 29.1ms
43:	learn: 0.1712171	total: 182ms	remaining: 24.9ms
44:	learn: 0.1676162	total: 186ms	remaining: 20.7ms
45:	learn: 0.1641779	total: 190ms	remaining: 16.6ms
46:	learn: 0.1608542	total: 195ms	remaining: 12.4ms
47:	learn: 0.1577683	total: 199ms	remaining: 8.29ms
48:	learn: 0.1547607	total: 203ms	remaining: 4.14ms
49:	learn: 0.1518261	total: 207ms	remaining: 0us
0:	learn: 0.6615113	total: 4.7ms	remaining: 230ms
1:	learn: 0.6325648	total: 8.85ms	remaining: 212ms
2:	learn: 0.6048416	total: 13.2ms	remaining: 206ms
3:	learn: 0.5792385	total: 17.2ms	remaining: 198ms
4:	learn: 0.5561493	total: 21.2ms	remaining: 191ms
5:	learn: 0.5327734	total: 25.2ms	remaining: 185ms
6:	learn: 0.5119447	total: 29.3ms	remaining: 180ms
7:	learn: 0.4913622	total: 33.7ms	remaining: 177ms
8:	learn: 0.4728383	tota

7:	learn: 0.4911859	total: 34ms	remaining: 179ms
8:	learn: 0.4725286	total: 38.1ms	remaining: 173ms
9:	learn: 0.4547193	total: 42ms	remaining: 168ms
10:	learn: 0.4385474	total: 46.1ms	remaining: 163ms
11:	learn: 0.4222202	total: 50.2ms	remaining: 159ms
12:	learn: 0.4075414	total: 54.1ms	remaining: 154ms
13:	learn: 0.3929699	total: 58.1ms	remaining: 149ms
14:	learn: 0.3792015	total: 62.1ms	remaining: 145ms
15:	learn: 0.3663698	total: 66.3ms	remaining: 141ms
16:	learn: 0.3540945	total: 70.4ms	remaining: 137ms
17:	learn: 0.3426592	total: 74.6ms	remaining: 133ms
18:	learn: 0.3318607	total: 78.9ms	remaining: 129ms
19:	learn: 0.3211078	total: 83ms	remaining: 124ms
20:	learn: 0.3112621	total: 87ms	remaining: 120ms
21:	learn: 0.3019678	total: 91.2ms	remaining: 116ms
22:	learn: 0.2925995	total: 95.2ms	remaining: 112ms
23:	learn: 0.2837796	total: 99.6ms	remaining: 108ms
24:	learn: 0.2758219	total: 104ms	remaining: 104ms
25:	learn: 0.2676664	total: 107ms	remaining: 99.2ms
26:	learn: 0.2605890	tot

47:	learn: 0.1587577	total: 201ms	remaining: 8.36ms
48:	learn: 0.1558315	total: 205ms	remaining: 4.18ms
49:	learn: 0.1529345	total: 209ms	remaining: 0us
0:	learn: 0.6617085	total: 4ms	remaining: 196ms
1:	learn: 0.6315147	total: 8.52ms	remaining: 205ms
2:	learn: 0.6035120	total: 12.8ms	remaining: 200ms
3:	learn: 0.5784072	total: 17.2ms	remaining: 197ms
4:	learn: 0.5555145	total: 21.8ms	remaining: 196ms
5:	learn: 0.5337838	total: 26.1ms	remaining: 191ms
6:	learn: 0.5131421	total: 30.4ms	remaining: 186ms
7:	learn: 0.4924077	total: 34.6ms	remaining: 182ms
8:	learn: 0.4738308	total: 38.8ms	remaining: 177ms
9:	learn: 0.4563560	total: 42.9ms	remaining: 172ms
10:	learn: 0.4395238	total: 47.2ms	remaining: 167ms
11:	learn: 0.4231779	total: 51.3ms	remaining: 162ms
12:	learn: 0.4088528	total: 55.2ms	remaining: 157ms
13:	learn: 0.3941483	total: 59.3ms	remaining: 152ms
14:	learn: 0.3802407	total: 63.4ms	remaining: 148ms
15:	learn: 0.3669213	total: 67.4ms	remaining: 143ms
16:	learn: 0.3544000	total: 

13:	learn: 0.3929208	total: 58ms	remaining: 149ms
14:	learn: 0.3791030	total: 62ms	remaining: 145ms
15:	learn: 0.3662821	total: 66ms	remaining: 140ms
16:	learn: 0.3542411	total: 70ms	remaining: 136ms
17:	learn: 0.3432850	total: 74ms	remaining: 132ms
18:	learn: 0.3322727	total: 78ms	remaining: 127ms
19:	learn: 0.3213796	total: 81.8ms	remaining: 123ms
20:	learn: 0.3115331	total: 86ms	remaining: 119ms
21:	learn: 0.3023101	total: 90.2ms	remaining: 115ms
22:	learn: 0.2930425	total: 94.5ms	remaining: 111ms
23:	learn: 0.2840134	total: 98.9ms	remaining: 107ms
24:	learn: 0.2757219	total: 103ms	remaining: 103ms
25:	learn: 0.2677337	total: 107ms	remaining: 98.7ms
26:	learn: 0.2603122	total: 111ms	remaining: 94.6ms
27:	learn: 0.2536412	total: 115ms	remaining: 90.7ms
28:	learn: 0.2467382	total: 119ms	remaining: 86.5ms
29:	learn: 0.2399419	total: 124ms	remaining: 82.5ms
30:	learn: 0.2336483	total: 128ms	remaining: 78.4ms
31:	learn: 0.2274311	total: 132ms	remaining: 74.4ms
32:	learn: 0.2212815	total:

39:	learn: 0.1861066	total: 165ms	remaining: 41.2ms
40:	learn: 0.1820478	total: 169ms	remaining: 37ms
41:	learn: 0.1785034	total: 173ms	remaining: 32.9ms
42:	learn: 0.1749699	total: 177ms	remaining: 28.8ms
43:	learn: 0.1713299	total: 181ms	remaining: 24.7ms
44:	learn: 0.1677345	total: 185ms	remaining: 20.5ms
45:	learn: 0.1642640	total: 189ms	remaining: 16.4ms
46:	learn: 0.1608601	total: 193ms	remaining: 12.3ms
47:	learn: 0.1574104	total: 197ms	remaining: 8.21ms
48:	learn: 0.1542493	total: 201ms	remaining: 4.1ms
49:	learn: 0.1513588	total: 205ms	remaining: 0us
0:	learn: 0.6616490	total: 4.3ms	remaining: 211ms
1:	learn: 0.6312964	total: 8.6ms	remaining: 206ms
2:	learn: 0.6040348	total: 12.7ms	remaining: 199ms
3:	learn: 0.5788558	total: 16.9ms	remaining: 194ms
4:	learn: 0.5559721	total: 21ms	remaining: 189ms
5:	learn: 0.5323055	total: 25ms	remaining: 184ms
6:	learn: 0.5116102	total: 29.3ms	remaining: 180ms
7:	learn: 0.4910021	total: 33.7ms	remaining: 177ms
8:	learn: 0.4725134	total: 37.7m

In [59]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.96673 (+/- 0.00458)
F1 Score: 0.97120 (+/- 0.00387)
Precision: 0.95748 (+/- 0.00940)
Recall: 0.98535 (+/- 0.00710)
