## Step backward feature selection

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_auc_score, r2_score
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## Read Data

In [2]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [3]:
data.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


### Train - Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),
    data['is_intrusion'],
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

### Remove correlated features

Step Backward Feature Selection takes a long time to run, so to speed it up we will reduce the feature space by removing correlated features first.

In [5]:
# remove correlated features to reduce the feature space

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(X_test, 0.8)
print('correlated features: ', len(set(corr_features)) )

correlated features:  16


In [6]:
corr_features

{'ackdat',
 'ct_dst_ltm',
 'ct_dst_sport_ltm',
 'ct_dst_src_ltm',
 'ct_ftp_cmd',
 'ct_src_dport_ltm',
 'ct_src_ltm',
 'ct_srv_dst',
 'dbytes',
 'dloss',
 'dwin',
 'is_sm_ips_ports',
 'sbytes',
 'sloss',
 'synack',
 'tcprtt'}

In [7]:
# removed correlated  features
X_train.drop(labels=corr_features, axis=1, inplace=True)
X_test.drop(labels=corr_features, axis=1, inplace=True)

X_train.shape, X_test.shape

((140272, 27), (35069, 27))

### Step Backward Feature Selection

In [8]:
# 1. Algorithm to create, in this case RandomForests
# 2. Stopping criteria: 10 features
# 3. Perform step forward or step backward
# 4. Evaluation metric: in this case the roc_auc
# 5. and cross-validation

sfs = SFS(RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=0),
          k_features=15, # the lower the features we want, the longer this will take
          forward=False,
          floating=False,
          verbose=2,
          scoring='roc_auc',
          cv=2)

sfs = sfs.fit(np.array(X_test), y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    9.6s finished

[2021-05-28 11:41:49] Features: 26/15 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:    9.5s finished

[2021-05-28 11:41:59] Features: 25/15 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    8.9s finished

[2021-05-28 11:42:08] Features: 24/15 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining

### Compare performance of feature subsets

In [9]:
def run_randomForests(X_train, X_test, y_train, y_test):
    rf = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    rf.fit(X_train, y_train)

    print('Train set')
    pred = rf.predict_proba(X_train)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_train, pred[:,1])))
    
    print('Test set')
    pred = rf.predict_proba(X_test)
    print('Random Forests roc-auc: {}'.format(roc_auc_score(y_test, pred[:,1])))

In [10]:
selected_feat= X_test.columns[list(sfs.k_feature_idx_)]

selected_feat

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sinpkt', 'dinpkt', 'sjit', 'attack'],
      dtype='object')

In [11]:
# evaluate performance of algorithm built using selected features

run_randomForests(X_train[selected_feat],
                  X_test[selected_feat],
                  y_train, y_test)

Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 1.0


In [12]:
# and for comparison, we train random forests using all features

run_randomForests(X_train,
                  X_test,
                  y_train, y_test)

Train set
Random Forests roc-auc: 1.0
Test set
Random Forests roc-auc: 1.0


Performance, as expected is roughly the same.

In [13]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [14]:
X_train.shape, X_test.shape

((140272, 15), (35069, 15))

## Standardize Data




In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers



In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation


In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression




In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25).fit(X_train, y_train)

CPU times: user 66.1 ms, sys: 174 ms, total: 240 ms
Wall time: 2.04 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.3201688100601671
F1 Score: 0.0031776560605427107
FPR: 0.004979548283834252
TPR: 0.0015950971749989506


### Naive Bayes




In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-08).fit(X_train, y_train)

CPU times: user 434 ms, sys: 20.1 ms, total: 455 ms
Wall time: 56.7 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7532863782828139
F1 Score: 0.7911053165290453
FPR: 0.10777165214298418
TPR: 0.6876967636317844


### Random Forest




In [24]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000).fit(X_train, y_train)

CPU times: user 47.8 s, sys: 700 ms, total: 48.5 s
Wall time: 48.5 s


In [25]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN




In [26]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform').fit(X_train, y_train)

CPU times: user 6.54 s, sys: 34.1 ms, total: 6.57 s
Wall time: 6.55 s


In [27]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.7005047192677293
f1: 0.793601509226325
fpr: 0.6110617108305175
tpr: 0.8475842673047055


### CatBoost




In [28]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.6539535	total: 63.3ms	remaining: 3.1s
1:	learn: 0.6177720	total: 68.4ms	remaining: 1.64s
2:	learn: 0.5842847	total: 74.3ms	remaining: 1.16s
3:	learn: 0.5532509	total: 83ms	remaining: 954ms
4:	learn: 0.5243310	total: 92ms	remaining: 828ms
5:	learn: 0.4973384	total: 100ms	remaining: 735ms
6:	learn: 0.4721644	total: 109ms	remaining: 671ms
7:	learn: 0.4485204	total: 116ms	remaining: 610ms
8:	learn: 0.4263875	total: 125ms	remaining: 572ms
9:	learn: 0.4055842	total: 135ms	remaining: 538ms
10:	learn: 0.3859849	total: 141ms	remaining: 501ms
11:	learn: 0.3675460	total: 152ms	remaining: 481ms
12:	learn: 0.3501459	total: 159ms	remaining: 454ms
13:	learn: 0.3337275	total: 168ms	remaining: 432ms
14:	learn: 0.3182141	total: 176ms	remaining: 412ms
15:	learn: 0.3035524	total: 185ms	remaining: 393ms
16:	learn: 0.2897036	total: 194ms	remaining: 376ms
17:	learn: 0.2765317	total: 198ms	remaining: 353ms
18:	learn: 0.2640811	total: 207ms	remaining: 338ms
19:	learn: 0.2522655	total: 216ms	remaini

In [29]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


## Model Evaluation




In [31]:
import pandas as pd, numpy as np
data = pd.read_csv('../UNSW_Test.csv')
data.shape

(175341, 44)

In [34]:
# Create feature matrix X and target vextor y
y_eval = data['is_intrusion']
X_eval = data.drop(columns=['is_intrusion'])

In [35]:
X_eval = X_eval[selected_feat]

### Model Evaluation - Logistic Regression


In [36]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=0.1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=0.1, n_jobs=-1, random_state=42)

In [37]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [38]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  1.0
Testing accuracy is  0.32014029484730105


In [39]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.32014029484730105
F1 Score: 0.0031775232042812944
Precision Score: 0.4
Recall Score: 0.0015950971749989506
Confusion Matrix:
 [[11189    57]
 [23785    38]]


### Cross validation - Logistic Regression




In [40]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.74896 (+/- 0.14946)
F1 Score: 0.78542 (+/- 0.16879)
Precision: 0.92492 (+/- 0.18265)
Recall: 0.71270 (+/- 0.32901)


### Model Evaluation - Naive Bayes




In [41]:
modelNB = GaussianNB(var_smoothing=1e-09)
modelNB.fit(X_train, y_train)

GaussianNB()

In [42]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [43]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  1.0
Testing accuracy is  0.8078074652827283


In [44]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.8078074652827283
F1 Score: 0.8462941847206386
Precision Score: 0.9264992260448395
Recall Score: 0.7788691600554086
Confusion Matrix:
 [[ 9774  1472]
 [ 5268 18555]]


### Cross validation - Naive Bayes




In [45]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.84096 (+/- 0.13488)
F1 Score: 0.89520 (+/- 0.07464)
Precision: 0.82883 (+/- 0.13226)
Recall: 0.97746 (+/- 0.02265)


### Model Evaluation - Random Forest




In [46]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [47]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [48]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [49]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest





In [50]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN

In [51]:
modelKNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=1)

In [52]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [53]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9998645488764686
Testing accuracy is  0.7005047192677293


In [54]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.7005047192677293
F1 Score: 0.793601509226325
Precision Score: 0.7460833579663021
Recall Score: 0.8475842673047055
Confusion Matrix:
 [[ 4374  6872]
 [ 3631 20192]]


### Cross validation - KNN




In [55]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.91251 (+/- 0.09721)
F1 Score: 0.93835 (+/- 0.06145)
Precision: 0.92718 (+/- 0.14123)
Recall: 0.95457 (+/- 0.05433)


### Model Evaluation - CatBoost




In [56]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.6539535	total: 5.14ms	remaining: 252ms
1:	learn: 0.6177720	total: 10.3ms	remaining: 248ms
2:	learn: 0.5842847	total: 16.4ms	remaining: 256ms
3:	learn: 0.5532509	total: 24.7ms	remaining: 284ms
4:	learn: 0.5243310	total: 32.8ms	remaining: 296ms
5:	learn: 0.4973384	total: 41.4ms	remaining: 304ms
6:	learn: 0.4721644	total: 49.7ms	remaining: 305ms
7:	learn: 0.4485204	total: 56.2ms	remaining: 295ms
8:	learn: 0.4263875	total: 64.4ms	remaining: 294ms
9:	learn: 0.4055842	total: 72.8ms	remaining: 291ms
10:	learn: 0.3859849	total: 78.6ms	remaining: 279ms
11:	learn: 0.3675460	total: 87ms	remaining: 276ms
12:	learn: 0.3501459	total: 93.3ms	remaining: 266ms
13:	learn: 0.3337275	total: 99.9ms	remaining: 257ms
14:	learn: 0.3182141	total: 108ms	remaining: 251ms
15:	learn: 0.3035524	total: 116ms	remaining: 246ms
16:	learn: 0.2897036	total: 124ms	remaining: 241ms
17:	learn: 0.2765317	total: 129ms	remaining: 229ms
18:	learn: 0.2640811	total: 137ms	remaining: 224ms
19:	learn: 0.2522655	total: 1

<catboost.core.CatBoostClassifier at 0x7fcf300edc10>

In [57]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [58]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [59]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - CatBoost



In [60]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.6539528	total: 5.61ms	remaining: 275ms
1:	learn: 0.6177707	total: 11.3ms	remaining: 272ms
2:	learn: 0.5842812	total: 18.5ms	remaining: 290ms
3:	learn: 0.5532393	total: 27.9ms	remaining: 321ms
4:	learn: 0.5243149	total: 36.9ms	remaining: 332ms
5:	learn: 0.4973200	total: 46.7ms	remaining: 342ms
6:	learn: 0.4721364	total: 56.5ms	remaining: 347ms
7:	learn: 0.4484926	total: 64.4ms	remaining: 338ms
8:	learn: 0.4263566	total: 74.1ms	remaining: 338ms
9:	learn: 0.4055508	total: 83.9ms	remaining: 336ms
10:	learn: 0.3859523	total: 90.5ms	remaining: 321ms
11:	learn: 0.3675123	total: 100ms	remaining: 317ms
12:	learn: 0.3501126	total: 108ms	remaining: 306ms
13:	learn: 0.3336939	total: 115ms	remaining: 296ms
14:	learn: 0.3181798	total: 124ms	remaining: 290ms
15:	learn: 0.3035169	total: 134ms	remaining: 285ms
16:	learn: 0.2896647	total: 144ms	remaining: 279ms
17:	learn: 0.2764945	total: 150ms	remaining: 266ms
18:	learn: 0.2640409	total: 160ms	remaining: 261ms
19:	learn: 0.2522231	total: 16

11:	learn: 0.3675229	total: 104ms	remaining: 328ms
12:	learn: 0.3501226	total: 111ms	remaining: 317ms
13:	learn: 0.3337033	total: 119ms	remaining: 307ms
14:	learn: 0.3181898	total: 128ms	remaining: 299ms
15:	learn: 0.3035265	total: 138ms	remaining: 292ms
16:	learn: 0.2896739	total: 148ms	remaining: 287ms
17:	learn: 0.2765033	total: 154ms	remaining: 273ms
18:	learn: 0.2640503	total: 164ms	remaining: 267ms
19:	learn: 0.2522342	total: 173ms	remaining: 260ms
20:	learn: 0.2409880	total: 179ms	remaining: 248ms
21:	learn: 0.2303313	total: 189ms	remaining: 240ms
22:	learn: 0.2201898	total: 196ms	remaining: 231ms
23:	learn: 0.2105711	total: 206ms	remaining: 223ms
24:	learn: 0.2014263	total: 217ms	remaining: 217ms
25:	learn: 0.1927042	total: 227ms	remaining: 209ms
26:	learn: 0.1843757	total: 233ms	remaining: 199ms
27:	learn: 0.1764507	total: 241ms	remaining: 189ms
28:	learn: 0.1689135	total: 251ms	remaining: 182ms
29:	learn: 0.1617231	total: 260ms	remaining: 173ms
30:	learn: 0.1548545	total: 268

24:	learn: 0.2014222	total: 205ms	remaining: 205ms
25:	learn: 0.1927015	total: 215ms	remaining: 199ms
26:	learn: 0.1843732	total: 222ms	remaining: 189ms
27:	learn: 0.1764483	total: 230ms	remaining: 181ms
28:	learn: 0.1689057	total: 239ms	remaining: 173ms
29:	learn: 0.1617154	total: 248ms	remaining: 165ms
30:	learn: 0.1548473	total: 256ms	remaining: 157ms
31:	learn: 0.1483207	total: 265ms	remaining: 149ms
32:	learn: 0.1420882	total: 274ms	remaining: 141ms
33:	learn: 0.1361198	total: 284ms	remaining: 134ms
34:	learn: 0.1304043	total: 290ms	remaining: 124ms
35:	learn: 0.1249539	total: 297ms	remaining: 115ms
36:	learn: 0.1197603	total: 307ms	remaining: 108ms
37:	learn: 0.1147912	total: 316ms	remaining: 99.7ms
38:	learn: 0.1100254	total: 322ms	remaining: 90.7ms
39:	learn: 0.1054741	total: 330ms	remaining: 82.5ms
40:	learn: 0.1011454	total: 340ms	remaining: 74.7ms
41:	learn: 0.0970001	total: 351ms	remaining: 66.8ms
42:	learn: 0.0930210	total: 360ms	remaining: 58.6ms
43:	learn: 0.0892004	tota

36:	learn: 0.1197572	total: 305ms	remaining: 107ms
37:	learn: 0.1147884	total: 315ms	remaining: 99.5ms
38:	learn: 0.1100227	total: 322ms	remaining: 90.8ms
39:	learn: 0.1054715	total: 331ms	remaining: 82.8ms
40:	learn: 0.1011429	total: 341ms	remaining: 74.8ms
41:	learn: 0.0969962	total: 351ms	remaining: 66.8ms
42:	learn: 0.0930173	total: 360ms	remaining: 58.6ms
43:	learn: 0.0891969	total: 366ms	remaining: 49.8ms
44:	learn: 0.0855588	total: 375ms	remaining: 41.7ms
45:	learn: 0.0820662	total: 383ms	remaining: 33.3ms
46:	learn: 0.0787217	total: 391ms	remaining: 24.9ms
47:	learn: 0.0755242	total: 401ms	remaining: 16.7ms
48:	learn: 0.0724636	total: 411ms	remaining: 8.39ms
49:	learn: 0.0695413	total: 420ms	remaining: 0us
0:	learn: 0.6539528	total: 5.77ms	remaining: 283ms
1:	learn: 0.6177707	total: 11.4ms	remaining: 272ms
2:	learn: 0.5842812	total: 18ms	remaining: 283ms
3:	learn: 0.5532393	total: 27.1ms	remaining: 311ms
4:	learn: 0.5243149	total: 36ms	remaining: 324ms
5:	learn: 0.4973200	total

48:	learn: 0.0724276	total: 405ms	remaining: 8.27ms
49:	learn: 0.0694897	total: 413ms	remaining: 0us
0:	learn: 0.6539528	total: 6.02ms	remaining: 295ms
1:	learn: 0.6177707	total: 12.2ms	remaining: 292ms
2:	learn: 0.5842812	total: 19.4ms	remaining: 304ms
3:	learn: 0.5532397	total: 28.8ms	remaining: 331ms
4:	learn: 0.5243206	total: 37.8ms	remaining: 340ms
5:	learn: 0.4973275	total: 47.5ms	remaining: 349ms
6:	learn: 0.4721508	total: 57.4ms	remaining: 353ms
7:	learn: 0.4485058	total: 65.2ms	remaining: 342ms
8:	learn: 0.4263688	total: 74.6ms	remaining: 340ms
9:	learn: 0.4055627	total: 84.6ms	remaining: 338ms
10:	learn: 0.3859635	total: 91.4ms	remaining: 324ms
11:	learn: 0.3675229	total: 101ms	remaining: 320ms
12:	learn: 0.3501226	total: 109ms	remaining: 310ms
13:	learn: 0.3337033	total: 117ms	remaining: 301ms
14:	learn: 0.3181898	total: 126ms	remaining: 294ms
15:	learn: 0.3035265	total: 136ms	remaining: 288ms
16:	learn: 0.2896739	total: 145ms	remaining: 281ms
17:	learn: 0.2765033	total: 150

10:	learn: 0.3859562	total: 88.6ms	remaining: 314ms
11:	learn: 0.3675158	total: 99ms	remaining: 313ms
12:	learn: 0.3501160	total: 107ms	remaining: 303ms
13:	learn: 0.3336971	total: 115ms	remaining: 295ms
14:	learn: 0.3181834	total: 123ms	remaining: 288ms
15:	learn: 0.3035205	total: 132ms	remaining: 282ms
16:	learn: 0.2896679	total: 142ms	remaining: 276ms
17:	learn: 0.2764975	total: 148ms	remaining: 264ms
18:	learn: 0.2640450	total: 158ms	remaining: 257ms
19:	learn: 0.2522288	total: 167ms	remaining: 250ms
20:	learn: 0.2409829	total: 173ms	remaining: 238ms
21:	learn: 0.2303260	total: 182ms	remaining: 231ms
22:	learn: 0.2201848	total: 189ms	remaining: 222ms
23:	learn: 0.2105665	total: 198ms	remaining: 215ms
24:	learn: 0.2014222	total: 209ms	remaining: 209ms
25:	learn: 0.1927015	total: 219ms	remaining: 202ms
26:	learn: 0.1843732	total: 225ms	remaining: 192ms
27:	learn: 0.1764483	total: 232ms	remaining: 182ms
28:	learn: 0.1689057	total: 241ms	remaining: 175ms
29:	learn: 0.1617154	total: 250

24:	learn: 0.2014232	total: 205ms	remaining: 205ms
25:	learn: 0.1927028	total: 215ms	remaining: 199ms
26:	learn: 0.1843744	total: 222ms	remaining: 189ms
27:	learn: 0.1764494	total: 230ms	remaining: 181ms
28:	learn: 0.1689068	total: 240ms	remaining: 174ms
29:	learn: 0.1617177	total: 249ms	remaining: 166ms
30:	learn: 0.1548493	total: 257ms	remaining: 158ms
31:	learn: 0.1483222	total: 267ms	remaining: 150ms
32:	learn: 0.1420859	total: 276ms	remaining: 142ms
33:	learn: 0.1361175	total: 285ms	remaining: 134ms
34:	learn: 0.1304021	total: 290ms	remaining: 124ms
35:	learn: 0.1249516	total: 297ms	remaining: 116ms
36:	learn: 0.1197572	total: 306ms	remaining: 108ms
37:	learn: 0.1147884	total: 315ms	remaining: 99.4ms
38:	learn: 0.1100227	total: 321ms	remaining: 90.5ms
39:	learn: 0.1054715	total: 328ms	remaining: 82.1ms
40:	learn: 0.1011429	total: 338ms	remaining: 74.2ms
41:	learn: 0.0969962	total: 348ms	remaining: 66.2ms
42:	learn: 0.0930173	total: 357ms	remaining: 58ms
43:	learn: 0.0891969	total:

35:	learn: 0.1248862	total: 300ms	remaining: 117ms
36:	learn: 0.1196874	total: 310ms	remaining: 109ms
37:	learn: 0.1147040	total: 316ms	remaining: 99.9ms
38:	learn: 0.1099532	total: 326ms	remaining: 91.9ms
39:	learn: 0.1053978	total: 333ms	remaining: 83.2ms
40:	learn: 0.1010612	total: 343ms	remaining: 75.3ms
41:	learn: 0.0968932	total: 349ms	remaining: 66.4ms
42:	learn: 0.0929058	total: 355ms	remaining: 57.7ms
43:	learn: 0.0891127	total: 364ms	remaining: 49.7ms
44:	learn: 0.0854796	total: 375ms	remaining: 41.6ms
45:	learn: 0.0819981	total: 385ms	remaining: 33.5ms
46:	learn: 0.0786692	total: 394ms	remaining: 25.1ms
47:	learn: 0.0754880	total: 404ms	remaining: 16.8ms
48:	learn: 0.0724276	total: 414ms	remaining: 8.45ms
49:	learn: 0.0694897	total: 422ms	remaining: 0us
0:	learn: 0.6539528	total: 5.99ms	remaining: 294ms
1:	learn: 0.6177707	total: 11.7ms	remaining: 280ms
2:	learn: 0.5842812	total: 18.8ms	remaining: 294ms
3:	learn: 0.5532397	total: 28.2ms	remaining: 325ms
4:	learn: 0.5243206	t

0:	learn: 0.6539528	total: 5.39ms	remaining: 264ms
1:	learn: 0.6177707	total: 10.7ms	remaining: 256ms
2:	learn: 0.5842812	total: 17.2ms	remaining: 269ms
3:	learn: 0.5532397	total: 26.2ms	remaining: 302ms
4:	learn: 0.5243147	total: 34.9ms	remaining: 314ms
5:	learn: 0.4973201	total: 44.3ms	remaining: 325ms
6:	learn: 0.4721390	total: 53.6ms	remaining: 329ms
7:	learn: 0.4484950	total: 60.7ms	remaining: 319ms
8:	learn: 0.4263606	total: 70ms	remaining: 319ms
9:	learn: 0.4055552	total: 79.9ms	remaining: 320ms
10:	learn: 0.3859562	total: 86.8ms	remaining: 308ms
11:	learn: 0.3675158	total: 96ms	remaining: 304ms
12:	learn: 0.3501160	total: 103ms	remaining: 293ms
13:	learn: 0.3336971	total: 110ms	remaining: 283ms
14:	learn: 0.3181834	total: 118ms	remaining: 276ms
15:	learn: 0.3035205	total: 128ms	remaining: 272ms
16:	learn: 0.2896679	total: 138ms	remaining: 268ms
17:	learn: 0.2764975	total: 144ms	remaining: 256ms
18:	learn: 0.2640450	total: 153ms	remaining: 250ms
19:	learn: 0.2522288	total: 163ms

11:	learn: 0.3675179	total: 95.6ms	remaining: 303ms
12:	learn: 0.3501179	total: 103ms	remaining: 293ms
13:	learn: 0.3336989	total: 111ms	remaining: 286ms
14:	learn: 0.3181846	total: 120ms	remaining: 281ms
15:	learn: 0.3035215	total: 130ms	remaining: 276ms
16:	learn: 0.2896690	total: 139ms	remaining: 270ms
17:	learn: 0.2764986	total: 145ms	remaining: 257ms
18:	learn: 0.2640460	total: 154ms	remaining: 251ms
19:	learn: 0.2522297	total: 163ms	remaining: 245ms
20:	learn: 0.2409838	total: 169ms	remaining: 233ms
21:	learn: 0.2303278	total: 177ms	remaining: 226ms
22:	learn: 0.2201865	total: 185ms	remaining: 217ms
23:	learn: 0.2105681	total: 193ms	remaining: 209ms
24:	learn: 0.2014232	total: 204ms	remaining: 204ms
25:	learn: 0.1927028	total: 214ms	remaining: 197ms
26:	learn: 0.1843744	total: 220ms	remaining: 187ms
27:	learn: 0.1764494	total: 227ms	remaining: 179ms
28:	learn: 0.1689068	total: 236ms	remaining: 171ms
29:	learn: 0.1617177	total: 245ms	remaining: 163ms
30:	learn: 0.1548493	total: 25

24:	learn: 0.2013906	total: 203ms	remaining: 203ms
25:	learn: 0.1926496	total: 210ms	remaining: 194ms
26:	learn: 0.1843526	total: 221ms	remaining: 188ms
27:	learn: 0.1764355	total: 231ms	remaining: 182ms
28:	learn: 0.1688781	total: 239ms	remaining: 173ms
29:	learn: 0.1616667	total: 245ms	remaining: 163ms
30:	learn: 0.1548050	total: 254ms	remaining: 156ms
31:	learn: 0.1482512	total: 262ms	remaining: 148ms
32:	learn: 0.1419940	total: 271ms	remaining: 139ms
33:	learn: 0.1360244	total: 281ms	remaining: 132ms
34:	learn: 0.1303364	total: 290ms	remaining: 124ms
35:	learn: 0.1248862	total: 297ms	remaining: 116ms
36:	learn: 0.1196874	total: 306ms	remaining: 108ms
37:	learn: 0.1147040	total: 312ms	remaining: 98.5ms
38:	learn: 0.1099532	total: 321ms	remaining: 90.5ms
39:	learn: 0.1053978	total: 327ms	remaining: 81.6ms
40:	learn: 0.1010612	total: 336ms	remaining: 73.7ms
41:	learn: 0.0968932	total: 342ms	remaining: 65.1ms
42:	learn: 0.0929058	total: 347ms	remaining: 56.5ms
43:	learn: 0.0891127	tota

47:	learn: 0.0755254	total: 413ms	remaining: 17.2ms
48:	learn: 0.0724647	total: 422ms	remaining: 8.62ms
49:	learn: 0.0695429	total: 432ms	remaining: 0us
0:	learn: 0.6539528	total: 5.59ms	remaining: 274ms
1:	learn: 0.6177707	total: 11ms	remaining: 264ms
2:	learn: 0.5842812	total: 17.9ms	remaining: 281ms
3:	learn: 0.5532397	total: 27.1ms	remaining: 312ms
4:	learn: 0.5243147	total: 35.7ms	remaining: 321ms
5:	learn: 0.4973201	total: 44.7ms	remaining: 328ms
6:	learn: 0.4721390	total: 53.5ms	remaining: 329ms
7:	learn: 0.4484950	total: 60.5ms	remaining: 318ms
8:	learn: 0.4263606	total: 69.5ms	remaining: 316ms
9:	learn: 0.4055552	total: 78.7ms	remaining: 315ms
10:	learn: 0.3859562	total: 85.4ms	remaining: 303ms
11:	learn: 0.3675158	total: 94.9ms	remaining: 301ms
12:	learn: 0.3501160	total: 103ms	remaining: 292ms
13:	learn: 0.3336971	total: 110ms	remaining: 284ms
14:	learn: 0.3181834	total: 119ms	remaining: 278ms
15:	learn: 0.3035205	total: 128ms	remaining: 272ms
16:	learn: 0.2896679	total: 137

11:	learn: 0.3675179	total: 97.6ms	remaining: 309ms
12:	learn: 0.3501179	total: 106ms	remaining: 300ms
13:	learn: 0.3336989	total: 114ms	remaining: 292ms
14:	learn: 0.3181846	total: 123ms	remaining: 286ms
15:	learn: 0.3035215	total: 132ms	remaining: 281ms
16:	learn: 0.2896690	total: 142ms	remaining: 276ms
17:	learn: 0.2764986	total: 148ms	remaining: 262ms
18:	learn: 0.2640460	total: 157ms	remaining: 257ms
19:	learn: 0.2522297	total: 167ms	remaining: 251ms
20:	learn: 0.2409838	total: 174ms	remaining: 240ms
21:	learn: 0.2303278	total: 183ms	remaining: 233ms
22:	learn: 0.2201865	total: 192ms	remaining: 225ms
23:	learn: 0.2105681	total: 202ms	remaining: 218ms
24:	learn: 0.2014232	total: 212ms	remaining: 212ms
25:	learn: 0.1927028	total: 222ms	remaining: 205ms
26:	learn: 0.1843744	total: 229ms	remaining: 195ms
27:	learn: 0.1764494	total: 236ms	remaining: 186ms
28:	learn: 0.1689068	total: 245ms	remaining: 178ms
29:	learn: 0.1617177	total: 255ms	remaining: 170ms
30:	learn: 0.1548493	total: 26

In [61]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)
