## Quasi-constant features

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

## Read Data

In [7]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [8]:
data.head(5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


### Train - Test Split

In [9]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),       # drop the target
    data['is_intrusion'],                             # just the target
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((140272, 43), (35069, 43))

## Remove constant features (optional)

First, I will remove constant features like I did in the previous lecture. This will allow a better visualisation of the quasi-constant ones.

In [10]:
constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0.01]

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)
X_train.shape, X_test.shape

((140272, 43), (35069, 43))

## Remove quasi-constant features

### Using the VarianceThreshold from sklearn

In [11]:
sel = VarianceThreshold(threshold=0.03)
sel.fit(X_train)                         # fit finds the features with low variance

VarianceThreshold(threshold=0.03)

In [12]:
# If we sum over get_support, we get the number of features that are not quasi-constant
sum(sel.get_support())

37

In [13]:
# let's print the number of quasi-constant features
quasi_constant = X_train.columns[~sel.get_support()]
len(quasi_constant)

6

We can see that 1 columns / variables are constant. This means that 6 variable show the same value, just one value, for all the observations of the training set.

In [14]:
# let's print the variable names
quasi_constant

Index(['tcprtt', 'synack', 'ackdat', 'is_ftp_login', 'ct_ftp_cmd',
       'is_sm_ips_ports'],
      dtype='object')

In [16]:
# percentage of observations showing each of the different values of the variable
X_train['is_ftp_login'].value_counts() / np.float64(len(X_train))

0    0.985471
1    0.014401
4    0.000093
2    0.000036
Name: is_ftp_login, dtype: float64

We can see that > 98,5% of the observations show one value, 0. Therefore, this features is fairly constant.

In [17]:
# capture feature names
feat_names = X_train.columns[sel.get_support()]

In [18]:
#remove the quasi-constant features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
X_train.shape, X_test.shape

((140272, 37), (35069, 37))

In [19]:
# transform the array into a dataframe
X_train = pd.DataFrame(X_train, columns=feat_names)
X_test = pd.DataFrame(X_test, columns=feat_names)
X_test.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,attack
0,1.157229,113.0,5.0,2.0,10.0,8.0,966.0,354.0,14.690265,254.0,...,1.0,1.0,1.0,1.0,1.0,5.0,0.0,1.0,1.0,1.0
1,4e-06,119.0,2.0,3.0,2.0,0.0,114.0,0.0,250000.0006,254.0,...,14.0,2.0,15.0,14.0,14.0,14.0,0.0,14.0,14.0,1.0
2,0.212211,113.0,0.0,0.0,6.0,2.0,986.0,86.0,32.986038,62.0,...,7.0,3.0,4.0,4.0,1.0,7.0,0.0,5.0,7.0,0.0
3,9e-06,120.0,0.0,3.0,2.0,0.0,200.0,0.0,111111.1072,254.0,...,4.0,2.0,3.0,3.0,3.0,4.0,0.0,3.0,4.0,1.0
4,1.157823,113.0,5.0,2.0,10.0,8.0,526.0,354.0,14.682728,254.0,...,2.0,1.0,1.0,2.0,1.0,1.0,1.0,19.0,1.0,1.0


In the dataset UNSW-NB15, 6 feature are classified as Quasi constant, thus, 37 features remain

## Standardize Data

In [20]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers

In [21]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [22]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [23]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25).fit(X_train, y_train)

CPU times: user 95.8 ms, sys: 197 ms, total: 293 ms
Wall time: 3.46 s


In [24]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.33265847329550313
F1 Score: 0.03900956760974007
FPR: 0.0048906277787657835
TPR: 0.019938714687486883


### Naive Bayes

In [26]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-08).fit(X_train, y_train)

CPU times: user 81 ms, sys: 5.66 ms, total: 86.6 ms
Wall time: 84.7 ms


In [27]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7423365365422453
F1 Score: 0.7751791401273885
FPR: 0.07033611950915881
TPR: 0.653905889266675


### Random Forest

In [28]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000).fit(X_train, y_train)

CPU times: user 1min 22s, sys: 706 ms, total: 1min 23s
Wall time: 1min 23s


In [29]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN

In [30]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform').fit(X_train, y_train)

CPU times: user 23.4 s, sys: 174 ms, total: 23.6 s
Wall time: 23.4 s


In [31]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.6842510479340729
f1: 0.7391703766518267
fpr: 0.26142628490129827
tpr: 0.6586072283087773


### CatBoost

In [32]:
%%time
clf_CB = CatBoostClassifier(depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5203762	total: 82.5ms	remaining: 4.04s
1:	learn: 0.3863715	total: 103ms	remaining: 2.47s
2:	learn: 0.2883336	total: 125ms	remaining: 1.97s
3:	learn: 0.2130400	total: 148ms	remaining: 1.7s
4:	learn: 0.1581939	total: 169ms	remaining: 1.52s
5:	learn: 0.1214909	total: 190ms	remaining: 1.39s
6:	learn: 0.0909181	total: 211ms	remaining: 1.3s
7:	learn: 0.0686590	total: 233ms	remaining: 1.22s
8:	learn: 0.0527265	total: 254ms	remaining: 1.16s
9:	learn: 0.0403258	total: 275ms	remaining: 1.1s
10:	learn: 0.0305359	total: 296ms	remaining: 1.05s
11:	learn: 0.0236932	total: 318ms	remaining: 1.01s
12:	learn: 0.0185799	total: 342ms	remaining: 973ms
13:	learn: 0.0146775	total: 364ms	remaining: 936ms
14:	learn: 0.0118577	total: 385ms	remaining: 899ms
15:	learn: 0.0093696	total: 405ms	remaining: 861ms
16:	learn: 0.0076554	total: 426ms	remaining: 827ms
17:	learn: 0.0063683	total: 447ms	remaining: 795ms
18:	learn: 0.0052865	total: 469ms	remaining: 765ms
19:	learn: 0.0044068	total: 489ms	remaining

In [33]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.8090402771174354
FPR: 1.0
TPR: 1.0


## Model Evaluation

In [34]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../UNSW_Test.csv")
test_df.shape

(175341, 44)

In [35]:
test_df.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,113.0,0.0,2.0,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,113.0,0.0,2.0,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,113.0,0.0,2.0,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,113.0,3.0,2.0,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,113.0,0.0,2.0,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


In [36]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion','tcprtt','synack','ackdat','is_ftp_login','ct_ftp_cmd',
       'is_sm_ips_ports'])

In [37]:
X_eval.shape

(175341, 37)

### Model Evaluation - Logistic Regression

In [38]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25)
modelLR.fit(X_train, y_train)

LogisticRegression(C=25, n_jobs=-1, random_state=42)

In [39]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [40]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  1.0
Testing accuracy is  0.33265847329550313


In [41]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.33265847329550313
F1 Score: 0.03900956760974007
Precision Score: 0.8962264150943396
Recall Score: 0.019938714687486883
Confusion Matrix:
 [[11191    55]
 [23348   475]]


### Cross validation - Logistic Regression

In [42]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.84997 (+/- 0.14701)
F1 Score: 0.90011 (+/- 0.08196)
Precision: 0.84748 (+/- 0.15547)
Recall: 0.96586 (+/- 0.03799)


### Model Evaluation - Naive Bayes


In [43]:
modelNB = GaussianNB(var_smoothing=1e-08)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-08)

In [44]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [45]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  1.0
Testing accuracy is  0.7423365365422453


In [46]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.7423365365422453
F1 Score: 0.7751791401273885
Precision Score: 0.9516769503329464
Recall Score: 0.653905889266675
Confusion Matrix:
 [[10455   791]
 [ 8245 15578]]


### Cross validation - Naive Bayes

In [47]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.81069 (+/- 0.14598)
F1 Score: 0.87452 (+/- 0.07697)
Precision: 0.81743 (+/- 0.15330)
Recall: 0.94776 (+/- 0.06318)


### Model Evaluation - Random Forest


In [48]:
modelRF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=0)

In [49]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [50]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [51]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest

In [64]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=5, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN

In [53]:
modelKNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=1)

In [54]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [55]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9995366145773925
Testing accuracy is  0.6842510479340729


In [56]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.6842510479340729
F1 Score: 0.7391703766518267
Precision Score: 0.8421900161030595
Recall Score: 0.6586072283087773
Confusion Matrix:
 [[ 8306  2940]
 [ 8133 15690]]


### Cross validation - KNN


In [57]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.87901 (+/- 0.13180)
F1 Score: 0.91584 (+/- 0.07682)
Precision: 0.90359 (+/- 0.17653)
Recall: 0.93684 (+/- 0.07517)


### Model Evaluation - CatBoost

In [58]:
modelCB = CatBoostClassifier(depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5203762	total: 20.7ms	remaining: 1.01s
1:	learn: 0.3863715	total: 40.3ms	remaining: 967ms
2:	learn: 0.2883336	total: 61.1ms	remaining: 958ms
3:	learn: 0.2130400	total: 81.2ms	remaining: 934ms
4:	learn: 0.1581939	total: 101ms	remaining: 912ms
5:	learn: 0.1214909	total: 122ms	remaining: 896ms
6:	learn: 0.0909181	total: 143ms	remaining: 878ms
7:	learn: 0.0686590	total: 164ms	remaining: 861ms
8:	learn: 0.0527265	total: 185ms	remaining: 843ms
9:	learn: 0.0403258	total: 206ms	remaining: 823ms
10:	learn: 0.0305359	total: 225ms	remaining: 799ms
11:	learn: 0.0236932	total: 246ms	remaining: 778ms
12:	learn: 0.0185799	total: 268ms	remaining: 762ms
13:	learn: 0.0146775	total: 288ms	remaining: 741ms
14:	learn: 0.0118577	total: 309ms	remaining: 721ms
15:	learn: 0.0093696	total: 328ms	remaining: 698ms
16:	learn: 0.0076554	total: 349ms	remaining: 677ms
17:	learn: 0.0063683	total: 368ms	remaining: 655ms
18:	learn: 0.0052865	total: 389ms	remaining: 635ms
19:	learn: 0.0044068	total: 410ms	rem

<catboost.core.CatBoostClassifier at 0x7fd7ca41c5b0>

In [59]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [60]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [61]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - CatBoost

In [65]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=5, scoring='recall')

0:	learn: 0.5200986	total: 19.7ms	remaining: 967ms
1:	learn: 0.3858020	total: 39.4ms	remaining: 945ms
2:	learn: 0.2877645	total: 60.5ms	remaining: 947ms
3:	learn: 0.2125625	total: 80.4ms	remaining: 924ms
4:	learn: 0.1577542	total: 100ms	remaining: 904ms
5:	learn: 0.1210598	total: 123ms	remaining: 901ms
6:	learn: 0.0906504	total: 143ms	remaining: 881ms
7:	learn: 0.0664036	total: 159ms	remaining: 836ms
8:	learn: 0.0506660	total: 179ms	remaining: 815ms
9:	learn: 0.0386760	total: 199ms	remaining: 796ms
10:	learn: 0.0298533	total: 220ms	remaining: 780ms
11:	learn: 0.0231533	total: 242ms	remaining: 766ms
12:	learn: 0.0183607	total: 260ms	remaining: 739ms
13:	learn: 0.0144009	total: 279ms	remaining: 718ms
14:	learn: 0.0117596	total: 300ms	remaining: 699ms
15:	learn: 0.0094999	total: 319ms	remaining: 678ms
16:	learn: 0.0078327	total: 340ms	remaining: 660ms
17:	learn: 0.0065094	total: 360ms	remaining: 640ms
18:	learn: 0.0055002	total: 380ms	remaining: 619ms
19:	learn: 0.0045881	total: 400ms	rem

20:	learn: 0.0039092	total: 442ms	remaining: 610ms
21:	learn: 0.0033770	total: 462ms	remaining: 588ms
22:	learn: 0.0028857	total: 484ms	remaining: 568ms
23:	learn: 0.0025061	total: 504ms	remaining: 546ms
24:	learn: 0.0021701	total: 523ms	remaining: 523ms
25:	learn: 0.0018741	total: 544ms	remaining: 502ms
26:	learn: 0.0016589	total: 565ms	remaining: 481ms
27:	learn: 0.0014748	total: 585ms	remaining: 460ms
28:	learn: 0.0013155	total: 606ms	remaining: 439ms
29:	learn: 0.0011931	total: 625ms	remaining: 417ms
30:	learn: 0.0010718	total: 646ms	remaining: 396ms
31:	learn: 0.0009572	total: 668ms	remaining: 376ms
32:	learn: 0.0008661	total: 688ms	remaining: 354ms
33:	learn: 0.0007830	total: 706ms	remaining: 332ms
34:	learn: 0.0007286	total: 725ms	remaining: 311ms
35:	learn: 0.0006750	total: 743ms	remaining: 289ms
36:	learn: 0.0006253	total: 762ms	remaining: 268ms
37:	learn: 0.0005802	total: 780ms	remaining: 246ms
38:	learn: 0.0005386	total: 798ms	remaining: 225ms
39:	learn: 0.0005023	total: 816

41:	learn: 0.0004834	total: 839ms	remaining: 160ms
42:	learn: 0.0004515	total: 858ms	remaining: 140ms
43:	learn: 0.0004230	total: 879ms	remaining: 120ms
44:	learn: 0.0004013	total: 898ms	remaining: 99.8ms
45:	learn: 0.0003775	total: 918ms	remaining: 79.8ms
46:	learn: 0.0003600	total: 937ms	remaining: 59.8ms
47:	learn: 0.0003421	total: 955ms	remaining: 39.8ms
48:	learn: 0.0003257	total: 972ms	remaining: 19.8ms
49:	learn: 0.0003160	total: 991ms	remaining: 0us
0:	learn: 0.5202632	total: 21ms	remaining: 1.03s
1:	learn: 0.3860870	total: 41.2ms	remaining: 989ms
2:	learn: 0.2883550	total: 62.4ms	remaining: 977ms
3:	learn: 0.2129820	total: 82.6ms	remaining: 950ms
4:	learn: 0.1581037	total: 103ms	remaining: 927ms
5:	learn: 0.1202168	total: 125ms	remaining: 916ms
6:	learn: 0.0900459	total: 146ms	remaining: 894ms
7:	learn: 0.0679688	total: 167ms	remaining: 874ms
8:	learn: 0.0517948	total: 187ms	remaining: 853ms
9:	learn: 0.0396378	total: 208ms	remaining: 832ms
10:	learn: 0.0300473	total: 229ms	re

10:	learn: 0.0298533	total: 224ms	remaining: 795ms
11:	learn: 0.0231533	total: 244ms	remaining: 774ms
12:	learn: 0.0183607	total: 263ms	remaining: 748ms
13:	learn: 0.0144009	total: 282ms	remaining: 725ms
14:	learn: 0.0117596	total: 304ms	remaining: 708ms
15:	learn: 0.0094999	total: 324ms	remaining: 689ms
16:	learn: 0.0078327	total: 344ms	remaining: 669ms
17:	learn: 0.0065094	total: 366ms	remaining: 650ms
18:	learn: 0.0055002	total: 386ms	remaining: 630ms
19:	learn: 0.0045881	total: 408ms	remaining: 612ms
20:	learn: 0.0038626	total: 426ms	remaining: 588ms
21:	learn: 0.0032867	total: 446ms	remaining: 568ms
22:	learn: 0.0027596	total: 465ms	remaining: 546ms
23:	learn: 0.0023557	total: 485ms	remaining: 526ms
24:	learn: 0.0020347	total: 506ms	remaining: 506ms
25:	learn: 0.0017508	total: 526ms	remaining: 485ms
26:	learn: 0.0015440	total: 547ms	remaining: 466ms
27:	learn: 0.0013531	total: 568ms	remaining: 446ms
28:	learn: 0.0011952	total: 587ms	remaining: 425ms
29:	learn: 0.0010533	total: 608

31:	learn: 0.0009572	total: 661ms	remaining: 372ms
32:	learn: 0.0008661	total: 681ms	remaining: 351ms
33:	learn: 0.0007830	total: 700ms	remaining: 330ms
34:	learn: 0.0007286	total: 719ms	remaining: 308ms
35:	learn: 0.0006750	total: 738ms	remaining: 287ms
36:	learn: 0.0006253	total: 756ms	remaining: 266ms
37:	learn: 0.0005802	total: 775ms	remaining: 245ms
38:	learn: 0.0005386	total: 794ms	remaining: 224ms
39:	learn: 0.0005023	total: 812ms	remaining: 203ms
40:	learn: 0.0004725	total: 830ms	remaining: 182ms
41:	learn: 0.0004430	total: 849ms	remaining: 162ms
42:	learn: 0.0004192	total: 870ms	remaining: 142ms
43:	learn: 0.0003974	total: 888ms	remaining: 121ms
44:	learn: 0.0003764	total: 906ms	remaining: 101ms
45:	learn: 0.0003580	total: 923ms	remaining: 80.3ms
46:	learn: 0.0003412	total: 940ms	remaining: 60ms
47:	learn: 0.0003247	total: 957ms	remaining: 39.9ms
48:	learn: 0.0003098	total: 975ms	remaining: 19.9ms
49:	learn: 0.0002961	total: 991ms	remaining: 0us
0:	learn: 0.5189207	total: 19.9

0:	learn: 0.5202632	total: 21ms	remaining: 1.03s
1:	learn: 0.3860870	total: 41ms	remaining: 985ms
2:	learn: 0.2883550	total: 62.1ms	remaining: 972ms
3:	learn: 0.2129820	total: 82.9ms	remaining: 954ms
4:	learn: 0.1581037	total: 103ms	remaining: 930ms
5:	learn: 0.1202168	total: 124ms	remaining: 909ms
6:	learn: 0.0900459	total: 144ms	remaining: 886ms
7:	learn: 0.0679688	total: 165ms	remaining: 866ms
8:	learn: 0.0517948	total: 186ms	remaining: 848ms
9:	learn: 0.0396378	total: 207ms	remaining: 828ms
10:	learn: 0.0300473	total: 228ms	remaining: 808ms
11:	learn: 0.0234476	total: 249ms	remaining: 787ms
12:	learn: 0.0185541	total: 271ms	remaining: 771ms
13:	learn: 0.0146670	total: 292ms	remaining: 752ms
14:	learn: 0.0117866	total: 315ms	remaining: 734ms
15:	learn: 0.0094116	total: 336ms	remaining: 714ms
16:	learn: 0.0076359	total: 357ms	remaining: 694ms
17:	learn: 0.0062937	total: 379ms	remaining: 673ms
18:	learn: 0.0052892	total: 400ms	remaining: 653ms
19:	learn: 0.0043827	total: 422ms	remaini

In [66]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)
