## Quasi-constant features

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

## Read Data

In [2]:
data = pd.read_csv('../Kyoto_Train.csv')
data.shape

(124055, 24)

In [3]:
data.head(5)

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,2.863309,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,90847.0,14024.0,25836.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,218531.0,8968.0,45541.0,1.0,0.0
2,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,176665.0,15997.0,59860.0,1.0,0.0
3,0.0,0,0,0,0.0,0.0,0.67,49,100,0.02,...,6.0,6.0,0.0,0.0,0.0,52769.0,473.0,40649.0,1.0,0.0
4,0.0,0,0,1,1.0,0.0,0.36,0,2,0.0,...,6.0,0.0,0.0,0.0,0.0,65048.0,16609.0,39283.0,1.0,0.0


### Train - Test Split

In [4]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['Label_code'], axis=1),       # drop the target
    data['Label_code'],                             # just the target
    test_size=0.2,
    random_state=0)

X_train.shape, X_test.shape

((99244, 23), (24811, 23))

## Remove constant features (optional)

First, I will remove constant features like I did in the previous lecture. This will allow a better visualisation of the quasi-constant ones.

In [5]:
constant_features = [
    feat for feat in X_train.columns if X_train[feat].std() == 0.01]

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True)
X_train.shape, X_test.shape

((99244, 23), (24811, 23))

## Remove quasi-constant features

### Using the VarianceThreshold from sklearn

In [6]:
sel = VarianceThreshold(threshold=0.03)
sel.fit(X_train)                         # fit finds the features with low variance

VarianceThreshold(threshold=0.03)

In [7]:
# If we sum over get_support, we get the number of features that are not quasi-constant
sum(sel.get_support())

22

In [8]:
# let's print the number of quasi-constant features
quasi_constant = X_train.columns[~sel.get_support()]
len(quasi_constant)

1

In [9]:
~sel.get_support()

array([False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

We can see that 1 columns / variables are constant. This means that 22 variable show the same value, just one value, for all the observations of the training set.

In [10]:
# let's print the variable names
quasi_constant

Index(['Dst_host_same_port_rate'], dtype='object')

In [11]:
# percentage of observations showing each of the different values of the variable
X_train['Dst_host_same_port_rate'].value_counts() / np.float64(len(X_train))

0.00    0.959111
1.00    0.022006
0.02    0.003920
0.03    0.003436
0.04    0.001179
0.05    0.001129
0.01    0.000987
0.07    0.000887
0.33    0.000877
0.06    0.000867
0.08    0.000776
0.20    0.000665
0.50    0.000645
0.11    0.000484
0.09    0.000453
0.14    0.000403
0.25    0.000383
0.17    0.000343
0.12    0.000322
0.10    0.000322
0.40    0.000131
0.67    0.000101
0.43    0.000091
0.29    0.000060
0.15    0.000060
0.75    0.000040
0.27    0.000040
0.83    0.000030
0.38    0.000030
0.13    0.000030
0.24    0.000020
0.80    0.000020
0.18    0.000020
0.16    0.000020
0.22    0.000010
0.28    0.000010
0.92    0.000010
0.69    0.000010
0.59    0.000010
0.19    0.000010
0.55    0.000010
0.60    0.000010
0.48    0.000010
0.72    0.000010
0.57    0.000010
Name: Dst_host_same_port_rate, dtype: float64

We can see that > 99% of the observations show one value, 0. Therefore, this features is fairly constant.

In [12]:
# capture feature names
feat_names = X_train.columns[sel.get_support()]

In [13]:
#remove the quasi-constant features
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
X_train.shape, X_test.shape

((99244, 22), (24811, 22))

In [14]:
# transform the array into a dataframe
X_train = pd.DataFrame(X_train, columns=feat_names)
X_test = pd.DataFrame(X_test, columns=feat_names)
X_test.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_serror_rate,...,Destination_Port_Number,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1433.0,6.0,6.0,0.0,0.0,0.0,278791.0,20421.0,4054.0,1.0
1,0.000429,65.0,118.0,89.0,1.0,0.0,0.01,99.0,99.0,0.0,...,53.0,1.0,10.0,0.0,0.0,0.0,255086.0,5573.0,41748.0,2.0
2,3.221504,505.0,1960.0,0.0,0.0,0.0,0.0,16.0,16.0,0.0,...,22.0,13.0,2.0,255.0,0.0,0.0,149516.0,9880.0,33823.0,1.0
3,0.000464,46.0,77.0,23.0,1.0,0.04,0.14,86.0,88.0,0.0,...,53.0,1.0,10.0,0.0,0.0,0.0,255086.0,5573.0,77774.0,2.0
4,0.000518,43.0,100.0,27.0,1.0,0.0,0.0,40.0,100.0,0.0,...,53.0,1.0,10.0,0.0,0.0,0.0,210104.0,5573.0,1191.0,2.0


## Standardize Data

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Classifiers

In [16]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [18]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1).fit(X_train, y_train)

CPU times: user 77.8 ms, sys: 211 ms, total: 289 ms
Wall time: 3.28 s


In [19]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.42787473298133893
F1 Score: 0.23850651789067112
FPR: 0.6265794625378182
TPR: 0.95203426124197


### Naive Bayes

In [20]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-05).fit(X_train, y_train)

CPU times: user 418 ms, sys: 29.9 ms, total: 448 ms
Wall time: 56.5 ms


In [21]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9045584619725122
F1 Score: 0.0
FPR: 0.0014682327816337426
TPR: 0.0


### Random Forest

In [22]:
%%time
clf_RF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100).fit(X_train, y_train)

CPU times: user 7.03 s, sys: 87.6 ms, total: 7.12 s
Wall time: 6.73 s


In [23]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
FPR: 1.0
TPR: 1.0


### KNN

In [24]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform').fit(X_train, y_train)

CPU times: user 9.72 s, sys: 66.5 ms, total: 9.79 s
Wall time: 9.76 s


In [25]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.9058885171899561
f1: 0.0
fpr: 1.0
tpr: 1.0


### CatBoost

In [26]:
%%time
clf_CB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5906847	total: 77.3ms	remaining: 3.79s
1:	learn: 0.4824639	total: 95.4ms	remaining: 2.29s
2:	learn: 0.3842578	total: 115ms	remaining: 1.8s
3:	learn: 0.3331060	total: 132ms	remaining: 1.52s
4:	learn: 0.2930860	total: 150ms	remaining: 1.35s
5:	learn: 0.2281165	total: 168ms	remaining: 1.23s
6:	learn: 0.1752793	total: 188ms	remaining: 1.16s
7:	learn: 0.1362422	total: 208ms	remaining: 1.09s
8:	learn: 0.1050046	total: 226ms	remaining: 1.03s
9:	learn: 0.0805886	total: 246ms	remaining: 984ms
10:	learn: 0.0627954	total: 265ms	remaining: 939ms
11:	learn: 0.0500060	total: 283ms	remaining: 897ms
12:	learn: 0.0393626	total: 303ms	remaining: 862ms
13:	learn: 0.0314047	total: 322ms	remaining: 828ms
14:	learn: 0.0275367	total: 338ms	remaining: 790ms
15:	learn: 0.0223894	total: 356ms	remaining: 756ms
16:	learn: 0.0181111	total: 373ms	remaining: 724ms
17:	learn: 0.0147404	total: 390ms	remaining: 693ms
18:	learn: 0.0118858	total: 406ms	remaining: 663ms
19:	learn: 0.0098139	total: 423ms	remain

In [27]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.9058885171899561
F1 Score: 0.0
FPR: 1.0
TPR: 1.0


## Model Evaluation

In [28]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../Kyoto_Test.csv")
test_df.shape

(62028, 24)

In [29]:
test_df.head()

Unnamed: 0,Duration,Source,Destination_bytes,Count,Same_srv_rate,Serror_rate,Srv_serror_rate,Dst_host_count,Dst_host_srv_count,Dst_host_same_port_rate,...,Service_code,Flag_code,IDS_detection_code,Malware_detection_code,Ashula_detection_code,Source_IP_Address_code,Destination_IP_Address_code,Start_Time_code,Protocol_code,Label_code
0,0.0,0,0,0,0.0,0.0,1.0,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,153095.0,10807.0,8445.0,1.0,0.0
1,0.0,0,0,0,0.0,0.0,0.8,0,0,0.0,...,6.0,6.0,0.0,0.0,0.0,284736.0,19033.0,70490.0,1.0,0.0
2,0.000434,46,104,33,1.0,0.0,0.03,91,96,0.0,...,1.0,10.0,0.0,0.0,0.0,255086.0,5573.0,54893.0,2.0,0.0
3,0.000537,45,86,6,1.0,0.0,0.12,16,55,0.06,...,1.0,10.0,0.0,0.0,0.0,255086.0,5573.0,76339.0,2.0,0.0
4,0.965335,1423,2077,2,1.0,0.0,0.0,21,21,0.0,...,13.0,10.0,0.0,0.0,0.0,140163.0,15548.0,5276.0,1.0,0.0


In [30]:
# Create feature matrix X and target vextor y
y_eval = test_df['Label_code']
X_eval = test_df.drop(columns=['Label_code','Dst_host_same_port_rate'])

### Model Evaluation - Logistic Regression

In [31]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=1)
modelLR.fit(X_train, y_train)

LogisticRegression(C=1, n_jobs=-1, random_state=42)

In [32]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [33]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  0.9301620249083068
Testing accuracy is  0.42787473298133893


In [34]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.42787473298133893
F1 Score: 0.23850651789067112
Precision Score: 0.13633018520789894
Recall Score: 0.95203426124197
Confusion Matrix:
 [[ 8393 14083]
 [  112  2223]]


### Cross validation - Logistic Regression

In [35]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.90011 (+/- 0.00194)
F1 Score: 0.00096 (+/- 0.00409)
Precision: 0.01055 (+/- 0.04742)
Recall: 0.00050 (+/- 0.00214)


### Model Evaluation - Naive Bayes


In [36]:
modelNB = GaussianNB(var_smoothing=1e-05)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-05)

In [37]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [38]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  0.31863891016081575
Testing accuracy is  0.9045584619725122


In [39]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.9045584619725122
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22443    33]
 [ 2335     0]]


### Cross validation - Naive Bayes

In [40]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.51851 (+/- 0.28039)
F1 Score: 0.25979 (+/- 0.02808)
Precision: 0.21404 (+/- 0.38891)
Recall: 0.86306 (+/- 0.46302)


### Model Evaluation - Random Forest


In [41]:
modelRF = RandomForestClassifier(random_state=0,max_depth=70,n_estimators=100)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=70, random_state=0)

In [42]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [43]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [44]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - Random Forest

In [45]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99916 (+/- 0.00066)
F1 Score: 0.99564 (+/- 0.00343)
Precision: 0.99916 (+/- 0.00168)
Recall: 0.99215 (+/- 0.00617)


### Model Evaluation - KNN

In [46]:
modelKNN = KNeighborsClassifier(algorithm='auto',leaf_size=1,n_neighbors=2,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(leaf_size=1, n_neighbors=2)

In [47]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [48]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9948611502962396
Testing accuracy is  0.9058885171899561


In [49]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.0
Precision Score: 0.0
Recall Score: 0.0
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


  _warn_prf(average, modifier, msg_start, len(result))


### Cross validation - KNN


In [50]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.98446 (+/- 0.00246)
F1 Score: 0.91715 (+/- 0.01310)
Precision: 0.94433 (+/- 0.01808)
Recall: 0.89156 (+/- 0.01604)


### Model Evaluation - CatBoost

In [51]:
modelCB = CatBoostClassifier(random_state=0,depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5906847	total: 17.3ms	remaining: 849ms
1:	learn: 0.4824639	total: 34.9ms	remaining: 837ms
2:	learn: 0.3842578	total: 52.6ms	remaining: 824ms
3:	learn: 0.3331060	total: 69.6ms	remaining: 800ms
4:	learn: 0.2930860	total: 87.2ms	remaining: 785ms
5:	learn: 0.2281165	total: 104ms	remaining: 764ms
6:	learn: 0.1752793	total: 122ms	remaining: 747ms
7:	learn: 0.1362422	total: 139ms	remaining: 728ms
8:	learn: 0.1050046	total: 156ms	remaining: 711ms
9:	learn: 0.0805886	total: 173ms	remaining: 692ms
10:	learn: 0.0627954	total: 190ms	remaining: 675ms
11:	learn: 0.0500060	total: 207ms	remaining: 655ms
12:	learn: 0.0393626	total: 224ms	remaining: 638ms
13:	learn: 0.0314047	total: 241ms	remaining: 620ms
14:	learn: 0.0275367	total: 258ms	remaining: 601ms
15:	learn: 0.0223894	total: 275ms	remaining: 584ms
16:	learn: 0.0181111	total: 292ms	remaining: 567ms
17:	learn: 0.0147404	total: 309ms	remaining: 550ms
18:	learn: 0.0118858	total: 327ms	remaining: 533ms
19:	learn: 0.0098139	total: 343ms	re

<catboost.core.CatBoostClassifier at 0x7fad3025e850>

In [52]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [53]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.9058885171899561


In [54]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.9058885171899561
F1 Score: 0.8611563563923045
Precision Score: 0.8206340055766175
Recall Score: 0.9058885171899561
Confusion Matrix:
 [[22476     0]
 [ 2335     0]]


### Cross validation - CatBoost

In [55]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.5880521	total: 14.7ms	remaining: 721ms
1:	learn: 0.4819926	total: 29.6ms	remaining: 710ms
2:	learn: 0.3891170	total: 43.7ms	remaining: 685ms
3:	learn: 0.3340127	total: 57.6ms	remaining: 662ms
4:	learn: 0.2859091	total: 71.6ms	remaining: 645ms
5:	learn: 0.2226428	total: 86ms	remaining: 630ms
6:	learn: 0.1713590	total: 100ms	remaining: 616ms
7:	learn: 0.1394203	total: 114ms	remaining: 600ms
8:	learn: 0.1106009	total: 129ms	remaining: 587ms
9:	learn: 0.0869154	total: 142ms	remaining: 569ms
10:	learn: 0.0687930	total: 156ms	remaining: 555ms
11:	learn: 0.0546116	total: 171ms	remaining: 540ms
12:	learn: 0.0438219	total: 185ms	remaining: 526ms
13:	learn: 0.0354500	total: 199ms	remaining: 511ms
14:	learn: 0.0288938	total: 213ms	remaining: 497ms
15:	learn: 0.0239036	total: 227ms	remaining: 482ms
16:	learn: 0.0198538	total: 241ms	remaining: 467ms
17:	learn: 0.0164515	total: 255ms	remaining: 453ms
18:	learn: 0.0135764	total: 269ms	remaining: 438ms
19:	learn: 0.0121990	total: 283ms	rem

14:	learn: 0.0637744	total: 212ms	remaining: 494ms
15:	learn: 0.0528920	total: 225ms	remaining: 478ms
16:	learn: 0.0481380	total: 239ms	remaining: 463ms
17:	learn: 0.0387473	total: 253ms	remaining: 449ms
18:	learn: 0.0316943	total: 267ms	remaining: 435ms
19:	learn: 0.0281255	total: 281ms	remaining: 421ms
20:	learn: 0.0261296	total: 295ms	remaining: 407ms
21:	learn: 0.0231007	total: 309ms	remaining: 393ms
22:	learn: 0.0208212	total: 322ms	remaining: 378ms
23:	learn: 0.0168802	total: 336ms	remaining: 364ms
24:	learn: 0.0145547	total: 350ms	remaining: 350ms
25:	learn: 0.0130891	total: 364ms	remaining: 336ms
26:	learn: 0.0115054	total: 377ms	remaining: 322ms
27:	learn: 0.0095220	total: 391ms	remaining: 307ms
28:	learn: 0.0081587	total: 405ms	remaining: 293ms
29:	learn: 0.0076846	total: 419ms	remaining: 279ms
30:	learn: 0.0072576	total: 432ms	remaining: 265ms
31:	learn: 0.0069518	total: 446ms	remaining: 251ms
32:	learn: 0.0061355	total: 460ms	remaining: 237ms
33:	learn: 0.0059180	total: 474

25:	learn: 0.0085673	total: 368ms	remaining: 340ms
26:	learn: 0.0072780	total: 382ms	remaining: 325ms
27:	learn: 0.0062663	total: 396ms	remaining: 311ms
28:	learn: 0.0054223	total: 411ms	remaining: 298ms
29:	learn: 0.0046824	total: 425ms	remaining: 283ms
30:	learn: 0.0041512	total: 439ms	remaining: 269ms
31:	learn: 0.0036538	total: 453ms	remaining: 255ms
32:	learn: 0.0032373	total: 467ms	remaining: 241ms
33:	learn: 0.0027883	total: 482ms	remaining: 227ms
34:	learn: 0.0024645	total: 496ms	remaining: 212ms
35:	learn: 0.0022073	total: 510ms	remaining: 198ms
36:	learn: 0.0019552	total: 524ms	remaining: 184ms
37:	learn: 0.0017489	total: 538ms	remaining: 170ms
38:	learn: 0.0015613	total: 552ms	remaining: 156ms
39:	learn: 0.0014424	total: 567ms	remaining: 142ms
40:	learn: 0.0013426	total: 580ms	remaining: 127ms
41:	learn: 0.0012278	total: 595ms	remaining: 113ms
42:	learn: 0.0011949	total: 609ms	remaining: 99.1ms
43:	learn: 0.0010826	total: 623ms	remaining: 85ms
44:	learn: 0.0009858	total: 638

43:	learn: 0.0011689	total: 626ms	remaining: 85.3ms
44:	learn: 0.0011101	total: 639ms	remaining: 71ms
45:	learn: 0.0010852	total: 652ms	remaining: 56.7ms
46:	learn: 0.0010591	total: 667ms	remaining: 42.6ms
47:	learn: 0.0010302	total: 681ms	remaining: 28.4ms
48:	learn: 0.0009824	total: 695ms	remaining: 14.2ms
49:	learn: 0.0009619	total: 709ms	remaining: 0us
0:	learn: 0.5880521	total: 14.5ms	remaining: 708ms
1:	learn: 0.4819926	total: 28.7ms	remaining: 689ms
2:	learn: 0.3891170	total: 43.1ms	remaining: 675ms
3:	learn: 0.3340127	total: 57.5ms	remaining: 662ms
4:	learn: 0.2859091	total: 71.9ms	remaining: 647ms
5:	learn: 0.2226428	total: 85.7ms	remaining: 628ms
6:	learn: 0.1713590	total: 99.7ms	remaining: 612ms
7:	learn: 0.1394203	total: 115ms	remaining: 601ms
8:	learn: 0.1106009	total: 130ms	remaining: 590ms
9:	learn: 0.0869154	total: 144ms	remaining: 575ms
10:	learn: 0.0687930	total: 158ms	remaining: 559ms
11:	learn: 0.0546116	total: 172ms	remaining: 543ms
12:	learn: 0.0438219	total: 186m

7:	learn: 0.1847676	total: 112ms	remaining: 590ms
8:	learn: 0.1631486	total: 126ms	remaining: 575ms
9:	learn: 0.1294721	total: 140ms	remaining: 561ms
10:	learn: 0.1115443	total: 154ms	remaining: 546ms
11:	learn: 0.1006931	total: 168ms	remaining: 531ms
12:	learn: 0.0884195	total: 182ms	remaining: 517ms
13:	learn: 0.0778404	total: 196ms	remaining: 504ms
14:	learn: 0.0637744	total: 210ms	remaining: 490ms
15:	learn: 0.0528920	total: 224ms	remaining: 476ms
16:	learn: 0.0481380	total: 238ms	remaining: 463ms
17:	learn: 0.0387473	total: 252ms	remaining: 449ms
18:	learn: 0.0316943	total: 267ms	remaining: 435ms
19:	learn: 0.0281255	total: 281ms	remaining: 421ms
20:	learn: 0.0261296	total: 294ms	remaining: 406ms
21:	learn: 0.0231007	total: 309ms	remaining: 393ms
22:	learn: 0.0208212	total: 323ms	remaining: 379ms
23:	learn: 0.0168802	total: 337ms	remaining: 366ms
24:	learn: 0.0145547	total: 352ms	remaining: 352ms
25:	learn: 0.0130891	total: 366ms	remaining: 338ms
26:	learn: 0.0115054	total: 380ms	

24:	learn: 0.0103785	total: 351ms	remaining: 351ms
25:	learn: 0.0085673	total: 365ms	remaining: 337ms
26:	learn: 0.0072780	total: 379ms	remaining: 323ms
27:	learn: 0.0062663	total: 394ms	remaining: 309ms
28:	learn: 0.0054223	total: 408ms	remaining: 295ms
29:	learn: 0.0046824	total: 422ms	remaining: 282ms
30:	learn: 0.0041512	total: 437ms	remaining: 268ms
31:	learn: 0.0036538	total: 451ms	remaining: 253ms
32:	learn: 0.0032373	total: 465ms	remaining: 239ms
33:	learn: 0.0027883	total: 480ms	remaining: 226ms
34:	learn: 0.0024645	total: 495ms	remaining: 212ms
35:	learn: 0.0022073	total: 509ms	remaining: 198ms
36:	learn: 0.0019552	total: 522ms	remaining: 184ms
37:	learn: 0.0017489	total: 537ms	remaining: 170ms
38:	learn: 0.0015613	total: 551ms	remaining: 155ms
39:	learn: 0.0014424	total: 566ms	remaining: 141ms
40:	learn: 0.0013426	total: 579ms	remaining: 127ms
41:	learn: 0.0012278	total: 594ms	remaining: 113ms
42:	learn: 0.0011949	total: 608ms	remaining: 98.9ms
43:	learn: 0.0010826	total: 62

37:	learn: 0.0018089	total: 534ms	remaining: 169ms
38:	learn: 0.0016495	total: 548ms	remaining: 155ms
39:	learn: 0.0015040	total: 562ms	remaining: 141ms
40:	learn: 0.0013672	total: 577ms	remaining: 127ms
41:	learn: 0.0012647	total: 591ms	remaining: 112ms
42:	learn: 0.0012161	total: 605ms	remaining: 98.5ms
43:	learn: 0.0011689	total: 619ms	remaining: 84.4ms
44:	learn: 0.0011101	total: 633ms	remaining: 70.3ms
45:	learn: 0.0010852	total: 647ms	remaining: 56.2ms
46:	learn: 0.0010591	total: 661ms	remaining: 42.2ms
47:	learn: 0.0010302	total: 675ms	remaining: 28.1ms
48:	learn: 0.0009824	total: 689ms	remaining: 14.1ms
49:	learn: 0.0009619	total: 703ms	remaining: 0us
0:	learn: 0.5880521	total: 13.6ms	remaining: 667ms
1:	learn: 0.4819926	total: 27.4ms	remaining: 658ms
2:	learn: 0.3891170	total: 41.8ms	remaining: 655ms
3:	learn: 0.3340127	total: 55.6ms	remaining: 639ms
4:	learn: 0.2859091	total: 69.7ms	remaining: 628ms
5:	learn: 0.2226428	total: 84.2ms	remaining: 618ms
6:	learn: 0.1713590	total:

0:	learn: 0.5930124	total: 14.5ms	remaining: 711ms
1:	learn: 0.5142105	total: 28.7ms	remaining: 688ms
2:	learn: 0.4092631	total: 43.2ms	remaining: 677ms
3:	learn: 0.3592468	total: 57.2ms	remaining: 658ms
4:	learn: 0.3148553	total: 71.1ms	remaining: 640ms
5:	learn: 0.2487683	total: 85.3ms	remaining: 625ms
6:	learn: 0.2160689	total: 99.6ms	remaining: 612ms
7:	learn: 0.1847676	total: 114ms	remaining: 599ms
8:	learn: 0.1631486	total: 128ms	remaining: 584ms
9:	learn: 0.1294721	total: 142ms	remaining: 570ms
10:	learn: 0.1115443	total: 156ms	remaining: 554ms
11:	learn: 0.1006931	total: 170ms	remaining: 539ms
12:	learn: 0.0884195	total: 185ms	remaining: 526ms
13:	learn: 0.0778404	total: 199ms	remaining: 512ms
14:	learn: 0.0637744	total: 214ms	remaining: 499ms
15:	learn: 0.0528920	total: 228ms	remaining: 484ms
16:	learn: 0.0481380	total: 242ms	remaining: 470ms
17:	learn: 0.0387473	total: 257ms	remaining: 456ms
18:	learn: 0.0316943	total: 271ms	remaining: 442ms
19:	learn: 0.0281255	total: 285ms	

25:	learn: 0.0085673	total: 368ms	remaining: 339ms
26:	learn: 0.0072780	total: 381ms	remaining: 325ms
27:	learn: 0.0062663	total: 396ms	remaining: 311ms
28:	learn: 0.0054223	total: 410ms	remaining: 297ms
29:	learn: 0.0046824	total: 424ms	remaining: 283ms
30:	learn: 0.0041512	total: 438ms	remaining: 269ms
31:	learn: 0.0036538	total: 452ms	remaining: 254ms
32:	learn: 0.0032373	total: 467ms	remaining: 240ms
33:	learn: 0.0027883	total: 481ms	remaining: 226ms
34:	learn: 0.0024645	total: 495ms	remaining: 212ms
35:	learn: 0.0022073	total: 509ms	remaining: 198ms
36:	learn: 0.0019552	total: 523ms	remaining: 184ms
37:	learn: 0.0017489	total: 537ms	remaining: 170ms
38:	learn: 0.0015613	total: 551ms	remaining: 155ms
39:	learn: 0.0014424	total: 566ms	remaining: 141ms
40:	learn: 0.0013426	total: 579ms	remaining: 127ms
41:	learn: 0.0012278	total: 594ms	remaining: 113ms
42:	learn: 0.0011949	total: 608ms	remaining: 98.9ms
43:	learn: 0.0010826	total: 622ms	remaining: 84.8ms
44:	learn: 0.0009858	total: 6

49:	learn: 0.0009619	total: 709ms	remaining: 0us
0:	learn: 0.5880521	total: 14.3ms	remaining: 699ms
1:	learn: 0.4819926	total: 28.9ms	remaining: 694ms
2:	learn: 0.3891170	total: 42.7ms	remaining: 668ms
3:	learn: 0.3340127	total: 56.6ms	remaining: 651ms
4:	learn: 0.2859091	total: 70.7ms	remaining: 637ms
5:	learn: 0.2226428	total: 85ms	remaining: 624ms
6:	learn: 0.1713590	total: 99ms	remaining: 608ms
7:	learn: 0.1394203	total: 113ms	remaining: 595ms
8:	learn: 0.1106009	total: 127ms	remaining: 580ms
9:	learn: 0.0869154	total: 141ms	remaining: 566ms
10:	learn: 0.0687930	total: 156ms	remaining: 552ms
11:	learn: 0.0546116	total: 170ms	remaining: 539ms
12:	learn: 0.0438219	total: 184ms	remaining: 525ms
13:	learn: 0.0354500	total: 199ms	remaining: 511ms
14:	learn: 0.0288938	total: 213ms	remaining: 498ms
15:	learn: 0.0239036	total: 228ms	remaining: 484ms
16:	learn: 0.0198538	total: 242ms	remaining: 470ms
17:	learn: 0.0164515	total: 256ms	remaining: 456ms
18:	learn: 0.0135764	total: 271ms	remain

22:	learn: 0.0208212	total: 325ms	remaining: 382ms
23:	learn: 0.0168802	total: 338ms	remaining: 367ms
24:	learn: 0.0145547	total: 352ms	remaining: 352ms
25:	learn: 0.0130891	total: 367ms	remaining: 339ms
26:	learn: 0.0115054	total: 381ms	remaining: 324ms
27:	learn: 0.0095220	total: 395ms	remaining: 311ms
28:	learn: 0.0081587	total: 409ms	remaining: 296ms
29:	learn: 0.0076846	total: 424ms	remaining: 283ms
30:	learn: 0.0072576	total: 438ms	remaining: 268ms
31:	learn: 0.0069518	total: 451ms	remaining: 254ms
32:	learn: 0.0061355	total: 466ms	remaining: 240ms
33:	learn: 0.0059180	total: 480ms	remaining: 226ms
34:	learn: 0.0051701	total: 494ms	remaining: 212ms
35:	learn: 0.0047889	total: 508ms	remaining: 198ms
36:	learn: 0.0043963	total: 522ms	remaining: 183ms
37:	learn: 0.0038541	total: 536ms	remaining: 169ms
38:	learn: 0.0036519	total: 550ms	remaining: 155ms
39:	learn: 0.0034060	total: 564ms	remaining: 141ms
40:	learn: 0.0032115	total: 578ms	remaining: 127ms
41:	learn: 0.0028230	total: 593

36:	learn: 0.0019552	total: 523ms	remaining: 184ms
37:	learn: 0.0017489	total: 536ms	remaining: 169ms
38:	learn: 0.0015613	total: 550ms	remaining: 155ms
39:	learn: 0.0014424	total: 564ms	remaining: 141ms
40:	learn: 0.0013426	total: 578ms	remaining: 127ms
41:	learn: 0.0012278	total: 592ms	remaining: 113ms
42:	learn: 0.0011949	total: 607ms	remaining: 98.8ms
43:	learn: 0.0010826	total: 621ms	remaining: 84.7ms
44:	learn: 0.0009858	total: 635ms	remaining: 70.6ms
45:	learn: 0.0009595	total: 649ms	remaining: 56.4ms
46:	learn: 0.0008855	total: 663ms	remaining: 42.3ms
47:	learn: 0.0008187	total: 678ms	remaining: 28.2ms
48:	learn: 0.0008072	total: 692ms	remaining: 14.1ms
49:	learn: 0.0007801	total: 706ms	remaining: 0us
0:	learn: 0.5790751	total: 14.1ms	remaining: 690ms
1:	learn: 0.5022078	total: 28.2ms	remaining: 676ms
2:	learn: 0.4032736	total: 42.5ms	remaining: 666ms
3:	learn: 0.3540270	total: 56.6ms	remaining: 650ms
4:	learn: 0.3044267	total: 71.1ms	remaining: 640ms
5:	learn: 0.2642329	total:

In [56]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.99995 (+/- 0.00021)
F1 Score: 0.99975 (+/- 0.00107)
Precision: 0.99950 (+/- 0.00213)
Recall: 1.00000 (+/- 0.00000)
