## Constant features

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

## Read Data

In [2]:
data = pd.read_csv('../UNSW_Train.csv')
data.shape

(175341, 44)

In [3]:
data.head(5)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack,is_intrusion
0,0.121478,tcp,-,FIN,6,4,258,172,74.08749,252,...,1,1,0,0,0,1,1,0,0,0
1,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,0,0
2,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,0,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,0,0
4,0.449454,tcp,-,FIN,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,0,0


In [4]:
data.convert_dtypes().dtypes

dur                  float64
proto                 string
service               string
state                 string
spkts                  Int64
dpkts                  Int64
sbytes                 Int64
dbytes                 Int64
rate                 float64
sttl                   Int64
dttl                   Int64
sload                float64
dload                float64
sloss                  Int64
dloss                  Int64
sinpkt               float64
dinpkt               float64
sjit                 float64
djit                 float64
swin                   Int64
stcpb                  Int64
dtcpb                  Int64
dwin                   Int64
tcprtt               float64
synack               float64
ackdat               float64
smean                  Int64
dmean                  Int64
trans_depth            Int64
response_body_len      Int64
ct_srv_src             Int64
ct_state_ttl           Int64
ct_dst_ltm             Int64
ct_src_dport_ltm       Int64
ct_dst_sport_l

In [5]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
data["proto"] = ord_enc.fit_transform(data[["proto"]])

In [6]:
ord_enc = OrdinalEncoder()
data["service"] = ord_enc.fit_transform(data[["service"]])

In [7]:
ord_enc = OrdinalEncoder()
data["state"] = ord_enc.fit_transform(data[["state"]])

In [8]:
data.to_csv("UNSW_Train.csv", index=False)

### Train - Test Split

In [9]:
# separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(labels=['is_intrusion'], axis=1),       # drop the target
    data['is_intrusion'],                             # just the target
    test_size=0.2,
    random_state=0)
X_train.shape, X_test.shape

((140272, 43), (35069, 43))

### Using VarianceThreshold from Scikit-learn

The VarianceThreshold from sklearn provides a simple baseline approach to feature selection. It removes all features which variance doesn’t meet a certain threshold. By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [10]:
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train)  # fit finds the features with zero variance

VarianceThreshold(threshold=0.01)

In [11]:
# get_support is a boolean vector that indicates which features are retained
# if we sum over get_support, we get the number of features that are not constant
# (if necessary, print the result of sel.get_support() to understand its output)
sum(sel.get_support())

40

In [12]:
# now let's print the number of constant feautures
# (see how we use ~ to exclude non-constant features)
constant = X_train.columns[~sel.get_support()]
len(constant)

3

We can see that 0 columns / variables are constant. This means that 0 variables show the same value, just one value, for all the observations of the training set.

In [13]:
# let's print the constant variable names
constant

Index(['tcprtt', 'synack', 'ackdat'], dtype='object')

In [15]:
# let's visualise the values of one of the constant variables
# as an example
X_train['tcprtt'].unique()

array([0.000637, 0.      , 0.132797, ..., 0.085089, 0.220217, 0.178287])

In [16]:
# we can do the same for every feature:
for col in constant:
    print(col, X_train[col].unique())

tcprtt [0.000637 0.       0.132797 ... 0.085089 0.220217 0.178287]
synack [0.000512 0.       0.081136 ... 0.068998 0.100862 0.083344]
ackdat [0.000125 0.       0.051661 ... 0.119355 0.094943 0.062623]


We then use the transform() method of the VarianceThreshold to reduce the training and testing sets to its non-constant features.

Note that VarianceThreshold returns a NumPy array without feature names, so we need to capture the names first, and reconstitute the dataframe in a later step.

In [17]:
# capture non-constant feature names
feat_names = X_train.columns[sel.get_support()]

In [18]:
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)
X_train.shape, X_test.shape

((140272, 40), (35069, 40))

We have now 40 variables.

In [19]:
# X_ train is a NumPy array
X_train

array([[4.636200e-02, 1.130000e+02, 4.000000e+00, ..., 3.000000e+00,
        0.000000e+00, 0.000000e+00],
       [3.000000e-06, 1.190000e+02, 2.000000e+00, ..., 2.600000e+01,
        0.000000e+00, 1.000000e+00],
       [9.000000e-06, 1.190000e+02, 2.000000e+00, ..., 2.800000e+01,
        0.000000e+00, 1.000000e+00],
       ...,
       [1.042756e+00, 1.130000e+02, 5.000000e+00, ..., 1.000000e+00,
        0.000000e+00, 1.000000e+00],
       [9.000000e-06, 1.190000e+02, 2.000000e+00, ..., 2.200000e+01,
        0.000000e+00, 1.000000e+00],
       [1.036587e+00, 1.130000e+02, 0.000000e+00, ..., 2.000000e+00,
        0.000000e+00, 0.000000e+00]])

In [20]:
# reconstitute de dataframe
X_train = pd.DataFrame(X_train, columns=feat_names)
X_train.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack
0,0.046362,113.0,4.0,2.0,6.0,8.0,320.0,1882.0,280.402045,31.0,...,1.0,1.0,8.0,0.0,0.0,0.0,4.0,3.0,0.0,0.0
1,3e-06,119.0,2.0,3.0,2.0,0.0,114.0,0.0,333333.3215,254.0,...,16.0,9.0,26.0,0.0,0.0,0.0,16.0,26.0,0.0,1.0
2,9e-06,119.0,2.0,3.0,2.0,0.0,114.0,0.0,111111.1072,254.0,...,11.0,11.0,28.0,0.0,0.0,0.0,11.0,28.0,0.0,1.0
3,1.352354,97.0,0.0,0.0,4.0,2.0,824.0,432.0,3.697257,254.0,...,1.0,1.0,2.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
4,8e-06,119.0,2.0,3.0,2.0,0.0,114.0,0.0,125000.0003,254.0,...,13.0,13.0,15.0,0.0,0.0,0.0,13.0,14.0,0.0,1.0


In the dataset UNSW-NB15, 3 features was classified as constant were found, remaining the original 40 features of the dataset

## Standardize Data

In [21]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

## Hyperparameter Optimization

In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV

class EstimatorSelectionHelper:
    
    def __init__(self, models, params):
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
    
    def fit(self, X, y, **grid_kwargs):
        for key in self.keys:
            print('Running GridSearchCV for %s.' % key)
            model = self.models[key]
            params = self.params[key]
            grid_search = GridSearchCV(model, params, **grid_kwargs)
            grid_search.fit(X, y)
            self.grid_searches[key] = grid_search
        print('Done.')
    
    def score_summary(self, sort_by='mean_test_score'):
        frames = []
        for name, grid_search in self.grid_searches.items():
            frame = pd.DataFrame(grid_search.cv_results_)
            frame = frame.filter(regex='^(?!.*param_).*$')
            frame['estimator'] = len(frame)*[name]
            frames.append(frame)
        df = pd.concat(frames)
        
        df = df.sort_values([sort_by], ascending=False)
        df = df.reset_index()
        df = df.drop(['rank_test_score', 'index'], 1)
        
        columns = df.columns.tolist()
        columns.remove('estimator')
        columns = ['estimator']+columns
        df = df[columns]
        return df

In [23]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier


models = { 
    'LogisticRegression': linear_model.LogisticRegression(max_iter=1000),
    'GaussianNB': GaussianNB(),
    'RandomForest': RandomForestClassifier(random_state=123),
    'KNN': KNeighborsClassifier(n_jobs=-1),
    'CatBoost': CatBoostClassifier(),
}

params = { 
    'LogisticRegression': {'C':[0.1,0.5,1,2,3,4,5,10,20,25]},
    'GaussianNB': {'var_smoothing':[1e-9,1e-8,1e-7,1e-6,1e-5,1e-4]},
    'RandomForest': {'max_depth': [70,80,90,100],'n_estimators': [100,1000]},
    'KNN': {'n_neighbors':[2,3,4,5],'leaf_size':[1,2,3],'weights':['uniform', 'distance'],
          'algorithm':['auto','ball_tree','kd_tree','brute']},
    'CatBoost': {'depth':[4,5,6,7],'learning_rate':[0.01,0.02,0.03,0.04],'iterations':[10,20,30,40,50]}
}

In [24]:
%%time
helper = EstimatorSelectionHelper(models, params)
helper.fit(X_test, y_test, scoring='f1', n_jobs=2)

Running GridSearchCV for LogisticRegression.
Running GridSearchCV for GaussianNB.
Running GridSearchCV for RandomForest.




Running GridSearchCV for KNN.




Running GridSearchCV for CatBoost.
0:	learn: 0.6499364	total: 75.8ms	remaining: 682ms
1:	learn: 0.6102433	total: 86.2ms	remaining: 345ms
2:	learn: 0.5700668	total: 96.4ms	remaining: 225ms
3:	learn: 0.5331257	total: 106ms	remaining: 159ms
4:	learn: 0.4973447	total: 117ms	remaining: 117ms
5:	learn: 0.4660139	total: 127ms	remaining: 84.5ms
6:	learn: 0.4342010	total: 137ms	remaining: 58.5ms
7:	learn: 0.4067253	total: 146ms	remaining: 36.6ms
8:	learn: 0.3796417	total: 156ms	remaining: 17.4ms
9:	learn: 0.3545715	total: 166ms	remaining: 0us
Done.
CPU times: user 8.25 s, sys: 1.53 s, total: 9.77 s
Wall time: 15min 19s


In [25]:
helper.score_summary()

Unnamed: 0,estimator,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score
0,CatBoost,1.148308,0.009318,0.089364,0.001718,"{'depth': 7, 'iterations': 50, 'learning_rate'...",1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
1,CatBoost,1.013325,0.006585,0.091869,0.002645,"{'depth': 6, 'iterations': 50, 'learning_rate'...",1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
2,CatBoost,0.898149,0.008498,0.091780,0.003801,"{'depth': 4, 'iterations': 50, 'learning_rate'...",1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
3,CatBoost,0.908147,0.004786,0.088547,0.000883,"{'depth': 4, 'iterations': 50, 'learning_rate'...",1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
4,CatBoost,0.787631,0.007462,0.089686,0.002337,"{'depth': 4, 'iterations': 40, 'learning_rate'...",1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,GaussianNB,0.035154,0.002391,0.008440,0.000261,{'var_smoothing': 1e-09},0.842714,0.844236,0.855511,0.853320,0.856909,0.850538,0.005899
196,GaussianNB,0.027051,0.003963,0.007501,0.000114,{'var_smoothing': 1e-07},0.844121,0.839468,0.840871,0.842784,0.844665,0.842382,0.001958
197,GaussianNB,0.024435,0.001461,0.007653,0.000223,{'var_smoothing': 1e-06},0.823036,0.814587,0.812880,0.818574,0.819975,0.817810,0.003668
198,GaussianNB,0.024375,0.001846,0.007622,0.000103,{'var_smoothing': 1e-05},0.780126,0.775418,0.770941,0.776739,0.776251,0.775895,0.002950


In [26]:
df_gridsearchcv_summary = helper.score_summary()

In [27]:
df_gridsearchcv_summary.to_csv("gridsearchcv_summaryBase.csv", index=False)

## Classifiers

In [28]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

## Metrics Evaluation

In [29]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, f1_score
from sklearn import metrics
from sklearn.model_selection import cross_val_score

### Logistic Regression

In [30]:
%%time
clf_LR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25).fit(X_train, y_train)

CPU times: user 94.1 ms, sys: 198 ms, total: 292 ms
Wall time: 3.46 s


In [31]:
pred_y_test = clf_LR.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_test))

f1 = f1_score(y_test, pred_y_test)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_test)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.33471156862185975
F1 Score: 0.04479017400204708
FPR: 0.0048906277787657835
TPR: 0.02296100407169542


### Naive Bayes

In [32]:
%%time
clf_NB = GaussianNB(var_smoothing=1e-08).fit(X_train, y_train)

CPU times: user 94.4 ms, sys: 19.2 ms, total: 114 ms
Wall time: 112 ms


In [33]:
pred_y_testNB = clf_NB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testNB))

f1 = f1_score(y_test, pred_y_testNB)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testNB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.7423365365422453
F1 Score: 0.7751791401273885
FPR: 0.07033611950915881
TPR: 0.653905889266675


### Random Forest

In [34]:
%%time
clf_RF = RandomForestClassifier(random_state=42,max_depth=100,n_estimators=1000).fit(X_train, y_train)

CPU times: user 1min 19s, sys: 564 ms, total: 1min 19s
Wall time: 1min 19s


In [35]:
pred_y_testRF = clf_RF.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testRF))

f1 = f1_score(y_test, pred_y_testRF, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testRF)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


### KNN

In [36]:
%%time
clf_KNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform').fit(X_train, y_train)

CPU times: user 22.4 s, sys: 167 ms, total: 22.5 s
Wall time: 22.4 s


In [37]:
pred_y_testKNN = clf_KNN.predict(X_test)
print('accuracy_score:', accuracy_score(y_test, pred_y_testKNN))

f1 = f1_score(y_test, pred_y_testKNN)
print('f1:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testKNN)
print('fpr:', fpr[1])
print('tpr:', tpr[1])

accuracy_score: 0.6842510479340729
f1: 0.7391703766518267
fpr: 0.26142628490129827
tpr: 0.6586072283087773


### CatBoost

In [38]:
%%time
clf_CB = CatBoostClassifier(depth=7,iterations=50,learning_rate=0.04).fit(X_train, y_train)

0:	learn: 0.5279791	total: 27.5ms	remaining: 1.35s
1:	learn: 0.3965446	total: 49.2ms	remaining: 1.18s
2:	learn: 0.2948158	total: 70.5ms	remaining: 1.1s
3:	learn: 0.2153082	total: 86.3ms	remaining: 992ms
4:	learn: 0.1586818	total: 106ms	remaining: 955ms
5:	learn: 0.1173974	total: 127ms	remaining: 929ms
6:	learn: 0.0892843	total: 150ms	remaining: 921ms
7:	learn: 0.0664143	total: 168ms	remaining: 881ms
8:	learn: 0.0510172	total: 189ms	remaining: 861ms
9:	learn: 0.0388743	total: 209ms	remaining: 836ms
10:	learn: 0.0301029	total: 230ms	remaining: 817ms
11:	learn: 0.0236982	total: 251ms	remaining: 793ms
12:	learn: 0.0187611	total: 271ms	remaining: 771ms
13:	learn: 0.0150649	total: 292ms	remaining: 750ms
14:	learn: 0.0120568	total: 312ms	remaining: 728ms
15:	learn: 0.0099088	total: 332ms	remaining: 705ms
16:	learn: 0.0080813	total: 353ms	remaining: 686ms
17:	learn: 0.0065832	total: 374ms	remaining: 664ms
18:	learn: 0.0054955	total: 395ms	remaining: 644ms
19:	learn: 0.0046404	total: 415ms	rema

In [39]:
pred_y_testCB = clf_CB.predict(X_test)
print('Accuracy:', accuracy_score(y_test, pred_y_testCB))

f1 = f1_score(y_test, pred_y_testCB, average='weighted', zero_division=0)
print('F1 Score:', f1)

fpr, tpr, thresholds = roc_curve(y_test, pred_y_testCB)
print('FPR:', fpr[1])
print('TPR:', tpr[1])

Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
FPR: 1.0
TPR: 1.0


## Model Evaluation

In [40]:
import pandas as pd, numpy as np
test_df = pd.read_csv("../UNSW_Test.csv")
test_df.shape

(82332, 44)

In [41]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
test_df["proto"] = ord_enc.fit_transform(test_df[["proto"]])

In [42]:
ord_enc = OrdinalEncoder()
test_df["service"] = ord_enc.fit_transform(test_df[["service"]])

In [43]:
ord_enc = OrdinalEncoder()
test_df["state"] = ord_enc.fit_transform(test_df[["state"]])

In [44]:
data.to_csv("UNSW_Test.csv", index=False)

In [45]:
# Create feature matrix X and target vextor y
y_eval = test_df['is_intrusion']
X_eval = test_df.drop(columns=['is_intrusion','tcprtt', 'synack', 'ackdat'])

### Model Evaluation - Logistic Regression

In [51]:
modelLR = linear_model.LogisticRegression(n_jobs=-1, random_state=42, C=25)
modelLR.fit(X_train, y_train)

LogisticRegression(C=25, n_jobs=-1, random_state=42)

In [52]:
# Predict on the new unseen test data
y_evalpredLR = modelLR.predict(X_eval)
y_predLR = modelLR.predict(X_test)

In [53]:
train_scoreLR = modelLR.score(X_train, y_train)
test_scoreLR = modelLR.score(X_test, y_test)
print("Training accuracy is ", train_scoreLR)
print("Testing accuracy is ", test_scoreLR)

Training accuracy is  1.0
Testing accuracy is  0.33471156862185975


In [54]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreLR)
print('F1 Score:',f1_score(y_test, y_predLR))
print('Precision Score:',precision_score(y_test, y_predLR))
print('Recall Score:', recall_score(y_test, y_predLR))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predLR))

Performance measures for test:
--------
Accuracy: 0.33471156862185975
F1 Score: 0.04479017400204708
Precision Score: 0.9086378737541528
Recall Score: 0.02296100407169542
Confusion Matrix:
 [[11191    55]
 [23276   547]]


### Cross validation - Logistic Regression

In [55]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelLR, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.75240 (+/- 0.28455)
F1 Score: 0.73232 (+/- 0.33527)
Precision: 0.85095 (+/- 0.28688)
Recall: 0.66190 (+/- 0.41926)


### Model Evaluation - Naive Bayes

In [56]:
modelNB = GaussianNB(var_smoothing=1e-08)
modelNB.fit(X_train, y_train)

GaussianNB(var_smoothing=1e-08)

In [57]:
# Predict on the new unseen test data
y_evalpredNB = modelNB.predict(X_eval)
y_predNB = modelNB.predict(X_test)

In [58]:
train_scoreNB = modelNB.score(X_train, y_train)
test_scoreNB = modelNB.score(X_test, y_test)
print("Training accuracy is ", train_scoreNB)
print("Testing accuracy is ", test_scoreNB)

Training accuracy is  1.0
Testing accuracy is  0.7423365365422453


In [59]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreNB)
print('F1 Score:',f1_score(y_test, y_predNB))
print('Precision Score:',precision_score(y_test, y_predNB))
print('Recall Score:', recall_score(y_test, y_predNB))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predNB))

Performance measures for test:
--------
Accuracy: 0.7423365365422453
F1 Score: 0.7751791401273885
Precision Score: 0.9516769503329464
Recall Score: 0.653905889266675
Confusion Matrix:
 [[10455   791]
 [ 8245 15578]]


### Cross validation - Naive Bayes

In [60]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelNB, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.68758 (+/- 0.27402)
F1 Score: 0.76521 (+/- 0.18230)
Precision: 0.67567 (+/- 0.23963)
Recall: 0.89546 (+/- 0.15119)


### Model Evaluation - Random Forest

In [61]:
modelRF = RandomForestClassifier(random_state=0,max_depth=100,n_estimators=1000)
modelRF.fit(X_train, y_train)

RandomForestClassifier(max_depth=100, n_estimators=1000, random_state=0)

In [62]:
# Predict on the new unseen test data
y_evalpredRF = modelRF.predict(X_eval)
y_predRF = modelRF.predict(X_test)

In [63]:
train_scoreRF = modelRF.score(X_train, y_train)
test_scoreRF = modelRF.score(X_test, y_test)
print("Training accuracy is ", train_scoreRF)
print("Testing accuracy is ", test_scoreRF)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [64]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [65]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreRF)
print('F1 Score:', f1_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Precision Score:', precision_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predRF, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predRF))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - Random Forest

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelRF, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)


### Model Evaluation - KNN

In [67]:
modelKNN = KNeighborsClassifier(algorithm='ball_tree',leaf_size=1,n_neighbors=5,weights='uniform')
modelKNN.fit(X_train, y_train)

KNeighborsClassifier(algorithm='ball_tree', leaf_size=1)

In [68]:
# Predict on the new unseen test data
y_evalpredKNN = modelKNN.predict(X_eval)
y_predKNN = modelKNN.predict(X_test)

In [69]:
train_scoreKNN = modelKNN.score(X_train, y_train)
test_scoreKNN = modelKNN.score(X_test, y_test)
print("Training accuracy is ", train_scoreKNN)
print("Testing accuracy is ", test_scoreKNN)

Training accuracy is  0.9995080985513859
Testing accuracy is  0.6842510479340729


In [70]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreKNN)
print('F1 Score:', f1_score(y_test, y_predKNN))
print('Precision Score:', precision_score(y_test, y_predKNN))
print('Recall Score:', recall_score(y_test, y_predKNN))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predKNN))

Performance measures for test:
--------
Accuracy: 0.6842510479340729
F1 Score: 0.7391703766518267
Precision Score: 0.8421900161030595
Recall Score: 0.6586072283087773
Confusion Matrix:
 [[ 8306  2940]
 [ 8133 15690]]


### Cross validation - KNN

In [71]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='accuracy')
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))

f = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='f1')
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))

precision = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='precision')
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))

recall = cross_val_score(modelKNN, X_eval, y_eval, cv=10, scoring='recall')
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 0.78820 (+/- 0.21797)
F1 Score: 0.80561 (+/- 0.19879)
Precision: 0.82604 (+/- 0.25233)
Recall: 0.80008 (+/- 0.25124)


### Model Evaluation - CatBoost

In [72]:
modelCB = CatBoostClassifier(depth=7,iterations=50,learning_rate=0.04)
modelCB.fit(X_train, y_train)

0:	learn: 0.5279791	total: 21.1ms	remaining: 1.03s
1:	learn: 0.3965446	total: 42.9ms	remaining: 1.03s
2:	learn: 0.2948158	total: 63.5ms	remaining: 994ms
3:	learn: 0.2153082	total: 79.4ms	remaining: 914ms
4:	learn: 0.1586818	total: 99.6ms	remaining: 896ms
5:	learn: 0.1173974	total: 120ms	remaining: 879ms
6:	learn: 0.0892843	total: 140ms	remaining: 857ms
7:	learn: 0.0664143	total: 156ms	remaining: 820ms
8:	learn: 0.0510172	total: 178ms	remaining: 810ms
9:	learn: 0.0388743	total: 198ms	remaining: 793ms
10:	learn: 0.0301029	total: 220ms	remaining: 779ms
11:	learn: 0.0236982	total: 239ms	remaining: 757ms
12:	learn: 0.0187611	total: 259ms	remaining: 736ms
13:	learn: 0.0150649	total: 279ms	remaining: 718ms
14:	learn: 0.0120568	total: 300ms	remaining: 700ms
15:	learn: 0.0099088	total: 320ms	remaining: 679ms
16:	learn: 0.0080813	total: 341ms	remaining: 662ms
17:	learn: 0.0065832	total: 361ms	remaining: 642ms
18:	learn: 0.0054955	total: 382ms	remaining: 623ms
19:	learn: 0.0046404	total: 402ms	re

<catboost.core.CatBoostClassifier at 0x7fbe615bbf40>

In [73]:
# Predict on the new unseen test data
y_evalpredCB = modelCB.predict(X_eval)
y_predCB = modelCB.predict(X_test)

In [74]:
train_scoreCB = modelCB.score(X_train, y_train)
test_scoreCB = modelCB.score(X_test, y_test)
print("Training accuracy is ", train_scoreCB)
print("Testing accuracy is ", test_scoreCB)

Training accuracy is  1.0
Testing accuracy is  0.6793179161082438


In [75]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
print('Performance measures for test:')
print('--------')
print('Accuracy:', test_scoreCB)
print('F1 Score:',f1_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Precision Score:',precision_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Recall Score:', recall_score(y_test, y_predCB, average='weighted', zero_division=0))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_predCB))

Performance measures for test:
--------
Accuracy: 0.6793179161082438
F1 Score: 0.5495955550990522
Precision Score: 0.4614728311456469
Recall Score: 0.6793179161082438
Confusion Matrix:
 [[    0 11246]
 [    0 23823]]


### Cross validation - CatBoost

In [76]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

accuracy = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='accuracy')
f = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='f1')
precision = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='precision')
recall = cross_val_score(modelCB, X_eval, y_eval, cv=10, scoring='recall')

0:	learn: 0.5255772	total: 17.7ms	remaining: 867ms
1:	learn: 0.3928080	total: 35.9ms	remaining: 862ms
2:	learn: 0.2989532	total: 54.5ms	remaining: 853ms
3:	learn: 0.2243519	total: 72.8ms	remaining: 838ms
4:	learn: 0.1686732	total: 90.8ms	remaining: 818ms
5:	learn: 0.1277835	total: 107ms	remaining: 788ms
6:	learn: 0.0974820	total: 124ms	remaining: 764ms
7:	learn: 0.0754845	total: 141ms	remaining: 738ms
8:	learn: 0.0586912	total: 156ms	remaining: 712ms
9:	learn: 0.0455809	total: 172ms	remaining: 688ms
10:	learn: 0.0362958	total: 188ms	remaining: 666ms
11:	learn: 0.0287279	total: 203ms	remaining: 644ms
12:	learn: 0.0234045	total: 219ms	remaining: 624ms
13:	learn: 0.0190155	total: 236ms	remaining: 606ms
14:	learn: 0.0155282	total: 253ms	remaining: 589ms
15:	learn: 0.0127708	total: 269ms	remaining: 572ms
16:	learn: 0.0105225	total: 286ms	remaining: 555ms
17:	learn: 0.0087707	total: 302ms	remaining: 536ms
18:	learn: 0.0074017	total: 318ms	remaining: 519ms
19:	learn: 0.0063035	total: 334ms	re

13:	learn: 0.0179441	total: 218ms	remaining: 561ms
14:	learn: 0.0146350	total: 234ms	remaining: 546ms
15:	learn: 0.0119941	total: 250ms	remaining: 532ms
16:	learn: 0.0099169	total: 267ms	remaining: 519ms
17:	learn: 0.0082803	total: 284ms	remaining: 505ms
18:	learn: 0.0069249	total: 300ms	remaining: 490ms
19:	learn: 0.0058323	total: 317ms	remaining: 475ms
20:	learn: 0.0049736	total: 334ms	remaining: 461ms
21:	learn: 0.0043082	total: 350ms	remaining: 445ms
22:	learn: 0.0037470	total: 366ms	remaining: 430ms
23:	learn: 0.0032601	total: 384ms	remaining: 416ms
24:	learn: 0.0028843	total: 400ms	remaining: 400ms
25:	learn: 0.0025566	total: 415ms	remaining: 384ms
26:	learn: 0.0022740	total: 432ms	remaining: 368ms
27:	learn: 0.0020364	total: 448ms	remaining: 352ms
28:	learn: 0.0018288	total: 463ms	remaining: 335ms
29:	learn: 0.0016370	total: 479ms	remaining: 319ms
30:	learn: 0.0014951	total: 494ms	remaining: 303ms
31:	learn: 0.0013584	total: 511ms	remaining: 287ms
32:	learn: 0.0012310	total: 527

25:	learn: 0.0027346	total: 425ms	remaining: 392ms
26:	learn: 0.0023777	total: 442ms	remaining: 376ms
27:	learn: 0.0021214	total: 458ms	remaining: 360ms
28:	learn: 0.0019156	total: 473ms	remaining: 343ms
29:	learn: 0.0017298	total: 489ms	remaining: 326ms
30:	learn: 0.0015719	total: 504ms	remaining: 309ms
31:	learn: 0.0014379	total: 520ms	remaining: 292ms
32:	learn: 0.0013169	total: 536ms	remaining: 276ms
33:	learn: 0.0012157	total: 551ms	remaining: 259ms
34:	learn: 0.0011238	total: 566ms	remaining: 243ms
35:	learn: 0.0010403	total: 582ms	remaining: 226ms
36:	learn: 0.0009712	total: 597ms	remaining: 210ms
37:	learn: 0.0009104	total: 613ms	remaining: 193ms
38:	learn: 0.0008540	total: 628ms	remaining: 177ms
39:	learn: 0.0008034	total: 643ms	remaining: 161ms
40:	learn: 0.0007590	total: 657ms	remaining: 144ms
41:	learn: 0.0007174	total: 672ms	remaining: 128ms
42:	learn: 0.0006785	total: 688ms	remaining: 112ms
43:	learn: 0.0006426	total: 702ms	remaining: 95.8ms
44:	learn: 0.0006055	total: 71

37:	learn: 0.0009176	total: 624ms	remaining: 197ms
38:	learn: 0.0008653	total: 640ms	remaining: 180ms
39:	learn: 0.0008092	total: 655ms	remaining: 164ms
40:	learn: 0.0007663	total: 670ms	remaining: 147ms
41:	learn: 0.0007268	total: 686ms	remaining: 131ms
42:	learn: 0.0006946	total: 700ms	remaining: 114ms
43:	learn: 0.0006608	total: 715ms	remaining: 97.4ms
44:	learn: 0.0006227	total: 731ms	remaining: 81.2ms
45:	learn: 0.0005971	total: 746ms	remaining: 64.9ms
46:	learn: 0.0005639	total: 761ms	remaining: 48.6ms
47:	learn: 0.0005429	total: 777ms	remaining: 32.4ms
48:	learn: 0.0005234	total: 792ms	remaining: 16.2ms
49:	learn: 0.0005048	total: 806ms	remaining: 0us
0:	learn: 0.5255772	total: 15.9ms	remaining: 780ms
1:	learn: 0.3928080	total: 32.1ms	remaining: 769ms
2:	learn: 0.2989532	total: 48.6ms	remaining: 761ms
3:	learn: 0.2243519	total: 65.6ms	remaining: 755ms
4:	learn: 0.1686732	total: 82.1ms	remaining: 739ms
5:	learn: 0.1277835	total: 99ms	remaining: 726ms
6:	learn: 0.0974820	total: 11

0:	learn: 0.5252396	total: 17.6ms	remaining: 864ms
1:	learn: 0.3928684	total: 34.1ms	remaining: 818ms
2:	learn: 0.2987886	total: 50.4ms	remaining: 789ms
3:	learn: 0.2243946	total: 67.2ms	remaining: 773ms
4:	learn: 0.1689016	total: 83.9ms	remaining: 755ms
5:	learn: 0.1277158	total: 100ms	remaining: 736ms
6:	learn: 0.0985082	total: 117ms	remaining: 719ms
7:	learn: 0.0721107	total: 129ms	remaining: 678ms
8:	learn: 0.0564579	total: 146ms	remaining: 664ms
9:	learn: 0.0443528	total: 162ms	remaining: 648ms
10:	learn: 0.0346674	total: 179ms	remaining: 633ms
11:	learn: 0.0278850	total: 195ms	remaining: 618ms
12:	learn: 0.0223394	total: 212ms	remaining: 603ms
13:	learn: 0.0179441	total: 229ms	remaining: 588ms
14:	learn: 0.0146350	total: 245ms	remaining: 571ms
15:	learn: 0.0119941	total: 261ms	remaining: 555ms
16:	learn: 0.0099169	total: 278ms	remaining: 540ms
17:	learn: 0.0082803	total: 295ms	remaining: 525ms
18:	learn: 0.0069249	total: 311ms	remaining: 508ms
19:	learn: 0.0058323	total: 328ms	re

12:	learn: 0.0235318	total: 214ms	remaining: 609ms
13:	learn: 0.0190047	total: 230ms	remaining: 591ms
14:	learn: 0.0154928	total: 246ms	remaining: 575ms
15:	learn: 0.0127031	total: 263ms	remaining: 559ms
16:	learn: 0.0104854	total: 280ms	remaining: 543ms
17:	learn: 0.0088878	total: 297ms	remaining: 528ms
18:	learn: 0.0074114	total: 314ms	remaining: 513ms
19:	learn: 0.0063024	total: 331ms	remaining: 497ms
20:	learn: 0.0053620	total: 347ms	remaining: 479ms
21:	learn: 0.0046257	total: 364ms	remaining: 463ms
22:	learn: 0.0040160	total: 380ms	remaining: 446ms
23:	learn: 0.0035055	total: 396ms	remaining: 429ms
24:	learn: 0.0030901	total: 413ms	remaining: 413ms
25:	learn: 0.0027346	total: 429ms	remaining: 396ms
26:	learn: 0.0023777	total: 446ms	remaining: 380ms
27:	learn: 0.0021214	total: 463ms	remaining: 364ms
28:	learn: 0.0019156	total: 479ms	remaining: 347ms
29:	learn: 0.0017298	total: 495ms	remaining: 330ms
30:	learn: 0.0015719	total: 510ms	remaining: 313ms
31:	learn: 0.0014379	total: 526

25:	learn: 0.0026854	total: 431ms	remaining: 398ms
26:	learn: 0.0023961	total: 447ms	remaining: 381ms
27:	learn: 0.0021498	total: 463ms	remaining: 364ms
28:	learn: 0.0019178	total: 480ms	remaining: 347ms
29:	learn: 0.0017131	total: 497ms	remaining: 331ms
30:	learn: 0.0015674	total: 512ms	remaining: 314ms
31:	learn: 0.0014391	total: 528ms	remaining: 297ms
32:	learn: 0.0013240	total: 544ms	remaining: 280ms
33:	learn: 0.0012232	total: 559ms	remaining: 263ms
34:	learn: 0.0011340	total: 576ms	remaining: 247ms
35:	learn: 0.0010410	total: 592ms	remaining: 230ms
36:	learn: 0.0009741	total: 607ms	remaining: 213ms
37:	learn: 0.0009176	total: 623ms	remaining: 197ms
38:	learn: 0.0008653	total: 638ms	remaining: 180ms
39:	learn: 0.0008092	total: 654ms	remaining: 163ms
40:	learn: 0.0007663	total: 669ms	remaining: 147ms
41:	learn: 0.0007268	total: 685ms	remaining: 131ms
42:	learn: 0.0006946	total: 699ms	remaining: 114ms
43:	learn: 0.0006608	total: 714ms	remaining: 97.4ms
44:	learn: 0.0006227	total: 73

39:	learn: 0.0007696	total: 642ms	remaining: 160ms
40:	learn: 0.0007305	total: 657ms	remaining: 144ms
41:	learn: 0.0006940	total: 670ms	remaining: 128ms
42:	learn: 0.0006617	total: 683ms	remaining: 111ms
43:	learn: 0.0006337	total: 696ms	remaining: 95ms
44:	learn: 0.0006012	total: 711ms	remaining: 79ms
45:	learn: 0.0005757	total: 726ms	remaining: 63.1ms
46:	learn: 0.0005476	total: 740ms	remaining: 47.2ms
47:	learn: 0.0005202	total: 755ms	remaining: 31.5ms
48:	learn: 0.0004985	total: 769ms	remaining: 15.7ms
49:	learn: 0.0004735	total: 785ms	remaining: 0us
0:	learn: 0.5252396	total: 16.3ms	remaining: 797ms
1:	learn: 0.3928684	total: 32.3ms	remaining: 775ms
2:	learn: 0.2987886	total: 48.8ms	remaining: 764ms
3:	learn: 0.2243946	total: 65.4ms	remaining: 752ms
4:	learn: 0.1689016	total: 82.3ms	remaining: 741ms
5:	learn: 0.1277158	total: 98.8ms	remaining: 724ms
6:	learn: 0.0985082	total: 115ms	remaining: 708ms
7:	learn: 0.0721107	total: 127ms	remaining: 667ms
8:	learn: 0.0564579	total: 144ms	

0:	learn: 0.5259123	total: 16ms	remaining: 784ms
1:	learn: 0.3933175	total: 32.5ms	remaining: 779ms
2:	learn: 0.2993605	total: 49.4ms	remaining: 774ms
3:	learn: 0.2239881	total: 66ms	remaining: 759ms
4:	learn: 0.1685183	total: 82.5ms	remaining: 743ms
5:	learn: 0.1275872	total: 98.9ms	remaining: 726ms
6:	learn: 0.0973737	total: 116ms	remaining: 710ms
7:	learn: 0.0760479	total: 133ms	remaining: 697ms
8:	learn: 0.0591262	total: 149ms	remaining: 679ms
9:	learn: 0.0459219	total: 166ms	remaining: 663ms
10:	learn: 0.0366679	total: 182ms	remaining: 647ms
11:	learn: 0.0289353	total: 199ms	remaining: 631ms
12:	learn: 0.0235318	total: 216ms	remaining: 615ms
13:	learn: 0.0190047	total: 233ms	remaining: 598ms
14:	learn: 0.0154928	total: 249ms	remaining: 582ms
15:	learn: 0.0127031	total: 266ms	remaining: 566ms
16:	learn: 0.0104854	total: 283ms	remaining: 549ms
17:	learn: 0.0088878	total: 300ms	remaining: 534ms
18:	learn: 0.0074114	total: 317ms	remaining: 518ms
19:	learn: 0.0063024	total: 334ms	remai

12:	learn: 0.0234465	total: 216ms	remaining: 615ms
13:	learn: 0.0190069	total: 233ms	remaining: 598ms
14:	learn: 0.0155068	total: 249ms	remaining: 581ms
15:	learn: 0.0127161	total: 266ms	remaining: 565ms
16:	learn: 0.0105022	total: 282ms	remaining: 548ms
17:	learn: 0.0088199	total: 300ms	remaining: 533ms
18:	learn: 0.0074319	total: 316ms	remaining: 516ms
19:	learn: 0.0062515	total: 333ms	remaining: 499ms
20:	learn: 0.0053715	total: 349ms	remaining: 482ms
21:	learn: 0.0046257	total: 366ms	remaining: 465ms
22:	learn: 0.0040113	total: 383ms	remaining: 449ms
23:	learn: 0.0034711	total: 400ms	remaining: 433ms
24:	learn: 0.0030210	total: 417ms	remaining: 417ms
25:	learn: 0.0026854	total: 432ms	remaining: 399ms
26:	learn: 0.0023961	total: 448ms	remaining: 382ms
27:	learn: 0.0021498	total: 465ms	remaining: 366ms
28:	learn: 0.0019178	total: 483ms	remaining: 349ms
29:	learn: 0.0017131	total: 499ms	remaining: 333ms
30:	learn: 0.0015674	total: 515ms	remaining: 315ms
31:	learn: 0.0014391	total: 531

24:	learn: 0.0030328	total: 415ms	remaining: 415ms
25:	learn: 0.0026819	total: 431ms	remaining: 398ms
26:	learn: 0.0023808	total: 447ms	remaining: 381ms
27:	learn: 0.0021072	total: 463ms	remaining: 364ms
28:	learn: 0.0019054	total: 479ms	remaining: 347ms
29:	learn: 0.0017350	total: 494ms	remaining: 329ms
30:	learn: 0.0015751	total: 510ms	remaining: 312ms
31:	learn: 0.0014186	total: 526ms	remaining: 296ms
32:	learn: 0.0013009	total: 542ms	remaining: 279ms
33:	learn: 0.0011968	total: 558ms	remaining: 262ms
34:	learn: 0.0010919	total: 574ms	remaining: 246ms
35:	learn: 0.0010191	total: 589ms	remaining: 229ms
36:	learn: 0.0009313	total: 606ms	remaining: 213ms
37:	learn: 0.0008740	total: 621ms	remaining: 196ms
38:	learn: 0.0008167	total: 636ms	remaining: 179ms
39:	learn: 0.0007696	total: 653ms	remaining: 163ms
40:	learn: 0.0007305	total: 669ms	remaining: 147ms
41:	learn: 0.0006940	total: 683ms	remaining: 130ms
42:	learn: 0.0006617	total: 698ms	remaining: 114ms
43:	learn: 0.0006337	total: 715

38:	learn: 0.0008422	total: 635ms	remaining: 179ms
39:	learn: 0.0007920	total: 650ms	remaining: 163ms
40:	learn: 0.0007486	total: 666ms	remaining: 146ms
41:	learn: 0.0007065	total: 681ms	remaining: 130ms
42:	learn: 0.0006634	total: 697ms	remaining: 113ms
43:	learn: 0.0006344	total: 711ms	remaining: 97ms
44:	learn: 0.0006067	total: 726ms	remaining: 80.7ms
45:	learn: 0.0005805	total: 741ms	remaining: 64.5ms
46:	learn: 0.0005567	total: 756ms	remaining: 48.3ms
47:	learn: 0.0005342	total: 771ms	remaining: 32.1ms
48:	learn: 0.0005129	total: 786ms	remaining: 16ms
49:	learn: 0.0004930	total: 801ms	remaining: 0us
0:	learn: 0.5259123	total: 16.3ms	remaining: 799ms
1:	learn: 0.3933175	total: 32.5ms	remaining: 779ms
2:	learn: 0.2993605	total: 49.5ms	remaining: 776ms
3:	learn: 0.2239881	total: 66.1ms	remaining: 760ms
4:	learn: 0.1685183	total: 82ms	remaining: 738ms
5:	learn: 0.1275872	total: 97.9ms	remaining: 718ms
6:	learn: 0.0973737	total: 115ms	remaining: 704ms
7:	learn: 0.0760479	total: 131ms	r

0:	learn: 0.5259214	total: 15.3ms	remaining: 751ms
1:	learn: 0.3937459	total: 31ms	remaining: 744ms
2:	learn: 0.2996621	total: 47.3ms	remaining: 741ms
3:	learn: 0.2247911	total: 63.9ms	remaining: 735ms
4:	learn: 0.1691125	total: 80.6ms	remaining: 726ms
5:	learn: 0.1280275	total: 97.1ms	remaining: 712ms
6:	learn: 0.0977116	total: 114ms	remaining: 698ms
7:	learn: 0.0762238	total: 131ms	remaining: 686ms
8:	learn: 0.0591893	total: 147ms	remaining: 669ms
9:	learn: 0.0459769	total: 164ms	remaining: 655ms
10:	learn: 0.0366427	total: 180ms	remaining: 639ms
11:	learn: 0.0288213	total: 198ms	remaining: 626ms
12:	learn: 0.0234465	total: 215ms	remaining: 612ms
13:	learn: 0.0190069	total: 232ms	remaining: 597ms
14:	learn: 0.0155068	total: 249ms	remaining: 581ms
15:	learn: 0.0127161	total: 266ms	remaining: 566ms
16:	learn: 0.0105022	total: 283ms	remaining: 549ms
17:	learn: 0.0088199	total: 301ms	remaining: 534ms
18:	learn: 0.0074319	total: 317ms	remaining: 517ms
19:	learn: 0.0062515	total: 333ms	rem

In [77]:
print("Accuracy: %0.5f (+/- %0.5f)" % (accuracy.mean(), accuracy.std() * 2))
print("F1 Score: %0.5f (+/- %0.5f)" % (f.mean(), f.std() * 2))
print("Precision: %0.5f (+/- %0.5f)" % (precision.mean(), precision.std() * 2))
print("Recall: %0.5f (+/- %0.5f)" % (recall.mean(), recall.std() * 2))

Accuracy: 1.00000 (+/- 0.00000)
F1 Score: 1.00000 (+/- 0.00000)
Precision: 1.00000 (+/- 0.00000)
Recall: 1.00000 (+/- 0.00000)
