In [1]:
import logging
import pickle
import sys
from pathlib import Path

import numpy as np

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils import download_db as ddb
from analyse.utils.global_config import CONFIG

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


In [2]:
# Send everything from DEBUG upwards to a log file that is truncated
# (filemode='w') each time this cell runs.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s: %(message)s',
    filename='run-logs.log',
    filemode='w',
    encoding='utf-8',
)

# Download data and preprocess it

In [3]:
# Fetch every signal through the project's download_db helper.
# NOTE(review): `signals` is not referenced by any later cell visible here — confirm it is still needed.
signals = ddb.get_all_signals()

In [4]:
# Obtain the train/test feature matrices and label vectors from the project helper.
# Presumably test_size=0.3 holds out ~30% of the data and seed=42 makes the split
# repeatable — confirm against ddb.split_dbs.
X_train, y_train, X_test, y_test = ddb.split_dbs(test_size=0.3, seed=42)


In [5]:
# Preview the training features (the notebook renders a truncated view of the frame).
X_train

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,-0.002451,0.002032,0.000934,0.024158,0.058824,-0.058824,0.060974,28,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.002451,0.000752,0.000837,0.022877,0.056995,-0.058824,0.022559,28,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.000000,0.002929,0.000839,0.023034,0.056995,-0.058824,0.087862,28,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.000000,0.001653,0.000907,0.024309,0.056995,-0.058824,0.049592,28,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.002451,0.000618,0.000898,0.023948,0.056995,-0.058824,0.018548,28,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1682597,-0.306454,2.769789,127.350883,3.679040,53.666667,-1.000000,66.474932,0,0,1,...,2,1,1,0,1,0,4,1,2,3
1682598,-0.290503,2.933693,132.243131,3.795520,53.666667,-1.000000,67.474932,0,0,1,...,2,1,1,0,1,0,3,1,2,3
1682599,-0.306454,2.796087,127.168515,3.652742,53.666667,-1.000000,67.106080,0,0,1,...,3,1,1,0,1,0,3,1,2,3
1682600,-0.306454,2.710769,127.488025,3.579329,53.666667,-1.000000,65.058460,0,0,1,...,2,1,1,0,1,0,3,1,2,4


In [6]:
# Preview the boolean training labels.
y_train

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
1682597,True
1682598,True
1682599,True
1682600,True


In [7]:
# Preview the held-out test features (same columns as the training frame).
X_test

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,0.000000,0.033000,0.088570,0.118645,1.100000,-0.35,0.990002,19,0,1,...,0,1,0,1,0,0,2,0,0,0
1,0.000000,0.033160,0.088560,0.118805,1.100000,-0.35,0.994786,19,0,1,...,0,1,0,1,0,0,2,0,0,0
2,0.000000,0.033328,0.088548,0.118636,1.100000,-0.35,0.999837,19,0,1,...,0,1,0,1,0,0,2,0,0,0
3,0.000000,0.032989,0.088567,0.118298,1.100000,-0.35,0.989685,19,0,1,...,0,1,0,1,0,0,2,0,0,0
4,0.000000,0.033148,0.088557,0.118457,1.100000,-0.35,0.994447,19,0,1,...,0,1,0,1,0,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749613,-0.167675,-0.008293,0.578342,0.618515,1.595238,-1.00,-0.215608,0,0,0,...,6,0,1,1,0,1,5,1,4,1
749614,-0.146002,0.060472,0.560167,0.610357,1.595238,-1.00,1.572271,0,0,0,...,6,0,1,1,0,1,4,1,5,1
749615,-0.126866,0.055407,0.540087,0.590576,1.595238,-1.00,1.496000,0,0,0,...,6,0,1,1,1,1,4,1,5,1
749616,-0.126866,0.029971,0.518256,0.565140,1.595238,-1.00,0.809223,0,0,0,...,5,0,1,1,1,1,4,1,5,1


In [8]:
# Preview the boolean test labels.
y_test

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
749613,True
749614,True
749615,True
749616,True


# Machine learning using Gradient Boosting

In [9]:
# Exhaustive grid search over the XGBoost hyper-parameters stored under
# CONFIG["est_params"]; candidates are ranked by cross-validated ROC AUC.
classifier = GridSearchCV(
    XGBClassifier(),
    CONFIG.get("est_params"),
    scoring='roc_auc',
    refit=True,   # retrain the winning candidate on the full training set
    n_jobs=-1,    # run the fits on all available cores
    verbose=3,    # emit one progress line per finished fold
)

# fit() returns the fitted search object itself, so `model` and `classifier`
# refer to the same object from here on.
model = classifier.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV 4/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=1;, score=0.994 total time= 2.9min
[CV 1/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=3;, score=0.970 total time= 2.9min
[CV 1/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=1;, score=0.970 total time= 2.9min
[CV 3/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=1;, score=0.991 total time= 2.9min
[CV 2/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=1;, score=0.994 total time= 2.9min
[CV 5/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=1;, score=0.916 total time= 2.9min
[CV 3/5] END colsample_bytree=0.3, gamma=0.0, learning_rate=0.05, max_depth=4, min_child_weight=3;, score=0.991 total time= 2.9min
[CV 2/5] END colsam

In [10]:
# Persist the fitted search object (the whole GridSearchCV, not only its best estimator).
# NOTE(review): the variable is probably meant to be `model_file_name`; the
# original spelling is kept in case other cells refer to it.
mode_file_name = "../../analyse/models/XGBClassifier.pickle"

# Make sure the target directory exists so the result of the long grid search
# is not lost to a FileNotFoundError at save time.
Path(mode_file_name).parent.mkdir(parents=True, exist_ok=True)

with open(mode_file_name, 'wb') as bin_file:
    pickle.dump(
        model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

In [11]:
# Hyper-parameter combination that obtained the best mean cross-validated score.
model.best_params_

{'colsample_bytree': 0.4,
 'gamma': 0.1,
 'learning_rate': 0.15,
 'max_depth': 8,
 'min_child_weight': 3}

# TESTS

**Score on the test dataset (ROC AUC)**

In [12]:
# GridSearchCV.score applies the scorer the search was built with (scoring='roc_auc'),
# so this is the test-set ROC AUC of the refitted best estimator, not accuracy.
print(model.score(X_test, y_test))

0.976789074413055


**Calculate the F1 score**

In [13]:
# Hard class predictions from the refitted best estimator, then the F1 score
# of the positive (True) class on the test set.
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred))

0.9370863153104704
