In [15]:
import sys
import numpy as np
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils import download_db as ddb
from analyse.utils.global_config import CONFIG

# Send all log records (DEBUG and above) to a fresh UTF-8 file on every run.
# `force=True` removes any handlers already attached to the root logger first;
# without it basicConfig() silently does nothing when a handler exists, which
# happens when this cell is re-run or when an earlier import configured logging.
logging.basicConfig(
    filename='run-logs.log',
    encoding='utf-8',
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG,
    filemode='w',  # truncate the log file instead of appending to it
    force=True
)

# Download data and preprocess it

In [16]:
# Load every available signal through the project's download_db helper.
# NOTE(review): the data source and return type are defined in analyse.utils.download_db — confirm there.
signals = ddb.get_all_signals()

In [17]:
# Preprocess the signals and split them into train/test features and labels,
# with a fixed seed so the split is repeatable.
# NOTE(review): with test_size=0.5 the displayed frames end at row index 1088434 (train)
# vs 1343782 (test), so the split is presumably per signal rather than per row — confirm in the helper.
X_train, y_train, X_test, y_test = ddb.split_preprocess_signals(signals, test_size=0.5, seed=42)


In [18]:
# Preview the training features (the rendering is abbreviated with "..." rows and columns).
X_train

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,0.004706,-0.000326,0.000551,0.020020,0.033175,-0.049327,-0.009769,28,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.006999,0.001191,0.000592,0.020939,0.036530,-0.049327,0.035729,28,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.006999,0.001223,0.000589,0.020907,0.036530,-0.049327,0.036686,28,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.006999,0.001497,0.000609,0.021182,0.041096,-0.049327,0.044918,28,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.004706,-0.000127,0.000646,0.021897,0.041096,-0.049327,-0.003806,28,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1088431,0.000000,0.001550,0.003017,0.035990,0.156627,-0.187500,0.046491,24,1,0,...,0,1,0,0,0,0,0,0,0,0
1088432,0.000000,0.000161,0.002994,0.035395,0.156627,-0.187500,0.004824,24,1,0,...,0,1,0,0,0,0,0,0,0,0
1088433,0.000000,0.001333,0.002980,0.035026,0.156627,-0.187500,0.039994,24,1,0,...,0,1,0,0,0,0,0,0,0,0
1088434,0.000000,0.001735,0.003002,0.035427,0.156627,-0.187500,0.052040,24,1,0,...,0,1,0,0,0,0,0,0,0,0


In [19]:
# Preview the training labels (boolean values, one per feature row).
y_train

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
1088431,True
1088432,True
1088433,True
1088434,True


In [20]:
# Preview the test features (same columns as the training features).
X_test

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,0.000000,0.012293,0.034454,0.063244,0.931298,-0.324742,0.368796,25,0,0,...,0,1,0,0,0,0,1,0,0,0
1,0.000000,0.022771,0.030575,0.052766,0.931298,-0.237154,0.683122,26,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0.000000,-0.008097,0.002118,0.021898,0.032432,-0.237154,-0.242913,27,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0.002604,-0.000017,0.000310,0.014168,0.032432,-0.046632,-0.000523,28,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0.002604,0.001363,0.000236,0.012787,0.032432,-0.026316,0.040901,28,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1343779,0.003623,0.000620,0.005276,0.036957,0.203540,-0.289308,0.018596,23,1,0,...,0,0,0,0,1,0,0,0,0,0
1343780,0.003704,0.001929,0.005343,0.038266,0.203540,-0.289308,0.057861,23,1,0,...,0,0,0,0,1,0,0,0,0,0
1343781,0.000027,0.001188,0.005362,0.039007,0.203540,-0.289308,0.035639,23,1,0,...,0,0,0,0,1,0,0,0,0,0
1343782,0.007407,0.002917,0.005370,0.039297,0.203540,-0.289308,0.087525,23,1,0,...,0,0,0,0,1,0,0,0,0,0


In [21]:
# Preview the test labels (boolean values, one per test feature row).
y_test

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
1343779,False
1343780,False
1343781,False
1343782,False


# Machine learning using Gradient Boosting

In [8]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Settings for an exhaustive, cross-validated search over the hyper-parameter
# grid stored under "est_params" in the project configuration.
search_settings = dict(
    param_grid=CONFIG.get("est_params"),
    scoring='roc_auc',  # candidates are ranked by ROC AUC
    refit=True,         # retrain the best candidate on the full training set
    n_jobs=-1,          # run the cross-validation fits on all available cores
    verbose=3,          # print one progress line per finished fold
)
classifier = GridSearchCV(estimator=XGBClassifier(), **search_settings)

# fit() returns the fitted search object itself, so `model` is `classifier`.
model = classifier.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 2/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.967 total time=43.0min
[CV 4/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.972 total time=43.6min
[CV 1/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.958 total time=43.9min
[CV 3/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.975 total time=44.2min
[CV 5/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.978 total time=44.5min
[CV 2/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.967 total time=54.3min
[CV 1/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.958 total time=55.1min
[CV 3/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.974 total time=55.3min
[CV 4/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.973 total time=49.3min
[CV 5/5] END eta=0.15, max_depth=12, n_e

In [11]:
mode_file_name = "../../analyse/models/XGBClassifier.pickle"

# Persist the whole fitted search object (`model`), not only its best estimator.
# NOTE: unpickling can execute arbitrary code — only load this file from a trusted location.
with open(mode_file_name, mode='wb') as model_fh:
    pickle.dump(model, model_fh, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
# Inspect the estimator that the grid search refit with the best-scoring parameter combination.
model.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.15,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.150000006, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=16, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, ...)

# TESTS

**Score the model on the test dataset (ROC AUC)**

In [27]:
# A fitted GridSearchCV scores with the metric passed as `scoring` at construction
# ('roc_auc'), so this value is the test-set ROC AUC rather than accuracy.
print(model.score(X_test, y_test))

0.9889147952289529


**Calculate the F1 score**

In [28]:
from sklearn.metrics import f1_score

# Hard class predictions from the refit best estimator, then the F1 score
# of those predictions against the true test labels.
y_pred = model.predict(X_test)
test_f1 = f1_score(y_test, y_pred)
print(test_f1)

0.9594859861436197
