In [1]:
import sys
import numpy as np
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils import download_db as ddb
from analyse.utils.global_config import CONFIG

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)

# Download data and preprocess it

In [2]:
signals = ddb.get_all_signals()

In [3]:
windows = []
classification = []
for sig in signals:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows.append(metrics)
        classification.append(has_defect)
print(len(windows))

2432220


# Machine learning using Gradient Boosting

In [4]:
from sklearn.model_selection import train_test_split

import pandas as pd


windows_pd = pd.DataFrame(windows)
classification_pd = pd.DataFrame(classification)

X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, train_size=0.5, random_state=42)

print(len(X_train) / len(X_test))


1.0


In [5]:
windows_pd

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,-0.002326,0.055419,0.099190,0.222344,0.923729,-0.420833,1.662562,4,0,2,...,0,2,0,0,1,4,0,0,0,0
1,-0.004695,0.038917,0.093349,0.207399,0.923729,-0.420833,1.167500,5,0,2,...,0,2,0,0,1,4,0,0,0,0
2,-0.004695,0.040613,0.091894,0.205703,0.923729,-0.420833,1.218393,5,0,3,...,0,2,0,0,1,3,0,0,0,0
3,-0.004695,0.038467,0.091732,0.203557,0.923729,-0.420833,1.154017,5,0,3,...,0,2,0,0,1,3,0,0,0,0
4,-0.004785,0.015771,0.082734,0.188534,0.923729,-0.420833,0.473120,5,0,3,...,0,2,0,1,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432215,0.029740,0.144267,0.373776,0.458281,1.949367,-0.658747,4.328006,0,0,0,...,3,0,0,0,0,4,4,0,5,0
2432216,-0.048210,0.124958,0.378138,0.456837,1.949367,-0.658747,3.748738,0,0,0,...,2,0,0,0,0,4,5,0,5,0
2432217,0.029740,0.153716,0.374756,0.462206,1.949367,-0.658747,4.611483,0,0,0,...,2,0,0,0,0,4,5,0,4,0
2432218,0.029740,0.152094,0.376591,0.463828,1.949367,-0.658747,4.562824,0,0,0,...,2,0,0,0,0,4,5,0,4,0


In [6]:
classification_pd

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
2432215,True
2432216,True
2432217,True
2432218,True


In [7]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

classifier = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=CONFIG.get("est_params"),
    n_jobs=-1,
    scoring='roc_auc',
    verbose=3,
    refit=True
)
model = classifier.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 4/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.998 total time=42.7min
[CV 1/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.998 total time=42.7min
[CV 5/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.998 total time=42.7min
[CV 3/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.998 total time=42.8min
[CV 2/5] END eta=0.15, max_depth=12, n_estimators=200, verbosity=0;, score=0.998 total time=42.9min
[CV 2/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.998 total time=52.5min
[CV 3/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.999 total time=52.6min
[CV 1/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.999 total time=52.6min
[CV 5/5] END eta=0.15, max_depth=12, n_estimators=250, verbosity=0;, score=0.998 total time=50.7min
[CV 4/5] END eta=0.15, max_depth=12, n_e

# TESTS

**Check test dataset**

In [8]:
print(model.score(X_test, y_test))

0.9993620588906265


**Create some more datasets for testing and get mean**

In [9]:
results = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=i)
    results.append(model.score(X_test, y_test))

print("average:", np.mean(results))
print("min:", np.min(results))
print("max:", np.max(results))

average: 0.9997302147602533
min: 0.9993765581072673
max: 0.9997560368324663


In [10]:
mode_file_name = "../../analyse/models/XGBClassifier.pickle" 
 
with open(mode_file_name, 'wb') as bin_file:
    pickle.dump(
        model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

**Let's try to create dataset full of ecg siganls with defects**

In [11]:
only_bad_window = []
for i in range(len(windows)):
    if classification[i] == 1:
        only_bad_window.append(windows[i])

only_bad_window = pd.DataFrame(only_bad_window)

print(len(only_bad_window))

1170323


In [12]:
print(
    len([i for i in model.predict(only_bad_window) if i == 1]) / len(only_bad_window)
)


0.9939973836282804


**Let's try to create dataset full of ecg siganls without defects**

In [13]:
only_good_window = []
for i in range(len(windows)):
    if classification[i] == 0:
        only_good_window.append(windows[i])

only_good_window = pd.DataFrame(only_good_window)

print(len(only_good_window))

1261897


In [14]:
print(
    len([i for i in model.predict(only_good_window) if i == 0]) / len(only_good_window)
)

0.9970243213194104


**Check all ecg signals at once**

In [15]:
print(model.score(windows_pd, classification_pd))

0.9997324742570611
