In [1]:
import sys
import numpy as np
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.download_db import (
    get_signals,
    get_db,
)

from analyse.utils.global_config import GlobalConfig

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)

# Download data and preprocess it

In [2]:
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name = "MIT-BIH-AtrialFibrillation"

db_path = get_db(url, name, "../../analyse/data/")

signals = get_signals(db_path, reload=False)

In [3]:
windows = []
classification = []
for sig in signals:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows.append(metrics)
        classification.append(has_defect)
print(len(windows))

2294582


# Machine learning using Gradient Boosting

In [4]:
from sklearn.model_selection import train_test_split

import pandas as pd


windows_pd = pd.DataFrame(windows)
classification_pd = pd.DataFrame(classification)

X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, train_size=0.5, random_state=42)

print(len(X_train) / len(X_test))


1.0


In [5]:
windows_pd

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,-0.002326,0.055419,0.099190,0.222344,0.923729,-0.420833,1.662562,4,0,2,...,0,2,0,0,1,4,0,0,0,0
1,-0.004695,0.038917,0.093349,0.207399,0.923729,-0.420833,1.167500,5,0,2,...,0,2,0,0,1,4,0,0,0,0
2,-0.004695,0.040613,0.091894,0.205703,0.923729,-0.420833,1.218393,5,0,3,...,0,2,0,0,1,3,0,0,0,0
3,-0.004695,0.038467,0.091732,0.203557,0.923729,-0.420833,1.154017,5,0,3,...,0,2,0,0,1,3,0,0,0,0
4,-0.004785,0.015771,0.082734,0.188534,0.923729,-0.420833,0.473120,5,0,3,...,0,2,0,1,1,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2294577,-0.224906,2.857417,155.126511,3.605519,61.406250,-1.000000,68.578016,0,0,0,...,2,0,1,1,0,2,2,1,2,5
2294578,-0.236220,2.727775,149.324823,3.476644,61.406250,-1.000000,68.194373,0,0,0,...,2,0,1,1,0,2,2,1,2,5
2294579,-0.213592,5.590630,328.090512,6.307107,71.166466,-1.000000,139.765746,0,0,0,...,2,0,1,0,0,2,2,1,2,5
2294580,-0.236220,5.551042,328.571580,6.346694,71.166466,-1.000000,138.776056,0,0,0,...,2,0,1,0,0,2,3,1,2,5


In [6]:
classification_pd

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
2294577,True
2294578,True
2294579,True
2294580,True


In [26]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

classifier = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid = {
        "max_depth" : [12, 16],
        "n_estimators" : [150, 200, 250],
        "eta" : [0.05, 0.15, 0.3],
    },
    n_jobs=-1,
    scoring='roc_auc',
    verbose=3,
    refit=True
)
model = classifier.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 2/5] END eta=0.05, max_depth=12, n_estimators=150;, score=0.999 total time=62.3min
[CV 3/5] END eta=0.05, max_depth=12, n_estimators=150;, score=0.999 total time=62.3min
[CV 5/5] END eta=0.05, max_depth=12, n_estimators=150;, score=0.999 total time=62.4min
[CV 4/5] END eta=0.05, max_depth=12, n_estimators=150;, score=0.999 total time=62.5min
[CV 1/5] END eta=0.05, max_depth=12, n_estimators=150;, score=0.999 total time=62.5min
[CV 2/5] END eta=0.05, max_depth=12, n_estimators=200;, score=0.999 total time=79.3min
[CV 3/5] END eta=0.05, max_depth=12, n_estimators=200;, score=0.999 total time=79.5min
[CV 1/5] END eta=0.05, max_depth=12, n_estimators=200;, score=0.999 total time=79.5min
[CV 4/5] END eta=0.05, max_depth=12, n_estimators=200;, score=0.999 total time=77.5min
[CV 5/5] END eta=0.05, max_depth=12, n_estimators=200;, score=0.999 total time=77.7min
[CV 2/5] END eta=0.05, max_depth=12, n_estimators=250;, score=0.999 t

# TESTS

**Check test dataset**

In [27]:
print(model.score(X_test, y_test))

0.9998915025238249


**Create some more datasets for testing and get mean**

In [29]:
results = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=i)
    results.append(model.score(X_test, y_test))

print("average:", np.mean(results))
print("min:", np.min(results))
print("max:", np.max(results))

average: 0.9999779465809661
min: 0.9998915025238249
max: 0.9999872063241116


In [30]:
mode_file_name = "../../analyse/models/XGBClassifier-copy.pickle" 
 
with open(mode_file_name, 'wb') as bin_file:
    pickle.dump(
        model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

**Let's try to create dataset full of ecg siganls with defects**

In [31]:
only_bad_window = []
for i in range(len(windows)):
    if classification[i] == 1:
        only_bad_window.append(windows[i])

only_bad_window = pd.DataFrame(only_bad_window)

print(len(only_bad_window))

1071632


In [32]:
print(
    len([i for i in model.predict(only_bad_window) if i == 1]) / len(only_bad_window)
)


0.9989054078265673


**Let's try to create dataset full of ecg siganls without defects**

In [33]:
only_good_window = []
for i in range(len(windows)):
    if classification[i] == 0:
        only_good_window.append(windows[i])

only_good_window = pd.DataFrame(only_good_window)

print(len(only_good_window))

1222950


In [34]:
print(
    len([i for i in model.predict(only_good_window) if i == 0]) / len(only_good_window)
)

0.9995093830491844


**Check all ecg signals at once**

In [35]:
print(model.score(windows_pd, classification_pd))

0.9999790899566827
