In [2]:
import sys
import wfdb
import os
import matplotlib.pyplot as plt
from wfdb import processing 
import numpy as np
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.download_db import (
    get_signals,
    get_db,
)

from analyse.utils.global_config import GlobalConfig

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x10389d600>

# Download data and preprocess it

In [3]:
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name = "MIT-BIH-AtrialFibrillation"

db_path = get_db(url, name, "../../analyse/data/")

signals = get_signals(db_path, reload=False)

In [4]:
windows = []
classification = []
for sig in signals:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows.append(metrics)
        classification.append(has_defect)
print(len(windows))

2294582


# Machine learning using Gradient Boosting

In [11]:
from sklearn.model_selection import train_test_split

import pandas as pd


windows_pd = pd.DataFrame(windows)
classification_pd = pd.DataFrame(classification)

X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, train_size=0.5, random_state=42)

print(len(X_train) / len(X_test))


1.0


In [12]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

rf = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=GlobalConfig.get("est_params"),
    n_jobs=-1,
    scoring='roc_auc',
    verbose=3,
    refit=True
)

In [13]:
rf_model = rf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 3/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.989 total time= 5.4min
[CV 5/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.989 total time= 5.4min
[CV 4/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.990 total time= 5.4min
[CV 2/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.989 total time= 5.4min
[CV 1/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.989 total time= 5.4min
[CV 1/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=10.7min
[CV 3/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=10.8min
[CV 2/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=10.9min
[CV 5/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=11.0min
[CV 4/5] END eta=0.05, max_depth=6, n_estimators=100

In [23]:
mode_file_name = "../../analyse/models/XGBClassifier.pickle" 
 
with open(mode_file_name, 'wb') as bin_file:
    pickle.dump(
        rf_model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

In [24]:
print(rf_model.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.3,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, ...)


# TESTS

**Check test dataset**

In [15]:
print(rf_model.score(X_test, y_test))

0.9997132910989879


**Create some more datasets for testing and get mean**

In [17]:
results = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=i)
    results.append(rf_model.score(X_test, y_test))

print("average:", np.mean(results))
print("min:", np.min(results))
print("max:", np.max(results))

average: 0.9998702403517342
min: 0.9997048967447011
max: 0.9998920911084009


**Let's try to create dataset full of ecg siganls with defects**

In [18]:
only_bad_window = []
for i in range(len(windows)):
    if classification[i] == 1:
        only_bad_window.append(windows[i])

only_bad_window = pd.DataFrame(only_bad_window)

print(len(only_bad_window))

1071632


In [19]:
print(
    len([i for i in rf_model.predict(only_bad_window) if i == 1]) / len(only_bad_window)
)


0.9955833719037879


**Let's try to create dataset full of ecg siganls without defects**

In [20]:
only_good_window = []
for i in range(len(windows)):
    if classification[i] == 0:
        only_good_window.append(windows[i])

only_good_window = pd.DataFrame(only_good_window)

print(len(only_good_window))

1222950


In [21]:
print(
    len([i for i in rf_model.predict(only_good_window) if i == 0]) / len(only_good_window)
)

0.9980015536203443


**Check all ecg signals at once**

In [22]:
print(rf_model.score(windows_pd, classification_pd))

0.9998726263213067
