In [1]:
import sys
import wfdb
import os
import matplotlib.pyplot as plt
from wfdb import processing 
import numpy as np
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.download_db import (
    get_signals,
    get_db,
)

from analyse.utils.global_config import GlobalConfig

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x108e23f10>

In [2]:
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name = "MIT-BIH-AtrialFibrillation"

db_path = get_db(url, name, "../../analyse/data/")

signals = get_signals(db_path, reload=False)

In [3]:
windows = []
classification = []
for sig in signals:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows.append(metrics)
        classification.append(has_defect)
print(len(windows))

2294582


In [4]:
print(windows[1])

{'median': -0.004695249641794352, 'mean': 0.03891665901297154, 'variance': 0.09334862334380883, 'mean_abs': 0.20739941875907825, 'max': 0.923728813559322, 'min': -0.4208333333333333, 'sum': 1.167499770389146, 'AAA': 5, 'AAB': 0, 'AAC': 2, 'ABA': 1, 'ABB': 0, 'ABC': 0, 'ACA': 1, 'ACB': 2, 'ACC': 0, 'BAA': 1, 'BAB': 1, 'BAC': 1, 'BBA': 1, 'BBB': 0, 'BBC': 3, 'BCA': 1, 'BCB': 2, 'BCC': 0, 'CAA': 2, 'CAB': 0, 'CAC': 0, 'CBA': 1, 'CBB': 4, 'CBC': 0, 'CCA': 0, 'CCB': 0, 'CCC': 0}


In [5]:
print(''.join(signals[0].windows[1].alphabet))

CBBCBBCAAAACAACBABACBBCBBAAAAA


In [6]:
from sklearn.model_selection import train_test_split

import pandas as pd


windows_pd = pd.DataFrame(windows)
classification_pd = pd.DataFrame(classification)

X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=0)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

rf = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=GlobalConfig.get("est_params"),
    n_jobs=-1,
    scoring='roc_auc',
    verbose=3,
    refit=True
)

In [9]:
rf_model = rf.fit(X_train, y_train)


Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 2/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.990 total time=16.7min
[CV 3/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.990 total time=16.7min
[CV 4/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.990 total time=16.7min
[CV 1/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.989 total time=17.0min
[CV 5/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.989 total time=17.1min
[CV 3/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=31.0min
[CV 2/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=31.3min
[CV 1/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=31.3min
[CV 4/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.992 total time=29.5min
[CV 5/5] END eta=0.05, max_depth=6, n_estimators=100,

In [13]:
print(rf_model.score(X_test, y_test))

0.9990214641343784


In [17]:
results = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(windows_pd, classification_pd, random_state=i)
    results.append(rf_model.score(X_test, y_test))

print(np.mean(results))

0.999336115629704


In [20]:
only_bad_window = []
only_bad_classification = []
for i in range(len(windows)):
    if classification[i] == 1:
        only_bad_window.append(windows[i])
        only_bad_classification.append(1)

only_bad_window = pd.DataFrame(only_bad_window)
only_bad_classification = pd.DataFrame(only_bad_classification)

print(len(only_bad_window))

1071632


In [26]:
print(
    len([i for i in rf_model.predict(only_bad_window) if i == 1]) / len(only_bad_window)
)



0.9866241396300223


In [27]:
only_good_window = []
only_good_classification = []
for i in range(len(windows)):
    if classification[i] == 0:
        only_good_window.append(windows[i])
        only_good_classification.append(1)

only_good_window = pd.DataFrame(only_good_window)
only_good_classification = pd.DataFrame(only_good_classification)

print(len(only_good_window))

1222950


In [29]:
print(
    len([i for i in rf_model.predict(only_good_window) if i == 0]) / len(only_good_window)
)

0.9926750889243223


In [12]:
mode_file_name = "model.pickle" 
 
with open(mode_file_name, 'wb') as bin_file:
    pickle.dump(
        rf_model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )