In [1]:
import sys
import wfdb
import os
import matplotlib.pyplot as plt
from wfdb import processing 
import numpy as np
import pickle
import logging

# Put the `analyse` folder and its parent directory on the module search path
# so the `analyse.*` imports below resolve. Both entries are relative paths,
# so they are interpreted against the kernel's current working directory.
sys.path.extend(['../../analyse', '../../'])

from analyse.utils.download_db import (
    get_signals,
    get_db,
)

from analyse.utils.global_config import GlobalConfig

# Route log records to `run-logs.log`; filemode='w' truncates the file each
# time this cell configures logging, and DEBUG is the recording threshold.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s: %(message)s',
    filename='run-logs.log',
    filemode='w',
    encoding='utf-8',
)
# Instantiate the project configuration from the JSON parameter file.
# As the last expression of the cell, the object's repr is echoed as output.
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x105fbf370>

In [2]:
# Archive URL of the PhysioNet MIT-BIH Atrial Fibrillation Database (v1.0.0).
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
# Dataset name handed to get_db — presumably used as the local folder name; verify in download_db.
name = "MIT-BIH-AtrialFibrillation"

# get_db is a project helper (analyse.utils.download_db). It is assumed to fetch
# the archive into the given data directory and return the local path — TODO confirm.
db_path = get_db(url, name, "../../analyse/data/")

# NOTE(review): reload=False presumably reuses already-processed signals
# instead of rebuilding them — confirm against get_signals.
signals = get_signals(db_path, reload=False)

In [3]:
# Gather one (metrics, has_defect) pair per window, visiting signals and their
# windows in order, and split the pairs into two parallel lists.
windows = []
classification = []
# Lazy stream of get_data() results: one call per window, no intermediate list.
window_results = (window.get_data() for sig in signals for window in sig.windows)
for metrics, has_defect in window_results:
    windows.append(metrics)
    classification.append(has_defect)
print(len(windows))

2295319


In [4]:
# Sanity check: show the metrics collected for the second window (index 1).
print(windows[1])

{'median': -0.0024154589371980784, 'mean': 0.03447209835635421, 'variance': 0.09116142464488852, 'mean_abs': 0.22039996150795363, 'max': 0.5657894736842106, 'min': -0.4145299145299145, 'sum': 0.4826093769889589, 'AAA': 2, 'AAB': 0, 'AAC': 1, 'ABA': 0, 'ABB': 0, 'ABC': 0, 'ACA': 1, 'ACB': 0, 'ACC': 0, 'BAA': 0, 'BAB': 0, 'BAC': 0, 'BBA': 0, 'BBB': 0, 'BBC': 2, 'BCA': 1, 'BCB': 2, 'BCC': 0, 'CAA': 1, 'CAB': 0, 'CAC': 0, 'CBA': 0, 'CBB': 2, 'CBC': 0, 'CCA': 0, 'CCB': 0, 'CCC': 0}


In [5]:
# Join the `alphabet` symbols of the first signal's second window into one
# string and print it.
print(''.join(signals[0].windows[1].alphabet))

BCBBCBBCAAAACA


In [6]:
from sklearn.model_selection import train_test_split

import pandas as pd


# Tabulate the per-window metric dicts (one row per window) and the labels.
windows_pd = pd.DataFrame(data=windows)
classification_pd = pd.DataFrame(data=classification)

# Fixed random_state keeps the shuffle reproducible. No test_size is given,
# so the split proportions are train_test_split's defaults.
# NOTE(review): rows are split per window; whether windows from the same
# recording may land in both train and test should be checked.
X_train, X_test, y_train, y_test = train_test_split(
    windows_pd,
    classification_pd,
    random_state=0,
)


In [7]:
# Print the hyper-parameter grid from the project config, followed by the
# training feature frame, one after the other.
print(GlobalConfig.get("est_params"), X_train, sep="\n")

{'n_estimators': [50, 100, 150], 'max_depth': [6, 8], 'eta': [0.05, 0.15, 0.3], 'verbosity': [0]}
           median      mean  variance  mean_abs       max       min       sum  \
40881   -0.002370 -0.001617  0.000157  0.009932  0.024272 -0.023697 -0.022638   
1500953  0.019927  0.023811  0.025044  0.099390  0.504505 -0.198276  0.333359   
156121  -0.006921  0.001661  0.000357  0.016420  0.035461 -0.020690  0.023253   
1715870  0.000000  0.032186  0.089131  0.123081  1.042017 -0.331461  0.450599   
930489  -0.002463 -0.001677  0.000101  0.008740  0.014925 -0.019324 -0.023484   
...           ...       ...       ...       ...       ...       ...       ...   
2249467  0.000000  0.018269  0.034983  0.108499  0.441441 -0.439394  0.255765   
963395   0.000000  0.001494  0.000040  0.005183  0.015544 -0.005208  0.020913   
2215104 -0.028302  0.071122  0.166065  0.347100  0.724138 -0.453125  0.995711   
1484405  0.000000  0.000444  0.000126  0.008695  0.016043 -0.026178  0.006217   
305711  -0.

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Grid search over the configured "est_params" grid for an XGBoost classifier,
# with candidates scored by ROC AUC.
# NOTE(review): the search object is called `rf` although the estimator is an
# XGBClassifier, not a random forest; the name is kept because later cells use it.
rf = GridSearchCV(
    XGBClassifier(),
    param_grid=GlobalConfig.get("est_params"),
    scoring='roc_auc',
    refit=True,
    n_jobs=-1,
    verbose=3,
)

In [9]:
# Run the grid search on the training split. In the saved session this call was
# stopped by a KeyboardInterrupt, so `rf_model` was never assigned there.
rf_model = rf.fit(X_train, y_train)


Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

In [4]:
import regex as re

# Scratch check of overlapping n-gram matching. `re` is bound to the
# third-party `regex` package by the import above, not the standard-library
# module; `overlapped=True` is what makes the call return overlapping matches.
string = "AAABCAA"
ngramm = 'AA'
re.findall(pattern=ngramm, string=string, overlapped=True)

['AA', 'AA', 'AA']

In [13]:
from itertools import product

string = "ABC"

# Every 3-symbol word over the characters of `string`, in itertools.product order.
print([''.join(symbols) for symbols in product(string, repeat=3)])

['AAA', 'AAB', 'AAC', 'ABA', 'ABB', 'ABC', 'ACA', 'ACB', 'ACC', 'BAA', 'BAB', 'BAC', 'BBA', 'BBB', 'BBC', 'BCA', 'BCB', 'BCC', 'CAA', 'CAB', 'CAC', 'CBA', 'CBB', 'CBC', 'CCA', 'CCB', 'CCC']
