In [1]:
import sys
import os
import numpy as np
import pandas as pd
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.global_config import GlobalConfig
from analyse.utils.download_db import (
    get_signals,
    get_db,
)

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x111000130>

In [43]:
# Downloading dataset
url = "https://physionet.org/static/published-projects/vfdb/mit-bih-malignant-ventricular-ectopy-database-1.0.0.zip"
name = "MIT-BIH-Malignant-Ventricular-Ectopy"

db_path = get_db(url, name, "../../analyse/data/")

signals = get_signals(db_path, reload=False)

In [52]:
# Preprocessing
windows = []
classification = []
for sig in signals:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows.append(metrics)
        classification.append(has_defect)
print(len(windows))

137638


In [53]:
# Splitting dataset into training and testing datasets
X = pd.DataFrame(windows)
y = pd.DataFrame(classification)

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

print(X_test.iloc[:,0:-1].values)
print(len(X_train))

[[-2.19882729e-01 -1.41107676e-03  1.39524628e+00 ...  2.00000000e+00
   1.00000000e+00  2.00000000e+00]
 [ 0.00000000e+00 -6.55222008e-04  3.98165094e-04 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-1.68241793e-02  1.03953335e-02  7.21503758e-02 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 ...
 [-7.49074178e-03  6.84665386e-02  1.94494871e-01 ...  4.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [-6.09756098e-02  3.29288199e-02  3.33371699e-01 ...  4.00000000e+00
   2.00000000e+00  3.00000000e+00]
 [-3.82072829e-02  1.83617180e-02  5.84644005e-02 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]
110110


In [54]:
# Searching for the best hyper-parameters:
xgb_estimator = XGBClassifier()
xgb_clsf = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=GlobalConfig.get("est_params"),
    verbose=4,
    n_jobs=-1,
    refit=True,
    scoring='roc_auc'
)
xgb_clsf_model = xgb_clsf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


In [55]:
# Caching model
model_filename = "../../analyse/models/XGBClassifier_Malignant_Ventricular_Ectopy.pickle"

with open(model_filename, 'wb') as bin_file:
    pickle.dump(
        xgb_clsf_model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

[CV 2/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.913 total time=  39.4s
[CV 5/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.914 total time=  44.3s
[CV 4/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.938 total time= 1.4min
[CV 3/5] END eta=0.05, max_depth=6, n_estimators=150, verbosity=0;, score=0.948 total time= 2.2min
[CV 2/5] END eta=0.05, max_depth=6, n_estimators=200, verbosity=0;, score=0.958 total time= 2.9min
[CV 1/5] END eta=0.05, max_depth=8, n_estimators=50, verbosity=0;, score=0.949 total time=  58.2s
[CV 3/5] END eta=0.05, max_depth=8, n_estimators=50, verbosity=0;, score=0.953 total time= 1.0min
[CV 1/5] END eta=0.05, max_depth=8, n_estimators=100, verbosity=0;, score=0.967 total time= 2.0min
[CV 5/5] END eta=0.05, max_depth=8, n_estimators=100, verbosity=0;, score=0.969 total time= 1.9min
[CV 4/5] END eta=0.05, max_depth=8, n_estimators=150, verbosity=0;, score=0.976 total time= 2.7min
[CV 3/5] END e

[CV 1/5] END eta=0.05, max_depth=6, n_estimators=50, verbosity=0;, score=0.912 total time=  39.5s
[CV 1/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.937 total time= 1.4min
[CV 5/5] END eta=0.05, max_depth=6, n_estimators=100, verbosity=0;, score=0.940 total time= 1.4min
[CV 4/5] END eta=0.05, max_depth=6, n_estimators=150, verbosity=0;, score=0.951 total time= 2.2min
[CV 3/5] END eta=0.05, max_depth=6, n_estimators=200, verbosity=0;, score=0.955 total time= 2.9min
[CV 2/5] END eta=0.05, max_depth=8, n_estimators=50, verbosity=0;, score=0.951 total time=  59.1s
[CV 5/5] END eta=0.05, max_depth=8, n_estimators=50, verbosity=0;, score=0.952 total time=  54.6s
[CV 4/5] END eta=0.05, max_depth=8, n_estimators=100, verbosity=0;, score=0.969 total time= 2.1min
[CV 3/5] END eta=0.05, max_depth=8, n_estimators=150, verbosity=0;, score=0.976 total time= 2.7min
[CV 2/5] END eta=0.05, max_depth=8, n_estimators=200, verbosity=0;, score=0.981 total time= 3.4min
[CV 1/5] END 

In [57]:
print(xgb_clsf_model.best_estimator_)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.3,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, ...)


In [108]:
# Getting scores
print("Test dataset:", xgb_clsf_model.score(X_test, y_test))
print("Train dataset:", xgb_clsf_model.score(X_train, y_train))
print("Full dataset:", xgb_clsf_model.score(X, y))

Test dataset: 0.9990910605454583
Train dataset: 0.9999998273118195
Full dataset: 0.9998849297201643


In [133]:
from sklearn.metrics import matthews_corrcoef, roc_auc_score

print(matthews_corrcoef(y_test, xgb_clsf_model.predict(X_test)))
print(roc_auc_score(y_test, xgb_clsf.predict(X_test)))

0.9945374614292751
0.9976217691943042


In [109]:
from sklearn.metrics import classification_report
print(classification_report(y_test, xgb_clsf_model.predict(X_test)))

              precision    recall  f1-score   support

       False       0.98      0.98      0.98      7619
        True       0.99      0.99      0.99     19909

    accuracy                           0.99     27528
   macro avg       0.99      0.99      0.99     27528
weighted avg       0.99      0.99      0.99     27528



In [110]:
# Effectiveness on unhealthy signals only
unhealthy_X = X.drop(y.drop(y[y[0] == True].index).index)
unhealthy_X

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
713,-0.039098,0.121850,0.345461,0.419220,1.546667,-0.485507,3.655508,0,0,2,...,4,1,2,1,0,0,5,1,3,0
714,-0.073357,0.078109,0.304098,0.381034,1.546667,-0.485507,2.343259,0,0,2,...,3,1,2,1,0,0,5,1,3,0
715,-0.051171,0.086359,0.300056,0.372783,1.546667,-0.485507,2.590785,0,0,2,...,3,2,2,1,0,0,5,1,2,0
716,-0.051171,0.065598,0.331241,0.393545,1.546667,-0.986486,1.967935,0,0,3,...,3,2,2,1,0,0,4,1,2,0
717,-0.051171,4.992979,715.392740,5.320926,149.000000,-0.986486,149.789363,0,0,3,...,3,2,2,1,0,0,4,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137633,0.029740,0.144267,0.373776,0.458281,1.949367,-0.658747,4.328006,0,0,0,...,3,0,0,0,0,4,4,0,5,0
137634,-0.048210,0.124958,0.378138,0.456837,1.949367,-0.658747,3.748738,0,0,0,...,2,0,0,0,0,4,5,0,5,0
137635,0.029740,0.153716,0.374756,0.462206,1.949367,-0.658747,4.611483,0,0,0,...,2,0,0,0,0,4,5,0,4,0
137636,0.029740,0.152094,0.376591,0.463828,1.949367,-0.658747,4.562824,0,0,0,...,2,0,0,0,0,4,5,0,4,0


In [111]:
print("Accuracy for unhealthy signals:",
      len([row for row in xgb_clsf_model.predict(unhealthy_X) if row == True]) / len(unhealthy_X))

Accuracy for unhealthy signals: 0.9982813379432339


In [112]:
# Effectiveness on healthy signals only
healthy_X = X.drop(y.drop(y[y[0] == False].index).index)
healthy_X

Unnamed: 0,median,mean,variance,mean_abs,max,min,sum,AAA,AAB,AAC,...,BCC,CAA,CAB,CAC,CBA,CBB,CBC,CCA,CCB,CCC
0,-0.042558,0.110940,0.314108,0.389761,1.261905,-0.413043,3.328204,3,0,2,...,3,3,1,0,0,0,6,0,3,0
1,-0.021796,0.122727,0.306837,0.377974,1.261905,-0.413043,3.681810,4,0,2,...,3,3,1,0,0,0,5,0,3,0
2,-0.021796,0.082418,0.262118,0.337665,1.259259,-0.413043,2.472537,5,0,2,...,3,3,1,0,0,0,5,0,3,0
3,-0.021796,0.090877,0.257901,0.329206,1.259259,-0.413043,2.726296,6,0,2,...,3,2,1,0,0,0,5,0,3,0
4,-0.032246,0.076556,0.266664,0.343527,1.259259,-0.437037,2.296667,5,0,3,...,3,2,1,0,0,0,5,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132612,0.010714,0.018473,0.061482,0.190191,0.574468,-0.347973,0.554195,0,0,4,...,0,1,1,0,1,3,2,0,0,0
132613,0.010714,0.027861,0.057819,0.180803,0.574468,-0.347973,0.835834,1,0,4,...,0,1,0,0,1,3,2,0,0,0
132614,0.010714,0.030679,0.057356,0.177985,0.574468,-0.347973,0.920377,2,0,4,...,0,1,0,0,1,3,2,0,0,0
132615,0.010714,0.035478,0.062780,0.182783,0.670270,-0.347973,1.064331,2,1,4,...,0,1,0,0,1,3,2,0,0,0


In [113]:
print("Accuracy for healthy signals:",
      len([row for row in xgb_clsf_model.predict(healthy_X) if row == False]) / len(healthy_X))

Accuracy for healthy signals: 0.996617901525877


In [134]:
# Checking accuracy on random datasets
scores = []
matthews = []
roc_auc = []
for i in range(0, 300, 2):
    _, X_test, _, y_test = train_test_split(X, y, test_size = 0.3, random_state = i)
    scores.append(xgb_clsf_model.score(X_test, y_test))
    matthews.append(matthews_corrcoef(y_test, xgb_clsf_model.predict(X_test)))
    roc_auc.append(roc_auc_score(y_test, xgb_clsf_model.predict(X_test)))

print("SCORES:")
print("average:", np.mean(scores))
print("min:", np.min(scores))
print("max:", np.max(scores))

print("\nMatthews_corrcoef:")
print("average:", np.mean(matthews))
print("min:", np.min(matthews))
print("max:", np.max(matthews))

print("\nRoc_auc_score:")
print("average:", np.mean(roc_auc))
print("min:", np.min(roc_auc))
print("max:", np.max(roc_auc))



SCORES:
average: 0.9998863416743086
min: 0.999801133297194
max: 0.9999393430179293

Matthews_corrcoef:
average: 0.9945612042773472
min: 0.9926482125525635
max: 0.9956670783927253

Roc_auc_score:
average: 0.99744264917733
min: 0.996615775676747
max: 0.9981068098541414


In [114]:
# Не очень понятно, что делать с NOISE
import wfdb
data, info = wfdb.rdsamp(f"{db_path}/421")
data = np.array(data)

info['annotation'] = wfdb.rdann(f"{db_path}/421", 'atr')
print(info)
info['annotation'].__dict__['aux_note']

{'fs': 250, 'sig_len': 525000, 'n_sig': 2, 'base_date': None, 'base_time': None, 'units': ['mV', 'mV'], 'sig_name': ['ECG', 'ECG'], 'comments': [], 'annotation': <wfdb.io.annotation.Annotation object at 0x13ef5dbb0>}


['(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(NOISE\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 '(VT\x00',
 '(N\x00',
 

In [143]:
# Trying another hyper-parameters search with wider range
from sklearn.model_selection import RandomizedSearchCV
xgb_estimator_enhanced = XGBClassifier()
xgb_clsf_enhanced = RandomizedSearchCV(
    xgb_estimator_enhanced,
    {
        "max_depth" : range(2, 16, 1),
        "n_estimators" : range(60, 261, 40),
        "learning_rate": np.linspace(0.01, 2, 20),
        "subsample": np.linspace(0.7, 0.9, 20),
        "colsample_bytree": np.linspace(0.5, 0.98, 10),
        "min_child_weight": range(1, 9, 1)
    },
    cv=3,
    scoring='roc_auc',
    n_iter=300,
    n_jobs=-1
)

xgb_clsf_enhanced_model = xgb_clsf_enhanced.fit(X_train, y_train)

In [173]:
xgb_clsf_enhanced.best_params_

{'subsample': 0.8473684210526315,
 'n_estimators': 220,
 'min_child_weight': 1,
 'max_depth': 12,
 'learning_rate': 0.3242105263157895,
 'colsample_bytree': 0.7666666666666666}

In [153]:
# Getting scores
print("Test dataset:", xgb_clsf_enhanced_model.score(X_test, y_test))
print("Train dataset:", xgb_clsf_enhanced_model.score(X_train, y_train))
print("Full dataset:", xgb_clsf_enhanced_model.score(X, y))

# Checking accuracy on random datasets
scores = []
matthews = []
roc_auc = []
for i in range(0, 300, 2):
    _, X_test, _, y_test = train_test_split(X, y, test_size = 0.3, random_state = i)
    scores.append(xgb_clsf_enhanced_model.score(X_test, y_test))
    matthews.append(matthews_corrcoef(y_test, xgb_clsf_enhanced_model.predict(X_test)))
    roc_auc.append(roc_auc_score(y_test, xgb_clsf_enhanced_model.predict(X_test)))

print("SCORES:")
print("average:", np.mean(scores))
print("min:", np.min(scores))
print("max:", np.max(scores))

print("\nMatthews_corrcoef:")
print("average:", np.mean(matthews))
print("min:", np.min(matthews))
print("max:", np.max(matthews))

print("\nRoc_auc_score:")
print("average:", np.mean(roc_auc))
print("min:", np.min(roc_auc))
print("max:", np.max(roc_auc))

Test dataset: 0.9999152195652785
Train dataset: 0.9999999866212971
Full dataset: 0.9999113109116669
SCORES:
average: 0.9999133638710184
min: 0.9998456085679346
max: 0.9999644684064626

Matthews_corrcoef:
average: 0.9947471507844298
min: 0.9933690394780842
max: 0.9961895964218587

Roc_auc_score:
average: 0.997455489262267
min: 0.9967884217483203
max: 0.9981620312601638


In [154]:
# Caching new model and deleting old
model_filename = "../../analyse/models/XGBClassifier_Malignant_Ventricular_Ectopy.pickle"

with open(model_filename, 'wb') as bin_file:
    pickle.dump(
        xgb_clsf_enhanced_model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

In [151]:
# Checking new model on old Atrial Fibrillation dataset
url = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name = "MIT-BIH-AtrialFibrillation"

db_path = get_db(url, name, "../../analyse/data/")

signals_atrial = get_signals(db_path, reload=False)

In [152]:
windows_atrial = []
classification_atrial = []
for sig in signals_atrial:
    for window in sig.windows:
        metrics, has_defect = window.get_data()
        windows_atrial.append(metrics)
        classification_atrial.append(has_defect)
print(len(windows_atrial))

2294582


In [155]:
# splitting dataset into training and testing data
X_atrial = pd.DataFrame(windows_atrial)
y_atrial = pd.DataFrame(classification_atrial)

X_atrial_train, X_atrial_test, y_atrial_train, y_atrial_test = train_test_split(X_atrial, y_atrial, 
                                                                                test_size = 0.2, random_state = 21)

In [156]:
# Getting scores of new model for old dataset
print("Test dataset:", xgb_clsf_enhanced_model.score(X_atrial_test, y_atrial_test))
print("Full dataset:", xgb_clsf_enhanced_model.score(X_atrial, y_atrial))

Test dataset: 0.49404899554212184
Full dataset: 0.49449987675555307
