In [3]:
import sys
import numpy as np
import pandas as pd
import pickle
import logging

# Put the `analyse` package directory and the repository root on the import
# path (same order as before) so the `analyse.*` imports below can resolve.
sys.path.extend([
    '/content/Storing-Analyzing-ECG/analyse',
    '/content/Storing-Analyzing-ECG/',
])

from analyse.utils.global_config import GlobalConfig
from analyse.utils.download_db import (
    get_signals,
    get_db,
)

# File logging is currently disabled. The configuration is kept as real
# comments instead of a bare triple-quoted string: a string literal is still
# evaluated as an expression statement and is easy to mistake for a docstring.
# Uncomment the call below to re-enable logging to 'run-logs.log'.
# NOTE(review): the `encoding=` argument of logging.basicConfig only exists on
# Python 3.9+ — confirm the runtime version before re-enabling.
# logging.basicConfig(
#     filename='run-logs.log',
#     encoding='utf-8',
#     format='%(asctime)s %(levelname)s: %(message)s',
#     level=logging.DEBUG,
#     filemode='w'
# )

# Instantiate GlobalConfig from the project's params.json. The instance is not
# bound to a name, so its repr is what the cell displays as output.
# NOTE(review): presumably the constructor registers the settings globally for
# the `analyse` helpers — confirm in analyse/utils/global_config.py.
GlobalConfig(r'/content/Storing-Analyzing-ECG/analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x7fd5aa2b1550>

In [8]:
# Fetch both PhysioNet databases via get_db and read their signals.
# All archives go under one shared data directory.
data_dir = "/content/Storing-Analyzing-ECG/analyse/data/"

name1, url1 = (
    "MIT-BIH-Malignant-Ventricular-Ectopy",
    "https://physionet.org/static/published-projects/vfdb/mit-bih-malignant-ventricular-ectopy-database-1.0.0.zip",
)
name2, url2 = (
    "MIT-BIH-AtrialFibrillation",
    "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip",
)

# Same call order as before: both get_db calls first, then both get_signals calls.
db_path1, db_path2 = (
    get_db(url, name, data_dir)
    for url, name in ((url1, name1), (url2, name2))
)

# NOTE(review): `reload=False` presumably reuses previously extracted signals —
# confirm against analyse/utils/download_db.py.
signals1, signals2 = (
    get_signals(db_path, reload=False)
    for db_path in (db_path1, db_path2)
)



In [9]:
# Run the project's split/preprocess helper on each dataset in turn
# (dataset 1 first, then dataset 2) and unpack the four parts of each result.
from analyse.utils.download_db import split_preprocess_signals

(
    (X_train1, y_train1, X_test1, y_test1),
    (X_train2, y_train2, X_test2, y_test2),
) = (split_preprocess_signals(signals) for signals in (signals1, signals2))

In [10]:
# Stack the per-dataset splits row-wise into one combined train/test set.
# Original row labels are kept as-is (no index reset), dataset 1 rows first.
X_train = pd.concat(objs=[X_train1, X_train2], axis="index")
y_train = pd.concat(objs=[y_train1, y_train2], axis="index")

X_test = pd.concat(objs=[X_test1, X_test2], axis="index")
y_test = pd.concat(objs=[y_test1, y_test2], axis="index")

In [12]:
# Fit a single XGBoost classifier on the combined training data, reusing
# hyper-parameters that were found to work well earlier.
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV  # not used in the cells visible here

xgb_params = dict(
    n_estimators=220,
    subsample=0.84736842,
    colsample_bytree=0.766666,
    max_depth=12,
    n_jobs=-1,
)
xgb_clsf = XGBClassifier(**xgb_params)

# Labels are flattened to a 1-D array before fitting.
train_labels = y_train.values.ravel()
xgb_clsf_model = xgb_clsf.fit(X_train, train_labels)

In [14]:
# Report the model's score on the combined test set and on each dataset's
# own test split, in that order.
test_sets = {
    "Test both:": (X_test, y_test),
    "Test 1:": (X_test1, y_test1),
    "Test 2:": (X_test2, y_test2),
}
for label, (features, targets) in test_sets.items():
    print(label, xgb_clsf_model.score(features, targets))

Test both: 0.9368190624380273
Test 1: 0.7021704374788044
Test 2: 0.9525891822195596


In [16]:
# Persist the fitted classifier to disk with the highest pickle protocol
# available in the running interpreter.
model_filename = "/content/Storing-Analyzing-ECG/analyse/models/XGBClassifier_all_datasets.pickle"

with open(model_filename, mode='wb') as model_file:
    pickle.dump(xgb_clsf_model, model_file, pickle.HIGHEST_PROTOCOL)