In [24]:
import sys
import numpy as np
import pandas as pd
import pickle
import logging

# Make the project importable from this notebook's location: both the
# `analyse` directory itself and its parent are put on the module search path.
sys.path.extend(['../../analyse', '../../'])

from analyse.utils.global_config import GlobalConfig
from analyse.utils.download_db import (
    get_signals,
    get_db,
)

# Route log records of level DEBUG and above to run-logs.log using a
# "timestamp LEVEL: message" layout. filemode='w' opens the log for writing
# rather than appending.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s: %(message)s',
    filename='run-logs.log',
    filemode='w',
    encoding='utf-8',
)

# Instantiate the project configuration from params.json. This is the cell's
# last expression, so the created object's repr is echoed as the cell output.
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x17aef4c40>

In [25]:
# Fetch the two PhysioNet archives used in this notebook and load their signals.
# Dataset 1: MIT-BIH Malignant Ventricular Ectopy database (vfdb).
url1 = (
    "https://physionet.org/static/published-projects/vfdb/"
    "mit-bih-malignant-ventricular-ectopy-database-1.0.0.zip"
)
name1 = "MIT-BIH-Malignant-Ventricular-Ectopy"
# Dataset 2: MIT-BIH Atrial Fibrillation database (afdb).
url2 = (
    "https://physionet.org/static/published-projects/afdb/"
    "mit-bih-atrial-fibrillation-database-1.0.0.zip"
)
name2 = "MIT-BIH-AtrialFibrillation"

# Third argument handed to get_db for both datasets
# (presumably the local download/extract directory -- verify against get_db).
data_dir = "../../analyse/data/"

# Tuple elements are evaluated left to right, so dataset 1 is still handled
# before dataset 2, and both get_db calls finish before any get_signals call.
db_path1, db_path2 = (
    get_db(url1, name1, data_dir),
    get_db(url2, name2, data_dir),
)
signals1, signals2 = (
    get_signals(db_path1, reload=False),
    get_signals(db_path2, reload=False),
)

In [27]:
# Splitting and preprocessing
from analyse.utils.download_db import split_preprocess_signals
# Each call yields four objects, unpacked as train/test features and labels.
# The requested test_size differs per dataset: 0.2 for dataset 1 and 0.45 for
# dataset 2.
(X_train1, y_train1, X_test1, y_test1) = split_preprocess_signals(
    signals1,
    test_size=0.2,
)
(X_train2, y_train2, X_test2, y_test2) = split_preprocess_signals(
    signals2,
    test_size=0.45,
)

In [31]:
# Merge the per-dataset splits into one train set and one test set.
# pd.concat stacks along axis 0 by default, so dataset 1 rows come first,
# followed by dataset 2 rows; the original index labels are kept as they are.
X_train = pd.concat([X_train1, X_train2])
X_test = pd.concat([X_test1, X_test2])

y_train = pd.concat([y_train1, y_train2])
y_test = pd.concat([y_test1, y_test2])

In [32]:
# Training model on both datasets with searching best hyper-parameters
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# Fixed seed for the stochastic parts of this cell. Without it the randomized
# search draws a different set of 50 candidates on every run, so best_params_
# and the model saved later are not repeatable after a kernel restart.
SEED = 42

# Base estimator whose hyper-parameters are tuned below.
# NOTE(review): both this estimator and the search use n_jobs=-1; XGBoost's
# documentation warns that nesting parallelism like this can cause thread
# contention -- consider parallelising only one of the two levels.
xgb_estimator = XGBClassifier(n_jobs=-1, random_state=SEED)

# Discrete candidate values from which the search samples combinations.
# NOTE(review): learning_rate is sampled up to 2, whereas XGBoost documents the
# range of eta as [0, 1] -- confirm that the upper bound is intentional.
xgb_param_distributions = {
    "max_depth": range(5, 16, 1),
    "n_estimators": range(60, 221, 40),
    "learning_rate": np.linspace(0.01, 2, 20),
    "subsample": np.linspace(0.7, 0.9, 20),
    "colsample_bytree": np.linspace(0.5, 0.98, 10),
    "min_child_weight": range(1, 9, 1),
}

# 50 sampled candidates x 3-fold cross-validation = 150 fits, ranked by ROC AUC.
xgb_clsf = RandomizedSearchCV(
    xgb_estimator,
    xgb_param_distributions,
    cv=3,
    scoring='roc_auc',
    n_iter=50,
    n_jobs=-1,
    random_state=SEED,  # makes the sampled candidates reproducible
    verbose=3,
)

# fit() hands back the fitted search object, so xgb_clsf_model and xgb_clsf
# refer to the same object; later cells use both names.
xgb_clsf_model = xgb_clsf.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3] END colsample_bytree=0.9266666666666666, learning_rate=0.01, max_depth=13, min_child_weight=6, n_estimators=100, subsample=0.8578947368421053;, score=0.946 total time=24.1min
[CV 3/3] END colsample_bytree=0.6599999999999999, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=180, subsample=0.7105263157894737;, score=0.985 total time=14.6min
[CV 1/3] END colsample_bytree=0.5, learning_rate=1.8952631578947368, max_depth=9, min_child_weight=7, n_estimators=100, subsample=0.8157894736842105;, score=0.925 total time= 5.6min
[CV 1/3] END colsample_bytree=0.9266666666666666, learning_rate=1.6857894736842105, max_depth=13, min_child_weight=6, n_estimators=100, subsample=0.8157894736842105;, score=0.931 total time=13.8min
[CV 2/3] END colsample_bytree=0.9266666666666666, learning_rate=1.5810526315789473, max_depth=6, min_child_weight=8, n_estimators=140, subsample=0.7736842105263158;, score=0.969 total time= 9

[CV 2/3] END colsample_bytree=0.6599999999999999, learning_rate=0.42894736842105263, max_depth=8, min_child_weight=6, n_estimators=60, subsample=0.8157894736842105;, score=0.983 total time= 6.3min
[CV 1/3] END colsample_bytree=0.6599999999999999, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=180, subsample=0.7105263157894737;, score=0.945 total time=16.7min
[CV 2/3] END colsample_bytree=0.6599999999999999, learning_rate=0.01, max_depth=7, min_child_weight=1, n_estimators=180, subsample=0.7105263157894737;, score=0.988 total time=14.9min
[CV 3/3] END colsample_bytree=0.5533333333333333, learning_rate=1.371578947368421, max_depth=9, min_child_weight=7, n_estimators=180, subsample=0.7;, score=0.972 total time=11.6min
[CV 3/3] END colsample_bytree=0.9266666666666666, learning_rate=1.6857894736842105, max_depth=13, min_child_weight=6, n_estimators=100, subsample=0.8157894736842105;, score=0.971 total time=14.8min
[CV 2/3] END colsample_bytree=0.8733333333333333, learning

In [33]:
# Hyper-parameter combination that the randomized search ranked best.
# Left as the cell's last expression so the dict is displayed as cell output.
xgb_clsf.best_params_

{'subsample': 0.7842105263157895,
 'n_estimators': 60,
 'min_child_weight': 1,
 'max_depth': 11,
 'learning_rate': 0.01,
 'colsample_bytree': 0.82}

In [34]:
# Score the fitted search on the combined test set and on each dataset's own
# test set. The search was built with scoring='roc_auc', and scikit-learn's
# search objects use that scorer in score(), so the printed numbers are ROC AUC
# values rather than accuracies.
for split_label, split_X, split_y in (
    ("Test both:", X_test, y_test),
    ("Test 1:", X_test1, y_test1),
    ("Test 2:", X_test2, y_test2),
):
    print(split_label, xgb_clsf_model.score(split_X, split_y))

Test both: 0.9735402948909538
Test 1: 0.5882628833675645
Test 2: 0.9799121296589737


In [35]:
# Serialise the trained model to disk for later reuse.
# Despite the file name, the pickled object is the return value of
# xgb_clsf.fit(...), i.e. the fitted search object, not a bare XGBClassifier.
model_filename = "../../analyse/models/XGBClassifier_all_datasets.pickle"

with open(model_filename, mode='wb') as model_file:
    # Highest available protocol, as in the original call.
    pickle.dump(xgb_clsf_model, model_file, pickle.HIGHEST_PROTOCOL)