In [24]:
import sys
import numpy as np
import pandas as pd
import pickle
import logging

sys.path.append('../../analyse')
sys.path.append('../../')

from analyse.utils.global_config import GlobalConfig
from analyse.utils.download_db import (
    get_signals,
    get_db,
)

logging.basicConfig(
    filename='run-logs.log', 
    encoding='utf-8', 
    format='%(asctime)s %(levelname)s: %(message)s',
    level=logging.DEBUG, 
    filemode='w'
)
GlobalConfig(r'../../analyse/config/params.json')

<analyse.utils.global_config.GlobalConfig at 0x17aef4c40>

In [25]:
# Downloading datasets
url1 = "https://physionet.org/static/published-projects/vfdb/mit-bih-malignant-ventricular-ectopy-database-1.0.0.zip"
name1 = "MIT-BIH-Malignant-Ventricular-Ectopy"
url2 = "https://physionet.org/static/published-projects/afdb/mit-bih-atrial-fibrillation-database-1.0.0.zip"
name2 = "MIT-BIH-AtrialFibrillation"

db_path1 = get_db(url1, name1, "../../analyse/data/")
db_path2 = get_db(url2, name2, "../../analyse/data/")

signals1 = get_signals(db_path1, reload=False)
signals2 = get_signals(db_path2, reload=False)

In [27]:
# Splitting and preprocessing
from analyse.utils.download_db import split_preprocess_signals
X_train1, y_train1, X_test1, y_test1 = split_preprocess_signals(signals1, test_size=0.2)
X_train2, y_train2, X_test2, y_test2 = split_preprocess_signals(signals2, test_size=0.45)

In [41]:
# Uniting two datasets
X_train = pd.concat([X_train1, X_train2], axis=0)
y_train = pd.concat([y_train1, y_train2], axis=0)
X_test = pd.concat([X_test1, X_test2], axis=0)
y_test = pd.concat([y_test1, y_test2], axis=0)

In [49]:
# Training model on both datasets with searching best hyper-parameters
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

xgb_estimator = XGBClassifier(n_jobs=-1)
xgb_clsf = RandomizedSearchCV(
    xgb_estimator,
    {
        "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
         "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
         "min_child_weight": [1, 3, 5, 7],
         "gamma": [0.0, 0.1, 0.2 , 0.3, 0.4],
         "colsample_bytree": [0.3, 0.4, 0.5, 0.7]
    },
    cv=3,
    scoring="f1",
    n_iter=100,
    n_jobs=-1,
    verbose=3,
    random_state=42
)

xgb_clsf_model = xgb_clsf.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 2/3] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.3, max_depth=5, min_child_weight=5;, score=0.940 total time= 3.9min
[CV 3/3] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.3, max_depth=15, min_child_weight=3;, score=0.939 total time=14.0min
[CV 1/3] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.15, max_depth=12, min_child_weight=5;, score=0.876 total time= 8.7min
[CV 2/3] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.3, max_depth=8, min_child_weight=1;, score=0.939 total time= 4.7min
[CV 3/3] END colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=8, min_child_weight=5;, score=0.942 total time= 5.8min
[CV 1/3] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.2, max_depth=3, min_child_weight=1;, score=0.870 total time= 2.3min
[CV 2/3] END colsample_bytree=0.5, gamma=0.2, learning_rate=0.2, max_depth=3, min_child_weight=1;, score=0.938 total time= 2.3min
[CV 3/3] END colsample_

[CV 1/3] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.3, max_depth=5, min_child_weight=5;, score=0.872 total time= 3.9min
[CV 1/3] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=5;, score=0.874 total time= 7.7min
[CV 2/3] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=5;, score=0.939 total time= 7.5min
[CV 3/3] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.15, max_depth=12, min_child_weight=5;, score=0.941 total time= 8.7min
[CV 2/3] END colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=8, min_child_weight=5;, score=0.939 total time= 5.8min
[CV 3/3] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.25, max_depth=15, min_child_weight=3;, score=0.939 total time=13.9min
[CV 1/3] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.1, max_depth=15, min_child_weight=3;, score=0.878 total time= 8.5min
[CV 1/3] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.25, max_depth=8, min

[CV 3/3] END colsample_bytree=0.3, gamma=0.3, learning_rate=0.3, max_depth=5, min_child_weight=5;, score=0.948 total time= 3.9min
[CV 2/3] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.3, max_depth=15, min_child_weight=3;, score=0.937 total time=14.1min
[CV 2/3] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.15, max_depth=12, min_child_weight=5;, score=0.939 total time= 8.8min
[CV 3/3] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.3, max_depth=8, min_child_weight=1;, score=0.942 total time= 4.7min
[CV 1/3] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.25, max_depth=15, min_child_weight=3;, score=0.875 total time=13.3min
[CV 2/3] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.05, max_depth=5, min_child_weight=1;, score=0.941 total time= 3.7min
[CV 3/3] END colsample_bytree=0.4, gamma=0.1, learning_rate=0.1, max_depth=15, min_child_weight=3;, score=0.943 total time= 8.8min
[CV 3/3] END colsample_bytree=0.3, gamma=0.4, learning_rate=0.25, max_depth=8, min_

[CV 1/3] END colsample_bytree=0.7, gamma=0.4, learning_rate=0.3, max_depth=15, min_child_weight=3;, score=0.874 total time=14.9min
[CV 3/3] END colsample_bytree=0.7, gamma=0.1, learning_rate=0.25, max_depth=8, min_child_weight=5;, score=0.945 total time= 7.5min
[CV 1/3] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.3, max_depth=8, min_child_weight=1;, score=0.876 total time= 4.7min
[CV 1/3] END colsample_bytree=0.5, gamma=0.0, learning_rate=0.25, max_depth=8, min_child_weight=5;, score=0.873 total time= 5.9min
[CV 2/3] END colsample_bytree=0.7, gamma=0.0, learning_rate=0.25, max_depth=15, min_child_weight=3;, score=0.937 total time=14.0min
[CV 3/3] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.05, max_depth=5, min_child_weight=1;, score=0.955 total time= 3.7min
[CV 1/3] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.1, max_depth=4, min_child_weight=7;, score=0.869 total time= 2.4min
[CV 2/3] END colsample_bytree=0.4, gamma=0.4, learning_rate=0.1, max_depth=4, min_ch

In [50]:
# Best hyper-parameters we got
xgb_clsf.best_params_

{'min_child_weight': 7,
 'max_depth': 8,
 'learning_rate': 0.05,
 'gamma': 0.1,
 'colsample_bytree': 0.3}

In [52]:
# Getting scores
print("Test both:", xgb_clsf_model.score(X_test, y_test))
print("Test 1:", xgb_clsf_model.score(X_test1, y_test1))
print("Test 2:", xgb_clsf_model.score(X_test2, y_test2))

Test both: 0.9367307924894126
Test 1: 0.7789117068886193
Test 2: 0.9441382512271994


In [35]:
# Caching new model
model_filename = "../../analyse/models/XGBClassifier_all_datasets.pickle"

with open(model_filename, 'wb') as bin_file:
    pickle.dump(
        xgb_clsf_model,
        file=bin_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )

In [44]:
# Testing Ivan's model on my dataset
ivans_model_filename = r'../../analyse/models/XGBClassifier.pickle'
with open(ivans_model_filename, 'rb') as bin_file:
    ivans_model = pickle.load(bin_file)

print("Test both:", ivans_model.score(X_test, y_test))
print("Test 1:", ivans_model.score(X_test1, y_test1))
print("Test 2:", ivans_model.score(X_test2, y_test2))

Test both: 0.9965899058562665
Test 1: 0.9956356124336642
Test 2: 0.9966083876214605


In [47]:
from sklearn.metrics import f1_score

print("f1 score both:", f1_score(y_test, ivans_model.predict(X_test)))
print("f1 score malignant:", f1_score(y_test1, ivans_model.predict(X_test1)))
print("f1 score atrial:", f1_score(y_test2, ivans_model.predict(X_test2)))

f1 score both: 0.9846669052515246
f1 score malignant: 0.9854424171081028
f1 score atrial: 0.9846284395277595


In [53]:
print("f1 score both:", f1_score(y_test, xgb_clsf_model.predict(X_test)))
print("f1 score malignant:", f1_score(y_test1, xgb_clsf_model.predict(X_test1)))
print("f1 score atrial:", f1_score(y_test2, xgb_clsf_model.predict(X_test2)))

f1 score both: 0.9367307924894126
f1 score malignant: 0.7789117068886193
f1 score atrial: 0.9441382512271994
