# Single label Classifier - AF Detection

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [10]:
import sys
import os
#sys.path.append("../../tutorials/fastai/old/") # go to parent dir

import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import scipy


from xverse.transformer import WOE

from structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display

from sklearn import metrics
from sklearn.metrics import f1_score, fbeta_score, make_scorer, recall_score, precision_score

import mlflow
import mlflow.sklearn

import plotly.express as px

## Helper functions

In [4]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)

In [5]:
def print_fscores(m, x, y):
    res = [f1_score(y, m.predict(x), pos_label=0), fbeta_score(y, m.predict(x), beta=2, pos_label=0)]
    print(res)

In [6]:
lead = 'lead2'

In [7]:
from sklearn.model_selection import train_test_split
df = pd.read_feather('datasets/fitted/af-dataset-' + lead)

y = df['label']
X = df.drop('label', axis=1)
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

df_train = X_train.copy()
df_eval = X_eval.copy()
df_train.insert(loc=len(X_train.columns), column='label', value=y_train)
df_eval.insert(loc=len(X_eval.columns), column='label', value=y_eval)

df_train.drop(['index'], axis=1, inplace=True)
df_eval.drop(['index'], axis=1, inplace=True)
df_train

Unnamed: 0,mean_P_Peaks,mean_Q_Peaks,HRV,kurt_R_Peaks,mean_T_Peaks,age,skew_R_Peaks,RSSSD,fmax,mean_S_Peaks,std_R_Peaks,label
4383,92.050000,-69.550000,662.664182,2.548453,39.200000,53,-2.085590,400.867482,6.823821,-165.050000,6.420436e+05,Non-AF
3538,50.090909,-80.181818,550.824341,-1.939266,59.545455,80,0.116944,453.243280,3.476934,-57.909091,9.170248e+05,Non-AF
4943,33.052632,-159.263158,374.499017,-1.991971,106.684211,53,0.006135,207.487979,4.072592,-3.473684,2.103260e+06,Non-AF
5810,-52.809524,-169.428571,511.075782,-1.886651,366.904762,15,0.097520,430.335288,1.047819,-260.952381,4.525275e+06,Non-AF
3429,159.625000,-181.187500,621.583232,-0.468678,68.187500,76,-1.126299,472.166482,2.604989,-103.437500,1.158203e+06,Non-AF
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,-8.666667,-74.916667,458.664110,5.629034,132.208333,63,1.715212,428.202787,0.762050,-69.375000,7.042247e+05,AF
5191,109.692308,-88.384615,674.398843,15.295272,46.358974,64,-3.656884,214.918542,2.926359,-148.769231,4.360616e+05,Non-AF
5226,11.600000,-28.466667,579.097066,0.848942,48.933333,75,-1.590086,386.657029,3.101240,-67.533333,3.631268e+05,AF
5390,38.500000,-112.400000,614.537033,-1.650002,170.600000,48,-0.294958,511.877620,2.100840,-199.900000,3.802282e+05,Non-AF


In [8]:
labels = pd.get_dummies(df_train['label'])
labels.describe()

Unnamed: 0,AF,Non-AF
count,5501.0,5501.0
mean,0.179967,0.820033
std,0.384195,0.384195
min,0.0,0.0
25%,0.0,1.0
50%,0.0,1.0
75%,0.0,1.0
max,1.0,1.0


In [11]:
X_train, y_train, nas = proc_df(df_train, 'label')
X_eval, y_eval, nas = proc_df(df_eval, 'label', na_dict=nas)

m_af = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_features='sqrt', n_jobs=7, oob_score=True)
m_af.fit(X_train, y_train)

print(f'F1 Score: {f1_score(y_eval, m_af.predict(X_eval), pos_label=0)}')
print(f'Precision: {precision_score(y_eval, m_af.predict(X_eval), pos_label=0)}')
print(f'Recall: {recall_score(y_eval, m_af.predict(X_eval), pos_label=0)}')

F1 Score: 0.8269662921348314
Precision: 0.8598130841121495
Recall: 0.7965367965367965


In [10]:
mlflow.set_experiment('AF_sampling_testing-' + lead)
with mlflow.start_run() as run:
    mlflow.log_params({'n_estimators': '1000', 'min_samples_leaf': '1', 'max_features': 'sqrt'})
    mlflow.log_param('n_Features', len(list(X_train.columns)))
    mlflow.log_param('AF mean', labels['AF'].mean())
    mlflow.log_metric('f1_score', f1_score(y_eval, m_af.predict(X_eval)))
    mlflow.log_metric('f2_score', fbeta_score(y_eval, m_af.predict(X_eval), beta=2))

### Oversampling to improve AF representation in the dataset

In [12]:
# Class count
count_non, count_af = df_train.label.value_counts()

# Divide by class
df_af = df_train[df_train['label'] == 'AF']
df_non = df_train[df_train['label'] == 'Non-AF']

df_af_over = df_af.sample(int(count_non), replace=True)
df_train_over = pd.concat([df_non, df_af_over], axis=0)

In [13]:
labels = pd.get_dummies(df_train_over['label'])
labels.describe()

Unnamed: 0,AF,Non-AF
count,9022.0,9022.0
mean,0.5,0.5
std,0.500028,0.500028
min,0.0,0.0
25%,0.0,0.0
50%,0.5,0.5
75%,1.0,1.0
max,1.0,1.0


In [28]:
X_train, y_train, nas = proc_df(df_train_over, 'label')
X_eval, y_eval, nas = proc_df(df_eval, 'label', na_dict=nas)

m_af = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_features='sqrt', n_jobs=7, oob_score=True)
m_af.fit(X_train, y_train)

print(f'F1 Score: {f1_score(y_eval, m_af.predict(X_eval), pos_label=0)}')
print(f'Precision: {precision_score(y_eval, m_af.predict(X_eval), pos_label=0)}')
print(f'Recall: {recall_score(y_eval, m_af.predict(X_eval), pos_label=0)}')

F1 Score: 0.8207343412526998
Precision: 0.8189655172413793
Recall: 0.8225108225108225


In [14]:
mlflow.set_experiment('AF_sampling_testing-' + lead)
with mlflow.start_run() as run:
    mlflow.log_params({'n_estimators': '1000', 'min_samples_leaf': '1', 'max_features': 'sqrt'})
    mlflow.log_param('n_Features', len(list(X_train.columns)))
    mlflow.log_param('AF mean', labels['AF'].mean())
    mlflow.log_metric('f1_score', f1_score(y_eval, m_af.predict(X_eval)))
    mlflow.log_metric('f2_score', fbeta_score(y_eval, m_af.predict(X_eval), beta=2))

In [26]:
import imblearn
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)
X_train_smote.describe()

Unnamed: 0,mean_P_Peaks,mean_Q_Peaks,HRV,kurt_R_Peaks,mean_T_Peaks,age,skew_R_Peaks,RSSSD,fmax,mean_S_Peaks,std_R_Peaks
count,9022.0,9022.0,9022.0,9022.0,9022.0,9022.0,9022.0,9022.0,9022.0,9022.0,9022.0
mean,57.050052,-66.481632,537.66989,1.489105,102.810271,64.85092,-0.437314,383.24617,3.477532,-169.639349,2301158.0
std,215.08975,65.170704,111.441095,7.207536,299.765506,17.678021,1.541889,133.427564,7.538265,151.830561,16491420.0
min,-151.0,-1771.875,271.437736,-1.999329,-417.263158,-1.0,-6.601977,28.374207,0.01695,-1855.5,4614.371
25%,13.622466,-93.058007,462.323342,-1.820898,29.272727,55.0,-1.14764,315.459437,1.40056,-236.700464,236519.8
50%,39.466667,-56.823529,518.038609,-0.917869,79.071429,68.0,-0.259067,389.2226,2.70108,-130.18254,496551.8
75%,75.823477,-28.815789,595.401507,1.539344,143.425595,78.0,0.125964,455.06588,4.246575,-67.25,942493.2
max,9859.528302,699.2,2623.733374,99.425911,11663.069182,104.0,9.601375,7497.828309,250.0,1156.5,332307300.0


In [27]:
m_af = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_features='sqrt', n_jobs=7, oob_score=True)
m_af.fit(X_train_smote, y_train_smote)

print(f'F1 Score: {f1_score(y_eval, m_af.predict(X_eval), pos_label=0)}')
print(f'Precision: {precision_score(y_eval, m_af.predict(X_eval), pos_label=0)}')
print(f'Recall: {recall_score(y_eval, m_af.predict(X_eval), pos_label=0)}')

F1 Score: 0.8146551724137931
Precision: 0.8111587982832618
Recall: 0.8181818181818182


In [18]:
os.makedirs('datasets/fitted', exist_ok=True)
columns = list(X_train.columns)

columns.append('label')

df = pd.concat([df_train, df_eval])
df.reset_index(inplace=True)

df.to_feather('datasets/fitted/af-dataset-' + lead + '-oversampled')

We save the current model to disk

In [19]:
pickle.dump(m_af, open('models/af/base/lead2-oversampled.sav', 'wb'))

In [117]:
m_af = pickle.load(open('models/af/base/lead2.sav', 'rb'))