# Single-label classifier - AF detection

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [3]:
import sys
sys.path.append("../../tutorials/fastai/old/") # go to parent dir

from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier
from IPython.display import display

from sklearn import metrics
from sklearn.metrics import f1_score, fbeta_score, make_scorer

import mlflow
import mlflow.sklearn

## Helper functions

In [4]:
def display_all(df):
    """Render `df` with pandas' row and column display limits raised to 1000.

    The limits are changed only for the duration of the call (through
    `pd.option_context`), so the global display options are restored
    afterwards.
    """
    widened_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*widened_limits):
        display(df)

In [5]:
def print_fscores(m, x, y):
    """Print `[F1, F2]` for model `m` evaluated on `x` against the true labels `y`.

    Parameters:
        m: fitted estimator exposing `predict(x)`.
        x: feature matrix passed to `m.predict`.
        y: ground-truth labels, passed as the first argument of both scorers.

    Returns:
        None. The two scores are printed as a list.
    """
    # Predict once and reuse the result for both scores.
    preds = m.predict(x)
    res = [f1_score(y, preds), fbeta_score(y, preds, beta=2)]
    print(res)

In [6]:
# Lead identifier; it is interpolated into the dataset file names and the
# MLflow experiment name used further down in this notebook.
lead = 'lead2'

In [7]:
from sklearn.model_selection import train_test_split

# Load the feature table for the selected lead.
df = pd.read_feather('datasets/fitted/af-dataset-' + lead)

# 80/20 hold-out split with a fixed seed so the partition is repeatable.
y = df['label']
X = df.drop(columns='label')
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)

# Re-attach the label as the last column of each partition and discard the
# 'index' column that came in with the feather file.
df_train = X_train.assign(label=y_train).drop(columns=['index'])
df_eval = X_eval.assign(label=y_eval).drop(columns=['index'])
df_train

Unnamed: 0,mean_P_Peaks,mean_Q_Peaks,HRV,mean_T_Peaks,kurt_R_Peaks,age,skew_R_Peaks,fmax,mean_S_Peaks,std_R_Peaks,PT_duration,label
4383,92.050000,-69.550000,331.332091,39.200000,2.548453,53,-2.085590,6.823821,-165.050000,6.420436e+05,519.400000,Non-AF
3538,38.272727,-66.363636,275.412170,62.545455,-1.939266,80,0.116944,3.476934,-60.272727,9.170248e+05,535.090909,Non-AF
4943,33.052632,-159.263158,187.249508,106.684211,-1.991971,53,0.006135,4.072592,-3.473684,2.103260e+06,572.421053,Non-AF
5810,-74.571429,-157.380952,255.537891,364.095238,-1.886651,15,0.097520,1.047819,-268.619048,4.525275e+06,487.809524,Non-AF
3429,137.625000,-164.375000,310.791616,73.062500,-0.468678,76,-1.126299,2.604989,-90.812500,1.158203e+06,481.000000,Non-AF
...,...,...,...,...,...,...,...,...,...,...,...,...
3772,-8.666667,-74.916667,229.332055,132.208333,5.629034,63,1.715212,0.762050,-69.375000,7.042247e+05,574.833333,AF
5191,109.410256,-91.717949,337.199422,45.461538,15.295272,64,-3.656884,2.926359,-145.923077,4.360616e+05,497.948718,Non-AF
5226,11.600000,-28.466667,289.548533,48.933333,0.848942,75,-1.590086,3.101240,-67.533333,3.631268e+05,487.600000,AF
5390,38.600000,-130.600000,307.268516,174.500000,-1.650002,48,-0.294958,2.100840,-193.400000,3.802282e+05,551.800000,Non-AF


In [8]:
# One indicator column per class; in the summary below, the `mean` of each
# column is that class's share of the training split.
labels = pd.get_dummies(df_train.loc[:, 'label'])
labels.describe()

Unnamed: 0,AF,Non-AF
count,5501.0,5501.0
mean,0.179967,0.820033
std,0.384195,0.384195
min,0.0,0.0
25%,0.0,1.0
50%,0.0,1.0
75%,0.0,1.0
max,1.0,1.0


In [9]:
# Turn both partitions into model inputs with fastai's proc_df. The NA
# dictionary returned for the training split is passed to the evaluation call
# so both partitions are processed with the same `na_dict`.
X_train, y_train, nas = proc_df(df_train, 'label')
X_eval, y_eval, nas = proc_df(df_eval, 'label', na_dict=nas)

# random_state pins the bootstrap samples and feature sub-sampling so the
# score below is reproducible on a fresh kernel.
m_af = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_features='sqrt',
                              n_jobs=7, oob_score=True, random_state=42)
m_af.fit(X_train, y_train)

# NOTE(review): f1_score defaults to pos_label=1. If proc_df encodes the labels
# alphabetically (AF -> 0, Non-AF -> 1) this is the F1 of the Non-AF class --
# confirm the encoding and pass pos_label explicitly if AF is the target class.
f1_score(y_eval, m_af.predict(X_eval))

0.9632829373650108

In [10]:
# Record the current model's configuration and hold-out scores as one MLflow run.
mlflow.set_experiment('AF_sampling_testing-' + lead)
with mlflow.start_run() as run:
    # Read the hyper-parameters from the model itself so the logged values
    # cannot drift from what was actually trained. They are stringified to
    # keep the same logged representation as before.
    rf_params = m_af.get_params()
    mlflow.log_params({name: str(rf_params[name])
                       for name in ('n_estimators', 'min_samples_leaf', 'max_features')})
    mlflow.log_param('n_Features', len(X_train.columns))
    # Share of AF rows in the frame `labels` was built from.
    mlflow.log_param('AF mean', labels['AF'].mean())

    # Predict once and reuse the result for both metrics.
    eval_preds = m_af.predict(X_eval)
    mlflow.log_metric('f1_score', f1_score(y_eval, eval_preds))
    mlflow.log_metric('f2_score', fbeta_score(y_eval, eval_preds, beta=2))

### Oversampling to improve AF representation in the dataset

In [10]:
# Class counts, looked up by label rather than by position: value_counts()
# orders by frequency, so positional unpacking would silently swap the two
# counts if AF ever became the majority class.
class_counts = df_train['label'].value_counts()
count_non = class_counts['Non-AF']
count_af = class_counts['AF']

# Divide by class
df_af = df_train[df_train['label'] == 'AF']
df_non = df_train[df_train['label'] == 'Non-AF']

# Draw AF rows with replacement until there are 0.4 x (Non-AF count) of them,
# i.e. AF makes up 0.4 / 1.4 (about 29%) of the resulting training frame.
# random_state makes the draw, and everything trained on it, reproducible.
df_af_over = df_af.sample(int(count_non*0.4), replace=True, random_state=42)
df_train_over = pd.concat([df_non, df_af_over], axis=0)

In [10]:
# Prepend a second copy of every AF row to df_train.
# NOTE(review): this rebinds df_train from itself, so re-running the cell
# duplicates the AF rows again. The model in this notebook is fit on
# df_train_over, while this frame is what is later written to disk.
df_train = pd.concat([df_train[df_train['label'] == 'AF'], df_train])

In [11]:
# Class shares after oversampling: the `mean` of each indicator column is
# that class's fraction of df_train_over.
labels = pd.get_dummies(df_train_over.loc[:, 'label'])
labels.describe()

Unnamed: 0,AF,Non-AF
count,6315.0,6315.0
mean,0.285669,0.714331
std,0.451768,0.451768
min,0.0,0.0
25%,0.0,0.0
50%,0.0,1.0
75%,1.0,1.0
max,1.0,1.0


In [14]:
# Rebuild the model inputs from the oversampled training frame. The evaluation
# split is left un-resampled and is processed with the NA dictionary returned
# for the training frame.
X_train, y_train, nas = proc_df(df_train_over, 'label')
X_eval, y_eval, nas = proc_df(df_eval, 'label', na_dict=nas)

# Same configuration as the baseline forest. random_state pins the bootstrap
# samples and feature sub-sampling so the comparison is reproducible.
m_af = RandomForestClassifier(n_estimators=1000, min_samples_leaf=1, max_features='sqrt',
                              n_jobs=7, oob_score=True, random_state=42)
m_af.fit(X_train, y_train)

# NOTE(review): f1_score defaults to pos_label=1. If proc_df encodes the labels
# alphabetically (AF -> 0, Non-AF -> 1) this is the F1 of the Non-AF class --
# confirm the encoding and pass pos_label explicitly if AF is the target class.
f1_score(y_eval, m_af.predict(X_eval))

0.9648894668400522

In [14]:
# Record the current model's configuration and hold-out scores as one MLflow run.
mlflow.set_experiment('AF_sampling_testing-' + lead)
with mlflow.start_run() as run:
    # Read the hyper-parameters from the model itself so the logged values
    # cannot drift from what was actually trained. They are stringified to
    # keep the same logged representation as before.
    rf_params = m_af.get_params()
    mlflow.log_params({name: str(rf_params[name])
                       for name in ('n_estimators', 'min_samples_leaf', 'max_features')})
    mlflow.log_param('n_Features', len(X_train.columns))
    # Share of AF rows in the frame `labels` was built from.
    mlflow.log_param('AF mean', labels['AF'].mean())

    # Predict once and reuse the result for both metrics.
    eval_preds = m_af.predict(X_eval)
    mlflow.log_metric('f1_score', f1_score(y_eval, eval_preds))
    mlflow.log_metric('f2_score', fbeta_score(y_eval, eval_preds, beta=2))

In [18]:
# Make sure the output directory exists before writing.
os.makedirs('datasets/fitted', exist_ok=True)

# Feature names followed by the label column (not used further in this cell).
columns = [*X_train.columns, 'label']

# Stack the training and evaluation frames and move the row index into a
# regular 'index' column before writing.
# NOTE(review): this stores df_train, not df_train_over, which is the frame
# the model above was fit on -- confirm which one downstream readers expect.
df = pd.concat([df_train, df_eval]).reset_index()

df.to_feather('datasets/fitted/af-dataset-' + lead + '-oversampled')

Save the current model (the one fitted on the oversampled training set) to disk.

In [19]:
# Persist the current model. The context manager guarantees the file is
# flushed and closed even if pickling fails. The file name is built from
# `lead`, like the dataset paths in this notebook, and the target directory
# is created if missing.
os.makedirs('models/af/base', exist_ok=True)
with open('models/af/base/' + lead + '-oversampled.sav', 'wb') as model_file:
    pickle.dump(m_af, model_file)

In [117]:
# Replace m_af with the model stored under the plain '<lead>.sav' name.
# NOTE(review): this is a different file from the '-oversampled' one written
# above, and nothing in this notebook writes it -- confirm it exists.
# pickle.load executes arbitrary code from the file; only load trusted models.
with open('models/af/base/' + lead + '.sav', 'rb') as model_file:
    m_af = pickle.load(model_file)