In [1]:
# import essential libraries
import pandas as pd
import numpy as np
import pickle
import scipy
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, SGDClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_error, f1_score
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import svm, linear_model
from sklearn import tree, metrics
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
import lightgbm
from bayes_opt import BayesianOptimization
from catboost import CatBoostClassifier, cv, Pool
import gzip

### Training (All Features)

In [None]:
df = pd.read_csv('../../../data/afib_data/coorteeqsrafva.csv', sep=';', header=0, index_col=0)
df.head()

In [None]:
df['scp_codes'].value_counts()

In [None]:
sub_df = df.drop(columns=['diagnosi', 'ecg_id', 'patient_id', 'recording_date', 'scp_codes', 'infarction_stadium1', 'infarction_stadium2', 'initial_autogenerated_report', 'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems', 'extra_beats', 'filename_lr', 'filename_hr'])
sub_df.head()

### Training 1 (11 Features)

In [None]:
# train test
df = pd.read_csv('../../../data/afib_data/training_1.csv')
print(df.shape)
df.head()

In [None]:
X = df.drop(columns='ritmi')
y = df['ritmi']

In [None]:
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.25, random_state = 246)

### Random Forest

In [None]:
# Declare a variable called entr_model and use tree.DecisionTreeClassifier. 
entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 1234)

# Call fit() on entr_model
entr_model.fit(X_train, y_train)

# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = entr_model.predict(X_test)

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
y_pred = pd.Series(y_pred)

# Check out entr_model
entr_model

In [None]:
# Run this block for model evaluation metrics 
print("Model Entropy - no max depth")
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

In [None]:
# Plug in appropriate max_depth and random_state parameters 
firstRFModel = RandomForestClassifier(max_depth= 7, random_state= 1234)
firstRFModel.fit(X_train, y_train)
y_pred = firstRFModel.predict(X_test)
y_pred = pd.Series(y_pred)

# Model and fit
print("RandomForest model - max depth 7")
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

### LightGBM

In [None]:
def lgb_eval(num_leaves,max_depth,lambda_l2,lambda_l1,min_child_samples, min_data_in_leaf):
    params = {
        "objective" : "binary",
        "metric" : "auc", 
        'is_unbalance': True,
        "num_leaves" : int(num_leaves),
        "max_depth" : int(max_depth),
        "lambda_l2" : lambda_l2,
        "lambda_l1" : lambda_l1,
        "num_threads" : 20,
        "min_child_samples" : int(min_child_samples),
        'min_data_in_leaf': int(min_data_in_leaf),
        "learning_rate" : 0.03,
        "subsample_freq" : 5,
        "bagging_seed" : 42,
        "verbosity" : -1
    }
    lgtrain = lightgbm.Dataset(X_train, y_train)
    cv_result = lightgbm.cv(params,
                       lgtrain,
                       1000,
                       early_stopping_rounds=100,
                       stratified=True,
                       nfold=3)
    return cv_result['auc-mean'][-1]

In [None]:
df.info()

In [None]:
# lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (25, 4000),
#                                                 'max_depth': (5, 63),
#                                                 'lambda_l2': (0.0, 0.05),
#                                                 'lambda_l1': (0.0, 0.05),
#                                                 'min_child_samples': (50, 10000),
#                                                 'min_data_in_leaf': (100, 2000)
#                                                 })

# lgbBO.maximize(n_iter=10, init_points=2)

### Regressions

In [None]:
# clfl2=LogisticRegression(max_iter=1000000)
# parameters = {'C': [10000], 'solver': ['saga'],  'multi_class': ['auto']} # 0.4681891485581523

# clfl2 = svm.SVC()
# parameters = {'kernel':['linear'], 'C':[8]} #0.45041137078037446

# clfl2 = LogisticRegressionCV(max_iter=100000)
# parameters = {"Cs": [10], 'solver': ['saga'], 'fit_intercept':[True], 'penalty': ['l1']} # 0.4711466447997813

# clfl2 = RidgeClassifier(max_iter=1000)
# parameters = {'alpha': [0.9], 'solver': ['auto']} # 0.897

clfl2 = KNeighborsClassifier()
parameters = {'n_neighbors': [170], 'weights': ['distance'], 'metric': ['euclidean']} #0.4830258302583026

# clfl2 = SGDClassifier(max_iter=100000)
# parameters = {'loss': ['epsilon_insensitive'], 'alpha': [0.0001], 'penalty': ['l2'], 'epsilon': [0.5], 'learning_rate': ['optimal'], 'eta0': [1.0]} # 0.897

fitmodel = GridSearchCV(clfl2, param_grid=parameters, cv=5, refit=True, scoring="accuracy", n_jobs=-1, verbose=3)
fitmodel.fit(X_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

### Training 2 (1 Feature)

In [None]:
# train test
df2 = pd.read_csv('../../../data/afib_data/training_2.csv')
print(df2.shape)
df2.head()

In [None]:
X = df2['leads']
y = df2['ritmi']
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.25, random_state = 246)

In [2]:
df = pd.read_csv('../../../data/afib_data/coorteeqsrafva.csv', sep=';', header=0, index_col=0)
label_df = df[['ritmi']]
label_df['ritmi'] = label_df['ritmi'].map({'SR': 0, 'AF': 1, 'VA': 2}).values
label_df

Unnamed: 0,ritmi
0,2
1,1
2,0
3,2
4,2
...,...
6423,2
6424,2
6425,0
6426,1


In [3]:
# f = gzip.GzipFile("../../../data/afib_data/compressed_npy.gz", "w")
# np.save(file=f, arr=ecg_arr)
# f.close()

In [4]:
f = gzip.GzipFile('../../../data/afib_data/compressed_npy.gz', "r")
X = np.load(f)

In [5]:
X.shape

(6428, 5000, 12)

In [6]:
y = label_df

In [7]:
new_arr = X.reshape(X.shape[0], (X.shape[1]*X.shape[2]))

In [8]:
new_arr.shape

(6428, 60000)

In [9]:
X_train, X_test, y_train, y_test=train_test_split(new_arr, y, test_size = 0.2)

In [None]:
# Declare a variable called entr_model and use tree.DecisionTreeClassifier. 
entr_model = tree.DecisionTreeClassifier(criterion="entropy", random_state = 1234)

# Call fit() on entr_model
entr_model.fit(X_train, y_train)

# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = entr_model.predict(X_test)

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
y_pred = pd.Series(y_pred)

# Check out entr_model
entr_model

In [None]:
# Run this block for model evaluation metrics 
print("Model Entropy - no max depth")
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

In [None]:
# Plug in appropriate max_depth and random_state parameters
rf = RandomForestClassifier()
rf_param_grid = {'n_estimators': [100], 'criterion': ['gini'], 'max_depth': [5]}
# rf_param_grid = {'n_estimators': [100,200], 'criterion': ['gini', 'entropy'], 'max_depth': [5,10]}
rf_cv= GridSearchCV(rf,rf_param_grid,cv=5)
rf_cv.fit(X_train,y_train)

# y_pred = rf.predict(X_test)
# y_pred = pd.Series(y_pred)

# Model and fit
print("RandomForest model - max depth 7")
print("Accuracy:", metrics.accuracy_score(y_test,y_pred))
print("Balanced accuracy:", metrics.balanced_accuracy_score(y_test,y_pred))

In [None]:
print("Best Score:" + str(rf_cv.best_score_))
print("Best Parameters: " + str(rf_cv.best_params_))

In [None]:
clfl2=LogisticRegression(max_iter=1000000)
parameters = {'C': [10000], 'solver': ['saga'],  'multi_class': ['auto']} # 0.4681891485581523

# clfl2 = svm.SVC()
# parameters = {'kernel':['linear'], 'C':[8]} #0.45041137078037446

# clfl2 = LogisticRegressionCV(max_iter=100000)
# parameters = {"Cs": [10], 'solver': ['saga'], 'fit_intercept':[True], 'penalty': ['l1']} # 0.4711466447997813

# clfl2 = RidgeClassifier(max_iter=1000)
# parameters = {'alpha': [0.9], 'solver': ['auto']} # 0.897

# clfl2 = KNeighborsClassifier()
# parameters = {'n_neighbors': [170], 'weights': ['distance'], 'metric': ['euclidean']} #0.4830258302583026

# clfl2 = SGDClassifier(max_iter=100000)
# parameters = {'loss': ['epsilon_insensitive'], 'alpha': [0.0001], 'penalty': ['l2'], 'epsilon': [0.5], 'learning_rate': ['optimal'], 'eta0': [1.0]} # 0.897

fitmodel = GridSearchCV(clfl2, param_grid=parameters, cv=5, refit=True, scoring="accuracy", n_jobs=-1, verbose=3)
fitmodel.fit(X_train, y_train)
print(fitmodel.best_estimator_, fitmodel.best_params_, fitmodel.best_score_)

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors':49}
knn = KNeighborsClassifier()
knn_cv= GridSearchCV(knn,param_grid,cv=5)
knn_cv.fit(new_arr,y)

print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))

In [10]:
from sklearn.ensemble import RandomForestClassifier

# rf_param_grid = {'n_estimators':np.arange(48,50)} # Best Score:0.5003130307655924; Best Parameters: {'n_estimators': 49}
rf_param_grid = {'n_estimators':np.arange(50,60)}
rf = RandomForestClassifier()
rf_cv= GridSearchCV(rf,rf_param_grid,cv=5,verbose=3,n_jobs=-1)
rf_cv.fit(X_train,y_train)

print("Best Score:" + str(rf_cv.best_score_))
print("Best Parameters: " + str(rf_cv.best_params_))

Fitting 5 folds for each of 10 candidates, totalling 50 fits


MemoryError: Unable to allocate 1.84 GiB for an array with shape (4113, 60000) and data type float64