# Final Script -- 30/11/2019

In [3]:
import numpy as np
import pandas as pd
from biosppy.signals import ecg
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
import xgboost as xgb

In [5]:
# Read the three CSV tables in the same order as before and remove the "id"
# column from each one, so only the remaining columns are kept.
xtrain, xtest, ytrain = (
    pd.read_csv(csv_path).drop(columns="id")
    for csv_path in ("data/X_train.csv", "data/X_test.csv", "data/y_train.csv")
)

print(xtrain.shape, xtest.shape, ytrain.shape)

(5117, 17807) (3411, 17807) (5117, 1)


In [6]:
def mean_sqrd_diff(rpeaks):
    """Return the mean of the squared successive differences of ``rpeaks``.

    The differences between neighbouring entries are taken with ``np.diff``,
    each difference is squared, and the squares are averaged.
    """
    steps = np.diff(rpeaks)
    return np.mean(np.square(steps))

def _summary_stats(values, axis=None):
    """Return mean, median, min, max and std of ``values`` as one flat array.

    With ``axis=None`` each statistic is a scalar, so five numbers come back.
    With an explicit ``axis`` each statistic is an array and the five arrays
    are concatenated in that same order.
    """
    return np.hstack([
        np.mean(values, axis=axis),
        np.median(values, axis=axis),
        np.min(values, axis=axis),
        np.max(values, axis=axis),
        np.std(values, axis=axis),
    ])


def obtain_features(signal, sampling_rate):
    """Build a flat feature vector for one recording using BioSPPy's ECG tools.

    The signal is processed with ``ecg.ecg``, the detected R-peak indices are
    re-aligned with ``ecg.correct_rpeaks``, and summary statistics
    (mean, median, min, max, std) are collected in this fixed order:

    1. signal values at the R-peaks
    2. R-peak indices
    3. square root of ``mean_sqrd_diff`` of the R-peak indices
    4. successive differences of the R-peak indices
    5. heartbeat templates, per template sample (``axis=0``)
    6. heart rate values, then heart rate time stamps
    7. successive differences of heart rate values, then of the time stamps

    Parameters
    ----------
    signal : numpy.ndarray
        One recording; it is indexed with the R-peak index array, so it must
        support integer-array indexing.
    sampling_rate : number
        Forwarded unchanged to the BioSPPy calls.

    Returns
    -------
    numpy.ndarray
        1-D float array holding 36 scalar statistics plus five statistics for
        every template sample.
    """
    # ecg.ecg returns, in order: ts, filtered, rpeaks, templates_ts, templates,
    # heart_rate_ts, heart_rate. Only the entries used below are kept.
    _, _, rpeaks, _, templates, heart_rate_ts, heart_rate = ecg.ecg(signal, sampling_rate, show=False)

    # Re-align the R-peak locations on the given signal within the tolerance.
    # correct_rpeaks returns a ReturnTuple; take the index array out of it so
    # everything below works on a plain 1-D array rather than on a 1-tuple.
    rpeaks = ecg.correct_rpeaks(signal=signal, rpeaks=rpeaks, sampling_rate=sampling_rate, tol=0.01)["rpeaks"]

    # rpeaks only holds indices; look up the signal values at those positions.
    peak_values = signal[rpeaks]

    # min/max are undefined on empty arrays, so substitute NaNs in that case.
    if len(heart_rate) == 0:
        heart_rate = np.array([np.nan, np.nan])
    if len(heart_rate_ts) == 0:
        heart_rate_ts = np.array([np.nan, np.nan])

    parts = [
        _summary_stats(peak_values),
        _summary_stats(rpeaks),
        np.sqrt(mean_sqrd_diff(rpeaks)),
        _summary_stats(np.diff(rpeaks)),
        _summary_stats(templates, axis=0),
        _summary_stats(heart_rate),
        _summary_stats(heart_rate_ts),
    ]

    # A single heart-rate value has no successive difference, and min/max of an
    # empty difference array would raise, so use NaNs for the difference stats.
    # This happens after the plain statistics above on purpose: those still use
    # the single real value.
    if len(heart_rate) == 1:
        heart_rate = np.array([np.nan, np.nan])
    if len(heart_rate_ts) == 1:
        heart_rate_ts = np.array([np.nan, np.nan])
    parts.append(_summary_stats(np.diff(heart_rate)))
    parts.append(_summary_stats(np.diff(heart_rate_ts)))

    # FFT of the mean template was tried as an extra feature and removed again:
    # it brought no improvement.

    return np.hstack(parts)

In [7]:
SAMPLING_RATE = 300  # sampling rate handed to obtain_features for every row


def build_feature_matrix(frame, sampling_rate=SAMPLING_RATE):
    """Run ``obtain_features`` on every row of ``frame`` and stack the results.

    Parameters
    ----------
    frame : pandas.DataFrame
        One recording per row. NaN entries of a row are dropped before the
        row is passed on.
    sampling_rate : number
        Forwarded to ``obtain_features``.

    Returns
    -------
    numpy.ndarray
        2-D array with one feature vector per row of ``frame``.
    """
    # Collect all vectors first and stack once; appending to an array inside
    # the loop would copy the whole matrix on every iteration.
    feature_rows = [
        obtain_features(np.array(frame.iloc[i].dropna()), sampling_rate)
        for i in range(frame.shape[0])
    ]
    return np.vstack(feature_rows)


X_train = build_feature_matrix(xtrain)
X_test = build_feature_matrix(xtest)

# Flatten the single-column target table into a 1-D label vector.
y_train = np.ravel(np.array(ytrain.values))

print(X_train.shape, y_train.shape, X_test.shape)

(5117, 936) (5117,) (3411, 936)


In [8]:
# Use a random 40% subset of the training rows for model selection.
# The labels are attached as a temporary "y" column so features and labels are
# sampled together; random_state pins the subset so every run draws the same rows.
train_frame = pd.DataFrame(X_train)
subset = train_frame.assign(y=y_train).sample(frac=0.40, replace=False, axis=0, random_state=42)
y_sub = subset['y']
X_sub = subset.drop('y', axis=1).values

# X_train stays a DataFrame of the features only (no "y" column), as before.
X_train = train_frame
print(y_sub.shape, X_sub.shape)

# Score function used by the grid searches: micro-averaged F1, higher is better.
scorer_f1 = make_scorer(f1_score, greater_is_better=True, average='micro')

(2047,) (2047, 936)


In [7]:
# ---------------------------------------------------------------------------
# Not executed: SVC grid search, kept as a record of an earlier experiment.
# NOTE(review): X, y, xtrain_scaled, xtest_scaled and make_submission are not
# defined in the visible notebook, so this cannot be re-enabled as is.
#
# steps = [("impute", SimpleImputer()),
#          ("scaler", StandardScaler()),
#          ("classifier", SVC())]
# pipeline = Pipeline(steps = steps)
#
# parameters = {"impute__strategy": ["mean", "median", "constant"],
#               "impute__fill_value": [0],
#               "classifier__kernel": ["rbf", "poly"],
#               "classifier__gamma": ["auto"],
#               "classifier__C": [15,30,45,60,75],
#               "classifier__class_weight": ["balanced"],
#               "classifier__degree": [2,4,6,8]
#              }
#
# grid = GridSearchCV(pipeline, parameters, cv = 5, scoring = scorer_f1, verbose = 2)
#
# grid.fit(X, y)
# print(grid.best_score_)
# print(grid.best_params_)
#
# estimator = SVC(C = grid.best_params_['classifier__C'], gamma = 'auto',
#                 class_weight = 'balanced',
#                 kernel = grid.best_params_['classifier__kernel'],
#                 degree = grid.best_params_['classifier__degree'])
#
# estimator.fit(xtrain_scaled, y)
# pred = estimator.predict(xtest_scaled)
# make_submission("prediction_trial.csv", pred)
# ---------------------------------------------------------------------------

# Gradient boosting: grid search over the imputation strategy and model settings.
# StandardScaler is referenced directly because the notebook imports the class
# itself, not the sklearn `preprocessing` module. The classifier is seeded so
# that repeated searches score each candidate identically.
steps = [("impute", SimpleImputer()),
         ("scaler", StandardScaler()),
         ("classifier", GradientBoostingClassifier(random_state=42))]
pipeline = Pipeline(steps=steps)

# "impute__fill_value" is only used by the "constant" imputation strategy.
parameters = {"impute__strategy": ["mean", "median", "constant"],
              "impute__fill_value": [0],
              "classifier__max_depth": [3,4,5,6,7,8],
              "classifier__n_estimators": [200,250,300],
              "classifier__learning_rate": [0.1,0.08,0.05,0.03],
              "classifier__max_features": [40,50,60]
             }

# NOTE(review): n_jobs is left unset, so the candidates are fitted one by one.
grid = GridSearchCV(pipeline, parameters, cv=2, scoring=scorer_f1, verbose=1)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.best_params_)

# ---------------------------------------------------------------------------
# Not executed: XGBoost grid search on the 40% subset, kept as a record.
#
# steps = [("impute", SimpleImputer()),
#          ("scaler", StandardScaler()),
#          ("classifier", xgb.XGBClassifier())]
# pipeline = Pipeline(steps = steps)
#
# parameters = {"impute__strategy": ["mean", "median", "constant"],
#               "impute__fill_value": [0],
#               "classifier__max_depth": [5,10,15],
#               "classifier__n_estimators": [200],
#               "classifier__learning_rate": [0.05,0.1],
#               "classifier__max_features": [20,40]
#              }
#
# grid = GridSearchCV(pipeline, parameters, cv = 2, scoring = scorer_f1, verbose = 1)
#
# grid.fit(X_sub, y_sub)
# print(grid.best_score_)
# print(grid.best_params_)
# ---------------------------------------------------------------------------

Fitting 2 folds for each of 648 candidates, totalling 1296 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1296 out of 1296 | elapsed: 577.2min finished


0.8151260504201681
{'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__max_features': 60, 'classifier__n_estimators': 250, 'impute__fill_value': 0, 'impute__strategy': 'median'}


'\n## XGB APPROACH -- GRID-SEARCH CV\n\n\nsteps = [("scaler", preprocessing.StandardScaler()), ("classifier", xgb.XGBClassifier())]\npipeline = Pipeline(steps = steps)\n\nparameters = {"classifier__max_depth": [5,10,15],\n              "classifier__n_estimators": [200],\n              "classifier__learning_rate": [0.05,0.1],\n              "classifier__max_features": [20,40]\n             }\n\ngrid = GridSearchCV(pipeline, parameters, cv = 2, scoring = scorer_f1, verbose = 1)\n\ngrid.fit(X_sub, y_sub)\nprint(grid.best_score_)\nprint(grid.best_params_)\n\n'

In [10]:
# Replace NaNs with per-column medians learned from the training features.
# The same fitted imputer is applied to the test features, so both sets are
# filled with identical values and keep the same columns. Fitting a second
# imputer on the test set would use different medians.
impute1 = SimpleImputer(strategy='median')
X_train = impute1.fit_transform(X_train)
X_test = impute1.transform(X_test)

# Rescale both sets with the mean and variance of the training features only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyper-parameters are the best ones reported by the grid search. The
# estimator is seeded so that the submission can be reproduced.
estimator = GradientBoostingClassifier(n_estimators=250,
                                       max_depth=5,
                                       learning_rate=0.1,
                                       max_features=60,
                                       random_state=42)

estimator.fit(X_train, y_train)
predictions = estimator.predict(X_test)

# Write the predictions into the "y" column of the sample file's layout.
sample = pd.read_csv("data/sample.csv")
sample["y"] = predictions
sample.to_csv("submission.csv", index=False)