This notebook provides a template for training classical ML models using k-fold cross validation. The first part, Configurations, is the only section that needs to be editted. Editting the automated code is only needed if a custom pipeline is needed.

#### Classical ML Workflow

1. Convert dataset into .h5 file. See `wearablebp_benchmarks/datasets_to_h5.ipynb`
2. Create folder in `wearablebp_benchmarks/classical_ml` with feature extractor name. Build and run feature extraction algorithm.
3. Save features in `wearablebp_benchmarks/results/features/`. Use naming conventions specified below.
4. Use `wearablebp_benchmarks/classical_ml/train_feats.ipynb` to train model using features extracted in 3. Model is saved in .pkl file using the same name in 3.
5. Use `wearablebp_benchmarks/make_plots.ipynb` to visualize data and compute Explained Deviation metrics

In [2]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import dill as pickle

from scipy.stats import pearsonr
from scipy import stats
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor

# Configurations

In [8]:
class options:
    def __init__(self):
        
        # set model name based on results naming convention (see wearablebp/README.md)
        self.dataset_name = 'mimic'
        self.filter_name = 'hasandazeh19'
        self.alg_name = 'hasandazeh19'
        self.datapath = '../results/features/features_' + self.dataset_name + '_' + self.filter_name + '_' + self.alg_name + '.csv'
        
        # assign feature columns and SBP/DBP columns
        if ('provided' in self.dataset_name) or ('features' in self.dataset_name):
            self.sbp_col = '4'
            self.dbp_col = '5'
            self.feature_cols = np.arange(11, 101).astype(str)
        else:
            self.sbp_col = 'SBP'
            self.dbp_col = 'DBP'

            df = pd.read_csv(self.datapath)
            self.feature_cols = df.columns[(df.columns != 'SBP') & (df.columns != 'DBP') & 
                                           (df.columns != 'MBP') & (df.columns != 'segment_num') 
                                           & (df.columns != 'subject_id')]
        
        # configure machine learning estimators and other hyperparameters
        self.k_folds = 10
        self.scaler = preprocessing.StandardScaler()
        self.sbp_models = [RandomForestRegressor(n_jobs=4,n_estimators=100), 
                           AdaBoostRegressor(n_estimators=200),
                           LinearRegression(n_jobs=4),
                           DecisionTreeRegressor()
                          ]
        self.sbp_model_names = ['RandomForest', 'AdaBoost', 'Linear', 'DecisionTree']
        self.dbp_models = [RandomForestRegressor(n_jobs=4,n_estimators=100), 
                           AdaBoostRegressor(n_estimators=200),
                           LinearRegression(n_jobs=4),
                           DecisionTreeRegressor()
                          ]
        self.dbp_model_names = ['RandomForest', 'AdaBoost', 'Linear', 'DecisionTree']

# Automated Code (no configuration needed unless using custom pipeline)

In [9]:
opt = options()
df = pd.read_csv(opt.datapath)

# make changes from provided features.csv on github
# available at https://github.com/navidhasanzadeh/BPPPG
# features extracted from script have been filtered, so this part is unneeded
if ('provided' in opt.datapath) or ('features' in opt.dataset_name):
    df = df.drop(['0','1','2','3','6','7','8','9'], axis=1)
    df = df.replace([np.inf, -np.inf],np.nan)
    df = df.fillna(df.mean())
    df = df.reset_index(drop=True)
    df = df.drop(df[df['4'] < 90].index)
    df = df.drop(df[df['5'] < 40].index)
    df = df.drop(df[df['10'] < 50].index)
    df = df.drop(df[df['4'] > 200].index)
    df = df.drop(df[df['5'] > 100].index)
    df = df.drop(df[df['10'] > 140].index)
    df = df.drop(['10'], axis=1)
else:
    df = df.drop(df[df[opt.sbp_col] < 90].index)
    df = df.drop(df[df[opt.dbp_col] < 40].index)
    df = df.drop(df[df[opt.sbp_col] > 200].index)
    df = df.drop(df[df[opt.dbp_col] > 100].index)
feature_df = df.copy()

# build classical ML model
fold_size = int(len(feature_df)/opt.k_folds)
sbp_std = feature_df[opt.sbp_col].std()
dbp_std = feature_df[opt.dbp_col].std()
sbp_model_results = {}
dbp_model_results = {}
for m in tqdm(range(len(opt.sbp_models))):
    sbp_ests = np.array([])
    sbp_gts = np.array([])
    sbp_maes = np.array([])
    sbp_mes = np.array([])
    sbp_raw_model_result = {}
    dbp_ests = np.array([])
    dbp_gts = np.array([])
    dbp_maes = np.array([])
    dbp_mes = np.array([])
    dbp_raw_model_result = {}
    print('Training Models: ' + opt.sbp_model_names[m] + ' for SBP and ' + opt.sbp_model_names[m] + ' for DBP')
    for i in tqdm(range(1, opt.k_folds)):
        print('Training ' + 'fold number ' + str(i))
        trainData = feature_df.loc[0:i*fold_size]
        trainData = trainData.append(feature_df.loc[(i+1)*fold_size:])
        testData = feature_df.loc[i*fold_size:(i+1)*fold_size]
        Xy_cols = np.append([opt.sbp_col, opt.dbp_col], opt.feature_cols)
        trainData = trainData[Xy_cols]
        testData = testData[Xy_cols]

        X_train = np.array(trainData[opt.feature_cols])
        y_train = np.array(trainData[[opt.sbp_col, opt.dbp_col]])
        X_test = np.array(testData[opt.feature_cols])
        y_test = np.array(testData[[opt.sbp_col, opt.dbp_col]])    

        scaler = opt.scaler.fit(X_train)
        X_train = scaler.transform(X_train) 
        X_test = scaler.transform(X_test)   
        
        X_train[np.isinf(X_train)] = 0
        X_train[np.isnan(X_train)] = 0
        X_test[np.isinf(X_test)] = 0
        X_test[np.isnan(X_test)] = 0

        sbp_model = opt.sbp_models[m]
        sbp_model.fit(X_train, y_train[:, 0])
        sbp_est = sbp_model.predict(X_test) 
        dbp_model = opt.dbp_models[m]
        dbp_model.fit(X_train, y_train[:, 1])
        dbp_est = dbp_model.predict(X_test)
        
        sbp_gts = np.append(sbp_gts, y_test[:, 0])
        sbp_ests = np.append(sbp_ests, sbp_est)
        sbp_mes = np.append(sbp_mes, sbp_est - y_test[:, 0])
        dbp_gts = np.append(dbp_gts, y_test[:, 1])
        dbp_ests = np.append(dbp_ests, dbp_est)
        dbp_mes = np.append(dbp_mes, dbp_est - y_test[:, 1])
    
    sbp_model_result = {}
    sbp_model_result['raw ests'] = sbp_ests
    sbp_model_result['raw gts'] = sbp_gts
    sbp_model_result['bias'] = (sbp_ests - sbp_gts).mean()
    sbp_model_result['err std'] = (sbp_ests - sbp_gts).std()
    sbp_model_result['ED'] = sbp_std/(sbp_ests - sbp_gts).std()
    sbp_model_result['dist std'] = sbp_std
    sbp_model_results[opt.sbp_model_names[m]] = sbp_model_result
    dbp_model_result = {}
    dbp_model_result['raw ests'] = dbp_ests
    dbp_model_result['raw gts'] = dbp_gts
    dbp_model_result['bias'] = (dbp_ests - dbp_gts).mean()
    dbp_model_result['err std'] = (dbp_ests - dbp_gts).std()
    dbp_model_result['ED'] = dbp_std/(dbp_ests - dbp_gts).std()
    sbp_model_result['dist std'] = sbp_std
    dbp_model_results[opt.dbp_model_names[m]] = dbp_model_result

pickle_dict = {'sbp': sbp_model_results, 'dbp': dbp_model_results, 'opt': opt}
with open('../results/training/' + opt.dataset_name + '_' + opt.filter_name + '_' + opt.alg_name + '.pickle', 'wb') as f:
    pickle.dump(pickle_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/9 [00:00<?, ?it/s][A

Training Models: RandomForest for SBP and RandomForest for DBP
Training fold number 1



 11%|█         | 1/9 [00:01<00:13,  1.64s/it][A

Training fold number 2



 22%|██▏       | 2/9 [00:03<00:11,  1.60s/it][A

Training fold number 3



 33%|███▎      | 3/9 [00:04<00:09,  1.58s/it][A

Training fold number 4



 44%|████▍     | 4/9 [00:06<00:07,  1.55s/it][A

Training fold number 5



 56%|█████▌    | 5/9 [00:07<00:06,  1.55s/it][A

Training fold number 6



 67%|██████▋   | 6/9 [00:09<00:04,  1.55s/it][A

Training fold number 7



 78%|███████▊  | 7/9 [00:10<00:03,  1.54s/it][A

Training fold number 8



 89%|████████▉ | 8/9 [00:12<00:01,  1.52s/it][A

Training fold number 9



100%|██████████| 9/9 [00:13<00:00,  1.53s/it][A
 25%|██▌       | 1/4 [00:13<00:41, 13.80s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A

Training Models: AdaBoost for SBP and AdaBoost for DBP
Training fold number 1



 11%|█         | 1/9 [00:03<00:29,  3.68s/it][A

Training fold number 2



 22%|██▏       | 2/9 [00:07<00:25,  3.67s/it][A

Training fold number 3



 33%|███▎      | 3/9 [00:10<00:21,  3.66s/it][A

Training fold number 4



 44%|████▍     | 4/9 [00:14<00:18,  3.66s/it][A

Training fold number 5



 56%|█████▌    | 5/9 [00:18<00:14,  3.65s/it][A

Training fold number 6



 67%|██████▋   | 6/9 [00:21<00:10,  3.65s/it][A

Training fold number 7



 78%|███████▊  | 7/9 [00:25<00:07,  3.64s/it][A

Training fold number 8



 89%|████████▉ | 8/9 [00:29<00:03,  3.63s/it][A

Training fold number 9



100%|██████████| 9/9 [00:32<00:00,  3.64s/it][A
 50%|█████     | 2/4 [00:46<00:38, 19.49s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A
100%|██████████| 9/9 [00:00<00:00, 72.46it/s][A
 75%|███████▌  | 3/4 [00:46<00:13, 13.68s/it]
  0%|          | 0/9 [00:00<?, ?it/s][A

Training Models: Linear for SBP and Linear for DBP
Training fold number 1
Training fold number 2
Training fold number 3
Training fold number 4
Training fold number 5
Training fold number 6
Training fold number 7
Training fold number 8
Training fold number 9
Training Models: DecisionTree for SBP and DecisionTree for DBP
Training fold number 1



 22%|██▏       | 2/9 [00:00<00:00, 10.39it/s][A

Training fold number 2
Training fold number 3
Training fold number 4



 44%|████▍     | 4/9 [00:00<00:00, 10.47it/s][A
 67%|██████▋   | 6/9 [00:00<00:00, 10.42it/s][A

Training fold number 5
Training fold number 6
Training fold number 7



 78%|███████▊  | 7/9 [00:00<00:00, 10.25it/s][A
100%|██████████| 9/9 [00:00<00:00, 10.41it/s][A
100%|██████████| 4/4 [00:47<00:00, 11.89s/it]

Training fold number 8
Training fold number 9



