In [1]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
pip install neurokit2 # install neurokit which is a python module for signal processing

Collecting neurokit2
[?25l  Downloading https://files.pythonhosted.org/packages/4e/4a/d2a9502942cb60e61c9ba9772c04ebd0a945fe248ed42cb520334da582b2/neurokit2-0.1.1-py2.py3-none-any.whl (990kB)
[K     |████████████████████████████████| 993kB 5.2MB/s 
Installing collected packages: neurokit2
Successfully installed neurokit2-0.1.1


In [3]:
pip install biosppy==0.6.1 # install biosppy which is a python module for signal processing

Collecting biosppy==0.6.1
[?25l  Downloading https://files.pythonhosted.org/packages/b7/60/d09a277f9d31a2fc9190edf7e8a685c4f9b54b5dff487f523b916f441e1a/biosppy-0.6.1-py2.py3-none-any.whl (76kB)
[K     |████▎                           | 10kB 13.9MB/s eta 0:00:01[K     |████████▋                       | 20kB 19.7MB/s eta 0:00:01[K     |████████████▉                   | 30kB 10.3MB/s eta 0:00:01[K     |█████████████████▏              | 40kB 8.2MB/s eta 0:00:01[K     |█████████████████████▍          | 51kB 5.4MB/s eta 0:00:01[K     |█████████████████████████▊      | 61kB 6.3MB/s eta 0:00:01[K     |██████████████████████████████  | 71kB 6.3MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 4.0MB/s 
Collecting bidict
  Downloading https://files.pythonhosted.org/packages/67/d4/eaf9242722bf991e0955380dd6168020cb15a71cc0d3cc2373f4911b1f1d/bidict-0.21.2-py2.py3-none-any.whl
Collecting shortuuid
  Downloading https://files.pythonhosted.org/packages/25/a6/2ecc1daa6a304

In [4]:
pip install mne# install mne which is a python module for signal processing

Collecting mne
[?25l  Downloading https://files.pythonhosted.org/packages/60/f7/2bf5de3fad42b66d00ee27539bc3be0260b4e66fdecc12f740cdf2daf2e7/mne-0.23.0-py3-none-any.whl (6.9MB)
[K     |████████████████████████████████| 7.0MB 4.9MB/s 
Installing collected packages: mne
Successfully installed mne-0.23.0


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time 
import re
import csv
import scipy.io
import biosppy
import mne
import neurokit2 as nk
import ast
import os
import scipy.io
from sklearn.preprocessing import LabelEncoder
import time
import datetime
from datetime import datetime
import glob
from scipy.stats import zscore, norm
from neurokit2 import eda_phasic
from scipy.stats import linregress
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [6]:
# this function is to convert the TimeStamp column (first column) from Unix Epoch time to standard datetime format
def TimeStamp_Conversion(ts):
  """
  Convert a Unix epoch timestamp expressed in milliseconds to seconds.

  parameters:
  -----
  ts = Epoch timestamp in milliseconds; anything float() accepts,
       e.g. the string '1.5789360034388428E12'.

  Returns:
  -----
  Std_Unix = the same instant as a float number of seconds since the epoch
             (about 1578936003.4388428 for the example above). NaN in gives NaN out.

  """

  # The previous version also built a datetime with datetime.fromtimestamp() and
  # then discarded it. That call raises ValueError for NaN (e.g. an empty
  # annotation cell) or out-of-range values, so it could only ever crash the
  # conversion without contributing to the result.
  Std_Unix = float(ts) / 1000  # milliseconds -> seconds

  return Std_Unix

In [7]:
def column_formatting(Timestamp_DF):
  """
  Build clean column names for the timestamp annotation table.

  Every column after the first carries a short positional prefix in front of the
  real name (e.g. 'A1- ECG baseline start'). The first four characters are cut
  off and any leading whitespace that remains is stripped.

  Parameters:
  -----
  Timestamp_DF = DataFrame read from the timestamp annotation excel file.

  Returns:
  -----
  Parsed_ColumnNames = list of names, starting with 'Subject_ID' for the first
                       (participant ID) column, followed by the parsed names,
                       e.g. ['Subject_ID', 'ECG baseline start', 'ECG baseline end', ...]

  """

  raw_names = list(Timestamp_DF.columns)

  # The first column is replaced outright; the rest lose their 4-character prefix.
  cleaned = [name[4:].lstrip() for name in raw_names[1:]]

  return ['Subject_ID'] + cleaned

In [8]:
def Annotation_timestamp(timestamp_path, sheet_name):
  """
  Load the timestamp annotation sheet, clean its column names and convert every
  timestamp from epoch milliseconds to epoch seconds.

  Parameters:
  -----
  timestamp_path = path of the annotation excel file.
  sheet_name = name of the sheet to read (the notebook works on the sheet named 'D').

  Results:
  -----
  VR_Timestamps_D = DataFrame with a 'Subject_ID' column followed by the event
                    timestamp columns, all expressed in seconds.

  """

  # Fix: the file is read from the `timestamp_path` argument. The previous code
  # referenced the notebook-level variable `Timestamp_path`, silently ignoring
  # the argument (or raising NameError when that global did not exist).
  VR_TimeStamps_D = pd.read_excel(timestamp_path, sheet_name=sheet_name)

  # Replace the prefixed excel headers with the parsed names.
  VR_TimeStamps_D.columns = column_formatting(VR_TimeStamps_D)

  # Every column except the participant ID holds a millisecond timestamp.
  for col in VR_TimeStamps_D.columns:
    if col != 'Subject_ID':
      VR_TimeStamps_D[col] = VR_TimeStamps_D[col].map(TimeStamp_Conversion)

  return VR_TimeStamps_D

In [9]:
def Shimmers_csv2DF(path,filename):
  """
  Read one Shimmer export file and return it as a DataFrame together with the
  participant ID taken from the file name.

  File layout expected by this parser: line 1 is a delimiter hint and is
  discarded, line 2 holds the tab-separated column names, line 3 the units,
  and every following line one tab-separated sample.

  Parameters:
  -----
  path = path to directory of shimmers file folder.

  filename = name of the file to be loaded; the first standalone number in it
             (underscores count as separators) is the participant ID.

  Results:
  -----
  Dataframe = samples as strings, one column per signal, index reset to 0..n-1.
  Participant_ID = participant number as int.

  Raises:
  -----
  IndexError = if the file is empty or the file name contains no number.

  """

  # newline='' is what the csv module asks for so it can handle line endings itself.
  with open(os.path.join(path, filename), 'r', newline='') as file:
    lists_eachrow = list(csv.reader(file))

  del lists_eachrow[0] # first line only declares the \t delimiter

  # csv.reader splits on commas; the real delimiter is a tab, so split every
  # parsed cell again on '\t' (blank lines produce no cells and vanish here).
  newlists = [cell.split('\t') for list_row in lists_eachrow for cell in list_row]

  # Extract subjectID from the file name for future use.
  # int() instead of ast.literal_eval: the match is digits only, and
  # literal_eval rejects zero-padded IDs such as '0877' with a SyntaxError.
  filename_parse = filename.replace("_", " ")
  Participant_ID = int(re.findall(r'\b\d+\b', filename_parse)[0])

  # First parsed row provides the column names; rows 0 and 1 (names and units)
  # are then dropped because they are not samples.
  Dataframe = pd.DataFrame(newlists, columns = newlists[0])
  Dataframe = Dataframe.drop([0,1]).reset_index(drop=True)

  return Dataframe, Participant_ID

In [10]:
# The IBI (inter-beat interval, or RR interval) columns of the ECG data mark the R-peak locations; the ECG rows at those locations are later passed on for HRV feature extraction.
# This function extracts the row indices at which any IBI column holds an R peak (a value other than -1).
def Rpeak_Indices(Dataframe):
  """
  Collect the index labels at which any IBI column reports a beat.

  A column takes part when its name contains 'IBI'; a row counts as a beat
  when its value in that column differs from -1.

  Returns a sorted numpy array of the unique index labels found.
  """
  collected = []
  for name in Dataframe.columns:
    if 'IBI' not in name:
      continue
    series = Dataframe[name]
    collected.extend(series.index[series != -1].tolist())
  # Several leads usually flag the same row, so de-duplicate (np.unique also sorts).
  return np.unique(collected)

In [11]:
# ECG Data consists of 4 different electrodes collecting data from 4 limbs (LA, RA, RL and LL)
# It has raw ECG signal, IBI and Heart Rate signals with each of 4 columns
# HRV is Heart Rate Variability, low HRV indicates high stress
# IBI is Inter-Beat Interval or RR interval is the duration between two R peaks
# 
# Function to extract HRV features from ECG signal

def _windowed_hrv_features(peak_samples, timestamp, Participant_ID,
                           window_length=60, overlap=30, sampling_rate=128):
  """
  Compute time-domain HRV features over sliding windows of consecutive R peaks.

  Parameters:
  -----
  peak_samples = sorted sample indices (row positions) of the R peaks.
  timestamp = epoch time in seconds of each R peak, aligned with peak_samples.
  Participant_ID = value written to the 'subject_ID' column of every row.
  window_length = number of R peaks per window.
  overlap = number of R peaks the window advances between iterations.
  sampling_rate = sampling rate of the recording in Hz, forwarded to nk.hrv_time.

  Returns:
  -----
  DataFrame with one row per window: the nk.hrv_time columns plus
  'Start_timestamp', 'Mean_timestamp', 'end_timestamp' and 'subject_ID'.
  Rows containing NaN are dropped. Empty DataFrame when there are too few peaks.
  """
  if overlap <= 0:
    raise ValueError("overlap must be positive, otherwise the window never advances")

  windows = []
  start_window = 0
  while (start_window + overlap) <= len(peak_samples):
    end_window = start_window + window_length

    # end_window points at the first peak after the window; for the final
    # (possibly shorter) window fall back to the last available timestamp.
    if end_window < len(timestamp):
      end_timestamp = timestamp[end_window]
    else:
      end_timestamp = timestamp[-1]

    # nk.hrv_time expects the sample positions of the peaks: it derives the
    # RR intervals from their differences and the sampling rate.
    hrv = nk.hrv_time(peak_samples[start_window:end_window], sampling_rate=sampling_rate, show=False)

    # start, mean and end timestamps are used later to assign each window to an event period
    hrv['Start_timestamp'] = timestamp[start_window]
    hrv['Mean_timestamp'] = np.mean(timestamp[start_window:end_window])
    hrv['end_timestamp'] = end_timestamp
    hrv['subject_ID'] = Participant_ID
    windows.append(hrv)

    start_window += overlap

  if not windows:
    return pd.DataFrame()
  # drop windows for which some feature could not be computed
  return pd.concat(windows).dropna()


def ECG_features(path,Timestamp_path,sheet_name = 'D',
                 sampling_rate=128, window_length=60, overlap=30):
  """
  Extract windowed time-domain HRV features from every Shimmer file in a folder.

  For each '*.txt' / '*.csv' file in `path` the R-peak positions are located,
  either from the IBI columns (rows whose IBI value is not -1) or, for files
  without IBI columns, by running NeuroKit's peak detection on the four raw
  ExG channels. HRV features are then computed over sliding windows of R peaks.

  Parameters:
  -----
  path = folder containing the Shimmer files. The working directory is changed
         to this folder (existing side effect, kept for backward compatibility).
  Timestamp_path, sheet_name = accepted for backward compatibility; not used here.
  sampling_rate = ECG sampling rate in Hz.
  window_length = number of R peaks per window.
  overlap = number of R peaks the window advances each step.

  Returns:
  -----
  Data = DataFrame with the HRV features of all windows of all files plus the
         'Start_timestamp', 'Mean_timestamp', 'end_timestamp' and 'subject_ID' columns.

  NOTE(review): previous versions passed the ECG *amplitudes* at the R peaks to
  nk.hrv_time. That function interprets its input as peak sample positions, so
  the intervals were differences of voltages. The peak positions are passed now,
  which changes the feature values; any cached HRV_features.csv should be regenerated.
  """
  TIMESTAMP_COLUMN = 'Shimmer_CB7C_Timestamp_Unix_CAL'
  IBI_COLUMNS = ['Shimmer_CB7C_ECG_IBI_LA_RA_CAL',
                 'Shimmer_CB7C_ECG_IBI_LL_LA_CAL',
                 'Shimmer_CB7C_ECG_IBI_LL_RA_CAL',
                 'Shimmer_CB7C_ECG_IBI_Vx_RL_CAL']
  RAW_EXG_COLUMNS = ['Shimmer_CB7C_ExG1_CH1_24BIT_CAL', 'Shimmer_CB7C_ExG1_CH2_24BIT_CAL',
                     'Shimmer_CB7C_ExG2_CH1_24BIT_CAL', 'Shimmer_CB7C_ExG2_CH2_24BIT_CAL']

  # glob below uses patterns relative to the working directory
  os.chdir(path)
  list_csv = glob.glob('*.txt')
  list_csv.extend(glob.glob('*.csv'))

  subject_frames = []
  for filename in list_csv:
    # clean DataFrame of string cells + participant ID parsed from the file name
    Dataframe, Participant_ID = Shimmers_csv2DF(path, filename)

    # timestamps arrive as epoch milliseconds (strings); convert to epoch seconds
    Dataframe[TIMESTAMP_COLUMN] = Dataframe[TIMESTAMP_COLUMN].map(TimeStamp_Conversion)

    if all(column in Dataframe.columns for column in IBI_COLUMNS):
      # IBI cells are strings; cast them so the "!= -1" test in Rpeak_Indices is numeric
      for column in IBI_COLUMNS:
        Dataframe[column] = Dataframe[column].map(ast.literal_eval)
      Rpeak_loc = Rpeak_Indices(Dataframe)
    else:
      # File without IBI columns: detect the R peaks on each raw channel and merge them
      detected = []
      for column in RAW_EXG_COLUMNS:
        Dataframe[column] = Dataframe[column].map(ast.literal_eval)
        cleaned = nk.ecg_clean(Dataframe[column], sampling_rate=sampling_rate)
        _, info = nk.ecg_peaks(cleaned, sampling_rate=sampling_rate, correct_artifacts=True)
        detected.extend(info['ECG_R_Peaks'])
      # the same beat is normally found on several channels -> remove duplicates
      Rpeak_loc = np.unique(detected)

    # np.unique of an empty list yields floats; positions must be integers for iloc
    Rpeak_loc = np.asarray(Rpeak_loc, dtype=int)

    # timestamps of the rows where R peaks are located
    timestamp = np.array(Dataframe.iloc[Rpeak_loc][TIMESTAMP_COLUMN])

    subject_frames.append(
        _windowed_hrv_features(Rpeak_loc, timestamp, Participant_ID,
                               window_length=window_length, overlap=overlap,
                               sampling_rate=sampling_rate))

  # DataFrame.append was removed in pandas 2.0; concatenate once instead
  non_empty = [frame for frame in subject_frames if not frame.empty]
  if not non_empty:
    return pd.DataFrame()
  return pd.concat(non_empty)

In [None]:
# Folder with the Shimmer ECG files and the excel workbook holding the event-period timestamps.
path = '/content/gdrive/My Drive/Food_VR/Food_VR/ECG_Processing/ECG_DATA/'
Timestamp_path = '/content/gdrive/My Drive/Food_VR/Food_VR/VR Timestamps for Phase B & D_W&SP20.xlsx'
# Build the windowed HRV feature table from every file found under `path`.
Data = ECG_features(path,Timestamp_path,sheet_name = 'D')

In [None]:
# Persist the HRV feature table (without the row index) so it can be reloaded from disk later.
Data.to_csv('/content/gdrive/My Drive/Food_VR/Food_VR/ECG_Processing/HRV_features.csv',index=False)

In [16]:
Timestamp_path = '/content/gdrive/My Drive/Food_VR/Food_VR/VR Timestamps for Phase B & D_W&SP20.xlsx' # Event period Timestamp table

In [21]:
# Reload the saved HRV feature table from disk.
Data = pd.read_csv('/content/gdrive/My Drive/Food_VR/Food_VR/ECG_Processing/HRV_features.csv') 
# Load sheet 'D' of the annotation workbook as a clean table (parsed column names, timestamps in seconds).
VR_TimeStamps_D = Annotation_timestamp(Timestamp_path, sheet_name = 'D') # passing timestamp data table path in to Annotation_timestamp to create a clean dataframe

In [22]:
# This function helps to label each row of HRV feature dataframe based on start, and end timestamp assoisated with it
def Label_Data(Data,VR_TimeStamps_D):
  """
  Label every HRV feature row with the event period its window falls into.

  A row is labelled with an event only when its start, mean AND end timestamps
  all lie inside that event's [start, end] interval for the row's own subject.
  Rows that fit no event keep a missing (NaN) label.

  Parameters:
  -----
  Data = HRV feature table with 'subject_ID', 'Start_timestamp',
         'Mean_timestamp' and 'end_timestamp' columns. Modified in place:
         an 'Event' column is added if it is not there yet.
  VR_TimeStamps_D = annotation table with a 'Subject_ID' column and the
                    start/end columns of the three event periods.

  Returns:
  -----
  Data = the same DataFrame object, now carrying the 'Event' column.

  Fixes over the previous version:
  - the VR baseline test checked 'Start_timestamp' three times and never looked
    at the mean/end timestamps;
  - rows were compared against the event periods of every subject, so one
    subject's annotations could label another subject's rows;
  - a subject without an annotation row raised IndexError; it is now skipped
    with a warning and its rows stay unlabelled.
  """
  import warnings

  # (label, start column, end column), listed from highest to lowest priority
  event_periods = [
      ('VR baseline', 'VR baseline start', 'VR baseline end'),
      ('Speech Emotion', 'Emotion-induction speech start', 'Emotion-induction speech end'),
      ('Food Selection', 'Food selection start', 'food selection end'),
  ]
  window_columns = ['Start_timestamp', 'Mean_timestamp', 'end_timestamp']

  # object dtype so string labels and NaN can share the column
  if 'Event' in Data.columns:
    Data['Event'] = Data['Event'].astype(object)
  else:
    Data['Event'] = pd.Series(np.nan, index=Data.index, dtype=object)

  for subject in np.unique(Data['subject_ID']):
    annotation = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == subject]
    if annotation.empty:
      warnings.warn("No timestamp annotations for subject %s; its rows stay unlabelled" % subject)
      continue
    annotation = annotation.iloc[0]

    subject_rows = Data['subject_ID'] == subject

    # Apply the lowest priority first so that, should periods ever overlap,
    # the earlier entries win, matching the original if/elif order.
    for label, start_col, end_col in reversed(event_periods):
      inside = subject_rows.copy()
      for column in window_columns:
        # between() is inclusive on both ends and False for NaN
        inside &= Data[column].between(annotation[start_col], annotation[end_col])
      Data.loc[inside, 'Event'] = label

  return Data # labelled Feature Data
    

In [23]:
# Attach an 'Event' label to each feature row using the per-subject event timestamps.
Data = Label_Data(Data,VR_TimeStamps_D)

In [25]:
# Relative path: the file is written to the current working directory.
Data.to_csv("HRV_features_Label.csv", index = False) # save labelled data

In [27]:
# Modelling table: the labelled features without the three window-timestamp columns and without 'HRV_MCVNN'.
DataFrame = Data.drop(['end_timestamp','Start_timestamp','Mean_timestamp','HRV_MCVNN'], axis=1) # drop start, end and mean timestamp columns present in labelled Feature Data
# 'HRV_MCVNN' column consist of infinity values which causes err while modeling so dropping that column as well

In [29]:
# Number of feature rows (windows) available per subject.
DataFrame.subject_ID.value_counts()

966     89
793     89
1056    71
942     66
984     61
961     61
946     57
963     53
962     52
1058    49
820     42
877     29
937      4
Name: subject_ID, dtype: int64

In [32]:
from numpy import interp
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_curve,auc, roc_auc_score, make_scorer
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
import matplotlib.pylab as plt
from sklearn.utils import shuffle

In [33]:
# Classifier instances shared by all experiments; Training refits these same objects on every fold.
rf = RandomForestClassifier(random_state = 42) # random forest classifier as rf 
knn = KNeighborsClassifier() # K-Nearest Neighbor classifier as knn

In [34]:
def _score_split(feature, target, train, test):
  """
  Fit the notebook-level `rf` and `knn` classifiers on one train split and score
  both on the matching test split.

  Returns a dict with the keys 'accuracy_rf', 'auc_rf', 'accuracy_knn' and
  'auc_knn' (in that order). The AUC is computed from the hard class
  predictions, exactly as before.
  NOTE(review): when the test split holds a single class the ROC curve is
  undefined and the AUC is expected to come back as NaN - confirm with the
  installed scikit-learn version.
  """
  scores = {}
  for name, model in (('rf', rf), ('knn', knn)):
    predicted = model.fit(feature[train], target[train]).predict(feature[test])
    fpr, tpr, _ = roc_curve(target[test], predicted)
    scores['accuracy_' + name] = accuracy_score(target[test], predicted)
    scores['auc_' + name] = auc(fpr, tpr)
  return scores


def Training(empty_exp_list,Data,KFold:bool,n_splits=13):
  '''
  Run the three pairwise binary experiments (VR baseline vs Speech, VR baseline
  vs Food Selection, Speech vs Food Selection) with the rf and knn classifiers
  and collect accuracy and AUC scores.

  Parameters:
  -----
  empty_exp_list = list the result dictionaries are appended to (modified in place).
  Data = labelled feature table; must contain 'subject_ID', 'Event' and the
         feature columns. Rows whose 'Event' is not part of an experiment are ignored.
  KFold = True  -> stratified K-fold cross validation (subject IDs are not used);
          False -> Leave-One-Group-Out cross validation with subjects as groups,
                   which additionally reports the scores of every left-out subject.
  n_splits = number of folds for the stratified K-fold run.

  Returns:
  -----
  empty_exp_list = same list with one dictionary per experiment:
      'experiment'             -> experiment name
      'subject_ID'             -> {subject id as str: score dict} (LOGO only)
      'accuracy_score_testing' -> {'rf_cv': mean accuracy, 'knn_cv': mean accuracy}
      'auc_score_testing'      -> {'rf_cv': mean AUC, 'knn_cv': mean AUC}

  Fixes over the previous version: the function now works on its `Data`
  argument (it used to read the notebook global `DataFrame`), encodes the
  labels on a copy instead of a slice, selects the target by name instead of by
  position, and averages the AUC with nanmean so a fold with an undefined AUC
  no longer turns the whole mean into NaN.
  '''

  experiments = {'VR baseline vs Speech': ['VR baseline', 'Speech Emotion'],
                 'VR baseline vs VR FoodSelec': ['VR baseline', 'Food Selection'],
                 'Speech vs VR FoodSelec': ['Food Selection', 'Speech Emotion']}

  for experiment, events in experiments.items():

    # rows of the two event periods under comparison; .copy() so the encoding
    # below does not write into a view of the caller's table
    data = Data.loc[Data['Event'].isin(events)].copy()
    # category codes follow the alphabetical order of the two labels (0 / 1)
    data['Event'] = data['Event'].astype('category').cat.codes

    groups = np.array(data['subject_ID'])
    target = np.array(data['Event'])
    feature = np.array(data.drop(columns=['subject_ID', 'Event']))

    if KFold:
      splits = StratifiedKFold(n_splits=n_splits).split(feature, target)
    else:
      # one fold per subject: train on all others, test on the left-out one
      splits = LeaveOneGroupOut().split(feature, target, groups)

    fold_scores = []
    subject_ID_avg = {}
    for train, test in splits:
      scores = _score_split(feature, target, train, test)
      fold_scores.append(scores)
      if not KFold:
        # every test row of a LOGO fold belongs to the same subject
        subject_ID_avg[str(groups[test][0])] = scores

    # result dictionary with experiment name as key value pair
    result_dict = {'experiment':experiment}
    if not KFold:
      result_dict['subject_ID'] = subject_ID_avg

    result_dict['accuracy_score_testing'] = {
        'rf_cv': np.mean([scores['accuracy_rf'] for scores in fold_scores]),
        'knn_cv': np.mean([scores['accuracy_knn'] for scores in fold_scores])}

    result_dict['auc_score_testing'] = {
        'rf_cv': np.nanmean([scores['auc_rf'] for scores in fold_scores]),
        'knn_cv': np.nanmean([scores['auc_knn'] for scores in fold_scores])}

    empty_exp_list.append(result_dict)

  return empty_exp_list

In [None]:
# Start from an empty list; Training appends one result dictionary per experiment and returns the list.
KFoldCV_results = []
KFoldCV_results = Training(KFoldCV_results,DataFrame, KFold = True) 
# KFold=True asks Training to run the stratified K-fold cross validation

In [36]:
# Turn every result dictionary returned by Training into a small DataFrame and
# stack them into one table. DataFrame.append was removed in pandas 2.0, so the
# pieces are collected first and concatenated once.
kfold_frames = []
for index in KFoldCV_results: 
  df = pd.DataFrame(index)
  kfold_frames.append(df)
KFoldCV = pd.concat(kfold_frames) if kfold_frames else pd.DataFrame()

In [37]:
# index=True keeps the row labels of the result table in the file.
KFoldCV.to_csv('/content/gdrive/My Drive/Food_VR/Food_VR/ECG_Processing/ECG_Predictions/HRV_predictions_K-FoldCV.csv', index = True) # save it as csv

In [None]:
# Start from an empty list; Training appends one result dictionary per experiment and returns the list.
LOGO_results = []
LOGO_results = Training(LOGO_results,DataFrame, KFold = False)
# KFold=False asks Training to run Leave-One-Group-Out cross validation, giving mean scores plus subject-wise scores

In [39]:
# code in below cells is to separate mean LOGO predictions and subject wise predictions
# Stack the per-experiment LOGO result dictionaries into one table.
# DataFrame.append was removed in pandas 2.0, so collect and concatenate once.
logo_frames = []
for index in LOGO_results:
  df = pd.DataFrame(index)
  logo_frames.append(df)
LOGO = pd.concat(logo_frames) if logo_frames else pd.DataFrame()

In [40]:
# Collect the row labels of LOGO that are not Python literals: ast.literal_eval
# raises ValueError for them, while labels that evaluate cleanly are skipped.
# presumably this separates the model-name rows from the numeric subject-ID rows - verify against LOGO.index
list_index = LOGO.index
indices = []
for index in range(0,len(list_index)):
  try:
    element = ast.literal_eval(list_index[index]) # result unused; only success/failure matters
  except ValueError:
    indices.append(list_index[index])

In [41]:
# Select the rows whose labels were collected in `indices`, remove repeated rows,
# drop the 'subject_ID' column and save the remaining table.
Mean_LOGO = pd.DataFrame(LOGO.loc[indices])
Mean_LOGO = Mean_LOGO.drop_duplicates()
Mean_LOGO = Mean_LOGO.drop(columns = ['subject_ID'])
Mean_LOGO.to_csv("/content/gdrive/My Drive/Food_VR/Food_VR/ECG_Processing/ECG_Predictions/HRV_predictions_MeanLOGO.csv", index = True) # LOGO Mean Predictions

In [42]:
# The complement of the previous selection: drop the rows listed in `indices` and the two mean-score columns.
Subject_LOGO = LOGO.drop(indices).drop(columns = ['accuracy_score_testing','auc_score_testing'])

In [43]:
# Copy the row labels of Subject_LOGO, in order, into a plain Python list.
list_=[]
for w in range(0,len(Subject_LOGO)):
  index = Subject_LOGO.index
  list_.append(index[w])

In [44]:
# Expand every subject-wise score dictionary into a one-row DataFrame (labelled
# with the matching entry of list_) and tag it with its experiment name.
# Rows are accessed by position with .iloc: the row labels are strings, so the
# plain integer lookup used before relied on a deprecated positional fallback.
# DataFrame.append was removed in pandas 2.0, so collect and concatenate once.
subject_frames = []
for one in range(0,len(Subject_LOGO)):
  Dict = Subject_LOGO['subject_ID'].iloc[one]
  df = pd.DataFrame(Dict,index=[list_[one]])
  exp = Subject_LOGO['experiment'].iloc[one]
  df['experiment'] = exp
  subject_frames.append(df)
dummy_df = pd.concat(subject_frames) if subject_frames else pd.DataFrame()

In [45]:
# index=True keeps the row labels assigned when the table was built.
dummy_df.to_csv("/content/gdrive/My Drive/Food_VR/Food_VR/ECG_Processing/ECG_Predictions/HRV_predictions_SubjectWise_LOGO.csv", index = True) # LOGO subject wise predictions