In [3]:
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [4]:
pip install neurokit2

Collecting neurokit2
[?25l  Downloading https://files.pythonhosted.org/packages/4e/4a/d2a9502942cb60e61c9ba9772c04ebd0a945fe248ed42cb520334da582b2/neurokit2-0.1.1-py2.py3-none-any.whl (990kB)
[K     |▎                               | 10kB 16.6MB/s eta 0:00:01[K     |▋                               | 20kB 18.7MB/s eta 0:00:01[K     |█                               | 30kB 16.3MB/s eta 0:00:01[K     |█▎                              | 40kB 14.3MB/s eta 0:00:01[K     |█▋                              | 51kB 8.4MB/s eta 0:00:01[K     |██                              | 61kB 9.6MB/s eta 0:00:01[K     |██▎                             | 71kB 8.0MB/s eta 0:00:01[K     |██▋                             | 81kB 8.8MB/s eta 0:00:01[K     |███                             | 92kB 8.8MB/s eta 0:00:01[K     |███▎                            | 102kB 7.8MB/s eta 0:00:01[K     |███▋                            | 112kB 7.8MB/s eta 0:00:01[K     |████                            | 122kB 7.8

In [5]:
pip install biosppy==0.6.1



In [6]:
pip install mne

Collecting mne
[?25l  Downloading https://files.pythonhosted.org/packages/60/f7/2bf5de3fad42b66d00ee27539bc3be0260b4e66fdecc12f740cdf2daf2e7/mne-0.23.0-py3-none-any.whl (6.9MB)
[K     |████████████████████████████████| 7.0MB 5.8MB/s 
Installing collected packages: mne
Successfully installed mne-0.23.0


In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time 
import re
import csv
import scipy.io
import biosppy
import mne
import neurokit2 as nk
import ast
import os
import scipy.io
from sklearn.preprocessing import LabelEncoder
import time
import datetime
from datetime import datetime
import glob
from scipy.stats import zscore, norm
from neurokit2 import eda_phasic
from scipy.stats import linregress
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [8]:
# this function is to convert the TimeStamp column (first column) from Unix Epoch time to standard datetime format
def TimeStamp_Conversion(ts):
  """
  we have a unix epoch time in milliseconds i.e, a string with a length of 13 charcters example:1.5789360034388428E12
  
  parameters:
  -----
  ts = Epoch timesatmp in milliseconds.

  Returns:
  -----
  Std_Unix = standard epoch timestamp in seconds.

  """

  float_Input = float(ts) # converting the string-type(1.5789360034388428E12) Unix Epoch to float-type(1578936003.4388428).

  # float input is divided by 1000 to convert the Unix epoch in milliseconds to seconds 
  Std_Unix = float_Input/1000

  datetime_Input = datetime.fromtimestamp(Std_Unix) 
  # datetime.fromtimestamp converts the unix epoch in seconds to datetime returns example:datetime.datetime(2020, 1, 13, 17, 20, 3, 438843)

  return Std_Unix

In [9]:
def column_formatting(Timestamp_DF):
  """
  Column names of Timestamp annotation excel have column index attached to column name as we only need column name we are parsing column names.

  Parameters:
  -----
  Timestamp_DF = Input the DF after reading the timestamp annotationexcel file to  get list of column names['A1- ECG baseline start','B1- ECG baseline end',.....].

  Returns:
  -----
  Parsed_ColumnNames = list of parsed column names. ['ECG baseline start','ECG baseline end',....]

  """

  Parsed_colnames = ['Subject_ID'] ## Column with Participant ID is not named, so declaring first column as Subject_ID to an empty list

  for index in range(1,len(Timestamp_DF.columns)): ## Looping through the list of timestamp annotation columns list
    column = Timestamp_DF.columns[index][4:].lstrip() ## Drop first 3 indices of each column and strip space(" ") if present as left most
    Parsed_colnames.append(column) ## appending each column name after parsing

  return Parsed_colnames ## returns list fo parsed col names

In [10]:
def Annotation_timestamp(timestamp_path, sheet_name):
  """
  This function is to change the column names of timestamp annotations table and convert timestamps from milliseconds to standart epoch format of seconds.

  Parameters:
  -----
  timestamp_path = path to the directory of file location
  sheet_name =  there are two sheets present in the file, we work on file named D.

  Results:
  -----
  VR_Timestamps_D = Clean dataframe of timestamp annotations table.

  """

  VR_TimeStamps_D = pd.read_excel(Timestamp_path, sheet_name) ## read timestamp annotation file
  Parsed_colnames = column_formatting(VR_TimeStamps_D) ## using the column_formatting function defined earlier parse columns
  VR_TimeStamps_D.columns = Parsed_colnames ## Change colnames of Dataframe using the parsed list of col names
  
  ## As timestamp is in string format and in milli seconds iterating through each column to change the timestamp to standard epoch format.
  for col in VR_TimeStamps_D.columns: 
    ## Using Timestamp_Conversion function and lambda fucntion to map the function to each row of the column.
    if col == 'Subject_ID':
      pass
    else:
      VR_TimeStamps_D[col] = VR_TimeStamps_D[col].map(lambda instance: TimeStamp_Conversion(instance)) 

  return VR_TimeStamps_D

In [11]:
def Shimmers_csv2DF(path,filename):
  """
  This function is to read Shimmer data files and create a dataframe from tidy shimmers csv tables.

  Parameters:
  -----
  path = path to directory of shimmers file folder.

  filename = name of the file to be loaded.

  Results:
  -----
  Dataframe = organized and structured Shimmers Data.

  """

  with open(path + '/' + filename, 'r',) as file: # read the file
    reader = csv.reader(file)

    lists_eachrow = []
    for row in reader:
      lists_eachrow.append(row) # append each row in reader to a list

  del lists_eachrow[0] # del first row of list as it is only about \t delimiter used

  newlists = [] 
  # loop through the list of lists and split columnar values using the delimiter 
  for list_row in lists_eachrow:
    for row in list_row:
      newlists.append(list(row.split('\t')))
  # Extract subjectID from the file name for future use
  filename_parse = filename.replace("_", " ")
  Participant_ID = ast.literal_eval(re.findall(r'\b\d+\b', filename_parse)[0])
  
  # create dataframe from the list of columnar values 
  Dataframe = pd.DataFrame(newlists, columns = newlists[0])
  Dataframe = Dataframe.drop([0,1]) # drop columns 1 and 2 as are column names and units which we already have for new dataframe.
  Dataframe.reset_index(drop=True, inplace=True) # reset index

  return Dataframe, Participant_ID 

In [12]:
def Sliding_Window_GSRFeatureExt(GSR_16Hz,Participant_ID, start_window, overlap, window_length):
  """
  This function is to create a dataframe consisting of statistical features extracted using sliding window technique on Phasic and Tonic components of GSR signal.

  Parameters:
  -----
  GSR_16Hz = This the Dataframe consisting of Phasic and Tonic components of GSR signal along with epoch timestamp of each instance.

  Participant_ID = Subject ID extracted in Shimmers_csv2DF function from the file name.
  
  start_window = 0, starting index of the sliding window
  
  overlap = 50, sliding window tech. with 50 overlap
  
  window_length = 100, length of each window (100 rows at once)

  Results:
  -----
  Dataframe consisting of statistical features like Mean Phasic&Tonic components, std Phasic&Tonic components, Count and Mean of Phasic&Tonic Peaks, 
  Min and Max, Slope, AUC of Phasic&Tonic components for each window of length 100 and overlap of 50.

 """
  # creating an array of phasic, tonic and timestamps from the GSR_16Hz dataframe, will be easy to calculate statistical features of numpy array.
  phasic  = np.array(GSR_16Hz['phasic']) 
  tonic  = np.array(GSR_16Hz['tonic'])
  timestamp = np.array(GSR_16Hz['TimeStamp'])
  dummy = list(range(0,len(GSR_16Hz))) # this is to use as secondary axis to calculate slope.

  end_window = window_length
  Start_timestamp = []
  Mean_timestamp = []
  end_timestamp = []
  Mean_phasic = []
  Mean_tonic = []
  Std_phasic = []
  Std_tonic = []
  CountPeak_phasic = []
  CountPeak_tonic = []
  MeanPeak_phasic = []
  MeanPeak_tonic = []
  Min_phasic = []
  Max_phasic = []
  Min_tonic = []
  Max_tonic = []
  Slope_tonic = []
  Slope_phasic = []
  AUC_tonic = []
  AUC_phasic = []
  while abs(end_window-window_length) <= len(GSR_16Hz): # loop conditon to carry out windowing and calculate features till the end 

    # creating start, mean and end timestamps helps to  make sure that each window completely falls in annotation period
    Start_timestamp.append(timestamp[start_window]) 
    Mean_timestamp.append(np.mean(timestamp[start_window:end_window]))
    try:
      end_timestamp.append(timestamp[end_window])
    except IndexError: 
      # index of last window might exceed the last timestamp of our data, if this happens it will use last timestamp of data as end timsatmp of last window
      end_timestamp.append(timestamp[-1])

    # using numpy to compute statistucal features like mean, std, count
    Mean_phasic.append(np.mean(phasic[start_window:end_window]))
    Mean_tonic.append(np.mean(tonic[start_window:end_window]))
    Std_phasic.append(np.std(phasic[start_window:end_window]))
    Std_tonic.append(np.std(tonic[start_window:end_window]))
    # scipy.signal.find_peaks exctracts the peaks present in teh signal
    CountPeak_phasic.append(scipy.signal.find_peaks(phasic[start_window:end_window])[0].size)
    CountPeak_tonic.append(scipy.signal.find_peaks(tonic[start_window:end_window])[0].size)
    MeanPeak_phasic.append(np.mean(scipy.signal.find_peaks(phasic[start_window:end_window])[0]))
    MeanPeak_tonic.append(np.mean(scipy.signal.find_peaks(tonic[start_window:end_window])[0]))
    Min_phasic.append(min(phasic[start_window:end_window]))
    Max_phasic.append(max(phasic[start_window:end_window]))
    Min_tonic.append(min(tonic[start_window:end_window]))
    Max_tonic.append(max(tonic[start_window:end_window]))
    # slope is computed using linregress module
    Slope_tonic.append(linregress(dummy[start_window:end_window],tonic[start_window:end_window])[0])
    Slope_phasic.append(linregress(dummy[start_window:end_window],phasic[start_window:end_window])[0])
    # Area under curve is calculated using sk learn metrics
    AUC_tonic.append(metrics.auc(dummy[start_window:end_window],tonic[start_window:end_window]))
    AUC_phasic.append(metrics.auc(dummy[start_window:end_window],phasic[start_window:end_window]))
      
    # to increment start and end for next window
    start_window += overlap
    end_window += overlap
  
  # create Data frame from the lists of values appended to each feature.
  Feature_DF = pd.DataFrame()
  Feature_DF['Start_timestamp'] = Start_timestamp
  Feature_DF['Mean_timestamp'] = Mean_timestamp
  Feature_DF['End_timestamp'] = end_timestamp
  Feature_DF['Mean_phasic'] = Mean_phasic
  Feature_DF['Mean_tonic'] = Mean_tonic
  Feature_DF['Std_phasic'] = Std_phasic
  Feature_DF['Std_tonic'] = Std_tonic
  Feature_DF['CountPeak_phasic'] = CountPeak_phasic
  Feature_DF['CountPeak_tonic'] = CountPeak_tonic
  Feature_DF['MeanPeak_phasic'] = MeanPeak_phasic
  Feature_DF['MeanPeak_tonic'] = MeanPeak_tonic
  Feature_DF['Min_phasic'] = Min_phasic
  Feature_DF['Max_phasic'] = Max_phasic
  Feature_DF['Min_tonic'] = Min_tonic
  Feature_DF['Max_tonic'] = Max_tonic
  Feature_DF['Slope_tonic'] = Slope_tonic
  Feature_DF['Slope_phasic'] = Slope_phasic
  Feature_DF['AUC_tonic'] = AUC_tonic
  Feature_DF['AUC_phasic'] = AUC_phasic
  Feature_DF['Subject_ID'] = Participant_ID

  return Feature_DF

In [13]:
def GSR_features(path,Timestamp_path,method = 'cvxEDA',sheet_name = 'D',start_window=0, overlap=50, window_length=100):
  
  # taking path location of Shimmer file as input and using glob function to find csv and text files in folder and appending each subject's filename to a list
  os.chdir(path)
  list_csv = glob.glob('*.{}'.format('txt'))
  list_csv.extend(glob.glob('*.{}'.format('csv')))
  
  # Creating an empty DataFrame to append all the features
  Data = pd.DataFrame()

  # loop to feature extraction on each subject file.
  for index in list_csv:
    # formatting csv file to create a clean DataFrame and extract Participant ID from file name using Shimmers_csv2DF function
    Dataframe, Participant_ID = Shimmers_csv2DF(path,index)

    GSR_128Hz = pd.DataFrame(Dataframe, columns = ['Shimmer_89C4_Timestamp_Unix_CAL','Shimmer_89C4_GSR_Skin_Conductance_CAL']) # just using timestamp and GSr signal columns
    
    # Convert the timestamp column in nanoseconds to milliseconds
    GSR_128Hz['Timestamp_s'] = GSR_128Hz['Shimmer_89C4_Timestamp_Unix_CAL'].map(lambda instance: TimeStamp_Conversion(instance))

    signal = list(GSR_128Hz['Shimmer_89C4_GSR_Skin_Conductance_CAL']) # list out of GSR signal column
    signal = list(map(lambda index : ast.literal_eval(index),signal)) # string type to int

    signal = mne.filter.resample(signal, down = 8) # downsample using mne resample by factor of 8
    len_downsample = len(signal) 
    
    # taking start and end time of timestamp column to create a sequential timestamp list of downsampled size
    start_time = list(GSR_128Hz['Timestamp_s'])[0] 
    end_time = list(GSR_128Hz['Timestamp_s'])[-1]
    duration= end_time-start_time
    downsample_freq = len_downsample/duration

    list_ts = np.array(np.linspace(start_time,end_time,len_downsample))

    # creating new df with downsampled size 
    GSR_16Hz = pd.DataFrame()
    GSR_16Hz['TimeStamp'] = list_ts
    GSR_16Hz['GSRC'] = signal
    
    # Smoothen and normalize the signal
    GSR_16Hz['GSRC'] = GSR_16Hz['GSRC'].ewm(span = 16).mean() # carryout estimated weighted moving average to 
    min_max_scaler = MinMaxScaler()
    GSR_16Hz['GSRC'] = min_max_scaler.fit_transform(np.array(GSR_16Hz['GSRC']).reshape(-1, 1)) # Normalize using MinMaxScalar
 
    eda = (GSR_16Hz['GSRC'])

    # extracting phasic and tonic components using eda_phasic() function from neurokit.py module
    Phasic_Tonic_DF = eda_phasic(eda,downsample_freq, method)

    GSR_16Hz['phasic'] = Phasic_Tonic_DF['EDA_Phasic']
    GSR_16Hz['tonic'] = Phasic_Tonic_DF['EDA_Tonic']

    # Implenting sliding window technique to extract statistical features from phasic and tonic components
    Feature_DF = Sliding_Window_GSRFeatureExt(GSR_16Hz,Participant_ID, start_window, overlap, window_length)
    
    # Input event timestamp path to Annnotation_timstamp function to get a clean event period timestamp
    VR_TimeStamps_D = Annotation_timestamp(Timestamp_path, sheet_name)
    
    # for each subject present in subject list we take out start and end time stamps of each event period 
    VRBaseline_start = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == Participant_ID,'VR baseline start'].iloc[0]
    VRBaseline_end = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == Participant_ID,'VR baseline end'].iloc[0]
    Speech_start = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == Participant_ID,'Emotion-induction speech start'].iloc[0]
    Speech_end = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == Participant_ID,'Emotion-induction speech end'].iloc[0]
    Food_start = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == Participant_ID,'Food selection start'].iloc[0]
    Food_end = VR_TimeStamps_D.loc[VR_TimeStamps_D['Subject_ID'] == Participant_ID,'food selection end'].iloc[0]

    Feature_DF['Event'] = ''

    # using the start and end timestamps of each event period we check whether the start and end timestamps of ecah feature row fall in event start and end, if yes it is labelled as particular event and if not check for other event
    # if the feature start and end timestamp does not lie in any of the vents start and end timestamp it results in NAN label 
    for i in range(0,len(Feature_DF)):
      if VRBaseline_start <= Feature_DF['Start_timestamp'][i] <= VRBaseline_end and VRBaseline_start <= Feature_DF['Mean_timestamp'][i] <= VRBaseline_end and VRBaseline_start <= Feature_DF['End_timestamp'][i] <= VRBaseline_end:
        Feature_DF['Event'][i] = 'VR Baseline'
      elif Speech_start <= Feature_DF['Start_timestamp'][i] <= Speech_end and Speech_start <= Feature_DF['Mean_timestamp'][i] <= Speech_end and Speech_start <= Feature_DF['End_timestamp'][i] <= Speech_end:
        Feature_DF['Event'][i] = 'Speech Emotion'
      elif Food_start <= Feature_DF['Start_timestamp'][i] <= Food_end and Food_start <= Feature_DF['Mean_timestamp'][i] <= Food_end and Food_start <= Feature_DF['End_timestamp'][i] <= Food_end:
        Feature_DF['Event'][i] = 'Food Selection'
      else:
        Feature_DF.drop(i, inplace = True)

    # append features from each window to an empty Dataframe 
    Data = Data.append(Feature_DF, ignore_index=True)
  
  return Data # labelled Feature Data

In [None]:
path = '/content/gdrive/My Drive/Food_VR/Food_VR/GSR_Processing/GSR_Data'
Timestamp_path = '/content/gdrive/My Drive/Food_VR/Food_VR/VR Timestamps for Phase B & D_W&SP20.xlsx'
Data = GSR_features(path,Timestamp_path,method = 'cvxEDA',sheet_name = 'D',start_window=0, overlap=50, window_length=100)

In [13]:
Data.to_csv("/content/gdrive/My Drive/Food_VR/Food_VR/GSR_Processing/GSR_Features.csv", index = True) # save features

In [14]:
Data = pd.read_csv("/content/gdrive/My Drive/Food_VR/Food_VR/GSR_Processing/GSR_Features.csv")

In [16]:
DataFrame = Data.drop(['End_timestamp','Start_timestamp','Mean_timestamp' ], axis=1) # drop timesatmp columns

In [17]:
DataFrame.shape

(1462, 19)

In [20]:
DataFrame = DataFrame.fillna(0)

In [21]:
DataFrame.Event.value_counts()

Food Selection    489
Speech Emotion    487
VR Baseline       486
Name: Event, dtype: int64

In [22]:
DataFrame.Subject_ID.value_counts()

966     171
984     121
1056    119
961     118
820     118
946     111
942     111
877     111
793     108
963     102
1058     99
962      99
937      74
Name: Subject_ID, dtype: int64

In [23]:
def Tonic_col(columns):
  tonic_col = []
  for x in columns:
    if 'tonic' in x:
      tonic_col.append(x)
  return tonic_col

In [24]:
from numpy import interp
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate, LeaveOneGroupOut, StratifiedKFold
from sklearn import metrics
from sklearn.metrics import roc_curve,auc, roc_auc_score, make_scorer
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
import matplotlib.pylab as plt

In [25]:
rf = RandomForestClassifier(random_state = 42) # random forest classifier as rf 
logreg = LogisticRegression(solver='lbfgs',penalty='l2',max_iter=500) # Logistic Regression Classifier as logreg
knn = KNeighborsClassifier() # K-Nearest Neighbor classifier as knn

In [26]:
def Training(empty_exp_list,Data,Phasic:bool,Tonic:bool,KFold:bool):

  '''
  --- We are carrying out three different Binary classification experiments for each of two event periods (Baseline vs Speech, Speech vs Food Selection, Food selec vs Baseline).
      So that we will be able to discriminate the behaviour of each of the event when compared with other event.
  --- Implementing K-fold cross val and Leave One Group Out cross val using rf, logreg, knn algorithms. 
  --- This function takes GSR feature data as input along with phasic, tonic, KFold Boolean value when True computes K-fold Cross val and when False computes LOGO cross val
  --- In the LOGO Cross val we are computing the mean scores of all the subjects and also scores while one subject is left out.
  --- We are getting accuracy score and area under the curve score of each experiment with above mentioned models as dictionary key value pairs.
  '''
  
  if Phasic == True: # if Phasic input is True, function carry out predictions on phasic data
    col_list = Data.columns
    tonic_col = Tonic_col(col_list) # Tonic_col function to extract tonic columns
    DataFrame = Data.drop(tonic_col,axis =1)
  elif Tonic == True: # if Tonic input is True, function carry out predictions on tonic data
    col_list = Data.columns
    tonic_col = Tonic_col(col_list) # Tonic_col function to extract tonic columns
    tonic_col.append('Event') # append Event 
    tonic_col.append('Subject_ID') # append Subject_ID
    DataFrame = Data[tonic_col]
  else: # if None of Phasic or Tonic input is True, function carries out predictions on both the phasic and tonic data
    DataFrame = Data
  
  # split data with relevance to event periods in each experiment and convert event column to category type
  VRBaseline_Speech = DataFrame.loc[DataFrame['Event'].isin(['VR Baseline','Speech Emotion'])]
  VRBaseline_Speech["Event"] = VRBaseline_Speech["Event"].astype('category').cat.codes
  VRBaseline_FoodSelec = DataFrame.loc[DataFrame['Event'].isin(['VR Baseline','Food Selection'])]
  VRBaseline_FoodSelec["Event"] = VRBaseline_FoodSelec["Event"].astype('category').cat.codes
  Speech_FoodSelec = DataFrame.loc[DataFrame['Event'].isin(['Food Selection','Speech Emotion'])]
  Speech_FoodSelec["Event"] = Speech_FoodSelec["Event"].astype('category').cat.codes

  # Giving keys while each experiment dataframe are values  
  dict_exp = {'VR Baseline vs Speech':VRBaseline_Speech,
            'VR Baseline vs VR FoodSelec':VRBaseline_FoodSelec,
            'Speech vs VR FoodSelec':Speech_FoodSelec}

  for experiment, data in dict_exp.items():
    
    # result dictionary with experiment name as key value pair
    result_dict = {'experiment':experiment}
    
    # while KFold is True(input), we are asking this function to carry out K-fold Cross Val
    if KFold == True:

      # Drop subject_ID column as K-fold does not handle subject wise analysis
      data = data.drop(['Subject_ID'], axis=1)

      feature = np.array(data.iloc[:,:-1]) # feature columns
      target = np.array(data.iloc[:,-1]) # target column

      cv = StratifiedKFold(n_splits=13) # Stratified splitting of data with 13 folds

      # empty lists for accuarcy and auc scores for each algorithm
      accuracy_rf=[]
      accuracy_logreg=[]
      accuracy_knn=[]
      auc_rf=[]
      auc_logreg=[]
      auc_knn=[]
      
      # Loop to carry out predictions on each of the 13 folds and append scores to empty lists algorithm wise and mean is computed as key value apir to result_dict
      for train, test in cv.split(feature,target):
        subject_ID = None

        # fitiing both the models
        rf_cv=rf.fit(feature[train],target[train])
        logreg_cv=logreg.fit(feature[train],target[train])
        knn_cv=knn.fit(feature[train],target[train])

        # predicting and finding scores of rf algorithm
        y_pred_rf = rf_cv.predict(feature[test])
        fpr, tpr, threshhold = roc_curve(target[test],y_pred_rf)
        auc_RF = auc(fpr,tpr)
        accuracy_RF = accuracy_score(target[test],y_pred_rf)

        accuracy_rf.append(accuracy_RF)
        auc_rf.append(auc_RF)

        # predicting and finding scores of logreg algorithm
        y_pred_logreg = logreg_cv.predict(feature[test])
        fpr, tpr, threshhold = roc_curve(target[test],y_pred_logreg)
        auc_LR = auc(fpr,tpr)
        accuracy_LR = accuracy_score(target[test],y_pred_logreg)

        accuracy_logreg.append(accuracy_LR)
        auc_logreg.append(auc_LR)
        
        # predicting and finding scores of knn algorithm
        y_pred_knn = knn_cv.predict(feature[test])
        fpr, tpr, threshhold = roc_curve(target[test],y_pred_logreg)
        auc_KNN = auc(fpr,tpr)
        accuracy_KNN = accuracy_score(target[test],y_pred_knn)

        accuracy_knn.append(accuracy_KNN)
        auc_knn.append(auc_KNN)

    else: # Where KFold (input) False and the function performs LOGO cross val
      subject_id = data['Subject_ID'] # taking out the subject column to pass it as list to the LOGO splits
      data = data.drop(['Subject_ID'], axis=1)

      feature = np.array(data.iloc[:,:-1]) # feature columns
      target = np.array(data.iloc[:,-1]) # target column
      groups = np.array(subject_id)
    
      logo = LeaveOneGroupOut() # group wise splitting of train and test data
      logo.get_n_splits(feature, target, groups)
      logo.get_n_splits(groups=groups)

      # empty lists for accuarcy and auc scores for each algorithm
      accuracy_rf=[]
      accuracy_logreg=[]
      accuracy_knn=[]
      auc_rf=[]
      auc_logreg=[]
      auc_knn=[]

      subject_ID_avg = {}

      # Loop to carry out predictions on each of the 13 folds and append scores to empty lists algorithm wise and mean is computed as key value apir to result_dict
      # Subject wise predictions are appended to subject_ID_avg
      for train,test in logo.split(feature,target,groups):
        
        rf_cv=rf.fit(feature[train],target[train])
        logreg_cv=logreg.fit(feature[train],target[train])
        knn_cv=knn.fit(feature[train],target[train])

        y_pred_rf = rf_cv.predict(feature[test])
        fpr, tpr, threshhold = roc_curve(target[test],y_pred_rf)
        auc_RF = auc(fpr,tpr)
        accuracy_RF = accuracy_score(target[test],y_pred_rf)

        accuracy_rf.append(accuracy_RF)
        auc_rf.append(auc_RF)

        y_pred_logreg = logreg_cv.predict(feature[test])
        fpr, tpr, threshhold = roc_curve(target[test],y_pred_logreg)
        auc_LR = auc(fpr,tpr)
        accuracy_LR = accuracy_score(target[test],y_pred_logreg)

        accuracy_logreg.append(accuracy_LR)
        auc_logreg.append(auc_LR)

        y_pred_knn = knn_cv.predict(feature[test])
        fpr, tpr, threshhold = roc_curve(target[test],y_pred_knn)
        auc_KNN = auc(fpr,tpr)
        accuracy_KNN = accuracy_score(target[test],y_pred_knn)
        
        accuracy_knn.append(accuracy_KNN)
        auc_knn.append(auc_KNN)

        subject_ID = groups[test][0]
        subject_ID_avg[str(subject_ID)] = {'accuracy_rf':accuracy_RF,
                                          'auc_rf':auc_RF, 
                                          'accuracy_logreg':accuracy_LR,
                                          'auc_logreg':auc_LR,
                                          'accuracy_knn':accuracy_KNN,
                                          'auc_knn':auc_KNN}
    
    if KFold == True:
      pass
    else:
      result_dict['Subject_ID'] = subject_ID_avg

    result_dict['accuracy_score_testing'] = {'rf_cv':np.mean(accuracy_rf),
                                              'logreg_cv':np.mean(accuracy_logreg),
                                              'knn_cv':np.mean(accuracy_knn)}

    result_dict['auc_score_testing'] = {'rf_cv':np.mean(auc_rf),
                                              'logreg_cv':np.mean(auc_logreg),
                                              'knn_cv':np.mean(auc_knn)}

    empty_exp_list.append(result_dict)

  return empty_exp_list

In [None]:
KFoldCV_results = []
KFoldCV_results = Training(KFoldCV_results,DataFrame,Phasic = False, Tonic = False, KFold = True)
# Predictions while KFold is Ture which means asking to run Kfold cross val and on both phasic and tonic columns

In [27]:
KFoldCV = pd.DataFrame()# empty dataframe
# loop to append each of the index from Training functions to a new df and append that df to above empty dataframe
for index in KFoldCV_results:
  df = pd.DataFrame(index)
  KFoldCV = KFoldCV.append(df)

In [None]:
KFoldCV

Unnamed: 0,experiment,accuracy_score_testing,auc_score_testing
rf_cv,VR Baseline vs Speech,0.889785,0.890387
logreg_cv,VR Baseline vs Speech,0.814082,0.815215
knn_cv,VR Baseline vs Speech,0.849702,0.815215
rf_cv,VR Baseline vs VR FoodSelec,0.821538,0.822218
logreg_cv,VR Baseline vs VR FoodSelec,0.704615,0.706177
knn_cv,VR Baseline vs VR FoodSelec,0.715897,0.706177
rf_cv,Speech vs VR FoodSelec,0.485762,0.484462
logreg_cv,Speech vs VR FoodSelec,0.561538,0.559935
knn_cv,Speech vs VR FoodSelec,0.497976,0.559935


In [28]:
KFoldCV.to_csv("Phasic&Tonic_predictions_K-FoldCV.csv", index = True)

In [None]:
KFoldCV_Phasic_results = []
KFoldCV_Phasic_results = Training(KFoldCV_Phasic_results,DataFrame,Phasic = True, Tonic= False, KFold = True)
# Predictions while KFold is Ture which means asking to run Kfold cross val and on both phasic columns

In [30]:
KFoldCV_Phasic = pd.DataFrame()# empty dataframe
# loop to append each of the index from Training functions to a new df and append that df to above empty dataframe
for index in KFoldCV_Phasic_results:
  df = pd.DataFrame(index)
  KFoldCV_Phasic = KFoldCV_Phasic.append(df)

In [31]:
KFoldCV_Phasic.to_csv("PhasicFeatures_predictions_K-FoldCV.csv", index = True)

In [None]:
KFoldCV_Tonic_results = []
KFoldCV_Tonic_results = Training(KFoldCV_Tonic_results,DataFrame,Phasic = False, Tonic= True, KFold = True)
# Predictions while KFold is Ture which means asking to run Kfold cross val and on both tonic columns

In [28]:
KFoldCV_Tonic = pd.DataFrame()# empty dataframe
# loop to append each of the index from Training functions to a new df and append that df to above empty dataframe
for index in KFoldCV_Tonic_results:
  df = pd.DataFrame(index)
  KFoldCV_Tonic = KFoldCV_Tonic.append(df)

In [30]:
KFoldCV_Tonic.to_csv("/content/gdrive/My Drive/Food_VR/Food_VR/GSR_Processing/GSR_Predictions/TonicFeatures_predictions_K-FoldCV.csv", index = True)

In [None]:
LOGO_results = []
LOGO_results = Training(LOGO_results,DataFrame,Phasic = False, Tonic=False, KFold = False)
# Predictions while KFold is False which means asking to run LOGO cross val resulting in Mean of LOGO cross val and subject wise predictions on both phasic and tonic data

In [46]:
# code in below cells is to separate mean LOGO predictions and subject wise predictions
LOGO = pd.DataFrame()
for index in LOGO_results:
  df = pd.DataFrame(index)
  LOGO = LOGO.append(df)

In [47]:
list_index = LOGO.index
indices = []
for index in range(0,len(list_index)):
  try:
    element = ast.literal_eval(list_index[index])
  except ValueError:
    indices.append(list_index[index])

In [48]:
Mean_LOGO = pd.DataFrame(LOGO.loc[indices])
Mean_LOGO = Mean_LOGO.drop_duplicates()
Mean_LOGO = Mean_LOGO.drop(columns = ['Subject_ID'])
Mean_LOGO.to_csv("Phasic&Tonic_predicitons_MeanLOGO.csv", index = True) # save Mean LOGO of both Phasic and TOnic 

In [49]:
Subject_LOGO = LOGO.drop(indices).drop(columns = ['accuracy_score_testing','auc_score_testing'])

In [50]:
list_=[]
for w in range(0,len(Subject_LOGO)):
  index = Subject_LOGO.index
  list_.append(index[w])


In [51]:
dummy_df = pd.DataFrame()
for one in range(0,len(Subject_LOGO)):
  Dict = Subject_LOGO['Subject_ID'][one]
  df = pd.DataFrame(Dict,index=[list_[one]])
  exp = Subject_LOGO['experiment'][one]
  df['experiment'] = exp
  dummy_df = dummy_df.append(df)


In [52]:
dummy_df.to_csv("Phasic&Tonic_predictions_LOGO.csv", index = True) # save subjectwise LOGO predictions of both phasic and tonic data

In [None]:
LOGO_phasic_results = []
LOGO_phasic_results = Training(LOGO_phasic_results,DataFrame,Phasic = True, Tonic = False, KFold = False)
# Predictions while KFold is False which means asking to run LOGO cross val resulting in Mean of LOGO cross val and subject wise predictions on both phasic data

In [54]:
# code in below cells is to separate mean LOGO predictions and subject wise predictions
LOGO_phasic = pd.DataFrame()
for index in LOGO_phasic_results:
  df = pd.DataFrame(index)
  LOGO_phasic = LOGO_phasic.append(df)

In [55]:
list_index = LOGO_phasic.index
indices = []
for index in range(0,len(list_index)):
  try:
    element = ast.literal_eval(list_index[index])
  except ValueError:
    indices.append(list_index[index])

In [56]:
Mean_LOGO_phasic = pd.DataFrame(LOGO_phasic.loc[indices])
Mean_LOGO_phasic = Mean_LOGO_phasic.drop_duplicates()
Mean_LOGO_phasic = Mean_LOGO_phasic.drop(columns = ['Subject_ID'])
Mean_LOGO_phasic.to_csv("Phasic_predicitons_MeanLOGO.csv", index = True) # save Mean LOGO of Phasic

In [57]:
Subject_LOGO = LOGO_phasic.drop(indices).drop(columns = ['accuracy_score_testing','auc_score_testing'])

In [58]:
list_=[]
for w in range(0,len(Subject_LOGO)):
  index = Subject_LOGO.index
  list_.append(index[w])

In [59]:
dummy_df = pd.DataFrame()
for one in range(0,len(Subject_LOGO)):
  Dict = Subject_LOGO['Subject_ID'][one]
  df = pd.DataFrame(Dict,index=[list_[one]])
  exp = Subject_LOGO['experiment'][one]
  df['experiment'] = exp
  dummy_df = dummy_df.append(df)

In [60]:
dummy_df.to_csv("Phasic_predictions_LOGO.csv", index = True)# save subjectwise LOGO predictions of phasic

In [None]:
LOGO_tonic_results = []
LOGO_tonic_results = Training(LOGO_tonic_results,DataFrame,Phasic = False, Tonic = True, KFold = False)
# Predictions while KFold is False which means asking to run LOGO cross val resulting in Mean of LOGO cross val and subject wise predictions on phasic data

In [63]:
LOGO_tonic = pd.DataFrame()
for index in LOGO_tonic_results:
  df = pd.DataFrame(index)
  LOGO_tonic = LOGO_tonic.append(df)
# code in below cells is to separate mean LOGO predictions and subject wise predictions

In [64]:
list_index = LOGO_tonic.index
indices = []
for index in range(0,len(list_index)):
  try:
    element = ast.literal_eval(list_index[index])
  except ValueError:
    indices.append(list_index[index])

In [65]:
Mean_LOGO_tonic = pd.DataFrame(LOGO_tonic.loc[indices])
Mean_LOGO_tonic = Mean_LOGO_tonic.drop_duplicates()
Mean_LOGO_tonic = Mean_LOGO_tonic.drop(columns = ['Subject_ID'])
Mean_LOGO_tonic.to_csv("Tonic_predicitons_MeanLOGO.csv", index = True)# save Mean LOGO of Tonic 

In [66]:
Subject_LOGO = LOGO_tonic.drop(indices).drop(columns = ['accuracy_score_testing','auc_score_testing'])

In [67]:
list_=[]
for w in range(0,len(Subject_LOGO)):
  index = Subject_LOGO.index
  list_.append(index[w])

In [68]:
dummy_df = pd.DataFrame()
for one in range(0,len(Subject_LOGO)):
  Dict = Subject_LOGO['Subject_ID'][one]
  df = pd.DataFrame(Dict,index=[list_[one]])
  exp = Subject_LOGO['experiment'][one]
  df['experiment'] = exp
  dummy_df = dummy_df.append(df)

In [69]:
dummy_df.to_csv("Tonic_predictions_LOGO.csv", index = True)# save subjectwise LOGO predictions of tonic data