###  Faint Dataset - Preprocessing

In [1]:
# import necessary packages
import pandas as pd
import numpy as np
import os
import re
import logging
from tsfresh import extract_features

In [2]:
# configure the root logger to emit INFO-level (and higher) records, so the
# progress messages logged while building the stage datasets are displayed
logging.basicConfig(level=logging.INFO)

#### Read and combine CSV files

In [3]:
# define a function to return csv_files related to given stage (t)

def create_folder(path, search_key):
    """Return the names of the entries in ``path`` that match a search key.

    Despite its name, this function creates nothing on disk: it lists
    ``path`` and filters the entry names.

    Parameters
    ----------
    path : str
        Directory whose entries are listed with ``os.listdir``.
    search_key : iterable of str
        Substrings to look for in each entry name.

    Returns
    -------
    list of str
        Entry names (not full paths) that contain at least one of the keys,
        in sorted order. Each name is returned once, even when it matches
        several keys, so a caller that reads every returned file does not
        load the same file twice.
    """
    # any() stops at the first matching key, so a name is kept at most once;
    # sorting makes the result independent of the directory listing order
    return sorted(
        item
        for item in os.listdir(path)
        if any(key in item for key in search_key)
    )
    

In [4]:
# a for loop to create master dataset for several stages (stage 1, 2 and 4-5-6 as combined)

# substrings used to pick the csv files belonging to each stage key
search={1:["01.csv"], 2:["02.csv"], 4:["_04","-04",",04"]}

stages=[1,2,4]

# define folder path 
folder_path ='C:\\Users\\mtyur\\Desktop\\FAINT\\datain'

# columns kept from every csv file (names as they are after strip/lower-casing)
keep_columns = ['timestamp','au01_r', 'au02_r', 'au04_r', 'au05_r', 'au06_r', 'au07_r', 'au09_r',
                'au10_r', 'au12_r', 'au14_r', 'au15_r', 'au17_r', 'au20_r',
                'au23_r', 'au25_r', 'au26_r', 'au45_r']

for i in stages:

    # key(s) used for the file search of this stage
    search_key = search[i]

    # names of the files in folder_path that match the key(s)
    files=create_folder(folder_path, search_key)

    # per-file frames are collected here and concatenated once after the loop
    # (re-concatenating inside the loop copies the growing frame every time)
    frames = []

    for file in files:

        # read the file from the same folder that was listed above
        df = pd.read_csv(os.path.join(folder_path, file))

        # donor id: the first run of digits in the file name
        donor_id = re.findall(r'\d+', file)[0]

        # trim column names - use lower cases
        df.columns=df.columns.str.strip().str.lower()

        # keep subset of variables
        df_selected = df[keep_columns].copy()

        # add donor id and stage - store the donor id as an integer column
        df_selected['donor_id'] = donor_id
        df_selected['stage'] = i
        df_selected['donor_id'] = df_selected['donor_id'].astype('int64')

        frames.append(df_selected)

    # master dataset for this stage; pd.concat raises on an empty list, so an
    # empty frame is used when no file matched
    df_s = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # keep a separate master dataset for each stage
    if i == 1:
        df1 = df_s.copy()
    elif i == 2:
        df2 = df_s.copy()
    else:
        # named df4, but the original note says these files also hold data
        # from other stages (3, 5 and 6) - TODO confirm which stages they cover
        df4 = df_s.copy()

    logging.info(f'Produced master folder for stage {i}')

INFO:root:Produced master folder for stage 1
INFO:root:Produced master folder for stage 2
INFO:root:Produced master folder for stage 4


#### Extract timestamps to identify the time point --> needle inserted

In [5]:
# load the timestamp csv file into a dataframe
filepath_t = r'C:\Users\mtyur\Desktop\FAINT\datain\FAINT_timestamp.csv'
df_time = pd.read_csv(filepath_or_buffer=filepath_t)

In [6]:
# rows recorded for time point 4, reduced to the donor id and the start time;
# both columns are renamed in one step:
#   'start (seconds)' -> 'stage4'
#   'ID'              -> 'donor_id' (the key used for the merges that follow)
df_t4 = (
    df_time.loc[df_time['Time_point'] == 4, ['ID', 'start (seconds)']]
    .rename(columns={'start (seconds)': 'stage4', 'ID': 'donor_id'})
)

#### Combine starting and ending points to filter stage 4

In [7]:
# attach each donor's 'stage4' value to the df4 rows (left join on donor_id)
df4_merge = df4.merge(df_t4, on='donor_id', how='left')

# keep only the rows whose timestamp is strictly below the donor's 'stage4'
# value; donors without a match have NaN there, the comparison is False and
# all of their rows are dropped. The threshold column is removed afterwards.
df4_final = (
    df4_merge.loc[df4_merge['timestamp'] < df4_merge['stage4']]
    .drop(columns=['stage4'])
)

#### Produce continuous time flow from stage 1 to stage 4 (exclude stage 3)

In [8]:
# stack the stage 1, stage 2 and filtered stage 4 datasets, order the rows by
# donor, stage and timestamp, and renumber the index from 0
df_124 = (
    pd.concat([df1, df2, df4_final], ignore_index=True)
    .sort_values(by=['donor_id', 'stage', 'timestamp'])
    .reset_index(drop=True)
)

In [9]:
# smallest and largest timestamp for every (donor, stage) pair
df124_max = (
    df_124.groupby(['donor_id', 'stage'])['timestamp']
    .agg(['min', 'max'])
    .reset_index()
)

# within each donor, carry the 'max' of the preceding rows forward:
#   s1_max -> 'max' of the donor's previous row
#   s2_max -> 'max' of the row two positions earlier
# the shift is positional, so for a donor with a missing stage "previous"
# means the previous stage that is present
df124_max['s1_max'] = df124_max.groupby('donor_id')['max'].shift(1)
df124_max['s2_max'] = df124_max.groupby('donor_id')['max'].shift(2)


In [10]:
# offset to add to each stage so that timestamps keep increasing across stages

# sum of the shifted maxima of each row; fill_value=0 lets a single missing
# value count as 0, the sum is NaN only when both are missing
prev_total = df124_max['s1_max'].add(df124_max['s2_max'], fill_value=0)

# stage 1 is the start of the combined series: nothing is added
df124_max.loc[df124_max['stage'] == 1, 'add_timestamp'] = 0

# later stages start after the preceding maxima plus a small gap of
# 0.04 * (stage - 1)  (0.04 is presumably the sampling step in seconds - TODO confirm)
for i in (2, 4):
    df124_max.loc[df124_max['stage'] == i, 'add_timestamp'] = prev_total + 0.04*(i-1)


In [11]:
# reduce the helper table to the merge keys and the offset column
df124_max = df124_max.loc[:, ['donor_id', 'stage', 'add_timestamp']]

In [12]:
# attach to every row the number of seconds to add for its (donor, stage)
df124_final = df_124.merge(df124_max, on=['donor_id', 'stage'], how='left')

# shift the timestamps by that offset (a missing offset counts as 0), which
# gives a continuous data flow from stage 1 to stage 4
df124_final = df124_final.assign(
    timestamp=df124_final['timestamp'].add(df124_final['add_timestamp'], fill_value=0)
)

In [13]:
# the stage label and the offset are no longer needed once timestamps are shifted
df124_final = df124_final.drop(columns=['stage', 'add_timestamp'])

In [14]:
# write the combined raw intensities to a csv file (row index not written)
output_raw = 'dataout\\df124_intensities_raw.csv'
df124_final.to_csv(path_or_buf=output_raw, index=False)

In [15]:
# number of rows (datapoints) in the combined dataset
len(df124_final)

3362818

In [16]:
# non-missing observations per column
df124_final.count(axis='index')

timestamp    3362818
au01_r       3362818
au02_r       3362818
au04_r       3362818
au05_r       3362818
au06_r       3362818
au07_r       3362818
au09_r       3362818
au10_r       3362818
au12_r       3362818
au14_r       3362818
au15_r       3362818
au17_r       3362818
au20_r       3362818
au23_r       3362818
au25_r       3362818
au26_r       3362818
au45_r       3362818
donor_id     3362818
dtype: int64

In [17]:
# missing (NaN) values per column
df124_final.isnull().sum(axis=0)

timestamp    0
au01_r       0
au02_r       0
au04_r       0
au05_r       0
au06_r       0
au07_r       0
au09_r       0
au10_r       0
au12_r       0
au14_r       0
au15_r       0
au17_r       0
au20_r       0
au23_r       0
au25_r       0
au26_r       0
au45_r       0
donor_id     0
dtype: int64

#### Use the tsfresh package to produce descriptive statistics for each donor (stages combined)

In [18]:
# feature calculators requested from tsfresh; every calculator is mapped to
# None (no extra arguments are supplied for it)
ts_parameters = dict.fromkeys(
    [
        "sum_values",
        "variance",
        "standard_deviation",
        "mean",
        "median",
        "minimum",
        "maximum",
        "mean_change",
        "root_mean_square",
    ]
)

In [19]:
# csv file the extracted features are written to
output = 'dataout\\intensity_combined.csv'

# summary statistics per donor_id, with rows ordered by timestamp;
# the resulting index is then turned into a regular column
summary_features = extract_features(
    df124_final,
    default_fc_parameters=ts_parameters,
    column_id="donor_id",
    column_sort="timestamp",
).reset_index()
    

Feature Extraction: 100%|██████████| 30/30 [00:07<00:00,  4.08it/s]


In [20]:
# the former index column is named 'index' after the reset; call it donor_id
summary_features = summary_features.rename(columns={'index': 'donor_id'})

# write the extracted features to the csv file and preview the first rows
summary_features.to_csv(output, index=False)
summary_features.head()

Unnamed: 0,donor_id,au01_r__sum_values,au01_r__variance,au01_r__standard_deviation,au01_r__mean,au01_r__median,au01_r__minimum,au01_r__maximum,au01_r__mean_change,au01_r__root_mean_square,...,au26_r__root_mean_square,au45_r__sum_values,au45_r__variance,au45_r__standard_deviation,au45_r__mean,au45_r__median,au45_r__minimum,au45_r__maximum,au45_r__mean_change,au45_r__root_mean_square
0,5,1569.79,0.243053,0.493004,0.259469,0.03,0.0,4.48,0.000321,0.557115,...,0.507946,2100.68,0.44558,0.667518,0.34722,0.0,0.0,3.95,0.0,0.752424
1,6,2179.09,0.333328,0.577345,0.369337,0.03,0.0,4.12,-6.3e-05,0.685374,...,0.81529,1912.63,0.306231,0.553382,0.324175,0.02,0.0,3.17,0.000154,0.641343
2,7,2341.13,0.249784,0.499784,0.284636,0.0,0.0,4.22,-6e-05,0.575154,...,0.979199,6828.36,1.708007,1.306908,0.830196,0.05,0.0,5.0,-3.5e-05,1.5483
3,8,2934.72,0.16902,0.41112,0.278833,0.06,0.0,4.33,0.0,0.496757,...,0.965606,5096.9,0.48754,0.698241,0.484266,0.09,0.0,3.8,9.5e-05,0.849737
4,9,2074.14,0.16799,0.409866,0.249896,0.01,0.0,3.4,0.0,0.48004,...,0.799094,4667.12,0.85776,0.926153,0.562304,0.05,0.0,4.64,0.000194,1.083487


In [21]:
# further check number of observations --> some csv files are not available in the main FAINT folder

# not_available in df1--> [212, 213, 214, 215, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276]
# not_available in df2--> [49, 55, 212, 213, 214, 215, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 332]
# not_available in df4--> [56, 86, 212, 213, 214, 215, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 324, 328]

# each check reports the observed count, so a failure shows how far off it is
assert df1['donor_id'].nunique()==310, f"df1: expected 310 unique donors, found {df1['donor_id'].nunique()}"
assert df2['donor_id'].nunique()==307, f"df2: expected 307 unique donors, found {df2['donor_id'].nunique()}"
assert df4['donor_id'].nunique()==308, f"df4: expected 308 unique donors, found {df4['donor_id'].nunique()}"

In [22]:
# reload the extracted features from the csv file written earlier in this
# notebook; reading from `output` (instead of a separately hard-coded absolute
# path) guarantees the file loaded is the one that was just produced
df_master = pd.read_csv(output)

In [23]:
# preview the first five rows of the reloaded feature table
df_master.head(n=5)

Unnamed: 0,donor_id,au01_r__sum_values,au01_r__variance,au01_r__standard_deviation,au01_r__mean,au01_r__median,au01_r__minimum,au01_r__maximum,au01_r__mean_change,au01_r__root_mean_square,...,au26_r__root_mean_square,au45_r__sum_values,au45_r__variance,au45_r__standard_deviation,au45_r__mean,au45_r__median,au45_r__minimum,au45_r__maximum,au45_r__mean_change,au45_r__root_mean_square
0,5,1569.79,0.243053,0.493004,0.259469,0.03,0.0,4.48,0.000321,0.557115,...,0.507946,2100.68,0.44558,0.667518,0.34722,0.0,0.0,3.95,0.0,0.752424
1,6,2179.09,0.333328,0.577345,0.369337,0.03,0.0,4.12,-6.3e-05,0.685374,...,0.81529,1912.63,0.306231,0.553382,0.324175,0.02,0.0,3.17,0.000154,0.641343
2,7,2341.13,0.249784,0.499784,0.284636,0.0,0.0,4.22,-6e-05,0.575154,...,0.979199,6828.36,1.708007,1.306908,0.830196,0.05,0.0,5.0,-3.5e-05,1.5483
3,8,2934.72,0.16902,0.41112,0.278833,0.06,0.0,4.33,0.0,0.496757,...,0.965606,5096.9,0.48754,0.698241,0.484266,0.09,0.0,3.8,9.5e-05,0.849737
4,9,2074.14,0.16799,0.409866,0.249896,0.01,0.0,3.4,0.0,0.48004,...,0.799094,4667.12,0.85776,0.926153,0.562304,0.05,0.0,4.64,0.000194,1.083487


In [24]:
# report how many distinct donor ids the feature table contains
print("Number of unique observation:", df_master['donor_id'].nunique(), sep="\n")

Number of unique observation:
313


####  Keep master dataset for demographics, vvr scores and somatosensory amplification

In [25]:
# load the csv file with the responses to the FAINT questionnaire
df_personal = pd.read_csv('C:\\Users\\mtyur\\Desktop\\FAINT\\datain_p\\FAINT_alldatafromallparticipants.csv')

# columns kept for the demographics table
keep_var = [
    'ID', 'Condition', 'Gender', 'Age', 'BMI',
    'Q4.1_1', 'Q4.1_2', 'Q4.1_3', 'Q4.1_4', 'Q4.1_5',
    'Q4.1_6', 'Q4.1_7', 'Q4.1_8', 'Q4.1_9', 'Q4.1_10',
]

# take these columns from the rows of time point 4 (per the original note the
# demographics are the same across time points - not verified here) and
# renumber the index from 0
df_personal_master = (
    df_personal.loc[df_personal['Time_point'] == 4, keep_var]
    .reset_index(drop=True)
)

In [26]:
# keep the rows of time points 4 to 7 with the donor id and the two vvr subscores
df_personal_long = df_personal.loc[
    df_personal['Time_point'].isin([4, 5, 6, 7]),
    ['ID', 'Time_point', 'VVR_psych_tp', 'VVR_phys_tp'],
].copy()

# wide format: one row per ID, one column per (subscore, time point), averaging
# duplicates. The value columns are passed as a list rather than a set so the
# request has a defined order; pivot_table sorts the resulting columns by
# default, so the output layout is unchanged.
df_personal_wide = df_personal_long.pivot_table(
    index='ID',
    columns='Time_point',
    values=['VVR_phys_tp', 'VVR_psych_tp'],
    aggfunc='mean',
)

# flatten the two-level column index into '<subscore>_<time point>' names
df_personal_wide.columns = ['_'.join(map(str, col)).strip() for col in df_personal_wide.columns.values]

# turn the ID index back into a regular column
df_personal_wide = df_personal_wide.reset_index()

In [27]:
# combine the demographics with the wide vvr scores; the outer join keeps IDs
# that appear in only one of the two tables
df_master_p = df_personal_master.merge(df_personal_wide, on='ID', how='outer')

In [28]:
# use donor_id as the key name, matching the AU intensity table it is merged with
df_master_p = df_master_p.rename(columns={'ID': 'donor_id'})

####  Merge master dataset on facial action units with master dataset on personal questionnaire 

In [29]:
# merge the preprocessed personal dataset with the AU intensity features;
# the outer join keeps donors that appear in only one of the two tables
df_final_raw = pd.merge(df_master_p, df_master, on='donor_id', how= 'outer')

# write the final master dataset to a csv file
# (backslash escaped like the other output paths: '\p' is an invalid escape
# sequence; the resulting path string is the same)
output_final = 'dataout\\preprocessed_faint.csv'
df_final_raw.to_csv(output_final, index=False)

In [30]:
# preview the first five rows of the final merged dataset
df_final_raw.head(n=5)

Unnamed: 0,donor_id,Condition,Gender,Age,BMI,Q4.1_1,Q4.1_2,Q4.1_3,Q4.1_4,Q4.1_5,...,au26_r__root_mean_square,au45_r__sum_values,au45_r__variance,au45_r__standard_deviation,au45_r__mean,au45_r__median,au45_r__minimum,au45_r__maximum,au45_r__mean_change,au45_r__root_mean_square
0,5,1.0,2.0,33.0,25.306932,2.0,3.0,4.0,2.0,4.0,...,0.507946,2100.68,0.44558,0.667518,0.34722,0.0,0.0,3.95,0.0,0.752424
1,6,2.0,1.0,33.0,27.76343,2.0,2.0,3.0,1.0,1.0,...,0.81529,1912.63,0.306231,0.553382,0.324175,0.02,0.0,3.17,0.000154,0.641343
2,7,2.0,2.0,25.0,23.999459,1.0,1.0,1.0,4.0,1.0,...,0.979199,6828.36,1.708007,1.306908,0.830196,0.05,0.0,5.0,-3.5e-05,1.5483
3,8,1.0,1.0,56.0,27.458654,1.0,2.0,4.0,1.0,1.0,...,0.965606,5096.9,0.48754,0.698241,0.484266,0.09,0.0,3.8,9.5e-05,0.849737
4,9,1.0,1.0,41.0,26.122449,1.0,5.0,4.0,1.0,1.0,...,0.799094,4667.12,0.85776,0.926153,0.562304,0.05,0.0,4.64,0.000194,1.083487
