# Feature engineering
Feature engineering is used to create extra variables form the exisiting psychological signals to have extra training data. This is only applied ot the `physio_trans_data_session.pickle`, because those are the independent variables for the ML models.

In [None]:
# import dependencies
import pickle
import os
import pandas as pd
import numpy as np
from scipy.signal import find_peaks

BASE_DIR = os.getcwd()  # Works in Jupyter
print(BASE_DIR)

# specify paths
questionnaire_path = "../data/raw/2_Questionnaire/Transformed/quest_trans_data_segments.pickle"
physio_path = "../data/raw/3_Physio/Transformed/physio_trans_data_segments.pickle"
annotation_path = "../data/raw/4_Annotation/Transformed/ann_trans_data_segments.pickle"


c:\Users\Zita\Repositories\affective-states


# Original dataset

In [104]:
with open(physio_path, 'rb') as f:
    data = pickle.load(f)

# Inspect the type and structure
print(type(data))  # Check the data type

<class 'dict'>


In [74]:
data.keys()

dict_keys(['filt_EDA', 'filt_PPG', 'ts', 'sampling_rate', 'packet_number', 'EDR', 'hr', 'raw_EDA', 'raw_PPG', 'hr_idx', 'EDA_quality_idx', 'PPG_quality_idx'])

## Dictionary keys explanation
**filt_EDA** : Filtered signal of Electro Dermal Activity, which assesses the naturally occurring changes in electrical properties of human skin, measure sweat gland activity.

**filt_PPG** : The photoplethysmographic (PPG) signal is defined as oscillations in light transmission through a tissue.  It provides a continuous signal that can be analyzed to derive different cardiovascular metrics, including heart rate.

**ts** : Timestamps in seconds.

**packet_number** : Unknown and not relevant.

**raw_EDA** : Raw signal of EDA. (not used)

**sampling_rate** : The amount of samples per second. It is 100 for all segments, which means that the time between each sample is 0.1 seconds, as you can see by the calculation in the cell below.

**raw_PPG** : Raw singal of PPG. (not used)

**hr** : Heart rate in bpm (beats per minutes.

**EDR** : Electro Dermal Response which is the derivative of EDA.

**hr_idx** : Derivate of heart rate.

**EDA_quality_idx** : Quality index of the EDA signal defined by a float between 0 and 240.

**PPG_quality_idx** : Quality index of the PPG singal defined by a float between 0 and 240.

This dataset is sourced from Boda et al. (2024) and bad quality data has already been discarded and the dataset is cleaned. There can be missing values and the segments have different lenghts. We will use feature engineering to add more features to this dataset.

In [75]:
print(data["sampling_rate"])

[np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(100), np.int64(

In [76]:
# calculate sampling rate

sampling_rate = data["sampling_rate"][0]  # seconds^-1
dt = 1.0 / sampling_rate
print(dt) # this matches what we got above

0.01


In [77]:
print("Number of instances: " + str(len(data['filt_EDA']) ))

Number of instances: 1481


In [78]:
print(data['filt_EDA'][1])

print("\n Example of filtered EDA signal length (of instance 1): " +  str(len(data['filt_EDA'][1])))

[0. 0. 0. ... 0. 0. 0.]

 Example of filtered EDA signal length (of instance 1): 2000


# Feature engineering
The following feature are added:

**SCL** : 

**SCR** : Phasic EDA signal.

**hr** : Heart rate in bpm

**HRV** : Heart Rate Variability

**EDA peaks**:


TODO: finish the definitions for all the new features

In [79]:
# make data into a df_physio dataframe
df_physio = pd.DataFrame(dict([ (key, pd.Series(val)) for key, val in data.items() ]))

# Display the resulting DataFrame
df_physio.head()

Unnamed: 0,filt_EDA,filt_PPG,ts,sampling_rate,packet_number,EDR,hr,raw_EDA,raw_PPG,hr_idx,EDA_quality_idx,PPG_quality_idx
0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-600.060815151149, -595.2853451896902, -587.9...","[5.0, 5.01, 5.0200000000000005, 5.03, 5.04, 5....",100,"[8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[71.2523979172376, 68.60128444958119, 66.18906...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1191.0, 1193.0, 1200.0, 1204.0, 1206.0, 1216....","[187, 276, 368, 459, 542, 625, 707, 788, 866, ...",5.0,0.0
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1627.5018611126122, 1609.5223131853395, 1600....","[621.0, 621.01, 621.02, 621.03, 621.04, 621.05...",100,"[8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[70.66836951894423, 70.15177388417735, 71.8546...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3535.0, 3533.0, 3534.0, 3536.0, 3534.0, 3536....","[173, 254, 343, 424, 508, 611, 713, 813, 910, ...",6.0,1.0
2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1483.4034056640082, 1513.4885977757735, 1543....","[467.0, 467.01, 467.02, 467.03000000000003, 46...",100,"[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[83.51101634683724, 74.71290165712789, 71.1779...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[3536.0, 3536.0, 3536.0, 3536.0, 3536.0, 3539....","[150, 234, 329, 405, 493, 580, 666, 758, 829, ...",8.0,2.0
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-876.333693217288, -879.9930179232609, -878.3...","[313.0, 313.01, 313.02, 313.03000000000003, 31...",100,"[8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[66.20553359683794, 73.45191040843214, 71.9148...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[607.0, 595.0, 594.0, 601.0, 608.0, 624.0, 643...","[126, 214, 283, 382, 470, 547, 648, 724, 811, ...",9.0,3.0
4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[-1236.631858494471, -1220.5943338457344, -119...","[159.0, 159.01, 159.02, 159.03, 159.04, 159.05...",100,"[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[82.36125839519265, 76.63156510230421, 71.7424...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[442.0, 448.0, 465.0, 479.0, 502.0, 533.0, 560...","[106, 188, 274, 357, 427, 510, 593, 672, 759, ...",10.0,4.0


In [80]:
def arr_calculation(df, col, type, name=None):
    if name is None:
        name = type + "_" + col

    if type == "mean":
        df[name] = np.array([np.mean(arr) if len(arr) > 0 else np.nan for arr in df[col]])
    elif type == "std":
        df[name] = np.array([np.std(arr) if len(arr) > 0 else np.nan for arr in df[col]])
    elif type == "max":     
        df[name] = np.array([np.max(arr) if len(arr) > 0 else np.nan for arr in df[col]])
    elif type == "min":
        df[name] = np.array([np.min(arr) if len(arr) > 0 else np.nan for arr in df[col]])  
    elif type == "gradient":
        df[name] = [np.gradient(arr) if len(arr) > 0 else [] for arr in df_physio[col]] 
    elif type == "diff":
        df[name] = [np.diff(arr) if len(arr) > 0 else [] for arr in df_physio[col]]

    return df


def statistics(df, col):
    df = arr_calculation(df, col, "mean")
    df = arr_calculation(df, col, "std")
    df = arr_calculation(df, col, "max")
    df = arr_calculation(df, col, "min")
    return df

In [81]:
# add summary statistics on EDA to the dataframe
df_physio = statistics(df_physio, 'filt_EDA')
# compute the first derivate on EDA and add summary statistics on filt_EDA_dot to the dataframe
df_physio = arr_calculation(df_physio, 'filt_EDA', 'gradient', 'filt_EDA_dot')
df_physio = statistics(df_physio, 'filt_EDA_dot')
# compute the second derivative on EDA and add summary statistics on filt_EDA_dott to the dataframe
df_physio = arr_calculation(df_physio, 'filt_EDA_dot', 'gradient', 'filt_EDA_ddot')
df_physio = statistics(df_physio, 'filt_EDA_ddot')

In [82]:
# add summary statistics on EDR to the dataframe
df_physio = statistics(df_physio, 'EDR')
# compute the first derivate on EDR and add summary statistics on EDR_dot to the dataframe
df_physio = arr_calculation(df_physio, 'EDR', 'gradient', 'EDR_dot')
df_physio = statistics(df_physio, 'EDR_dot')
# compute the second derivative on EDR and add summary statistics on EDR_dott to the dataframe
df_physio = arr_calculation(df_physio, 'EDR_dot', 'gradient', 'EDR_ddot')
df_physio = statistics(df_physio, 'EDR_ddot')

In [83]:
# Create a time vector for the HR values (if hr_idx are valid indices into time_segment)
df_physio['hr_time'] = df_physio.apply(
    lambda row: row['ts'][row['hr_idx']] if len(row['hr_idx']) > 0 else [],     # copies the time stamps form ts to hr_time if there is a valid hr_idx
    axis=1
)

df_physio = statistics(df_physio, 'hr')

# compute the first derivative on HR and add summary statistics on hr_dot to the dataframe
df_physio['hr'] = df_physio['hr'].apply(lambda x: x if len(x) > 1 else [np.nan, np.nan])
df_physio = arr_calculation(df_physio, 'hr', 'gradient', 'hr_dot')

df_physio = statistics(df_physio, 'hr_dot')



In [84]:
# adding features on RR & HRV

df_physio = arr_calculation(df_physio, 'hr_time', 'diff')                                       # time in seconds between heart rate measurements
df_physio = statistics(df_physio, 'diff_hr_time')
df_physio = arr_calculation(df_physio, 'diff_hr_time', 'diff', 'successive_diff_hr_time')       # first derivative of diff_hr_time

bm_rr_int_count = [len(arr) > 1 for arr in df_physio["diff_hr_time"]]                           # count the number of RR intervals in each segment
                                                                                                # and check if there are at least 2 intervals    
df_physio["SDNN"] = [
    np.std(arr) * 1000 if valid else np.nan
    for arr, valid in zip(df_physio["diff_hr_time"], bm_rr_int_count)
] 

df_physio["rMSSD"] = [
    np.sqrt(np.mean(arr**2)) * 1000 if valid else np.nan
    for arr, valid in zip(df_physio["successive_diff_hr_time"], bm_rr_int_count)
]

In [85]:
peaks, properties = find_peaks(df_physio["filt_EDA"][1], height=0.1)
print(np.mean(df_physio["filt_EDA"][1][peaks]))
print(np.mean(properties["peak_heights"]))

nan
nan


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [86]:
# adding features on EDA peaks

peak_counts, mean_peak_amp, std_peak_amp = [], [], []

for i in range(len(df_physio)):
    if len(df_physio["filt_EDA"][i]) > 0:
        peaks, properties = find_peaks(df_physio["filt_EDA"][i], height=0.1)
        peak_amplitudes = df_physio["filt_EDA"][i][peaks]
        peak_counts.append(int(len(peaks)))
        mean_peak_amp.append(np.mean(properties["peak_heights"]))
        std_peak_amp.append(np.std(properties["peak_heights"]))

df_physio[["n_peaks_EDA", "mean_peak_amp_EDA", "std_peak_amp_EDA"]] = pd.DataFrame({
    "n_peaks_EDA": peak_counts,
    "mean_peak_amp_EDA": mean_peak_amp,
    "std_peak_amp_EDA": std_peak_amp
}).values

  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


# Save data in one dataframe
All column with lists are removed as they cannot be used as model input.


In [87]:
# opening annotated data
with open(annotation_path, 'rb') as f:
    annotation_data = pickle.load(f)

# Inspect the type and structure
print(type(annotation_data))  # Check the data type
print(annotation_data.keys())

<class 'dict'>
dict_keys(['ar_seg', 'vl_seg', 'unc_seg', 'ts_seg'])


In [88]:
with open(questionnaire_path, 'rb') as f:
    quest_data = pickle.load(f)

print(type(quest_data))  # Check the data type
print(quest_data.keys())
print(len(quest_data["ID"]))

<class 'dict'>
dict_keys(['ID', 'device'])
1481


In [89]:
# Add the labels to the dataframe

df_physio[['ar_seg', 'vl_seg', 'unc_seg', 'ts_seg', 'ID', 'device']] = pd.DataFrame({
    "ar_seg": annotation_data["ar_seg"],
    "vl_seg": annotation_data["vl_seg"],
    "unc_seg": annotation_data["unc_seg"],
    "ts_seg": annotation_data["ts_seg"],
    "ID": quest_data["ID"],
    "device": quest_data["device"]
}).values

In [None]:
# Drop the columns that are of type array
df = df_physio.drop(columns=["filt_EDA", "EDR", "hr", "ts", "hr_idx", "hr_time", "diff_hr_time", "successive_diff_hr_time", "filt_PPG", "sampling_rate", "packet_number", "raw_EDA", "raw_PPG", "filt_EDA_dot"])

In [91]:
df.head()


Unnamed: 0,EDA_quality_idx,PPG_quality_idx,mean_filt_EDA,std_filt_EDA,max_filt_EDA,min_filt_EDA,filt_EDA_dot,mean_filt_EDA_dot,std_filt_EDA_dot,max_filt_EDA_dot,...,rMSSD,n_peaks_EDA,mean_peak_amp_EDA,std_peak_amp_EDA,ar_seg,vl_seg,unc_seg,ts_seg,ID,device
0,5.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,...,76.811457,0.0,,,2.0,4.0,,5.0,1,12
1,6.0,1.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,...,84.323937,0.0,,,2.0,4.0,,621.0,1,12
2,8.0,2.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,...,110.45361,0.0,,,3.0,4.0,,467.0,1,12
3,9.0,3.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,...,137.058382,0.0,,,3.0,4.0,,313.0,1,12
4,10.0,4.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,0.0,...,80.960147,0.0,,,3.0,4.0,,159.0,1,12


In [93]:
df.describe(include='all')  # Display all columns in the DataFrame

Unnamed: 0,EDA_quality_idx,PPG_quality_idx,mean_filt_EDA,std_filt_EDA,max_filt_EDA,min_filt_EDA,filt_EDA_dot,mean_filt_EDA_dot,std_filt_EDA_dot,max_filt_EDA_dot,...,rMSSD,n_peaks_EDA,mean_peak_amp_EDA,std_peak_amp_EDA,ar_seg,vl_seg,unc_seg,ts_seg,ID,device
count,1270.0,1213.0,1481.0,1481.0,1481.0,1481.0,1481,1481.0,1481.0,1481.0,...,1229.0,1481.0,1175.0,1175.0,1481.0,1481.0,251.0,1481.0,1481.0,1481.0
unique,,,,,,,1481,,,,...,,,,,5.0,5.0,5.0,1254.0,191.0,11.0
top,,,,,,,"[0.004170039784639812, 0.0042075818374200935, ...",,,,...,,,,,3.0,3.0,1.0,5.0,28.0,34.0
freq,,,,,,,1,,,,...,,,,,463.0,562.0,93.0,11.0,83.0,160.0
mean,764.859843,758.239901,832.1076,98.432209,1063.041435,697.670659,,0.02629414,0.623954,2.485187,...,126.846606,4.008103,968.347956,73.694489,,,,,,
std,423.272925,458.663607,665.4763,161.071655,820.985816,608.898847,,0.21969,1.309673,5.321005,...,141.382275,4.547044,709.889161,146.965663,,,,,,
min,5.0,0.0,-7.359384e-33,0.0,0.0,-0.004448,,-1.700899,0.0,-0.212733,...,9.229582,0.0,0.186771,0.0,,,,,,
25%,397.5,325.0,297.5628,15.863498,463.91262,162.102846,,-0.03627039,0.03945,0.090221,...,43.169855,1.0,438.16525,2.126256,,,,,,
50%,781.5,855.0,750.5492,46.019174,947.543958,630.272887,,1.041039e-13,0.186826,0.632475,...,91.485328,3.0,864.763931,17.580717,,,,,,
75%,1130.75,1168.0,1227.434,109.661175,1519.567496,1032.942743,,0.07592171,0.611346,2.346875,...,164.012195,5.0,1368.467225,70.075211,,,,,,


In [111]:
# Ensure the directory exists
output_path = os.path.join("c:/Users/Zita/Repositories/affective-states/data/processed", "processed_data.pkl")
df.to_pickle(output_path)
print("Data saved to pickle file successfully.")


Data saved to pickle file successfully.
