## Sample Script for 1 df


In [57]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from dataclasses import dataclass
import plotly.express as px


PROJECT_DIR = Path.cwd()
if PROJECT_DIR.stem == 'data':
    PROJECT_DIR = PROJECT_DIR.parents[1]
    sys.path.insert(0, PROJECT_DIR.as_posix())
    %load_ext autoreload
    %autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [58]:
def flatten_list(l):
    """Flatten a list that is one level deep (i.e., a list of lists)."""
    return [item for sublist in l for item in sublist]

In [59]:
DATA_DIR = PROJECT_DIR / 'data' / 'raw' / '002_pilot_melis'


@dataclass
class SignalData:
    path: str
    imotions_columns: list # columns as named in the imotions csv file
    keep_columns: list
    plot_columns: list = None
    rename_columns: dict = None
    native_sampling_rate: float = None

    

trial = SignalData(
    path = DATA_DIR / 'ExternalMarker_ET_EventAPI_ExternDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","MarkerName","MarkerDescription","MarkerType","SceneType"],
    keep_columns = ["Timestamp","MarkerName","MarkerDescription","MarkerType"],
)

temperature = SignalData(
    path = DATA_DIR / 'TemperatureCurve_TemperatureCurve@1_ET_EventAPI_ExternDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","Temperature"],
    keep_columns = ["Timestamp","Temperature"],
    plot_columns = ["Temperature"],
)

rating = SignalData(
    path = DATA_DIR / 'RatingCurve_RatingCurve@1_ET_EventAPI_ExternDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","Rating"],
    keep_columns = ["Timestamp","Rating"],
    plot_columns = ["Rating"],
)

eda = SignalData(
    path = DATA_DIR / 'Shimmer3_GSR+_&_EDA_(D200)_Shimmer3_GSR+_&_EDA_(D200)_ET_Shimmer_ShimmerDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","SampleNumber","Timestamp RAW","Timestamp CAL","System Timestamp CAL","VSenseBatt RAW","VSenseBatt CAL","GSR RAW","GSR Resistance CAL","GSR Conductance CAL","Packet reception rate RAW"],
    keep_columns = ["Timestamp","GSR RAW","GSR Resistance CAL","GSR Conductance CAL"],
    plot_columns = ["GSR Conductance CAL"],
    native_sampling_rate = 128,
)

ecg = SignalData(
    path = DATA_DIR / 'Shimmer3_ECG_(68BF)_Shimmer3_ECG_(68BF)_ET_Shimmer_ShimmerDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","SampleNumber","Timestamp RAW","Timestamp CAL","System Timestamp CAL","VSenseBatt RAW","VSenseBatt CAL","EXG1 Status RAW","ECG LL-RA RAW","ECG LL-RA CAL","ECG LA-RA RAW","ECG LA-RA CAL","EXG2 Status RAW","ECG Vx-RL RAW","ECG Vx-RL CAL","Heart Rate ECG LL-RA ALG","IBI ECG LL-RA ALG","Packet reception rate RAW"],
    keep_columns = ["Timestamp","ECG LL-RA RAW","ECG LL-RA CAL","ECG LA-RA RAW","ECG LA-RA CAL","ECG Vx-RL RAW","ECG Vx-RL CAL","Heart Rate ECG LL-RA ALG","IBI ECG LL-RA ALG"],
    plot_columns = ["ECG LL-RA CAL","ECG LA-RA CAL","ECG Vx-RL CAL","Heart Rate ECG LL-RA ALG","IBI ECG LL-RA ALG"],
    native_sampling_rate = 512,
)

eeg = None

pupillometry = SignalData(
    path = DATA_DIR / 'ET_Eyetracker.csv',
    imotions_columns = ["RowNumber","Timestamp","ET_GazeLeftx","ET_GazeLefty","ET_GazeRightx","ET_GazeRighty","ET_PupilLeft","ET_PupilRight","ET_TimeSignal","ET_DistanceLeft","ET_DistanceRight","ET_CameraLeftX","ET_CameraLeftY","ET_CameraRightX","ET_CameraRightY"],
    keep_columns = ["Timestamp","ET_GazeLeftx","ET_GazeLefty","ET_GazeRightx","ET_GazeRighty","ET_PupilLeft","ET_PupilRight","ET_TimeSignal","ET_DistanceLeft","ET_DistanceRight","ET_CameraLeftX","ET_CameraLeftY","ET_CameraRightX","ET_CameraRightY"],
    plot_columns = ["ET_GazeLeftx","ET_GazeLefty","ET_GazeRightx","ET_GazeRighty","ET_PupilLeft","ET_PupilRight","ET_TimeSignal","ET_DistanceLeft","ET_DistanceRight","ET_CameraLeftX","ET_CameraLeftY","ET_CameraRightX","ET_CameraRightY"],
    native_sampling_rate = 60,
)

affectiva = SignalData(
    path = DATA_DIR / 'Affectiva_AFFDEX_ET_Affectiva_AffectivaCameraDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","SampleNumber","Anger","Contempt","Disgust","Fear","Joy","Sadness","Surprise","Engagement","Valence","Sentimentality","Confusion","Neutral","Attention","Brow Furrow","Brow Raise","Cheek Raise","Chin Raise","Dimpler","Eye Closure","Eye Widen","Inner Brow Raise","Jaw Drop","Lip Corner Depressor","Lip Press","Lip Pucker","Lip Stretch","Lip Suck","Lid Tighten","Mouth Open","Nose Wrinkle","Smile","Smirk","Upper Lip Raise","Blink","BlinkRate","Pitch","Yaw","Roll","Interocular Distance"],
    keep_columns = ["Timestamp","Anger","Contempt","Disgust","Fear","Joy","Sadness","Surprise","Engagement","Valence","Sentimentality","Confusion","Neutral","Attention","Brow Furrow","Brow Raise","Cheek Raise","Chin Raise","Dimpler","Eye Closure","Eye Widen","Inner Brow Raise","Jaw Drop","Lip Corner Depressor","Lip Press","Lip Pucker","Lip Stretch","Lip Suck","Lid Tighten","Mouth Open","Nose Wrinkle","Smile","Smirk","Upper Lip Raise","Blink","BlinkRate","Pitch","Yaw","Roll","Interocular Distance"],
    plot_columns = ["Anger","Contempt","Disgust","Fear","Joy","Sadness","Surprise","Engagement","Valence","Sentimentality","Confusion","Neutral","Attention","Brow Furrow","Brow Raise","Cheek Raise","Chin Raise","Dimpler","Eye Closure","Eye Widen","Inner Brow Raise","Jaw Drop","Lip Corner Depressor","Lip Press","Lip Pucker","Lip Stretch","Lip Suck","Lid Tighten","Mouth Open","Nose Wrinkle","Smile","Smirk","Upper Lip Raise","Blink","BlinkRate","Pitch","Yaw","Roll","Interocular Distance"],
    )

system = SignalData(
    path = DATA_DIR / 'System_Load_Monitor_iMotions.SysMonitor@1_ET_EventAPI_ExternDevice.csv',
    imotions_columns = ["RowNumber","Timestamp","CPU Sys","Memory Sys","CPU Proc","Memory Proc"],
    keep_columns = ["Timestamp","CPU Sys","Memory Sys","CPU Proc","Memory Proc"],
)

signal_data_list = [
    trial,
    temperature, 
    rating, 
    # eda, 
    # ecg,
    # pupillometry,
    # affectiva,
    # system,
    ]


In [60]:
def load_imotions_csv(signal_data: SignalData):
    """
    Load date from iMotions csv file and return a pandas dataframe without unwanted columns.
    """
    path = signal_data.path
    # Find the index of the line that contains '#DATA' for the pandas.read_csv() 'skiprows' parameter
    with open(path, 'r') as file:
        lines = file.readlines(2**16) # only the first few bytes 
    data_start_index = next(i for i, line in enumerate(lines) if "#DATA" in line)
    del lines
    df = pd.read_csv(
        path, 
        skiprows=data_start_index + 1,
        usecols=lambda column: column in signal_data.keep_columns,
        )
    return df


def merge_data_frames(data_frames):
    # Do not use pd.concat() because it will add duplicate time stamps
    return reduce(
        lambda left,right:
            pd.merge(left,right,on=['Timestamp'], how='outer'),
            data_frames
        ).sort_values(by=['Timestamp'])


def create_timedelta_index(df):
    """Convert the time stamp to time delta and set it as index."""
    # just casting to timedelta64[ms] is faster but less accurate
    df["Time"] = pd.to_timedelta(df["Timestamp"], unit='ms').round('ms').astype('timedelta64[ms]')
    df.set_index("Time", append=True if 'Trial' in df.index.names else False, inplace=True)
    # Remove duplicate index
    df = df[~df.index.duplicated(keep='first')]
    return df


def create_trial_index(df):
    """Create a trial index based on the MarkerDescription which contains the stimulus seed and is originally send once at the start and end of each trial."""
    # Find trial start and end
    # 1. Forward fill and backward fill columns
    ffill = df['MarkerDescription'].ffill()
    bfill = df['MarkerDescription'].bfill()
    # 2. Where forward fill and backward fill are equal, replace the NaNs in the original MarkerDescription
    df['MarkerDescription'] = np.where(ffill == bfill, ffill, df['MarkerDescription'])
    # 3. Only keep rows where the MarkerDescription is not NaN
    df = df[df['MarkerDescription'].notna()]
    df['MarkerDescription'] = df['MarkerDescription'].astype(int)
    # Create a new column that contains the trial number
    df['Trial'] = df['MarkerDescription'].diff().ne(0).cumsum()
    # Add Trial to the index
    df.set_index('Trial', append=True if 'Time' in df.index.names else False, inplace=True)
    return df


def reorder_multiindex(df):
    if ('Trial' in df.index.names) and ('Time' in df.index.names):
        df = df.reorder_levels(['Trial', 'Time'])
    return df


def get_data(signal_data_list):
    dfs = [load_imotions_csv(data) for data in signal_data_list]
    df = merge_data_frames(dfs)
    df = create_timedelta_index(df)
    df = create_trial_index(df)
    df = reorder_multiindex(df)
    return df


def interpolate_data(df, method='linear'):
    columns_to_interpolate = df.columns[(df.dtypes == float)]
    if 'Trial' in df.index.names:
        df[columns_to_interpolate] = df.groupby('Trial')[columns_to_interpolate].transform(lambda x: x.interpolate(method=method))
    else:
        df[columns_to_interpolate] = df[columns_to_interpolate].interpolate(method=method)
    return df


def standardize_data(df):
    # Exclude 'Timestamp' from the columns to be standardized
    columns_to_standardize = df.columns[(df.dtypes == float) & (df.columns != 'Timestamp')]
    if 'Trial' in df.index.names:
        df[columns_to_standardize] = df.groupby('Trial')[columns_to_standardize].transform(lambda x: (x - x.mean()) / x.std())
    else:
        df[columns_to_standardize] = (df[columns_to_standardize] - df[columns_to_standardize].mean()) / df[columns_to_standardize].std()
    return df



In [61]:
df = get_data(signal_data_list)
#df = interpolate_data(df)
#df_standardized = standardize_data(df.copy())
print('df size:', round(sys.getsizeof(df) / 1024**2), 'MB')
df


df size: 25 MB


Unnamed: 0_level_0,Unnamed: 1_level_0,Timestamp,MarkerName,MarkerDescription,MarkerType,Temperature,Rating
Trial,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0 days 00:02:13.972000,1.339723e+05,stimulus,320,S,,
1,0 days 00:02:13.977000,1.339773e+05,,320,,38.700,50.0
1,0 days 00:02:13.997000,1.339973e+05,,320,,38.700,50.0
1,0 days 00:02:14.011000,1.340112e+05,,320,,,50.0
1,0 days 00:02:14.012000,1.340122e+05,,320,,38.700,
...,...,...,...,...,...,...,...
11,0 days 01:00:51.331000,3.651331e+06,,3,,40.913,100.0
11,0 days 01:00:51.348000,3.651348e+06,,3,,40.914,100.0
11,0 days 01:00:51.365000,3.651365e+06,,3,,40.914,100.0
11,0 days 01:00:51.398000,3.651398e+06,,3,,,100.0


In [62]:
trial = 9
df_stimuli = df.loc[trial]

signals = [signal_data_list[i].plot_columns for i in range(len(signal_data_list)) if signal_data_list[i].plot_columns is not None]
signals = flatten_list(signals)

fig = px.line(df_stimuli, y=signals, x=df_stimuli['Timestamp']/1000)
fig.update_xaxes(
    title_text='Time (s)',
    tickmode='linear',
    tick0=0,
    dtick=10)
fig.show()

In [63]:
import neurokit2 as nk
import matplotlib.pyplot as plt

ecg = df_stimuli['ECG LL-RA CAL'].dropna()
ecg
processed_ecg, info = nk.ecg_process(ecg, sampling_rate=60)
# Plot
nk.ecg_plot(processed_ecg, info=info)
fig = plt.gcf()
fig.set_size_inches(10, 12, forward=True)


KeyError: 'ECG LL-RA CAL'

In [None]:
nk.ecg_quality(processed_ecg)


Too few peaks detected to compute the rate. Returning empty vector.



ValueError: cannot convert float NaN to integer

In [None]:
processed_ecg.ECG_Quality.mean()

0.77199487088504

### Sanity checks
TODO: make unit test

In [None]:
# Check stimuli length, must be >= stimuli.duration
# where we create a stimulus object from the config file with the right seed, etc.
(df.groupby('Trial').last().Timestamp - df.groupby('Trial').first().Timestamp).astype('timedelta64[ms]')


Trial
1    0 days 00:04:40.031000
2    0 days 00:04:40.013000
3    0 days 00:04:40.028000
4    0 days 00:04:40.011000
5    0 days 00:04:40.021000
6    0 days 00:04:40.022000
7    0 days 00:04:40.023000
8    0 days 00:04:40.026000
9    0 days 00:04:40.055000
10   0 days 00:04:40.010000
11   0 days 00:04:40.007000
Name: Timestamp, dtype: timedelta64[ms]