## Sample Script for 1 df
### In this notebook we unfold the steps of the transformation pipeline

In [13]:
import os
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd()
if PROJECT_DIR.stem == 'data':
    PROJECT_DIR = PROJECT_DIR.parents[1]
    sys.path.insert(0, PROJECT_DIR.as_posix())
    os.chdir(PROJECT_DIR.as_posix())
    %load_ext autoreload
    %autoreload 2

## First, in polars

In [14]:
import os
from pathlib import Path
from functools import reduce
from dataclasses import dataclass
from typing import Dict, List
import logging

import numpy as np
import pandas as pd
import polars as pl
import neurokit2 as nk

from src.data.process_data import load_participant_datasets
from src.data.transform_data import apply_func_participant
from src.data.config_data import DataConfigBase
from src.data.config_data_imotions import iMotionsConfig, IMOTIONS_LIST
from src.data.config_data_raw import RawConfig, RAW_LIST
from src.data.config_participant import ParticipantConfig, PARTICIPANT_LIST

from src.log_config import configure_logging
configure_logging()

pl.Config.set_tbl_rows(7) # don't print too many rows in the book


polars.config.Config

In [15]:
list_of_data_configs = [
    IMOTIONS_LIST,
    # RAW_LIST,
    # TRIAL_LIST,
]

participant_data = load_participant_datasets(PARTICIPANT_LIST[0], IMOTIONS_LIST)
participant_data.eda

Timestamp,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate
f64,f64,f64,i64
50.8622,3648.307692,7.343876,100
50.8622,3625.005128,7.33877,100
51.8618,3654.133333,7.33877,100
…,…,…,…
2.4026e6,3627.917949,0.00999,99
2.4026e6,3625.005128,0.00999,99
2.4026e6,3606.071795,0.00999,99


### before we can anything we need to merge trial information

In [16]:
trial = participant_data.trial
temperature = participant_data.temperature

new = trial.join(
    trial,
    on='Timestamp',
    how='outer_coalesce',
    # join_nulls=True,
).sort(
    'Timestamp'
)
new

Timestamp,Stimuli_Seed,Stimuli_Seed_right
f64,i64,i64
188041.074,1,1
468085.8797,1,1
549706.8485,2,2
…,…,…
1.9799e6,5,5
2.0235e6,6,6
2.3036e6,6,6


In [17]:
# Special case for imotions data: we need to add the stimuli seed column
# datasets are missing the trial information (via Stimuli_Seed) and need to be merged with the trial data first
for data_config in IMOTIONS_LIST:
    # add the stimuli seed column to all datasets of the participant except for the trial data which already has it
    if "Stimuli_Seed" not in participant_data.datasets[data_config.name].dataset.columns:
        participant_data.datasets[data_config.name].dataset = participant_data.datasets[data_config.name].dataset.join(
            participant_data.trial,
            on='Timestamp',
            how='outer_coalesce',
        ).sort('Timestamp')
    assert participant_data.datasets[data_config.name].dataset['Timestamp'].is_sorted(descending=False)


participant_data.eda

Timestamp,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,Stimuli_Seed
f64,f64,f64,i64,i64
50.8622,3648.307692,7.343876,100,
50.8622,3625.005128,7.33877,100,
51.8618,3654.133333,7.33877,100,
…,…,…,…,…
2.4026e6,3627.917949,0.00999,99,
2.4026e6,3625.005128,0.00999,99,
2.4026e6,3606.071795,0.00999,99,


### merge trial information

In [18]:
def create_trials(df):
    # TODO: maybe we need to interpolate here for the nan at the start and end of each trial
    # Check if all trials are complete  # TODO
    # Forward fill and backward fill columns
    ffill = df['Stimuli_Seed'].fill_null(strategy='forward')
    bfill = df['Stimuli_Seed'].fill_null(strategy='backward')
    # Where forward fill and backward fill are equal, replace the NaNs in the original Stimuli_Seed
    # this is the same as np.where(ffill == bfill, ffill, df['Stimuli_Seed'])
    df = df.with_columns(
        pl.when(ffill == bfill)
        .then(ffill)
        .otherwise(df['Stimuli_Seed'])
        .alias('Stimuli_Seed')
    )
    assert df['Timestamp'].is_sorted(descending=False)
    # Only keep rows where the Stimuli_Seed is not NaN
    df = df.filter(df['Stimuli_Seed'].is_not_null())
    # Create a new column that contains the trial number
    df = df.with_columns(
        pl.col('Stimuli_Seed')
        .diff()                     # Calculate differences
        .fill_null(value=0)         # Replace initial null with 0 because the first trial is always 0
        .ne(0)                      # Check for non-zero differences
        .cum_sum()                  # Cumulative sum of boolean values
        .cast(pl.UInt8)             # Cast to UInt8
        .alias('Trial')             # Rename the series to 'Trial'
    )
    return df

participant_data = apply_func_participant(participant_data, create_trials)
participant_data.ecg

Timestamp,ECG_d_Battery,ECG_LL-RA,ECG_LA-RA,ECG_Vx-RL,ECG_LL-RA_HeartRate,ECG_LL-RA_IBI,ECG_d_PacketReceptionRate,Stimuli_Seed,Trial
f64,f64,f64,f64,f64,f64,f64,i64,i64,u8
188041.074,,,,,,,,1,0
188112.8797,3756.082051,0.06289,-0.915007,-603.386639,66.349892,-1.0,60,1,0
188112.8797,3756.082051,0.062674,-0.92056,-603.369763,66.349892,-1.0,60,1,0
…,…,…,…,…,…,…,…,…,…
2.3035e6,3747.34359,-1.150196,-1.294295,-603.277375,66.637744,-1.0,73,6,5
2.3035e6,3734.235897,-1.150989,-1.286217,-603.271172,66.637744,-1.0,73,6,5
2.3036e6,,,,,,,,6,5


In [19]:
def interpolate_to_marker_timestamps(df):
    # Define a custom function for the transformation
    def replace_timestamps(group_df):
        """
        We define the timestamp where the marker was send as the first measurement timestamp 
        of the device to have the exact same trial duration for each modality 
        (different devices have different sampling rates). This shifts the data by about 5 ms
        and could be interpreted as an interpolation.
        """
        # Get the first and last timestamp of the group
        # TODO: NOTE: there is a difference between using the integer indexing and boolean indexing below
        # - we should decide depending on how duplicate timestamps are handled
        # especially in what order duplicate timestamps are removed
        first_timestamp = group_df["Timestamp"][0]
        second_timestamp = group_df["Timestamp"][1]
        second_to_last_timestamp = group_df["Timestamp"][-2]
        last_timestamp = group_df["Timestamp"][-1]
        
        # Replace the second and second-to-last timestamps
        return group_df.with_columns(
            pl.when(pl.col("Timestamp") == group_df["Timestamp"][1])
            .then(first_timestamp)
            .when(pl.col("Timestamp") == group_df["Timestamp"][-2])
            .then(last_timestamp)
            .otherwise(pl.col("Timestamp"))
            .alias("Timestamp")
        ).drop_nulls()

    # Only if there are nulls in the df
    if sum(df.null_count()).item() == 0:
        return df
    # Apply the custom function to each group
    return df.group_by("Trial", maintain_order=True).map_groups(replace_timestamps)
    
participant_data = apply_func_participant(participant_data, interpolate_to_marker_timestamps)
participant_data.ecg.filter(pl.col('Trial') == 0)


Timestamp,ECG_d_Battery,ECG_LL-RA,ECG_LA-RA,ECG_Vx-RL,ECG_LL-RA_HeartRate,ECG_LL-RA_IBI,ECG_d_PacketReceptionRate,Stimuli_Seed,Trial
f64,f64,f64,f64,f64,f64,f64,i64,i64,u8
188041.074,3756.082051,0.06289,-0.915007,-603.386639,66.349892,-1.0,60,1,0
188041.074,3756.082051,0.062674,-0.92056,-603.369763,66.349892,-1.0,60,1,0
188113.8763,3753.169231,0.064116,-0.917892,-603.375965,66.349892,-1.0,60,1,0
…,…,…,…,…,…,…,…,…,…
468085.8797,3742.974359,-3.69032,-1.435653,-603.619881,66.064516,-1.0,68,1,0
468085.8797,3756.082051,-3.718014,-1.43623,-603.616635,66.064516,-1.0,68,1,0
468085.8797,3726.953846,-3.689526,-1.392885,-603.606899,66.064516,-1.0,68,1,0


beautiful

### add timedelta column just to see how it looks

In [20]:
def add_time_column(df):
    """Create a new column that contains the time from Timestamp in ms."""
    df = df.with_columns(
        df['Timestamp']
        .cast(pl.Duration(time_unit='ms'))
        .alias('Time')
    )
    return df

participant_data = apply_func_participant(participant_data, add_time_column)


In [21]:
participant_data.eda

Timestamp,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,Stimuli_Seed,Trial,Time
f64,f64,f64,i64,i64,u8,duration[ms]
188041.074,3638.112821,6.057357,100,1,0,3m 8s 41ms
188049.0512,3659.958974,6.062463,100,1,0,3m 8s 49ms
188049.0512,3655.589744,6.057357,100,1,0,3m 8s 49ms
…,…,…,…,…,…,…
2.3035e6,3630.830769,4.515577,99,6,5,38m 23s 536ms
2.3036e6,3617.723077,4.520683,99,6,5,38m 23s 571ms
2.3036e6,3616.266667,4.520683,99,6,5,38m 23s 571ms


looks amazing

### calculate sampling rate to help to decide between up or down sampling

In [22]:

# TODO: add more quality checks based on non-trialized imotions data
# - packet less
# - battery

def quality_check_sample_rate(df):
    timestamp_start = df.group_by("Trial").agg([
        pl.first('Timestamp'),
    ]).sort('Trial').drop('Trial')
    timestamp_end = df.group_by("Trial").agg([
        pl.last('Timestamp'),
    ]).sort('Trial').drop('Trial')
    duration_in_s = (timestamp_end-timestamp_start) / 1000

    samples = df.group_by("Trial").agg([
        pl.count('Timestamp'),
    ]).sort('Trial').drop('Trial')

    sample_rate_per_trial = (samples/duration_in_s)
    sample_rate_mean = (sample_rate_per_trial).mean().item()
    coeff_of_variation = ((sample_rate_per_trial).std() / (sample_rate_per_trial).mean() * 100).item()
    
    logging.debug(f"Sample rate per trial: {np.round(sample_rate_per_trial.to_numpy().flatten(), 2)}")    
    logging.info(f"The mean sample rate is {(sample_rate_mean):.2f}.")
    if coeff_of_variation > 0.5:
        logging.warning(f"Sample rate varies more than 0.5% between trials: {coeff_of_variation:.2f}%.")
    # TODO: write to participant config json?

for data in participant_data.datasets.keys():
    quality_check_sample_rate(participant_data(data))

17:57:12 |[32m INFO    [0m| root | The mean sample rate is 0.01.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 57.44.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 57.45.


17:57:12 |[32m INFO    [0m| root | The mean sample rate is 128.01.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 510.79.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 499.97.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 59.93.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 9.79.
17:57:12 |[32m INFO    [0m| root | The mean sample rate is 0.97.


In [23]:
participant_data.eda

Timestamp,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,Stimuli_Seed,Trial,Time
f64,f64,f64,i64,i64,u8,duration[ms]
188041.074,3638.112821,6.057357,100,1,0,3m 8s 41ms
188049.0512,3659.958974,6.062463,100,1,0,3m 8s 49ms
188049.0512,3655.589744,6.057357,100,1,0,3m 8s 49ms
…,…,…,…,…,…,…
2.3035e6,3630.830769,4.515577,99,6,5,38m 23s 536ms
2.3036e6,3617.723077,4.520683,99,6,5,38m 23s 571ms
2.3036e6,3616.266667,4.520683,99,6,5,38m 23s 571ms


### interpolate & resample (comes later maybe)

In [12]:
load_participant_datasets(PARTICIPANT_LIST[0], RAW_LIST)

Participant(id=001_pilot_bjoern, datasets=dict_keys(['temperature', 'rating', 'eda', 'ecg', 'eeg', 'pupillometry', 'affectiva']))

In [13]:
participant_data.eda

Timestamp,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,Stimuli_Seed,Trial,Time
f64,f64,f64,i64,i64,u8,duration[ms]
188041.074,,,,1,0,3m 8s 41ms
188046.0611,3638.112821,6.057357,100,1,0,3m 8s 46ms
188049.0512,3659.958974,6.062463,100,1,0,3m 8s 49ms
…,…,…,…,…,…,…
2.3036e6,3617.723077,4.520683,99,6,5,38m 23s 551ms
2.3036e6,3616.266667,4.520683,99,6,5,38m 23s 551ms
2.3036e6,,,,6,5,38m 23s 571ms


looks beautiful

### resample based on Time column

In [15]:
import polars as pl
from datetime import timedelta

def resample_polars(df, ms):
    # Convert milliseconds to timedelta for resampling
    delta = timedelta(milliseconds=ms)

    # Ensure 'Time' is a datetime column
    df = df.with_column(pl.col("Time").str.strptime(pl.Datetime))

    # Function to perform the resampling
    def resample_group(group_df):
        # Create a range of timestamps with specified interval
        time_range = pl.date_range(low=group_df["Time"].min(), high=group_df["Time"].max(), interval=delta)

        # Perform resampling
        return (group_df
                .join(time_range, how="right", on="Time")
                .fill_none("forward")
                .groupby_dynamic("Time", every=delta)
                .mean())

    # Check if 'Trial' column is present
    if 'Trial' in df.columns:
        # Resample for each 'Trial' group
        return df.groupby("Trial").apply(resample_group)
    else:
        # Resample the entire dataframe
        return resample_group(df)

# Example usage
# df = pl.DataFrame({'Time': [...], 'Trial': [...], other columns...})
# resampled_df = resample_polars(df, 1000)  # Resample with 1000 milliseconds interval


In [17]:
raw = load_participant_datasets(PARTICIPANT_LIST[0], RAW_LIST)

raw2 = raw.temperature.join(
    raw.rating,
    on=['Timestamp','Trial'],
    how='outer_coalesce',
).sort('Timestamp')
raw2

Timestamp,Temperature,Trial,Rating
f64,f64,i64,f64
188041.074,,0,
188047.0584,38.6,0,48.083
188080.9654,,0,48.083
…,…,…,…
2.3035e6,,5,27.0
2.3036e6,,5,27.0
2.3036e6,,5,


In [18]:
raw.ecg.describe()

describe,Timestamp,ECG_LL-RA,ECG_LA-RA,ECG_Vx-RL,ECG_LL-RA_HeartRate,ECG_LL-RA_IBI,Trial
str,f64,f64,f64,f64,f64,f64,f64
"""count""",858233.0,858221.0,858221.0,858221.0,858221.0,858221.0,858233.0
"""null_count""",0.0,12.0,12.0,12.0,12.0,12.0,0.0
"""mean""",1.2495e6,-4.165392,-1.160292,-603.086763,68.784282,0.522843,2.499848
…,…,…,…,…,…,…,…
"""50%""",1.1791e6,-3.614448,-1.174068,-603.074425,67.665198,-1.0,2.0
"""75%""",1.8397e6,-2.230866,-1.035955,-602.916334,72.969121,-1.0,4.0
"""max""",2.3036e6,1.670625,0.3821,-602.284549,94.523077,1763.671875,5.0


In [19]:
data = [raw.temperature, raw.rating, raw.eda, raw.eeg]

In [20]:
def merge_dfs(dfs: List[pl.DataFrame]) -> pl.DataFrame:
    return reduce(
        lambda left, right: 
            left.join(right, on=['Timestamp','Trial'], how='outer_coalesce')
            .sort('Timestamp'),
        dfs)

In [22]:
raw2.group_by('Trial').plot(x='Timestamp', y=['Temperature', 'Rating'])

AttributeError: 'GroupBy' object has no attribute 'plot'

In [None]:
raw.ecg.unique('Timestamp').sort('Timestamp').plot(x='Timestamp', y='ECG_LL-RA')


In [None]:
import polars as pl
import datetime
import polars as pl
import numpy as np

In [None]:
import os
import sys
from pathlib import Path
PROJECT_DIR = Path.cwd()
if PROJECT_DIR.stem == 'data':
    PROJECT_DIR = PROJECT_DIR.parents[1]
    sys.path.insert(0, PROJECT_DIR.as_posix())
    os.chdir(PROJECT_DIR.as_posix())
    %load_ext autoreload
    %autoreload 2

In [None]:
import os
from pathlib import Path
from functools import reduce
from dataclasses import dataclass
from typing import Dict, List
import logging

import pandas as pd
import polars as pl

from src.data.process_data import load_participant_datasets
from src.data.config_data import DataConfigBase
from src.data.config_data_imotions import iMotionsConfig, IMOTIONS_LIST
from src.data.config_data_raw import RawConfig, RAW_LIST
from src.data.config_data_trial import TrialConfig, TRIAL_LIST
from src.data.config_participant import ParticipantConfig, PARTICIPANT_LIST

from src.log_config import configure_logging
configure_logging()


ModuleNotFoundError: No module named 'src.data.config_data_trial'

In [None]:
@dataclass
class Data:
    """Dataclass for a single csv files"""
    name: str
    dataset: pd.DataFrame

@dataclass
class Participant:
    """Dataclass for a single participant"""
    id: str
    datasets: Dict[str, Data]
    
    def __getattr__(self, name):
        if name in self.datasets:
           return self.datasets[name].dataset
        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")


In [None]:
# Prepare data configs
TEST_DIR = Path.cwd() / 'data' / '_test'
for data_config in IMOTIONS_LIST:
    data_config.load_dir = TEST_DIR / 'imotions'
    data_config.save_dir = TEST_DIR / 'raw'
for data_config in RAW_LIST:
    data_config.load_dir = TEST_DIR / 'raw'
    data_config.save_dir = TEST_DIR / 'trial'
for data_config in TRIAL_LIST:
    data_config.load_dir = TEST_DIR / 'trial'
    data_config.save_dir = TEST_DIR / 'processed'

participant = PARTICIPANT_LIST[0]

## 1. iMotions -> Raw

### 1.1 iMotions load

In [None]:
datasets = {}
for data_config in IMOTIONS_LIST:

    # Special case for iMotions data: find the start of the data in the file
    file_path = data_config.load_dir / participant.id / f"{data_config.name_imotions}.csv"
    with open(file_path, 'r') as file:
        lines = file.readlines(2**16) # only read a few lines
        file_start_index = next(i for i, line in enumerate(lines) if "#DATA" in line) + 1

    # Load and process data
    dataset = pd.read_csv(
        file_path,
        skiprows=None if not file_start_index else file_start_index,
        usecols=lambda column: column in data_config.load_columns,
        )

    # Special case for iMotions data: we also want to rename some columns
    if isinstance(data_config, iMotionsConfig):
        dataset.rename(columns=data_config.rename_columns, inplace=True) if data_config.rename_columns else None
    dataset = Data(name=data_config.name, dataset=dataset)
    datasets[data_config.name] = dataset

participant_imotions = Participant(id=participant.id, datasets=datasets)
participant_imotions.eeg


Unnamed: 0,Timestamp,EEG_RAW_Ch1,EEG_RAW_Ch2,EEG_RAW_Ch3,EEG_RAW_Ch4,EEG_RAW_Ch5,EEG_RAW_Ch6,EEG_RAW_Ch7,EEG_RAW_Ch8
0,5.982810e+01,-4361.342773,-6395.005859,12318.420898,-2381.229004,1936.005981,6634.855957,-3910.541992,-2543.639893
1,6.009020e+01,-4367.398926,-6375.790039,12322.760742,-2373.503906,1938.581055,6633.663086,-3906.250000,-2539.920898
2,6.018110e+01,-4360.247070,-6347.513184,12331.343750,-2366.590088,1946.592041,6636.523926,-3900.813965,-2533.625977
3,6.026490e+01,-4366.063965,-6371.688965,12311.220703,-2389.525879,1925.707031,6618.690918,-3921.460938,-2553.081055
4,6.032850e+01,-4366.397949,-6389.475098,12317.467773,-2385.092041,1935.291016,6630.850098,-3913.354980,-2544.020996
...,...,...,...,...,...,...,...,...,...
1201240,2.402589e+06,-4003.382080,-6123.448242,17756.750000,-2073.955078,2945.470947,6527.232910,-4694.318848,-2134.989990
1201241,2.402589e+06,-3984.547119,-6113.053223,17771.484375,-2069.998047,2958.916992,6539.487793,-4683.780762,-2116.774902
1201242,2.402589e+06,-3987.025879,-6137.084961,17768.527344,-2075.529053,2953.481934,6533.050781,-4688.500977,-2116.489014
1201243,2.402589e+06,-3993.750000,-6142.425781,17758.228516,-2079.010010,2941.322021,6519.651855,-4693.984863,-2135.514893


### 1.2 iMotions transform

In [None]:
None

### 1.3 iMotions save -> Raw

In [None]:
for data_config in IMOTIONS_LIST:
    output_dir = data_config.save_dir / participant.id
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    file_path = output_dir / f"{participant.id}_{data_config.name}.csv"
    participant_imotions.datasets[data_config.name].dataset.to_csv(
        file_path, 
        index=True)
    logging.info(f"Dataset '{data_config.name}' for participant {participant.id} saved to {file_path}")


[18:52:06] [INFO] [root] - Dataset 'trial' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/raw/001_pilot_bjoern/001_pilot_bjoern_trial.csv


[18:52:06] [INFO] [root] - Dataset 'temperature' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/raw/001_pilot_bjoern/001_pilot_bjoern_temperature.csv
[18:52:06] [INFO] [root] - Dataset 'rating' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/raw/001_pilot_bjoern/001_pilot_bjoern_rating.csv
[18:52:07] [INFO] [root] - Dataset 'eda' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/raw/001_pilot_bjoern/001_pilot_bjoern_eda.csv


KeyboardInterrupt: 

## 2. Raw -> Trial

### 2.0 Raw config

In [None]:
for data_config in RAW_LIST:
    data_config.load_dir = TEST_DIR / 'raw'
    data_config.save_dir = TEST_DIR / 'trial'


### 2.1 Raw load

In [None]:
datasets = {}
for data_config in RAW_LIST:
    file_path = data_config.load_dir / participant.id / f"{participant.id}_{data_config.name}.csv"
    # Load and process data
    dataset = pd.read_csv(
        file_path,
        skiprows=None,
        usecols=lambda column: column in data_config.load_columns,
        )

    dataset = Data(name=data_config.name, dataset=dataset)
    datasets[data_config.name] = dataset

participant_raw = Participant(id=participant.id, datasets=datasets)
participant_raw.eeg


Unnamed: 0,Timestamp,EEG_RAW_Ch1,EEG_RAW_Ch2,EEG_RAW_Ch3,EEG_RAW_Ch4,EEG_RAW_Ch5,EEG_RAW_Ch6,EEG_RAW_Ch7,EEG_RAW_Ch8
0,5.982810e+01,-4361.342773,-6395.005859,12318.420898,-2381.229004,1936.005981,6634.855957,-3910.541992,-2543.639893
1,6.009020e+01,-4367.398926,-6375.790039,12322.760742,-2373.503906,1938.581055,6633.663086,-3906.250000,-2539.920898
2,6.018110e+01,-4360.247070,-6347.513184,12331.343750,-2366.590088,1946.592041,6636.523926,-3900.813965,-2533.625977
3,6.026490e+01,-4366.063965,-6371.688965,12311.220703,-2389.525879,1925.707031,6618.690918,-3921.460938,-2553.081055
4,6.032850e+01,-4366.397949,-6389.475098,12317.467773,-2385.092041,1935.291016,6630.850098,-3913.354980,-2544.020996
...,...,...,...,...,...,...,...,...,...
1201240,2.402589e+06,-4003.382080,-6123.448242,17756.750000,-2073.955078,2945.470947,6527.232910,-4694.318848,-2134.989990
1201241,2.402589e+06,-3984.547119,-6113.053223,17771.484375,-2069.998047,2958.916992,6539.487793,-4683.780762,-2116.774902
1201242,2.402589e+06,-3987.025879,-6137.084961,17768.527344,-2075.529053,2953.481934,6533.050781,-4688.500977,-2116.489014
1201243,2.402589e+06,-3993.750000,-6142.425781,17758.228516,-2079.010010,2941.322021,6519.651855,-4693.984863,-2135.514893


### 2.2 Raw transform

In [None]:
# Special case for trial data: we need to add the stimuli seed column
# RawConfig datasets are missing the trial information (via Stimuli_Seed) and need to be merged with the trial data first
for data_config in RAW_LIST:
    pd.options.mode.chained_assignment = None  # default='warn'
    # add the stimuli seed column to all raw datasets of the participant
    if "Stimuli_Seed" not in participant_raw.datasets[data_config.name].dataset.columns:
        participant_raw.datasets[data_config.name].dataset = pd.merge(
            participant_raw.datasets[data_config.name].dataset, 
            participant_raw.trial, 
            on='Timestamp', how='outer')
        participant_raw.datasets[data_config.name].dataset.sort_values(by=['Timestamp'], inplace=True)
        participant_raw.datasets[data_config.name].dataset.reset_index(drop=True, inplace=True)
pd.options.mode.chained_assignment = 'warn'

for data_config in RAW_LIST:
    for transformation in data_config.transformations:
        participant_raw.datasets[data_config.name].dataset = transformation(participant_raw.datasets[data_config.name].dataset)
   
participant_raw.eeg

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Stimuli_Seed'] = df['Stimuli_Seed'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Trial'] = df.Stimuli_Seed.diff().ne(0).cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Stimuli_Seed'] = df['Stimuli_Seed'].astype(int)
A value is trying to be set on a copy of a slice 

Unnamed: 0_level_0,Unnamed: 1_level_0,Timestamp,EEG_RAW_Ch1,EEG_RAW_Ch2,EEG_RAW_Ch3,EEG_RAW_Ch4,EEG_RAW_Ch5,EEG_RAW_Ch6,EEG_RAW_Ch7,EEG_RAW_Ch8,Stimuli_Seed
Trial,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0 days 00:03:08.041000,1.880411e+05,-4107.475098,-6186.104004,12735.891602,-2445.841064,2163.601074,6617.785156,-3973.197998,-2446.747070,1
1,0 days 00:03:08.055000,1.880554e+05,-4107.475098,-6186.104004,12735.891602,-2445.841064,2163.601074,6617.785156,-3973.197998,-2446.747070,1
1,0 days 00:03:08.056000,1.880556e+05,-4117.774902,-6187.868164,12731.649414,-2451.467041,2159.309082,6615.590820,-3977.537109,-2447.510010,1
1,0 days 00:03:08.057000,1.880566e+05,-4122.782227,-6187.009766,12735.654297,-2449.702881,2162.408936,6617.165039,-3974.389893,-2446.604004,1
1,0 days 00:03:08.068000,1.880679e+05,-4110.289062,-6174.469238,12745.142578,-2435.826904,2174.472900,6628.180176,-3959.273926,-2439.545898,1
...,...,...,...,...,...,...,...,...,...,...,...
6,0 days 00:38:23.533000,2.303533e+06,-4180.479004,-6212.568848,17189.695312,-2142.954102,2721.452881,6450.605957,-4731.226074,-2096.270996,6
6,0 days 00:38:23.565000,2.303565e+06,-4149.817871,-6184.912109,17211.009766,-2132.271973,2750.444092,6468.105957,-4715.347168,-2078.770996,6
6,0 days 00:38:23.567000,2.303567e+06,-4155.587891,-6183.911133,17207.623047,-2130.413086,2746.343018,6457.472168,-4726.838867,-2084.017090,6
6,0 days 00:38:23.568000,2.303568e+06,-4161.405762,-6189.823242,17209.007812,-2123.165039,2743.339111,6464.004883,-4714.917969,-2084.969971,6


### 2.3 Raw save -> Trial

In [None]:
for data_config in RAW_LIST:
    output_dir = data_config.save_dir / participant.id
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    file_path = output_dir / f"{participant.id}_{data_config.name}.csv"
    participant_imotions.datasets[data_config.name].dataset.to_csv(
        file_path, 
        index=True)
    logging.info(f"Dataset '{data_config.name}' for participant {participant.id} saved to {file_path}")


[17:50:05] [INFO] [root] - Dataset 'trial' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/trial/001_pilot_bjoern/001_pilot_bjoern_trial.csv


[17:50:05] [INFO] [root] - Dataset 'temperature' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/trial/001_pilot_bjoern/001_pilot_bjoern_temperature.csv
[17:50:05] [INFO] [root] - Dataset 'rating' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/trial/001_pilot_bjoern/001_pilot_bjoern_rating.csv
[17:50:06] [INFO] [root] - Dataset 'eda' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/trial/001_pilot_bjoern/001_pilot_bjoern_eda.csv
[17:50:11] [INFO] [root] - Dataset 'ecg' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/trial/001_pilot_bjoern/001_pilot_bjoern_ecg.csv
[17:50:17] [INFO] [root] - Dataset 'eeg' for participant 001_pilot_bjoern saved to /Users/visser/drive/PhD/Code/pain-measurement/data/_test/trial/001_pilot_bjoern/001_pilot_bjoern_eeg.csv
[17:50:18] [INFO] [root] - Dataset 'pu

Unnamed: 0_level_0,Unnamed: 1_level_0,Timestamp,EEG_RAW_Ch1,EEG_RAW_Ch2,EEG_RAW_Ch3,EEG_RAW_Ch4,EEG_RAW_Ch5,EEG_RAW_Ch6,EEG_RAW_Ch7,EEG_RAW_Ch8,Stimuli_Seed
Trial,Time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0 days 00:03:08.041000,1.880411e+05,-4107.475098,-6186.104004,12735.891602,-2445.841064,2163.601074,6617.785156,-3973.197998,-2446.747070,1
1,0 days 00:03:08.055000,1.880554e+05,-4107.475098,-6186.104004,12735.891602,-2445.841064,2163.601074,6617.785156,-3973.197998,-2446.747070,1
1,0 days 00:03:08.056000,1.880556e+05,-4117.774902,-6187.868164,12731.649414,-2451.467041,2159.309082,6615.590820,-3977.537109,-2447.510010,1
1,0 days 00:03:08.057000,1.880566e+05,-4122.782227,-6187.009766,12735.654297,-2449.702881,2162.408936,6617.165039,-3974.389893,-2446.604004,1
1,0 days 00:03:08.068000,1.880679e+05,-4110.289062,-6174.469238,12745.142578,-2435.826904,2174.472900,6628.180176,-3959.273926,-2439.545898,1
...,...,...,...,...,...,...,...,...,...,...,...
6,0 days 00:38:23.533000,2.303533e+06,-4180.479004,-6212.568848,17189.695312,-2142.954102,2721.452881,6450.605957,-4731.226074,-2096.270996,6
6,0 days 00:38:23.565000,2.303565e+06,-4149.817871,-6184.912109,17211.009766,-2132.271973,2750.444092,6468.105957,-4715.347168,-2078.770996,6
6,0 days 00:38:23.567000,2.303567e+06,-4155.587891,-6183.911133,17207.623047,-2130.413086,2746.343018,6457.472168,-4726.838867,-2084.017090,6
6,0 days 00:38:23.568000,2.303568e+06,-4161.405762,-6189.823242,17209.007812,-2123.165039,2743.339111,6464.004883,-4714.917969,-2084.969971,6


In [None]:
# Get dataframe without temperature
participant_raw.datasets['eeg'].dataset.drop(columns=['Temperature'])
participant_raw.temperature

KeyError: "['Temperature'] not found in axis"

In [None]:
def merge_participant_datasets(participant: Participant) -> pd.DataFrame:
    data_frames = [data.dataset.drop(columns=['Timestamp','Stimuli_Seed']) for data in participant.datasets.values()]
    # Use reduce to merge all DataFrames on 'Timestamp'
    merged_df = reduce(
        # pd.concat would lead to duplicate timestamps
        lambda left, right: pd.merge(left, right, on='Time', how='outer'),
        data_frames
    )
    merged_df.sort_values(by=['Time'], inplace=True)
    logging.info(f"Dataframe shape: {merged_df.shape}")
    return merged_df


[23:54:38] [INFO] [root] - Dataframe shape: (596056, 67)


Unnamed: 0_level_0,Temperature,Rating,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,ECG_d_Battery,ECG_LL-RA,ECG_LA-RA,ECG_Vx-RL,ECG_LL-RA_HeartRate,...,Blink,BlinkRate,Pitch,Yaw,Roll,Interocular Distance,CPU Sys,Memory Sys,CPU Proc,Memory Proc
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0 days 00:03:08.041000,38.600,48.083,3638.112821,6.057357,100.0,3756.082051,0.06289,-0.915007,-603.386639,66.349892,...,0.0,30.0,0.433156,1.623179,-0.515956,135.472092,37.263802,19.966390,23.124765,715.867188
0 days 00:03:08.046000,,,3638.112821,6.057357,100.0,,,,,,...,,,,,,,,,,
0 days 00:03:08.047000,38.600,48.083,,,,,,,,,...,,,,,,,,,,
0 days 00:03:08.049000,,,3659.958974,6.062463,100.0,,,,,,...,,,,,,,,,,
0 days 00:03:08.050000,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 00:38:23.562000,,,,,,,,,,,...,,,,,,,,,,
0 days 00:38:23.565000,,27.000,,,,,,,,,...,,,,,,,,,,
0 days 00:38:23.567000,,,,,,,,,,,...,,,,,,,,,,
0 days 00:38:23.568000,,,,,,,,,,,...,,,,,,,,,,


In [None]:
def merge_participant_datasets_2(participant: Participant) -> pd.DataFrame:
    data_frames = [data.dataset.reset_index().drop(columns=['Time','Stimuli_Seed','Trial']) for data in participant.datasets.values()]
    # Use reduce to merge all DataFrames on 'Timestamp'
    merged_df = reduce(
        # pd.concat would lead to duplicate timestamps
        lambda left, right: pd.merge(left, right, on='Timestamp', how='outer'),
        data_frames
    )
    merged_df.sort_values(by=['Timestamp'], inplace=True)
    logging.info(f"Dataframe shape: {merged_df.shape}")
    return merged_df

In [None]:
# %%timeit
merge_participant_datasets(participant_raw)


[00:00:05] [INFO] [root] - Dataframe shape: (596056, 67)


Unnamed: 0_level_0,Temperature,Rating,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,ECG_d_Battery,ECG_LL-RA,ECG_LA-RA,ECG_Vx-RL,ECG_LL-RA_HeartRate,...,Blink,BlinkRate,Pitch,Yaw,Roll,Interocular Distance,CPU Sys,Memory Sys,CPU Proc,Memory Proc
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0 days 00:03:08.041000,38.600,48.083,3638.112821,6.057357,100.0,3756.082051,0.06289,-0.915007,-603.386639,66.349892,...,0.0,30.0,0.433156,1.623179,-0.515956,135.472092,37.263802,19.966390,23.124765,715.867188
0 days 00:03:08.046000,,,3638.112821,6.057357,100.0,,,,,,...,,,,,,,,,,
0 days 00:03:08.047000,38.600,48.083,,,,,,,,,...,,,,,,,,,,
0 days 00:03:08.049000,,,3659.958974,6.062463,100.0,,,,,,...,,,,,,,,,,
0 days 00:03:08.050000,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0 days 00:38:23.562000,,,,,,,,,,,...,,,,,,,,,,
0 days 00:38:23.565000,,27.000,,,,,,,,,...,,,,,,,,,,
0 days 00:38:23.567000,,,,,,,,,,,...,,,,,,,,,,
0 days 00:38:23.568000,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# %%timeit
merge_participant_datasets_2(participant_raw)


[00:00:07] [INFO] [root] - Dataframe shape: (660553, 68)


Unnamed: 0,Timestamp,Temperature,Rating,EDA_d_Battery,EDA_RAW,EDA_d_PacketReceptionRate,ECG_d_Battery,ECG_LL-RA,ECG_LA-RA,ECG_Vx-RL,...,Blink,BlinkRate,Pitch,Yaw,Roll,Interocular Distance,CPU Sys,Memory Sys,CPU Proc,Memory Proc
0,1.880411e+05,38.600,48.083,3638.112821,6.057357,100.0,3756.082051,0.06289,-0.915007,-603.386639,...,0.0,30.0,0.433156,1.623179,-0.515956,135.472092,37.263802,19.966390,23.124765,715.867188
107446,1.880461e+05,,,3638.112821,6.057357,100.0,,,,,...,,,,,,,,,,
12,1.880471e+05,38.600,48.083,,,,,,,,...,,,,,,,,,,
107447,1.880491e+05,,,3659.958974,6.062463,100.0,,,,,...,,,,,,,,,,
517945,1.880501e+05,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107445,2.303565e+06,,27.000,,,,,,,,...,,,,,,,,,,
517942,2.303565e+06,,,,,,,,,,...,,,,,,,,,,
517943,2.303567e+06,,,,,,,,,,...,,,,,,,,,,
517944,2.303568e+06,,,,,,,,,,...,,,,,,,,,,


## 3. Trial -> Processed

### 3.1 Trial load

### 3.2 Trial transform

### 3.3 Trial save

In [None]:
obj = pd.Series([4, 7, -5, 3])
a = obj[obj > 0]
a, a.reset_index(drop=True)

(0    4
 1    7
 3    3
 dtype: int64,
 0    4
 1    7
 2    3
 dtype: int64)

In [None]:
animals = [
    'cat',
    'dog',
    'panda'
]
animals[0] = 'bug'
new_animals = animals
new_animals[0] = 'elephant'

print(animals)
print(new_animals)

['elephant', 'dog', 'panda']
['elephant', 'dog', 'panda']


In [None]:
a = [1, 2, 3, 4]
b = a
b[1] = 0
a,b

([1, 0, 3, 4], [1, 0, 3, 4])

In [None]:
points = np.arange(-5, 5, 0.01) # 100 equally spaced points

xs, ys = np.meshgrid(points, points)

ys.T


array([[-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       ...,
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99]])

In [None]:
arr = np.array([0, 1, 2, 3])
arr.cumsum()


array([0, 1, 3, 6])

In [None]:
xs

array([[-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       ...,
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99],
       [-5.  , -4.99, -4.98, ...,  4.97,  4.98,  4.99]])

In [None]:
participant = load_participant_datasets(PARTICIPANT_LIST[0], RAW_LIST)

UnboundLocalError: cannot access local variable 'file_start_index' where it is not associated with a value