In [37]:
%%capture
# The %%capture cell magic above suppresses this cell's output (e.g. the path that
# %cd prints).
from pathlib import Path

# If the working directory is the "notebooks" folder, move one level up —
# presumably to the project root so the `src.*` imports in the next cell resolve
# (confirm against the repository layout).
# NOTE: autoreload is only enabled inside this branch, i.e. only when the
# directory change actually happens.
if Path.cwd().stem == "notebooks":
    %cd ..
    %load_ext autoreload
    %autoreload 2

In [60]:
import logging

import duckdb
import holoviews as hv
import hvplot.polars
import neurokit2 as nk
import pandas as pd
import polars as pl
from icecream import ic
from polars import col

from src.data.database_manager import DatabaseManager
from src.features.eda import nk_process_eda
from src.features.quality_checks import check_sample_rate
from src.features.scaling import scale_min_max
from src.features.transformations import map_trials, remove_duplicate_timestamps
from src.log_config import configure_logging
from src.visualization.utils import prepare_multiline_hvplot

# Logging: DEBUG level on the stream handler; the listed third-party libraries are
# handed to `configure_logging` via `ignore_libs`.
configure_logging(
    ignore_libs=("Comm", "bokeh", "tornado", "matplotlib"),
    stream_level=logging.DEBUG,
)
# Use only the last dotted component of the module name as the logger name.
logger = logging.getLogger(__name__.rpartition(".")[2])

# Display options. Keep `hv.output` as the final statement: the last expression of a
# cell determines what the notebook renders for it.
pl.Config.set_tbl_rows(12)  # for the 12 trials
hv.output(widget_location="bottom", size=130)

In [54]:
# Database handle shared by the cells below; each read opens it with `with db:`.
db = DatabaseManager()

In [56]:
# Read the raw EDA table while the database context is open.
with db:
    df = db.read_table("Raw_eda")
# Earlier inline de-duplication, kept for reference; the helper call below replaces it.
# df = df.unique("timestamp").sort("trial_id","timestamp")  # sample_rate = 100 Hz
df = remove_duplicate_timestamps(df)
# Logs the sample rate per trial at DEBUG level (see this cell's output).
check_sample_rate(df)

12:37:53 | [36mDEBUG   [0m| quality_checks | Sample rate per trial: [ 97.89  98.81  98.71  98.69  98.84  98.98  98.52  98.75  98.61  98.9
  98.54  98.71 100.41 100.75 100.66 100.68 100.7  100.44 100.57 100.63
 100.69 100.48 100.68 100.37  99.03  99.    99.    98.93  99.23  98.69
  99.08  98.84  99.27  98.82  98.93  99.09  99.07  98.93  99.2   99.09
  99.23  99.11  99.31  98.88  98.73  98.78  98.69  98.78  98.52  98.78
  98.59  98.49  98.7   98.63  98.72  98.6   98.7   98.82  99.02  98.66
  98.65  98.63  98.92  98.97  98.9   98.81  98.9   98.85  99.17  99.14
  98.83  98.97  98.98  98.85  99.07  99.    98.79  98.75  98.76  98.7
  98.9   98.73  98.96  98.71  98.91  98.8   98.6   98.88  98.81  98.72
  98.74  98.63 100.51 100.54 100.5  100.51 100.67 100.65 100.65 100.57
 100.63 100.6  100.61 100.61  98.6   98.92  98.54  98.82  98.75  98.72
  98.69  98.85  98.63  98.82  98.57  98.54  98.93  98.83  98.93  98.93
  99.02  98.97  98.98  98.91  98.86  98.8   98.8   98.68  98.9   98.42
  98.72  

In [57]:
# NOTE(review): this entire cell is commented-out reference code and executes nothing.
# The notebook calls `nk_process_eda` imported from `src.features.eda` instead; whether
# that implementation matches this draft cannot be seen from here — confirm before
# deleting the cell.

# @map_trials
# def nk_process_eda(
#     df: pl.DataFrame,
#     sampling_rate: int = 100,
#     method: str = "neurokit",
# ) -> pl.DataFrame:
#     """
#     Process EDA signal using NeuroKit2.

#     The default method "neurokit" is based on a high-pass filter of 0.05 Hz as used in
#     the BIOPAC algorithm.

#     https://www.biopac.com/knowledge-base/phasic-eda-issue/,
#     https://github.com/neuropsychology/NeuroKit/blob/1aa8deee392f8098df4fd77a23f696c2ff2d29db/neurokit2/eda/eda_phasic.py#L141
#     """
#     eda_raw = df.get_column("eda_raw").to_numpy()
#     eda_processed: pd.DataFrame = nk.eda_phasic(
#         eda_signal=eda_raw,
#         sampling_rate=sampling_rate,
#         method=method,
#     )  # this returns EDA_Phasic and EDA_Tonic columns
#     df = df.hstack(pl.from_pandas(eda_processed))
#     return df.select(pl.all().name.to_lowercase())


# df = nk_process_eda(df)
# df.plot(
#     x="timestamp",
#     y=["eda_raw", "eda_tonic"],
#     groupby="trial_id",
# )

## Comparing neurokit / biopac / highpass filter (0.05 Hz) vs Median smoothing

From the neurokit2 documentation:
  * **High-pass filtering**: Method implemented in Biopac's Acqknowledge. The raw EDA signal
    is passed through a high pass filter with a cutoff frequency of 0.05 Hz
    (cutoff frequency can be adjusted by the ``cutoff`` argument).
  * **Median smoothing**: Method implemented in Biopac's Acqknowledge. The raw EDA signal is
    passed through a median value smoothing filter, which removes areas of rapid change. The
    phasic component is then calculated by subtracting the smoothed signal from the original.
    This method is computationally intensive and the processing time depends on the smoothing
    factor, which can be controlled by the ``smoothing_factor`` argument, set by default to
    ``4`` seconds. Higher values will produce results more rapidly.

-> The high-pass filter is much faster than median smoothing.

In [42]:
# High-pass ("neurokit") decomposition on the full frame, min-max scaled, then
# plotted with one view per trial.
(
    df.pipe(nk_process_eda, method="neurokit")
    .pipe(scale_min_max)
    .plot(
        x="timestamp",
        y=["eda_raw", "eda_phasic", "eda_tonic"],
        groupby="trial_id",
    )
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'052214c4-74d5-461a-8cb5-37a49d8f0110': {'version…

In [43]:
# Median smoothing is much slower, which is why it is only run on a single trial.
trial_id = 98

(
    df.filter(col("trial_id") == trial_id)
    .pipe(nk_process_eda, method="median")
    .pipe(scale_min_max)
    .plot(
        x="timestamp",
        y=["eda_raw", "eda_phasic", "eda_tonic"],
        groupby="trial_id",
    )
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'fd78ad2d-ac81-4331-a497-f435eca2de42': {'version…

## Other available methods:
- convex
- sparse

In [44]:
# The convex method is even slower, so it is also restricted to a single trial.
trial_id = 98

(
    df.filter(col("trial_id") == trial_id)
    .pipe(nk_process_eda, method="convex")
    .pipe(scale_min_max)
    .plot(
        x="timestamp",
        y=["eda_raw", "eda_phasic", "eda_tonic"],
        groupby="trial_id",
    )
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'4c659946-d772-4dcc-8f22-e9681e88f1c6': {'version…

In [45]:
# The sparse method is even slower, so it is also restricted to a single trial.
trial_id = 98

(
    df.filter(col("trial_id") == trial_id)
    .pipe(nk_process_eda, method="sparse")
    .pipe(scale_min_max)
    .plot(
        x="timestamp",
        y=["eda_raw", "eda_phasic", "eda_tonic"],
        groupby="trial_id",
    )
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'34f9c8f3-2de4-4054-8671-3f12317ab05c': {'version…

In [46]:
# Load the `Preprocess_eda` table; from here on `df` refers to this frame rather
# than to the raw recordings loaded earlier.
with db:
    df = db.read_table("Preprocess_eda")

# Raw signal against its tonic component, one view per trial.
df.plot(x="timestamp", y=["eda_raw", "eda_tonic"], groupby="trial_id")

BokehModel(combine_events=True, render_bundle={'docs_json': {'16d291ca-3a0e-4a7c-aabc-0c622775aa8d': {'version…

## Join with Stimulus data

In [47]:
# Join every preprocessed EDA sample with the stimulus data of the same trial.
# With DuckDB's `ASOF JOIN ... USING`, the last listed column (timestamp) is the
# inequality column and the preceding ones (trial_id) are matched for equality.
# NOTE(review): a plain ASOF JOIN behaves like an inner join, so EDA samples without
# an earlier-or-equal stimulus timestamp would be dropped — confirm this is intended.
query = """
SELECT * FROM Preprocess_EDA AS pe
ASOF JOIN Raw_Stimulus rs USING (trial_id, timestamp)
ORDER BY pe.trial_id, pe.timestamp
"""

with db:
    df = db.execute(query).pl()

# Ensure that the timestamps are unique. The distinct values are counted once on the
# column instead of materialising a de-duplicated copy of the whole frame twice.
n_unique_timestamps = df.get_column("timestamp").n_unique()
ic(df.height, n_unique_timestamps)
n_duplicate_timestamps = df.height - n_unique_timestamps
assert n_duplicate_timestamps == 0, (
    f"{n_duplicate_timestamps} duplicate timestamps after joining with the stimulus data"
)
df

ic| df.height: 5934569, df.unique("timestamp").height: 5934569
ic| df.height - df.unique("timestamp").height: 0


trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_d_battery,eda_raw,eda_d_packetreceptionrate,eda_tonic,eda_phasic,temperature,rating
u16,u8,u8,u32,f64,i64,f64,f64,i64,f64,f64,f64,f64
1,1,1,0,294224.331,57895,3677.435897,0.753247,100,0.752117,0.00113,45.75,42.5
1,1,1,0,294242.275,57896,3687.630769,0.754135,100,0.752119,0.002016,45.75,42.5
1,1,1,0,294248.2588,57898,3692.0,0.754135,100,0.752121,0.002014,45.75,42.5
1,1,1,0,294276.1835,57899,3678.892308,0.754135,100,0.752123,0.002012,45.75,42.5
1,1,1,0,294277.1819,57900,3692.0,0.752359,100,0.752125,0.000234,45.75,42.5
1,1,1,0,294309.0952,57902,3705.107692,0.752359,100,0.752127,0.000232,45.75,42.5
…,…,…,…,…,…,…,…,…,…,…,…,…
332,12,28,21610,2.7771e6,467073,3603.158974,13.679468,99,13.578895,-0.012186,45.582614,85.0
332,12,28,21610,2.7771e6,467074,3617.723077,13.679468,99,13.578895,-0.012764,45.582614,85.0
332,12,28,21610,2.7771e6,467075,3582.769231,13.674363,99,13.578895,-0.01845,45.582614,85.0


In [48]:
# Raw signal against its tonic component on the joined frame, one view per trial.
df.plot(x="timestamp", y=["eda_raw", "eda_tonic"], groupby="trial_id")

BokehModel(combine_events=True, render_bundle={'docs_json': {'23e7ba23-484d-49f6-bf8d-f4ab5d44b4a8': {'version…

In [49]:
# Same comparison after min-max scaling, now including the phasic component.
df.pipe(scale_min_max).plot(
    x="timestamp",
    y=["eda_raw", "eda_phasic", "eda_tonic"],
    groupby="trial_id",
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'b1483d2d-7c9a-4542-90b4-f39bc0355e5a': {'version…