In [1]:
%%capture
# Setup cell: `%%capture` swallows everything this cell would print or display.
from pathlib import Path

# If the kernel was started inside a directory named "notebooks", step one
# level up (presumably to the project root so that `src.*` imports resolve —
# TODO confirm) and enable autoreload so edited modules are re-imported
# before each execution.
# NOTE: autoreload is only activated inside this branch, i.e. not when the
# kernel already starts outside of a "notebooks" directory.
if Path.cwd().stem == "notebooks":
    %cd ..
    %load_ext autoreload
    %autoreload 2

In [2]:
import logging

import holoviews as hv
import hvplot.polars  # noqa
import neurokit2 as nk
import pandas as pd
import polars as pl
from icecream import ic
from polars import col

from src.data.database_manager import DatabaseManager
from src.features.quality_checks import check_sample_rate
from src.features.resampling import add_timestamp_μs_column
from src.features.scaling import scale_min_max
from src.features.transforming import map_trials, merge_data_dfs
from src.log_config import configure_logging
from src.plots.utils import prepare_multiline_hvplot

# Configure project logging: DEBUG level for the stream output; the listed
# third-party libraries are passed as `ignore_libs` (presumably to silence
# their log records — see src.log_config).
configure_logging(
    stream_level=logging.DEBUG, ignore_libs=("Comm", "bokeh", "tornado", "matplotlib")
)
# Name the logger after the last dotted component of the module name.
logger = logging.getLogger(__name__.rsplit(".", maxsplit=1)[-1])

pl.Config.set_tbl_rows(12)  # for the 12 trials
# Holoviews display options: widgets below the plot, plots scaled to 130 %.
hv.output(widget_location="bottom", size=130)

In [3]:
# Database handle; it is entered as a context manager (`with db:`) when tables are read.
db = DatabaseManager()

In [54]:
# Read the stimulus features and the trial table inside the database context.
with db:
    stimulus = db.get_table("feature_stimulus")
    trials = db.get_table("trials")  # get trials for stimulus seeds


# Merge both tables on the participant/trial keys and drop the listed
# trial-level columns; strict=False means columns that are absent are
# skipped instead of raising.
df = merge_data_dfs(
    [stimulus, trials],
    merge_on=["participant_id", "trial_id", "trial_number"],
).drop("duration", "skin_area", "timestamp_start", "timestamp_end", strict=False)
df

trial_id,trial_number,participant_id,rownumber,timestamp,temperature,rating,stimulus_seed
u16,u8,u8,u32,f64,f64,f64,u16
1,1,1,0,294224.331,0.0,0.425,396
1,1,1,1,294357.9645,0.000069,0.425,396
1,1,1,2,294458.0292,0.000277,0.35375,396
1,1,1,3,294558.6006,0.000622,0.14875,396
1,1,1,4,294658.3354,0.001106,0.10125,396
1,1,1,5,294758.4957,0.001728,0.2275,396
…,…,…,…,…,…,…,…
332,12,28,21606,2.7766e6,0.158607,0.85,133
332,12,28,21607,2.7767e6,0.157223,0.85,133
332,12,28,21608,2.7768e6,0.156232,0.85,133


In [55]:
def zero_based_timestamps(df: pl.DataFrame) -> pl.DataFrame:
    """Add a ``zeroed_timestamp`` column that restarts at 0 for every trial.

    For each row the smallest ``timestamp`` of its ``trial_id`` is subtracted
    from the row's own ``timestamp``. All existing columns are kept.
    """
    trial_start = col("timestamp").min().over("trial_id")
    elapsed = col("timestamp") - trial_start
    return df.with_columns(elapsed.alias("zeroed_timestamp"))


def aggregate_over_seeds(
    df: pl.DataFrame,
    bin_size: int = 1,
) -> pl.DataFrame:
    """Aggregate rating and temperature per stimulus seed in fixed-width time bins.

    Timestamps are zeroed per trial first, so that samples of different trials
    sharing a ``stimulus_seed`` end up in the same bins. Binning is done with
    ``group_by_dynamic`` on an integer microsecond column.

    Args:
        df: Frame providing the columns ``trial_id``, ``stimulus_seed``,
            ``timestamp`` (milliseconds), ``rating`` and ``temperature``.
        bin_size: Width of one time bin in whole seconds. Defaults to 1.

    Returns:
        One row per ``stimulus_seed`` and time bin with the columns
        ``avg_rating``, ``avg_temperature``, ``std_dev`` (of the rating),
        ``sample_size`` and ``time_bin`` (start of the bin in seconds),
        sorted by ``stimulus_seed`` and ``time_bin``. Bins starting at 180 s
        or later are dropped.

    Raises:
        ValueError: If ``bin_size`` is smaller than 1.
    """
    # Note: without group_by_dynamic, this would be something like
    # >>> df.with_columns(
    # >>>     [(col("zeroed_timestamp") // 1000).cast(pl.Int32).alias("time_bin")]
    # >>>     )
    # >>>     .group_by(["stimulus_seed", "time_bin"])
    if bin_size < 1:
        raise ValueError(f"bin_size must be at least 1 second, got {bin_size!r}")
    # Window width in microseconds, the unit of the integer index column
    bin_width_us = int(bin_size * 1_000_000)

    # Zero-based timestamp in milliseconds
    df = zero_based_timestamps(df)
    # Add microsecond timestamp column for better precision as group_by_dynamic uses int
    # (the helper is expected to add the "zeroed_timestamp_µs" column used below)
    df = add_timestamp_μs_column(df, "zeroed_timestamp")
    return (
        (
            # group_by_dynamic requires the index column to be sorted
            df.sort("zeroed_timestamp_µs")
            .group_by_dynamic(
                "zeroed_timestamp_µs",
                every=f"{bin_width_us}i",  # "i" suffix = integer index units
                group_by=["stimulus_seed"],
            )
            .agg(
                [
                    col("rating").mean().alias("avg_rating"),
                    col("temperature").mean().alias("avg_temperature"),
                    col("rating").std().alias("std_dev"),
                    pl.len().alias("sample_size"),
                ]
            )
        )
        .with_columns(
            # Window start in microseconds -> start of the bin in seconds
            (col("zeroed_timestamp_µs") / 1_000_000).cast(pl.Int16).alias("time_bin")
        )
        .sort("stimulus_seed", "time_bin")
        # remove measures at exactly 180s so that they don't get their own bin
        .filter(col("time_bin") < 180)
        .drop("zeroed_timestamp_µs")
    )


# Display the per-seed aggregates for the merged data with the default bin size.
aggregate_over_seeds(df)

stimulus_seed,avg_rating,avg_temperature,std_dev,sample_size,time_bin
u16,f64,f64,f64,u32,i16
133,0.432017,0.007027,0.260652,282,0
133,0.580223,0.052573,0.219553,280,1
133,0.619137,0.141308,0.224974,281,2
133,0.657968,0.265075,0.218392,283,3
133,0.697452,0.412534,0.204328,287,4
133,0.736053,0.567752,0.180708,285,5
…,…,…,…,…,…
952,0.147058,0.212601,0.212265,280,174
952,0.120062,0.161466,0.196615,280,175
952,0.103728,0.119274,0.185324,280,176


In [None]:
# Plain `group_by` variant of `aggregate_over_seeds`, with `time_bin` expressed
# in milliseconds (floor of the zeroed timestamp to full seconds).
# `zeroed_timestamp` is derived here explicitly: the merged `df` does not carry
# that column, so the cell must not rely on leftover kernel state.
result = (
    zero_based_timestamps(df)
    .with_columns((col("zeroed_timestamp") // 1000 * 1000).alias("time_bin"))
    .group_by(["stimulus_seed", "time_bin"])
    .agg(
        [
            col("rating").mean().alias("avg_rating"),
            col("temperature").mean().alias("avg_temperature"),
            col("rating").std().alias("std_dev"),
            pl.len().alias("sample_size"),
        ]
    )
    .sort("stimulus_seed", "time_bin")
    # remove measures at exactly 180s so that they don't get their own bin
    .filter(col("time_bin") < 180000)
)
result

stimulus_seed,time_bin,avg_rating,avg_temperature,std_dev,sample_size
u16,f64,f64,f64,f64,u32
133,0.0,0.432017,0.007027,0.260652,282
133,1000.0,0.580223,0.052573,0.219553,280
133,2000.0,0.619137,0.141308,0.224974,281
133,3000.0,0.657968,0.265075,0.218392,283
133,4000.0,0.697452,0.412534,0.204328,287
133,5000.0,0.736053,0.567752,0.180708,285
…,…,…,…,…,…
952,174000.0,0.147058,0.212601,0.212265,280
952,175000.0,0.120062,0.161466,0.196615,280
952,176000.0,0.103728,0.119274,0.185324,280


In [17]:
# NOTE: in some edge cases, there is exactly one sample for timestamp=180s.
# TODO: Maybe they should be removed.

# Half-width of the normal-approximation 95 % confidence interval of the mean
ci_half_width = 1.96 * (col("std_dev") / col("sample_size").sqrt())

result = (
    # Derive `zeroed_timestamp` explicitly; the merged `df` does not carry it,
    # so the cell must not rely on leftover kernel state.
    zero_based_timestamps(df)
    .with_columns((col("zeroed_timestamp") // 1000 * 1000).alias("time_bin"))
    .group_by(["stimulus_seed", "time_bin"])
    .agg(
        [
            col("rating").mean().alias("avg_rating"),
            col("temperature").mean().alias("avg_temperature"),
            col("rating").std().alias("std_dev"),
            pl.len().alias("sample_size"),
        ]
    )
    .with_columns(
        [
            (col("avg_rating") - ci_half_width).alias("ci_lower"),
            (col("avg_rating") + ci_half_width).alias("ci_upper"),
        ]
    )
    .select(
        [
            "stimulus_seed",
            "time_bin",
            "avg_rating",
            "ci_lower",
            "ci_upper",
            "avg_temperature",
        ]
    )
    .sort(["stimulus_seed", "time_bin"])
)
# Same frame under a second name; `results` is displayed in the following cell.
results = result

# Use the hvplot accessor for both layers so they can be overlaid with `*`
# (`DataFrame.plot` is not guaranteed to be hvplot-backed in every polars version).
result.hvplot(
    x="time_bin",
    y=["avg_rating", "avg_temperature"],
    groupby="stimulus_seed",
    kind="line",
    grid=True,
) * result.hvplot.area(
    x="time_bin",
    y="ci_lower",
    y2="ci_upper",
    groupby="stimulus_seed",
    alpha=0.5,
    line_width=0,
    fill_color="lightblue",
    grid=True,
    ylim=(0, 1),
    xlim=(0, 180 * 1000),
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'922462a6-7952-4dee-95e9-fa8cf1e5e7c4': {'version…

In [None]:
# Show the aggregated ratings together with their confidence bounds.
results

stimulus_seed,time_bin,avg_rating,ci_lower,ci_upper,avg_temperature
u16,f64,f64,f64,f64,f64
133,0.0,0.432017,0.401594,0.462439,0.007027
133,1000.0,0.580223,0.554506,0.60594,0.052573
133,2000.0,0.619137,0.592832,0.645442,0.141308
133,3000.0,0.657968,0.632523,0.683413,0.265075
133,4000.0,0.697452,0.673812,0.721092,0.412534
133,5000.0,0.736053,0.715072,0.757033,0.567752
…,…,…,…,…,…
952,175000.0,0.120062,0.097033,0.143092,0.161466
952,176000.0,0.103728,0.08202,0.125435,0.119274
952,177000.0,0.09138,0.070636,0.112124,0.087116
