In [132]:
%%capture
from pathlib import Path

if Path.cwd().stem == "notebooks":
    %cd ..
    %load_ext autoreload
    %autoreload 2

In [133]:
import logging
from pathlib import Path

import altair as alt
import holoviews as hv
import hvplot.polars  # noqa
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
from polars import col

from src.data.database_manager import DatabaseManager
from src.features.labels import add_labels, process_labels
from src.features.resampling import add_timestamp_μs_column, interpolate_and_fill_nulls
from src.features.scaling import scale_min_max
from src.features.transforming import map_trials, merge_dfs
from src.features.utils import to_describe
from src.log_config import configure_logging
from src.plots.confidence_intervals import plot_confidence_intervals
from src.plots.correlations import (
    aggregate_correlations_fisher_z,
    calculate_correlations_by_trial,
    plot_correlations_by_participant,
    plot_correlations_by_trial,
)
from src.plots.utils import prepare_multiline_hvplot

logger = logging.getLogger(__name__.rsplit(".", maxsplit=1)[-1])
configure_logging(
    stream_level=logging.DEBUG,
    ignore_libs=["matplotlib", "Comm", "bokeh", "tornado", "param", "numba"],
)

pl.Config.set_tbl_rows(12)  # for the 12 trials
hv.output(widget_location="bottom", size=130)

In [134]:
db = DatabaseManager()

In [135]:
exclude_invalid_trials = False
with db:
    eda = db.get_table("Feature_EDA", exclude_invalid_trials)
    stimulus = db.get_table("Feature_Stimulus", exclude_invalid_trials)
    trials = db.get_table("Trials", exclude_invalid_trials)

stimulus = add_timestamp_μs_column(stimulus)
eda = add_timestamp_μs_column(eda)

df = merge_dfs([eda, stimulus])
df = add_timestamp_µs_column(df)
df

trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_raw,eda_tonic,eda_phasic,timestamp_µs,temperature,rating,timestamp_µs_right
u16,u8,u8,u32,f64,i64,f64,f64,f64,i64,f64,f64,i64
1,1,1,37660,294210.3603,57892,0.743774,0.743503,0.000271,294210360,,,
1,1,1,,294224.331,,,,,294224331,0.0,0.425,294224331
1,1,1,37673,294340.0132,57905,0.745042,0.743523,0.001519,294340013,,,
1,1,1,,294357.9645,,,,,294357964,0.000069,0.425,294357964
1,1,1,37687,294421.1276,57919,0.744819,0.743543,0.001276,294421127,,,
1,1,1,,294458.0292,,,,,294458029,0.000277,0.35375,294458029
…,…,…,…,…,…,…,…,…,…,…,…,…
332,12,28,,2.7769e6,,,,,2776927121,0.155637,0.85,2776927121
332,12,28,355456,2.7769e6,467053,13.50079,13.42346,-0.025488,2776943078,,,
332,12,28,355470,2.7770e6,467067,13.514941,13.423458,-0.017623,2777017878,,,


99.9982537825655

In [None]:
stimulus.with_columns(
    col("timestamp").diff().mean().over("trial_id").alias("diff_mean")
).get_column("diff_mean").unique().mean()

stimulus.with_columns(
    col("timestamp").diff().std().over("trial_id").alias("diff_std")
).get_column("diff_std").unique().mean()

4.1968711161609304

In [138]:
stimulus


trial_id,trial_number,participant_id,rownumber,timestamp,temperature,rating,timestamp_µs
u16,u8,u8,u32,f64,f64,f64,i64
1,1,1,0,294224.331,0.0,0.425,294224331
1,1,1,1,294357.9645,0.000069,0.425,294357964
1,1,1,2,294458.0292,0.000277,0.35375,294458029
1,1,1,3,294558.6006,0.000622,0.14875,294558600
1,1,1,4,294658.3354,0.001106,0.10125,294658335
1,1,1,5,294758.4957,0.001728,0.2275,294758495
…,…,…,…,…,…,…,…
332,12,28,21606,2.7766e6,0.158607,0.85,2776626924
332,12,28,21607,2.7767e6,0.157223,0.85,2776728651
332,12,28,21608,2.7768e6,0.156232,0.85,2776826390


In [139]:
stim = stimulus.with_columns(
    col("timestamp").round(0).alias("timestamp_resampling").cast(pl.Int64)
)
stim = stim.with_columns(original_sampling=pl.lit(True))
stim = stim.upsample(
    time_column="timestamp_resampling",
    every="10i",
    maintain_order=True,
    group_by="trial_id",
).with_columns(
    # do not lose crucial information
    pl.col(pl.selectors.INTEGER_DTYPES).forward_fill()
)

stim

timestamp_resampling,trial_id,trial_number,participant_id,rownumber,timestamp,temperature,rating,timestamp_µs,original_sampling
i64,u16,u8,u8,u32,f64,f64,f64,i64,bool
294224,1,1,1,0,294224.331,0.0,0.425,294224331,true
294234,1,1,1,0,,,,294224331,
294244,1,1,1,0,,,,294224331,
294254,1,1,1,0,,,,294224331,
294264,1,1,1,0,,,,294224331,
294274,1,1,1,0,,,,294224331,
…,…,…,…,…,…,…,…,…,…
2777069,332,12,28,21607,,,,2776728651,
2777079,332,12,28,21607,,,,2776728651,
2777089,332,12,28,21607,,,,2776728651,


In [178]:
st = interpolate_and_fill_nulls(stim, time_column="timestamp_resampling")
s = st.with_columns(
    pl.when(pl.col("original_sampling"))
    .then(pl.col("original_sampling"))
    .otherwise(False)
)  # .filter(col("original_sampling") == False)
s

timestamp_resampling,trial_id,trial_number,participant_id,rownumber,timestamp,temperature,rating,timestamp_µs,original_sampling
i64,u16,u8,u8,u32,f64,f64,f64,i64,bool
294224,1,1,1,0,294224.331,0.0,0.425,294224331,true
294234,1,1,1,0,294234.329218,0.000219,0.425745,294224331,false
294244,1,1,1,0,294244.327435,0.000437,0.42649,294224331,false
294254,1,1,1,0,294254.325653,0.000656,0.427234,294224331,false
294264,1,1,1,0,294264.323871,0.000874,0.427979,294224331,false
294274,1,1,1,0,294274.322088,0.001093,0.428724,294224331,false
…,…,…,…,…,…,…,…,…,…
2777069,332,12,28,21607,2.7767e6,0.157223,0.85,2776728651,false
2777079,332,12,28,21607,2.7767e6,0.157223,0.85,2776728651,false
2777089,332,12,28,21607,2.7767e6,0.157223,0.85,2776728651,false


In [183]:
s.with_columns(
    col("timestamp_resampling").diff().over("trial_id").alias("diff")
).get_column("diff").std()


0.0

In [None]:
t = eda.filter(col("trial_id") == 1)
t = t.with_columns(old_sampling=pl.lit(True))
# t = t.with_columns(col("timestamp").cast(pl.Duration("ms")))
t = t.upsample(time_column="timestamp_µs", every="10i", maintain_order=True)

In [None]:
t

timestamp_µs,trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_raw,eda_tonic,eda_phasic,old_sampling
i64,u16,u8,u8,u32,f64,i64,f64,f64,f64,bool
294210360,1,1,1,37660,294210.3603,57892,0.743774,0.743503,0.000271,true
294210370,,,,,,,,,,
294210380,,,,,,,,,,
294210390,,,,,,,,,,
294210400,,,,,,,,,,
294210410,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…
474111910,,,,,,,,,,
474111920,,,,,,,,,,
474111930,,,,,,,,,,


In [None]:
from datetime import datetime

a = pl.DataFrame(
    {
        "time": [
            datetime(2021, 2, 1),
            datetime(2021, 4, 1),
            datetime(2021, 5, 1),
            datetime(2021, 6, 1),
        ],
        "groups": ["A", "B", "A", "B"],
        "values": [0, 1, 2, 3],
    }
).set_sorted("time")
# a.upsample(
#     time_column="time", every="1mo", group_by="groups", maintain_order=True
# ).select(pl.all().forward_fill())
a

time,groups,values
datetime[μs],str,i64
2021-02-01 00:00:00,"""A""",0
2021-04-01 00:00:00,"""B""",1
2021-05-01 00:00:00,"""A""",2
2021-06-01 00:00:00,"""B""",3


In [None]:
294210360

294210360

In [None]:
from datetime import datetime

df = pl.DataFrame(
    {
        "time": [
            datetime(2021, 2, 1),
            datetime(2021, 4, 1),
            datetime(2021, 5, 1),
            datetime(2021, 6, 1),
        ],
        "groups": ["A", "B", "A", "B"],
        "values": [0, 1, 2, 3],
    }
).set_sorted("time")
df.upsample(
    time_column="time", every="1mo", group_by="groups", maintain_order=True
).select(pl.all().forward_fill())

time,groups,values
datetime[μs],str,i64
2021-02-01 00:00:00,"""A""",0
2021-03-01 00:00:00,"""A""",0
2021-04-01 00:00:00,"""A""",0
2021-05-01 00:00:00,"""A""",2
2021-04-01 00:00:00,"""B""",1
2021-05-01 00:00:00,"""B""",1
2021-06-01 00:00:00,"""B""",3
