In [1]:
%%capture
from pathlib import Path

if Path.cwd().stem == "models":
    %cd ../..
    %load_ext autoreload
    %autoreload 2

In [61]:
import logging
from datetime import datetime
from pathlib import Path

import holoviews as hv
import hvplot.polars  # noqa
import matplotlib.pyplot as plt
import numpy as np
import optuna
import polars as pl
import torch
import torch.nn as nn
import torch.optim as optim
from icecream import ic
from polars import col
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader, TensorDataset

from src.data.database_manager import DatabaseManager
from src.features.utils import to_describe
from src.log_config import configure_logging
from src.models.utils import StandardScaler3D

configure_logging(
    stream_level=logging.DEBUG,
    ignore_libs=["matplotlib", "Comm", "bokeh", "tornado"],
)

pl.Config.set_tbl_rows(12)  # for the 12 trials
hv.output(widget_location="bottom", size=130)

db = DatabaseManager()

In [3]:
with db:
    labels = db.get_table("Labels")
    eda = db.get_table("Feature_EDA")

In [4]:
eda.hvplot(x="timestamp", y=["eda_raw"], groupby="trial_id", width=800, height=400)

BokehModel(combine_events=True, render_bundle={'docs_json': {'39a3b0bd-9afd-47fb-a213-be2eec995c04': {'version…

In [5]:
labels = labels.with_columns(
    # Add time counter for decreases and strictly increases
    (
        pl.when(col("strictly_increasing_intervals") != 0)
        .then(
            col("timestamp")
            - col("timestamp").min().over("strictly_increasing_intervals")
        )
        .otherwise(None)
    ).alias("normalized_timestamp_increases"),
    (
        pl.when(col("decreasing_intervals") != 0)
        .then(col("timestamp") - col("timestamp").min().over("decreasing_intervals"))
        .otherwise(None)
    ).alias("normalized_timestamp_decreases"),
    # Only keep the first 5 seconds
).filter(
    (col("normalized_timestamp_increases") < 5000)
    | (col("normalized_timestamp_decreases") < 5000)
)
labels

trial_id,trial_number,participant_id,rownumber,timestamp,temperature,rating,stimulus_seed,skin_area,normalized_timestamp,decreasing_intervals,major_decreasing_intervals,increasing_intervals,plateau_intervals,prolonged_minima_intervals,strictly_increasing_intervals,normalized_timestamp_increases,normalized_timestamp_decreases
u16,u8,u8,u32,f64,f64,f64,u16,u8,f64,u16,u16,u16,u16,u16,u16,f64,f64
1,1,1,320,326250.8398,0.80056,0.83375,396,1,32026.5088,1,0,0,0,0,0,,0.0
1,1,1,321,326351.5703,0.800524,0.83375,396,1,32127.2393,1,0,0,0,0,0,,100.7305
1,1,1,322,326453.3964,0.800415,0.8375,396,1,32229.0654,1,0,0,0,0,0,,202.5566
1,1,1,323,326551.3829,0.800232,0.83875,396,1,32327.0519,1,0,0,0,0,0,,300.5431
1,1,1,324,326651.1161,0.799977,0.83875,396,1,32426.7851,1,0,0,0,0,0,,400.2763
1,1,1,325,326751.5133,0.799649,0.83875,396,1,32527.1823,1,0,0,0,0,0,,500.6735
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
332,12,28,21586,2.7746e6,0.25889,0.80625,133,1,177496.8039,1660,0,0,0,0,0,,4403.8851
332,12,28,21587,2.7747e6,0.251239,0.80625,133,1,177596.5374,1660,0,0,0,0,0,,4503.6186
332,12,28,21588,2.7748e6,0.243786,0.81375,133,1,177698.2663,1660,0,0,0,0,0,,4605.3475


In [6]:
# Split data into decreasing and increasing intervals to add labels and sample ids
decreases = labels.filter(
    col("normalized_timestamp_decreases").is_not_null()
).with_columns(
    pl.lit(1).alias("label").cast(pl.UInt8),
    col("decreasing_intervals").alias("sample_id"),
)

In [7]:
increases = labels.filter(
    col("normalized_timestamp_increases").is_not_null()
).with_columns(
    pl.lit(0).alias("label").cast(pl.UInt8),
    (
        col("strictly_increasing_intervals")
        + (decreases.select(pl.last("decreasing_intervals")))  # continue from decreases
    ).alias("sample_id"),
)

In [8]:
# Join the two tables
labels = decreases.vstack(increases).sort("sample_id", "timestamp")

In [9]:
# Normalize the data
ROWS_PER_SAMPLE = 50

labels = (
    labels.sort(["sample_id"])  # Sort within each group if needed
    .group_by("sample_id", maintain_order=True)
    .agg(pl.all().head(ROWS_PER_SAMPLE))
    .explode(pl.all().exclude("sample_id"))  # Explode the result back into rows
)

labels.select(pl.last("sample_id")).item() * ROWS_PER_SAMPLE, labels.height

(132800, 132796)

In [10]:
# Sanity check
if not labels.height == labels.select(pl.last("sample_id")).item() * ROWS_PER_SAMPLE:
    affected_samples = []
    for sample_id, group in labels.group_by("sample_id", maintain_order=True):
        if group.height < ROWS_PER_SAMPLE or group.height > ROWS_PER_SAMPLE:
            affected_samples.append(sample_id[0])  # sample_id is a tuple
logging.debug(
    f"Normalizing to equal {ROWS_PER_SAMPLE} rows per sample was not successful for the following samples: {affected_samples}"
)
labels = labels.filter(~col("sample_id").is_in(affected_samples))

12:09:42 | [36mDEBUG   [0m| root | Normalizing to equal 50 rows per sample was not successful for the following samples: [1310, 2143, 2428, 2641]


In [11]:
labels = labels.select(
    "sample_id",
    "participant_id",
    "rating",
    "label",
)
labels

sample_id,participant_id,rating,label
u16,u8,f64,u8
1,1,0.83375,1
1,1,0.83375,1
1,1,0.8375,1
1,1,0.83875,1
1,1,0.83875,1
1,1,0.83875,1
…,…,…,…
2656,28,0.66375,0
2656,28,0.6675,0
2656,28,0.6675,0


In [57]:
# x has the dimensions (samples, time steps, features)
x = labels.group_by("sample_id").agg("rating").get_column("rating").to_numpy()
# reshaping the array
# NOTE: for univariate data right now, need to be adapted for multivariate
x = np.vstack(x)
x = np.expand_dims(x, axis=2)
y = (
    labels.group_by("sample_id")
    .agg((col("label").first()))
    .get_column("label")
    .to_numpy()
)
groups = (
    labels.group_by("sample_id")
    .agg((col("participant_id").first()))
    .get_column("participant_id")
    .to_numpy()
)

In [63]:
np.save("data/x.npy", x)
np.save("data/y.npy", y)
np.save("data/groups.npy", groups)

In [2]:
train, test = train_test_split(labels, test_size=0.2, random_state=42)
train

NameError: name 'train_test_split' is not defined