In [76]:
from pathlib import Path

if Path.cwd().stem == "notebooks":
    %cd ..
    %load_ext autoreload
    %autoreload 2

In [77]:
import logging

import duckdb
import holoviews as hv
import hvplot.polars
import neurokit2 as nk
import numpy as np
import pandas as pd
import plotly.io as pio
import polars as pl
from bokeh.io import output_notebook
from icecream import ic
from polars import col

from src.data.database_manager import DatabaseManager
from src.log_config import configure_logging
from src.visualization.plotting_utils import prepare_multiline_hvplot

configure_logging(
    stream_level=logging.DEBUG, ignore_libs=("Comm", "bokeh", "tornado", "matplotlib")
)
logger = logging.getLogger(__name__.rsplit(".", maxsplit=1)[-1])

pl.Config.set_tbl_rows(12)  # for the 12 trials

polars.config.Config

In [78]:
db = DatabaseManager()

In [85]:
query = """
from raw_eda
"""
with db:
    df = db.execute(query).pl()
df

trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_d_battery,eda_raw,eda_d_packetreceptionrate
u16,u8,u8,u32,f64,i64,f64,f64,i64
1,1,1,37660,294210.3603,57892,3686.174359,0.752359,100
1,1,1,37661,294211.3575,57893,3693.45641,0.754579,100
1,1,1,37662,294211.3575,57894,3687.630769,0.753247,100
1,1,1,37663,294224.331,57895,3677.435897,0.753247,100
1,1,1,37664,294242.275,57896,3687.630769,0.754135,100
1,1,1,37665,294242.275,57897,3694.912821,0.752359,100
…,…,…,…,…,…,…,…,…
332,12,28,355477,2.7771e6,467074,3617.723077,13.679468,99
332,12,28,355478,2.7771e6,467075,3582.769231,13.674363,99
332,12,28,355479,2.7771e6,467076,3607.528205,13.679468,99


In [94]:
eda_raw = df.get_column("eda_raw").to_numpy()
eda_processed: pd.DataFrame = nk.eda_phasic(
    eda_signal=eda_raw,
    sampling_rate=100,
    method="neurokit",
)  # returns EDA_Phasic and EDA_Tonic columns
# convert to polars struct
pl.DataFrame(eda_processed).select(pl.struct(pl.all()).alias("my_struct"))


# eda_processed = (
#     df.group_by("trial_id").agg(
#         pl.map_groups(
#             exprs=["eda_raw"],
#             function=lambda data: process_eda(data[0]),
#         ).alias("eda_components")
#     )
#     .sort("trial_id")
#     .drop("trial_id")
#     .explode("eda_components")
# )

# eda_df = df.hstack(
#     eda_processed.select(
#         pl.col("eda_components").struct.field("eda_tonic"),
#         pl.col("eda_components").struct.field("eda_phasic")
#     ),
#     in_place=True
# )
# eda_df

my_struct
struct[2]
"{0.75195,0.000409}"
"{0.751953,0.002626}"
"{0.751956,0.001291}"
"{0.751958,0.001289}"
"{0.751961,0.002174}"
"{0.751964,0.000395}"
…
"{13.599425,-0.008761}"
"{13.599425,-0.014327}"
"{13.599425,-0.009684}"


In [75]:
def process_eda(
    series: pl.Series,
    sampling_rate: int = 100,
) -> pl.Series:
    eda_raw = series.to_numpy()
    eda_processed: pd.DataFrame = nk.eda_phasic(
        eda_signal=eda_raw,
        sampling_rate=sampling_rate,
        method="neurokit",
    )  # returns EDA_Phasic and EDA_Tonic columns
    return pl.Series("eda_tonic", eda_processed["EDA_Tonic"])


eda_raw = (
    (
        df.group_by("trial_id").agg(
            pl.map_groups(
                exprs=["eda_raw"],
                function=lambda data: process_eda(data[0]),
            ).alias("eda_tonic")
        )
    )
    .sort("trial_id")
    .drop("trial_id")
    .explode("eda_tonic")
)

eda_df = df.hstack(eda_raw, in_place=True)
eda_df

DuplicateError: unable to hstack, column with name "eda_tonic" already exists

In [84]:
def process_eda(
    series: pl.Series,
    sampling_rate: int = 100,
) -> pl.Struct:
    eda_raw = series.to_numpy()
    eda_processed: pd.DataFrame = nk.eda_phasic(
        eda_signal=eda_raw,
        sampling_rate=sampling_rate,
        method="neurokit",
    )  # returns EDA_Phasic and EDA_Tonic columns
    return pl.Struct(
        [
            pl.Field("eda_tonic", eda_processed["EDA_Tonic"]),
            pl.Field("eda_phasic", eda_processed["EDA_Phasic"]),
        ]
    )


eda_processed = (
    df.group_by("trial_id")
    .agg(
        pl.map_groups(
            exprs=["eda_raw"],
            function=lambda data: process_eda(data[0]),
        ).alias("eda_components")
    )
    .sort("trial_id")
    .drop("trial_id")
    .explode("eda_components")
)

# eda_df = df.hstack(
#     eda_processed.select(
#         pl.col("eda_components").struct.field("eda_tonic"),
#         pl.col("eda_components").struct.field("eda_phasic")
#     ),
#     in_place=True
# )
# eda_df

thread 'polars-3' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-0' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-4' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-1' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-6' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-5' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-2' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-7' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: unhashable type: 'Series'
thread 'polars-0' panicked at py-polars/src/map/lazy.rs:163:19:
python function failed: 

PanicException: python function failed: unhashable type: 'Series'

In [6]:
pddf = process_eda(df.select("eda_raw").to_series())

In [7]:
df

trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_d_battery,eda_raw,eda_d_packetreceptionrate
u16,u8,u8,u32,f64,i64,f64,f64,i64
1,1,1,37660,294210.3603,57892,3686.174359,0.752359,100
1,1,1,37661,294211.3575,57893,3693.45641,0.754579,100
1,1,1,37662,294211.3575,57894,3687.630769,0.753247,100
1,1,1,37663,294224.331,57895,3677.435897,0.753247,100
1,1,1,37664,294242.275,57896,3687.630769,0.754135,100
1,1,1,37665,294242.275,57897,3694.912821,0.752359,100
…,…,…,…,…,…,…,…,…
332,12,28,355477,2.7771e6,467074,3617.723077,13.679468,99
332,12,28,355478,2.7771e6,467075,3582.769231,13.674363,99
332,12,28,355479,2.7771e6,467076,3607.528205,13.679468,99


In [8]:
eda_raw = (
    (
        df.group_by("trial_id").agg(
            pl.map_groups(
                exprs=["eda_raw"],
                function=lambda data: process_eda(data[0]),
            ).alias("eda_tonic")
        )
    )
    .sort("trial_id")
    .drop("trial_id")
    .explode("eda_tonic")
)

eda_df = df.hstack(eda_raw, in_place=True)

In [9]:
df

trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_d_battery,eda_raw,eda_d_packetreceptionrate,eda_tonic
u16,u8,u8,u32,f64,i64,f64,f64,i64,f64
1,1,1,37660,294210.3603,57892,3686.174359,0.752359,100,0.75195
1,1,1,37661,294211.3575,57893,3693.45641,0.754579,100,0.751953
1,1,1,37662,294211.3575,57894,3687.630769,0.753247,100,0.751956
1,1,1,37663,294224.331,57895,3677.435897,0.753247,100,0.751958
1,1,1,37664,294242.275,57896,3687.630769,0.754135,100,0.751961
1,1,1,37665,294242.275,57897,3694.912821,0.752359,100,0.751964
…,…,…,…,…,…,…,…,…,…
332,12,28,355477,2.7771e6,467074,3617.723077,13.679468,99,13.599425
332,12,28,355478,2.7771e6,467075,3582.769231,13.674363,99,13.599425
332,12,28,355479,2.7771e6,467076,3607.528205,13.679468,99,13.599425


In [10]:
# df wise

In [40]:
df

trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_d_battery,eda_raw,eda_d_packetreceptionrate,eda_tonic
u16,u8,u8,u32,f64,i64,f64,f64,i64,f64
1,1,1,37660,294210.3603,57892,3686.174359,0.752359,100,0.75195
1,1,1,37661,294211.3575,57893,3693.45641,0.754579,100,0.751953
1,1,1,37662,294211.3575,57894,3687.630769,0.753247,100,0.751956
1,1,1,37663,294224.331,57895,3677.435897,0.753247,100,0.751958
1,1,1,37664,294242.275,57896,3687.630769,0.754135,100,0.751961
1,1,1,37665,294242.275,57897,3694.912821,0.752359,100,0.751964
…,…,…,…,…,…,…,…,…,…
332,12,28,355477,2.7771e6,467074,3617.723077,13.679468,99,13.599425
332,12,28,355478,2.7771e6,467075,3582.769231,13.674363,99,13.599425
332,12,28,355479,2.7771e6,467076,3607.528205,13.679468,99,13.599425


In [56]:
def process_eda_df(
    df: pl.DataFrame,
    sampling_rate: int = 100,
) -> pl.DataFrame:
    eda_raw = df.get_column("eda_raw").to_numpy()
    eda_processed: pd.DataFrame = nk.eda_phasic(
        eda_signal=eda_raw,
        sampling_rate=sampling_rate,
        method="neurokit",
    )["EDA_Tonic"]  # this returns EDA_Phasic and EDA_Tonic columns
    ic(eda_processed)
    print(eda_processed)
    return df.hstack(pl.from_pandas(eda_processed))  # ["EDA_Tonic"]))


def process_eda(
    series: pl.Series,
    sampling_rate: int = 100,
) -> pl.Series:
    eda_raw = series.to_numpy()
    eda_processed: pd.DataFrame = nk.eda_phasic(
        eda_signal=eda_raw,
        sampling_rate=sampling_rate,
        method="neurokit",
    )  # returns EDA_Phasic and EDA_Tonic columns
    return pl.Series("eda_tonic", eda_processed["EDA_Tonic"])

In [57]:
result = df.group_by("trial_id").map_groups(lambda group: process_eda_df(group))
result

ic| eda_processed: 0        1.376992
                   1        1.376978
                   2        1.376963
                   3        1.376948
                   4        1.376933
                              ...   
                   23037    1.222252
                   23038    1.222252
                   23039    1.222252
                   23040    1.222252
                   23041    1.222252
                   Name: EDA_Tonic, Length: 23042, dtype: float64


0        1.376992
1        1.376978
2        1.376963
3        1.376948
4        1.376933
           ...   
23037    1.222252
23038    1.222252
23039    1.222252
23040    1.222252
23041    1.222252
Name: EDA_Tonic, Length: 23042, dtype: float64


thread '' panicked at py-polars/src/dataframe/general.rs:346:31:
UDF failed: 'Series' object has no attribute 'get_columns'


PanicException: UDF failed: 'Series' object has no attribute 'get_columns'

In [26]:
# check if two polars columns are equal
result.select(col("EDA_Tonic") == col("eda_tonic"))  # .sum()
7650323

EDA_Tonic
bool
true
true
true
true
true
true
…
true
true
true


In [58]:
df = pl.DataFrame(
    {
        "id": [0, 1, 2, 3, 4],
        "color": ["red", "green", "green", "red", "red"],
        "shape": ["square", "triangle", "square", "triangle", "square"],
    }
)
df.group_by("color").map_groups(lambda group_df: group_df.sample(2)), df

(shape: (4, 3)
 ┌─────┬───────┬──────────┐
 │ id  ┆ color ┆ shape    │
 │ --- ┆ ---   ┆ ---      │
 │ i64 ┆ str   ┆ str      │
 ╞═════╪═══════╪══════════╡
 │ 1   ┆ green ┆ triangle │
 │ 2   ┆ green ┆ square   │
 │ 4   ┆ red   ┆ square   │
 │ 3   ┆ red   ┆ triangle │
 └─────┴───────┴──────────┘,
 shape: (5, 3)
 ┌─────┬───────┬──────────┐
 │ id  ┆ color ┆ shape    │
 │ --- ┆ ---   ┆ ---      │
 │ i64 ┆ str   ┆ str      │
 ╞═════╪═══════╪══════════╡
 │ 0   ┆ red   ┆ square   │
 │ 1   ┆ green ┆ triangle │
 │ 2   ┆ green ┆ square   │
 │ 3   ┆ red   ┆ triangle │
 │ 4   ┆ red   ┆ square   │
 └─────┴───────┴──────────┘)

In [14]:
df

trial_id,trial_number,participant_id,rownumber,timestamp,samplenumber,eda_d_battery,eda_raw,eda_d_packetreceptionrate,eda_tonic
u16,u8,u8,u32,f64,i64,f64,f64,i64,f64
1,1,1,37660,294210.3603,57892,3686.174359,0.752359,100,0.75195
1,1,1,37661,294211.3575,57893,3693.45641,0.754579,100,0.751953
1,1,1,37662,294211.3575,57894,3687.630769,0.753247,100,0.751956
1,1,1,37663,294224.331,57895,3677.435897,0.753247,100,0.751958
1,1,1,37664,294242.275,57896,3687.630769,0.754135,100,0.751961
1,1,1,37665,294242.275,57897,3694.912821,0.752359,100,0.751964
…,…,…,…,…,…,…,…,…,…
332,12,28,355477,2.7771e6,467074,3617.723077,13.679468,99,13.599425
332,12,28,355478,2.7771e6,467075,3582.769231,13.674363,99,13.599425
332,12,28,355479,2.7771e6,467076,3607.528205,13.679468,99,13.599425


In [None]:
df.plot(
    x="timestamp",
    y=["eda_raw", "eda_tonic"],
    groupby="trial_id",
    kind="line",
    widget_type="scrubber",
    widget_location="bottom",
).show()

INFO:bokeh.server.server:Starting Bokeh server version 3.4.2 (running on Tornado 6.4.1)
INFO:bokeh.server.tornado:User authentication hooks NOT provided (default user enabled)
DEBUG:bokeh.server.tornado:These host origins can connect to the websocket: ['localhost:58781']
DEBUG:bokeh.server.tornado:Patterns are:
DEBUG:bokeh.server.tornado:  [('/favicon.ico',
DEBUG:bokeh.server.tornado:    <class 'bokeh.server.views.ico_handler.IcoHandler'>,
DEBUG:bokeh.server.tornado:    {'app': <bokeh.server.tornado.BokehTornado object at 0x16a1462a0>}),
DEBUG:bokeh.server.tornado:   ('/?',
DEBUG:bokeh.server.tornado:    <class 'panel.io.server.DocHandler'>,
DEBUG:bokeh.server.tornado:    {'application_context': <bokeh.server.contexts.ApplicationContext object at 0x16a1a4dd0>,
DEBUG:bokeh.server.tornado:     'bokeh_websocket_path': '/ws'}),
DEBUG:bokeh.server.tornado:   ('/ws',
DEBUG:bokeh.server.tornado:    <class 'bokeh.server.views.ws.WSHandler'>,
DEBUG:bokeh.server.tornado:    {'application_context

Launching server at http://localhost:58781


<panel.io.server.Server at 0x16a1a7ce0>

DEBUG:bokeh.server.views.ws:Subprotocol header received
INFO:bokeh.server.views.ws:WebSocket connection opened
DEBUG:bokeh.server.views.ws:Receiver created for Protocol()
DEBUG:bokeh.server.views.ws:ProtocolHandler created for Protocol()
INFO:bokeh.server.views.ws:ServerConnection created
DEBUG:bokeh.server.session:Sending pull-doc-reply from session 'aXlvd2jU2LUCkK7X14wPQv1HqbzfNFYzwiEgnberY5Xy'
INFO:bokeh.server.views.ws:WebSocket connection closed: code=1001, reason=None
DEBUG:bokeh.server.tornado:[pid 28391] 0 clients connected
DEBUG:bokeh.server.tornado:[pid 28391]   / has 1 sessions with 1 unused
DEBUG:bokeh.server.tornado:[pid 28391] 0 clients connected
DEBUG:bokeh.server.tornado:[pid 28391]   / has 1 sessions with 1 unused
DEBUG:bokeh.server.contexts:Scheduling 1 sessions to discard
DEBUG:bokeh.server.contexts:Discarding session 'aXlvd2jU2LUCkK7X14wPQv1HqbzfNFYzwiEgnberY5Xy' last in use 30635.095791995525 milliseconds ago
DEBUG:bokeh.server.tornado:[pid 28391] 0 clients connect

In [59]:
ratings = pl.DataFrame(
    {
        "Movie": ["Cars", "IT", "ET", "Cars", "Up", "IT", "Cars", "ET", "Up", "ET"],
        "Theatre": ["NE", "ME", "IL", "ND", "NE", "SD", "NE", "IL", "IL", "SD"],
        "Avg_Rating": [4.5, 4.4, 4.6, 4.3, 4.8, 4.7, 4.7, 4.9, 4.7, 4.6],
        "Count": [30, 27, 26, 29, 31, 28, 28, 26, 33, 26],
    }
)
print(ratings)

shape: (10, 4)
┌───────┬─────────┬────────────┬───────┐
│ Movie ┆ Theatre ┆ Avg_Rating ┆ Count │
│ ---   ┆ ---     ┆ ---        ┆ ---   │
│ str   ┆ str     ┆ f64        ┆ i64   │
╞═══════╪═════════╪════════════╪═══════╡
│ Cars  ┆ NE      ┆ 4.5        ┆ 30    │
│ IT    ┆ ME      ┆ 4.4        ┆ 27    │
│ ET    ┆ IL      ┆ 4.6        ┆ 26    │
│ Cars  ┆ ND      ┆ 4.3        ┆ 29    │
│ Up    ┆ NE      ┆ 4.8        ┆ 31    │
│ IT    ┆ SD      ┆ 4.7        ┆ 28    │
│ Cars  ┆ NE      ┆ 4.7        ┆ 28    │
│ ET    ┆ IL      ┆ 4.9        ┆ 26    │
│ Up    ┆ IL      ┆ 4.7        ┆ 33    │
│ ET    ┆ SD      ┆ 4.6        ┆ 26    │
└───────┴─────────┴────────────┴───────┘


In [83]:
out = ratings.select(pl.col("Theatre").value_counts(sort=True))
out

Theatre
struct[2]
"{""NE"",3}"
"{""IL"",3}"
"{""SD"",2}"
"{""ME"",1}"
"{""ND"",1}"


In [68]:
out.unnest("Theatre")

Theatre,count
str,u32
"""NE""",3
"""IL""",3
"""SD""",2
"""ME""",1
"""ND""",1
