In [3]:
%run setup.py

import polars as pl
from configs import utils

In [28]:
# Windowed, denoised movement data, read through the project helper together
# with the schema object exposed by `utils`.
# NOTE(review): the `lf_` prefix suggests the helpers return LazyFrames — confirm in `configs.utils`.
processed_dir = "../../data/processed"
dogmove_file = "DogMoveData_Windowed_Denoised.parquet"
lf_dogmove = utils.load_parquet(
    processed_dir, dogmove_file, utils.dog_move_data_widowed_schema
)

# Per-dog information, read from the raw data directory.
raw_dir = "../../data/raw/"
doginfo_file = "DogInfo.csv"
lf_doginfo = utils.load_large_data(raw_dir, doginfo_file)

In [5]:
# Minimum fraction of a window's samples that must share the majority
# behavior for the window to be kept.
PURITY_THRESHOLD = 0.75

# Majority behavior of one window, evaluated inside `list.eval`.
# `mode()` gives no ordering guarantee when several values tie, so the
# candidates are sorted before taking the first one; this keeps the label
# reproducible from run to run.
window_mode = pl.element().mode().sort().first()

# True for every sample that equals the window's majority behavior. Null
# comparisons are counted as "not a match" so they still weigh in the
# denominator of the mean below (same denominator as count / len).
# NOTE(review): a window whose majority value is itself null therefore gets
# purity 0.0 — confirm that unlabeled windows should be dropped.
is_majority = (pl.element() == window_mode).fill_null(False)

lf_dogmove_with_label = lf_dogmove.with_columns([
    # Window label: most frequent value of the "Behavior_1" list.
    pl.col("Behavior_1").list.eval(window_mode).list.first().alias("label"),

    # Purity: share of samples equal to the majority behavior, computed with
    # native expressions instead of a per-row Python callback. Empty or null
    # windows yield null here and are mapped to 0.0.
    pl.col("Behavior_1")
    .list.eval(is_majority.cast(pl.Float64).mean())
    .list.first()
    .fill_null(0.0)
    .alias("purity"),
])

# Keep only windows that are dominated by a single behavior.
lf_dogmove_filtered = lf_dogmove_with_label.filter(
    pl.col("purity") >= PURITY_THRESHOLD
)

In [24]:
# List-valued sensor columns that are summarised per window.
sensors_cols = [
    "ABack_x", "ABack_y", "ABack_z",
    "ANeck_x", "ANeck_y", "ANeck_z",
    "GBack_x", "GBack_y", "GBack_z",
    "GNeck_x", "GNeck_y", "GNeck_z",
]

# Output suffix -> aggregation applied to a list column. Insertion order
# defines the order of the generated feature columns.
window_stats = {
    "mean": lambda values: values.list.mean(),
    "std": lambda values: values.list.std(),
    "min": lambda values: values.list.min(),
    "max": lambda values: values.list.max(),
    "median": lambda values: values.list.median(),
    "abs_sum": lambda values: values.list.eval(pl.element().abs()).list.sum(),
}

# One expression per (sensor column, statistic), rounded to 6 decimals and
# named "<column>_<statistic>".
feature_expr = [
    aggregate(pl.col(name)).round(decimals=6).alias(f"{name}_{suffix}")
    for name in sensors_cols
    for suffix, aggregate in window_stats.items()
]

lf_features = lf_dogmove_filtered.with_columns(feature_expr)



In [None]:
# Columns dropped once the window features exist: window bookkeeping, the
# raw sensor lists (same names as `sensors_cols`), the raw behavior list and
# the purity score used only for filtering.
cols_to_remove = [
    "TestNum", "t_dt", "n_samples", "t_sec",
    *sensors_cols,
    "Behavior_1", "purity",
]

lf_features = lf_features.select(pl.exclude(cols_to_remove))

In [30]:
# Keep every dog-info column except the three categorical descriptors below.
info_cols_to_skip = ["Breed", "Gender", "NeuteringStatus"]
lf_info_features = lf_doginfo.select(pl.exclude(info_cols_to_skip))

In [32]:
# Attach the per-dog columns to every feature row; a left join keeps all
# feature rows even when a DogID has no match in the info table.
lf_final = lf_features.join(
    other=lf_info_features,
    how="left",
    on="DogID",
)

In [33]:
# Write the final feature table to CSV.
features_csv_path = "../../data/processed/DogFeatures.csv"
lf_final.sink_csv(features_csv_path)