In [6]:
import sys
from pathlib import Path
import importlib

# Put the project root on sys.path so the `src.` package imports below resolve.
# NOTE(review): this takes the parent of the current working directory, so it
# assumes the notebook is run from a direct subdirectory of the project root —
# confirm before running from elsewhere.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Import the project modules and immediately reload them, so re-running this
# cell re-executes their source instead of reusing the already-imported module.
import src.aih_privacy.datasets.sisfall as sis
importlib.reload(sis)
import src.aih_privacy.datasets.subjects_sisfall as ss
importlib.reload(ss)


from src.aih_privacy.datasets.registry import get_dataset
from  src.aih_privacy.datasets.sisfall import (
    acc_magnitude,
    sliding_windows,
    load_file,
    parse_filename,
    SAMPLING_RATE,
    extract_features,
    gyro_magnitude
)
from src.aih_privacy.datasets.subjects_sisfall import load_subjects

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

In [7]:
# Window length and stride, both in samples and both tied to SAMPLING_RATE.
WINDOW_SIZE = SAMPLING_RATE  # 200 samples = 1 second
WINDOW_STEP = SAMPLING_RATE  # no overlap (baseline): step equals window size

In [8]:
# Look up the "sisfall" entry in the project's dataset registry.
dataset = get_dataset("sisfall")
# Directory that the feature-extraction cell scans recursively for *.txt files.
raw_dir = dataset.raw_dir

In [9]:
# Build the window-level feature table: one row per window of every recording
# found under raw_dir, combining accelerometer and gyroscope features with the
# metadata parsed from the file name.
rows = []

# Sort the recursive glob. Directory iteration order depends on the filesystem,
# so without sorting the row order of windows_df (and of any file saved from it)
# can differ between machines or runs.
for f in sorted(raw_dir.rglob("*.txt")):
    parsed = parse_filename(f)
    if parsed is None:
        # Files for which parse_filename returns None are skipped.
        continue

    activity_code, subject_id, age_group, label = parsed
    df = load_file(f)

    acc_mag = acc_magnitude(df)
    gyro_mag = gyro_magnitude(df)

    # Same window size and step for both signals.
    acc_windows = sliding_windows(acc_mag, WINDOW_SIZE, WINDOW_STEP)
    gyro_windows = sliding_windows(gyro_mag, WINDOW_SIZE, WINDOW_STEP)

    # Use the smaller window count so every row pairs an accelerometer window
    # with the gyroscope window at the same index.
    n = min(len(acc_windows), len(gyro_windows))

    for i in range(n):
        acc_feats = extract_features(acc_windows[i])
        gyro_feats = extract_features(gyro_windows[i])

        # Prefix the feature names per sensor so the two feature sets do not
        # collide, then attach the per-file metadata to every window row.
        rows.append({
            **{f"acc_{k}": v for k, v in acc_feats.items()},
            **{f"gyro_{k}": v for k, v in gyro_feats.items()},
            "label": label,
            "subject_id": subject_id,
            "age_group": age_group,
            "activity_code": activity_code,
        })

# Build the frame once from the accumulated records.
windows_df = pd.DataFrame(rows)
windows_df.head()


Unnamed: 0,acc_max,acc_mean,acc_std,acc_range,acc_energy,gyro_max,gyro_mean,gyro_std,gyro_range,gyro_energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,61.501696,36.275651,8.270789,49.125849,276865.758002,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,77.213308,43.917027,14.533872,61.152478,427987.746894,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,61.102289,39.110576,11.105643,46.904561,330594.491214,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,77.614763,31.088233,13.225029,70.586703,228275.924921,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,67.388147,33.309296,12.482396,57.557886,253063.883632,0,SA01,SA,D01


In [11]:
# Sanity check: overall table size and how many windows carry each label value.
print("windows_df shape:", windows_df.shape)
print("label counts:", windows_df["label"].value_counts().to_dict())

windows_df shape: (78914, 14)
label counts: {0: 52066, 1: 26848}


In [12]:
# Output directory for derived data. It is anchored on PROJECT_ROOT (set in the
# first cell) instead of a second, independent "../" relative path, so input
# and output locations share one definition of the project root and the path
# stays valid even if the working directory changes later in the session.
processed_dir = PROJECT_ROOT / "data" / "processed"
# Create the directory (and any missing parents); a no-op if it already exists.
processed_dir.mkdir(parents=True, exist_ok=True)

In [13]:
# Persist the window-level feature table as CSV (the DataFrame index is not
# written), then report the absolute location of the file.
out_path = processed_dir.joinpath("windows_df.csv")
windows_df.to_csv(path_or_buf=out_path, index=False)
print("Saved:", out_path.resolve())

Saved: D:\aih-privacy\data\processed\windows_df.csv


In [14]:
# Read the exported CSV back to verify that the save round-trips.
tmp = pd.read_csv(out_path)

# Make the check explicit instead of relying on eyeballing the printed shape:
# the file was written with index=False, so the reloaded frame must have the
# same number of rows and the same columns, in the same order, as windows_df.
assert tmp.shape == windows_df.shape, (
    f"CSV round-trip changed shape: {tmp.shape} != {windows_df.shape}"
)
assert list(tmp.columns) == list(windows_df.columns), "CSV round-trip changed columns"

print(tmp.shape)
tmp.head()


(78914, 14)


Unnamed: 0,acc_max,acc_mean,acc_std,acc_range,acc_energy,gyro_max,gyro_mean,gyro_std,gyro_range,gyro_energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,61.501696,36.275651,8.270789,49.125849,276865.758002,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,77.213308,43.917027,14.533872,61.152478,427987.746894,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,61.102289,39.110576,11.105643,46.904561,330594.491214,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,77.614763,31.088233,13.225029,70.586703,228275.924921,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,67.388147,33.309296,12.482396,57.557886,253063.883632,0,SA01,SA,D01


In [15]:
# Final overview of the in-memory feature table: size, column names, and the
# number of windows per label value.
print(windows_df.shape)
print(windows_df.columns)
print(windows_df["label"].value_counts())

(78914, 14)
Index(['acc_max', 'acc_mean', 'acc_std', 'acc_range', 'acc_energy', 'gyro_max',
       'gyro_mean', 'gyro_std', 'gyro_range', 'gyro_energy', 'label',
       'subject_id', 'age_group', 'activity_code'],
      dtype='object')
label
0    52066
1    26848
Name: count, dtype: int64
