In [1]:
import sys
from pathlib import Path
import importlib

# Treat the parent of the kernel's working directory as the project root and
# put it at the front of sys.path so the `src` package below can be imported.
# NOTE(review): this depends on the directory the kernel was started in.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Import the project modules and reload them immediately, so that edits made
# to their source files are picked up when this cell is re-run without
# restarting the kernel. The reloads must run before the `from ... import`
# statements further down so those names bind to the reloaded modules.
import src.aih_privacy.datasets.sisfall as sis
importlib.reload(sis)
import src.aih_privacy.datasets.subjects_sisfall as ss
importlib.reload(ss)


from src.aih_privacy.datasets.registry import get_dataset
from  src.aih_privacy.datasets.sisfall import (
    acc_magnitude,
    sliding_windows,
    load_file,
    parse_filename,
    SAMPLING_RATE,
    extract_features
)
from src.aih_privacy.datasets.subjects_sisfall import load_subjects

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import defaultdict

In [2]:
# Windowing configuration, expressed in samples.
WINDOW_SIZE = SAMPLING_RATE  # 200 samples = 1 second
WINDOW_STEP = SAMPLING_RATE  # step equals window size: no overlap (baseline)

In [3]:
# Look up the SisFall entry in the project's dataset registry and keep a
# handle on its raw-data directory, which is scanned for recordings below.
dataset = get_dataset("sisfall")
raw_dir = dataset.raw_dir 

In [4]:
# Build the window-level feature table: one row per sliding window of the
# acceleration magnitude, tagged with the metadata parsed from the file name.
rows = []

# Directory traversal order is filesystem-dependent. Sorting the paths makes
# the row order of windows_df (and of the CSV written from it) reproducible
# across machines and runs.
recording_paths = sorted(raw_dir.rglob("*.txt"))

for f in recording_paths:
    parsed = parse_filename(f)
    if parsed is None:
        # parse_filename returned None for this file, so it is skipped.
        continue

    activity_code, subject_id, age_group, label = parsed
    df = load_file(f)

    acc_mag = acc_magnitude(df)
    windows = sliding_windows(acc_mag, WINDOW_SIZE, WINDOW_STEP)

    # The metadata is identical for every window of a recording, so build it
    # once per file. It is merged after the features to preserve the original
    # column order (features first, then label / subject / group / activity).
    recording_meta = {
        "label": label,
        "subject_id": subject_id,
        "age_group": age_group,
        "activity_code": activity_code,
    }

    for w in windows:
        rows.append({**extract_features(w), **recording_meta})

# Fail fast with an actionable message instead of silently producing an empty,
# column-less frame that only breaks in a later cell.
if not rows:
    raise RuntimeError(
        f"No windows were extracted from {raw_dir} "
        f"({len(recording_paths)} '*.txt' files found)"
    )

windows_df = pd.DataFrame(rows)
windows_df.head()


Unnamed: 0,max,mean,std,range,energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,0,SA01,SA,D01


In [5]:
print("windows_df shape:", windows_df.shape)

# Essential guarantees.
# Check the schema before touching any column, so that a missing column is
# reported by the assertion message below rather than as a bare KeyError.
required_cols = {"max","mean","std","range","energy","label","subject_id","age_group","activity_code"}
missing = required_cols - set(windows_df.columns)
assert not missing, f"Missing columns: {missing}"

print("label counts:", windows_df["label"].value_counts().to_dict())

# Value-level invariants: the label is binary and every window is attributed
# to a subject and an activity.
assert windows_df["label"].isin([0,1]).all(), "label must contain only 0/1"
assert windows_df["subject_id"].notna().all(), "subject_id contains missing values"
assert windows_df["activity_code"].notna().all(), "activity_code contains missing values"

windows_df shape: (78914, 9)
label counts: {0: 52066, 1: 26848}


In [6]:
# Derive the output directory from PROJECT_ROOT (computed in the first cell)
# instead of a second, independent cwd-relative literal. It points at the same
# <project>/data/processed location, but stays correct if the working
# directory changes after the first cell has run.
processed_dir = PROJECT_ROOT / "data" / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

In [7]:
# Persist the window table as CSV (the pandas index is not written) and
# report the absolute location of the file that was produced.
out_path = processed_dir.joinpath("windows_df.csv")
windows_df.to_csv(path_or_buf=out_path, index=False)
print("Saved:", out_path.resolve())

Saved: D:\aih-privacy\data\processed\windows_df.csv


In [8]:
# Read the exported CSV back to verify that the write round-trips.
tmp = pd.read_csv(out_path)
print(tmp.shape)

# Make the round-trip check explicit instead of relying on eyeballing the
# printed shape: the reloaded table must have the same dimensions and the
# same column order as the in-memory frame it was written from.
assert tmp.shape == windows_df.shape, (
    f"CSV round-trip changed shape: {windows_df.shape} -> {tmp.shape}"
)
assert list(tmp.columns) == list(windows_df.columns), (
    f"CSV round-trip changed columns: {list(tmp.columns)}"
)

tmp.head()


(78914, 9)


Unnamed: 0,max,mean,std,range,energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,0,SA01,SA,D01


In [9]:
# Final summary of the in-memory table: dimensions, column index and the
# per-class window counts, each printed on its own line(s).
label_counts = windows_df["label"].value_counts()
print(windows_df.shape, windows_df.columns, label_counts, sep="\n")

(78914, 9)
Index(['max', 'mean', 'std', 'range', 'energy', 'label', 'subject_id',
       'age_group', 'activity_code'],
      dtype='object')
label
0    52066
1    26848
Name: count, dtype: int64
