In [1]:
import sys
from pathlib import Path
import importlib
import numpy as np
import pandas as pd

if str(Path.cwd().parent) not in sys.path:
    sys.path.insert(0, str(Path.cwd().parent))

import src.aih_privacy.datasets.sisfall as sis
importlib.reload(sis)

from src.aih_privacy.config import DATA_RAW_DIR

# dataset loaders
from  src.aih_privacy.datasets.sisfall import (
    SAMPLING_RATE,
    load_file,
    acc_magnitude,
    parse_filename,
    sliding_windows,
    extract_features,
    gyro_magnitude
)

In [2]:
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

WINDOW_SIZE = SAMPLING_RATE  
WINDOW_STEP = SAMPLING_RATE

EPSILONS = [2.0, 1.0, 0.5]  # posso tambem tentar encontrar um artigo que guie na escolha
                            # do valor de epsilon ideal para o meu tipo de dados


In [None]:
processed_dir = Path("../data/processed")

base_path = processed_dir / "windows_identity_df.csv"

overlap50_path = processed_dir / "windows_identity_df_overlap50.csv"

subjects_df = pd.read_csv(
    processed_dir / "subjects_df.csv"   
)

df_base = pd.read_csv(base_path)
df_overlap50 = pd.read_csv(overlap50_path)

FEATURE_COLS = [
    "acc_max", "acc_mean", "acc_std", "acc_range", "acc_energy",
    "gyro_max", "gyro_mean", "gyro_std", "gyro_range", "gyro_energy",
]

LABEL_COL = "label"
GROUP_COL = "subject_id"

print("Shape:", df_base.shape)
df_base[FEATURE_COLS + [LABEL_COL, GROUP_COL]].head()

Shape: (78914, 23)


Unnamed: 0,acc_max,acc_mean,acc_std,acc_range,acc_energy,gyro_max,gyro_mean,gyro_std,gyro_range,gyro_energy,label,subject_id
0,1.505301,1.021371,0.161249,0.748475,213.839935,61.501696,36.275651,8.270789,49.125849,276865.758002,0,SA01
1,1.686817,1.094025,0.183646,0.891159,246.123306,77.213308,43.917027,14.533872,61.152478,427987.746894,0,SA01
2,1.598864,1.042618,0.176089,0.814388,223.61203,61.102289,39.110576,11.105643,46.904561,330594.491214,0,SA01
3,1.388703,1.040773,0.144043,0.621426,220.791382,77.614763,31.088233,13.225029,70.586703,228275.924921,0,SA01
4,1.441374,1.059405,0.145564,0.63907,228.705505,67.388147,33.309296,12.482396,57.557886,253063.883632,0,SA01


In [5]:
def compute_feature_clips(df, cols, q_low=0.005, q_high=0.995):
    clip_min = {}
    clip_max = {}
    for c in cols:
        clip_min[c] = float(df[c].quantile(q_low))
        clip_max[c] = float(df[c].quantile(q_high))
    return clip_min, clip_max

def dp_laplace_on_features_df(X: pd.DataFrame, epsilon: float, clip_min: dict, clip_max: dict, rng):
    Xp = X.copy()
    for c in Xp.columns:
        a, b = float(clip_min[c]), float(clip_max[c])
        Xp[c] = Xp[c].clip(a, b)
        scale = (b - a) / epsilon
        Xp[c] = Xp[c].astype(float) + rng.laplace(0.0, scale, size=len(Xp))
    return Xp

# **DP After windowing**

In [None]:
clip_min_feat, clip_max_feat = compute_feature_clips(df_base, FEATURE_COLS)

for eps in EPSILONS:
    rng = np.random.default_rng(RANDOM_SEED)

    df_after = df_base.copy()
    df_after[FEATURE_COLS] = dp_laplace_on_features_df(
        df_base[FEATURE_COLS], eps, clip_min_feat, clip_max_feat, rng
    )

    out = processed_dir / f"windows_identity_dp_after_eps{eps}.csv"
    df_after.to_csv(out, index=False)
    print("Saved:", out, df_after.shape)

Saved: ..\data\processed\windows_identity_dp_after_eps2.0.csv (78914, 23)
Saved: ..\data\processed\windows_identity_dp_after_eps1.0.csv (78914, 23)
Saved: ..\data\processed\windows_identity_dp_after_eps0.5.csv (78914, 23)


In [6]:
clip_min_feat, clip_max_feat = compute_feature_clips(df_overlap50, FEATURE_COLS)

for eps in EPSILONS:
    rng = np.random.default_rng(RANDOM_SEED)

    df_after = df_overlap50.copy()
    df_after[FEATURE_COLS] = dp_laplace_on_features_df(
        df_overlap50[FEATURE_COLS], eps, clip_min_feat, clip_max_feat, rng
    )

    out = processed_dir / f"windows_identity_overlap50_dp_after_eps{eps}.csv"
    df_after.to_csv(out, index=False)
    print("Saved:", out, df_after.shape)

Saved: ..\data\processed\windows_identity_overlap50_dp_after_eps2.0.csv (78914, 23)
Saved: ..\data\processed\windows_identity_overlap50_dp_after_eps1.0.csv (78914, 23)
Saved: ..\data\processed\windows_identity_overlap50_dp_after_eps0.5.csv (78914, 23)


# **DP-before**

In [11]:
files = [f for f in DATA_RAW_DIR.rglob("*.txt") if parse_filename(f) is not None]
rng = np.random.default_rng(42)
files = rng.choice(files, size=min(200, len(files)), replace=False)

acc_vals, gyro_vals = [], []

for f in files:
    df = load_file(f)
    acc_vals.append(acc_magnitude(df))
    gyro_vals.append(gyro_magnitude(df))

acc_vals = np.concatenate(acc_vals)
gyro_vals = np.concatenate(gyro_vals)

CLIP_MIN_ACC = 0.0
CLIP_MAX_ACC = float(np.quantile(acc_vals, 0.995))

CLIP_MIN_GYRO = 0.0
CLIP_MAX_GYRO = float(np.quantile(gyro_vals, 0.995))

Refs: 
- [The Algorithmic Foundations of Differential Privacy,Dwork and Roth, 2014](https://www.nowpublishers.com/article/Details/TCS-042) justifica o facto de DP precisar de sensibilidade limitada e o dominio ser bounded.
- [SisFall: A Fall and Movement Dataset, Sucerquia et al., 2017](https://www.mdpi.com/1424-8220/17/1/198): Artigo do sisfall, justifica porque e necessario controlar outliers

Differential privacy mechanisms require bounded sensitivity in order to control the scale of the injected noise [Dwork and Roth, 2014].

In continuous wearable sensor data, such as accelerometer and gyroscope signals, rare extreme peaks and high inter-subject variability are common, particularly during fall events [Sucerquia et al., 2017].

Without bounding the signal range, these extreme values would dominate the sensitivity and lead to excessive noise injection, severely degrading data utility.

Therefore, following standard differential privacy practice, a robust clipping strategy was applied prior to noise injection to bound the signal values and ensure a stable privacy–utility trade-off.

The clipping thresholds were estimated using a high percentile of the signal distribution, in order to reduce the influence of rare extreme outliers while preserving most of the signal dynamics

In [9]:
def dp_laplace_on_window_signal(w, epsilon, clip_min, clip_max, rng):
    w = np.asarray(w, dtype=float)
    w = np.clip(w, clip_min, clip_max)
    scale = (clip_max - clip_min) / epsilon
    return w + rng.laplace(0.0, scale, size=w.shape)

In [12]:
rows_by_eps = {eps: [] for eps in EPSILONS}
rng_by_eps = {eps: np.random.default_rng(RANDOM_SEED) for eps in EPSILONS}

In [13]:
for f in DATA_RAW_DIR.rglob("*.txt"):
    parsed = parse_filename(f)
    if parsed is None:
        continue

    activity_code, subject_id, age_group, label = parsed

    df = load_file(f)

    acc_mag = acc_magnitude(df)
    gyro_mag = gyro_magnitude(df)

    acc_windows  = sliding_windows(acc_mag,  WINDOW_SIZE, WINDOW_STEP)
    gyro_windows = sliding_windows(gyro_mag, WINDOW_SIZE, WINDOW_STEP)

    n = min(len(acc_windows), len(gyro_windows))

    for i in range(n):
        acc_w = acc_windows[i]
        gyro_w = gyro_windows[i]

        # para cada epsilon, gerar DP-before e features
        for eps in EPSILONS:
            rng = rng_by_eps[eps]

            acc_w_dp = dp_laplace_on_window_signal(acc_w, eps, CLIP_MIN_ACC, CLIP_MAX_ACC, rng)
            gyro_w_dp = dp_laplace_on_window_signal(gyro_w, eps, CLIP_MIN_GYRO, CLIP_MAX_GYRO, rng)

            acc_feats = extract_features(acc_w_dp)
            gyro_feats = extract_features(gyro_w_dp)

            row = {}
            row.update({f"acc_{k}": v for k, v in acc_feats.items()})
            row.update({f"gyro_{k}": v for k, v in gyro_feats.items()})

            row.update({
                "label": label,
                "subject_id": subject_id,
                "age_group": age_group,
                "activity_code": activity_code,
                "epsilon": eps,
            })

            rows_by_eps[eps].append(row)


In [17]:
for eps in EPSILONS:
    df_before = pd.DataFrame(rows_by_eps[eps])

    # merge subjects aqui (melhor prática)
    df_before = df_before.merge(subjects_df, on=["subject_id", "age_group"], how="left", validate="many_to_one")

    out = processed_dir / f"windows_identity_dp_before_eps{eps}.csv"
    df_before.to_csv(out, index=False)
    print("Saved:", out, df_before.shape)


Saved: ..\data\processed\windows_identity_dp_before_eps2.0.csv (78914, 24)
Saved: ..\data\processed\windows_identity_dp_before_eps1.0.csv (78914, 24)
Saved: ..\data\processed\windows_identity_dp_before_eps0.5.csv (78914, 24)


In [7]:
WINDOW_SIZE = SAMPLING_RATE  
WINDOW_STEP = SAMPLING_RATE // 2

In [13]:
rows_by_eps = {eps: [] for eps in EPSILONS}
rng_by_eps = {eps: np.random.default_rng(RANDOM_SEED) for eps in EPSILONS}

for f in DATA_RAW_DIR.rglob("*.txt"):
    parsed = parse_filename(f)
    if parsed is None:
        continue

    activity_code, subject_id, age_group, label = parsed
    df = load_file(f)

    acc_mag = acc_magnitude(df)
    gyro_mag = gyro_magnitude(df)

    acc_windows  = sliding_windows(acc_mag,  WINDOW_SIZE, WINDOW_STEP)
    gyro_windows = sliding_windows(gyro_mag, WINDOW_SIZE, WINDOW_STEP)

    n = min(len(acc_windows), len(gyro_windows))

    for i in range(n):
        acc_w = acc_windows[i]
        gyro_w = gyro_windows[i]

        for eps in EPSILONS:
            rng = rng_by_eps[eps]

            acc_w_dp = dp_laplace_on_window_signal(acc_w, eps, CLIP_MIN_ACC, CLIP_MAX_ACC, rng)
            gyro_w_dp = dp_laplace_on_window_signal(gyro_w, eps, CLIP_MIN_GYRO, CLIP_MAX_GYRO, rng)

            acc_feats = extract_features(acc_w_dp)
            gyro_feats = extract_features(gyro_w_dp)

            row = {}
            row.update({f"acc_{k}": v for k, v in acc_feats.items()})
            row.update({f"gyro_{k}": v for k, v in gyro_feats.items()})

            row.update({
                "label": label,
                "subject_id": subject_id,
                "age_group": age_group,
                "activity_code": activity_code,
                "epsilon": eps,
                "window_size": WINDOW_SIZE,
                "window_step": WINDOW_STEP,
            })

            rows_by_eps[eps].append(row)

for eps in EPSILONS:
    df_before = pd.DataFrame(rows_by_eps[eps])
    out = processed_dir / f"windows_identity_overlap50_dp_before_eps{eps}.csv"
    df_before.to_csv(out, index=False)
    print("Saved:", out, df_before.shape)

Saved: ..\data\processed\windows_identity_overlap50_dp_before_eps2.0.csv (153705, 17)
Saved: ..\data\processed\windows_identity_overlap50_dp_before_eps1.0.csv (153705, 17)
Saved: ..\data\processed\windows_identity_overlap50_dp_before_eps0.5.csv (153705, 17)
