In [4]:
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score

RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

In [5]:
processed_dir = Path("../data/processed")
base_path = processed_dir / "windows_identity_df.csv"

df_base = pd.read_csv(base_path)

FEATURE_COLS = ["max", "mean", "std", "range", "energy"]
LABEL_COL = "label"
GROUP_COL = "subject_id"

print("Shape:", df_base.shape)
df_base[FEATURE_COLS + [LABEL_COL, GROUP_COL]].head()

Shape: (78914, 18)


Unnamed: 0,max,mean,std,range,energy,label,subject_id
0,1.505301,1.021371,0.161249,0.748475,213.839935,0,SA01
1,1.686817,1.094025,0.183646,0.891159,246.123306,0,SA01
2,1.598864,1.042618,0.176089,0.814388,223.61203,0,SA01
3,1.388703,1.040773,0.144043,0.621426,220.791382,0,SA01
4,1.441374,1.059405,0.145564,0.63907,228.705505,0,SA01


In [7]:
X = df_base[FEATURE_COLS]
y = df_base[LABEL_COL]
groups = df_base[GROUP_COL]

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)
train_idx, test_idx = next(gss.split(X, y, groups))

X_train_b, X_test_b = X.iloc[train_idx], X.iloc[test_idx]
y_train_b, y_test_b = y.iloc[train_idx], y.iloc[test_idx]

print("Train subjects:", groups.iloc[train_idx].nunique())
print("Test subjects:", groups.iloc[test_idx].nunique())


Train subjects: 30
Test subjects: 8


In [8]:
def compute_feature_clips(df: pd.DataFrame, cols, q_low=0.005, q_high=0.995):
    clip_min = {}
    clip_max = {}
    for c in cols:
        clip_min[c] = float(df[c].quantile(q_low))
        clip_max[c] = float(df[c].quantile(q_high))
    return clip_min, clip_max

clip_min, clip_max = compute_feature_clips(df_base, FEATURE_COLS)
clip_min, clip_max


({'max': 0.8770874843919313,
  'mean': 0.8335232735417769,
  'std': 0.00572275783833396,
  'range': 0.03171294556604029,
  'energy': 139.75768432617187},
 {'max': 9.892616302455998,
  'mean': 1.6505419335289502,
  'std': 1.5938957523214494,
  'range': 9.644824526012378,
  'energy': 1017.0755370330809})

In [9]:
def dp_laplace_on_features(
    X: pd.DataFrame,
    epsilon: float,
    clip_min: dict,
    clip_max: dict,
    rng: np.random.Generator
) -> pd.DataFrame:
    if epsilon <= 0:
        raise ValueError("epsilon tem de ser > 0")

    Xp = X.copy()
    for col in Xp.columns:
        a = float(clip_min[col])
        b = float(clip_max[col])
        Xp[col] = Xp[col].clip(a, b)

        sensitivity = b - a
        scale = sensitivity / epsilon

        noise = rng.laplace(loc=0.0, scale=scale, size=len(Xp))
        Xp[col] = Xp[col].astype(float) + noise

    return Xp


In [10]:
def eval_models(X_train, y_train, X_test, y_test):
    models = {
        "LogReg": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(max_iter=2000))
        ]),
        "RandomForest": RandomForestClassifier(
            n_estimators=300, random_state=RANDOM_SEED, n_jobs=-1
        )
    }

    rows = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        rows.append({
            "model": name,
            "balanced_accuracy": balanced_accuracy_score(y_test, pred),
            "f1_macro": f1_score(y_test, pred, average="macro")
        })

    return pd.DataFrame(rows)


In [11]:
res_baseline = eval_models(X_train_b, y_train_b, X_test_b, y_test_b)
res_baseline["setting"] = "baseline"
res_baseline

Unnamed: 0,model,balanced_accuracy,f1_macro,setting
0,LogReg,0.523898,0.484726,baseline
1,RandomForest,0.626174,0.62501,baseline


In [12]:
EPSILONS = [2.0, 1.0, 0.5]

rows = []
for eps in EPSILONS:
    X_train_dp = dp_laplace_on_features(
        X_train_b, epsilon=eps, clip_min=clip_min, clip_max=clip_max, rng=rng
    )
    X_test_dp = dp_laplace_on_features(
        X_test_b, epsilon=eps, clip_min=clip_min, clip_max=clip_max, rng=rng
    )

    res = eval_models(X_train_dp, y_train_b, X_test_dp, y_test_b)
    res["setting"] = f"dp_features_laplace_eps={eps}"
    rows.append(res)

res_dp = pd.concat(rows, ignore_index=True)
res_dp


Unnamed: 0,model,balanced_accuracy,f1_macro,setting
0,LogReg,0.5,0.429977,dp_features_laplace_eps=2.0
1,RandomForest,0.504393,0.479524,dp_features_laplace_eps=2.0
2,LogReg,0.5,0.429977,dp_features_laplace_eps=1.0
3,RandomForest,0.502861,0.474057,dp_features_laplace_eps=1.0
4,LogReg,0.5,0.429977,dp_features_laplace_eps=0.5
5,RandomForest,0.501695,0.469516,dp_features_laplace_eps=0.5


In [13]:
results = pd.concat([res_baseline, res_dp], ignore_index=True)
results


Unnamed: 0,model,balanced_accuracy,f1_macro,setting
0,LogReg,0.523898,0.484726,baseline
1,RandomForest,0.626174,0.62501,baseline
2,LogReg,0.5,0.429977,dp_features_laplace_eps=2.0
3,RandomForest,0.504393,0.479524,dp_features_laplace_eps=2.0
4,LogReg,0.5,0.429977,dp_features_laplace_eps=1.0
5,RandomForest,0.502861,0.474057,dp_features_laplace_eps=1.0
6,LogReg,0.5,0.429977,dp_features_laplace_eps=0.5
7,RandomForest,0.501695,0.469516,dp_features_laplace_eps=0.5


In [14]:
pivot_acc = results.pivot(index="model", columns="setting", values="balanced_accuracy")
pivot_f1  = results.pivot(index="model", columns="setting", values="f1_macro")

pivot_acc


setting,baseline,dp_features_laplace_eps=0.5,dp_features_laplace_eps=1.0,dp_features_laplace_eps=2.0
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LogReg,0.523898,0.5,0.5,0.5
RandomForest,0.626174,0.501695,0.502861,0.504393


In [15]:
# deltas vs baseline
for col in pivot_acc.columns:
    if col == "baseline":
        continue
    pivot_acc[f"delta_{col}"] = pivot_acc[col] - pivot_acc["baseline"]

pivot_acc


setting,baseline,dp_features_laplace_eps=0.5,dp_features_laplace_eps=1.0,dp_features_laplace_eps=2.0,delta_dp_features_laplace_eps=0.5,delta_dp_features_laplace_eps=1.0,delta_dp_features_laplace_eps=2.0
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LogReg,0.523898,0.5,0.5,0.5,-0.023898,-0.023898,-0.023898
RandomForest,0.626174,0.501695,0.502861,0.504393,-0.124479,-0.123314,-0.121782
