In [33]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [34]:
processed_dir = Path("../data/processed")
in_path = processed_dir / "windows_identity_df.csv"

assert in_path.exists(), f"Missing file: {in_path.resolve()}"

df = pd.read_csv(in_path)
print("Loaded dataset:", df.shape)
df.head()

Loaded dataset: (78914, 18)


Unnamed: 0,max,mean,std,range,energy,label,subject_id,age_group,activity_code,age,height_cm,weight_kg,gender,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,1.505301,1.021371,0.161249,0.748475,213.839935,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
1,1.686817,1.094025,0.183646,0.891159,246.123306,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
2,1.598864,1.042618,0.176089,0.814388,223.61203,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
3,1.388703,1.040773,0.144043,0.621426,220.791382,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
4,1.441374,1.059405,0.145564,0.63907,228.705505,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177


In [35]:
QI_COLS = ["age", "gender", "height_cm", "weight_kg"]
ID_COL = "subject_id"

missing = set(QI_COLS + [ID_COL]) - set(df.columns)
assert not missing, f"Missing columns: {missing}"

print("Subjects:", df[ID_COL].nunique())
print("Labels:", df["label"].value_counts().to_dict())


Subjects: 38
Labels: {0: 52066, 1: 26848}


In [36]:
df_qi = df[QI_COLS].copy()

# Bins que cobrem o dataset (largos, mas ajustáveis)
df_qi["age_bin"] = pd.cut(
    df_qi["age"],
    bins=[0, 20, 30, 40, 50, 60, 70, 80],
    right=False,
    include_lowest=True
)

df_qi["height_bin"] = pd.cut(
    df_qi["height_cm"],
    bins=np.arange(140, 201, 5),   # até 200 cm
    include_lowest=True
)

df_qi["weight_bin"] = pd.cut(
    df_qi["weight_kg"],
    bins=np.arange(35, 126, 5),    # 35..125 kg
    include_lowest=True
)

QI_BINS = ["age_bin", "gender", "height_bin", "weight_bin"]

# Sanity: não pode haver NaNs nos bins (se houver, alarga bins)
print("NaNs in binned QIs:\n", df_qi[QI_BINS].isna().sum())

# Remover linhas com NaNs nos bins (se existir alguma)
df_qi = df_qi.dropna(subset=QI_BINS).copy()

print("Records used for k-anonymity:", len(df_qi))


NaNs in binned QIs:
 age_bin       0
gender        0
height_bin    0
weight_bin    0
dtype: int64
Records used for k-anonymity: 78914


In [37]:
equiv = (
    df_qi
    .groupby(QI_BINS, observed=True)
    .size()
    .rename("k")
    .reset_index()
)


k_values = equiv["k"]

print("Equivalence classes:", len(equiv))
equiv.head()

Equivalence classes: 34


Unnamed: 0,age_bin,gender,height_bin,weight_bin,k
0,"[0, 20)",F,"(155.0, 160.0]","(45.0, 50.0]",2609
1,"[0, 20)",F,"(155.0, 160.0]","(50.0, 55.0]",2597
2,"[0, 20)",M,"(165.0, 170.0]","(80.0, 85.0]",2606
3,"[20, 30)",F,"(145.0, 150.0]","(40.0, 45.0]",2610
4,"[20, 30)",F,"(150.0, 155.0]","(45.0, 50.0]",2608


In [38]:
risk_summary = {
    "total_records_used": int(len(df_qi)),
    "num_equiv_classes": int(len(equiv)),
    "min_k": int(k_values.min()),
    "median_k": float(k_values.median()),
    "num_classes_k1": int((k_values == 1).sum()),
    "num_classes_k<=2": int((k_values <= 2).sum()),
    "pct_classes_k1": float((k_values == 1).mean()),
    "pct_classes_k<=2": float((k_values <= 2).mean()),
}

risk_summary


{'total_records_used': 78914,
 'num_equiv_classes': 34,
 'min_k': 923,
 'median_k': 2604.5,
 'num_classes_k1': 0,
 'num_classes_k<=2': 0,
 'pct_classes_k1': 0.0,
 'pct_classes_k<=2': 0.0}

In [39]:
attacker_df = (
    df
    .groupby(ID_COL)
    .agg({
        "age": "first",
        "gender": "first",
        "height_cm": "first",
        "weight_kg": "first",
    })
    .reset_index()
)

print("Attacker dataset:", attacker_df.shape)
attacker_df.head()


Attacker dataset: (38, 5)


Unnamed: 0,subject_id,age,gender,height_cm,weight_kg
0,SA01,26,F,165,53.0
1,SA02,23,M,176,58.5
2,SA03,19,F,156,48.0
3,SA04,23,M,170,72.0
4,SA05,22,M,172,69.5


In [40]:
X_attack = attacker_df[QI_COLS]
y_attack = attacker_df[ID_COL]

gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_idx, test_idx = next(gss.split(X_attack, y_attack, groups=y_attack))

X_tr, X_te = X_attack.iloc[train_idx], X_attack.iloc[test_idx]
y_tr, y_te = y_attack.iloc[train_idx], y_attack.iloc[test_idx]

print("Train subjects:", len(y_tr), "Test subjects:", len(y_te))


Train subjects: 26 Test subjects: 12


In [41]:
preprocess = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), ["gender"]),
    ("num", StandardScaler(), ["age", "height_cm", "weight_kg"]),
])

attack_knn1 = Pipeline([
    ("prep", preprocess),
    ("clf", KNeighborsClassifier(n_neighbors=1))
])

attack_knn1.fit(X_tr, y_tr)
y_pred = attack_knn1.predict(X_te)

acc_top1 = accuracy_score(y_te, y_pred)
acc_top1


  check_classification_targets(y)


0.0

In [42]:
# Transformações para calcular vizinhos
prep = attack_knn1.named_steps["prep"]
X_tr_t = prep.fit_transform(X_tr)
X_te_t = prep.transform(X_te)

knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(X_tr_t, y_tr)

neighbors = knn3.kneighbors(X_te_t, return_distance=False)

y_tr_reset = y_tr.reset_index(drop=True)
hits = []
for i, neigh_idx in enumerate(neighbors):
    candidates = set(y_tr_reset.iloc[neigh_idx])
    hits.append(y_te.iloc[i] in candidates)

acc_top3 = float(np.mean(hits))
acc_top3


  check_classification_targets(y)


0.0

In [43]:
n_classes_train = y_tr.nunique()
random_top1 = 1.0 / n_classes_train
random_top3 = min(3.0 / n_classes_train, 1.0)

{
    "train_classes": int(n_classes_train),
    "random_top1": random_top1,
    "random_top3": random_top3,
    "attack_top1": float(acc_top1),
    "attack_top3": float(acc_top3),
}


{'train_classes': 26,
 'random_top1': 0.038461538461538464,
 'random_top3': 0.11538461538461539,
 'attack_top1': 0.0,
 'attack_top3': 0.0}

In [44]:
def k_anon_summary(df, age_bins, h_step, w_step):
    tmp = df[["age","gender","height_cm","weight_kg"]].copy()
    
    tmp["age_bin"] = pd.cut(tmp["age"], bins=age_bins, right=False, include_lowest=True)
    tmp["height_bin"] = pd.cut(tmp["height_cm"], bins=np.arange(140, 201, h_step), include_lowest=True)
    tmp["weight_bin"] = pd.cut(tmp["weight_kg"], bins=np.arange(35, 126, w_step), include_lowest=True)

    QI_BINS = ["age_bin", "gender", "height_bin", "weight_bin"]
    tmp = tmp.dropna(subset=QI_BINS)

    equiv = tmp.groupby(QI_BINS, observed=True).size().rename("k").reset_index()
    k = equiv["k"]

    return {
        "num_equiv_classes": int(len(equiv)),
        "min_k": int(k.min()),
        "median_k": float(k.median()),
        "num_classes_k<=2": int((k <= 2).sum()),
        "pct_classes_k<=2": float((k <= 2).mean()),
    }

scenarios = [
    ("coarse", [0,20,30,40,50,60,70,80], 5, 5),
    ("medium", np.arange(0, 81, 5),       2, 2),
    ("fine",   np.arange(0, 81, 1),       1, 1),
]

results = []
for name, age_bins, h_step, w_step in scenarios:
    s = k_anon_summary(df, age_bins, h_step, w_step)
    s["scenario"] = name
    results.append(s)

pd.DataFrame(results)[["scenario","num_equiv_classes","min_k","median_k","num_classes_k<=2","pct_classes_k<=2"]]


Unnamed: 0,scenario,num_equiv_classes,min_k,median_k,num_classes_k<=2,pct_classes_k<=2
0,coarse,34,923,2604.5,0,0.0
1,medium,37,923,2604.0,0,0.0
2,fine,38,923,2601.5,0,0.0


In [45]:
# quantos sujeitos do teste têm QIs EXACTAMENTE iguais a algum sujeito do treino?
train_qi = X_tr.copy()
train_qi["sid"] = y_tr.values

test_qi = X_te.copy()
test_qi["sid"] = y_te.values

merged = test_qi.merge(train_qi, on=["age","gender","height_cm","weight_kg"], how="left", suffixes=("", "_train"))
print("Test subjects with exact QI match in train:", merged["sid_train"].notna().sum(), "/", len(test_qi))


Test subjects with exact QI match in train: 0 / 12
