In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
import numpy as np
from pathlib import Path
import hashlib

from faker import Faker

from src.aih_privacy.datasets.subjects_sisfall import load_subjects

In [2]:
processed_dir = Path("../data/processed")
in_path = processed_dir / "windows_df.csv"

assert in_path.exists(), f"Missing file: {in_path.resolve()}"

windows_df = pd.read_csv(in_path)
print("Loaded windows_df:", windows_df.shape)
windows_df.head()


Loaded windows_df: (78914, 14)


Unnamed: 0,acc_max,acc_mean,acc_std,acc_range,acc_energy,gyro_max,gyro_mean,gyro_std,gyro_range,gyro_energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,61.501696,36.275651,8.270789,49.125849,276865.758002,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,77.213308,43.917027,14.533872,61.152478,427987.746894,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,61.102289,39.110576,11.105643,46.904561,330594.491214,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,77.614763,31.088233,13.225029,70.586703,228275.924921,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,67.388147,33.309296,12.482396,57.557886,253063.883632,0,SA01,SA,D01


In [5]:
subjects_df = load_subjects().copy()
print("Loaded subjects_df:", subjects_df.shape)
subjects_df.head()


Loaded subjects_df: (38, 6)


Unnamed: 0,subject_id,age,height_cm,weight_kg,gender,age_group
0,SA01,26,165,53.0,F,SA
1,SA02,23,176,58.5,M,SA
2,SA03,19,156,48.0,F,SA
3,SA04,23,170,72.0,M,SA
4,SA05,22,172,69.5,M,SA


In [None]:
PSEUDONYM_SALT = "aih-sisfall-v1"  # mantém fixo

def make_pid(subject_id: str, salt: str = PSEUDONYM_SALT) -> str:
    raw = f"{salt}:{subject_id}".encode("utf-8")
    return hashlib.sha256(raw).hexdigest()[:16]

subjects_df["pid"] = subjects_df["subject_id"].apply(make_pid)
assert subjects_df["pid"].nunique() == len(subjects_df)

subjects_df[["subject_id","pid"]].head()

Unnamed: 0,subject_id,pid
0,SA01,8b39978f3bf493e0
1,SA02,8cca10112272b7d0
2,SA03,5a58367b9ab21a3f
3,SA04,ac62c148b01bdf2b
4,SA05,144051cc52779481


In [17]:
fake = Faker()
fake.seed_instance(42)  # reprodutível

subjects_df["synthetic_name"] = [fake.name() for _ in range(len(subjects_df))]
subjects_df["synthetic_address"] = [fake.address().replace("\n", ", ") for _ in range(len(subjects_df))]
subjects_df["synthetic_phone"] = [fake.phone_number() for _ in range(len(subjects_df))]
subjects_df["synthetic_patient_id"] = [fake.bothify(text="??-########") for _ in range(len(subjects_df))]

subjects_df.head()

Unnamed: 0,subject_id,age,height_cm,weight_kg,gender,age_group,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,SA01,26,165,53.0,F,SA,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
1,SA02,23,176,58.5,M,SA,8cca10112272b7d0,Noah Rhodes,"71822 Arroyo Expressway, Allisonchester, IL 71187",643.252.4082,UY-89178390
2,SA03,19,156,48.0,F,SA,5a58367b9ab21a3f,Angie Henderson,"465 Lam Mission, East Jeffreymouth, AK 77611",9849271094,Rg-84700766
3,SA04,23,170,72.0,M,SA,ac62c148b01bdf2b,Daniel Wagner,"10310 Jones Freeway, Elizabethborough, ND 17843",252.404.7116x719,ir-77115921
4,SA05,22,172,69.5,M,SA,144051cc52779481,Cristian Santos,"76311 Gomez Loop Suite 010, Chandlerville, IA ...",(594)813-1869,DG-99856984


In [9]:
windows_identity_df = windows_df.merge(
    subjects_df,
    on=["subject_id","age_group"],
    how="left",
    validate="many_to_one"
)

print("windows_identity_df:", windows_identity_df.shape)
windows_identity_df.head()

windows_identity_df: (78914, 23)


Unnamed: 0,acc_max,acc_mean,acc_std,acc_range,acc_energy,gyro_max,gyro_mean,gyro_std,gyro_range,gyro_energy,...,activity_code,age,height_cm,weight_kg,gender,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,1.505301,1.021371,0.161249,0.748475,213.839935,61.501696,36.275651,8.270789,49.125849,276865.758002,...,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
1,1.686817,1.094025,0.183646,0.891159,246.123306,77.213308,43.917027,14.533872,61.152478,427987.746894,...,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
2,1.598864,1.042618,0.176089,0.814388,223.61203,61.102289,39.110576,11.105643,46.904561,330594.491214,...,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
3,1.388703,1.040773,0.144043,0.621426,220.791382,77.614763,31.088233,13.225029,70.586703,228275.924921,...,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
4,1.441374,1.059405,0.145564,0.63907,228.705505,67.388147,33.309296,12.482396,57.557886,253063.883632,...,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177


In [12]:
subjects_df = load_subjects().copy()

subjects_df["pid"] = subjects_df["subject_id"].apply(make_pid)

subjects_df["synthetic_name"] = [fake.name() for _ in range(len(subjects_df))]
subjects_df["synthetic_address"] = [fake.address().replace("\n", ", ") for _ in range(len(subjects_df))]
subjects_df["synthetic_phone"] = [fake.phone_number() for _ in range(len(subjects_df))]
subjects_df["synthetic_patient_id"] = [fake.bothify(text="??-########") for _ in range(len(subjects_df))]

subjects_df.head()

Unnamed: 0,subject_id,age,height_cm,weight_kg,gender,age_group,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,SA01,26,165,53.0,F,SA,8b39978f3bf493e0,Gabriel Willis,"PSC 7191, Box 8261, APO AP 74926",001-938-475-8428x241,HX-56164840
1,SA02,23,176,58.5,M,SA,8cca10112272b7d0,Marie Day,"44565 Chavez Fort, East Derekshire, CT 20870",001-648-625-1353x4655,uG-58877043
2,SA03,19,156,48.0,F,SA,5a58367b9ab21a3f,Jeffrey Ashley,"5618 Oconnell Fords Suite 474, North Ericport,...",817.609.9917x381,QO-11288013
3,SA04,23,170,72.0,M,SA,ac62c148b01bdf2b,David Middleton,"52135 Palmer Springs, Jorgeburgh, AK 53505",(702)423-5697x25217,bD-36138612
4,SA05,22,172,69.5,M,SA,144051cc52779481,Logan Brown,"71210 Maria Forges, East Madison, MN 90453",800-874-3292x71296,eK-50110248


In [13]:
processed_dir.mkdir(parents=True, exist_ok=True)

out_windows = processed_dir / "windows_identity_df.csv"
out_subjects = processed_dir / "subjects_df.csv"

windows_identity_df.to_csv(out_windows, index=False)
subjects_df.to_csv(out_subjects, index=False)

print("Saved:", out_windows.resolve())
print("Saved:", out_subjects.resolve())

Saved: D:\aih-privacy\data\processed\windows_identity_df.csv
Saved: D:\aih-privacy\data\processed\subjects_df.csv


In [14]:
print("Label counts:", windows_identity_df["label"].value_counts().to_dict())
print("Subjects:", windows_identity_df["subject_id"].nunique())
print("Columns:", len(windows_identity_df.columns))

Label counts: {0: 52066, 1: 26848}
Subjects: 38
Columns: 23


In [15]:
print(windows_identity_df.shape)
print(windows_identity_df.columns)


(78914, 23)
Index(['acc_max', 'acc_mean', 'acc_std', 'acc_range', 'acc_energy', 'gyro_max',
       'gyro_mean', 'gyro_std', 'gyro_range', 'gyro_energy', 'label',
       'subject_id', 'age_group', 'activity_code', 'age', 'height_cm',
       'weight_kg', 'gender', 'pid', 'synthetic_name', 'synthetic_address',
       'synthetic_phone', 'synthetic_patient_id'],
      dtype='object')


# **With overlap**

In [20]:
in_path = processed_dir / "windows_df_overlap50.csv"

df_overlap = pd.read_csv(in_path)
print("Loaded windows_df:", df_overlap.shape)
df_overlap.head()

Loaded windows_df: (78914, 14)


Unnamed: 0,acc_max,acc_mean,acc_std,acc_range,acc_energy,gyro_max,gyro_mean,gyro_std,gyro_range,gyro_energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,61.501696,36.275651,8.270789,49.125849,276865.758002,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,77.213308,43.917027,14.533872,61.152478,427987.746894,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,61.102289,39.110576,11.105643,46.904561,330594.491214,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,77.614763,31.088233,13.225029,70.586703,228275.924921,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,67.388147,33.309296,12.482396,57.557886,253063.883632,0,SA01,SA,D01


In [None]:
subjects_df = pd.read_csv(processed_dir / "subjects_df.csv")  # ajusta o nome

df_overlap_id = df_overlap.merge(
    subjects_df,
    on=["subject_id", "age_group"],
    how="left",
    validate="many_to_one"
)

out = processed_dir / "windows_identity_df_overlap50.csv"
df_overlap_id.to_csv(out, index=False)
print("Saved:", out, df_overlap_id.shape)


Saved: ..\data\processed\windows_identity_df_overlap50.csv (78914, 23)
