In [1]:
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
import numpy as np
from pathlib import Path
import hashlib

from faker import Faker

from src.aih_privacy.datasets.subjects_sisfall import load_subjects

In [84]:
processed_dir = Path("../data/processed")
in_path = processed_dir / "windows_df.csv"

assert in_path.exists(), f"Missing file: {in_path.resolve()}"

windows_df = pd.read_csv(in_path)
print("Loaded windows_df:", windows_df.shape)
windows_df.head()


Loaded windows_df: (78914, 9)


Unnamed: 0,max,mean,std,range,energy,label,subject_id,age_group,activity_code
0,1.505301,1.021371,0.161249,0.748475,213.839935,0,SA01,SA,D01
1,1.686817,1.094025,0.183646,0.891159,246.123306,0,SA01,SA,D01
2,1.598864,1.042618,0.176089,0.814388,223.61203,0,SA01,SA,D01
3,1.388703,1.040773,0.144043,0.621426,220.791382,0,SA01,SA,D01
4,1.441374,1.059405,0.145564,0.63907,228.705505,0,SA01,SA,D01


In [85]:
required_cols = {"max","mean","std","range","energy","label","subject_id","age_group","activity_code"}
missing = required_cols - set(windows_df.columns)
assert not missing, f"windows_df missing columns: {missing}"

assert windows_df["label"].isin([0,1]).all()
assert windows_df["subject_id"].notna().all()
assert windows_df["age_group"].isin(["SA","SE"]).all()

# Garantir que ainda NÃO tem QIs (para evitar _x/_y)
qi_cols = {"age","gender","height_cm","weight_kg"}
assert qi_cols.isdisjoint(set(windows_df.columns)), "windows_df already contains QIs. Keep QIs only in subjects_df."


In [86]:
subjects_df = load_subjects().copy()
print("Loaded subjects_df:", subjects_df.shape)
subjects_df.head()


Loaded subjects_df: (38, 6)


Unnamed: 0,subject_id,age,height_cm,weight_kg,gender,age_group
0,SA01,26,165,53.0,F,SA
1,SA02,23,176,58.5,M,SA
2,SA03,19,156,48.0,F,SA
3,SA04,23,170,72.0,M,SA
4,SA05,22,172,69.5,M,SA


In [87]:
required_subject_cols = {"subject_id","age_group","age","gender","height_cm","weight_kg"}
missing = required_subject_cols - set(subjects_df.columns)
assert not missing, f"subjects_df missing columns: {missing}"

assert subjects_df["subject_id"].is_unique, "subject_id must be unique"
assert subjects_df["age_group"].isin(["SA","SE"]).all()
assert subjects_df[["age","gender","height_cm","weight_kg"]].isna().sum().sum() == 0


In [88]:
PSEUDONYM_SALT = "aih-sisfall-v1"  # mantém fixo

def make_pid(subject_id: str, salt: str = PSEUDONYM_SALT) -> str:
    raw = f"{salt}:{subject_id}".encode("utf-8")
    return hashlib.sha256(raw).hexdigest()[:16]

subjects_df["pid"] = subjects_df["subject_id"].apply(make_pid)
assert subjects_df["pid"].nunique() == len(subjects_df)

subjects_df[["subject_id","pid"]].head()


Unnamed: 0,subject_id,pid
0,SA01,8b39978f3bf493e0
1,SA02,8cca10112272b7d0
2,SA03,5a58367b9ab21a3f
3,SA04,ac62c148b01bdf2b
4,SA05,144051cc52779481


In [89]:
fake = Faker()
fake.seed_instance(42)  # reprodutível

subjects_df["synthetic_name"] = [fake.name() for _ in range(len(subjects_df))]
subjects_df["synthetic_address"] = [fake.address().replace("\n", ", ") for _ in range(len(subjects_df))]
subjects_df["synthetic_phone"] = [fake.phone_number() for _ in range(len(subjects_df))]
subjects_df["synthetic_patient_id"] = [fake.bothify(text="??-########") for _ in range(len(subjects_df))]

assert subjects_df[["synthetic_name","synthetic_patient_id"]].isna().sum().sum() == 0
subjects_df.head()


Unnamed: 0,subject_id,age,height_cm,weight_kg,gender,age_group,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,SA01,26,165,53.0,F,SA,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
1,SA02,23,176,58.5,M,SA,8cca10112272b7d0,Noah Rhodes,"71822 Arroyo Expressway, Allisonchester, IL 71187",643.252.4082,UY-89178390
2,SA03,19,156,48.0,F,SA,5a58367b9ab21a3f,Angie Henderson,"465 Lam Mission, East Jeffreymouth, AK 77611",9849271094,Rg-84700766
3,SA04,23,170,72.0,M,SA,ac62c148b01bdf2b,Daniel Wagner,"10310 Jones Freeway, Elizabethborough, ND 17843",252.404.7116x719,ir-77115921
4,SA05,22,172,69.5,M,SA,144051cc52779481,Cristian Santos,"76311 Gomez Loop Suite 010, Chandlerville, IA ...",(594)813-1869,DG-99856984


In [90]:
windows_identity_df = windows_df.merge(
    subjects_df,
    on=["subject_id","age_group"],
    how="left",
    validate="many_to_one"
)

print("windows_identity_df:", windows_identity_df.shape)
windows_identity_df.head()


windows_identity_df: (78914, 18)


Unnamed: 0,max,mean,std,range,energy,label,subject_id,age_group,activity_code,age,height_cm,weight_kg,gender,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,1.505301,1.021371,0.161249,0.748475,213.839935,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
1,1.686817,1.094025,0.183646,0.891159,246.123306,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
2,1.598864,1.042618,0.176089,0.814388,223.61203,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
3,1.388703,1.040773,0.144043,0.621426,220.791382,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177
4,1.441374,1.059405,0.145564,0.63907,228.705505,0,SA01,SA,D01,26,165,53.0,F,8b39978f3bf493e0,Allison Hill,"893 Nathaniel Estates Apt. 957, North Sarahpor...",631-335-1823x374,Xx-02681177


In [None]:
# Não pode haver colunas duplicadas com sufixos
bad = [c for c in windows_identity_df.columns if c.endswith("_x") or c.endswith("_y")]
assert not bad, f"Found duplicated columns after merge: {bad}"

# QIs têm de existir e sem NaNs
assert windows_identity_df[["age","gender","height_cm","weight_kg"]].isna().sum().sum() == 0

# IDs sintéticos e pid também
assert windows_identity_df[["pid","synthetic_patient_id"]].isna().sum().sum() == 0

# Cada subject_id deve ter QIs constantes
check = windows_identity_df.groupby("subject_id")[["age","gender","height_cm","weight_kg"]].nunique()
assert check.max().max() == 1

In [92]:
subjects_df = load_subjects().copy()

subjects_df["pid"] = subjects_df["subject_id"].apply(make_pid)

subjects_df["synthetic_name"] = [fake.name() for _ in range(len(subjects_df))]
subjects_df["synthetic_address"] = [fake.address().replace("\n", ", ") for _ in range(len(subjects_df))]
subjects_df["synthetic_phone"] = [fake.phone_number() for _ in range(len(subjects_df))]
subjects_df["synthetic_patient_id"] = [fake.bothify(text="??-########") for _ in range(len(subjects_df))]

subjects_df.head()

Unnamed: 0,subject_id,age,height_cm,weight_kg,gender,age_group,pid,synthetic_name,synthetic_address,synthetic_phone,synthetic_patient_id
0,SA01,26,165,53.0,F,SA,8b39978f3bf493e0,Kimberly Rodgers,"58527 Welch Valleys, North Jennifer, NH 05348",001-718-801-7046x226,OQ-58673622
1,SA02,23,176,58.5,M,SA,8cca10112272b7d0,Mary Walter,"05486 King Terrace, West Michael, WI 06766",466.518.2512x868,ya-66093796
2,SA03,19,156,48.0,F,SA,5a58367b9ab21a3f,Linda Moore,"1566 Kline Lights Suite 277, Sandrahaven, KY 1...",5049786863,xu-33419182
3,SA04,23,170,72.0,M,SA,ac62c148b01bdf2b,Michael Weber,"9284 Garcia Islands, Kirkchester, GA 90909",(324)231-0690x331,yN-37147859
4,SA05,22,172,69.5,M,SA,144051cc52779481,Michelle Williams,"59640 Emily Lodge Suite 820, Port Cathy, WY 57933",690.530.6738x302,RT-69155792


In [93]:
processed_dir.mkdir(parents=True, exist_ok=True)

out_windows = processed_dir / "windows_identity_df.csv"
out_subjects = processed_dir / "subjects_df.csv"

windows_identity_df.to_csv(out_windows, index=False)
subjects_df.to_csv(out_subjects, index=False)

print("Saved:", out_windows.resolve())
print("Saved:", out_subjects.resolve())

Saved: C:\AIH\Workspace\AIH_fall_privacy\data\processed\windows_identity_df.csv
Saved: C:\AIH\Workspace\AIH_fall_privacy\data\processed\subjects_df.csv


In [94]:
print("Label counts:", windows_identity_df["label"].value_counts().to_dict())
print("Subjects:", windows_identity_df["subject_id"].nunique())
print("Columns:", len(windows_identity_df.columns))

Label counts: {0: 52066, 1: 26848}
Subjects: 38
Columns: 18


In [95]:
print(windows_identity_df.shape)
print(windows_identity_df.columns)


(78914, 18)
Index(['max', 'mean', 'std', 'range', 'energy', 'label', 'subject_id',
       'age_group', 'activity_code', 'age', 'height_cm', 'weight_kg', 'gender',
       'pid', 'synthetic_name', 'synthetic_address', 'synthetic_phone',
       'synthetic_patient_id'],
      dtype='object')
