In [2]:
import pandas as pd
import numpy as np
import uuid
import base64

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
# Load data
data_path = "/content/drive/MyDrive/Data.csv"
df = pd.read_csv(data_path)

df.head()


Unnamed: 0,id,age,year,sex,glang,part,job,stud_h,health,psyt,jspe,qcae_cog,qcae_aff,amsp,erec_mean,cesd,stai_t,mbi_ex,mbi_cy,mbi_ea
0,2,18,1,1,120,1,0,56,3,0,88,62,27,17,0.738095,34,61,17,13,20
1,4,26,4,1,1,1,0,20,4,0,109,55,37,22,0.690476,7,33,14,11,26
2,9,21,3,2,1,0,0,36,3,0,106,64,39,17,0.690476,25,73,24,7,23
3,10,21,2,2,1,0,1,51,5,0,101,52,33,18,0.833333,17,48,16,10,21
4,13,21,3,1,1,1,0,22,4,0,102,58,28,21,0.690476,14,46,22,14,23


In [15]:
#df['id_token'] = [base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode('ascii') for _ in range(len(df))]


# Tokenize
#tokens, uniques = pd.factorize(df["id"], sort=True)
df['id_token'] = [base64.urlsafe_b64encode(uuid.uuid4().bytes).rstrip(b'=').decode('ascii') for _ in range(len(df))]

# Create the mapping from original 'id' to the new 'id_token'
id_to_token = dict(zip(df['id'], df['id_token']))

# Create the inverse mapping from the new 'id_token' to the original 'id'
token_to_id = {v: k for k, v in id_to_token.items()}


print("id_to_token:", list(id_to_token.items())[:10]) # Print first 10 for brevity
print("token_to_id:", list(token_to_id.items())[:10]) # Print first 10 for brevity


# Demonstrate converting tokens back to original values
df["ID_back"] = df["id_token"].map(token_to_id)

df = df.drop(columns=['id'])  # Drop original id
df = df.drop(columns=['ID_back'])  # Drop original id

df.head()



id_to_token: [(2, 'B1WCbvkiSYWtrqQ1wlx-eA'), (4, 'ySz728HlRLC1CwRN9HEFYA'), (9, 'WR6-n7zbRuKiYzqiYEOgDw'), (10, 'NefwwXhgQZ23jxWxyCeltQ'), (13, 'cIqFMvLRSjqSG1Kcvt0G7Q'), (14, 'NWohBUtkTrqR_7Tv1WHNXA'), (17, 'ew2IsUyrRxWgxGGlMzO0Uw'), (21, 'BbLuguC9Tj-qkuGO4bZBTg'), (23, 'SRblImZ1RKGnYkRyxLbK6Q'), (24, 'CKya1ivaSFiwuY1DJ94pFg')]
token_to_id: [('B1WCbvkiSYWtrqQ1wlx-eA', 2), ('ySz728HlRLC1CwRN9HEFYA', 4), ('WR6-n7zbRuKiYzqiYEOgDw', 9), ('NefwwXhgQZ23jxWxyCeltQ', 10), ('cIqFMvLRSjqSG1Kcvt0G7Q', 13), ('NWohBUtkTrqR_7Tv1WHNXA', 14), ('ew2IsUyrRxWgxGGlMzO0Uw', 17), ('BbLuguC9Tj-qkuGO4bZBTg', 21), ('SRblImZ1RKGnYkRyxLbK6Q', 23), ('CKya1ivaSFiwuY1DJ94pFg', 24)]


Unnamed: 0,age,year,sex,glang,part,job,stud_h,health,psyt,jspe,qcae_cog,qcae_aff,amsp,erec_mean,cesd,stai_t,mbi_ex,mbi_cy,mbi_ea,id_token
0,18,1,1,120,1,0,56,3,0,88,62,27,17,0.738095,34,61,17,13,20,B1WCbvkiSYWtrqQ1wlx-eA
1,26,4,1,1,1,0,20,4,0,109,55,37,22,0.690476,7,33,14,11,26,ySz728HlRLC1CwRN9HEFYA
2,21,3,2,1,0,0,36,3,0,106,64,39,17,0.690476,25,73,24,7,23,WR6-n7zbRuKiYzqiYEOgDw
3,21,2,2,1,0,1,51,5,0,101,52,33,18,0.833333,17,48,16,10,21,NefwwXhgQZ23jxWxyCeltQ
4,21,3,1,1,1,0,22,4,0,102,58,28,21,0.690476,14,46,22,14,23,cIqFMvLRSjqSG1Kcvt0G7Q


#### Quasi-Identifiers (QIs)

QIs - age, gender, curriculum year, mother tongue.  
These values may not pinpoint a single person, but together (e.g., a 24-year-old, female, in Mmed1, speaks Turkish) they might uniquely identify someone.

In [16]:
# --- GENERALIZATION ---

# Bin 'age' into age groups (adjust bins as needed)
df['age_group'] = pd.cut(df['age'], bins=[16, 20, 24, 29, 39, 100], labels=['17-20', '21-24', '25-29', '30-39', '40+'], right=True, include_lowest=True)

# Collapse rare languages (glang) to 'Other'
common_langs = df['glang'].value_counts().index[:5]
df['glang_gen'] = df['glang'].apply(lambda x: x if x in common_langs else 'Other')

# Map 'sex' to strings (optional, for clarity)
df['sex'] = df['sex'].map({1: 'Man', 2: 'Woman', 3: 'Non-binary'})

# Map 'year' to broader curriculum groups if needed
df['year_group'] = df['year'].map({1: 'Bmed', 2: 'Bmed', 3: 'Bmed', 4: 'Mmed', 5: 'Mmed', 6: 'Mmed'})

df.head()


Unnamed: 0,age,year,sex,glang,part,job,stud_h,health,psyt,jspe,...,erec_mean,cesd,stai_t,mbi_ex,mbi_cy,mbi_ea,id_token,age_group,glang_gen,year_group
0,18,1,Man,120,1,0,56,3,0,88,...,0.738095,34,61,17,13,20,B1WCbvkiSYWtrqQ1wlx-eA,17-20,Other,Bmed
1,26,4,Man,1,1,0,20,4,0,109,...,0.690476,7,33,14,11,26,ySz728HlRLC1CwRN9HEFYA,25-29,1,Mmed
2,21,3,Woman,1,0,0,36,3,0,106,...,0.690476,25,73,24,7,23,WR6-n7zbRuKiYzqiYEOgDw,21-24,1,Bmed
3,21,2,Woman,1,0,1,51,5,0,101,...,0.833333,17,48,16,10,21,NefwwXhgQZ23jxWxyCeltQ,21-24,1,Bmed
4,21,3,Man,1,1,0,22,4,0,102,...,0.690476,14,46,22,14,23,cIqFMvLRSjqSG1Kcvt0G7Q,21-24,1,Bmed


In [17]:
# --- SELECT QIs AND SENSITIVE ATTRIBUTE ---
qi_cols = ['age_group', 'year_group', 'sex', 'glang_gen']
sensitive_col = 'health'  # Or 'cesd', 'stai_t', etc.


#### k-Anonymity
k-Anonymity is a privacy concept that ensures that each combination of quasi-identifier values appears at least **k** times in the dataset.
- If a dataset is **5-anonymous (k=5)**, then for any combination of QI values, there are at least 5 individuals with those same values.

By generalizing (e.g., binning ages) or suppressing (removing or masking) QI values until each group has at least k records.

#### l-Diversity
l-Diversity is an extension of k-anonymity that also considers the diversity of **sensitive attributes** (like health status, diagnosis, etc.) within each group of identical QI values.
- A dataset has **l-diversity** if, for every group of records sharing the same QI values, there are at least **l** "well-represented" distinct values for the sensitive attribute.
- Without l-diversity, even if a group has 5 people (k=5), if all of them have the same diagnosis, knowing the QIs tells you the sensitive value. l-Diversity ensures there is uncertainty about the sensitive information.

In [18]:
# --- k-ANONYMITY: GROUP BY QIs ---

def k_violations(df, qis, k):
    counts = df.groupby(qis, dropna=False, sort=False).size()
    return counts[counts < k].sort_values()

def l_violations(df, qis, sensitive, l):
    diversity = df.groupby(qis, dropna=False, sort=False)[sensitive].nunique(dropna=True)
    return diversity[diversity < l].sort_values()


In [19]:
k, l = 5, 2

kv = k_violations(df, qi_cols, k)
print(f"\n before— k-violations: {len(kv)} groups")
print("Before — k-anonymity (head):")
print(k_violations(df, qi_cols, k).head(10))


lv = l_violations(df, qi_cols, sensitive_col, l)
print(f"before — l-violations: {len(lv)} groups")
print("\nBefore — l-diversity (head):")
print(l_violations(df, qi_cols, sensitive_col, l).head(10))


 before— k-violations: 155 groups
Before — k-anonymity (head):
age_group  year_group  sex         glang_gen
17-20      Bmed        Non-binary  1            0
                                   90           0
                       Man         20           0
           Mmed        Man         1            0
                                   Other        0
           Bmed        Non-binary  20           0
                                   15           0
                                   102          0
           Mmed        Man         102          0
                                   90           0
dtype: int64
before — l-violations: 138 groups

Before — l-diversity (head):
age_group  year_group  sex         glang_gen
17-20      Bmed        Non-binary  1            0
                                   90           0
                       Man         20           0
           Mmed        Man         1            0
                                   Other        0
           Bmed    

  counts = df.groupby(qis, dropna=False, sort=False).size()
  counts = df.groupby(qis, dropna=False, sort=False).size()
  diversity = df.groupby(qis, dropna=False, sort=False)[sensitive].nunique(dropna=True)
  diversity = df.groupby(qis, dropna=False, sort=False)[sensitive].nunique(dropna=True)


In [20]:
def k_l_anonymize(df, qis, sensitive, k, l):
    anon_df = df.copy()
    grouped = anon_df.groupby(qis, dropna=False, sort=False)
    mask = pd.Series(False, index=anon_df.index)
    for _, group in grouped:
        if len(group) < k or group[sensitive].nunique(dropna=True) < l:
            mask.loc[group.index] = True
    anon_df.loc[mask, qis] = np.nan
    return anon_df




In [21]:
anon_df = k_l_anonymize(df, qi_cols, sensitive_col, k, l)

  grouped = anon_df.groupby(qis, dropna=False, sort=False)


In [22]:
# Drop raw QIs that could re-identify
for c in ['age', 'glang', 'year']:
    if c in anon_df.columns:
        anon_df.drop(columns=[c], inplace=True)

In [23]:
# Report
def summarize(anon_df):
    kv = k_violations(anon_df, qi_cols, k)
    print(f"\nAfter — k-anonymisation: {len(kv)} groups")
    print(k_violations(anon_df, qi_cols, k).head(10))


    lv = l_violations(anon_df, qi_cols, sensitive_col, l)
    print(f"After — l-diversity: {len(lv)} groups")
    print(l_violations(anon_df, qi_cols, sensitive_col, l).head(10))


    suppressed_frac = anon_df[qi_cols].isna().any(axis=1).mean()
    print(f"Suppressed rows (any QI NaN): {suppressed_frac:.2%}")

summarize(anon_df)


anon_df.to_csv(f"Data_Carrard_anonymized.csv", index=False)
print(f"\nSaved: Data_Carrard_anonymized.csv")


After — k-anonymisation: 352 groups
age_group  year_group  sex    glang_gen
40+        Bmed        Woman  Other        0
                              20           0
                              15           0
                              102          0
                              90           0
                              1            0
                              NaN          0
                       Man    Other        0
                              20           0
                              15           0
dtype: int64
After — l-diversity: 352 groups
age_group  year_group  sex    glang_gen
40+        Bmed        Woman  Other        0
                              20           0
                              15           0
                              102          0
                              90           0
                              1            0
                              NaN          0
                       Man    Other        0
                            

  counts = df.groupby(qis, dropna=False, sort=False).size()
  counts = df.groupby(qis, dropna=False, sort=False).size()
  diversity = df.groupby(qis, dropna=False, sort=False)[sensitive].nunique(dropna=True)
  diversity = df.groupby(qis, dropna=False, sort=False)[sensitive].nunique(dropna=True)
