In [8]:
import pandas as pd

# Checking to see if my Jupyter notebook can read the CSV file
df = pd.read_csv(r"C:\Users\higas\Downloads\generative-ai-data-anonymization\synthetic_dataset.csv")
print(df.head())
print(df.columns)

              Name                     Email  Age  Contact Number
0  Brenda Richards    michelle76@example.org   79      9898586166
1    Antonio Perez    psingleton@example.net   19      9876282758
2     Terry Monroe    edwardross@example.net   30      9782846470
3    Heather Floyd    cookbrooke@example.net   65      9739572462
4    Allen Shelton  craigcollins@example.net   63      9676063153
Index(['Name', 'Email', 'Age', 'Contact Number'], dtype='object')


In [9]:
import pandas as pd
import random

# Load the CSV
df = pd.read_csv(r"C:\Users\higas\Downloads\generative-ai-data-anonymization\synthetic_dataset.csv")

# 1. Redact Name (keep vowels only, replace others with #)
def redact_name(name):
    vowels = "aeiouAEIOU"
    return "".join([ch if ch in vowels or not ch.isalpha() else "#" for ch in name])

df["Name"] = df["Name"].apply(redact_name)

# 2. Assign pseudonyms to Email
df["Email"] = [f"user_{i}@pseudo.com" for i in range(1, len(df) + 1)]

# 3. Add random noise to first 5 digits of Contact Number (keep last 5 unchanged)
def add_noise(contact):
    contact = str(contact)
    noisy_first = "".join([str(random.randint(0, 9)) for _ in range(5)])
    return noisy_first + contact[5:]

df["Contact Number"] = df["Contact Number"].apply(add_noise)

# Show transformed data
print(df.head())

# Save to new CSV if needed
df.to_csv(r"C:\Users\higas\Downloads\generative-ai-data-anonymization\synthetic_dataset_anonymized.csv", index=False)


              Name              Email  Age Contact Number
0  ##e##a #i##a###  user_1@pseudo.com   79     0650286166
1    A##o#io #e#e#  user_2@pseudo.com   19     0861682758
2     #e### #o##oe  user_3@pseudo.com   30     3082346470
3    #ea##e# ##o##  user_4@pseudo.com   65     8574272462
4    A##e# ##e##o#  user_5@pseudo.com   63     2795263153


In [10]:
import pandas as pd

# Load anonymized dataset
df = pd.read_csv(r"C:\Users\higas\Downloads\generative-ai-data-anonymization\synthetic_dataset_anonymized.csv")

# Step 1: Generalize Age into bins and convert to string
bins = [0, 18, 29, 49, 120]
labels = ["0-18", "19-29", "30-49", "50+"]
df["Age_Group"] = pd.cut(df["Age"], bins=bins, labels=labels).astype(str)

# Step 2: Generalize Contact Number (keep only last 3 digits)
df["Contact_Group"] = df["Contact Number"].astype(str).str[-3:]

# Step 3: Count how many records per quasi-identifier combination
qi = ["Age_Group", "Contact_Group"]
group_sizes = df.groupby(qi, observed=False).size().reset_index(name="count")

# Step 4: Merge the counts back
df = df.merge(group_sizes, on=qi, how="left")

# Step 5: Suppress groups with fewer than k records
k = 3
df.loc[df["count"] < k, ["Age_Group", "Contact_Group"]] = "Suppressed"

# Step 6: Drop helper column
df = df.drop(columns="count")

# Show anonymized dataset
print(df.head())

# Optional: save to CSV
df.to_csv(r"C:\Users\higas\Downloads\generative-ai-data-anonymization\synthetic_dataset_k_anonymized.csv", index=False)


              Name              Email  Age  Contact Number   Age_Group  \
0  ##e##a #i##a###  user_1@pseudo.com   79       650286166  Suppressed   
1    A##o#io #e#e#  user_2@pseudo.com   19       861682758  Suppressed   
2     #e### #o##oe  user_3@pseudo.com   30      3082346470  Suppressed   
3    #ea##e# ##o##  user_4@pseudo.com   65      8574272462  Suppressed   
4    A##e# ##e##o#  user_5@pseudo.com   63      2795263153  Suppressed   

  Contact_Group  
0    Suppressed  
1    Suppressed  
2    Suppressed  
3    Suppressed  
4    Suppressed  
