In [2]:
pip install Faker

Collecting Faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-37.3.0


In [3]:
import pandas as pd
from faker import Faker
import random

# Fixed seed so that the random age noise and the Faker-generated names, cities
# and emails are identical on every Restart & Run All.
SEED = 42

fake = Faker()
Faker.seed(SEED)   # seeds the random generator used by Faker providers
random.seed(SEED)  # seeds the stdlib generator used by random.randint() below

# Sample Data
# Each tuple is one person; the tuples are transposed into the column-oriented
# dict that pandas consumes, so `data` keeps its {column: [values]} layout.
_columns = ('ID', 'Name', 'Age', 'City', 'Email')
_records = [
    (1, 'Alice Smith', 25, 'New York', 'alice@example.com'),
    (2, 'Bob Johnson', 30, 'Los Angeles', 'bob@example.com'),
    (3, 'Charlie Brown', 22, 'Chicago', 'charlie@example.com'),
    (4, 'David Lee', 35, 'Houston', 'david@example.com'),
    (5, 'Eve Wilson', 28, 'Phoenix', 'eve@example.com'),
]
data = {
    column: list(values)
    for column, values in zip(_columns, zip(*_records))
}

df = pd.DataFrame(data)

# Anonymization Techniques

# 1. Masking/Redaction
# Work on a copy so the original frame is untouched, then overwrite every value
# in the Name and Email columns with the same fixed placeholder.
df_masked = df.copy()
df_masked[['Name', 'Email']] = '*****'

# 2. Generalization/Aggregation
def _age_band(age):
    """Return the ten-year bracket containing ``age`` as 'low-high' (25 -> '20-29')."""
    low = age // 10 * 10
    return f'{low}-{low + 9}'


df_generalized = df.copy()
# Replace each exact age with its decade bracket.
df_generalized['Age'] = df_generalized['Age'].map(_age_band)
# Replace every city with the same coarse value.
df_generalized['City'] = 'USA'

# 3. Randomization/Perturbation (adding noise)
# Shift each age by an independent integer drawn uniformly from [-2, 2];
# one random.randint() call per row, in row order.
df_perturbed = df.copy()
df_perturbed['Age'] = [
    age + random.randint(-2, 2)
    for age in df_perturbed['Age']
]

# Pseudonymization Techniques

# 1. Replacing with Fake Data
# Swap the Name, City and Email columns for Faker-generated values. Columns are
# filled one after another (all names, then all cities, then all emails), one
# generator call per row.
df_pseudonymized = df.copy()
_row_count = len(df)
for _column, _provider in (
    ('Name', fake.name),
    ('City', fake.city),
    ('Email', fake.email),
):
    df_pseudonymized[_column] = [_provider() for _ in range(_row_count)]

# 2. Tokenization (using a lookup table)
# Build the lookup table: every distinct name, in order of first appearance,
# is paired with a token of the form 'user_<n>' counting from 0.
_distinct_names = df['Name'].unique()
_tokens = [f'user_{position}' for position in range(len(_distinct_names))]
name_to_token = dict(zip(_distinct_names, _tokens))

# Replace each name by its token via the lookup table; df itself is not modified.
df_tokenized = df.copy().assign(Name=df['Name'].map(name_to_token))

# Print every variant under its heading, in the order the techniques were applied.
_report_sections = (
    ("Original Data:\n", df),
    ("\nMasked Data:\n", df_masked),
    ("\nGeneralized Data:\n", df_generalized),
    ("\nPerturbed Data:\n", df_perturbed),
    ("\nPseudonymized Data:\n", df_pseudonymized),
    ("\nTokenized Data:\n", df_tokenized),
)
for _heading, _frame in _report_sections:
    print(_heading, _frame)

Original Data:
    ID           Name  Age         City                Email
0   1    Alice Smith   25     New York    alice@example.com
1   2    Bob Johnson   30  Los Angeles      bob@example.com
2   3  Charlie Brown   22      Chicago  charlie@example.com
3   4      David Lee   35      Houston    david@example.com
4   5     Eve Wilson   28      Phoenix      eve@example.com

Masked Data:
    ID   Name  Age         City  Email
0   1  *****   25     New York  *****
1   2  *****   30  Los Angeles  *****
2   3  *****   22      Chicago  *****
3   4  *****   35      Houston  *****
4   5  *****   28      Phoenix  *****

Generalized Data:
    ID           Name    Age City                Email
0   1    Alice Smith  20-29  USA    alice@example.com
1   2    Bob Johnson  30-39  USA      bob@example.com
2   3  Charlie Brown  20-29  USA  charlie@example.com
3   4      David Lee  30-39  USA    david@example.com
4   5     Eve Wilson  20-29  USA      eve@example.com

Perturbed Data:
    ID           Nam