In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so the synthetic data set is reproducible.
# NOTE: the columns of `data` draw from this RNG in the order they are written;
# reordering the np.random.* calls below changes the generated values.
np.random.seed(42)

# Pools of values the categorical columns are sampled from.
accounts = ['ACC123456', 'ACC234567', 'ACC345678', 'ACC456789', 'ACC567890']
merchants = ['ABC Store', 'ATM', 'Employer', 'Stock Exchange', 'XYZ Electronics', 'Online Store', 'Cafe', 'Supermarket']

# Number of synthetic transactions to generate.
n_rows = 10000

# One timestamp per row, one hour apart, starting at 2024-08-01 00:00.
# The frequency is given as an explicit Timedelta rather than the 'H' alias:
# pandas >= 2.2 deprecates 'H' (FutureWarning), while a Timedelta is accepted
# by both older and newer pandas versions.
timestamps = pd.date_range(start='2024-08-01', periods=n_rows, freq=pd.Timedelta(hours=1))

data = {
    # Sequential identifiers: 1000000000 .. 1000000000 + n_rows - 1.
    'Transaction ID': np.arange(1000000000, 1000000000 + n_rows),
    # Uniformly sampled (with replacement) from the pools above.
    'Account ID': np.random.choice(accounts, n_rows),
    # Uniform amounts in [10, 2000), rounded to two decimals.
    'Transaction Amount': np.round(np.random.uniform(10.00, 2000.00, n_rows), 2),
    'Transaction Type': np.random.choice(['Purchase', 'Withdrawal', 'Deposit', 'Trade'], n_rows),
    'Merchant/Counterparty': np.random.choice(merchants, n_rows),
    'Location': np.random.choice(['New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Miami, FL', 'Online'], n_rows),
    # Stored as plain 'YYYY-MM-DD HH:MM' strings, not datetime objects.
    'Date and Time': timestamps.strftime('%Y-%m-%d %H:%M').tolist(),
}

# pandas frame holding the synthetic transactions.
pdf = pd.DataFrame(data)

  'Date and Time': pd.date_range(start='2024-08-01', periods=n_rows, freq='H').strftime('%Y-%m-%d %H:%M').tolist()


In [2]:
from pyspark.sql import SparkSession

# Obtain a Spark session for this notebook; getOrCreate() returns the session
# produced by the builder, registered under the application name "Anonymization".
spark = (
    SparkSession.builder
    .appName("Anonymization")
    .getOrCreate()
)

# Convert the pandas frame built earlier (`pdf`) into a Spark DataFrame.
df = spark.createDataFrame(pdf)

# Print the first rows without truncating long cell values.
df.show(truncate=False)

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|Location       |Date and Time   |
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|1000000000    |ACC456789 |86.8              |Deposit         |Employer             |Los Angeles, CA|2024-08-01 00:00|
|1000000001    |ACC567890 |1758.97           |Withdrawal      |ABC Store            |Chicago, IL    |2024-08-01 01:00|
|1000000002    |ACC345678 |636.72            |Purchase        |XYZ Electronics      |New York, NY   |2024-08-01 02:00|
|1000000003    |ACC567890 |266.67            |Deposit         |Employer             |New York, NY   |2024-08-01 03:00|
|1000000004    |ACC567890 |942.24            |Deposit         |ATM                  |Chicago, IL    |2024-08-01 04:00|
|1000000005    |ACC234567 |1200.63           |Tr

In [13]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Quasi-identifier columns and the sensitive attribute used by the
# anonymization steps.
QIs = ["Location", "Merchant/Counterparty", "Transaction Type"]
sens = "Transaction Amount"

# k-Anonymity by suppression: every row whose quasi-identifier combination
# occurs fewer than k times has all of its quasi-identifiers set to NULL.
k = 40

# Rows sharing the same values in all QI columns form one group.
qi_window = Window.partitionBy(*QIs)
group_too_small = F.col("__grp_cnt") < k

# Replacement expression per QI column: NULL for under-sized groups,
# the original value otherwise.
suppressed_qis = {
    name: F.when(group_too_small, F.lit(None)).otherwise(F.col(name))
    for name in QIs
}

# Attach the group size as a temporary helper column, apply the suppression,
# then remove the helper again.
df_with_counts = df.withColumn("__grp_cnt", F.count("*").over(qi_window))
df_k = df_with_counts.withColumns(suppressed_qis).drop("__grp_cnt")

# Display, one field per line, the rows in which Location or
# Merchant/Counterparty ended up suppressed.
location_missing = F.col("Location").isNull()
merchant_missing = F.col("Merchant/Counterparty").isNull()
df_k.filter(location_missing | merchant_missing).show(vertical=True, truncate=False)


-RECORD 0---------------------------------
 Transaction ID        | 1000000171       
 Account ID            | ACC123456        
 Transaction Amount    | 1558.96          
 Transaction Type      | NULL             
 Merchant/Counterparty | NULL             
 Location              | NULL             
 Date and Time         | 2024-08-08 03:00 
-RECORD 1---------------------------------
 Transaction ID        | 1000001070       
 Account ID            | ACC567890        
 Transaction Amount    | 856.86           
 Transaction Type      | NULL             
 Merchant/Counterparty | NULL             
 Location              | NULL             
 Date and Time         | 2024-09-14 14:00 
-RECORD 2---------------------------------
 Transaction ID        | 1000001765       
 Account ID            | ACC345678        
 Transaction Amount    | 285.17           
 Transaction Type      | NULL             
 Merchant/Counterparty | NULL             
 Location              | NULL             
 Date and T