In [None]:
pip install presidio-analyzer presidio-anonymizer

In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the synthetic transactions are reproducible.
np.random.seed(42)

# Value pools that the categorical columns are sampled from.
accounts = ['ACC123456', 'ACC234567', 'ACC345678', 'ACC456789', 'ACC567890']
merchants = ['ABC Store', 'ATM', 'Employer', 'Stock Exchange', 'XYZ Electronics', 'Online Store', 'Cafe', 'Supermarket']

# Number of synthetic transactions to generate.
n_rows = 10000

# NOTE: the np.random.* calls below draw from the seeded global RNG in the
# order they appear; reordering them changes the generated values.
data = {
    # Sequential, unique ten-digit transaction identifiers.
    'Transaction ID': np.arange(1000000000, 1000000000 + n_rows),
    'Account ID': np.random.choice(accounts, n_rows),
    # Uniform amounts in [10, 2000), rounded to two decimals.
    'Transaction Amount': np.round(np.random.uniform(10.00, 2000.00, n_rows), 2),
    'Transaction Type': np.random.choice(['Purchase', 'Withdrawal', 'Deposit', 'Trade'], n_rows),
    'Merchant/Counterparty': np.random.choice(merchants, n_rows),
    'Location': np.random.choice(['New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Miami, FL', 'Online'], n_rows),
    # One timestamp per hour starting 2024-08-01, formatted as text.
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated in
    # recent pandas releases and emits a FutureWarning.
    'Date and Time': pd.date_range(start='2024-08-01', periods=n_rows, freq='h').strftime('%Y-%m-%d %H:%M').tolist()
}

# pandas frame holding the synthetic transactions.
pdf = pd.DataFrame(data)

  'Date and Time': pd.date_range(start='2024-08-01', periods=n_rows, freq='H').strftime('%Y-%m-%d %H:%M').tolist()


In [3]:
from pyspark.sql import SparkSession

# Get (or reuse) a Spark session registered under the app name "MaskedData".
spark = (
    SparkSession.builder
    .appName("MaskedData")
    .getOrCreate()
)

# Turn the pandas frame of synthetic transactions into a Spark DataFrame.
df = spark.createDataFrame(data=pdf)

# Print a preview of the rows without truncating long cell values.
df.show(truncate=False)

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|Location       |Date and Time   |
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|1000000000    |ACC456789 |86.8              |Deposit         |Employer             |Los Angeles, CA|2024-08-01 00:00|
|1000000001    |ACC567890 |1758.97           |Withdrawal      |ABC Store            |Chicago, IL    |2024-08-01 01:00|
|1000000002    |ACC345678 |636.72            |Purchase        |XYZ Electronics      |New York, NY   |2024-08-01 02:00|
|1000000003    |ACC567890 |266.67            |Deposit         |Employer             |New York, NY   |2024-08-01 03:00|
|1000000004    |ACC567890 |942.24            |Deposit         |ATM                  |Chicago, IL    |2024-08-01 04:00|
|1000000005    |ACC234567 |1200.63           |Tr

In [4]:
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Presidio engines: the analyzer detects entities in text, the anonymizer
# rewrites the detected spans.
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

# Custom regex recognizer for account identifiers of the form "ACC" followed
# by six digits, reported as entity type ACC_ID with score 0.8.
_acc_id_pattern = Pattern(name="ACC_ID", regex=r"ACC\d{6}", score=0.8)
account_recognizer = PatternRecognizer(
    supported_entity="ACC_ID",
    patterns=[_acc_id_pattern],
)
analyzer.registry.add_recognizer(account_recognizer)


def _mask_from_start(char_count):
    """Build a "mask" operator that replaces up to `char_count` leading characters with '*'."""
    return OperatorConfig(
        "mask",
        params={"masking_char": "*", "chars_to_mask": char_count, "from_end": False},
    )


# Per-entity anonymization rules. ACC_ID masks six characters of the match;
# LOCATION uses a limit of 100, presumably large enough to cover the whole
# detected span.
operators = {
    "ACC_ID": _mask_from_start(6),
    "LOCATION": _mask_from_start(100),
}

def mask_udf_fn(text: str) -> str:
    """Mask the configured entity types found in `text`.

    The module-level `analyzer` is asked for every entity type that has an
    entry in `operators`; the module-level `anonymizer` then applies those
    operators to the detected spans.

    Args:
        text: Value to scan. Falsy values (None or an empty string) are
            returned unchanged without invoking the engines.

    Returns:
        The anonymized text, or `text` itself when it is falsy.
    """
    if text:
        # Only look for entity types we have a masking rule for.
        findings = analyzer.analyze(
            text=text,
            entities=[*operators],
            language="en",
        )
        anonymized = anonymizer.anonymize(
            text=text,
            analyzer_results=findings,
            operators=operators,
        )
        return anonymized.text
    return text

# Wrap the Python masking function as a Spark UDF that returns strings.
mask_udf = udf(f=mask_udf_fn, returnType=StringType())

# Replace each sensitive column with its masked version, one column at a
# time, keeping the original column names and positions.
df_masked = df
for masked_column in ("Account ID", "Location"):
    df_masked = df_masked.withColumn(masked_column, mask_udf(masked_column))

# Print a preview of the masked rows without truncating cell values.
df_masked.show(truncate=False)




✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|Location       |Date and Time   |
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|1000000000    |******789 |86.8              |Deposit         |Employer             |***********, CA|2024-08-01 00:00|
|1000000001    |******890 |1758.97           |Withdrawal      |ABC Store            |*******, **    |2024-08-01 01:00|
|1000000002    |******678 |636.72            |Purchase        |XYZ Electronics      |********, **   |2024-08-01 02:00|
|1000000003    |******890 |266.67            |Deposit         |Employer             |********, **   |2024-08-01 03:00|
|1000000004    |******890 |942.24            |Deposit         |ATM                  |*******, **    |2024-08-01 04:00|
|1000000005    |******567 |1200.63           |Tr