In [0]:
%pip install pycountry
%pip install faker

In [0]:
%sql
-- Create the volume workspace.default.source if it does not already exist.
-- Later cells save their CSV output under /Volumes/workspace/default/source/.
CREATE VOLUME IF NOT EXISTS workspace.default.source 

In [0]:
import pycountry
import pandas as pd

# High-risk countries, spelled exactly as pycountry's `name` attribute.
# The spelling matters twice: it drives the "High"/"Low" membership test below,
# and the same list is sampled as counterparty countries for transactions, so a
# name that is not a pycountry `name` would never be scored "High" and could not
# be matched back to this risk table.
# List positions are kept stable so index-based sampling picks the same entries.
high_risk_countries = [
    "Algeria", "Angola", "Bolivia, Plurinational State of", "Bulgaria", "Burkina Faso",
    "Cameroon", "Côte d'Ivoire",
    "Korea, Democratic People's Republic of", "Congo, The Democratic Republic of the", "Haiti",
    "Iran, Islamic Republic of", "Kenya", "Lao People's Democratic Republic", "Lebanon",
    "Monaco", "Mozambique",
    "Myanmar", "Namibia", "Nepal", "Nigeria", "Senegal", "South Africa", "South Sudan",
    "Syrian Arab Republic", "Venezuela, Bolivarian Republic of", "Viet Nam",
    "Virgin Islands, British", "Yemen"
]


# Get all recognized countries using pycountry
all_countries = {country.name for country in pycountry.countries}

# Fail loudly if a high-risk name does not exist in pycountry; otherwise that
# country would silently be written with a "Low" score (or be missing entirely).
high_risk_set = set(high_risk_countries)
unmatched_high_risk = sorted(high_risk_set - all_countries)
if unmatched_high_risk:
    raise ValueError(
        f"High-risk countries not found in pycountry names: {unmatched_high_risk}"
    )

# Build AML risk score table: one row per pycountry country, sorted by name
aml_risk_data = [
    {"country": name, "aml_risk_score": "High" if name in high_risk_set else "Low"}
    for name in sorted(all_countries)
]

# Convert to DataFrame
aml_risk_df = spark.createDataFrame(aml_risk_data)

# Save as CSV (Spark writes a folder of part files at this path)
aml_risk_df.write.mode("overwrite").option("header", "true").format("csv").save("/Volumes/workspace/default/source/aml_risk")

print("✅ aml_risk_scores.csv file generated.")

In [0]:
# Collect the AML risk table to pandas and write it as one CSV file in the workspace folder.
# NOTE(review): pandas' to_csv writes the row index as an unnamed first column by default —
# confirm that downstream readers expect it.
(
    aml_risk_df
    .toPandas()
    .to_csv("/Workspace/Users/tjeerdteduits@gmail.com/KYC-assessment/data/aml_risk_df.csv")
)

In [0]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Faker instance used to generate client names.
fake = Faker()

# Fix the seed of each random source used in this notebook (numpy, stdlib random, Faker).
np.random.seed(42)
random.seed(42)
Faker.seed(42)

# --- Configuration ---
NUM_CLIENTS = 1_000
NUM_TRANSACTIONS = 15_000

# Possible client countries; countries in which van Lanschot Kempen operates
client_countries = [
    "Netherlands",
    "Belgium",
    "Germany",
    "France",
    "Switzerland",
    "Italy",
    "Spain",
    "Austria",
    "Sweden",
    "Norway",
    "Denmark",
    "Finland",
]

# --- Create clients.csv ---
def random_birthdate(min_age=18, max_age=85, today=None):
    """Return a random ISO-formatted birthdate (YYYY-MM-DD).

    The person's age in whole calendar years on the reference date is an
    integer drawn uniformly from [min_age, max_age].

    Args:
        min_age: Youngest allowed age in whole years (inclusive).
        max_age: Oldest allowed age in whole years (inclusive).
        today: Optional reference date (``date`` or ``datetime``). Defaults to
            the current date, which makes the output depend on the run date
            even when the random seed is fixed; pass a fixed value for fully
            reproducible data.

    Returns:
        The birthdate as an ISO 8601 date string.
    """
    reference = datetime.today() if today is None else today
    if isinstance(reference, datetime):
        reference = reference.date()

    age = random.randint(min_age, max_age)

    # Step back whole calendar years rather than `age * 365` days: the latter
    # ignores leap days and can yield someone a few days younger than `min_age`.
    try:
        anniversary = reference.replace(year=reference.year - age)
    except ValueError:
        # Reference date is Feb 29 and the target year is not a leap year.
        anniversary = reference.replace(year=reference.year - age, day=28)

    # Up to 364 extra days keeps the person strictly younger than age + 1.
    birthdate = anniversary - timedelta(days=random.randint(0, 364))
    return birthdate.isoformat()

# One tuple per client: (client_id, name, date_of_birth, country).
# The values are drawn in this fixed order so the seeded random streams line up.
client_data = []
for idx in range(1, NUM_CLIENTS + 1):
    client_id = f"C{idx:04d}"
    full_name = fake.name()
    date_of_birth = random_birthdate()
    home_country = random.choice(client_countries)
    client_data.append((client_id, full_name, date_of_birth, home_country))

# All four columns are non-nullable strings.
clients_schema = StructType([
    StructField(column, StringType(), False)
    for column in ("client_id", "name", "date_of_birth", "country")
])

clients_df = spark.createDataFrame(client_data, schema=clients_schema)

# --- Save CSVs ---
# Spark writes a folder of part files at the target path.
(
    clients_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("/Volumes/workspace/default/source/clients")
)

print("✅ clients.csv files generated.")

In [0]:
# Collect the clients table to pandas and write it as one CSV file in the workspace folder.
# NOTE(review): pandas' to_csv writes the row index as an unnamed first column by default —
# confirm that downstream readers expect it.
(
    clients_df
    .toPandas()
    .to_csv("/Workspace/Users/tjeerdteduits@gmail.com/KYC-assessment/data/clients.csv")
)

In [0]:
# --- Create transactions.csv ---

# Share of transactions whose counterparty is drawn from the high-risk list.
HIGH_RISK_SHARE = 0.05
# Mean of the exponential distribution the transaction amounts are drawn from.
MEAN_TRANSACTION_AMOUNT = 2000

# Assign 5% high-risk counterparties; the rest come from the client countries.
num_high_risk = int(NUM_TRANSACTIONS * HIGH_RISK_SHARE)
num_low_risk = NUM_TRANSACTIONS - num_high_risk
counterparty_countries = (
    np.random.choice(client_countries, num_low_risk).tolist() +
    np.random.choice(high_risk_countries, num_high_risk).tolist()
)
# Shuffle so the high-risk counterparties are not all at the end of the list.
random.shuffle(counterparty_countries)

client_ids = [row[0] for row in client_data]

# The option lists get their own names so the per-row values in the loop below
# do not overwrite them (previously `credit_debit` and `transaction_type` were
# rebound from list to string inside the loop).
credit_debit_options = ["debit", "credit"]
credit_debit_list = np.random.choice(credit_debit_options, NUM_TRANSACTIONS).tolist()
transaction_type_options = ["cash", "wire", "online payment", "internal transfer"]
types_list = np.random.choice(transaction_type_options, NUM_TRANSACTIONS).tolist()

transaction_rows = []

# The per-row draws keep a fixed order (client, amount, date, counterparty id)
# so the seeded `random` stream produces the same rows on every run.
for i, (counterparty_country, row_credit_debit, row_transaction_type) in enumerate(
    zip(counterparty_countries, credit_debit_list, types_list)
):
    transaction_id = f"T{str(i+1).zfill(5)}"
    client_id = random.choice(client_ids)
    transaction_amount = np.round(random.expovariate(1 / MEAN_TRANSACTION_AMOUNT), 2)
    # Day offsets 0..365 from 2024-01-01, formatted as an ISO datetime string.
    transaction_date = (datetime(2024, 1, 1) + timedelta(days=random.randint(0, 365))).isoformat()
    counterparty_id = f"CP{random.randint(10000, 99999)}"

    transaction_rows.append((
        transaction_id, client_id, transaction_amount, transaction_date,
        counterparty_id, counterparty_country, row_credit_debit, row_transaction_type
    ))

# -----------------------------
# Schema and Spark DataFrame for the generated transaction tuples
# -----------------------------
# (column name, Spark type) pairs in the same order as the tuple fields;
# every column is declared non-nullable.
transaction_fields = [
    ("transaction_id", StringType()),
    ("client_id", StringType()),
    ("transaction_amount", DoubleType()),
    ("transaction_date", StringType()),
    ("counterparty_id", StringType()),
    ("counterparty_country", StringType()),
    ("credit_debit", StringType()),
    ("transaction_type", StringType()),
]
transactions_schema = StructType([
    StructField(column, dtype, False) for column, dtype in transaction_fields
])

transactions_df = spark.createDataFrame(transaction_rows, schema=transactions_schema)

# --- Save CSVs ---
# Spark writes a folder of part files at the target path.
(
    transactions_df.write
    .format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("/Volumes/workspace/default/source/transactions")
)

print("✅ transactions.csv files generated.")

In [0]:
# Collect the transactions table to pandas and write it as one CSV file in the workspace folder.
# NOTE(review): pandas' to_csv writes the row index as an unnamed first column by default —
# confirm that downstream readers expect it.
(
    transactions_df
    .toPandas()
    .to_csv("/Workspace/Users/tjeerdteduits@gmail.com/KYC-assessment/data/transactions.csv")
)

In [0]:
%fs ls /Volumes/workspace/default/source/

In [0]:
# Read the three generated datasets back from the volume to verify the writes.
# No schema is supplied and schema inference is not enabled, so Spark reads
# every column as a string.
clients_read = spark.read.option("header", "true").csv("/Volumes/workspace/default/source/clients/")
clients_read.display()

transactions_read = spark.read.option("header", "true").csv("/Volumes/workspace/default/source/transactions/")
transactions_read.display()

# The AML risk table is saved under ".../source/aml_risk" (not "aml_risk_df"),
# so read it back from that same folder.
aml_risk_df_read = spark.read.option("header", "true").csv("/Volumes/workspace/default/source/aml_risk/")
aml_risk_df_read.display()