To generate the requested data, we need to follow these steps:

1. Define the structure: We will use the provided structure to replicate and vary the data.
2. Create 20 unique clients and 10 banks: This will ensure diversity in the data.
3. Generate data for 100,000 rows: Ensure the data has 4 clusters for clients.
4. Vary the data to make it realistic: By changing dates, EOD, Total_Outflow, Total_Inflow, Total_Outflow_Volume, and Total_Inflow_Volume.
Let's start by creating the data structure in Python and generating the synthetic data. The clusters will be created by assigning clients to clusters based on some variations in their data.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Client identifiers are the integers 0-19; bank labels run "BANK 1" .. "BANK 10"
clients = list(range(20))
banks = [f"BANK {number}" for number in range(1, 11)]

# Every business day (freq="B") from 2022-07-01 through 2022-12-31
date_range = pd.date_range("2022-07-01", "2022-12-31", freq="B")

# Helper function to generate random data for clusters
def generate_cluster_data(cluster_id, num_rows):
    """Build ``num_rows`` synthetic balance rows for one client cluster.

    Reads the module-level ``clients``, ``banks`` and ``date_range``. Every row
    of a cluster carries the same client (``clients[cluster_id]``) and the same
    parent company name; all other fields are drawn from NumPy's global random
    state, so call ``np.random.seed`` beforehand if reproducible output is
    needed.

    NOTE(review): only ``clients[cluster_id]`` is ever emitted, so clients whose
    index is never passed as ``cluster_id`` do not appear in the output —
    confirm this is intended.

    Args:
        cluster_id: Index into ``clients`` selecting the cluster's client.
        num_rows: Number of rows to generate; zero or a negative value
            yields an empty list.

    Returns:
        A list of 13-element lists in the order: bal_date (``MM/DD/YY``
        string), cust_id, LOB_data, cust_ipid_nm, ult_parent_cust_ipid_no,
        ult_parent_cust_nm, client, subclient, EOD, Total_Outflow,
        Total_Inflow, Total_Outflow_Volume, Total_Inflow_Volume.

    Raises:
        IndexError: If ``num_rows`` is positive and ``cluster_id`` is not a
            valid index into ``clients``.
    """
    if num_rows <= 0:
        return []

    # Fields that are constant for the whole cluster.
    lob_data = "ASSET SERVICING INTERNAL"
    cust_ipid_nm = clients[cluster_id]
    ult_parent_cust_nm = f"Company of {cust_ipid_nm}"

    # Draw each column with a single vectorised call rather than once per row.
    day_positions = np.random.randint(0, len(date_range), size=num_rows)
    bal_dates = date_range[day_positions].strftime("%m/%d/%y").tolist()
    cust_ids = np.random.randint(100000000, 999999999, size=num_rows).tolist()
    parent_ids = np.random.randint(100000000, 999999999, size=num_rows).tolist()
    bank_names = np.random.choice(banks, size=num_rows).tolist()
    subclients = np.random.choice(
        ["COMMERCIAL BANK", "INVESTMENT BANK"], size=num_rows
    ).tolist()

    # dtype=np.int64: the outflow bounds exceed 2**31 - 1, and randint's default
    # C-long dtype is 32-bit on Windows, where the call would raise ValueError.
    eod = np.random.randint(500000000, 1000000000, size=num_rows, dtype=np.int64)
    total_outflow = np.random.randint(
        5000000000, 10000000000, size=num_rows, dtype=np.int64
    )
    # Inflow tracks outflow within +/- 100 million.
    total_inflow = total_outflow + np.random.randint(
        -100000000, 100000000, size=num_rows, dtype=np.int64
    )
    outflow_volume = np.random.randint(50, 150, size=num_rows).tolist()
    inflow_volume = np.random.randint(50, 150, size=num_rows).tolist()

    return [
        [
            bal_date, cust_id, lob_data, cust_ipid_nm, parent_id,
            ult_parent_cust_nm, bank, subclient, eod_value, outflow, inflow,
            out_vol, in_vol,
        ]
        for (
            bal_date, cust_id, parent_id, bank, subclient, eod_value, outflow,
            inflow, out_vol, in_vol,
        ) in zip(
            bal_dates, cust_ids, parent_ids, bank_names, subclients,
            eod.tolist(), total_outflow.tolist(), total_inflow.tolist(),
            outflow_volume, inflow_volume,
        )
    ]

# Accumulate the generated rows (plain Python lists) from every cluster
data = []

# Generate data for 4 clusters, splitting the 100,000 rows evenly between them
num_clusters = 4
rows_per_cluster = 100000 // num_clusters

for cluster_id in range(num_clusters):
    cluster_data = generate_cluster_data(cluster_id, rows_per_cluster)
    data.extend(cluster_data)

# Convert to DataFrame; the column order must match the row layout produced by
# generate_cluster_data
columns = ["bal_date", "cust_id", "LOB_data", "cust_ipid_nm", "ult_parent_cust_ipid_no", "ult_parent_cust_nm",
           "client", "subclient", "EOD", "Total_Outflow", "Total_Inflow", "Total_Outflow_Volume", "Total_Inflow_Volume"]

df = pd.DataFrame(data, columns=columns)

# Shuffle the DataFrame rows so the clusters are interleaved
df = df.sample(frac=1).reset_index(drop=True)

# Save to CSV. The target directory is created first because to_csv raises
# OSError when asked to write into a directory that does not exist.
import os

output_path = './datasets/odm_data_example/synthetic_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)


In [1]:
## With clear cluster separation

In [3]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Client identifiers are the integers 0-19; bank labels run "BANK 1" .. "BANK 10"
clients = list(range(20))
banks = [f"BANK {number}" for number in range(1, 11)]

# Every business day (freq="B") from 2022-07-01 through 2022-12-31
date_range = pd.date_range("2022-07-01", "2022-12-31", freq="B")

# Per-cluster (low, high) bounds handed to np.random.randint, whose upper bound
# is exclusive — so the EOD, Total_Outflow and Volume bands of neighbouring
# clusters do not overlap. "Total_Inflow_Diff" is the offset added to the drawn
# Total_Outflow to obtain Total_Inflow.
cluster_ranges = {
    0: {
        "EOD": (500_000_000, 600_000_000),
        "Total_Outflow": (5_000_000_000, 6_000_000_000),
        "Total_Inflow_Diff": (-100_000_000, 100_000_000),
        "Volume": (50, 60),
    },
    1: {
        "EOD": (600_000_000, 700_000_000),
        "Total_Outflow": (6_000_000_000, 7_000_000_000),
        "Total_Inflow_Diff": (-200_000_000, 200_000_000),
        "Volume": (60, 70),
    },
    2: {
        "EOD": (700_000_000, 800_000_000),
        "Total_Outflow": (7_000_000_000, 8_000_000_000),
        "Total_Inflow_Diff": (-300_000_000, 300_000_000),
        "Volume": (70, 80),
    },
    3: {
        "EOD": (800_000_000, 900_000_000),
        "Total_Outflow": (8_000_000_000, 9_000_000_000),
        "Total_Inflow_Diff": (-400_000_000, 400_000_000),
        "Volume": (80, 90),
    },
}

# Helper function to generate random data for clusters
def generate_cluster_data(cluster_id, num_rows):
    """Build ``num_rows`` synthetic balance rows for one client cluster.

    Reads the module-level ``clients``, ``banks``, ``date_range`` and
    ``cluster_ranges``. Every row of a cluster carries the same client
    (``clients[cluster_id]``) and parent company name, and its EOD, outflow,
    inflow offset and volume values are drawn from the half-open
    ``(low, high)`` bounds in ``cluster_ranges[cluster_id]``. Values come from
    NumPy's global random state, so call ``np.random.seed`` beforehand if
    reproducible output is needed.

    NOTE(review): only ``clients[cluster_id]`` is ever emitted, so clients whose
    index is never passed as ``cluster_id`` do not appear in the output —
    confirm this is intended.

    Args:
        cluster_id: Key into ``cluster_ranges`` and index into ``clients``.
        num_rows: Number of rows to generate; zero or a negative value
            yields an empty list.

    Returns:
        A list of 13-element lists in the order: bal_date (``MM/DD/YY``
        string), cust_id, LOB_data, cust_ipid_nm, ult_parent_cust_ipid_no,
        ult_parent_cust_nm, client, subclient, EOD, Total_Outflow,
        Total_Inflow, Total_Outflow_Volume, Total_Inflow_Volume.

    Raises:
        KeyError: If ``cluster_id`` has no entry in ``cluster_ranges``.
        IndexError: If ``num_rows`` is positive and ``cluster_id`` is not a
            valid index into ``clients``.
    """
    ranges = cluster_ranges[cluster_id]
    if num_rows <= 0:
        return []

    # Fields that are constant for the whole cluster.
    lob_data = "ASSET SERVICING INTERNAL"
    cust_ipid_nm = clients[cluster_id]
    ult_parent_cust_nm = f"Company of {cust_ipid_nm}"

    # Draw each column with a single vectorised call rather than once per row.
    day_positions = np.random.randint(0, len(date_range), size=num_rows)
    bal_dates = date_range[day_positions].strftime("%m/%d/%y").tolist()
    cust_ids = np.random.randint(100000000, 999999999, size=num_rows).tolist()
    parent_ids = np.random.randint(100000000, 999999999, size=num_rows).tolist()
    bank_names = np.random.choice(banks, size=num_rows).tolist()
    subclients = np.random.choice(
        ["COMMERCIAL BANK", "INVESTMENT BANK"], size=num_rows
    ).tolist()

    # dtype=np.int64: the outflow bounds exceed 2**31 - 1, and randint's default
    # C-long dtype is 32-bit on Windows, where the call would raise ValueError.
    eod = np.random.randint(*ranges["EOD"], size=num_rows, dtype=np.int64)
    total_outflow = np.random.randint(
        *ranges["Total_Outflow"], size=num_rows, dtype=np.int64
    )
    # Inflow is the outflow shifted by a cluster-specific random offset.
    total_inflow = total_outflow + np.random.randint(
        *ranges["Total_Inflow_Diff"], size=num_rows, dtype=np.int64
    )
    outflow_volume = np.random.randint(*ranges["Volume"], size=num_rows).tolist()
    inflow_volume = np.random.randint(*ranges["Volume"], size=num_rows).tolist()

    return [
        [
            bal_date, cust_id, lob_data, cust_ipid_nm, parent_id,
            ult_parent_cust_nm, bank, subclient, eod_value, outflow, inflow,
            out_vol, in_vol,
        ]
        for (
            bal_date, cust_id, parent_id, bank, subclient, eod_value, outflow,
            inflow, out_vol, in_vol,
        ) in zip(
            bal_dates, cust_ids, parent_ids, bank_names, subclients,
            eod.tolist(), total_outflow.tolist(), total_inflow.tolist(),
            outflow_volume, inflow_volume,
        )
    ]

# Accumulate the generated rows (plain Python lists) from every cluster
data = []

# Generate data for 4 clusters, splitting the 100,000 rows evenly between them
num_clusters = 4
rows_per_cluster = 100000 // num_clusters

for cluster_id in range(num_clusters):
    cluster_data = generate_cluster_data(cluster_id, rows_per_cluster)
    data.extend(cluster_data)

# Convert to DataFrame; the column order must match the row layout produced by
# generate_cluster_data
columns = ["bal_date", "cust_id", "LOB_data", "cust_ipid_nm", "ult_parent_cust_ipid_no", "ult_parent_cust_nm",
           "client", "subclient", "EOD", "Total_Outflow", "Total_Inflow", "Total_Outflow_Volume", "Total_Inflow_Volume"]

df = pd.DataFrame(data, columns=columns)

# Shuffle the DataFrame rows so the clusters are interleaved
df = df.sample(frac=1).reset_index(drop=True)

# Save to CSV. The target directory is created first because to_csv raises
# OSError when asked to write into a directory that does not exist.
# NOTE: this is the same path the earlier cell writes to, so that file is replaced.
import os

output_path = './datasets/odm_data_example/synthetic_data.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
