In [171]:
import random
from datetime import date
from pathlib import Path

import pandas as pd
from faker import Faker

In [172]:
# Seed both random sources up front so every fresh run regenerates the same data.
fake = Faker()
random.seed(42)  # stdlib `random` module, called directly by the generation cells
Faker.seed(42)   # class-level seed for the values produced through `fake`

In [173]:
# Table 1 : MCC Codes
# Reference table: one row per merchant category code (MCC) with its display
# name and risk tier ('Low', 'Moderate', 'High' or 'Prohibited').

mcc_data = [
    {'mcc_code': 5812, 'merchant_category_name': 'Restaurants', 'risk_category': 'Low'},
    {'mcc_code': 5411, 'merchant_category_name': 'Grocery Stores', 'risk_category': 'Low'},
    {'mcc_code': 5651, 'merchant_category_name': 'Clothing Stores', 'risk_category': 'Low'},
    {'mcc_code': 5734, 'merchant_category_name': 'Computer Software', 'risk_category': 'Moderate'},
    {'mcc_code': 7994, 'merchant_category_name': 'Video Game Arcades/Establishments', 'risk_category': 'Moderate'},
    {'mcc_code': 5532, 'merchant_category_name': 'Automotive Tire Stores', 'risk_category': 'Moderate'},
    {'mcc_code': 5968, 'merchant_category_name': 'Subscription Services', 'risk_category': 'High'},
    # The dash below is a real em dash; it previously appeared as the mojibake sequence
    # produced when UTF-8 bytes are decoded as cp1252.
    {'mcc_code': 5921, 'merchant_category_name': 'Package Stores — Beer, Wine, and Liquor', 'risk_category': 'High'},
    {'mcc_code': 4511, 'merchant_category_name': 'Airlines and Air Carriers (not elsewhere classified)', 'risk_category': 'High'},
    {'mcc_code': 7995, 'merchant_category_name': 'Gambling/Betting', 'risk_category': 'Prohibited'},
    {'mcc_code': 7273, 'merchant_category_name': 'Dating and Escort Services', 'risk_category': 'Prohibited'}
]

df_mcc = pd.DataFrame(mcc_data)
# mcc_code is the key the merchants table points at, so it must not repeat.
assert df_mcc['mcc_code'].is_unique, "duplicate mcc_code in mcc_data"

file_path = '../fake_data/mcc_codes.csv'
# to_csv raises OSError when the target directory is missing (e.g. fresh checkout).
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df_mcc.to_csv(file_path, index=False)

In [174]:
# Table 2: Merchant's Information
# Generates NUM_MERCHANTS synthetic merchants, each assigned a random MCC from table 1.

NUM_MERCHANTS = 2000  # Number of merchant rows to generate

merchant_list = []
mcc_code_list = [x['mcc_code'] for x in mcc_data] # Get a list of mcc codes

for _ in range(NUM_MERCHANTS):
    row = {
        # Primary Key: `fake.unique` redraws on a repeat, so ids can never collide.
        'merchant_id': fake.unique.bothify(text='???????########'),
        'merchant_legal_name': fake.company(),
        'merchant_dba_name': fake.last_name() + " " + fake.company_suffix(),
        'state': fake.state_abbr(),
        'tax_id': fake.ein(),
        'mcc_code': random.choice(mcc_code_list), # Foreign Key (Links to table 1)
        'website_url': fake.url()
    }
    merchant_list.append(row)

df_merchants = pd.DataFrame(merchant_list)
# merchant_id is referenced by the applications table; fail loudly if it ever repeats.
assert df_merchants['merchant_id'].is_unique, "duplicate merchant_id generated"

file_path = '../fake_data/merchants.csv'
# to_csv raises OSError when the target directory is missing (e.g. fresh checkout).
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df_merchants.to_csv(file_path, index=False)

In [175]:
# Table 3: Applications
# Collects one application record (dict) per merchant; the loop further down
# appends to it and the list is converted to a DataFrame afterwards.
application_list = []

# Owner Credit Score
def generate_credit_score():
    """Draw a synthetic owner credit score.

    A band is picked first ('normal' with weight 90, 'low' with weight 10),
    then an integer is drawn from that band's inclusive range:
    'normal' -> 650..850, 'low' -> 400..649.

    Returns:
        int: a score between 400 and 850 inclusive.
    """
    score_bounds = {'normal': (650, 850), 'low': (400, 649)}
    band = random.choices(['normal', 'low'], weights=[90, 10])[0]
    lowest, highest = score_bounds[band]
    return random.randint(lowest, highest)

# Build one application per merchant, validate the key, write the CSV and preview it.

# MCC risk tiers (matching table 1); hoisted out of the loop because they never change.
LOW_RISK_MCC_CODES = frozenset({5812, 5411, 5651})
MODERATE_RISK_MCC_CODES = frozenset({5734, 7994, 5532})

for merchant in merchant_list:
    # Monthly Volume and Requested Limit
    monthly_volume = random.choice([10000, 25000, 50000, 100000])
    # Requested limit is 120% of monthly volume (i.e. 20% headroom on top of it).
    # Integer arithmetic gives the same values without truncating a float product.
    requested_limit = monthly_volume * 120 // 100

    # Average Ticket Size: higher-risk categories get larger tickets
    mcc_code = merchant['mcc_code']
    if mcc_code in LOW_RISK_MCC_CODES: # Low Risk Merchant Categories
        average_ticket_size = random.uniform(10, 80)
    elif mcc_code in MODERATE_RISK_MCC_CODES: # Moderate Risk Merchant Categories
        average_ticket_size = random.uniform(100, 1000)
    else: # High Risk and Prohibited Merchant Categories
        average_ticket_size = random.uniform(1000, 10000)

    # Years in Business and Previous Chargeback Ratio
    years_in_business = random.randint(0, 10)
    if years_in_business == 0: # No operating history, so no chargeback history either
        previous_chargeback_ratio = 0
    else:
        # Both candidates are always drawn, in this order, before one is selected;
        # keeping that draw order keeps the seeded random stream unchanged.
        typical_ratio = random.uniform(0.1, 0.8)
        elevated_ratio = random.uniform(0.9, 1.5)
        # 90% chance of the 0.1-0.8 value, 10% chance of the 0.9-1.5 value
        previous_chargeback_ratio = random.choices([typical_ratio, elevated_ratio], weights=[90, 10])[0]

    row = {
        # Primary Key: 'APP-######' has only 1,000,000 possible values, so plain
        # bothify is likely to repeat across thousands of rows; `fake.unique`
        # redraws on a repeat.
        'application_id': fake.unique.bothify(text='APP-######'),
        'merchant_id': merchant['merchant_id'], # Foreign Key (Links to table 2)
        'application_date': fake.date_between(start_date=date(2025, 1, 1), end_date=date(2026, 1, 1)),
        'monthly_volume': monthly_volume,
        'requested_limit': requested_limit,
        'average_ticket_size': round(average_ticket_size, 2),
        'owner_credit_score': generate_credit_score(),
        'years_in_business': years_in_business,
        'previous_chargeback_ratio': round(previous_chargeback_ratio, 2),
        'kyc_status': random.choices(['Passed', 'Failed'], weights=[95, 5])[0], # 95% chance of passing, 5% chance of failing
        'kyb_status': random.choices(['Passed', 'Failed'], weights=[98, 2])[0], # 98% chance of passing, 2% chance of failing
        # 2% of applications are flagged True.
        # NOTE(review): "tmf" presumably means Terminated Merchant File - confirm.
        'tmf_list': random.choices([True, False], weights=[2, 98])[0]
    }
    application_list.append(row)

df_applications = pd.DataFrame(application_list)
# application_id is the table's primary key; fail loudly if it ever repeats.
assert df_applications['application_id'].is_unique, "duplicate application_id generated"

file_path = '../fake_data/applications.csv'
# to_csv raises OSError when the target directory is missing (e.g. fresh checkout).
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
df_applications.to_csv(file_path, index=False)
df_applications.head(10)

Unnamed: 0,application_id,merchant_id,application_date,monthly_volume,requested_limit,average_ticket_size,owner_credit_score,years_in_business,previous_chargeback_ratio,kyc_status,kyb_status,tmf_list
0,APP-519788,fLBcbfn10433218,2025-02-05,100000,120000,9827.37,767,7,0.54,Passed,Passed,False
1,APP-495710,ygwwMqZ65423511,2025-10-20,10000,12000,32.09,842,4,0.46,Passed,Passed,False
2,APP-259900,wnQrSRP31647525,2025-09-14,10000,12000,70.61,762,5,0.29,Passed,Passed,False
3,APP-284224,PFzPDjq56413953,2025-10-07,100000,120000,722.86,670,5,0.75,Passed,Failed,False
4,APP-701831,eyyMDHq71012269,2025-03-29,50000,60000,389.09,692,1,0.33,Passed,Passed,False
5,APP-743403,jxWkIXH82814893,2025-12-27,50000,60000,653.37,813,6,1.49,Passed,Passed,False
6,APP-057925,MBnIWUS82278248,2025-09-16,25000,30000,76.7,710,2,0.69,Passed,Passed,False
7,APP-173335,oecveGp98393010,2025-07-10,10000,12000,52.36,685,10,0.41,Passed,Passed,False
8,APP-368220,UvZgpmm56670106,2025-01-30,25000,30000,9644.68,777,0,0.0,Passed,Passed,False
9,APP-529796,ayqYYDs67736026,2025-07-21,50000,60000,7336.34,681,3,1.07,Passed,Passed,False
