In [None]:
!pip install faker

In [None]:
!pip install pandas

In [4]:
#importing required libraries and modules 
import pandas as pd
import random
from faker import Faker

# Single Faker instance shared by every generator function in this file.
fake = Faker()

# Country to currency and city mapping.
# The keys are the countries a generated customer can be assigned to. For each
# country, 'currency' is the currency code written to that customer's accounts
# and transactions, and 'cities' is the pool that customer and transaction
# cities are drawn from.
country_currency_mapping = {
    'United States': {'currency': 'USD', 'cities': ['San Francisco', 'Seattle', 'Boston', 'Miami', 'Dallas']},
    'India': {'currency': 'INR', 'cities': ['Kolkata', 'Pune', 'Ahmedabad', 'Jaipur', 'Lucknow']},
    'United Kingdom': {'currency': 'GBP', 'cities': ['Edinburgh', 'Leeds', 'Bristol', 'Cardiff', 'Sheffield']},
    'Japan': {'currency': 'JPY', 'cities': ['Tokyo', 'Osaka', 'Nagoya', 'Sapporo', 'Fukuoka']},
    'Germany': {'currency': 'EUR', 'cities': ['Stuttgart', 'Düsseldorf', 'Dresden', 'Leipzig', 'Nuremberg']},
    'UAE': {'currency': 'AED', 'cities': ['Dubai', 'Abu Dhabi', 'Sharjah', 'Al Ain', 'Ajman']},
    'Qatar': {'currency': 'QAR', 'cities': ['Doha', 'Al Rayyan', 'Umm Salal', 'Al Wakrah', 'Al Khor']}
}

# Function to generate an 18-character ID (the first 18 characters of a UUID4 string)
def generate_short_uuid():
    """Return the first 18 characters of a newly generated Faker UUID4 value."""
    full_uuid = fake.uuid4()
    return full_uuid[:18]

# Transaction data generation
def generate_transaction_data(num_transactions, customers_df, accounts_df):
    """Generate synthetic transaction records for randomly chosen customers.

    Args:
        num_transactions: Number of rows to generate. Zero or a negative
            value yields an empty frame that still carries every column.
        customers_df: Customers to draw from, uniformly and with
            replacement. Must provide 'customer_id' and 'country' columns.
        accounts_df: Accounts frame with a 'customer_id' column. It is only
            used to check that each drawn customer owns at least one
            account; no account field is written to the output.

    Returns:
        pandas.DataFrame with one row per transaction and the columns listed
        in ``columns`` below.

    Raises:
        ValueError: If ``customers_df`` is empty, or if a drawn customer has
            no row in ``accounts_df``.
    """
    columns = [
        'transaction_id', 'customer_id', 'transaction_date', 'amount',
        'currency', 'transaction_type', 'channel', 'merchant_name',
        'merchant_category', 'location_country', 'location_city', 'is_flagged',
    ]
    if num_transactions <= 0:
        # Keep the schema so that a CSV written from this frame has a header.
        return pd.DataFrame(columns=columns)

    channels = ['online', 'mobile', 'ATM', 'in-branch', 'telephone', 'mail']
    transaction_types = ['purchase', 'transfer', 'withdrawal', 'deposit', 'payment', 'refund']
    merchants = ['Rakuten', 'eBay', 'Zalando', 'ASOS', 'Lazada', 'Allegro',
                 'Temu', 'Target', 'BigBasket', 'Uniqlo', 'Mediamarkt', 'Saturn']
    categories = ['Electronics', 'Fashion', 'Home & Garden', 'Health & Beauty', 'Sports & Outdoors']
    # Used when a customer's country is missing from country_currency_mapping.
    fallback_country_info = {'currency': 'USD', 'cities': ['San Francisco']}

    # One draw with replacement is equivalent to sampling a single customer
    # per transaction, without paying the per-call overhead on every row.
    drawn = customers_df.sample(n=num_transactions, replace=True)

    # Built once instead of masking the whole accounts frame per transaction.
    customers_with_accounts = set(accounts_df['customer_id'])

    transactions = []
    for customer_id, country in zip(drawn['customer_id'], drawn['country']):
        if customer_id not in customers_with_accounts:
            raise ValueError(
                f"customer {customer_id!r} has no account in accounts_df"
            )
        country_info = country_currency_mapping.get(country, fallback_country_info)

        transactions.append({
            'transaction_id': generate_short_uuid(),
            'customer_id': customer_id,
            'transaction_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'amount': round(random.uniform(1.0, 10000.0), 2),
            'currency': country_info['currency'],
            'transaction_type': random.choice(transaction_types),
            'channel': random.choice(channels),
            'merchant_name': random.choice(merchants),
            'merchant_category': random.choice(categories),
            'location_country': country,
            'location_city': random.choice(country_info['cities']),
            'is_flagged': fake.boolean(chance_of_getting_true=5),  # 5% chance of being flagged
        })

    return pd.DataFrame(transactions, columns=columns)

# Customer data generation
def generate_customers(num_customers):
    """Build a DataFrame of ``num_customers`` synthetic customer profiles.

    Each profile gets a short UUID, Faker-generated personal details, a
    country picked from ``country_currency_mapping`` and a city belonging to
    that country. ``first_name`` and ``date_of_birth`` are each replaced by
    the literal string "null" with a 10% probability.
    """
    def _null_or(make_value):
        # A single uniform draw decides whether the "null" placeholder is used;
        # the real value is only generated when it is not.
        return "null" if random.random() < 0.1 else make_value()

    country_names = list(country_currency_mapping.keys())
    income_levels = ['Low', 'Medium', 'High']

    records = []
    for _ in range(num_customers):
        new_id = generate_short_uuid()
        given_name = _null_or(fake.first_name)
        birth_date = _null_or(lambda: fake.date_of_birth(minimum_age=0, maximum_age=100))

        record = {
            'customer_id': new_id,
            'first_name': given_name,
            'last_name': fake.last_name(),
            'date_of_birth': birth_date,
            'gender': random.choice(['M', 'F']),
            'email': fake.email(),
            'phone_number': fake.phone_number(),
            'address': fake.street_address(),
        }

        # The country is drawn first so the city can be taken from its list.
        home_country = random.choice(country_names)
        record['city'] = random.choice(country_currency_mapping[home_country]['cities'])
        record['country'] = home_country
        record['occupation'] = fake.job()
        record['income_bracket'] = random.choice(income_levels)
        record['customer_since'] = fake.date_between(start_date='-2y', end_date='now')

        records.append(record)

    return pd.DataFrame(records)

# Account data generation
def generate_accounts(customers_df):
    """Create between one and three synthetic accounts for every customer.

    Reads 'customer_id', 'customer_since' and 'country' from each row of
    ``customers_df``. An account's open date is drawn starting at the
    customer's 'customer_since' value, its currency comes from
    ``country_currency_mapping`` (USD when the country is not listed), and
    only 'credit card' accounts receive a non-zero credit limit.

    Returns a DataFrame with one row per account.
    """
    type_choices = ['savings', 'credit card', 'loan', 'investment', 'mortgage']
    status_choices = ['active', 'dormant', 'closed', 'suspended', 'pending']

    rows = []
    for _, owner in customers_df.iterrows():
        # Values that are the same for all of this customer's accounts.
        owner_id = owner['customer_id']
        owner_currency = country_currency_mapping.get(
            owner['country'], {'currency': 'USD'}
        )['currency']

        accounts_for_owner = random.randint(1, 3)
        for _ in range(accounts_for_owner):
            new_id = generate_short_uuid()
            kind = random.choice(type_choices)
            status = random.choice(status_choices)
            opened = fake.date_between(start_date=owner['customer_since'])
            balance = round(random.uniform(0.0, 100000.0), 2)

            if kind == 'credit card':
                limit = round(random.uniform(1000.0, 50000.0), 2)
            else:
                limit = 0.0

            rows.append({
                'account_id': new_id,
                'customer_id': owner_id,
                'account_type': kind,
                'account_status': status,
                'open_date': opened,
                'current_balance': balance,
                'currency': owner_currency,
                'credit_limit': limit,
            })

    return pd.DataFrame(rows)

# Credit data generation
def generate_credit_data(customers_df):
    """Produce one synthetic credit profile per row of ``customers_df``.

    Only the 'customer_id' column is read. The credit used is drawn between
    zero and the total credit limit generated for the same row.

    Returns a DataFrame with one row per customer.
    """
    def _credit_profile(customer_id):
        # Fields are filled in the order the random draws must happen;
        # 'total_credit_used' depends on the limit drawn just before it.
        profile = {'customer_id': customer_id}
        profile['credit_score'] = random.randint(300, 850)
        profile['number_of_credit_accounts'] = random.randint(1, 10)
        profile['total_credit_limit'] = round(random.uniform(1000.0, 50000.0), 2)
        profile['total_credit_used'] = round(
            random.uniform(0.0, profile['total_credit_limit']), 2
        )
        profile['number_of_late_payments'] = random.randint(0, 5)
        profile['bankruptcies'] = random.randint(0, 1)
        return profile

    profiles = [
        _credit_profile(row['customer_id'])
        for _, row in customers_df.iterrows()
    ]
    return pd.DataFrame(profiles)

# Watchlist data generation
def generate_watchlist_data(num_entities):
    """Generate synthetic watchlist entries.

    Args:
        num_entities: Number of entries to generate. Zero or a negative
            value yields an empty frame that still carries every column.

    Returns:
        pandas.DataFrame with the columns 'entity_id', 'entity_name',
        'entity_type', 'risk_category' and 'listed_date'.
    """
    columns = ['entity_id', 'entity_name', 'entity_type', 'risk_category', 'listed_date']
    risk_categories = ['Low', 'Medium', 'High']
    entity_types = ['Individual', 'Organization']

    watchlist = []
    for _ in range(num_entities):
        entity_id = generate_short_uuid()
        # The type is chosen before the name so the name can match it: a
        # person's name for individuals, a company name for organizations.
        entity_type = random.choice(entity_types)
        entity_name = fake.name() if entity_type == 'Individual' else fake.company()
        risk_category = random.choice(risk_categories)
        listed_date = fake.date_between(start_date='-2y', end_date='now')

        watchlist.append({
            'entity_id': entity_id,
            'entity_name': entity_name,
            'entity_type': entity_type,
            'risk_category': risk_category,
            'listed_date': listed_date
        })

    # Passing the columns keeps the schema (and the CSV header) when empty.
    return pd.DataFrame(watchlist, columns=columns)

# Function to generate all data
def generate_data(num_customers, num_transactions, num_watchlist):
    """Generate all five datasets and return them as a tuple.

    Customers are created first because accounts, transactions and credit
    data are all derived from them; the watchlist is generated on its own.

    Returns:
        (customers, accounts, transactions, credit data, watchlist), each a
        DataFrame.
    """
    customers = generate_customers(num_customers)
    accounts = generate_accounts(customers)
    transactions = generate_transaction_data(num_transactions, customers, accounts)
    credit_profiles = generate_credit_data(customers)
    watchlist = generate_watchlist_data(num_watchlist)

    datasets = (customers, accounts, transactions, credit_profiles, watchlist)
    return datasets


# Save data to CSV files
def save_data_to_csv(customers_df, accounts_df, transactions_df, credits_df, watchlists_df):
    """Write each DataFrame to its own Gbank_*.csv file, omitting the index.

    The file names are relative, so the files are created in the current
    working directory.
    """
    outputs = (
        ('Gbank_customers.csv', customers_df),
        ('Gbank_accounts.csv', accounts_df),
        ('Gbank_transactions.csv', transactions_df),
        ('Gbank_credits.csv', credits_df),
        ('Gbank_watchlists.csv', watchlists_df),
    )
    for filename, frame in outputs:
        frame.to_csv(filename, index=False)

# Script entry point: sizes of the datasets generated by this run
num_customers = 1000
num_transactions = 10000
num_watchlist = 100

# Generate the five datasets in memory; the resulting frames are kept as
# module-level names.


customers_df, accounts_df, transactions_df, credits_df, watchlists_df = generate_data(num_customers, num_transactions, num_watchlist)

# Save data to CSV files
save_data_to_csv(customers_df, accounts_df, transactions_df, credits_df, watchlists_df)


