In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def generate_credit_card_data(num_transactions=10000):
    np.random.seed(42)

    # Generate transaction IDs
    transaction_ids = np.arange(1, num_transactions + 1)

    # Generate transaction dates (last 365 days)
    dates = [datetime.now() - timedelta(days=np.random.randint(0, 365)) for _ in range(num_transactions)]

    # Generate amounts (positive values only)
    amounts = np.random.normal(loc=50, scale=30, size=num_transactions)
    amounts[amounts < 1] = 1  # ensure positive amounts

    # Card types and merchants
    card_types = np.random.choice(['Visa', 'Mastercard', 'Amex'], size=num_transactions, p=[0.5, 0.3, 0.2])
    merchants = np.random.choice(
        ['Amazon', 'Walmart', 'Starbucks', 'Target', 'BestBuy', 'Gas Station', 'Restaurant'],
        size=num_transactions
    )
    
    # Fraud labels (2% fraud)
    is_fraud = np.zeros(num_transactions, dtype=int)
    fraud_indices = np.random.choice(num_transactions, size=int(num_transactions * 0.02), replace=False)
    is_fraud[fraud_indices] = 1
    amounts[fraud_indices] = np.random.normal(loc=200, scale=100, size=len(fraud_indices))

    # Customer demographic data
    num_customers = int(num_transactions / 10)  # assume each customer has ~10 transactions
    customer_ids = np.arange(1, num_customers + 1)
    ages = np.random.randint(20, 70, size=num_customers)
    genders = np.random.choice(['Male', 'Female'], size=num_customers)
    incomes = np.random.normal(loc=60000, scale=20000, size=num_customers)

    customer_data = pd.DataFrame({
        'customer_id': customer_ids,
        'age': ages,
        'gender': genders,
        'income': incomes
    })

    # Assign customers to transactions
    transaction_customer_ids = np.random.choice(customer_ids, size=num_transactions)

    transaction_data = pd.DataFrame({
        'transaction_id': transaction_ids,
        'customer_id': transaction_customer_ids,
        'date': dates,
        'amount': amounts,
        'card_type': card_types,
        'merchant': merchants,
        'is_fraud': is_fraud
    })

    # Merge transactions with customer data
    df = pd.merge(transaction_data, customer_data, on='customer_id', how='left')
    
    return df


if __name__ == '__main__':
    df = generate_credit_card_data()
    
    # Save dataset (Windows path)
    save_path = "C:\\Users\\ajroy\\OneDrive\\Desktop\\credit\\credit_card_transactions.csv"
    df.to_csv(save_path, index=False)

    print(f"✅ Credit card transaction data generated and saved to {save_path}")
    print("Sample Data:")
    print(df.head())


✅ Credit card transaction data generated and saved to C:\Users\ajroy\OneDrive\Desktop\credit\credit_card_transactions.csv
Sample Data:
   transaction_id  customer_id                       date      amount  \
0               1           36 2025-06-23 23:34:47.380983   39.042771   
1               2          422 2024-10-20 23:34:47.381490    1.000000   
2               3          478 2025-01-06 23:34:47.381505  111.076206   
3               4          502 2025-06-19 23:34:47.381512   64.069342   
4               5          338 2025-07-24 23:34:47.381518   35.241293   

  card_type     merchant  is_fraud  age  gender        income  
0      Visa  Gas Station         0   63    Male  76843.742364  
1      Visa       Amazon         0   65    Male  49701.202459  
2      Visa   Restaurant         0   39  Female  66393.261077  
3      Visa       Target         0   65  Female  61575.546887  
4      Visa       Amazon         0   32  Female  80177.918460  
