In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

print("Starting dataset generation...")

# --- Configuration ---
# We will start with 1000 normal transactions for two users
NUM_TRANSACTIONS = 1000
USER_IDS = ['user_A', 'user_B'] # These are our two fictional users

# --- Generate Transaction Data ---
# We create a Python dictionary to hold our data columns
data = {
    # Create a list of timestamps going back over the last 30 days
    'timestamp': [datetime.now() - timedelta(days=np.random.randint(0, 30), hours=np.random.randint(0, 24)) for _ in range(NUM_TRANSACTIONS)],
    
    # Randomly assign each transaction to one of our users
    'user_id': np.random.choice(USER_IDS, NUM_TRANSACTIONS),
    
    # Generate realistic, small transaction amounts. 
    # np.random.normal creates a "bell curve" of values centered around 200.
    'amount': np.round(np.random.normal(loc=200, scale=75, size=NUM_TRANSACTIONS), 2),
    
    # For now, all these transactions are normal, so 'is_fraud' is 0
    'is_fraud': 0 
}

# --- Create and Save the DataFrame ---
# Convert the dictionary into a pandas DataFrame (a table)
transactions_df = pd.DataFrame(data)

# Make sure no transaction amounts are negative
transactions_df['amount'] = transactions_df['amount'].abs()

# Sort the transactions by time to make them look like a real statement
transactions_df = transactions_df.sort_values(by='timestamp').reset_index(drop=True)

# --- Save the dataset to a CSV file ---
# This file will be used by the next part of our project
transactions_df.to_csv('upi_transactions.csv', index=False)

# --- Display the first 5 rows to check our work ---
print("Dataset created successfully!")
print("Here are the first 5 transactions:")
print(transactions_df.head())


Starting dataset generation...
Dataset created successfully!
Here are the first 5 transactions:
                   timestamp user_id  amount  is_fraud
0 2025-06-23 16:06:35.483604  user_B  246.37         0
1 2025-06-23 16:06:35.484851  user_B  179.48         0
2 2025-06-23 16:06:35.488855  user_A   82.32         0
3 2025-06-23 17:06:35.483604  user_A  203.28         0
4 2025-06-23 17:06:35.488855  user_B  159.04         0


In [2]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

print("Loading existing dataset...")
# --- Step 1: Load the dataset of normal transactions ---
try:
    transactions_df = pd.read_csv('upi_transactions.csv')
    transactions_df['timestamp'] = pd.to_datetime(transactions_df['timestamp'])
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("Error: 'upi_transactions.csv' not found. Please run the first code cell to create it.")

# --- Step 2: Define and generate fraudulent transactions ---
print("Generating fraudulent transactions with advanced scenarios...")
NUM_FRAUD_GROUPS = 15 # We will create 15 groups of fraud attacks
fraud_transactions = []
USER_IDS = ['user_A', 'user_B']

for i in range(NUM_FRAUD_GROUPS):
    victim_user = np.random.choice(USER_IDS)
    
    # Randomly choose a fraud scenario for each group
    scenario = i % 3
    
    if scenario == 0:
        # Scenario 1: Unusually high amount (single transaction)
        fraud_amount = np.round(np.random.uniform(10000, 50000), 2)
        fraud_time = datetime.now() - timedelta(days=np.random.randint(0, 5))
        fraud_transactions.append({
            'timestamp': fraud_time, 'user_id': victim_user,
            'amount': fraud_amount, 'is_fraud': 1
        })

    elif scenario == 1:
        # Scenario 2: Transaction at a very odd hour (single transaction)
        fraud_amount = np.round(np.random.uniform(500, 2000), 2)
        fraud_time = datetime.now().replace(hour=3, minute=np.random.randint(0,59)) - timedelta(days=np.random.randint(0, 5))
        fraud_transactions.append({
            'timestamp': fraud_time, 'user_id': victim_user,
            'amount': fraud_amount, 'is_fraud': 1
        })

    else:
        # Scenario 3: NEW "Rapid-Fire Fraud" (multiple small transactions)
        # A burst of 3 to 5 small transactions within a 2-minute window
        num_rapid_txns = np.random.randint(3, 6)
        base_time = datetime.now() - timedelta(days=np.random.randint(0, 5))
        for j in range(num_rapid_txns):
            fraud_amount = np.round(np.random.uniform(100, 500), 2)
            # Each transaction is a few seconds after the previous one
            fraud_time = base_time + timedelta(seconds=np.random.randint(10, 120))
            fraud_transactions.append({
                'timestamp': fraud_time, 'user_id': victim_user,
                'amount': fraud_amount, 'is_fraud': 1
            })


# Convert the list of fraud transactions into a DataFrame
fraud_df = pd.DataFrame(fraud_transactions)

# --- Step 3: Combine normal and fraudulent transactions ---
print("Combining normal and fraudulent data...")
combined_df = pd.concat([transactions_df, fraud_df], ignore_index=True)
combined_df = combined_df.sample(frac=1).reset_index(drop=True)

# --- Step 4: Save the final, combined dataset ---
combined_df.to_csv('upi_transactions.csv', index=False)

print("\n--- Final Dataset Summary ---")
print(f"Total transactions: {len(combined_df)}")
print(f"Normal transactions: {len(combined_df[combined_df['is_fraud'] == 0])}")
print(f"Fraudulent transactions: {len(combined_df[combined_df['is_fraud'] == 1])}")
print("\n'upi_transactions.csv' has been updated with new, advanced fraud data.")


Loading existing dataset...
Dataset loaded successfully.
Generating fraudulent transactions with advanced scenarios...
Combining normal and fraudulent data...

--- Final Dataset Summary ---
Total transactions: 1030
Normal transactions: 1000
Fraudulent transactions: 30

'upi_transactions.csv' has been updated with new, advanced fraud data.
