In [1]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

print("Starting dataset generation...")

# --- Configuration ---
# We will start with 1000 normal transactions for two users
NUM_TRANSACTIONS = 1000
USER_IDS = ['user_A', 'user_B'] # These are our two fictional users

# --- Generate Transaction Data ---
# We create a Python dictionary to hold our data columns
data = {
    # Create a list of timestamps going back over the last 30 days
    'timestamp': [datetime.now() - timedelta(days=np.random.randint(0, 30), hours=np.random.randint(0, 24)) for _ in range(NUM_TRANSACTIONS)],
    
    # Randomly assign each transaction to one of our users
    'user_id': np.random.choice(USER_IDS, NUM_TRANSACTIONS),
    
    # Generate realistic, small transaction amounts. 
    # np.random.normal creates a "bell curve" of values centered around 200.
    'amount': np.round(np.random.normal(loc=200, scale=75, size=NUM_TRANSACTIONS), 2),
    
    # For now, all these transactions are normal, so 'is_fraud' is 0
    'is_fraud': 0 
}

# --- Create and Save the DataFrame ---
# Convert the dictionary into a pandas DataFrame (a table)
transactions_df = pd.DataFrame(data)

# Make sure no transaction amounts are negative
transactions_df['amount'] = transactions_df['amount'].abs()

# Sort the transactions by time to make them look like a real statement
transactions_df = transactions_df.sort_values(by='timestamp').reset_index(drop=True)

# --- Save the dataset to a CSV file ---
# This file will be used by the next part of our project
transactions_df.to_csv('upi_transactions.csv', index=False)

# --- Display the first 5 rows to check our work ---
print("Dataset created successfully!")
print("Here are the first 5 transactions:")
print(transactions_df.head())


Starting dataset generation...
Dataset created successfully!
Here are the first 5 transactions:
                   timestamp user_id  amount  is_fraud
0 2025-06-22 13:06:56.068080  user_B  110.66         0
1 2025-06-22 14:06:56.071316  user_A   84.95         0
2 2025-06-22 14:06:56.071316  user_B  244.91         0
3 2025-06-22 15:06:56.071316  user_B  102.56         0
4 2025-06-22 16:06:56.052677  user_A  353.38         0


In [4]:
#Import the necessary libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

print("--- SCRIPT START ---")
print("Attempting to load 'upi_transactions.csv'...")

try:
    # --- Step 1: Load the dataset of normal transactions ---
    transactions_df = pd.read_csv('upi_transactions.csv')
    print("...File loaded into pandas successfully.")
    
    print("Attempting to convert 'timestamp' column to datetime format...")
    # Convert timestamp column back to datetime objects, which is crucial
    transactions_df['timestamp'] = pd.to_datetime(transactions_df['timestamp'])
    print("...'timestamp' column converted successfully.")

    # --- Step 2: Define and generate fraudulent transactions ---
    print("Generating 20 fraudulent transactions...")
    NUM_FRAUD_TRANSACTIONS = 20
    fraud_transactions = []

    for i in range(NUM_FRAUD_TRANSACTIONS):
        victim_user = np.random.choice(['user_A', 'user_B'])
        
        if i % 3 == 0:
            fraud_amount = np.round(np.random.uniform(10000, 50000), 2)
            fraud_time = datetime.now() - timedelta(days=np.random.randint(0, 5), hours=np.random.randint(9, 17))
        elif i % 3 == 1:
            fraud_amount = np.round(np.random.uniform(500, 2000), 2)
            fraud_time = datetime.now().replace(hour=3, minute=np.random.randint(0,59)) - timedelta(days=np.random.randint(0, 5))
        else:
            fraud_amount = np.round(np.random.uniform(100, 500), 2)
            fraud_time = datetime.now() - timedelta(minutes=np.random.randint(1, 15))

        fraud_transactions.append({
            'timestamp': fraud_time,
            'user_id': victim_user,
            'amount': fraud_amount,
            'is_fraud': 1
        })

    fraud_df = pd.DataFrame(fraud_transactions)
    print("...Fraudulent transactions generated.")

    # --- Step 3: Combine normal and fraudulent transactions ---
    print("Combining normal and fraudulent dataframes...")
    combined_df = pd.concat([transactions_df, fraud_df], ignore_index=True)
    print("...Dataframes combined.")
    
    print("Shuffling the dataset...")
    combined_df = combined_df.sample(frac=1).reset_index(drop=True)
    print("...Dataset shuffled.")

    # --- Step 4: Save the final, combined dataset ---
    combined_df.to_csv('upi_transactions.csv', index=False)
    print("\n--- SCRIPT FINISHED ---")
    print(f"Total transactions: {len(combined_df)}")
    print(f"Normal transactions: {len(combined_df[combined_df['is_fraud'] == 0])}")
    print(f"Fraudulent transactions: {len(combined_df[combined_df['is_fraud'] == 1])}")
    print("\n'upi_transactions.csv' has been successfully updated.")

# This part will catch ANY error and tell us what it is.
except Exception as e:
    print("\n--- AN ERROR OCCURRED ---")
    print(f"The error is: {e}")
    print(f"The type of error is: {type(e)}")

--- SCRIPT START ---
Attempting to load 'upi_transactions.csv'...
...File loaded into pandas successfully.
Attempting to convert 'timestamp' column to datetime format...
...'timestamp' column converted successfully.
Generating 20 fraudulent transactions...
...Fraudulent transactions generated.
Combining normal and fraudulent dataframes...
...Dataframes combined.
Shuffling the dataset...
...Dataset shuffled.

--- SCRIPT FINISHED ---
Total transactions: 1040
Normal transactions: 1000
Fraudulent transactions: 40

'upi_transactions.csv' has been successfully updated.
