In [1]:
!pip install pandas numpy

Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m562.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
import numpy as np

# Dataset dimensions: total transactions to draw and distinct synthetic users
n_samples = 100_000
n_users = 2000

# Seed NumPy's global RNG so every run reproduces the same draws
np.random.seed(42)

# Next cell: generate base user profiles (normal spending behavior)

In [3]:
# Build one row per synthetic user. The random draws are made in the same
# order as the columns appear (avg_amount, std_amount, home_location) so the
# seeded output is unchanged.
profile_ids = np.arange(1000, 1000 + n_users)          # ids 1000 .. 1000 + n_users - 1
typical_spend = np.random.uniform(200, 2000, n_users)   # typical spend per user
spend_spread = np.random.uniform(50, 300, n_users)      # variability in spend
home_cities = np.random.choice(
    ["Delhi", "Mumbai", "Hyderabad", "Chennai", "Bangalore", "Kolkata"],
    n_users,
)

user_profiles = pd.DataFrame(
    {
        "user_id": profile_ids,
        "avg_amount": typical_spend,
        "std_amount": spend_spread,
        "home_location": home_cities,
    }
)

In [4]:
# Display the generated profile table (the notebook shows long frames truncated).
user_profiles

Unnamed: 0,user_id,avg_amount,std_amount,home_location
0,1000,874.172214,115.426421,Mumbai
1,1001,1911.285752,111.744700,Bangalore
2,1002,1517.589095,276.563645,Bangalore
3,1003,1277.585272,112.386550,Hyderabad
4,1004,480.833553,117.987432,Chennai
...,...,...,...,...
1995,2995,1382.519281,160.526759,Bangalore
1996,2996,1921.906318,133.600295,Kolkata
1997,2997,324.124429,148.643079,Bangalore
1998,2998,302.698498,182.485147,Chennai


In [6]:
# Allocate / draw the per-transaction columns. The order of the random draws
# (user ids, devices, merchants, timestamps) is kept as-is.

# Each transaction is assigned a user sampled with replacement from the profiles.
uids = np.random.choice(user_profiles["user_id"], n_samples, replace=True)

# Filled in by the generation loop in the next cell.
amounts = np.zeros(n_samples)
locations = np.empty(n_samples, dtype=object)

# Device and merchant ids are drawn independently for every transaction.
devices = np.random.randint(5000, 7000, n_samples)
merchants = np.random.randint(200, 1000, n_samples)

# Transaction timestamps, drawn as whole epoch seconds.
# np.random.randint excludes `high`, so the upper bound is midnight of
# 2024-01-01; using 2023-12-31 here would leave the whole of Dec 31 unsampled.
start_epoch_s = pd.Timestamp("2023-01-01").value // 10**9  # .value is in ns
end_epoch_s = pd.Timestamp("2024-01-01").value // 10**9    # exclusive bound
times = pd.to_datetime(
    np.random.randint(start_epoch_s, end_epoch_s, size=n_samples),
    unit="s",
)

# Fraud labels: 0 = legitimate, set to 1 by the generation loop.
is_fraud = np.zeros(n_samples, dtype=int)

In [7]:
# Fill `amounts`, `locations`, `is_fraud` (and tweak `devices`) one transaction
# at a time, drawing each amount from the owning user's spending profile and
# turning roughly 4% of transactions into one of four fraud patterns.

# Index the profiles by user_id once. Filtering user_profiles with a boolean
# mask inside the loop rescanned every profile row for every transaction.
# setdefault keeps the first row seen for an id, matching a first-match lookup.
profile_by_user = {}
for pid, avg, std, home in zip(
    user_profiles["user_id"].to_numpy(),
    user_profiles["avg_amount"].to_numpy(),
    user_profiles["std_amount"].to_numpy(),
    user_profiles["home_location"].to_numpy(),
):
    profile_by_user.setdefault(pid, (avg, std, home))

# Loop-invariant lookup tables, hoisted out of the loop.
all_locations = ["Delhi", "Mumbai", "Hyderabad", "Chennai", "Bangalore", "Kolkata"]
fraud_types = ["slight_high_amount", "subtle_loc_change", "device_spoof", "burst"]
fraud_type_probs = [0.4, 0.3, 0.2, 0.1]  # probabilities

# Clear the labels so re-running this cell does not accumulate fraud flags
# left over from a previous run (a no-op on a fresh, all-zero array).
# NOTE(review): `devices` is also modified in place below; re-run the cell that
# creates it before re-running this one to get a clean result.
is_fraud[:] = 0

for i, uid in enumerate(uids):
    avg_amount, std_amount, home_location = profile_by_user[uid]

    # Baseline (legitimate) behaviour: normal draw around the user's typical
    # spend, floored at 10, at the user's home location.
    amount = np.random.normal(avg_amount, std_amount)
    amount = max(10, round(amount, 2))
    location = home_location

    if np.random.rand() < 0.04:  # ~4% fraud
        is_fraud[i] = 1
        fraud_type = np.random.choice(fraud_types, p=fraud_type_probs)

        if fraud_type == "slight_high_amount":
            # Amount inflated to 1.5x-3x the user's typical spend.
            amount = round(avg_amount * np.random.uniform(1.5, 3), 2)

        elif fraud_type == "subtle_loc_change":
            # Any city except the user's home, plus a moderately raised amount.
            locs = list(all_locations)
            locs.remove(home_location)
            location = np.random.choice(locs)
            amount = round(avg_amount * np.random.uniform(1.2, 2.5), 2)

        elif fraud_type == "device_spoof":
            # NOTE(review): randint(-5, 5) yields -5..4 and can be 0, in which
            # case the device id is left unchanged.
            devices[i] = devices[i] + np.random.randint(-5, 5)  # small tweak
            amount = round(avg_amount * np.random.uniform(1.3, 2.8), 2)

        elif fraud_type == "burst":
            # NOTE(review): this branch only scales the amount (2x-4x); it does
            # not generate several transactions close together in time.
            amount = round(avg_amount * np.random.uniform(2, 4), 2)

    # Add random noise to all transactions (fraudulent or not), floored at 10.
    amount = max(10, round(amount + np.random.normal(0, 50), 2))

    amounts[i] = amount
    locations[i] = location

In [8]:
# Fraud score: rows labelled fraud receive a value drawn from [0.7, 1.0),
# all other rows a value drawn from [0.0, 0.3). The two uniform draws are made
# in this order, and the result is clipped to the [0, 1] range.
fraud_component = is_fraud * np.random.uniform(0.7, 1.0, n_samples)
legit_component = (1 - is_fraud) * np.random.uniform(0.0, 0.3, n_samples)
fraud_score = np.clip(fraud_component + legit_component, 0, 1)

# Assemble the final transaction table; transaction ids run 1..n_samples and
# the score is the last column.
transaction_columns = {
    "transaction_id": np.arange(1, n_samples + 1),
    "user_id": uids,
    "transaction_amount": amounts,
    "merchant_id": merchants,
    "device_id": devices,
    "transaction_time": times,
    "location": locations,
    "is_fraud": is_fraud,
    "fraud_score": fraud_score,
}
upi_behavior_df = pd.DataFrame(transaction_columns)

In [9]:
# Write the finished dataset to a CSV file in the working directory,
# omitting the DataFrame index column.
upi_behavior_df.to_csv(path_or_buf="synthetic_upi_fraud_dataset.csv", index=False)

# Report where the file was written.
print("✅ Dataset saved as synthetic_upi_fraud_dataset.csv")


✅ Dataset saved as synthetic_upi_fraud_dataset.csv
