In [1]:
!pip install pandas numpy

Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Collecting numpy
  Using cached numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m562.5 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached numpy-2.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.6 MB)
Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG: every np.random.* draw in the cells below
# comes from this stream, so a fresh top-to-bottom run repeats the same data.
SEED = 42
np.random.seed(SEED)

In [None]:
# Generator settings: adjust these to resize or rebalance the synthetic dataset.
n_users = 5000            # number of user profiles to create
min_txns = 10             # fewest transactions generated per user
max_txns = 20             # most transactions generated per user (inclusive)
fraud_user_ratio = 0.2    # share of users selected for a forced fraud event
all_locations = [
    "Delhi",
    "Mumbai",
    "Hyderabad",
    "Chennai",
    "Bangalore",
    "Kolkata",
]

In [4]:
# One row of baseline behaviour per synthetic user: ids start at 1000, followed
# by spending mean/std, a home city, and a 0/1 traveler flag (20% are 1).
# Keyword arguments are evaluated left to right, so the random draws are made
# in the same order as the columns are listed.
user_profiles = pd.DataFrame({"user_id": np.arange(1000, 1000 + n_users)}).assign(
    avg_amount=np.random.uniform(300, 2000, n_users),
    std_amount=np.random.uniform(50, 300, n_users),
    home_location=np.random.choice(all_locations, n_users),
    is_traveler=np.random.choice([0, 1], n_users, p=[0.8, 0.2]),
)

In [5]:
# Accumulators for the generation loop: one dict per generated transaction,
# and the id to assign to the next transaction (ids start at 1).
transactions, txn_id = [], 1

In [6]:
# Pick, without replacement, the user ids that will receive a forced fraud
# event; the count is the configured ratio of all users, truncated to an int.
n_fraud_users = int(fraud_user_ratio * n_users)
fraud_users = set(
    np.random.choice(user_profiles["user_id"], size=n_fraud_users, replace=False)
)

In [7]:
# Generate every user's transaction history and inject labelled fraud events.
#
# The accumulators are (re)initialised here so that re-running this cell
# rebuilds the list from scratch instead of appending a second copy of every
# transaction with ever-growing ids.
transactions = []
txn_id = 1

# Every user's timeline starts at 2023-01-01, expressed in whole epoch seconds.
start_ts = pd.Timestamp("2023-01-01").value // 10**9

# itertuples keeps each column's own dtype and avoids building a Series per row.
for user in user_profiles.itertuples(index=False):
    n_txns = np.random.randint(min_txns, max_txns + 1)
    last_ts = start_ts

    # Set once any fraud (forced or random) has been emitted for this user, so
    # the forced injection for members of fraud_users fires at most once.
    fraud_injected = False

    for i in range(n_txns):
        # Base amount: normal draw around the user's mean, floored at 10.
        amount = np.random.normal(user.avg_amount, user.std_amount)
        amount = max(10, round(amount, 2))

        # Location: on 30% of a traveler's transactions a city is drawn at
        # random from all locations (which may still be the home city).
        if user.is_traveler and np.random.rand() < 0.3:
            location = np.random.choice(all_locations)
        else:
            location = user.home_location

        # Device / Merchant
        device_id = np.random.randint(5000, 7000)
        merchant_id = np.random.randint(200, 1000)

        # Time: advance from the previous transaction by a random gap.
        prev_ts = last_ts
        gap = np.random.randint(60, 60 * 60 * 24)  # 1 min – 1 day
        ts = prev_ts + gap

        fraud = 0

        # Fraud injection if:
        # - the user is in fraud_users, has had no fraud yet, and this is at
        #   least their 4th transaction (i > 2)
        # - OR randomly with a 4% chance on any transaction
        if (user.user_id in fraud_users and not fraud_injected and i > 2) or (np.random.rand() < 0.04):
            fraud = 1
            fraud_injected = True
            fraud_type = np.random.choice(["high_amount", "burst", "loc_device_change"])
            if fraud_type == "high_amount":
                amount = round(user.avg_amount * np.random.uniform(8, 20), 2)
            elif fraud_type == "burst":
                amount = round(user.avg_amount * np.random.uniform(5, 15), 2)
                # A burst must land within 2 minutes of the PREVIOUS
                # transaction, so offset from prev_ts and discard the regular
                # gap. (For a user's first transaction prev_ts is start_ts.)
                ts = prev_ts + np.random.randint(1, 120)
            elif fraud_type == "loc_device_change":
                # Force a city other than home and a device id from the
                # 7000-7999 range, which regular transactions never use.
                away_locations = [loc for loc in all_locations if loc != user.home_location]
                location = np.random.choice(away_locations)
                device_id = np.random.randint(7000, 8000)
                amount = round(user.avg_amount * np.random.uniform(5, 15), 2)

        # Commit the final timestamp only after any fraud adjustment.
        last_ts = ts
        txn_time = pd.to_datetime(ts, unit="s")

        # Append transaction
        transactions.append({
            "transaction_id": txn_id,
            "user_id": user.user_id,
            "transaction_amount": amount,
            "merchant_id": merchant_id,
            "device_id": device_id,
            "transaction_time": txn_time,
            "location": location,
            "is_fraud": fraud
        })
        txn_id += 1

In [8]:
# Materialise the accumulated transaction dicts as a DataFrame in one step;
# the dict keys become the columns.
upi_behavior_df = pd.DataFrame(data=transactions)

In [9]:
# Synthetic model score: fraud rows draw from [0.7, 1.0), all other rows from
# [0.0, 0.3). The two ranges do not overlap, so the score separates the labels
# perfectly; the final clip to [0, 1] is only a safety bound.
n_rows = len(upi_behavior_df)
fraud_flag = upi_behavior_df["is_fraud"]
fraud_part = fraud_flag * np.random.uniform(0.7, 1.0, n_rows)
legit_part = (1 - fraud_flag) * np.random.uniform(0.0, 0.3, n_rows)
fraud_score = np.clip(fraud_part + legit_part, 0, 1)

In [10]:
# Add the score as a "fraud_score" column (re-running simply overwrites it).
upi_behavior_df = upi_behavior_df.assign(fraud_score=fraud_score)

In [13]:
# Report (rows, columns) of the generated dataset.
print(f" Dataset shape: {upi_behavior_df.shape}")

 Dataset shape: (29842, 9)


In [12]:
# is_fraud is 0/1, so its per-user maximum is 1 exactly when the user has at
# least one fraud row; summing those maxima counts such users.
user_has_fraud = upi_behavior_df.groupby("user_id")["is_fraud"].max()
print("Users with at least 1 fraud:", user_has_fraud.sum())

Users with at least 1 fraud: 1097


In [14]:
# Transactions per user, shown for the first 20 user ids.
txns_per_user = upi_behavior_df.groupby("user_id").size()
print(txns_per_user.head(20))

user_id
1000    18
1001    20
1002    16
1003    12
1004    15
1005    13
1006    18
1007    20
1008    20
1009    10
1010    14
1011    17
1012    15
1013    13
1014    18
1015    13
1016    19
1017    16
1018    17
1019    13
dtype: int64


In [15]:
# Preview the first 15 generated transactions.
print(upi_behavior_df.iloc[:15])

    transaction_id  user_id  transaction_amount  merchant_id  device_id  \
0                1     1000              793.07          533       5501   
1                2     1000              988.64          778       6481   
2                3     1000              931.95          421       5558   
3                4     1000              766.52          434       6182   
4                5     1000             1039.50          686       5478   
5                6     1000              946.32          205       6526   
6                7     1000              857.47          421       5873   
7                8     1000             1071.00          740       6588   
8                9     1000              927.19          460       5185   
9               10     1000              928.95          423       5761   
10              11     1000              817.40          409       6646   
11              12     1000              905.02          922       6244   
12              13     10

In [17]:
# Persist the dataset next to the notebook, without the DataFrame index column.
output_path = "upi.csv"
upi_behavior_df.to_csv(output_path, index=False)