Feature Engineering For Mobile_Payments_Fraud_Detection

In [11]:
import pandas as pd
import numpy as np


In [12]:
df=pd.read_csv("../data/processed/transactions_clean.csv")
df["timestamp"]=pd.to_datetime(df["timestamp"],errors="coerce")
df=df.sort_values(["user_id","timestamp"]).reset_index(drop=True)


In [13]:

df["user_txn_count"] = df.groupby("user_id")["transaction_id"].transform("count")
df["user_txn_sum"]   = df.groupby("user_id")["amount"].transform("sum")
df["user_avg_amount"] = df.groupby("user_id")["amount"].transform("mean")
df["user_std_amount"] = df.groupby("user_id")["amount"].transform("std").replace(0, 1)


df["amount_zscore_user"] = (df["amount"] - df["user_avg_amount"]) / df["user_std_amount"]

df["time_diff"] = df.groupby("user_id")["timestamp"].diff().dt.total_seconds().fillna(999999)

df["txn_velocity"] = 1 / (df["time_diff"] + 1)


In [14]:
df["merchant_txn_count"] = df.groupby("merchant_id")["transaction_id"].transform("count")
df["merchant_unique_users"] = df.groupby("merchant_id")["user_id"].transform("nunique")
df["merchant_fraud_rate"] = df.groupby("merchant_id")["is_fraud"].transform("mean")


In [15]:
device_fraud_rate = df.groupby("device_type")["is_fraud"].mean().to_dict()
df["device_risk_score"] = df["device_type"].map(device_fraud_rate)

location_fraud_rate = df.groupby("location")["is_fraud"].mean().to_dict()
df["location_risk_score"] = df["location"].map(location_fraud_rate)

df["location_device_interaction"] = df["device_risk_score"] * df["location_risk_score"]


In [16]:
def rolling_counts(user_df):
    user_df = user_df.sort_values('timestamp').set_index('timestamp')
    user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
    user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
    return user_df.reset_index()

df = df.groupby('user_id', group_keys=False).apply(rolling_counts).reset_index(drop=True)


  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolling('24H').count() - 1
  user_df['txn_count_1h'] = user_df['amount'].rolling('1H').count() - 1
  user_df['txn_count_24h'] = user_df['amount'].rolli

In [17]:
df['txn_count_1h'] = df['txn_count_1h'].fillna(0).astype(int)
df['txn_count_24h'] = df['txn_count_24h'].fillna(0).astype(int)

In [18]:
df["is_high_amount"] = (df["amount"] > df["user_avg_amount"] * 3).astype(int)
df["is_new_user"] = (df["user_txn_count"] < 5).astype(int)
df["is_high_velocity"] = (df["txn_velocity"] > df["txn_velocity"].quantile(0.95)).astype(int)
df["is_risky_device"] = (df["device_risk_score"] > df["device_risk_score"].quantile(0.9)).astype(int)
df["is_risky_location"] = (df["location_risk_score"] > df["location_risk_score"].quantile(0.9)).astype(int)


In [19]:

df["weighted_amount"] = df["amount"] * df["device_risk_score"] * df["location_risk_score"]

df["user_risk_score"] = (df["device_risk_score"] + df["location_risk_score"]) / 2


In [48]:
df.head(20)

Unnamed: 0,timestamp,transaction_id,user_id,amount,merchant_id,location,device_type,transaction_type,is_fraud,hour,...,location_device_interaction,txn_count_1h,txn_count_24h,is_high_amount,is_new_user,is_high_velocity,is_risky_device,is_risky_location,weighted_amount,user_risk_score
0,2025-05-05 01:02:00,TXN009738,1000,3518.44,2027,Kolkata,iOS,top-up,0,1,...,0.000502,0,0,0,0,0,0,0,1.765185,0.022428
1,2025-05-05 05:33:00,TXN003367,1000,7672.91,2009,Kolkata,Android,transfer,0,5,...,0.000329,0,1,0,0,0,0,0,2.526288,0.018377
2,2025-05-05 07:58:00,TXN008530,1000,3881.75,2027,Bangalore,Android,top-up,0,7,...,0.0003,0,2,0,0,0,0,0,1.164674,0.017433
3,2025-05-05 10:04:00,TXN000201,1000,5997.26,2013,Bangalore,Android,top-up,0,10,...,0.0003,0,3,0,0,0,0,0,1.799409,0.017433
4,2025-05-05 10:08:00,TXN002371,1000,3780.48,2018,Kolkata,Web,purchase,0,10,...,0.000538,1,4,0,0,1,0,0,2.032706,0.023273
5,2025-05-05 11:31:00,TXN006455,1000,6704.47,2045,Kolkata,iOS,transfer,0,11,...,0.000502,0,5,0,0,0,0,0,3.363602,0.022428
6,2025-05-05 15:16:00,TXN005334,1000,9214.64,2001,Bangalore,Android,top-up,0,15,...,0.0003,0,6,0,0,0,0,0,2.764747,0.017433
7,2025-05-06 02:27:00,TXN007160,1000,5726.72,2049,Delhi,Web,purchase,0,2,...,0.000681,0,6,0,0,0,0,0,3.897949,0.026103
8,2025-05-07 12:10:00,TXN003381,1000,3335.63,2023,Kolkata,Web,top-up,0,12,...,0.000538,0,0,0,0,0,0,0,1.793517,0.023273
9,2025-05-07 12:18:00,TXN007610,1000,3291.56,2008,Mumbai,iOS,purchase,0,12,...,0.000429,1,1,0,0,1,0,0,1.413401,0.020894


In [49]:
df.columns.tolist()

['timestamp',
 'transaction_id',
 'user_id',
 'amount',
 'merchant_id',
 'location',
 'device_type',
 'transaction_type',
 'is_fraud',
 'hour',
 'day_of_week',
 'is_weekend',
 'user_txn_count',
 'user_txn_sum',
 'user_avg_amount',
 'user_std_amount',
 'amount_zscore_user',
 'time_diff',
 'txn_velocity',
 'merchant_txn_count',
 'merchant_unique_users',
 'merchant_fraud_rate',
 'device_risk_score',
 'location_risk_score',
 'location_device_interaction',
 'txn_count_1h',
 'txn_count_24h',
 'is_high_amount',
 'is_new_user',
 'is_high_velocity',
 'is_risky_device',
 'is_risky_location',
 'weighted_amount',
 'user_risk_score']

In [None]:
cols_to_encode = [c for c in ["transaction_type", "device_type", "day_of_week"] if c in df.columns]
print("Encoding these columns:", cols_to_encode)

df = pd.get_dummies(df, columns=cols_to_encode, drop_first=True)

# ✅ Drop ID columns that are not useful for model training
id_cols = [c for c in ["transaction_id", "user_id", "merchant_id", "location"] if c in df.columns]
print("Dropping ID columns:", id_cols)
df = df.drop(id_cols, axis=1)

# ✅ Fill missing values with 0
df = df.fillna(0)

# ✅ Log-transform skewed columns (only those that exist)
for col in ["amount", "user_txn_sum", "weighted_amount"]:
    if col in df.columns:
        df[col] = np.log1p(df[col])

bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

# ✅ Save to file
df.to_csv("features_processed.csv", index=False)
print("✅ Feature dataset ready for modeling, shape:", df.shape)


✅ Feature dataset ready for modeling, shape: (10000, 34)
