In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import sys
import os
# Load helper functions
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
from src.data_loader import load_fraud_data
from src.feature_engineering import add_time_features

# Load the merged fraud dataset through the project loader.
# The file is expected to already contain the geolocation columns
# (per its name) — confirm against the upstream merge step.
fraud_df = load_fraud_data("../data/merged_fraud_geo.csv")


In [2]:
# Parse both timestamp columns into pandas datetimes, one column at a time.
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

In [3]:
# Derive time-based columns with the project helper.
# NOTE(review): `time_since_signup`, used later for transaction velocity, is not
# created in any visible cell — presumably add_time_features adds it; confirm in
# src/feature_engineering.
# The frame is rebound to the helper's return value, so re-running this cell on
# its own feeds the already-augmented frame back into the helper.
fraud_df = add_time_features(fraud_df)

In [4]:
# --- Transaction frequency ---
# For every row, record how many rows in the frame share that row's user_id.
# The per-user counts are looked up directly from the value_counts() Series.
user_txn_count = fraud_df['user_id'].value_counts()
fraud_df['user_transaction_count'] = fraud_df['user_id'].map(user_txn_count)

In [6]:
# --- Transaction velocity ---
# Transactions per unit of time elapsed since signup.
# NOTE(review): the unit of `time_since_signup` is not visible here (presumably
# hours) — confirm where that column is produced.
# Adding 1 keeps the denominator non-zero when time_since_signup is 0; a value
# of exactly -1 would still divide by zero.
velocity_denominator = fraud_df['time_since_signup'] + 1
fraud_df['transaction_velocity'] = (
    fraud_df['user_transaction_count'] / velocity_denominator
)

In [7]:
# --- Device sharing ---
# Number of DISTINCT users observed on each device.
# Counting distinct user_id values (rather than rows per device) means a single
# user with several transactions on one device contributes 1, which is what the
# `device_user_count` name promises. When every user_id appears exactly once the
# result equals a plain row count per device.
# Rows with a missing device_id get NaN (no matching group).
device_counts = fraud_df.groupby('device_id')['user_id'].nunique()
fraud_df['device_user_count'] = fraud_df['device_id'].map(device_counts)

In [8]:
# --- IP sharing ---
# Count the number of DISTINCT users per IP address.
# Distinct user_id values are counted (not rows), so repeated transactions by
# the same user from one IP do not inflate the figure. When every user_id
# appears exactly once the result equals a plain row count per IP.
# Rows with a missing ip_address get NaN (no matching group).
ip_counts = fraud_df.groupby('ip_address')['user_id'].nunique()
fraud_df['ip_user_count'] = fraud_df['ip_address'].map(ip_counts)



In [9]:
# 6. --- Country risk feature ---
# Mean of `class` within each country, then broadcast back onto every row
# through that row's country value.
# NOTE(review): the mean is taken over the whole frame, so each row's own
# `class` value feeds its own feature. If `class` is the model target this
# leaks label information — consider fitting the rates on the training split only.
fraud_rate_per_country = fraud_df['class'].groupby(fraud_df['country']).mean()
fraud_df['country_risk'] = fraud_df['country'].map(fraud_rate_per_country)



In [10]:
# 7. --- Encode categorical variables ---
# Each listed column is cast to str (so missing values become a literal string
# category) and replaced by integer codes from its own LabelEncoder.
label_cols = ['browser', 'source', 'sex', 'country']

# Keep every fitted encoder, keyed by column name, so the integer codes can be
# decoded later (`label_encoders[col].inverse_transform(...)`) or applied to new
# data with the same mapping instead of being lost after the loop.
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    fraud_df[col] = le.fit_transform(fraud_df[col].astype(str))
    label_encoders[col] = le

# NOTE(review): this cell overwrites the columns it reads, so re-running it
# re-encodes the integer codes as strings; run it once per fresh kernel.

# 8. --- Drop unneeded raw columns (optional cleanup; performed in the next cell) ---


In [11]:
# Remove the raw timestamp / identifier columns now that features derived from
# them exist. The frame is rebound (no inplace=True) and errors='ignore' is
# used so that re-running this cell after the columns are already gone does
# not raise a KeyError.
fraud_df = fraud_df.drop(
    columns=['signup_time', 'purchase_time', 'device_id', 'ip_address'],
    errors='ignore',
)

# 9. Save the engineered dataset (path is relative to the notebook's working directory).
fraud_df.to_csv("../data/fraud_with_features.csv", index=False)

print("✅ Feature engineering completed. File saved as: data/fraud_with_features.csv")

✅ Feature engineering completed. File saved as: data/fraud_with_features.csv
