# Phase 3: Data Preprocessing & Feature Engineering

- Load raw train data
- Clean and transform
- Engineer key features (time-based, frequency, distance, amount, age)
- Handle PII ethically
- Save processed data

## Imports & Path Set-up

In [14]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
from math import radians, sin, cos, sqrt, atan2

RAW_DATA_PATH = '../data/raw/'
PROCESSED_DATA_PATH = '../data/processed/'

os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

train_file = os.path.join(RAW_DATA_PATH, 'fraudTrain.csv')
sample_file = os.path.join(RAW_DATA_PATH, 'fraudTrain_sample_10k.csv')

# For faster development: use sample first, then switch to full train
USE_SAMPLE = False
data_file = sample_file if USE_SAMPLE else train_file

print("Using file:", data_file)

Using file: ../data/raw/fraudTrain.csv


## Load data & Initial cleaning


In [15]:
# Load the CSV data into a DataFrame (skip unnamed index if present)
df = pd.read_csv(data_file, index_col=0)

# Print basic info for verification
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing values:\n", df.isnull().sum())

# Convert transaction time column to datetime for easier extraction (e.g., hour, month)
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')

# Convert date of birth to datetime for age calculation
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Drop any rows with bad dates (should be 0 or few)
df = df.dropna(subset=['trans_date_trans_time'])

# Drop unnecessary/redundant columns (e.g., unique IDs not useful for modeling)
cols_to_drop = ['trans_num', 'unix_time']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Ethical handling: Drop direct PII columns to avoid privacy risks; we'll compute age but consider excluding it from models
pii_cols = ['first', 'last', 'street', 'city', 'state', 'zip', 'job']
df = df.drop(columns=pii_cols, errors='ignore')

print("\nAfter cleaning shape:", df.shape)
df.head(3)

Shape: (1296675, 22)

Columns: ['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']

Missing values:
 trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

After cleaning shape: (1296675, 13)


Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,gender,lat,long,city_pop,dob,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,F,36.0788,-81.1781,3495,1988-03-09,36.011293,-82.048315,0
1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,F,48.8878,-118.2105,149,1978-06-21,49.159047,-118.186462,0
2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,M,42.1808,-112.262,4154,1962-01-19,43.150704,-112.154481,0


## Basic time features

In [16]:
# Extract hour of day from transaction time (for patterns like overnight fraud spikes)
df['trans_hour'] = df['trans_date_trans_time'].dt.hour

# Extract day of week (0=Monday, 6=Sunday; for weekend patterns)
df['trans_dayofweek'] = df['trans_date_trans_time'].dt.dayofweek

# Extract month (for January/holiday seasonality; fraud spikes in Jan post-holidays)
df['trans_month'] = df['trans_date_trans_time'].dt.month

# Research-inspired: Flag late-night hours (midnight-3am, where fraud often spikes due to low monitoring)
df['is_late_night'] = df['trans_hour'].apply(lambda x: 1 if (0 <= x <= 3) else 0)

# Broader night flag (21:00-05:00, as fallback)
df['is_night'] = df['trans_hour'].apply(lambda x: 1 if (x >= 21 or x < 4) else 0)

# Holiday season flag (Nov-Dec for Black Friday/Cyber Monday spikes)
df['is_holiday_season'] = df['trans_month'].apply(lambda x: 1 if x in [11, 12] else 0)

## Distance between user & merchant (Strong fraud signal)

In [17]:
# Haversine function to calculate km distance (accounts for Earth's curve; fraud often far from user location)
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat/2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    return R * c

# Apply to each row to create distance feature
df['distance_km'] = df.apply(
    lambda row: haversine(
        row['lat'], row['long'],
        row['merch_lat'], row['merch_long']
    ), axis=1
)

# Quick stats to verify (distances should range 0-1000+ km)
print(df['distance_km'].describe())

count    1.296675e+06
mean     7.611465e+01
std      2.911693e+01
min      2.225452e-02
25%      5.533491e+01
50%      7.823175e+01
75%      9.850327e+01
max      1.521172e+02
Name: distance_km, dtype: float64


## Time-based frequency & recency per card

In [18]:
# Sort by card number and transaction time (required for groupby diff and rolling)
df = df.sort_values(['cc_num', 'trans_date_trans_time'])

# Time since last transaction (in hours)
df['time_since_last_trans'] = df.groupby('cc_num')['trans_date_trans_time'].diff().dt.total_seconds() / 3600
df['time_since_last_trans'] = df['time_since_last_trans'].fillna(0)

# Creator's functions for rolling counts (exact count in last N days, excluding current)
def last1DayTransactionCount(group):
    temp = pd.Series(group.index, index=group['trans_date_trans_time'], name='count_1_day').sort_index()
    count_1_day = temp.rolling('1d').count() - 1
    count_1_day.index = temp.values
    group['count_1_day'] = count_1_day.reindex(group.index)
    return group

def last7DaysTransactionCount(group):
    temp = pd.Series(group.index, index=group['trans_date_trans_time'], name='count_7_days').sort_index()
    count_7_days = temp.rolling('7d').count() - 1
    count_7_days.index = temp.values
    group['count_7_days'] = count_7_days.reindex(group.index)
    return group

def last30DaysTransactionCount(group):
    temp = pd.Series(group.index, index=group['trans_date_trans_time'], name='count_30_days').sort_index()
    count_30_days = temp.rolling('30d').count() - 1
    count_30_days.index = temp.values
    group['count_30_days'] = count_30_days.reindex(group.index)
    return group

# Apply the functions per group (slow on full data — run once, ~1–2 min)
df = df.groupby('cc_num', group_keys=False).apply(last1DayTransactionCount)
df = df.groupby('cc_num', group_keys=False).apply(last7DaysTransactionCount)
df = df.groupby('cc_num', group_keys=False).apply(last30DaysTransactionCount)

# Quick check
print(df[['cc_num', 'trans_date_trans_time', 'time_since_last_trans', 
          'count_1_day', 'count_7_days', 'count_30_days']].head(10))

  df = df.groupby('cc_num', group_keys=False).apply(last1DayTransactionCount)
  df = df.groupby('cc_num', group_keys=False).apply(last7DaysTransactionCount)


           cc_num trans_date_trans_time  time_since_last_trans  count_1_day  \
1017  60416207185   2019-01-01 12:47:15               0.000000          0.0   
2724  60416207185   2019-01-02 08:44:57              19.961667          1.0   
2726  60416207185   2019-01-02 08:47:36               0.044167          2.0   
2882  60416207185   2019-01-02 12:38:14               3.843889          3.0   
2907  60416207185   2019-01-02 13:10:46               0.542222          3.0   
4135  60416207185   2019-01-03 13:56:35              24.763611          0.0   
4337  60416207185   2019-01-03 17:05:10               3.143056          1.0   
5467  60416207185   2019-01-04 13:59:55              20.912500          1.0   
6027  60416207185   2019-01-04 21:17:22               7.290833          1.0   
6273  60416207185   2019-01-05 00:42:24               3.417222          2.0   

      count_7_days  count_30_days  
1017           0.0            0.0  
2724           1.0            1.0  
2726           2.0    

  df = df.groupby('cc_num', group_keys=False).apply(last30DaysTransactionCount)


## 

In [19]:
# New feature: Distance from "usual" location (mean lat/long of last 5 transactions per card)
# Calculate rolling mean lat/long per card
df['usual_lat'] = df.groupby('cc_num')['merch_lat'].rolling(window=5, min_periods=1).mean().shift(1).reset_index(level=0, drop=True)
df['usual_long'] = df.groupby('cc_num')['merch_long'].rolling(window=5, min_periods=1).mean().shift(1).reset_index(level=0, drop=True)
df['usual_lat'] = df['usual_lat'].fillna(df['merch_lat'])  # for first transactions, use current
df['usual_long'] = df['usual_long'].fillna(df['merch_long'])

# Calculate distance from usual location (using haversine function)
df['distance_from_usual_km'] = df.apply(
    lambda row: haversine(row['usual_lat'], row['usual_long'], row['merch_lat'], row['merch_long']),
    axis=1
)

# Quick check
print(df[['cc_num', 'merch_lat', 'merch_long', 'usual_lat', 'usual_long', 'distance_from_usual_km']].head(10))

           cc_num  merch_lat  merch_long  usual_lat  usual_long  \
1017  60416207185  43.974711 -109.741904  43.974711 -109.741904   
2724  60416207185  42.018766 -109.044172  43.974711 -109.741904   
2726  60416207185  42.961335 -109.157564  42.996738 -109.393038   
2882  60416207185  42.228227 -108.747683  42.984937 -109.314547   
2907  60416207185  43.321745 -108.091143  42.795760 -109.172831   
4135  60416207185  43.477317 -109.467136  42.900957 -108.956493   
4337  60416207185  42.871477 -109.160268  42.801478 -108.901540   
5467  60416207185  43.332599 -108.318444  42.972020 -108.924759   
6027  60416207185  43.598123 -108.977767  43.046273 -108.756935   
6273  60416207185  42.314401 -108.554520  43.320252 -108.802952   

      distance_from_usual_km  
1017                0.000000  
2724              224.769219  
2726               19.556262  
2882               96.083975  
2907              105.563313  
4135               76.296273  
4337               22.486494  
5467          

## Amount Transformations

In [20]:
# Log transform amount (handles skewness; fraud often in extreme amounts)
df['log_amt'] = np.log1p(df['amt'])  # log(1 + amt) to avoid log(0)

## Age features based on research (targetted groups between 30 and 39

In [21]:
# Approximate age at transaction (year difference; for patterns like senior targeting)
trans_year = df['trans_date_trans_time'].dt.year
birth_year = df['dob'].dt.year
df['age_at_trans'] = trans_year - birth_year

# Bin age for groups (seniors 60+ for higher loss insights)
bins = [0, 20, 30, 40, 50, 60, 120]
labels = ['<20', '20-29', '30-39', '40-49', '50-59', '60+']
df['age_bin'] = pd.cut(df['age_at_trans'], bins=bins, labels=labels)

# Optional flag based on stats for 30-39 (highest volume group)
# df['is_30_39'] = df['age_bin'].apply(lambda x: 1 if x == '30-39' else 0)

## Reorder columns & save processed data

In [22]:
# Drop high-cardinality columns to reduce memory & overfitting
high_cardinality_cols = ['merchant', 'cc_num']  # add any others if needed
df = df.drop(columns=high_cardinality_cols, errors='ignore')

# Reorder columns for clarity (focus on key ones first)
important_cols = ['trans_date_trans_time', 'category', 'amt', 'log_amt',
                  'trans_hour', 'is_late_night', 'trans_month', 'is_holiday_season',
                  'distance_km', 'time_since_last_trans', 'count_30_days', 'count_7_days', 
                  'count_1_day', 'age_at_trans', 'age_bin', 'is_fraud']

df_processed = df[important_cols + [col for col in df.columns if col not in important_cols]]

# Save the processed DataFrame to CSV
processed_file = os.path.join(PROCESSED_DATA_PATH, 'train_processed.csv')
df_processed.to_csv(processed_file, index=False)
print("Processed data saved to:", processed_file)

# Final verification
print("\nProcessed shape:", df_processed.shape)
df_processed.head(3)

Processed data saved to: ../data/processed/train_processed.csv

Processed shape: (1296675, 28)


Unnamed: 0,trans_date_trans_time,category,amt,log_amt,trans_hour,is_late_night,trans_month,is_holiday_season,distance_km,time_since_last_trans,...,long,city_pop,dob,merch_lat,merch_long,trans_dayofweek,is_night,usual_lat,usual_long,distance_from_usual_km
1017,2019-01-01 12:47:15,misc_net,7.27,2.112635,12,0,1,0,127.606239,0.0,...,-108.8964,1645,1986-02-17,43.974711,-109.741904,1,0,43.974711,-109.741904,0.0
2724,2019-01-02 08:44:57,gas_transport,52.94,3.987872,8,0,1,0,110.308921,19.961667,...,-108.8964,1645,1986-02-17,42.018766,-109.044172,2,0,43.974711,-109.741904,224.769219
2726,2019-01-02 08:47:36,gas_transport,82.08,4.419804,8,0,1,0,21.787261,0.044167,...,-108.8964,1645,1986-02-17,42.961335,-109.157564,2,0,42.996738,-109.393038,19.556262
