# Feature Engineering

This notebook focuses on feature engineering and preprocessing for both e-commerce and credit card fraud detection datasets.


## 1. Import Libraries


In [None]:
# Data handling
import pandas as pd
import numpy as np
from pathlib import Path
# Preprocessing, splitting and resampling utilities
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# Progress bars
from tqdm import tqdm
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, not just a specific noisy one - confirm that is intended.
warnings.filterwarnings('ignore')

# Enable tqdm for pandas (provides the `progress_apply` method used further down)
tqdm.pandas()


## 2. Load Data


In [None]:
# Read the three raw CSV inputs (paths are relative to the notebook's directory)
fraud_data, ip_country, creditcard_data = (
    pd.read_csv(raw_csv)
    for raw_csv in (
        '../data/raw/Fraud_Data.csv',
        '../data/raw/IpAddress_to_Country.csv',
        '../data/raw/creditcard.csv',
    )
)

# Report the dimensions of each loaded frame
print("Data loaded successfully")
for shape_label, loaded_frame in (
    ("\nFraud Data shape", fraud_data),
    ("IP Country mapping shape", ip_country),
    ("Credit Card Data shape", creditcard_data),
):
    print(f"{shape_label}: {loaded_frame.shape}")


## 3. Data Cleaning - Fraud Data


In [None]:
# Work on a copy so the frame loaded from disk stays untouched
df_fraud = fraud_data.copy()

section_rule = "=" * 80
sub_rule = "-" * 50

print(section_rule)
print("DATA CLEANING - FRAUD DATA")
print(section_rule)

# Step 1: per-column null counts and their share of all rows
print("\n1. Missing Values Analysis:")
print(sub_rule)
missing = df_fraud.isnull().sum()
missing_pct = missing.div(len(df_fraud)).mul(100)
missing_df = missing.to_frame('Missing Count').assign(Percentage=missing_pct)
columns_with_nulls = missing_df.loc[missing_df['Missing Count'] > 0]
print(columns_with_nulls)
if columns_with_nulls.empty:
    print("✓ No missing values found!")

# Step 2: exact duplicate rows are reported and, if present, removed
print("\n2. Duplicate Analysis:")
print(sub_rule)
duplicates = df_fraud.duplicated().sum()
duplicate_share = duplicates / len(df_fraud) * 100
print(f"Duplicate rows: {duplicates} ({duplicate_share:.2f}%)")
if duplicates == 0:
    print("✓ No duplicates found!")
else:
    df_fraud = df_fraud.drop_duplicates()
    print(f"✓ Removed {duplicates} duplicate rows")
    print(f"New shape: {df_fraud.shape}")

# Step 3: dtype corrections, showing the dtypes before and after
print("\n3. Data Type Corrections:")
print(sub_rule)
print("Original data types:")
print(df_fraud.dtypes)

# Both timestamp columns become datetime64
for time_column in ('signup_time', 'purchase_time'):
    df_fraud[time_column] = pd.to_datetime(df_fraud[time_column])

# The IP address is cast to int64 (any fractional part is discarded by the cast)
df_fraud['ip_address'] = df_fraud['ip_address'].astype('int64')

print("\n✓ Converted signup_time and purchase_time to datetime")
print("✓ Converted ip_address to int64")

print("\nUpdated data types:")
print(df_fraud.dtypes)


In [None]:
geo_rule = "=" * 80
print(geo_rule)
print("GEOLOCATION INTEGRATION")
print(geo_rule)

# Cast both range-bound columns to int64 so they compare cleanly with integer IPs
for bound_column in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip_country[bound_column] = ip_country[bound_column].astype('int64')

# Show the resulting schema, size and a sample of the range table
print("\nIP Country mapping data types:")
print(ip_country.dtypes)
print(f"\nIP Country mapping shape: {ip_country.shape}")
print("\nSample IP ranges:")
print(ip_country.head(10))
# Function to map IP address to country using range-based lookup
def map_ip_to_country(ip_address, ip_country_df):
    """
    Return the country whose IP range contains ``ip_address``.

    Every row of ``ip_country_df`` is compared against the address (a full
    scan of the table per call), using the ``lower_bound_ip_address`` and
    ``upper_bound_ip_address`` columns as inclusive bounds.

    Returns the ``country`` value of the first matching row in table order,
    or the string 'Unknown' when no range contains the address.
    """
    at_or_above_lower = ip_country_df['lower_bound_ip_address'].le(ip_address)
    at_or_below_upper = ip_country_df['upper_bound_ip_address'].ge(ip_address)
    hits = ip_country_df.loc[at_or_above_lower & at_or_below_upper, 'country']

    # No containing range at all
    if hits.empty:
        return 'Unknown'

    # Several ranges may match if the table overlaps; the first one wins
    return hits.iloc[0]

# Map every transaction's IP address to a country with a single range join.
#
# The previous version computed a merge_asof result and discarded it, built a
# validity mask that only looked at the first row's range, and then fell back
# to a per-row scan of the whole range table. One merge_asof plus an
# upper-bound check gives the same mapping in a fraction of the time.
print("\nMapping IP addresses to countries...")

# merge_asof requires both sides to be sorted on their join keys
ip_country_sorted = ip_country.sort_values('lower_bound_ip_address').reset_index(drop=True)

# Minimal lookup frame: the IP plus each row's original position, so the result
# can be put back in df_fraud's row order whatever its index looks like.
ip_lookup = pd.DataFrame({
    'ip_address': df_fraud['ip_address'].to_numpy(),
    '_row_position': np.arange(len(df_fraud)),
}).sort_values('ip_address', kind='stable')

# For each IP, pick the range with the greatest lower bound <= IP
ip_lookup = pd.merge_asof(
    ip_lookup,
    ip_country_sorted[['lower_bound_ip_address', 'upper_bound_ip_address', 'country']],
    left_on='ip_address',
    right_on='lower_bound_ip_address',
    direction='backward'
)

# A candidate range only counts if the IP is also <= its upper bound. Rows with
# no candidate have a NaN upper bound, which compares False and so also become
# 'Unknown'.
# NOTE(review): assumes the ranges in ip_country do not overlap; with overlapping
# ranges only the one with the nearest lower bound is considered - confirm.
valid_mask = ip_lookup['ip_address'] <= ip_lookup['upper_bound_ip_address']
ip_lookup['country'] = ip_lookup['country'].where(valid_mask, 'Unknown')

# Restore the original row order and attach the result positionally
df_fraud['country'] = ip_lookup.sort_values('_row_position')['country'].to_numpy()

print(f"\n✓ IP to country mapping completed!")
print(f"\nCountry distribution:")
print(df_fraud['country'].value_counts().head(10))


## 5. Feature Engineering - Transaction Frequency and Velocity


In [None]:
print("="*80)
print("FEATURE ENGINEERING - TRANSACTION FREQUENCY AND VELOCITY")
print("="*80)


def _count_prior_in_window(user_ids, purchase_times, window):
    """Count each row's earlier same-user purchases inside a look-back window.

    Args:
        user_ids: 1-D array of user identifiers in which all rows of one user
            are adjacent.
        purchase_times: 1-D ``datetime64`` array aligned with ``user_ids`` and
            ascending within each user.
        window: ``numpy.timedelta64`` length of the look-back window.

    Returns:
        ``int64`` array where entry ``i`` is the number of rows of the same
        user with ``purchase_times[i] - window <= time < purchase_times[i]``.
    """
    counts = np.zeros(len(purchase_times), dtype=np.int64)
    if len(purchase_times) == 0:
        return counts

    # Positions where the user changes delimit the per-user runs
    boundaries = np.flatnonzero(user_ids[1:] != user_ids[:-1]) + 1
    starts = np.concatenate(([0], boundaries))
    stops = np.concatenate((boundaries, [len(purchase_times)]))

    for start, stop in zip(starts, stops):
        if stop - start < 2:
            continue  # a single transaction has no earlier ones to count
        times = purchase_times[start:stop]
        # side='left' yields the number of entries strictly smaller than the probe
        earlier = np.searchsorted(times, times, side='left')
        too_old = np.searchsorted(times, times - window, side='left')
        counts[start:stop] = earlier - too_old
    return counts


# Sort by user_id and purchase_time for time-based calculations
df_fraud = df_fraud.sort_values(['user_id', 'purchase_time']).reset_index(drop=True)

# 1. Transaction frequency per user (total transactions per user)
print("\n1. Calculating transaction frequency per user...")
user_transaction_count = df_fraud.groupby('user_id').size().reset_index(name='transaction_count')
df_fraud = df_fraud.merge(user_transaction_count, on='user_id', how='left')

# 2. Transaction velocity - transactions in time windows
print("2. Calculating transaction velocity...")

# Hours elapsed since the same user's previous transaction
df_fraud['prev_purchase_time'] = df_fraud.groupby('user_id')['purchase_time'].shift(1)
df_fraud['time_since_last_transaction'] = (
    df_fraud['purchase_time'] - df_fraud['prev_purchase_time']
).dt.total_seconds() / 3600  # in hours

# A user's first transaction has no predecessor; 999999 hours is used as a
# "very long ago" sentinel instead of NaN
df_fraud['time_since_last_transaction'] = df_fraud['time_since_last_transaction'].fillna(999999)

# Earlier same-user transactions in the last 24 hours, 7 days, 30 days.
# The previous row-by-row loop filtered the whole frame for every row
# (quadratic in the number of rows); the sorted order established above lets
# each user's counts be computed with binary searches instead.
print("   - Transactions in last 24 hours, 7 days, 30 days...")
sorted_user_ids = df_fraud['user_id'].to_numpy()
sorted_purchase_times = df_fraud['purchase_time'].to_numpy(dtype='datetime64[ns]')

velocity_windows = (
    ('transactions_last_24h', pd.Timedelta(hours=24)),
    ('transactions_last_7d', pd.Timedelta(days=7)),
    ('transactions_last_30d', pd.Timedelta(days=30)),
)
for velocity_column, window in velocity_windows:
    df_fraud[velocity_column] = _count_prior_in_window(
        sorted_user_ids, sorted_purchase_times, window.to_timedelta64()
    )

print("✓ Transaction frequency and velocity features created!")

# Display summary
print("\nTransaction frequency statistics:")
print(df_fraud[['transaction_count', 'transactions_last_24h', 'transactions_last_7d', 'transactions_last_30d']].describe())


In [None]:
time_rule = "=" * 80
print(time_rule)
print("FEATURE ENGINEERING - TIME-BASED FEATURES")
print(time_rule)

purchase_ts = df_fraud['purchase_time']

# Calendar components of the purchase timestamp
df_fraud['hour_of_day'] = purchase_ts.dt.hour
df_fraud['day_of_week'] = purchase_ts.dt.dayofweek  # 0=Monday, 6=Sunday
df_fraud['day_of_month'] = purchase_ts.dt.day
df_fraud['month'] = purchase_ts.dt.month

# Hours elapsed between account signup and the purchase
account_age = purchase_ts - df_fraud['signup_time']
df_fraud['time_since_signup'] = account_age.dt.total_seconds().div(3600)

# 0/1 flags: Saturday or Sunday, and purchases from 9 AM up to (not including) 5 PM
df_fraud['is_weekend'] = df_fraud['day_of_week'].ge(5).astype(int)
purchase_hour = df_fraud['hour_of_day']
df_fraud['is_business_hours'] = (purchase_hour.ge(9) & purchase_hour.lt(17)).astype(int)

print("✓ Time-based features created!")
print("\nTime-based features summary:")
time_features = [
    'hour_of_day',
    'day_of_week',
    'day_of_month',
    'month',
    'time_since_signup',
    'is_weekend',
    'is_business_hours',
]
print(df_fraud[time_features].describe())


## 7. Data Transformation - Scaling and Encoding


In [None]:
print("="*80)
print("DATA TRANSFORMATION")
print("="*80)

# Separate features and target
X = df_fraud.drop('class', axis=1)
y = df_fraud['class']

# Identify numerical and categorical features
numerical_features = ['purchase_value', 'age', 'ip_address', 
                     'transaction_count', 'time_since_last_transaction',
                     'transactions_last_24h', 'transactions_last_7d', 
                     'transactions_last_30d', 'time_since_signup',
                     'hour_of_day', 'day_of_week', 'day_of_month', 'month']

categorical_features = ['source', 'browser', 'sex', 'country', 
                       'device_id', 'user_id']

# Keep only features that exist in the dataframe
numerical_features = [f for f in numerical_features if f in X.columns]
categorical_features = [f for f in categorical_features if f in X.columns]

print(f"\nNumerical features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Create a copy for transformation
X_transformed = X.copy()

# 1. Normalize/Scale numerical features
print("\n1. Scaling numerical features...")
# Fit the scaler on training rows only so the test rows' mean/variance do not
# leak into the features. The row selection mirrors the split performed in the
# Train-Test Split section: a stratified split depends only on the number of
# rows, the labels and the seed, so identical arguments select identical rows.
# Keep test_size / random_state / stratify in sync with that split.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)
scaler = StandardScaler()
scaler.fit(X[numerical_features].iloc[train_positions])
X_transformed[numerical_features] = scaler.transform(X[numerical_features])
print("✓ Applied StandardScaler to numerical features")

# 2. Encode categorical features
print("\n2. Encoding categorical features...")
# Features with at most 20 distinct values are one-hot encoded; the rest are
# label encoded to avoid creating one column per distinct value.
low_cardinality = []
high_cardinality = []

for feature in categorical_features:
    unique_count = X[feature].nunique()
    if unique_count <= 20:  # Threshold for one-hot encoding
        low_cardinality.append(feature)
    else:
        high_cardinality.append(feature)
        print(f"   - {feature}: {unique_count} unique values (will use label encoding)")

# One-hot encode low cardinality features (first level dropped as the baseline)
if low_cardinality:
    X_encoded = pd.get_dummies(X_transformed[low_cardinality], prefix=low_cardinality, drop_first=True)
    X_transformed = pd.concat([X_transformed.drop(low_cardinality, axis=1), X_encoded], axis=1)
    print(f"✓ Applied One-Hot Encoding to: {low_cardinality}")

# Label encode high cardinality features. The dict is always defined (possibly
# empty) so later code can rely on the name existing.
# NOTE(review): the encoders are fitted on all rows, and label-encoded
# identifiers such as user_id/device_id carry an arbitrary integer order -
# confirm these should be model inputs at all.
label_encoders = {}
if high_cardinality:
    for feature in high_cardinality:
        le = LabelEncoder()
        X_transformed[feature] = le.fit_transform(X[feature].astype(str))
        label_encoders[feature] = le
    print(f"✓ Applied Label Encoding to: {high_cardinality}")

# Drop time columns that were used for feature engineering but shouldn't be in final model
columns_to_drop = ['signup_time', 'purchase_time', 'prev_purchase_time']
columns_to_drop = [c for c in columns_to_drop if c in X_transformed.columns]
if columns_to_drop:
    X_transformed = X_transformed.drop(columns_to_drop, axis=1)
    print(f"✓ Dropped time columns: {columns_to_drop}")

print(f"\nFinal feature shape: {X_transformed.shape}")
print(f"Final features: {X_transformed.columns.tolist()}")


## 8. Train-Test Split


In [None]:
split_rule = "=" * 80
print(split_rule)
print("TRAIN-TEST SPLIT")
print(split_rule)

# Stratified 80/20 split, done before any resampling so that synthetic
# samples can never end up in the evaluation data
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

# Per-class counts and the class-0 : class-1 ratio for each partition
train_class_counts = y_train.value_counts()
print("\nClass distribution in training set:")
print(train_class_counts)
print(f"Imbalance ratio: {train_class_counts[0] / train_class_counts[1]:.2f}:1")

test_class_counts = y_test.value_counts()
print("\nClass distribution in test set:")
print(test_class_counts)
print(f"Imbalance ratio: {test_class_counts[0] / test_class_counts[1]:.2f}:1")


## 9. Handle Class Imbalance


In [None]:
imbalance_rule = "=" * 80
print(imbalance_rule)
print("HANDLING CLASS IMBALANCE")
print(imbalance_rule)

# Training-label balance prior to any resampling
counts_before = y_train.value_counts()
shares_before = y_train.value_counts(normalize=True)
print("\nBEFORE RESAMPLING:")
print("-" * 50)
print(f"Training set - Legitimate: {counts_before[0]:,} ({shares_before[0]*100:.2f}%)")
print(f"Training set - Fraud: {counts_before[1]:,} ({shares_before[1]*100:.2f}%)")
print(f"Imbalance ratio: {counts_before[0] / counts_before[1]:.2f}:1")

# Resampling technique used here: SMOTE (Synthetic Minority Oversampling
# Technique). The rationale recorded by the notebook is printed below.
smote_justification = [
    "1. Creates synthetic samples rather than duplicating (reduces overfitting)",
    "2. Effective for highly imbalanced datasets",
    "3. Preserves original data distribution while balancing classes",
    "4. Better than simple oversampling (reduces overfitting risk)",
    "5. Better than undersampling (preserves valuable data)",
]
print("\n" + imbalance_rule)
print("TECHNIQUE SELECTION: SMOTE")
print(imbalance_rule)
print("Justification:")
for justification_line in smote_justification:
    print(justification_line)
print(imbalance_rule)

# Only the training partition is resampled; the test partition is left as is.
# sampling_strategy=0.5 targets a fraud:legitimate ratio of 1:2
# ('auto' would give 1:1).
print("\nApplying SMOTE to training data...")
smote = SMOTE(random_state=42, sampling_strategy=0.5)

try:
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
except Exception as smote_error:
    print(f"Error applying SMOTE: {smote_error}")
    print("Trying with different parameters...")
    # Second attempt with a smaller neighbourhood (k_neighbors=3)
    smote = SMOTE(random_state=42, k_neighbors=3, sampling_strategy=0.5)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    print("✓ SMOTE applied successfully with adjusted parameters!")
else:
    print("✓ SMOTE applied successfully!")

# Training-label balance after resampling
counts_after = y_train_resampled.value_counts()
shares_after = y_train_resampled.value_counts(normalize=True)
original_size = len(X_train)
resampled_size = len(X_train_resampled)
print("\nAFTER RESAMPLING (SMOTE):")
print("-" * 50)
print(f"Training set - Legitimate: {counts_after[0]:,} ({shares_after[0]*100:.2f}%)")
print(f"Training set - Fraud: {counts_after[1]:,} ({shares_after[1]*100:.2f}%)")
print(f"Imbalance ratio: {counts_after[0] / counts_after[1]:.2f}:1")
print(f"\nOriginal training set size: {original_size:,}")
print(f"Resampled training set size: {resampled_size:,}")
print(f"New samples created: {resampled_size - original_size:,}")

# The test partition was not passed through SMOTE; report its counts as they are
test_counts = y_test.value_counts()
print("\n✓ Test set remains unchanged (no resampling applied)")
print(f"Test set - Legitimate: {test_counts[0]:,}")
print(f"Test set - Fraud: {test_counts[1]:,}")


## 10. Save Processed Data


In [None]:
save_rule = "=" * 80
print(save_rule)
print("SAVING PROCESSED DATA")
print(save_rule)

# Output directory for the processed CSV files
processed_dir = Path('../data/processed')
processed_dir.mkdir(exist_ok=True)

# File name -> dataset; the resampled training data and the untouched test data
csv_outputs = {
    'X_train_processed.csv': X_train_resampled,
    'y_train_processed.csv': y_train_resampled,
    'X_test_processed.csv': X_test,
    'y_test_processed.csv': y_test,
}
for csv_name, dataset in csv_outputs.items():
    dataset.to_csv(processed_dir / csv_name, index=False)

print("✓ Saved processed datasets:")
for csv_name in csv_outputs:
    print(f"  - {processed_dir / csv_name}")

# Persist the fitted preprocessing objects so they can be reloaded later
import joblib
models_dir = Path('../models')
models_dir.mkdir(exist_ok=True)

scaler_path = models_dir / 'scaler.pkl'
encoders_path = models_dir / 'label_encoders.pkl'

joblib.dump(scaler, scaler_path)
# The encoders are only written when the name exists in the current namespace
if 'label_encoders' in locals():
    joblib.dump(label_encoders, encoders_path)

print("\n✓ Saved preprocessing objects:")
print(f"  - {scaler_path}")
if 'label_encoders' in locals():
    print(f"  - {encoders_path}")

# Closing summary of what was produced
print("\n" + save_rule)
print("FEATURE ENGINEERING COMPLETE!")
print(save_rule)
print(f"\nFinal feature count: {X_train_resampled.shape[1]}")
print(f"Training samples: {len(X_train_resampled):,}")
print(f"Test samples: {len(X_test):,}")
print("\n✓ Data is ready for modeling!")
