# Feature Engineering

This notebook demonstrates the feature engineering pipeline for creating meaningful features from raw data.


In [None]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# Add src to path
# Path.cwd() is the kernel's working directory, so this resolves to "<cwd>/../src".
# NOTE(review): assumes the notebook is run from a folder that is a sibling of src/ — confirm.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(src_dir)

from data_loader import DataLoader
from data_cleaner import DataCleaner
from feature_engineer import FeatureEngineer
from geolocation import GeolocationMapper

print("Modules imported successfully!")


## Load and Prepare Data


In [None]:
# Build the helper objects this notebook works with.
loader = DataLoader(data_dir="../data/raw")
cleaner = DataCleaner()
feature_engineer = FeatureEngineer()

# Read the fraud dataset. A missing file is reported rather than raised, and
# fraud_df is set to None so the later cells can skip themselves.
try:
    fraud_df = loader.load_csv("Fraud_Data.csv")
except FileNotFoundError as err:
    print(f"Data file not found: {err}")
    fraud_df = None
else:
    print(f"Loaded data: {fraud_df.shape}")


## Time-Based Features


In [None]:
# Derive time features from purchase_time via FeatureEngineer.extract_time_features.
# Skipped when the data failed to load or the column is absent.
if fraud_df is not None and "purchase_time" in fraud_df.columns:
    # Snapshot the column set first: the raw frame already holds columns that
    # begin with "purchase_" (purchase_time itself at minimum), so filtering by
    # name prefix would report raw inputs as newly created features.
    columns_before = set(fraud_df.columns)

    # Extract time features from purchase_time
    fraud_df = feature_engineer.extract_time_features(
        fraud_df,
        datetime_column="purchase_time",
        prefix="purchase"
    )
    print(f"After time feature extraction: {fraud_df.shape}")
    print("\nNew time features created:")
    # Only the columns added by the call above, in DataFrame order.
    # NOTE(review): re-running this cell without reloading the data will
    # presumably list nothing, since the features would already exist.
    time_features = [col for col in fraud_df.columns if col not in columns_before]
    print(time_features)


## Time Since Signup


In [None]:
# Time-since-signup features: calls FeatureEngineer.calculate_time_since_signup
# with purchase_time as the reference column. Runs only when the data loaded
# and both timestamp columns are present.
signup_required_columns = ("signup_time", "purchase_time")
if fraud_df is not None and all(name in fraud_df.columns for name in signup_required_columns):
    fraud_df = feature_engineer.calculate_time_since_signup(
        fraud_df, signup_column="signup_time", reference_column="purchase_time"
    )
    print("Time since signup features created")
    print("\nSample values:")
    # Preview the two columns the call above is expected to add
    # (NOTE(review): names assumed from this selection — confirm in FeatureEngineer).
    signup_preview = fraud_df[["time_since_signup", "time_since_signup_days"]].head()
    print(signup_preview)


## Transaction Frequency


In [None]:
# Per-user transaction frequency features over several time windows, computed
# by FeatureEngineer.calculate_transaction_frequency. Runs only when the data
# loaded and both the user and timestamp columns are present.
frequency_required_columns = ("user_id", "purchase_time")
# NOTE(review): if these strings are handed to pandas as offset aliases, the
# uppercase "H" is deprecated from pandas 2.2 in favour of "h". Confirm how
# FeatureEngineer consumes them before changing (output column names may
# depend on the exact strings).
frequency_windows = ["1H", "24H", "7D", "30D"]
if fraud_df is not None and all(name in fraud_df.columns for name in frequency_required_columns):
    fraud_df = feature_engineer.calculate_transaction_frequency(
        fraud_df,
        user_column="user_id",
        datetime_column="purchase_time",
        time_windows=frequency_windows,
    )
    print("Transaction frequency features created")
    # Collect every column whose name carries the "txn_freq_" prefix.
    freq_features = list(filter(lambda name: name.startswith("txn_freq_"), fraud_df.columns))
    print(f"\nFrequency features: {freq_features}")


## Transaction Velocity


In [None]:
# Per-user transaction velocity features via
# FeatureEngineer.calculate_transaction_velocity. Runs only when the data
# loaded and both the user and timestamp columns are present.
velocity_required_columns = ("user_id", "purchase_time")
if fraud_df is not None and all(name in fraud_df.columns for name in velocity_required_columns):
    # The amount column is optional: pass it only when the frame has it.
    if "purchase_value" in fraud_df.columns:
        velocity_amount_column = "purchase_value"
    else:
        velocity_amount_column = None

    fraud_df = feature_engineer.calculate_transaction_velocity(
        fraud_df,
        user_column="user_id",
        datetime_column="purchase_time",
        amount_column=velocity_amount_column,
    )
    print("Transaction velocity features created")
    print("\nVelocity features:")
    # Report columns whose name contains either marker substring.
    velocity_markers = ("velocity", "time_since_last")
    velocity_features = [
        name
        for name in fraud_df.columns
        if any(marker in name for marker in velocity_markers)
    ]
    print(velocity_features)


## Complete Feature Engineering Pipeline

Alternatively, `engineer_all_features` runs the complete pipeline in a single call (shown commented out below):


In [None]:
# Example: Complete feature engineering in one step
#
# Kept commented out: the cells above already build features step by step.
# NOTE(review): presumably this call is meant to be used INSTEAD of those
# cells, on a freshly loaded fraud_df — confirm against
# FeatureEngineer.engineer_all_features before enabling.
# When uncommenting, wrap it in `if fraud_df is not None:` — the amount_column
# expression reads fraud_df.columns, and fraud_df is None when Fraud_Data.csv
# could not be loaded.
#
# fraud_df = feature_engineer.engineer_all_features(
#     fraud_df,
#     user_column="user_id",
#     purchase_datetime="purchase_time",
#     signup_datetime="signup_time",
#     amount_column="purchase_value" if "purchase_value" in fraud_df.columns else None
# )
