# Task 2: Model Building and Training

This notebook demonstrates the complete model building and training pipeline for fraud detection, including:
1. Data preparation with stratified train-test split
2. Baseline model (Logistic Regression)
3. Ensemble model (Random Forest/XGBoost/LightGBM)
4. Model evaluation (AUC-PR, F1-Score, Confusion Matrix)
5. Cross-validation (Stratified K-Fold)
6. Model comparison and selection


In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add src to path
sys.path.append(str(Path.cwd().parent / "src"))

from preprocessor import PreprocessingPipeline
from model_pipeline import ModelPipeline
from data_preparator import DataPreparator
from model_trainer import ModelTrainer
from model_evaluator import ModelEvaluator
from cross_validator import CrossValidator

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

print("All modules imported successfully!")


## Step 1: Data Preprocessing

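First, we'll preprocess the data using the pipeline from Task 1. If a processed CSV already exists in `../data/processed`, it is loaded instead of re-running the pipeline.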


In [None]:
# Initialize preprocessing pipeline
preprocessing_pipeline = PreprocessingPipeline(
    data_dir="../data/raw",
    output_dir="../data/processed"
)

# `metadata` is only produced by a fresh preprocessing run. Bind it up front so
# the name exists (as None) when the cached CSV is loaded instead; otherwise
# the notebook's state would depend on whether the cache file happens to exist.
metadata = None

# Process fraud data (or load if already processed)
try:
    # Try to load processed data first
    processed_df = pd.read_csv("../data/processed/processed_fraud_data.csv")
    print(f"Loaded processed data: {processed_df.shape}")
except FileNotFoundError:
    # If not found, run preprocessing
    print("Processed data not found. Running preprocessing pipeline...")
    processed_df, metadata = preprocessing_pipeline.process_fraud_data(
        fraud_data_file="Fraud_Data.csv",
        ip_country_file="IpAddress_to_Country.csv",
        target_column="class",
        user_column="user_id",
        purchase_datetime="purchase_time",
        signup_datetime="signup_time",
        ip_column="ip_address",
        perform_eda=False,  # Skip EDA for faster processing
        handle_imbalance=False,  # We'll handle this in model training
        save_processed=True
    )
    print(f"Preprocessing complete. Shape: {processed_df.shape}")


## Step 2: Data Preparation

Prepare data for model training with stratified train-test split.


In [None]:
# Prepare data with stratified train-test split
data_preparator = DataPreparator()

# Determine target column name: prefer "class", accept "Class". Fail here with
# a clear message if neither exists, rather than handing a non-existent column
# name to prepare_data() and getting an unrelated-looking error downstream.
target_col = next(
    (name for name in ("class", "Class") if name in processed_df.columns),
    None,
)
if target_col is None:
    raise KeyError(
        "processed_df has neither a 'class' nor a 'Class' column; "
        f"available columns: {list(processed_df.columns)}"
    )

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = data_preparator.prepare_data(
    processed_df,
    target_column=target_col,
    test_size=0.2,
    random_state=42
)

print(f"\nTraining set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


## Step 3: Train Baseline Model (Logistic Regression)


In [None]:
# Fit the Logistic Regression baseline on the training split
model_trainer = ModelTrainer()

# 'balanced' class weights are used to handle the class imbalance;
# the seed is fixed for reproducibility.
baseline_params = {"class_weight": "balanced", "random_state": 42}
baseline_model = model_trainer.train_baseline_model(X_train, y_train, **baseline_params)

print("Baseline model trained successfully!")


## Step 4: Train Ensemble Model

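Choose one of Random Forest, XGBoost, or LightGBM. This notebook trains a Random Forest; call `train_xgboost()` or `train_lightgbm()` instead to try the alternatives.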


In [None]:
# Fit the Random Forest ensemble model on the training split.
# Alternative: call train_xgboost() or train_lightgbm() instead.

# Hyperparameters gathered in one place so they are easy to tweak.
rf_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
)

ensemble_model = model_trainer.train_random_forest(X_train, y_train, **rf_params)

print("Ensemble model trained successfully!")


## Step 5: Evaluate Models

Evaluate both models on the held-out test set using PR-AUC, F1-Score, ROC-AUC, and the confusion matrix.


In [None]:
# Evaluator used for every model in this notebook
model_evaluator = ModelEvaluator(output_dir="../models/evaluation_outputs")

# Score the baseline on the held-out test split
baseline_results = model_evaluator.evaluate_model(
    baseline_model, X_test, y_test, model_name="Logistic Regression", plot=True
)

# Report the headline metrics, one per line, to four decimal places
print("\nBaseline Model Results:")
for metric_label, metric_key in (
    ("PR-AUC", "pr_auc"),
    ("F1-Score", "f1_score"),
    ("ROC-AUC", "roc_auc"),
):
    print(f"  {metric_label}: {baseline_results[metric_key]:.4f}")


In [None]:
# Score the ensemble on the same held-out test split as the baseline
ensemble_results = model_evaluator.evaluate_model(
    ensemble_model, X_test, y_test, model_name="Random Forest", plot=True
)

# Report the headline metrics, one per line, to four decimal places
print("\nEnsemble Model Results:")
for metric_label, metric_key in (
    ("PR-AUC", "pr_auc"),
    ("F1-Score", "f1_score"),
    ("ROC-AUC", "roc_auc"),
):
    print(f"  {metric_label}: {ensemble_results[metric_key]:.4f}")


## Step 6: Cross-Validation (Stratified K-Fold)

Perform 5-fold cross-validation on the training set for a more reliable performance estimate than a single train-test split.


In [None]:
# Set up the cross-validator: 5 splits with a fixed seed
cross_validator = CrossValidator(n_splits=5, random_state=42)

# Cross-validate the baseline on the training split only.
# NOTE(review): baseline_model is already fitted and may be saved later as the
# best model. Confirm that cross_validate() works on a copy/clone and does not
# refit this object in place.
cv_inputs = (X_train, y_train)
baseline_cv = cross_validator.cross_validate(
    baseline_model, *cv_inputs, model_name="Logistic Regression"
)


In [None]:
# Cross-validate the ensemble on the training split only.
# NOTE(review): ensemble_model is already fitted and may be saved later as the
# best model. Confirm that cross_validate() works on a copy/clone and does not
# refit this object in place.
ensemble_cv = cross_validator.cross_validate(
    ensemble_model, X_train, y_train, model_name="Random Forest"
)


## Step 7: Model Comparison and Selection

Compare all models side-by-side and select the best model


In [None]:
# Collect the test-set results of each model under its display name
comparison_results = dict([
    ('Logistic Regression', baseline_results),
    ('Random Forest', ensemble_results),
])

# Build the comparison table; the evaluator is also given a CSV file name for it
comparison_df = model_evaluator.compare_models(
    comparison_results, output_file="model_comparison.csv"
)

display(comparison_df)


## Step 8: Select Best Model

Select the best model based on PR-AUC (most important for imbalanced data) and interpretability


In [None]:
# Select best model explicitly by PR-AUC (the primary metric for imbalanced
# data) instead of taking the first row of `comparison_df`, which is only
# correct if compare_models() happens to return rows sorted by that metric.
best_model_name = max(
    comparison_results,
    key=lambda name: comparison_results[name]['pr_auc']
)
best_model_metrics = comparison_results[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"\nJustification:")
print(f"  - PR-AUC: {best_model_metrics['pr_auc']:.4f} (primary metric for imbalanced data)")
print(f"  - F1-Score: {best_model_metrics['f1_score']:.4f}")
print(f"  - Precision: {best_model_metrics['precision']:.4f}")
print(f"  - Recall: {best_model_metrics['recall']:.4f}")

# Keep a handle on the winning estimator for any follow-up cells
if best_model_name == "Logistic Regression":
    best_model = baseline_model
else:
    best_model = ensemble_model

# Save best model. The snake_case form of the display name is used both as the
# first argument to save_model() and in the output file name, so compute it once.
# NOTE(review): save_model() receives this name string, not `best_model`;
# presumably the trainer looks up its own stored estimator under that key.
# Confirm against ModelTrainer.save_model.
best_model_key = best_model_name.lower().replace(" ", "_")
model_trainer.save_model(
    best_model_key,
    f"../models/{best_model_key}_best_model.joblib"
)

print(f"\nBest model saved to models/ directory")
