# 03 - Baseline Models Training

## Section D: Machine Learning Model Design

This notebook implements and compares two scalable machine learning models:
1. **Random Forest** - Tree-based ensemble (scalable)
2. **XGBoost** - Gradient boosting (state-of-the-art for tabular data)

Both models are:
- Poorly suited to small toy datasets (they need substantial data to perform well)
- Scalable through parallel processing
- Capable of handling class imbalance

In [None]:
# Imports
import time
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

import sys
sys.path.append('..')

from config import PROCESSED_DATA_DIR, RESULTS_DIR, RANDOM_STATE
from src.models import ModelTrainer, train_all_models
from src.evaluation import (
    compute_metrics, evaluate_at_scales, 
    plot_roc_curves, plot_precision_recall_curves,
    generate_evaluation_report
)
from src.visualization import (
    plot_class_distribution, plot_confusion_matrix,
    plot_feature_importance, plot_metrics_comparison,
    create_summary_dashboard
)
from sklearn.metrics import confusion_matrix

print('Imports successful!')

In [None]:
# Read the processed loan dataset into `df`, which the later cells consume.
# NOTE(review): assumes PROCESSED_DATA_DIR is a pathlib.Path and that the CSV
# carries a 0/1 'default' target column -- confirm against the preprocessing step.
processed_csv = PROCESSED_DATA_DIR / 'loan_data_processed.csv'
df = pd.read_csv(processed_csv)

# deep=True makes memory_usage count the contents of object columns as well.
memory_mb = df.memory_usage(deep=True).sum() / 1e6
rule = '=' * 60

print(rule)
print('DATASET OVERVIEW')
print(rule)
print(f'Total records: {len(df):,}')
# The column count below includes the 'default' target column itself.
print(f'Total features: {len(df.columns)}')
print(f'Memory usage: {memory_mb:.2f} MB')

# Target balance: absolute counts per class, then the mean of the target as a percentage.
print('\nTarget distribution:')
target = df['default']
print(target.value_counts())
print(f'\nDefault rate: {target.mean()*100:.2f}%')

## Model 1: Random Forest Classifier

### Why Random Forest fits the data characteristics:
- **Handles mixed feature types**: Numerical and categorical features
- **Robust to outliers**: Tree-based splitting is largely unaffected by extreme feature values
- **Feature importance**: Built-in importance scores for interpretability
- **Scalable**: Parallelizable with n_jobs=-1
- **Handles imbalance**: class_weight='balanced' adjusts for class distribution

In [None]:
# Build the trainer with the project-wide seed taken from config.
trainer = ModelTrainer(random_state=RANDOM_STATE)

# Derive the four train/test pieces from the full frame, using 'default' as the target.
# NOTE(review): split ratio, stratification and any preprocessing are decided inside
# ModelTrainer.prepare_data -- check src/models.py before relying on them.
data_splits = trainer.prepare_data(df, target_col='default')
X_train, X_test, y_train, y_test = data_splits

In [None]:
# Section banner for the Random Forest run (blank line, rule, title, rule).
print('\n' + '='*60, 'TRAINING RANDOM FOREST', '='*60, sep='\n')

# Fit the model registered under 'RandomForest'. Hyperparameter tuning is
# switched off for this baseline run (tune_hyperparameters=False).
rf_fit = trainer.train_model(
    'RandomForest',
    X_train, y_train,
    X_test, y_test,
    tune_hyperparameters=False,
)

# train_model hands back a (model, results) pair; keep both for later cells.
rf_model, rf_results = rf_fit

In [None]:
# Random Forest feature importance: print the top 15 rows, plot them, save the PNG.
rf_importance = trainer.get_feature_importance('RandomForest')
print('\nTop 15 Most Important Features (Random Forest):')
print(rf_importance.head(15).to_string(index=False))

# This is the first cell in the notebook that saves into RESULTS_DIR. savefig does
# not create missing directories, so make sure the folder exists before writing
# (harmless no-op when it is already there).
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

fig = plot_feature_importance(rf_importance, top_n=15,
                              title='Random Forest Feature Importance')
# plt.savefig writes the *current* pyplot figure.
# NOTE(review): assumes plot_feature_importance leaves its figure current -- confirm.
plt.savefig(RESULTS_DIR / 'rf_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

## Model 2: XGBoost Classifier

### Why XGBoost fits the data characteristics:
- **State-of-the-art for tabular data**: Frequently a top performer in machine learning competitions
- **Handles imbalance**: scale_pos_weight parameter
- **Regularization**: L1/L2 regularization prevents overfitting
- **Scalable**: tree_method='hist' for faster training on large datasets
- **Missing value handling**: Built-in handling of missing values

In [None]:
# Section banner for the XGBoost run (blank line, rule, title, rule).
print('\n' + '='*60, 'TRAINING XGBOOST', '='*60, sep='\n')

# Negatives per positive in the training labels (labels compared against 0 and 1).
# The ratio is only reported here; it is not passed to train_model below.
# NOTE(review): confirm ModelTrainer applies its own imbalance handling for XGBoost.
negatives = (y_train == 0).sum()
positives = (y_train == 1).sum()
imbalance_ratio = negatives / positives
print(f'\nImbalance ratio: {imbalance_ratio:.2f}')

# Fit the model registered under 'XGBoost' with hyperparameter tuning switched off.
xgb_fit = trainer.train_model(
    'XGBoost',
    X_train, y_train,
    X_test, y_test,
    tune_hyperparameters=False,
)

# train_model hands back a (model, results) pair; keep both for later cells.
xgb_model, xgb_results = xgb_fit

In [None]:
# XGBoost feature importance: print the top 15 rows, plot them, save the PNG.
xgb_importance = trainer.get_feature_importance('XGBoost')
print('\nTop 15 Most Important Features (XGBoost):')
top_xgb_rows = xgb_importance.head(15)
print(top_xgb_rows.to_string(index=False))

xgb_plot_path = RESULTS_DIR / 'xgb_feature_importance.png'
fig = plot_feature_importance(
    xgb_importance,
    top_n=15,
    title='XGBoost Feature Importance',
)
# plt.savefig writes the current pyplot figure, not necessarily `fig`.
plt.savefig(xgb_plot_path, dpi=300, bbox_inches='tight')
plt.show()

## Model Comparison

In [None]:
# Section banner for the comparison table (blank line, rule, title, rule).
print('\n' + '='*60, 'MODEL COMPARISON', '='*60, sep='\n')

# Collect the trainer's comparison table. It stays in comparison_df because the
# final cell of the notebook writes it to CSV.
comparison_df = trainer.compare_models()
comparison_text = comparison_df.to_string(index=False)
print(comparison_text)

In [None]:
# ROC curves for every model held by the trainer, evaluated on the test split.
# The output path is handed to the helper via save_path (presumably it writes the
# PNG itself -- verify in src/evaluation.py).
roc_path = RESULTS_DIR / 'roc_comparison.png'
fig = plot_roc_curves(trainer.models, X_test, y_test, save_path=roc_path)
plt.show()

In [None]:
# Confusion matrices: one heatmap panel per trained model.
# The grid is sized from trainer.models rather than hard-coded to two columns, so
# an extra model gets its own panel instead of being silently dropped by zip(),
# and a single model does not leave an empty panel. With two models the figure
# is still 14x5 inches.
class_labels = ['Non-Default', 'Default']
n_panels = max(len(trainer.models), 1)  # plt.subplots needs at least one column
fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 5), squeeze=False)

# squeeze=False always yields a 2-D array of Axes; ravel() flattens it for iteration.
for ax, (name, model) in zip(axes.ravel(), trainer.models.items()):
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    # fmt='d' prints the raw integer counts in each cell.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_labels,
                yticklabels=class_labels)
    ax.set_title(f'{name} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Computational cost: training time and throughput for every trained model.
print('\n' + '='*60)
print('COMPUTATIONAL COST ANALYSIS')
print('='*60)

n_train = len(X_train)  # loop-invariant, so computed once
print(f'\nTraining Data Size: {n_train:,} samples x {X_train.shape[1]} features')
print(f'Test Data Size: {len(X_test):,} samples')

for model_name, results in trainer.results.items():
    train_time = results['training_time']
    # A recorded time of 0.0 (coarse timer, tiny dataset) would make the division
    # raise ZeroDivisionError; report the rate as unavailable instead.
    if train_time > 0:
        throughput = f'{n_train / train_time:,.0f}'
    else:
        throughput = 'n/a'
    print(f'{model_name}: {train_time:.2f}s ({throughput} samples/s)')

In [None]:
# Save results: the comparison table and both feature-importance rankings as CSV,
# then every fitted model as a joblib file. All outputs go into RESULTS_DIR.
csv_exports = {
    'model_comparison.csv': comparison_df,
    'rf_feature_importance.csv': rf_importance,
    'xgb_feature_importance.csv': xgb_importance,
}
for filename, table in csv_exports.items():
    table.to_csv(RESULTS_DIR / filename, index=False)

# joblib is imported with the other dependencies in the first code cell rather
# than mid-notebook. File name = lower-cased model key + '_model.joblib'.
for name, model in trainer.models.items():
    joblib.dump(model, RESULTS_DIR / f'{name.lower()}_model.joblib')

print('Results and models saved!')