# Step 3: Modeling & Interpretation
This notebook handles model training, hyperparameter tuning, artifact generation, and validation.

## 1. Environment Setup

In [None]:
import os
import sys
import joblib
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score

# Make the parent directory importable so the local `src` package resolves
sys.path.append('..')
from src.features import add_engineered_features
from src.evaluation import generate_shap_plots

# Create the artifact output folders up front (no-op when they already exist)
for _artifact_dir in ('../models', '../reports'):
    os.makedirs(_artifact_dir, exist_ok=True)

## 2. Preprocessing & Data Prep

In [None]:
# Columns routed to each branch of the preprocessing transformer
numeric_features = [
    'income',
    'loan_amount',
    'loan_duration_months',
    'credit_score',
    'age',
    'previous_defaults',
    'debt_to_income',
    'monthly_payment',
]
categorical_features = ['employment_type']

# Standardize the numeric columns and one-hot encode the categorical ones;
# categories not seen during fit are ignored rather than raising at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
)

# Load the raw data and separate the label column from the predictors
df = pd.read_csv('../data/raw/example.csv')
y = df['target']
X = df.drop(columns='target')

# Add the engineered columns, then fit the preprocessor and transform in one pass
X_eng = add_engineered_features(X)
X_proc = preprocessor.fit_transform(X_eng)

## 3. Hyperparameter Tuning
Use `RandomizedSearchCV` (10 sampled candidates per model, 3-fold stratified cross-validation) to tune XGBoost and Random Forest for ROC-AUC.

In [None]:
def _run_randomized_search(estimator, param_distributions, features, target, cv,
                           display_name, score_label, n_iter=10, seed=42):
    """Tune one estimator with RandomizedSearchCV and report its best ROC-AUC.

    Both models are tuned through this single helper so their search settings
    (scoring, number of candidates, CV splitter, seed) cannot drift apart.

    Parameters
    ----------
    estimator : unfitted classifier to tune.
    param_distributions : dict mapping parameter names to candidate values.
    features, target : training matrix and labels passed to ``fit``.
    cv : cross-validation splitter shared by every search.
    display_name : model name used in the "Tuning ..." progress message.
    score_label : short model name used in the "Best ... ROC-AUC" message.
    n_iter : number of parameter settings sampled per search.
    seed : random_state used when sampling parameter settings.

    Returns
    -------
    The fitted RandomizedSearchCV instance.
    """
    search = RandomizedSearchCV(
        estimator,
        param_distributions=param_distributions,
        n_iter=n_iter,
        scoring='roc_auc',
        cv=cv,
        verbose=1,
        random_state=seed,
    )
    print(f"Tuning {display_name}...")
    search.fit(features, target)
    print(f"Best {score_label} ROC-AUC: {search.best_score_:.4f}")
    return search


# One shuffled, stratified 3-fold splitter reused by both searches
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 3.1 XGBoost Tuning
xgb_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9]
}

xgb_search = _run_randomized_search(
    XGBClassifier(random_state=42),
    xgb_param_grid,
    X_proc,
    y,
    skf,
    display_name="XGBoost",
    score_label="XGB",
)

# 3.2 Random Forest Tuning
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_search = _run_randomized_search(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    X_proc,
    y,
    skf,
    display_name="Random Forest",
    score_label="RF",
)

## 4. Finalize Best Model
Compare the two tuned models by cross-validated ROC-AUC and save the better-performing one together with the fitted preprocessor.

In [None]:
# Keep whichever tuned estimator has the higher search score (XGBoost wins a tie)
if xgb_search.best_score_ >= rf_search.best_score_:
    best_model = xgb_search.best_estimator_
else:
    best_model = rf_search.best_estimator_

# Persist the preprocessor and the selected model as separate artifacts
joblib.dump(preprocessor, '../models/preprocessor.pkl')
joblib.dump(best_model, '../models/final_model.pkl')

print(f"Final model ({type(best_model).__name__}) and preprocessor saved.")

## 5. Run Unit Tests & Interpretation

In [None]:
print("Triggering pytest for model validation...")
!python -m pytest ../tests/test_model.py

# Pair each transformed column name with the selected model's importance score
feature_names = preprocessor.get_feature_names_out()
importances = best_model.feature_importances_
feat_importances = pd.Series(data=importances, index=feature_names)

# Save the five highest-importance features as a {name: importance} mapping
top_5 = feat_importances.nlargest(n=5).to_dict()
joblib.dump(top_5, '../models/top_features.pkl')

# Hand the model, transformed matrix and column names to the project SHAP helper
generate_shap_plots(
    best_model,
    X_proc,
    feature_names,
    output_path='../reports/shap_summary.png',
)
print("Artifacts updated.")