# Step 3: Modeling & Interpretation
This notebook handles model training, artifact generation, and validation.

## 1. Environment Setup

In [None]:
import os
import sys
import joblib
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Ensure project root is in path for src imports
sys.path.append('..')
from src.features import add_engineered_features
from src.evaluation import generate_shap_plots

# Create the output folders for model artifacts and reports when they are missing
for output_dir in ('../models', '../reports'):
    os.makedirs(output_dir, exist_ok=True)

## 2. Train Dummy Model for Pipeline Validation
We fit a small model on the synthetic dataset to generate the preprocessor and model `.pkl` artifacts.

In [None]:
# Columns consumed by the preprocessor, grouped by the transformer applied to them
categorical_features = ['employment_type']
numeric_features = [
    'income',
    'loan_amount',
    'loan_duration_months',
    'credit_score',
    'age',
    'previous_defaults',
    'debt_to_income',
    'monthly_payment',
]

# Preprocessing: scale the numeric columns and one-hot encode the categorical one
# (handle_unknown='ignore' keeps unseen categories from raising at transform time)
numeric_step = ('num', StandardScaler(), numeric_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Read the raw dataset and separate the label column from the inputs
df = pd.read_csv('../data/raw/example.csv')
y = df['target']
X = df.drop(columns='target')

# Apply the project's feature-engineering helper before preprocessing
X_eng = add_engineered_features(X)

# Fit the preprocessor, then train a small classifier on the transformed features
X_proc = preprocessor.fit_transform(X_eng)
model = XGBClassifier(n_estimators=10, random_state=42)
model.fit(X_proc, y)

# Write both fitted objects to the models/ directory
for artifact, artifact_path in (
    (preprocessor, '../models/preprocessor.pkl'),
    (model, '../models/final_model.pkl'),
):
    joblib.dump(artifact, artifact_path)

print("Success: Model and preprocessor saved to models/ directory.")

## 3. Run Unit Tests
Run `test_model.py` with pytest to check that the saved model meets the project requirements.

In [None]:
print("Triggering pytest for model validation...")
!python -m pytest ../tests/test_model.py

## 4. Interpretation (SHAP)
Analyze feature contributions of the trained model: list the top five global feature importances and save a SHAP summary plot.

In [None]:
# Output feature names reported by the fitted preprocessor
feature_names = preprocessor.get_feature_names_out()

# Pair each importance score from the model with its feature name,
# then keep the five largest as a plain {name: score} mapping
importances = model.feature_importances_
feat_importances = pd.Series(data=importances, index=feature_names)
top_5 = feat_importances.nlargest(n=5).to_dict()

print("Top 5 Global Drivers:")
for name, score in top_5.items():
    print(f"{name}: {score:.4f}")

# Persist the top-5 mapping alongside the other model artifacts
joblib.dump(top_5, '../models/top_features.pkl')

# Render the SHAP summary through the project helper and write it to reports/
shap_plot_path = '../reports/shap_summary.png'
generate_shap_plots(model, X_proc, feature_names, output_path=shap_plot_path)
print("SHAP summary saved to reports/shap_summary.png")