# 05_ML_Models_RF_XGB_LGBM
Train Random Forest, XGBoost, and LightGBM classifiers on SMOTE-resampled training data and evaluate each one by ROC AUC on the held-out test set.

In [None]:
# Common imports for the project
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' style for plots drawn in this notebook.
sns.set(style='whitegrid')
# IPython magic (not plain Python): show matplotlib figures inline in the notebook output.
%matplotlib inline


In [None]:
from src.preprocessing import basic_cleaning
from src.feature_engineering import create_features
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, classification_report
import joblib

In [None]:
# Read the merged dataset, then run it through the project's cleaning and
# feature-engineering steps.
df = create_features(basic_cleaning(pd.read_csv('../data/merged_data.csv')))

# Model inputs: every numeric column except the label and the other listed columns.
target = 'default'
exclude = [target, 'id', 'index', 'source']
numeric_columns = df.select_dtypes(include=[np.number]).columns
features = [col for col in numeric_columns if col not in exclude]

# Missing feature values are replaced with 0; the label is cast to int.
X = df[features].fillna(0)
y = df[target].astype(int)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# SMOTE is fitted on the training portion only; the test split is not resampled.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train, y_train)
print('Resampled shape:', X_res.shape)

In [None]:
# Candidate classifiers, keyed by the short name used in the printed report
# and in the saved artifact's filename. Each one is seeded with random_state=42.
models = {
    'rf': RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
    'xgb': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'lgbm': LGBMClassifier(random_state=42)
}

# joblib.dump does not create missing parent directories. Make sure the output
# folder exists up front so a fresh checkout does not fail after a model has
# already been trained.
os.makedirs('../models', exist_ok=True)

# Maps model name -> ROC AUC on the held-out test split.
results = {}
for name, m in models.items():
    print('Training', name)
    # Fit on the SMOTE-resampled training data.
    m.fit(X_res, y_res)
    # Score on the test split, which was not resampled. Column 1 of
    # predict_proba is the second class in m.classes_ (the positive class
    # for 0/1 labels).
    probs = m.predict_proba(X_test)[:,1]
    auc = roc_auc_score(y_test, probs)
    print(f'{name} AUC: {auc:.4f}')
    results[name] = auc
    # Persist the fitted estimator for reuse outside this notebook.
    joblib.dump(m, f'../models/{name}.joblib')
print('Results:', results)