# Model Training

Train Logistic Regression, Random Forest, and XGBoost models, handling class imbalance with SMOTE.

In [1]:
import sys
import os

# go one level up (from Notebook/ to project root)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from joblib import dump
from src.config import PROCESSED_DATA_PATH, MODEL_DIR, TEST_SIZE, RANDOM_STATE, SMOTE_RANDOM_STATE
from src.utils import log_info

# Load engineered data
data = pd.read_csv(PROCESSED_DATA_PATH.replace('.csv', '_engineered.csv'))
X = data.drop('Loan_Status', axis=1)
y = data['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
log_info(f'Data split: X_train shape {X_train.shape}, X_test shape {X_test.shape}')

2025-09-11 02:39:45,567 - INFO - Data split: X_train shape (491, 15), X_test shape (123, 15)


In [2]:
# Handle class imbalance with SMOTE
smote = SMOTE(random_state=SMOTE_RANDOM_STATE)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
log_info(f'SMOTE applied: X_train_smote shape {X_train_smote.shape}')

2025-09-11 02:39:45,628 - INFO - SMOTE applied: X_train_smote shape (684, 15)


In [3]:
# Train models
log_model = LogisticRegression(random_state=RANDOM_STATE)
rf_model = RandomForestClassifier(random_state=RANDOM_STATE)
xgb_model = XGBClassifier(random_state=RANDOM_STATE, eval_metric='logloss')

log_model.fit(X_train_smote, y_train_smote)
rf_model.fit(X_train_smote, y_train_smote)
xgb_model.fit(X_train_smote, y_train_smote)

log_info('Models trained: LogisticRegression, RandomForest, XGBoost')

# Save models
dump(log_model, os.path.join(MODEL_DIR, 'logistic_regression.sav'))
dump(rf_model, os.path.join(MODEL_DIR, 'random_forest.sav'))
dump(xgb_model, os.path.join(MODEL_DIR, 'xgb_model.sav'))
log_info('Models saved to models/ directory')

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2025-09-11 02:39:46,356 - INFO - Models trained: LogisticRegression, RandomForest, XGBoost
2025-09-11 02:39:46,484 - INFO - Models saved to models/ directory
