# Logistic Regression

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    precision_recall_curve,
)
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import joblib

In [29]:
import sys
import os
root_path = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if root_path not in sys.path:
    sys.path.append(root_path)
from fileDir import getDataDir, getModelDir, getPredDir

In [30]:
CLEANED_PATH = getDataDir("cleaned", 2)
FEATURE_PATH = getModelDir("train_features_model_logistic", 1, True)
SCALER_PATH = getModelDir("scaler_model_logistic", 1, True)
MODEL_PATH = getModelDir("model_logistic", 1, True)

Load and Prepare Data

In [31]:
df = pd.read_csv(CLEANED_PATH)


target = "default_12month"
X = df.drop(columns=[target])
y = df[target]

X = pd.get_dummies(X, drop_first=True)

feature_names = X.columns.tolist()
joblib.dump(feature_names, FEATURE_PATH)

['../../model/v1/train_features_model_logistic.pkl']

Split and Scale

In [32]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
joblib.dump(scaler, SCALER_PATH)

['../../model/v1/scaler_model_logistic.pkl']

Handle Imbalance

In [33]:
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

Train Logistic Regression

In [34]:
model = LogisticRegression(
    solver="lbfgs",  # works well for smaller datasets and L1/L2 regularization
    class_weight="balanced",  # helps with imbalance
    max_iter=1000,
)
model.fit(X_train_res, y_train_res)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


Evaluate

In [35]:
y_proba = model.predict_proba(X_test)[:, 1]

# Find best threshold using F1
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_threshold = thresholds[np.argmax(f1_scores)]

y_pred = (y_proba >= best_threshold).astype(int)

print("\nOptimal Threshold:", round(best_threshold, 3))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))


Optimal Threshold: 0.521

Confusion Matrix:
[[1926 1273]
 [ 256  265]]

Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.60      0.72      3199
         1.0       0.17      0.51      0.26       521

    accuracy                           0.59      3720
   macro avg       0.53      0.56      0.49      3720
weighted avg       0.78      0.59      0.65      3720

ROC-AUC Score: 0.5702147804106249


Save Model

In [36]:
joblib.dump(model, MODEL_PATH)
print("\nLogistic Regression model saved successfully as model_logistic.pkl")


Logistic Regression model saved successfully as model_logistic.pkl
