In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.utils import resample

# === Load Dataset ===
# Every column is read as text (dtype=str); the columns that are actually used
# are converted explicitly below.
df = pd.read_csv(
    "/Users/apple/Desktop/loandefaultprediction/loan_deafult_prediction/Data/Dataset.csv",
    dtype=str,
    low_memory=False,
)

# === Clean + Convert Target ===
# Rows without a label are dropped before the label is cast to int.
df = df.dropna(subset=["Default"]).astype({"Default": int})

# === Select Only Numeric Columns for Simplicity ===
feature_cols = ["Client_Income", "Credit_Amount"]
for col in feature_cols:
    # errors="coerce" turns unparseable text into NaN; those rows are removed
    # by the dropna() right after the loop.
    df[col] = pd.to_numeric(df[col], errors="coerce")
df = df[feature_cols + ["Default"]].dropna()

X = df[feature_cols]
y = df["Default"]

# === Train-Test Split ===
# Stratify on the label so both partitions keep the class proportions of y;
# the fixed seed makes the split reproducible.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Display name -> classification_report dict, filled in by each experiment.
results = dict()

# === Baseline Random Forest ===
# Reference model: trained on the training split exactly as it came out of
# train_test_split, with no resampling applied.
rf = RandomForestClassifier(random_state=42)
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)
results["Random Forest (base)"] = classification_report(
    y_test,
    y_pred_rf,
    output_dict=True,
)

# === Baseline Naive Bayes ===
# Reference Gaussian Naive Bayes model trained on the unresampled training split.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)
# If the model never predicts a class, that class's precision is 0/0.
# zero_division=0 reports it as 0.0 -- the same value the default setting
# produces -- but without emitting an UndefinedMetricWarning for the per-class
# and averaged rows of the report.
results["Naive Bayes (base)"] = classification_report(
    y_test,
    y_pred_nb,
    output_dict=True,
    zero_division=0,
)

# === Random Forest + Undersampling ===
# Put the labels back next to the features so whole rows can be sampled by class.
df_train = pd.concat([X_train, y_train], axis=1)
labels = df_train["Default"]
majority = df_train[labels == 0]
minority = df_train[labels == 1]

# Resample majority
# Draw, without replacement, as many label-0 rows as there are label-1 rows.
majority_downsampled = resample(
    majority,
    replace=False,
    n_samples=len(minority),
    random_state=42,
)
df_resampled = pd.concat([majority_downsampled, minority])
y_resampled = df_resampled["Default"]
X_resampled = df_resampled.drop(columns="Default")

# The existing rf estimator is refit on the balanced rows and scored on the
# untouched test split.
y_pred_rus = rf.fit(X_resampled, y_resampled).predict(X_test)
results["Random Forest + Undersampling"] = classification_report(
    y_test,
    y_pred_rus,
    output_dict=True,
)

# === Random Forest + SMOTE ===
# Only the training split is resampled; predictions are still made on X_test.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X_train, y_train)

# Refit the existing rf estimator on the SMOTE output.
y_pred_sm = rf.fit(X_sm, y_sm).predict(X_test)
results["Random Forest + SMOTE"] = classification_report(
    y_test,
    y_pred_sm,
    output_dict=True,
)

# === Random Forest + SMOTEENN ===
# Only the training split is resampled; predictions are still made on X_test.
smenn = SMOTEENN(random_state=42)
X_se, y_se = smenn.fit_resample(X_train, y_train)

# Refit the existing rf estimator on the SMOTEENN output.
y_pred_se = rf.fit(X_se, y_se).predict(X_test)
results["Random Forest + SMOTEENN"] = classification_report(
    y_test,
    y_pred_se,
    output_dict=True,
)

# === Print All Results ===
# For each evaluated model, print a banner with its name followed by the
# classification report as a table (one row per class / average).
separator = "=" * 30
for model_name, report in results.items():
    # Concatenate rather than pass two arguments: print() would otherwise
    # insert its default " " separator and shift the top rule right by one
    # character relative to the bottom rule.
    print("\n" + separator)
    print(model_name)
    print(separator)
    print(pd.DataFrame(report).transpose())


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Random Forest (base)
              precision    recall  f1-score       support
0              0.922339  0.984745  0.952521  26352.000000
1              0.258303  0.060215  0.097663   2325.000000
accuracy       0.909788  0.909788  0.909788      0.909788
macro avg      0.590321  0.522480  0.525092  28677.000000
weighted avg   0.868502  0.909788  0.883213  28677.000000

Naive Bayes (base)
              precision    recall  f1-score       support
0              0.918925  1.000000  0.957750  26352.000000
1              0.000000  0.000000  0.000000   2325.000000
accuracy       0.918925  0.918925  0.918925      0.918925
macro avg      0.459462  0.500000  0.478875  28677.000000
weighted avg   0.844422  0.918925  0.880100  28677.000000

Random Forest + Undersampling
              precision    recall  f1-score       support
0              0.933545  0.536278  0.681224  26352.000000
1              0.097422  0.567312  0.166288   2325.000000
accuracy       0.538794  0.538794  0.538794      0.538794