# Loan Default Prediction with Synthetic Data

This notebook loads a synthetic dataset for binary classification (loan default prediction) and trains three models:

- Decision Tree
- Random Forest
- Neural Network

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns


## Load Synthetic Data

In [4]:
# Location of the loan dataset CSV (raw file served from GitHub).
url = 'https://raw.githubusercontent.com/trkrkn/aiforfinance/main/loan_data.csv'

# Read the remote CSV straight into a DataFrame; requires network access.
df = pd.read_csv(filepath_or_buffer=url)

## Preprocess Data

In [5]:
# Build the modelling frame from a copy so the raw `df` is never altered and
# this cell can be re-run on its own.
df_model = df.copy()

# Encode binary categorical features (and the target) as 0/1.
# Series.map leaves NaN for any value that is absent from its mapping, so an
# unexpected category shows up as a missing value rather than an error.
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Married': {'Yes': 1, 'No': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Loan_Status': {'Y': 1, 'N': 0},  # target
}
for column, encoding in binary_encodings.items():
    df_model[column] = df_model[column].map(encoding)

# One-hot encode multiclass categorical variables; drop_first=True omits the
# first level of each variable so the dummy columns are not redundant.
df_model = pd.get_dummies(df_model, columns=['Dependents', 'Collateral_Type'], drop_first=True)

# Drop Loan_ID (non-predictive); reassign instead of mutating in place
df_model = df_model.drop(columns=['Loan_ID'])

# Split into X and y
X = df_model.drop(columns=['Loan_Status'])
y = df_model['Loan_Status']

# Train/test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies before assigning columns below, so the scaled values
# are written to independent frames and never to a view of `X`
# (also avoids pandas' SettingWithCopyWarning on setups where it would fire).
X_train = X_train.copy()
X_test = X_test.copy()

# Scale numeric features: the scaler is fitted on the training rows only and
# then applied to the test rows, so no test-set statistics leak into training.
numeric_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# Check shapes
print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Train shape: (8000, 13) (8000,)
Test shape: (2000, 13) (2000,)


## Train and Evaluate Models

## Decision Tree

In [16]:
# Fit a depth-limited decision tree on the training split.
dt_model = DecisionTreeClassifier(random_state=42, max_depth=10).fit(X_train, y_train)

# Score the held-out test split: class-1 probabilities and hard labels.
# Column index 1 of predict_proba is the second class, i.e. label 1 when the
# target takes the values 0 and 1.
dt_proba = dt_model.predict_proba(X_test)[:, 1]
dt_pred = dt_model.predict(X_test)

### Output performance

In [15]:
# Threshold-free metric from the probabilities, per-class metrics from the labels.
dt_auc = roc_auc_score(y_test, dt_proba)
dt_report = classification_report(y_test, dt_pred, output_dict=True)

# Print the headline numbers for class '1', one item per line.
print(
    "Decision Tree Performance",
    f"ROC AUC: {dt_auc:.2f}",
    f"Precision (1): {dt_report['1']['precision']:.2f}",
    f"Recall (1): {dt_report['1']['recall']:.2f}",
    f"F1-Score (1): {dt_report['1']['f1-score']:.2f}",
    "-" * 40,
    sep="\n",
)

Decision Tree Performance
ROC AUC: 0.64
Precision (1): 0.55
Recall (1): 0.43
F1-Score (1): 0.48
----------------------------------------


## Random Forest

In [27]:
# Fit a 100-tree random forest on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Score the held-out test split: class-1 probabilities and hard labels.
# Column index 1 of predict_proba is the second class, i.e. label 1 when the
# target takes the values 0 and 1.
rf_proba = rf_model.predict_proba(X_test)[:, 1]
rf_pred = rf_model.predict(X_test)

### Output performance

In [28]:
# Threshold-free metric from the probabilities, per-class metrics from the labels.
rf_auc = roc_auc_score(y_test, rf_proba)
rf_report = classification_report(y_test, rf_pred, output_dict=True)

# Print the headline numbers for class '1', one item per line.
print(
    "Random Forest Performance",
    f"ROC AUC: {rf_auc:.2f}",
    f"Precision (1): {rf_report['1']['precision']:.2f}",
    f"Recall (1): {rf_report['1']['recall']:.2f}",
    f"F1-Score (1): {rf_report['1']['f1-score']:.2f}",
    "-" * 40,
    sep="\n",
)

Random Forest Performance
ROC AUC: 0.68
Precision (1): 0.59
Recall (1): 0.43
F1-Score (1): 0.50
----------------------------------------


## Neural Network

In [20]:
# Two hidden layers (128 and 64 units) with early stopping enabled.
# A smaller (64, 32) network without early stopping was the earlier setting.
nn_model = MLPClassifier(
    hidden_layer_sizes=(128, 64),
    max_iter=500,
    early_stopping=True,
    random_state=42,
)

# The numeric columns of X_train were standardised in the preprocessing cell.
nn_model.fit(X_train, y_train)

# Score the held-out test split: class-1 probabilities and hard labels.
nn_proba = nn_model.predict_proba(X_test)[:, 1]
nn_pred = nn_model.predict(X_test)

### Output performance

In [19]:
# Threshold-free metric from the probabilities, per-class metrics from the labels.
nn_auc = roc_auc_score(y_test, nn_proba)
nn_report = classification_report(y_test, nn_pred, output_dict=True)

# Print the headline numbers for class '1', one item per line.
print(
    "Neural Network Performance",
    f"ROC AUC: {nn_auc:.2f}",
    f"Precision (1): {nn_report['1']['precision']:.2f}",
    f"Recall (1): {nn_report['1']['recall']:.2f}",
    f"F1-Score (1): {nn_report['1']['f1-score']:.2f}",
    "-" * 40,
    sep="\n",
)

Neural Network Performance
ROC AUC: 0.71
Precision (1): 0.62
Recall (1): 0.44
F1-Score (1): 0.52
----------------------------------------


## All models together

In [None]:
# Candidate estimators, keyed by the display name used in the summary table.
# NOTE(review): the tree depth (8) and network size (64, 32) used here differ
# from the standalone Decision Tree (max_depth=10) and Neural Network
# ((128, 64)) cells, so this table will not reproduce those printed scores --
# confirm which settings are intended.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42, max_depth=8),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(64, 32),
        max_iter=500,
        early_stopping=True,
        random_state=42,
    ),
}

# Fit each estimator on the training split and collect its test-set metrics.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_proba)
    report = classification_report(y_test, y_pred, output_dict=True)
    results[name] = dict(classification_report=report, roc_auc=auc)

# Display model performance: one row per model, metrics formatted to 2 decimals
# (the metric cells are strings, not floats).
summary_df = pd.DataFrame([
    {
        "Model": model,
        "ROC AUC": f"{results[model]['roc_auc']:.2f}",
        "Precision (1)": f"{results[model]['classification_report']['1']['precision']:.2f}",
        "Recall (1)": f"{results[model]['classification_report']['1']['recall']:.2f}",
        "F1-Score (1)": f"{results[model]['classification_report']['1']['f1-score']:.2f}",
    }
    for model in results
])

summary_df
