In [None]:
# Loan Default Prediction - End-to-End Annotated Model

# ==============================
# 1. INSTALL DEPENDENCIES
# ==============================
!pip install tensorflow==2.15.0 --quiet

# ==============================
# 2. IMPORT LIBRARIES
# ==============================
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import zipfile
import os

# Sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# ==============================
# 3. LOAD DATA FROM ZIP FILE
# ==============================
# https://www.kaggle.com/datasets/yasserh/loan-default-dataset/discussion/522084
zip_path = '/content/Loan_Default.csv.zip'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Do not blindly take entry 0: an archive may list directory entries or
    # macOS "__MACOSX/" metadata ahead of the actual data file.
    archive_members = [
        member for member in zip_ref.namelist()
        if not member.endswith('/') and not member.startswith('__MACOSX/')
    ]
    if not archive_members:
        raise ValueError(f"No readable file found inside {zip_path!r}")
    # Prefer a member with a .csv extension; otherwise fall back to the first
    # regular file, which is what the archive is expected to contain.
    csv_filename = next(
        (member for member in archive_members if member.lower().endswith('.csv')),
        archive_members[0],
    )
    with zip_ref.open(csv_filename) as csv_file:
        df = pd.read_csv(csv_file)
# Keep an unzipped copy of the raw table in the current working directory.
df.to_csv('Loan_Default.csv', index=False)

# ==============================
# 4. CLEAN DATA
# ==============================
# Build the working frame from the raw data, leaving `df` itself untouched,
# and discard the columns this analysis does not use.
loan = df.drop(columns=[
    'ID', 'loan_limit', 'Gender', 'construction_type', 'Secured_by',
    'total_units', 'credit_type', 'co-applicant_credit_type', 'Region',
    'Neg_ammortization', 'year', 'submission_of_application',
    'approv_in_adv',
])
loan = loan.drop_duplicates()

# Offset copies of property_value and loan_amount (fixed shifts of 8000 and
# 6500); the originals are dropped further down.
loan['property_value1'] = loan['property_value'].sub(8000)
loan['loan_amount1'] = loan['loan_amount'].sub(6500)

# Object-typed columns: NaN becomes an explicit "Missing" category.
cat = [col for col, dtype in loan.dtypes.items() if dtype == 'object']
loan[cat] = loan[cat].fillna('Missing')

# Non-object columns go through KNN imputation, except the target and the
# fields that are recomputed or dropped below.
to_remove = ['Status', 'LTV', 'property_value', 'loan_amount']
num = [
    col for col, dtype in loan.dtypes.items()
    if dtype != 'object' and col not in to_remove
]

# NOTE(review): the imputer is fitted on every row, before the train/test
# split performed later in this file, so held-out rows influence the imputed
# values.
knn = KNNImputer(n_neighbors=3)
loan[num] = knn.fit_transform(loan[num])

# Loan-to-value as a percentage from the imputed, offset columns; infinite
# ratios are turned into NaN so the dropna() below removes those rows.
loan['LTV_1'] = (
    loan['loan_amount1']
    .div(loan['property_value1'])
    .mul(100)
    .replace([np.inf, -np.inf], np.nan)
)

# Remove the superseded columns, then any row that still has a missing value.
loan_clean = loan.drop(columns=['loan_amount', 'property_value', 'LTV']).dropna()

# Persist the cleaned table for the modelling stage.
loan_clean.to_csv('/content/Loan_Default_Clean.csv', index=False)

# ==============================
# 5. SPLIT & PREPROCESS
# ==============================
# Run configuration.
DATA_PATH = "/content/Loan_Default_Clean.csv"
TARGET_COLUMN = "Status"
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Reload the cleaned table and separate the label from the features.
df = pd.read_csv(DATA_PATH)
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Stratified hold-out split so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Route columns to a transformer by dtype. Columns matching neither list are
# not passed to any transformer.
num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)

# Numeric branch: median imputation followed by standardisation.
num_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: constant fill, then dense one-hot encoding; categories
# unseen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

preprocessor = ColumnTransformer(transformers=[
    ("nums", num_pipe, num_cols),
    ("cats", cat_pipe, cat_cols),
])

# ==============================
# 6. TRAIN & EVALUATE FUNCTION
# ==============================
def train_and_evaluate(model, name):
    """Fit ``model`` behind the shared preprocessor and print test-set metrics.

    The estimator is wrapped in a pipeline together with the module-level
    ``preprocessor``, fitted on ``X_train``/``y_train`` and scored on
    ``X_test``/``y_test``. Accuracy, precision, recall, F1, the confusion
    matrix and the classification report are always printed. ROC AUC is
    printed whenever the fitted pipeline can produce a continuous score.

    Args:
        model: Unfitted scikit-learn style classifier; it is fitted in place.
        name: Label used in the printed section header.

    Returns:
        None. All results are written to stdout.
    """
    # NOTE: the module-level `preprocessor` instance is used directly (not
    # cloned), so every call refits that shared object.
    pipe = Pipeline([
        ("preproc", preprocessor),
        ("clf", model),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # ROC AUC needs a continuous score. Prefer the probability of the second
    # class; otherwise fall back to decision_function so margin-based
    # classifiers without predict_proba are still scored.
    if hasattr(pipe, "predict_proba"):
        y_score = pipe.predict_proba(X_test)[:, 1]
    elif hasattr(pipe, "decision_function"):
        y_score = pipe.decision_function(X_test)
    else:
        y_score = None

    print(f"\n=== {name} ===")
    print("Accuracy:  ", accuracy_score(y_test, y_pred))
    print("Precision: ", precision_score(y_test, y_pred))
    print("Recall:    ", recall_score(y_test, y_pred))
    print("F1 Score:  ", f1_score(y_test, y_pred))
    if y_score is not None:
        print("ROC AUC:   ", roc_auc_score(y_test, y_score))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

# ==============================
# 7. CLASSIC ML MODELS
# ==============================
# Baseline estimators, evaluated one after another in the listed order.
classic_models = [
    ("Logistic Regression",
     LogisticRegression(solver="liblinear", random_state=RANDOM_STATE)),
    ("Decision Tree",
     DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("Random Forest",
     RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
    ("k-Nearest Neighbors",
     KNeighborsClassifier(n_neighbors=5)),
]
for model_label, estimator in classic_models:
    train_and_evaluate(estimator, model_label)

# ==============================
# 8. NEURAL NETWORK
# ==============================
# Imported here so this section carries its own dependency; tensorflow is
# already required by the imports at the top of the file.
from tensorflow.keras.utils import set_random_seed

# Seed Python's `random`, NumPy and TensorFlow in one call so weight
# initialisation, dropout masks and batch shuffling are repeatable, in line
# with the RANDOM_STATE used for the split and the classic models.
# NOTE: this resets those global RNGs for any code that runs afterwards, and
# some GPU kernels may still be non-deterministic.
set_random_seed(RANDOM_STATE)

# Fit the preprocessing on the training rows only, then apply it to both sets.
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Two hidden ReLU layers with dropout and a single sigmoid output unit.
nn = Sequential([
    Dense(64, activation="relu", input_shape=(X_train_proc.shape[1],)),
    Dropout(0.5),
    Dense(32, activation="relu"),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])

nn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

print("\nTraining Neural Network...")
# validation_split holds back a tenth of the training rows for per-epoch
# validation; the test set is not touched during training.
nn.fit(X_train_proc, y_train, epochs=20, batch_size=32, validation_split=0.1, verbose=2)

# Sigmoid outputs are used as scores; 0.5 is the decision threshold.
y_proba_nn = nn.predict(X_test_proc).ravel()
y_pred_nn = (y_proba_nn > 0.5).astype(int)

print("\n=== Feed-forward Neural Network ===")
print("Accuracy:  ", accuracy_score(y_test, y_pred_nn))
print("Precision: ", precision_score(y_test, y_pred_nn))
print("Recall:    ", recall_score(y_test, y_pred_nn))
print("F1 Score:  ", f1_score(y_test, y_pred_nn))
print("ROC AUC:   ", roc_auc_score(y_test, y_proba_nn))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_nn))
print("Classification Report:\n", classification_report(y_test, y_pred_nn))
