In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score

# ------------------------
# 1. Load & Preprocess Data
# ------------------------
df = pd.read_csv(r"C:\Users\Relig\Downloads\LoanApproval-ML-RL\data\accepted_2007_to_2018.csv")

# Simplify target
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])]
df['loan_status'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

# Feature selection
features = ['loan_amnt', 'int_rate', 'annual_inc', 'dti', 
            'emp_length', 'home_ownership', 'purpose']
df = df[features + ['loan_status']].dropna()

# Encode categorical
for col in ['home_ownership','purpose','emp_length']:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Scale numerical
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Train/test split
X = df[features]
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32).view(-1,1)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.float32).view(-1,1)

print("Train shape:", X_train_t.shape, " Test shape:", X_test_t.shape)

# ------------------------
# 2. Define Model
# ------------------------
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.layers(x)

model = MLP(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ------------------------
# 3. Training Loop
# ------------------------
for epoch in range(10):
    optimizer.zero_grad()
    y_pred = model(X_train_t)
    loss = criterion(y_pred, y_train_t)
    loss.backward()
    optimizer.step()
    if epoch % 2 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item()}")

# ------------------------
# 4. Evaluation
# ------------------------
y_test_pred = model(X_test_t).detach().numpy()
auc = roc_auc_score(y_test, y_test_pred)
f1 = f1_score(y_test, (y_test_pred>0.5).astype(int))

print("AUC:", auc)
print("F1-Score:", f1)


  df = pd.read_csv(r"C:\Users\Relig\Downloads\LoanApproval-ML-RL\data\accepted_2007_to_2018.csv")


Train shape: torch.Size([1013425, 7])  Test shape: torch.Size([253357, 7])
Epoch 0, Loss: 0.694806694984436
Epoch 2, Loss: 0.6453614830970764
Epoch 4, Loss: 0.6059209108352661
Epoch 6, Loss: 0.5737894773483276
Epoch 8, Loss: 0.5474066734313965
AUC: 0.5919672934965796
F1-Score: 0.0


In [None]:
# =============================
# 1_EDA_Preprocessing_and_DL_Fixed.ipynb
# =============================

# -----------------------------
# 1️⃣ IMPORT LIBRARIES
# -----------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix, RocCurveDisplay

# -----------------------------
# 2️⃣ LOAD & PREPROCESS DATA
# -----------------------------
df = pd.read_csv(r"C:\Users\Relig\Downloads\LoanApproval-ML-RL\data\accepted_2007_to_2018.csv", low_memory=False)

# Keep only relevant target classes
df = df[df['loan_status'].isin(['Fully Paid', 'Charged Off'])].copy()
df['loan_status'] = df['loan_status'].map({'Fully Paid': 0, 'Charged Off': 1})

# Select features
features = ['loan_amnt', 'int_rate', 'annual_inc', 'dti', 'emp_length', 'home_ownership', 'purpose']
df = df[features + ['loan_status']].dropna()

# Convert interest rate to float
df['int_rate'] = df['int_rate'].astype(str).str.rstrip('%')
df['int_rate'] = pd.to_numeric(df['int_rate'], errors='coerce') / 100.0
df = df.dropna()

# Encode categorical features
for col in ['home_ownership','purpose','emp_length']:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Scale numeric features
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])

# Train/test split with stratification
X = df[features]
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to PyTorch tensors
X_train_t = torch.tensor(X_train.values, dtype=torch.float32)
y_train_t = torch.tensor(y_train.values, dtype=torch.float32).view(-1,1)
X_test_t = torch.tensor(X_test.values, dtype=torch.float32)
y_test_t = torch.tensor(y_test.values, dtype=torch.float32).view(-1,1)

print("Train shape:", X_train_t.shape, " Test shape:", X_test_t.shape)

# -----------------------------
# 3️⃣ DEFINE MLP MODEL (NO SIGMOID)
# -----------------------------
class MLP(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)  # no sigmoid
        )
    def forward(self, x):
        return self.layers(x)

model = MLP(X_train.shape[1])

# -----------------------------
# 4️⃣ HANDLE CLASS IMBALANCE
# -----------------------------
num_pos = y_train.sum()
num_neg = len(y_train) - num_pos
pos_weight = torch.tensor(num_neg / num_pos, dtype=torch.float32)  # for BCEWithLogitsLoss

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# -----------------------------
# 5️⃣ TRAINING LOOP
# -----------------------------
epochs = 50
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    y_pred = model(X_train_t)
    loss = criterion(y_pred, y_train_t)
    loss.backward()
    optimizer.step()
    if epoch % 5 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

# -----------------------------
# 6️⃣ EVALUATION
# -----------------------------
model.eval()
y_test_pred_logits = model(X_test_t).detach()
y_test_pred = torch.sigmoid(y_test_pred_logits).numpy()  # apply sigmoid here

# Metrics
auc = roc_auc_score(y_test, y_test_pred)
f1 = f1_score(y_test, (y_test_pred>0.5).astype(int))
print("\n===== EVALUATION =====")
print("AUC:", round(auc,4))
print("F1-Score:", round(f1,4))

# Confusion Matrix
y_pred_label = (y_test_pred>0.5).astype(int)
cm = confusion_matrix(y_test, y_pred_label)
print("\nConfusion Matrix:\n", cm)

# ROC Curve
RocCurveDisplay.from_predictions(y_test, y_test_pred)
plt.show()

# -----------------------------
# 7️⃣ NOTES / JUSTIFICATIONS
# -----------------------------
"""
Model Justification:
- MLP with 2 hidden layers (128 → 64 neurons) captures non-linear feature interactions.
- ReLU activations for faster convergence.
- Dropout (0.2) reduces overfitting.
- No Sigmoid in final layer: use BCEWithLogitsLoss with pos_weight to handle imbalance.
- Class imbalance handled using pos_weight (~neg/pos ratio).

Next Steps:
- Can tune learning rate, batch size, or layers.
- Could apply SMOTE or advanced architectures.
- Metrics ready for comparison with RL agent in Task 4.
"""


Train shape: torch.Size([1013425, 7])  Test shape: torch.Size([253357, 7])


RuntimeError: output with shape [1013425, 1] doesn't match the broadcast shape [1013425, 1013425]