In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import category_encoders as ce
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    roc_auc_score, average_precision_score, matthews_corrcoef, log_loss
)
import random

In [11]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the Excel workbook from the Kaggle dataset into a pandas DataFrame.
# `dataset_load` is the current kagglehub entry point; the older `load_dataset`
# alias only forwards to it and emits a deprecation warning on every call.
df = kagglehub.dataset_load(
    adapter=KaggleDatasetAdapter.PANDAS,
    handle="ducthanhvu/doan-2-cuoi-cung",
    path="Data doan cuoi cung.xlsx",
)


  df = kagglehub.load_dataset(


In [12]:
# Two hand-crafted interaction features. They are appended LAST to
# `selected_features` below on purpose: the training loop recovers them
# positionally with `x_tab[:, -2:]`, so their position must not change.
df["cbm_feature_1"] = df["wallet_usage_frequency"] * df["bank_avg_balance"]
df["cbm_feature_2"] = df["risk_score_raw"] * df["loan_count_score"]

# Expand and enrich text feature
# The sentences below are synthetic: each row gets a randomly chosen template
# filled with a randomly chosen purpose, independent of every other column
# (including the label), so by construction this text carries no signal.
loan_purpose_templates = [
    "I need this loan to pay for my {purpose}.",
    "This funding will help cover my {purpose}.",
    "The loan is intended for {purpose}.",
    "I am applying to cover expenses related to {purpose}.",
    "These funds will support my {purpose} needs.",
    "The money will be used primarily for {purpose}."
]
loan_purposes = [
    "home renovation", "medical expenses", "debt consolidation", "starting a small business",
    "education tuition fees", "vehicle repairs", "wedding expenses", "family emergency",
    "travel plans", "childcare costs", "unforeseen bills", "buying new equipment",
    "moving to a new home", "paying off credit card debt", "supporting a relative"
]
# A dedicated, seeded generator makes the generated column identical on every
# Restart & Run All without touching the global `random` state.
text_rng = random.Random(42)
df["loan_purpose_text"] = [
    text_rng.choice(loan_purpose_templates).format(purpose=text_rng.choice(loan_purposes))
    for _ in range(len(df))
]

# Encode features and prepare inputs
label_col = "default"
text_col = "loan_purpose_text"
selected_features = [
    "age", "gender", "marital_status", "residential_area", "monthly_income",
    "estimated_monthly_expense", "employment_status", "job_type",
    "total_outstanding_debt", "number_of_current_loans", "total_late_payments",
    "num_loans_from_app", "num_late_payments_in_app", "has_bank_account_linked",
    "bank_avg_balance", "has_e_wallet_linked", "wallet_usage_frequency",
    "cbm_feature_1", "cbm_feature_2"
]
X_tabular = df[selected_features].copy()
X_text = df[text_col].astype(str)
y = df[label_col]

# Object-typed columns are integer-coded, then every column is standardised.
# NOTE(review): the encoder and scaler are fitted on all rows, before the
# train/test split performed later in the notebook, so statistics of the test
# rows leak into preprocessing. Fitting on the training rows only would need
# the split to happen first.
categorical_cols = X_tabular.select_dtypes(include="object").columns.tolist()
encoder = ce.OrdinalEncoder(cols=categorical_cols)
X_encoded = encoder.fit_transform(X_tabular)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_encoded)

In [13]:
# Tokenise the loan-purpose sentences once, up front. Sequences are truncated
# to at most 32 tokens and padded to a common length so the result is a single
# rectangular tensor.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
encoded_text = tokenizer(
    X_text.tolist(), padding=True, truncation=True, max_length=32, return_tensors="pt"
)
# Labels as a float column vector, the shape BCEWithLogitsLoss expects against
# the model's (batch, 1) output.
y_tensor = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)

# All four aligned arrays are split in one call so that rows stay matched.
# train_test_split returns a (train, test) pair per input, in input order:
# tabular, input_ids, labels, attention_mask.
# `stratify=y` keeps the default / non-default ratio identical in both splits;
# the loss below uses a positive-class weight, i.e. the label is treated as
# imbalanced, and an unstratified split can skew the minority share.
X_tab_train, X_tab_test, ids_train, ids_test, y_train, y_test, mask_train, mask_test = train_test_split(
    torch.tensor(X_scaled, dtype=torch.float32), encoded_text["input_ids"], y_tensor,
    encoded_text["attention_mask"], test_size=0.2, random_state=42, stratify=y
)
train_dataset = TensorDataset(X_tab_train, ids_train, mask_train, y_train)
test_dataset = TensorDataset(X_tab_test, ids_test, mask_test, y_test)
# Only the training loader is shuffled; evaluation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
class TabFinBERT_CBModel(nn.Module):
    """Three-branch network that fuses tabular, text and "cbm" inputs.

    Branches (each produces a ``d_model``-wide vector per row):
      * tabular: linear projection followed by a 2-layer Transformer encoder
        applied to a length-1 sequence;
      * text: a pretrained Hugging Face encoder whose ``pooler_output`` is
        linearly projected;
      * cbm: a small Linear/ReLU/Dropout stack over a 2-feature input.

    The three vectors are concatenated and passed through an MLP head that
    ends in a single output unit. No sigmoid is applied inside the model.

    Args:
        tab_input_dim: number of tabular input features.
        text_model_name: Hugging Face model id loaded via ``AutoModel``.
        d_model: width shared by all three branch outputs.
        n_heads: attention heads in the tabular Transformer layers.
        dropout: dropout probability used in the Transformer, cbm branch
            and head.
    """

    def __init__(self, tab_input_dim, text_model_name="yiyanghkust/finbert-tone", d_model=128, n_heads=4, dropout=0.1):
        super().__init__()

        # --- tabular branch ---
        self.tabular_proj = nn.Linear(tab_input_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dropout=dropout,
            batch_first=True,
        )
        self.tab_transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # --- text branch ---
        self.text_encoder = AutoModel.from_pretrained(text_model_name)
        text_hidden = self.text_encoder.config.hidden_size
        self.text_proj = nn.Linear(text_hidden, d_model)

        # --- cbm branch (expects exactly two input features) ---
        self.cbm_proj = nn.Sequential(
            nn.Linear(2, d_model),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # --- fusion head over the concatenated branch outputs ---
        self.final = nn.Sequential(
            nn.Linear(d_model * 3, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x_tab, input_ids, attention_mask, cbm):
        """Return one raw score per row.

        Args:
            x_tab: tabular features; expected 2-D ``(batch, tab_input_dim)``
                so the branch outputs can be concatenated along dim 1.
            input_ids: token ids forwarded to the text encoder.
            attention_mask: attention mask forwarded to the text encoder.
            cbm: the two-feature cbm input, ``(batch, 2)``.

        Returns:
            Tensor of shape ``(batch, 1)`` produced by the final linear layer.
        """
        # Treat each row as a one-token sequence for the Transformer encoder,
        # then drop the sequence axis again.
        tab_tokens = self.tabular_proj(x_tab).unsqueeze(1)
        tab_out = self.tab_transformer(tab_tokens).squeeze(1)

        encoder_output = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        text_vec = self.text_proj(encoder_output.pooler_output)

        cbm_vec = self.cbm_proj(cbm)

        fused = torch.cat((tab_out, text_vec, cbm_vec), dim=1)
        return self.final(fused)

# Train on GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TabFinBERT_CBModel(tab_input_dim=X_tab_train.shape[1]).to(device)


from sklearn.utils.class_weight import compute_class_weight
# "balanced" weights are inversely proportional to class frequency; their
# ratio (positive / negative) is the up-weighting applied to positive rows.
class_weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=df[label_col])
# Built as float32 directly on the target device so it matches the logits'
# dtype (a NumPy-derived tensor would otherwise default to float64).
pos_weight = torch.tensor([class_weights[1] / class_weights[0]], dtype=torch.float32, device=device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = optim.AdamW(model.parameters(), lr=2e-4, weight_decay=1e-4)
# `verbose=True` is deprecated on LR schedulers; learning-rate reductions are
# reported explicitly inside the loop instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=4, factor=0.5)

train_losses, test_losses = [], []
best_loss, patience, counter = float('inf'), 8, 0
# Defined before the loop so the restore step below is safe even if the loss
# never improves (e.g. it is NaN from the first epoch).
best_model_state = None

# NOTE(review): the same held-out split drives LR scheduling, early stopping
# and checkpoint selection, so metrics reported on it are optimistically
# biased. A separate validation split would remove that bias.
for epoch in range(1, 151):
    # ---- training pass ----
    model.train()
    total_train_loss = 0
    for x_tab, ids, mask, yb in train_loader:
        x_tab, ids, mask, yb = x_tab.to(device), ids.to(device), mask.to(device), yb.to(device)
        # The two cbm interaction features are the last two tabular columns.
        cbm = x_tab[:, -2:]
        optimizer.zero_grad()
        logits = model(x_tab, ids, mask, cbm)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
    train_losses.append(total_train_loss / len(train_loader))

    # ---- evaluation pass ----
    model.eval()
    total_test_loss, all_preds, all_labels = 0, [], []
    with torch.no_grad():
        for x_tab, ids, mask, yb in test_loader:
            x_tab, ids, mask, yb = x_tab.to(device), ids.to(device), mask.to(device), yb.to(device)
            cbm = x_tab[:, -2:]
            logits = model(x_tab, ids, mask, cbm)
            loss = criterion(logits, yb)
            total_test_loss += loss.item()
            # Flatten (batch, 1) -> (batch,) so the collected lists hold plain
            # floats rather than one-element arrays.
            all_preds.extend(torch.sigmoid(logits).view(-1).cpu().tolist())
            all_labels.extend(yb.view(-1).cpu().tolist())
    avg_test_loss = total_test_loss / len(test_loader)
    test_losses.append(avg_test_loss)
    auc = roc_auc_score(all_labels, all_preds)
    print(f"Epoch {epoch:02d} | Train Loss: {train_losses[-1]:.4f} | Test Loss: {avg_test_loss:.4f} | AUC: {auc:.4f}")

    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(avg_test_loss)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after < lr_before:
        print(f"Reducing learning rate from {lr_before:.2e} to {lr_after:.2e}.")

    if avg_test_loss < best_loss:
        best_loss, counter = avg_test_loss, 0
        # state_dict() returns references to the live parameter tensors, which
        # keep changing as training continues. Clone them (onto the CPU, to
        # avoid holding a second copy in GPU memory) so this really is a
        # snapshot of the best epoch.
        best_model_state = {
            name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        counter += 1
        if counter >= patience:
            print("Early stopping triggered.")
            break

# Restore the weights of the best epoch (load_state_dict copies the values
# back into the parameters on whatever device the model lives on).
if best_model_state is not None:
    model.load_state_dict(best_model_state)

2025-05-14 08:18:43.017917: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747210723.215048      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747210723.279389      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Epoch 01 | Train Loss: 0.7005 | Test Loss: 0.7004 | AUC: 0.5212
Epoch 02 | Train Loss: 0.6999 | Test Loss: 0.7000 | AUC: 0.5253
Epoch 03 | Train Loss: 0.6995 | Test Loss: 0.6995 | AUC: 0.5303
Epoch 04 | Train Loss: 0.6989 | Test Loss: 0.6997 | AUC: 0.5336
Epoch 05 | Train Loss: 0.6984 | Test Loss: 0.6987 | AUC: 0.5366
Epoch 06 | Train Loss: 0.6975 | Test Loss: 0.6985 | AUC: 0.5389
Epoch 07 | Train Loss: 0.6961 | Test Loss: 0.6968 | AUC: 0.5467
Epoch 08 | Train Loss: 0.6945 | Test Loss: 0.6950 | AUC: 0.5554
Epoch 09 | Train Loss: 0.6921 | Test Loss: 0.6945 | AUC: 0.5614
Epoch 10 | Train Loss: 0.6886 | Test Loss: 0.6895 | AUC: 0.5721
Epoch 11 | Train Loss: 0.6845 | Test Loss: 0.6868 | AUC: 0.5778
Epoch 12 | Train Loss: 0.6798 | Test Loss: 0.6805 | AUC: 0.5894
Epoch 13 | Train Loss: 0.6742 | Test Loss: 0.6820 | AUC: 0.5930
Epoch 14 | Train Loss: 0.6681 | Test Loss: 0.6714 | AUC: 0.6093
Epoch 15 | Train Loss: 0.6616 | Test Loss: 0.6632 | AUC: 0.6273
Epoch 16 | Train Loss: 0.6552 | Test Los

In [1]:
# Persist the trained weights (state dict only, not the full module object).
# This cell needs `torch` and `model` from the cells above, so it cannot be
# run on its own in a fresh kernel.
checkpoint_path = "/kaggle/working/model_final.pt"
torch.save(model.state_dict(), checkpoint_path)

NameError: name 'torch' is not defined

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Per-epoch loss curves recorded by the training loop. "Test" here is the
# held-out split evaluated after every epoch.
loss_fig, loss_ax = plt.subplots(figsize=(8, 5))
for loss_history, curve_label in ((train_losses, "Train Loss"), (test_losses, "Test Loss")):
    loss_ax.plot(loss_history, label=curve_label)
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Training vs Validation Loss per Epoch")
loss_ax.legend()
loss_ax.grid(True)
plt.show()

In [None]:
import shap
class SimpleTabModel(nn.Module):
    """Stand-alone wrapper around a model's ``tabular_proj`` layer.

    Only the tabular projection is exposed, so the output of this module is
    the projected feature vector, not the full model's prediction.

    The layer is deep-copied rather than shared. ``nn.Module.to`` works in
    place, so moving a wrapper that shared the layer (for example to the CPU
    for SHAP) would also move that layer inside the original model and leave
    the original model split across devices.

    Args:
        model: any module exposing a ``tabular_proj`` sub-module.
    """

    def __init__(self, model):
        super().__init__()
        import copy  # local import: keeps this cell self-contained

        self.model = copy.deepcopy(model.tabular_proj)

    def forward(self, x):
        """Apply the copied tabular projection to ``x``."""
        return self.model(x)

# SHAP is run on the CPU over the first 1000 scaled rows, which serve both as
# the background data handed to the explainer and as the rows being explained.
# NOTE(review): SimpleTabModel exposes only the tabular projection layer, so
# these attributions describe that layer's outputs rather than the full
# model's default prediction - confirm this is the intended target.
shap_model = SimpleTabModel(model).to("cpu")
shap_rows = torch.tensor(X_scaled[:1000], dtype=torch.float32)
explainer = shap.Explainer(shap_model, shap_rows)
shap_values = explainer(shap_rows)
# Feature values are displayed on the encoded (unscaled) scale for readability.
shap.summary_plot(
    shap_values.values,
    features=X_encoded.iloc[:1000],
    feature_names=X_encoded.columns.tolist(),
)

In [None]:
# Force plot for the first explained row. initjs() loads the JavaScript the
# interactive plot needs in the notebook front end.
shap.initjs()
# NOTE(review): assumes `explainer.expected_value` is indexable and that
# `shap_values.values[0]` is one attribution per feature - confirm against
# the explainer type and output shape actually produced above.
force_plot_kwargs = dict(
    base_value=explainer.expected_value[0],
    shap_values=shap_values.values[0],
    features=X_encoded.iloc[0],
    feature_names=X_encoded.columns.tolist(),
)
shap.force_plot(**force_plot_kwargs)

In [None]:
# Bar chart of the SHAP values averaged along axis 0 (presumably the row
# axis - confirm), limited to the ten largest bars.
mean_shap_values = shap_values.mean(0)
shap.plots.bar(mean_shap_values, max_display=10)

In [None]:
import seaborn as sns
# `y_true`, `y_pred` and `y_pred_binary` are not defined by any earlier cell,
# so they are computed here from the restored model on the held-out loader.
# The evaluation cells that follow reuse `y_true` and `y_pred`.
model.to(device)  # make sure every sub-module is back on the training device
model.eval()
prob_batches, label_batches = [], []
with torch.no_grad():
    for x_tab, ids, mask, yb in test_loader:
        x_tab, ids, mask = x_tab.to(device), ids.to(device), mask.to(device)
        # Same convention as in training: the last two tabular columns are
        # the cbm features.
        logits = model(x_tab, ids, mask, x_tab[:, -2:])
        prob_batches.append(torch.sigmoid(logits).view(-1).cpu())
        label_batches.append(yb.view(-1).cpu())
y_pred = torch.cat(prob_batches).numpy()                 # predicted probabilities
y_true = torch.cat(label_batches).numpy().astype(int)    # 0/1 ground truth
# NOTE(review): 0.5 is an assumed decision threshold; tune it if a different
# precision/recall trade-off is wanted.
y_pred_binary = (y_pred >= 0.5).astype(int)

cm = confusion_matrix(y_true, y_pred_binary)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["No Default", "Default"], yticklabels=["No Default", "Default"])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix (Seaborn)")
plt.show()

In [None]:
from sklearn.metrics import roc_curve, precision_recall_curve
# ROC curve of the predicted probabilities (`y_pred`) against the true labels
# (`y_true`); both must already exist in the kernel. The dashed diagonal is
# the chance-level reference.
fpr, tpr, _ = roc_curve(y_true, y_pred)
roc_ax = plt.gca()
roc_ax.plot(fpr, tpr, label="ROC Curve")
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve")
roc_ax.grid(True)
roc_ax.legend()
plt.show()

In [None]:
from sklearn.calibration import calibration_curve
# Reliability diagram: predictions are grouped into 10 bins and the observed
# positive rate of each bin is plotted against its mean predicted
# probability. The dashed grey diagonal marks perfect calibration.
# Relies on `y_true` / `y_pred` already existing in the kernel.
prob_true, prob_pred = calibration_curve(y_true, y_pred, n_bins=10)
calib_ax = plt.gca()
calib_ax.plot(prob_pred, prob_true, marker='o')
calib_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
calib_ax.set_xlabel("Mean predicted probability")
calib_ax.set_ylabel("True probability in bin")
calib_ax.set_title("Calibration Curve")
calib_ax.grid(True)
plt.show()