In [16]:
!pip install xgboost imbalanced-learn





[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings("ignore")


In [18]:
df = pd.read_csv(
    r"C:\Users\Yashwi\OneDrive\Desktop\IITK\Summer Project 2024\Application of Probability Theory\Loan Default\accepted_2007_to_2018Q4.csv\accepted_2007_to_2018Q4.csv",
    usecols=[
        "loan_status", "annual_inc", "fico_range_low", "dti", "loan_amnt", "purpose",
        "int_rate", "emp_length", "revol_util", "delinq_2yrs", "home_ownership", "term"
    ],
    nrows=2000000
)


In [19]:
# binary classification
df = df[df["loan_status"].isin(["Charged Off", "Fully Paid"])]
df["default"] = df["loan_status"].apply(lambda x: 1 if x == "Charged Off" else 0)

# emp_length to numeric
def parse_emp_length(x):
    if pd.isnull(x):
        return np.nan
    if '<' in x:
        return 0
    if '10+' in x:
        return 10
    return int(x.split()[0])
df["emp_length"] = df["emp_length"].apply(parse_emp_length)

# drop missing rows
df.dropna(inplace=True)

# one-hot encode categorical variables
df = pd.get_dummies(df, columns=["purpose", "home_ownership", "term"], drop_first=True)

# features and label
X = df.drop(columns=["loan_status", "default"])
y = df["default"]


In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [21]:
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train, y_train)


In [38]:
neg, pos = np.bincount(y_train)
scale_pos_weight = neg / pos
print(f"Scale Pos Weight: {scale_pos_weight:.2f}")

from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    objective="binary:logistic",        # binary classification
    eval_metric="logloss",              # avoid deprecation warning
    use_label_encoder=False,            # suppress label encoder warning
    n_estimators=200,                   # number of trees
    max_depth=5,                        # control overfitting
    learning_rate=0.1,                  # step size (lower = more conservative)
    subsample=0.8,                      # row sampling to reduce overfitting
    colsample_bytree=0.8,               # feature sampling per tree
    scale_pos_weight=scale_pos_weight,  # handle class imbalance
    random_state=42                     # reproducibility
)

xgb_model.fit(X_train_bal, y_train_bal)
y_pred = xgb_model.predict(X_test)
y_proba = xgb_model.predict_proba(X_test)[:, 1]


Scale Pos Weight: 4.20


In [40]:
# empirical stats
n = len(y_test)
actual_defaults = y_pred.sum()
empirical_rate = y_train.mean()

# theoretical mean and standard deviation
expected_defaults = n * empirical_rate
std_dev = np.sqrt(n * empirical_rate * (1 - empirical_rate))

k = 2  # 95% confidence
chebyshev_upper_bound = expected_defaults + k * std_dev

print("Actual Test Set Defaults:", actual_defaults)
print(f"Expected Defaults (mean): {expected_defaults:.2f}")
print(f"Chebyshev's Upper Bound (95% confidence): {chebyshev_upper_bound:.2f}")


Actual Test Set Defaults: 155623
Expected Defaults (mean): 44515.50
Chebyshev's Upper Bound (95% confidence): 44894.78


In [41]:
# predict probability for sample borrower
new_borrower = pd.DataFrame([{
    "annual_inc": 2000000,
    "fico_range_low": 700,
    "dti": 12.5,
    "loan_amnt": 8000,
    "int_rate": 12.0,
    "emp_length": 5,
    "revol_util": 30.0,
    "delinq_2yrs": 0,
    # Set all one-hot encoded cols to 0
    **{col: 0 for col in X.columns if col.startswith("purpose_") or col.startswith("home_") or col.startswith("term_")}
}], columns=X.columns)

new_borrower_scaled = scaler.transform(new_borrower)
prob_default = xgb_model.predict_proba(new_borrower_scaled)[0, 1]
print("🔍 Predicted Default Probability:", round(prob_default, 4))


🔍 Predicted Default Probability: 0.4351


In [42]:
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix, recall_score

# accuracy (not the best for imbalanced data)
accuracy = accuracy_score(y_test, y_pred)

# AUC Score
auc = roc_auc_score(y_test, y_proba)

# classification report
report = classification_report(y_test, y_pred, target_names=["Fully Paid", "Default"])

# confusion matrix
cm = confusion_matrix(y_test, y_pred)

# extract recall for defaulters (class 1)
recall_default = recall_score(y_test, y_pred)

# Print everything
print(f"Accuracy: {accuracy:.4f}")
print(f"AUC Score: {auc:.4f}")
print(f"Recall for Defaulters (Class 1): {recall_default:.4f}")
print("Classification Report:\n", report)
print("Confusion Matrix:\n", cm)


Accuracy: 0.4678
AUC Score: 0.7005
Recall for Defaulters (Class 1): 0.8629
Classification Report:
               precision    recall  f1-score   support

  Fully Paid       0.92      0.37      0.53    187187
     Default       0.25      0.86      0.38     44516

    accuracy                           0.47    231703
   macro avg       0.58      0.62      0.46    231703
weighted avg       0.79      0.47      0.50    231703

Confusion Matrix:
 [[ 69978 117209]
 [  6102  38414]]


In [43]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report

# random forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc_score = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"F1-Score: {f1:.3f}")
print("\nAUC score:", auc_score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# important features
importances = rf_model.feature_importances_
feature_names = X.columns

# top 10 features
top_indices = np.argsort(importances)[-10:][::-1]
print("Top 10 important features:")
for i in top_indices:
    print(f"{feature_names[i]}: {importances[i]:.4f}")


Accuracy: 0.809
F1-Score: 0.040

AUC score: 0.5085868626254635

Classification Report:
               precision    recall  f1-score   support

           0       0.81      1.00      0.89    187187
           1       0.60      0.02      0.04     44516

    accuracy                           0.81    231703
   macro avg       0.70      0.51      0.47    231703
weighted avg       0.77      0.81      0.73    231703

Top 10 important features:
int_rate: 0.4396
term_ 60 months: 0.2174
fico_range_low: 0.0891
dti: 0.0721
loan_amnt: 0.0503
annual_inc: 0.0318
home_ownership_MORTGAGE: 0.0260
home_ownership_RENT: 0.0232
revol_util: 0.0195
emp_length: 0.0063


In [44]:
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import unittest

# logistic regression
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_train, y_train)

# prediction latency on test set
start = time.time()
y_pred = lr_model.predict(X_test)
end = time.time()
latency_per_sample = (end - start) / len(X_test) * 1000  # in ms

accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Average prediction latency: {latency_per_sample:.3f} ms")

class TestModel(unittest.TestCase):
    def test_predict_shape(self):
        preds = lr_model.predict(X_test)
        self.assertEqual(preds.shape[0], X_test.shape[0])

    def test_predict_values(self):
        preds = lr_model.predict(X_test)
        self.assertTrue(set(preds).issubset({0,1}))

if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)


..

Accuracy: 0.809
Average prediction latency: 0.000 ms



----------------------------------------------------------------------
Ran 2 tests in 0.029s

OK
