 ## **1. Data Pre-processing**

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

# Load the datasets
train_url = "https://raw.githubusercontent.com/mh2t/DS5110/main/Homework/HW4-Train.csv"
valid_url = "https://raw.githubusercontent.com/mh2t/DS5110/main/Homework/HW4-Validation.csv"

train = pd.read_csv(train_url)
valid = pd.read_csv(valid_url)

# Add a source flag BEFORE concatenating so the two sets can be separated
# again after the shared cleaning / encoding steps.
train["source"] = "train"
valid["source"] = "valid"

# Combine training and validation data so that one-hot encoding produces the
# same dummy columns for both sets.
df = pd.concat([train, valid], axis=0).reset_index(drop=True)

print(df.columns.tolist())

# 1. Check and handle missing values
print("Missing values per column:\n", df.isna().sum())

# Rows containing any missing value are dropped. The number of removed rows is
# reported so the "only a few rows are affected" assumption can be verified.
n_rows_before = len(df)
df = df.dropna()
print("Rows dropped due to missing values:", n_rows_before - len(df))

# 2. Separate predictors (X) and target (y)
target_col = "Default_ind"

y = df[target_col]
source = df["source"]
X = df.drop(columns=[target_col, "source"])

# 3. Encode categorical variables (States)
cat_cols = X.select_dtypes(include=["object", "category"]).columns
print("Categorical columns:", list(cat_cols))

X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# 4. Build positional row masks for the two sets. They are needed before
#    scaling so that the scaler can be fitted on training rows only.
mask_train = (source == "train").values
mask_valid = (source == "valid").values

# 5. Standardize the features.
#    The scaler learns its mean / standard deviation from the TRAINING rows
#    only; fitting it on the combined frame would leak validation-set
#    statistics into the features the models are trained and evaluated on.
#    The fitted scaler is then applied to every row, so X_scaled still covers
#    train + validation in the original row order.
scaler = StandardScaler()
scaler.fit(X.loc[mask_train])
X_scaled = scaler.transform(X)

print("Data preprocessing completed.")
print("Final shape:", X_scaled.shape)

# 6. Split back into train / valid using 'source'
X_train = X_scaled[mask_train]
y_train = y[mask_train]

X_valid = X_scaled[mask_valid]
y_valid = y[mask_valid]

print("Train size:", X_train.shape, y_train.shape)
print("Validation size:", X_valid.shape, y_valid.shape)


['tot_credit_debt', 'avg_card_debt', 'credit_age', 'credit_good_age', 'card_age', 'non_mtg_acc_past_due_12_months_num', 'non_mtg_acc_past_due_6_months_num', 'mortgages_past_due_6_months_num', 'credit_past_due_amount', 'inq_12_month_num', 'card_inq_24_month_num', 'card_open_36_month_num', 'auto_open_ 36_month_num', 'uti_card', 'uti_50plus_pct', 'uti_max_credit_line', 'uti_card_50plus_pct', 'ind_acc_XYZ', 'rep_income', 'States', 'Default_ind', 'source']
Missing values per column:
 tot_credit_debt                          0
avg_card_debt                            0
credit_age                               0
credit_good_age                          0
card_age                                 0
non_mtg_acc_past_due_12_months_num       0
non_mtg_acc_past_due_6_months_num        0
mortgages_past_due_6_months_num          0
credit_past_due_amount                   0
inq_12_month_num                         0
card_inq_24_month_num                    0
card_open_36_month_num                   0


## **2. Logistic Regression Model**

In [None]:
# Train a logistic regression classifier on the training split
# (the solver is allowed up to 2000 iterations).
lr = LogisticRegression(max_iter=2000).fit(X_train, y_train)

# Score the validation split: hard class labels, plus the predicted
# probability taken from the second column of predict_proba.
y_pred_prob = lr.predict_proba(X_valid)[:, 1]
y_pred = lr.predict(X_valid)

# Report validation metrics, one per line, in a fixed order.
print("\n=== Logistic Regression Performance ===")
for label, value in (
    ("Accuracy:", accuracy_score(y_valid, y_pred)),
    ("AUC:", roc_auc_score(y_valid, y_pred_prob)),
    ("Confusion Matrix:\n", confusion_matrix(y_valid, y_pred)),
    ("\nClassification Report:\n", classification_report(y_valid, y_pred)),
):
    print(label, value)

# Pair every encoded feature with its fitted coefficient and rank them from
# the most positive to the most negative.
coef_df = pd.DataFrame({"feature": X.columns, "coefficient": lr.coef_[0]})
coef_df = coef_df.sort_values(by="coefficient", ascending=False)

print("\n=== Top Positive Coefficients (higher default risk) ===")
print(coef_df.head(10))

print("\n=== Top Negative Coefficients (lower default risk) ===")
print(coef_df.tail(10))


=== Logistic Regression Performance ===
Accuracy: 0.9393449251920744
AUC: 0.8213424964803017
Confusion Matrix:
 [[2270   20]
 [ 130   53]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.95      0.99      0.97      2290
         1.0       0.73      0.29      0.41       183

    accuracy                           0.94      2473
   macro avg       0.84      0.64      0.69      2473
weighted avg       0.93      0.94      0.93      2473


=== Top Positive Coefficients (higher default risk) ===
                               feature  coefficient
13                            uti_card     0.558554
5   non_mtg_acc_past_due_12_months_num     0.464246
1                        avg_card_debt     0.371700
9                     inq_12_month_num     0.274913
7      mortgages_past_due_6_months_num     0.258833
6    non_mtg_acc_past_due_6_months_num     0.168581
16                 uti_card_50plus_pct     0.106207
11              card_open_36_month_

## **3. Random Forest Model**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline Random Forest.
# Hyperparameters used below:
# - n_estimators=300: number of trees in the forest
# - max_depth=None: trees are not depth-limited
# - min_samples_leaf=10: every leaf must hold at least 10 samples
#   (larger values reduce overfitting)
# - class_weight="balanced": re-weights classes to compensate for the
#   imbalance between defaults and non-defaults
# - random_state=0: fixed seed for reproducibility
# - n_jobs=-1: use all available CPU cores
rf = RandomForestClassifier(
    class_weight="balanced",
    n_estimators=300,
    min_samples_leaf=10,
    max_depth=None,
    n_jobs=-1,
    random_state=0,
)

# Train on the training split.
rf.fit(X_train, y_train)

# Score the validation split: predicted probability (second column of
# predict_proba) and hard class labels.
y_valid_prob = rf.predict_proba(X_valid)[:, 1]
y_valid_pred = rf.predict(X_valid)

# Report validation metrics, one per line, in a fixed order.
print("\n=== Random Forest Performance ===")
for label, value in (
    ("Accuracy:", accuracy_score(y_valid, y_valid_pred)),
    ("AUC:", roc_auc_score(y_valid, y_valid_prob)),
    ("Confusion Matrix:\n", confusion_matrix(y_valid, y_valid_pred)),
    ("\nClassification Report:\n", classification_report(y_valid, y_valid_pred)),
):
    print(label, value)

# Rank the encoded features by the forest's feature_importances_ values.
importances = rf.feature_importances_
feat_importance_df = pd.DataFrame({"feature": X.columns, "importance": importances})
feat_importance_df = feat_importance_df.sort_values(by="importance", ascending=False)

print("\n=== Top 15 Most Important Features (Random Forest) ===")
print(feat_importance_df.head(15))


=== Random Forest Performance ===
Accuracy: 0.9219571370804691
AUC: 0.8604338177392798
Confusion Matrix:
 [[2194   96]
 [  97   86]]

Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.96      0.96      2290
         1.0       0.47      0.47      0.47       183

    accuracy                           0.92      2473
   macro avg       0.72      0.71      0.71      2473
weighted avg       0.92      0.92      0.92      2473


=== Top 15 Most Important Features (Random Forest) ===
                               feature  importance
1                        avg_card_debt    0.172532
13                            uti_card    0.095343
0                      tot_credit_debt    0.076144
5   non_mtg_acc_past_due_12_months_num    0.074857
16                 uti_card_50plus_pct    0.074430
8               credit_past_due_amount    0.059742
4                             card_age    0.054864
14                      uti_50plus_pct    0.054545

## **4. Final Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score

# Manually define a small set of candidate parameter combinations.
# Each dict's keys are RandomForestClassifier keyword arguments.
param_grid = [
    {"n_estimators": 200, "max_depth": None, "min_samples_leaf": 10},
    {"n_estimators": 300, "max_depth": 8,    "min_samples_leaf": 10},
    {"n_estimators": 300, "max_depth": 10,   "min_samples_leaf": 5},
    {"n_estimators": 400, "max_depth": 8,    "min_samples_leaf": 5},
    {"n_estimators": 500, "max_depth": 10,   "min_samples_leaf": 5},
]

# One record per candidate: the parameter dict plus its validation metrics.
results = []

for params in param_grid:
    print("\nTrying params:", params)

    rf = RandomForestClassifier(
        **params,
        class_weight="balanced",
        random_state=0,
        n_jobs=-1
    )

    rf.fit(X_train, y_train)
    y_valid_prob = rf.predict_proba(X_valid)[:, 1]
    y_valid_pred = rf.predict(X_valid)

    auc = roc_auc_score(y_valid, y_valid_prob)
    acc = accuracy_score(y_valid, y_valid_pred)

    print(f"Validation AUC = {auc:.4f}, Accuracy = {acc:.4f}")

    results.append({
        "params": params,
        "auc": auc,
        "accuracy": acc
    })

# Identify the parameter combination with the highest AUC
results_df = pd.DataFrame(results)
print("\nAll candidates:")
print(results_df)

# idxmax() returns an index LABEL, so the row must be looked up with .loc
# (.iloc is positional and is only equivalent while the index happens to be
# the default 0..n-1 range). On ties idxmax() keeps the first candidate.
best_row = results_df.loc[results_df["auc"].idxmax()]
best_params = best_row["params"]
print("\nBest params by validation AUC:")
print(best_params)
print("Best AUC:", best_row["auc"])

# The final model uses the parameters selected by the search above rather
# than a hard-coded copy, so it cannot go stale when the grid or the data
# changes.
final_rf = RandomForestClassifier(
    **best_params,
    class_weight="balanced",
    random_state=0,
    n_jobs=-1
)

# Refit on train + validation rows. Because the validation rows are now part
# of the training data, this model can no longer be scored on X_valid.
final_rf.fit(X_scaled, y)

print("Final Random Forest trained on all available data.")


Trying params: {'n_estimators': 200, 'max_depth': None, 'min_samples_leaf': 10}
Validation AUC = 0.8583, Accuracy = 0.9211

Trying params: {'n_estimators': 300, 'max_depth': 8, 'min_samples_leaf': 10}
Validation AUC = 0.8593, Accuracy = 0.8908

Trying params: {'n_estimators': 300, 'max_depth': 10, 'min_samples_leaf': 5}
Validation AUC = 0.8579, Accuracy = 0.9163

Trying params: {'n_estimators': 400, 'max_depth': 8, 'min_samples_leaf': 5}
Validation AUC = 0.8593, Accuracy = 0.8941

Trying params: {'n_estimators': 500, 'max_depth': 10, 'min_samples_leaf': 5}
Validation AUC = 0.8579, Accuracy = 0.9171

All candidates:
                                              params       auc  accuracy
0  {'n_estimators': 200, 'max_depth': None, 'min_...  0.858339  0.921148
1  {'n_estimators': 300, 'max_depth': 8, 'min_sam...  0.859284  0.890821
2  {'n_estimators': 300, 'max_depth': 10, 'min_sa...  0.857926  0.916296
3  {'n_estimators': 400, 'max_depth': 8, 'min_sam...  0.859317  0.894056
4  {'n_esti

## **5. Demo: Credit Card Applications**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Hyperparameters for the demo model.
# NOTE(review): this duplicates the combination chosen in the tuning section;
# keep the two in sync if the grid or the data changes.
best_params = {'n_estimators': 400, 'max_depth': 8, 'min_samples_leaf': 5}

best_rf = RandomForestClassifier(
    n_estimators=best_params["n_estimators"],
    max_depth=best_params["max_depth"],
    min_samples_leaf=best_params["min_samples_leaf"],
    class_weight="balanced",
    random_state=0,
    n_jobs=-1
)

# Fit on the training split only, so the validation rows act as unseen
# "applications" for the demo.
best_rf.fit(X_train, y_train)

# Compute default probability for validation set (second column of
# predict_proba). Computed once; the result is reused below.
y_valid_prob = best_rf.predict_proba(X_valid)[:, 1]

# Show first 20 probabilities
print("First 20 predicted default probabilities:\n")
print(np.round(y_valid_prob[:20], 4))

# Organize the probability of default into a readable table
prob_df = pd.DataFrame({
    "Default_Probability": y_valid_prob
})

print(prob_df.head(20))

# Automatically generate risk levels
def risk_band(p, low_threshold=0.10, high_threshold=0.25):
    """Map a predicted default probability to a risk label.

    Parameters
    ----------
    p : float
        Predicted probability of default.
    low_threshold : float, default 0.10
        Probabilities strictly below this value are labelled "Low Risk".
    high_threshold : float, default 0.25
        Probabilities at or above ``low_threshold`` and strictly below this
        value are labelled "Medium Risk"; everything else is "High Risk".

    Returns
    -------
    str
        One of "Low Risk", "Medium Risk" or "High Risk".

    Raises
    ------
    ValueError
        If ``low_threshold`` is greater than ``high_threshold``.

    Notes
    -----
    A value that compares false against both thresholds (for example NaN)
    falls through to "High Risk".
    """
    if low_threshold > high_threshold:
        raise ValueError("low_threshold must not exceed high_threshold")

    if p < low_threshold:
        return "Low Risk"
    if p < high_threshold:
        return "Medium Risk"
    return "High Risk"

# Label each predicted probability with its risk band, then preview the
# first 20 rows of the labelled table.
default_probs = prob_df["Default_Probability"]
prob_df["Risk_Band"] = default_probs.apply(risk_band)
print(prob_df.head(20))

First 20 predicted default probabilities:

[0.2909 0.2672 0.2365 0.2156 0.0841 0.5754 0.1819 0.4081 0.2383 0.2243
 0.1741 0.0609 0.1146 0.1379 0.4388 0.1672 0.2221 0.8857 0.2473 0.2412]
    Default_Probability
0              0.290919
1              0.267221
2              0.236455
3              0.215630
4              0.084137
5              0.575353
6              0.181931
7              0.408136
8              0.238256
9              0.224316
10             0.174110
11             0.060866
12             0.114589
13             0.137940
14             0.438838
15             0.167231
16             0.222109
17             0.885744
18             0.247338
19             0.241154
    Default_Probability    Risk_Band
0              0.290919    High Risk
1              0.267221    High Risk
2              0.236455  Medium Risk
3              0.215630  Medium Risk
4              0.084137     Low Risk
5              0.575353    High Risk
6              0.181931  Medium Risk
7             