In [None]:
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

# Read the pickled 6-tuple of feature/target splits back from disk.
# NOTE: pickle.load executes arbitrary code embedded in the file, so only
# point this at a pickle you produced yourself.
with open("train_val_test.pkl", "rb") as pickle_file:
    splits = pickle.load(pickle_file)

# Unpack in the order the tuple was stored: features first, then targets.
X_train, X_val, X_test, y_train, y_val, y_test = splits

print("Data successfully loaded!")


In [None]:
# Columns handed to XGBoost through pandas' category dtype.
categorical_features = ["sub_grade", "term", "purpose"]
for col in categorical_features:
    # Learn the category set from the training split only and reuse that exact
    # dtype for val/test. Calling astype("category") on each split separately
    # can assign different integer codes to the same label; reusing the train
    # dtype keeps codes consistent and turns labels unseen in training into NaN.
    X_train[col] = X_train[col].astype("category")
    train_dtype = X_train[col].dtype
    X_val[col] = X_val[col].astype(train_dtype)
    X_test[col] = X_test[col].astype(train_dtype)

# One-Hot Encode "emp_length" since it has many categories.
# The dense-output flag was renamed from `sparse` to `sparse_output`; try the
# current spelling first and fall back for older scikit-learn installs.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Fit on train only; val/test are transformed with the fitted encoder.
encoded_train = ohe.fit_transform(X_train[["emp_length"]])
encoded_val = ohe.transform(X_val[["emp_length"]])
encoded_test = ohe.transform(X_test[["emp_length"]])

# Wrap the encoded arrays in DataFrames that carry the SAME index as the split
# they came from, so the column-wise concat below lines rows up correctly.
emp_length_columns = ohe.get_feature_names_out(["emp_length"])
encoded_train_df = pd.DataFrame(encoded_train, index=X_train.index, columns=emp_length_columns)
encoded_val_df = pd.DataFrame(encoded_val, index=X_val.index, columns=emp_length_columns)
encoded_test_df = pd.DataFrame(encoded_test, index=X_test.index, columns=emp_length_columns)

# Replace the raw "emp_length" column with its encoded columns. The index is
# reset only AFTER the concat: resetting one side beforehand makes pandas align
# on mismatched labels and yields NaN-filled, duplicated rows.
X_train = pd.concat([X_train.drop(columns=["emp_length"]), encoded_train_df], axis=1).reset_index(drop=True)
X_val = pd.concat([X_val.drop(columns=["emp_length"]), encoded_val_df], axis=1).reset_index(drop=True)
X_test = pd.concat([X_test.drop(columns=["emp_length"]), encoded_test_df], axis=1).reset_index(drop=True)

# Row counts must be unchanged by the encoding step.
assert len(X_train) == len(encoded_train_df), "train rows changed during encoding"
assert len(X_val) == len(encoded_val_df), "val rows changed during encoding"
assert len(X_test) == len(encoded_test_df), "test rows changed during encoding"

print("Categorical processing complete.")


In [None]:
# Initialize XGBoost classifier.
# Early stopping is configured on the estimator rather than in fit(): the
# fit()-level `early_stopping_rounds` argument is deprecated and rejected by
# newer XGBoost releases. The deprecated `use_label_encoder` flag is omitted.
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",  # Multi-class classification
    num_class=len(y_train.unique()),  # Number of unique classes
    tree_method="hist",  # Faster training
    enable_categorical=True,  # Use categorical support
    eval_metric="mlogloss",  # Log loss for multi-class
    early_stopping_rounds=10,  # Stop when val mlogloss stalls for 10 rounds
)

# Train the model; the validation split drives early stopping.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

print("Model training complete.")


In [None]:
# Predict a class label for every row of the test features.
y_pred = xgb_model.predict(X_test)

# Fraction of test rows where the predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
# Feature Importance
# (matplotlib and xgboost are imported once in the notebook's top import cell.)

# Draw the importance bars on an explicit Axes and title that same Axes,
# instead of going through pyplot's implicit "current axes" state.
fig, ax = plt.subplots(figsize=(10, 6))
xgb.plot_importance(xgb_model, importance_type="weight", ax=ax)
ax.set_title("Feature Importance (XGBoost)")
plt.show()

In [None]:
# Get feature importance scores from the trained booster as a
# {feature_name: score} mapping, using the "weight" importance type.
feature_importance = xgb_model.get_booster().get_score(importance_type="weight")

# Convert to a two-column DataFrame, highest importance first.
importance_df = pd.DataFrame(
    list(feature_importance.items()),
    columns=["Feature", "Importance"],
).sort_values(by="Importance", ascending=False)

# Display importance rankings.
# `ace_tools` is only available inside a hosted sandbox and raises
# ModuleNotFoundError on a regular kernel, so rely on the notebook's own rich
# display of the cell's last expression instead.
importance_df
