In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import shap
import os
import matplotlib.pyplot as plt

# --- 1. Load the New, Clean Dataset ---
# Read the CSV at `file_path` into the DataFrame `df`. A missing file is
# reported with a readable message and then re-raised, so the run stops here
# rather than failing later on an undefined `df`.
print("--- Step 1: Loading new Model A data ---")
file_path = 'Model_A_data/Loan_default.csv'  # CSV file, parsed with pd.read_csv
try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"ERROR: The file was not found at {file_path}. Please check the path and filename.")
    raise
else:
    # Announce success only after the read has actually completed.
    print("New dataset loaded successfully.")

# --- 2. Prepare the Data (with One-Hot Encoding) ---
# Produces, from `df`:
#   df_encoded        - `df` with the categorical columns expanded to dummies
#   y                 - the 'Default' column (training target)
#   X                 - every remaining column except 'LoanID'
#   final_features_a  - the sanitized feature names, in column order
print("--- Step 2: Preparing data for training ---")


def _sanitize_feature_name(name):
    """Return str(name) with each non-alphanumeric character replaced by '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(name))


# Text-valued columns that are expanded into indicator columns.
categorical_cols = [
    'Education',
    'EmploymentType',
    'MaritalStatus',
    'HasMortgage',
    'HasDependents',
    'LoanPurpose',
    'HasCoSigner',
]
print(f"One-hot encoding the following columns: {categorical_cols}")

# drop_first=True omits the first level of every categorical column;
# dummy_na=False means no extra indicator column is created for missing values.
df_encoded = pd.get_dummies(
    df,
    columns=categorical_cols,
    drop_first=True,
    dummy_na=False,
)

# Target: the 'Default' column.
y = df_encoded['Default']

# Features: everything except the target and the 'LoanID' column.
X = df_encoded.drop(columns=['Default', 'LoanID'])

# Rewrite the feature names so they contain only alphanumeric characters and
# underscores (the original script does this for XGBoost's benefit).
X.columns = [_sanitize_feature_name(column) for column in X.columns]
final_features_a = list(X.columns)

# --- 3. Split the Data ---
# Hold out 20% of the rows for testing. Passing `stratify=y` splits in a
# stratified fashion using the target as class labels, and the fixed
# `random_state` makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"Data split into {len(X_train)} training samples and {len(X_test)} testing samples.")

# --- 4. Train the XGBoost Model ---
# Fit `repayment_model` on the training split, weighting the positive class by
# the ratio of label-0 rows to label-1 rows in `y_train`.
print("\n--- Step 4: Training XGBoost Model A ---")

# Count each class once; the lookups below are by label, so this relies on the
# target being coded as 0 and 1.
class_counts = y_train.value_counts()
weight_ratio = class_counts[0] / class_counts[1]

# NOTE: `use_label_encoder` is intentionally not passed. The XGBoost version
# that produced the recorded output ignores it and only emits
# 'Parameters: { "use_label_encoder" } are not used.' when it is supplied.
repayment_model = XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    scale_pos_weight=weight_ratio,
    n_jobs=-1,
    random_state=42,
)
repayment_model.fit(X_train, y_train)
print("Model A trained successfully.")

# --- 5. Evaluate the New Model ---
# Predict labels for the held-out split, then print the confusion matrix
# followed by the classification report, each under its own heading.
print("\n--- Step 5: Evaluating New Model A Performance ---")
predictions = repayment_model.predict(X_test)

# Each metric is computed immediately before it is printed, in the order listed.
for heading, metric in (
    ("\nConfusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
):
    print(heading)
    print(metric(y_test, predictions))

# --- 6. Save the Final Model and Artifacts ---
# Write three joblib files into 'saved_models/': the fitted model, the list of
# feature names, and a SHAP TreeExplainer built from the model.
print("\n--- Step 6: Saving final model and artifacts ---")
os.makedirs('saved_models', exist_ok=True)

# The model and the feature list are written before the explainer is built.
for artifact, destination in (
    (repayment_model, 'saved_models/repayment_model_xgb.joblib'),
    (final_features_a, 'saved_models/repayment_model_features.joblib'),
):
    joblib.dump(artifact, destination)

explainer_a = shap.TreeExplainer(repayment_model)
joblib.dump(explainer_a, 'saved_models/repayment_model_explainer.joblib')

print("--- Model A, its features, and SHAP explainer saved successfully! ---")
print("\nYour Repayment Model is now ready to be used by the API.")

  from .autonotebook import tqdm as notebook_tqdm


--- Step 1: Loading new Model A data ---
New dataset loaded successfully.
--- Step 2: Preparing data for training ---
One-hot encoding the following columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
Data split into 204277 training samples and 51070 testing samples.

--- Step 4: Training XGBoost Model A ---


Parameters: { "use_label_encoder" } are not used.



Model A trained successfully.

--- Step 5: Evaluating New Model A Performance ---

Confusion Matrix:
[[33654 11485]
 [ 2240  3691]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.75      0.83     45139
           1       0.24      0.62      0.35      5931

    accuracy                           0.73     51070
   macro avg       0.59      0.68      0.59     51070
weighted avg       0.86      0.73      0.77     51070


--- Step 6: Saving final model and artifacts ---
--- Model A, its features, and SHAP explainer saved successfully! ---

Your Repayment Model is now ready to be used by the API.
