# Credit Risk SHAP Project - Notebook with model artifacts

In [7]:
# Make sure `credit_risk_dataset.csv` sits next to the notebook, then load it.
# The CSV is expected to ship in the same ZIP as this notebook; a copy from a
# fallback location is attempted only when it is missing from the working dir.
import os, shutil

if os.path.exists("credit_risk_dataset.csv"):
    print("Found credit_risk_dataset.csv in working directory.")
else:
    # Fallback location (only relevant when the ZIP did not contain the CSV).
    src = '/mnt/data/credit_risk_dataset.csv'
    if not os.path.exists(src):
        raise FileNotFoundError("credit_risk_dataset.csv not found in working dir or source path. Ensure the CSV is next to this notebook.")
    shutil.copy(src, "credit_risk_dataset.csv")
    print("Copied dataset from", src)

import pandas as pd

# Quick sanity check: report the shape and preview the first rows.
df = pd.read_csv("credit_risk_dataset.csv")
print("Loaded dataset shape:", df.shape)
display(df.head())

Found credit_risk_dataset.csv in working directory.
Loaded dataset shape: (32581, 12)


Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [8]:
# Imports and setup
import warnings
warnings.filterwarnings('ignore')
import numpy as np, pandas as pd, os, pickle
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
# Optional third-party libraries. Each alias is bound to None when its import
# fails, so the training and SHAP cells can test `xgb is not None`,
# `lgb is not None` and `shap is None` and skip that step instead of crashing.
# NOTE(review): `except Exception` is broader than ImportError and will also
# hide other errors raised while a package loads — presumably a deliberate
# best-effort choice; confirm.
try:
    import xgboost as xgb
except Exception:
    xgb = None
try:
    import lightgbm as lgb
except Exception:
    lgb = None
try:
    import shap
except Exception:
    shap = None
import matplotlib.pyplot as plt
# A value of 0 disables matplotlib's "too many open figures" warning.
plt.rcParams['figure.max_open_warning'] = 0

# Artifact locations: everything is written under ./submission_package,
# with pickled models in a `models` subfolder. Parent is created first.
OUT = Path("submission_package")
MODELS_DIR = OUT.joinpath("models")
OUT.mkdir(exist_ok=True)
MODELS_DIR.mkdir(exist_ok=True)
print("Output folder:", OUT.resolve())

Output folder: F:\Datas\AI_Course\Yamuna\Credit_Risk_SHAP_WithModels_Package\submission_package


In [9]:
# Load dataset, detect target and preprocess
#
# Outputs of this cell (used by later cells):
#   df, target_col, X, y          - raw frame, target name, raw features, labels
#   num_cols, cat_cols            - feature names, in the column order of the
#                                   preprocessed arrays (numeric first, then categorical)
#   preprocessor                  - ColumnTransformer fitted on the TRAINING rows only
#   X_train, X_test, y_train, y_test - preprocessed 80/20 stratified split
#   X_pre                         - all rows pushed through the fitted preprocessor
df = pd.read_csv("credit_risk_dataset.csv")

# Pick the first well-known target name present; otherwise fall back to the last column.
common_targets = ["default","loan_default","loan_status","SeriousDlqin2yrs","DEFAULT_PAYMENT_NEXT_MONTH","is_default","default_payment_next_month","target"]
target_col = next((t for t in common_targets if t in df.columns), None)
if target_col is None:
    target_col = df.columns[-1]
    uniq = df[target_col].dropna().unique()
    # Only warn when the fallback column does not look like a label
    # (i.e. it is neither binary 0/1 nor low-cardinality).
    if not (set(uniq).issubset({0,1}) or len(uniq) <= 5):
        print("Warning: target not auto-detected; using last column:", target_col)

print("Detected target:", target_col)
y = df[target_col]
X = df.drop(columns=[target_col])

# Drop ID-like columns
id_cols = [c for c in X.columns if c.lower().startswith("id")]
if id_cols:
    X = X.drop(columns=id_cols)
    print("Dropped id columns:", id_cols)

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=['object','category']).columns.tolist()
print("Num numeric:", len(num_cols), "Num categorical:", len(cat_cols))

# Numeric: median imputation + standardisation.
# Categorical: mode imputation + ordinal codes; categories unseen at fit time map to -1.
num_pipe = Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', StandardScaler())])
cat_pipe = Pipeline([('imp', SimpleImputer(strategy='most_frequent')), ('enc', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])
preprocessor = ColumnTransformer([('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)], remainder='drop')

# Train/test split
# Split the RAW rows first and fit the preprocessor on the training part only.
# Fitting imputers/scaler on the full dataset before splitting leaks test-set
# statistics into the features the models are trained and evaluated on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full dataset through the train-fitted preprocessor (kept for downstream cells).
X_pre = preprocessor.transform(X)
print("Preprocessed shape:", X_pre.shape)
print("Train/test sizes:", X_train.shape, X_test.shape)

Detected target: loan_status
Num numeric: 7 Num categorical: 4
Preprocessed shape: (32581, 11)
Train/test sizes: (26064, 11) (6517, 11)


In [10]:
# Train three models (if libs present) and save model artifacts + record models used
import json
from sklearn.ensemble import RandomForestClassifier

results = {}       # model key -> {'auc', 'accuracy', 'f1'} measured on the test split
models_used = []   # display names of the models that were actually trained


def evaluate_and_save(model, filename):
    """Score a fitted classifier on the held-out split and pickle it.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    filename : str
        File name (inside ``MODELS_DIR``) for the pickled model.

    Returns
    -------
    (metrics, proba, pred) : tuple
        ``metrics`` is a dict with keys ``'auc'``, ``'accuracy'`` and ``'f1'``;
        ``proba`` holds the second ``predict_proba`` column and ``pred`` the
        hard predictions, both for ``X_test``.
    """
    proba = model.predict_proba(X_test)[:, 1]
    pred = model.predict(X_test)
    metrics = {
        'auc': roc_auc_score(y_test, proba),
        'accuracy': accuracy_score(y_test, pred),
        'f1': f1_score(y_test, pred),
    }
    with open(MODELS_DIR/filename, 'wb') as f:
        pickle.dump(model, f)
    return metrics, proba, pred


# 1) RandomForest
rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
# y_proba / y_pred are kept as notebook-level names (they hold the most recently
# evaluated model's outputs, as before).
results['random_forest'], y_proba, y_pred = evaluate_and_save(rf, 'random_forest.pkl')
models_used.append('RandomForest')

print("Saved RandomForest model. AUC:", results['random_forest']['auc'])

# 2) XGBoost
if xgb is not None:
    xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1)
    xgb_clf.fit(X_train, y_train)
    results['xgboost'], y_proba, y_pred = evaluate_and_save(xgb_clf, 'xgboost.pkl')
    models_used.append('XGBoost')
    print("Saved XGBoost model. AUC:", results['xgboost']['auc'])
else:
    print("XGBoost not installed - skipped")

# 3) LightGBM
if lgb is not None:
    lgb_clf = lgb.LGBMClassifier(random_state=42, n_jobs=-1)
    lgb_clf.fit(X_train, y_train)
    results['lightgbm'], y_proba, y_pred = evaluate_and_save(lgb_clf, 'lightgbm.pkl')
    models_used.append('LightGBM')
    print("Saved LightGBM model. AUC:", results['lightgbm']['auc'])
else:
    print("LightGBM not installed - skipped")

# Save metrics and models_used file
with open(OUT/'model_metrics.json','w') as f:
    json.dump(results, f, indent=2)
pd.DataFrame(results).T.to_csv(OUT/'model_metrics.csv')

with open(MODELS_DIR/'models_used.txt','w') as f:
    for m in models_used:
        # One model name per line. The previous "\\n" literal wrote a backslash
        # followed by 'n', producing a single run-together line.
        f.write(m + "\n")
print("Models used and saved:", models_used)

Saved RandomForest model. AUC: 0.9312978858785743
Saved XGBoost model. AUC: 0.9498688077028719
[LightGBM] [Info] Number of positive: 5686, number of negative: 20378
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 949
[LightGBM] [Info] Number of data points in the train set: 26064, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.218155 -> initscore=-1.276449
[LightGBM] [Info] Start training from score -1.276449
Saved LightGBM model. AUC: 0.9478848433904892
Models used and saved: ['RandomForest', 'XGBoost', 'LightGBM']


In [11]:
# SHAP analysis (global + 3 local cases) if shap is installed
#
# Explanations are computed on the held-out test split so that the global plots
# and the TP/TN/FP examples describe rows the model was not trained on.
# Files written to OUT: shap_summary.png, shap_bar.png, shap_local_{tp,tn,fp}.png


def _positive_class_shap(values, base_value):
    """Reduce TreeExplainer output to the positive class.

    Depending on the model type and shap version, ``shap_values`` is a single
    (n_samples, n_features) array, a list with one such array per class, or a
    (n_samples, n_features, n_classes) array; ``expected_value`` is a scalar or
    has one entry per class. The last class is taken as the positive one.

    Returns
    -------
    (values_2d, base) : (numpy.ndarray of shape (n_samples, n_features), float)
    """
    if isinstance(values, list):
        values = values[-1]
    values = np.asarray(values)
    if values.ndim == 3:
        values = values[:, :, -1]
    return values, float(np.ravel(base_value)[-1])


if shap is None:
    print("SHAP not installed - skipping SHAP section")
else:
    best_name = max(results.keys(), key=lambda k: results[k]['auc'])
    best_model = None
    if best_name == 'random_forest':
        best_model = rf
    elif best_name == 'xgboost' and xgb is not None:
        best_model = xgb_clf
    elif best_name == 'lightgbm' and lgb is not None:
        best_model = lgb_clf
    print("Best model for SHAP:", best_name)
    try:
        if best_model is None:
            raise ValueError(f"no fitted model available for '{best_name}'")

        # Column order of the preprocessed arrays: numeric block, then categorical block.
        feature_names = list(num_cols) + list(cat_cols)
        X_explain = np.asarray(X_test)

        explainer = shap.TreeExplainer(best_model)
        raw_values = explainer.shap_values(X_explain)
        shap_values, base_value = _positive_class_shap(raw_values, explainer.expected_value)

        # Global plots
        shap.summary_plot(shap_values, X_explain, feature_names=feature_names, show=False)
        plt.tight_layout(); plt.savefig(OUT/'shap_summary.png', dpi=150); plt.close()
        shap.summary_plot(shap_values, X_explain, feature_names=feature_names, plot_type='bar', show=False)
        plt.tight_layout(); plt.savefig(OUT/'shap_bar.png', dpi=150); plt.close()
        print("Saved SHAP global plots to submission_package/")

        # Local plots for TP, TN, FP if available
        preds = (best_model.predict_proba(X_explain)[:,1] >= 0.5).astype(int)
        y_true = np.asarray(y_test)
        case_masks = {
            'tp': (preds == 1) & (y_true == 1),
            'tn': (preds == 0) & (y_true == 0),
            'fp': (preds == 1) & (y_true == 0),
        }
        for name, mask in case_masks.items():
            hits = np.flatnonzero(mask)
            if hits.size == 0:
                continue  # no example of this kind in the test split
            idx = int(hits[0])  # first matching row, positional index into X_explain
            try:
                # `data` holds the preprocessed (scaled / ordinal-encoded) feature
                # values, so the numbers shown next to feature names are not raw units.
                explanation = shap.Explanation(
                    values=shap_values[idx],
                    base_values=base_value,
                    data=X_explain[idx],
                    feature_names=feature_names,
                )
                shap.plots.waterfall(explanation, show=False)
                plt.tight_layout(); plt.savefig(OUT/f'shap_local_{name}.png', dpi=150); plt.close()
                print("Saved local SHAP:", name)
            except Exception as e:
                print("Could not save local SHAP for", name, e)
    except Exception as e:
        # Best-effort section: report the problem but keep the notebook running.
        print("SHAP TreeExplainer failed or error occurred:", e)

Best model for SHAP: xgboost
Saved SHAP global plots to submission_package/
Saved local SHAP: tp
Saved local SHAP: tn
Saved local SHAP: fp
