In [5]:
import shap
import joblib
from archive.explore_data import load_data
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Load the trained model, a previously saved label encoder, and the data.
# NOTE: joblib.load unpickles arbitrary objects; only load artifacts from a trusted source.
model = joblib.load("xgb_fraud_model.pkl")
le = joblib.load("label_encoder.pkl")
df = load_data()
# Encode `type` with the mapping stored in the loaded encoder. Calling
# fit_transform() here would refit the encoder on this frame and throw away the
# persisted classes, so the integer codes could differ from the saved mapping
# (presumably the one used when the model was trained -- confirm).
# transform() raises ValueError if `type` contains a label the encoder has not seen.
df['encoded_type'] = le.transform(df['type'])

# Feature columns included in the model.
# NOTE(review): 'log_amount' is not created in this notebook; presumably load_data() provides it -- confirm.
features = ['step', 'amount', 'log_amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest', 'encoded_type']
X = df[features]
y = df['isFraud']
# 80/20 split, stratified on the label, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2, random_state=42)

In [3]:
# Build a SHAP explainer for the loaded model and compute explanations for
# every row of X (the full feature frame, not only the test split).
explainer = shap.Explainer(model)
shap_values = explainer(X)

In [None]:
# Save the SHAP results. A SHAP value is how much a feature pushed the prediction
# up/down compared to the average prediction:
#   positive SHAP: pushes the prediction towards fraud
#   negative SHAP: pushes the prediction towards not fraud
# Files written: per-feature SHAP values, the base values, and the input
# features X (saved without the DataFrame index).
np.save('shap_values.npy', shap_values.values)
np.save('shap_base.npy', shap_values.base_values)
X.to_csv('shap_input.csv', index=False)

In [None]:
# SHAP summary plot for the explanations held in `shap_values`.
shap.summary_plot(shap_values)