In [1]:
import pandas as pd
import shap
import matplotlib.pyplot as plt
import joblib
import os

# Set Matplotlib backend for headless environments
import matplotlib
matplotlib.use('Agg')

# Load test data and models
X_fraud_test = pd.read_csv('../data/processed/X_fraud_test.csv')
X_credit_test = pd.read_csv('../data/processed/X_credit_test.csv')
xgb_fraud = joblib.load('../models/fraud_xgb_model.pkl')
xgb_credit = joblib.load('../models/credit_xgb_model.pkl')

# Initialize SHAP explainer
explainer_fraud = shap.TreeExplainer(xgb_fraud)
explainer_credit = shap.TreeExplainer(xgb_credit)

# Compute SHAP values
shap_values_fraud = explainer_fraud.shap_values(X_fraud_test)
shap_values_credit = explainer_credit.shap_values(X_credit_test)

# Create directory for plots
os.makedirs('../reports/shap_plots', exist_ok=True)

# Summary Plots with dynamic interpretation
shap.summary_plot(shap_values_fraud, X_fraud_test, show=False)
plt.savefig('../reports/shap_plots/fraud_summary_plot.png', dpi=300, bbox_inches='tight')
plt.close()

shap.summary_plot(shap_values_credit, X_credit_test, show=False)
plt.savefig('../reports/shap_plots/credit_summary_plot.png', dpi=300, bbox_inches='tight')
plt.close()

# Get top features from SHAP values
shap_summary_fraud = pd.DataFrame(shap_values_fraud, columns=X_fraud_test.columns)
top_features_fraud = shap_summary_fraud.abs().mean().sort_values(ascending=False).head(2).index

shap_summary_credit = pd.DataFrame(shap_values_credit, columns=X_credit_test.columns)
top_features_credit = shap_summary_credit.abs().mean().sort_values(ascending=False).head(2).index

# Print data-driven interpretations
print("Fraud_Data: Top features driving fraud (from summary plot):")
for feature in top_features_fraud:
    if feature in ['time_since_signup', 'device_transaction_count']:
        trend = 'short' if feature == 'time_since_signup' else 'high'
        print(f"- {feature}: {trend} values increase fraud likelihood.")
    else:
        print(f"- {feature}: significant impact on fraud likelihood.")

print("creditcard: Top features driving fraud (from summary plot):")
for feature in top_features_credit:
    if feature in ['V14', 'V4', 'log_amount']:
        trend = 'high negative' if feature in ['V14', 'V4'] else 'large'
        print(f"- {feature}: {trend} values strongly predict fraud.")
    else:
        print(f"- {feature}: significant impact on fraud likelihood.")

# Force Plots with dynamic interpretation
try:
    shap.force_plot(explainer_fraud.expected_value, shap_values_fraud[0], X_fraud_test.iloc[0], matplotlib=True, show=False)
    plt.savefig('../reports/shap_plots/fraud_force_plot.png', dpi=300, bbox_inches='tight')
    plt.close()
    top_contributors_fraud = pd.DataFrame({'feature': X_fraud_test.columns, 'shap_value': shap_values_fraud[0]}).nlargest(2, 'shap_value')['feature']
    print(f"Fraud_Data: Force plot for first instance shows {', '.join(top_contributors_fraud)} pushing towards {'fraud' if shap_values_fraud[0].sum() > 0 else 'non-fraud'}.")
except Exception as e:
    print(f"Error generating fraud force plot: {e}")

try:
    shap.force_plot(explainer_credit.expected_value, shap_values_credit[0], X_credit_test.iloc[0], matplotlib=True, show=False)
    plt.savefig('../reports/shap_plots/credit_force_plot.png', dpi=300, bbox_inches='tight')
    plt.close()
    top_contributors_credit = pd.DataFrame({'feature': X_credit_test.columns, 'shap_value': shap_values_credit[0]}).nlargest(2, 'shap_value')['feature']
    print(f"creditcard: Force plot for first instance shows {', '.join(top_contributors_credit)} pushing towards {'fraud' if shap_values_credit[0].sum() > 0 else 'non-fraud'}.")
except Exception as e:
    print(f"Error generating credit force plot: {e}")

Fraud_Data: Top features driving fraud (from summary plot):
- device_transaction_count: high values increase fraud likelihood.
- country_United States: significant impact on fraud likelihood.
creditcard: Top features driving fraud (from summary plot):
- V14: high negative values strongly predict fraud.
- V4: high negative values strongly predict fraud.
Fraud_Data: Force plot for first instance shows country_China, source_Direct pushing towards non-fraud.
creditcard: Force plot for first instance shows log_amount, V28 pushing towards non-fraud.
