In [None]:
import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot styling: seaborn white-grid theme and a wide default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})

# Load SHAP's JavaScript support so interactive plots can render in the notebook.
shap.initjs()

print("Libraries imported successfully!")


In [None]:
# Resolve the project layout relative to where the notebook is executed.
# When the kernel starts inside a 'notebooks' folder, the project root is
# its parent; otherwise the current working directory is used as the root.
_cwd = Path.cwd()
project_root = _cwd.parent if _cwd.name == 'notebooks' else _cwd
data_dir = project_root.joinpath('data', 'processed')
models_dir = project_root.joinpath('models')

for _label, _location in (
    ("Project root", project_root),
    ("Data directory", data_dir),
    ("Models directory", models_dir),
):
    print(f"{_label}: {_location}")


In [None]:
# Read the processed test features and labels from the data directory.
print("Loading processed data...")
features_path = data_dir / 'X_test_processed.csv'
labels_path = data_dir / 'y_test_processed.csv'

X_test = pd.read_csv(features_path)
# Flatten the label file into a 1-D NumPy array.
y_test = pd.read_csv(labels_path).to_numpy().ravel()

print(f"Test set shape: {X_test.shape}")
print("Test set class distribution:")
print(pd.Series(y_test).value_counts())
# The mean of the labels is reported as the fraud rate
# (assumes labels are encoded 0/1 -- TODO confirm against the preprocessing step).
print(f"\nFraud rate: {y_test.mean():.4f}")


In [None]:
# Load the persisted best model (Random Forest) from the models directory.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
print("Loading best model...")
model_path = models_dir / 'best_model_random_forest.pkl'
best_model = joblib.load(model_path)

model_class_name = type(best_model).__name__
print(f"Model loaded: {model_class_name}")
print("Model parameters:")
print(best_model.get_params())


## 2. Feature Importance Baseline

Extract and visualize the built-in feature importance from the Random Forest model.


In [None]:
# Pair every test-set column with the model's built-in importance score,
# then rank the features from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': X_test.columns,
        'importance': best_model.feature_importances_,
    }
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("Top 20 Most Important Features (Random Forest Built-in):")
print("=" * 60)
top_20_text = feature_importance.head(20).to_string(index=False)
print(top_20_text)


In [None]:
# Horizontal bar chart of the ten highest built-in importances.
top_10_features = feature_importance.head(10)

# Reverse the order so the most important feature is drawn at the top.
bars_bottom_up = top_10_features.iloc[::-1]
bar_positions = range(len(top_10_features))

plt.figure(figsize=(10, 6))
plt.barh(bar_positions, bars_bottom_up['importance'].values)
plt.yticks(bar_positions, bars_bottom_up['feature'].values)
plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
plt.title('Top 10 Most Important Features (Random Forest Built-in)', fontsize=14, fontweight='bold')
plt.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# Text version of the same ranking, most important first.
print("\nTop 10 Features:")
for feature_name, score in zip(top_10_features['feature'], top_10_features['importance']):
    print(f"{feature_name:30s}: {score:.4f}")


## 3. SHAP Analysis

Generate SHAP values to understand how each feature contributes to individual predictions.


In [None]:
# Create SHAP explainer
# For Random Forest, we use TreeExplainer which is optimized for tree-based models
print("Creating SHAP TreeExplainer...")
explainer = shap.TreeExplainer(best_model)
print("SHAP explainer created successfully!")

# Note: Computing SHAP values for the entire test set can be slow
# We'll use a sample for faster computation, but keep enough for meaningful analysis
sample_size = min(1000, len(X_test))
X_test_sample = X_test.sample(n=sample_size, random_state=42)
# y_test is a positional NumPy array; indexing it with the sampled index labels
# is valid because X_test carries the default 0..n-1 index from pd.read_csv.
y_test_sample = y_test[X_test_sample.index]

print(f"\nComputing SHAP values for {sample_size} samples...")
shap_values = explainer.shap_values(X_test_sample)

# The return type for a classifier depends on the installed SHAP version:
#   - legacy releases: a list [values_for_class_0, values_for_class_1]
#   - newer releases:  one array shaped (n_samples, n_features, n_classes)
#   - single-output models: a 2-D array (n_samples, n_features)
# In every case we keep a 2-D matrix for class 1 (fraud), which is what all
# downstream cells expect.
if isinstance(shap_values, list):
    shap_values_fraud = shap_values[1]  # Class 1 (fraud)
elif getattr(shap_values, 'ndim', 2) == 3:
    shap_values_fraud = shap_values[:, :, 1]  # Class 1 (fraud)
else:
    shap_values_fraud = shap_values

print(f"SHAP values computed. Shape: {shap_values_fraud.shape}")


In [None]:
# Global view: bar chart of the SHAP summary for the fraud class.
print("Generating SHAP Summary Plot...")
plt.figure(figsize=(12, 8))
# show=False keeps the figure open so a title can be added before display.
shap.summary_plot(
    shap_values_fraud,
    features=X_test_sample,
    plot_type="bar",
    show=False,
)
bar_title_style = {'fontsize': 14, 'fontweight': 'bold', 'pad': 20}
plt.title("SHAP Global Feature Importance (Bar Plot)", **bar_title_style)
plt.tight_layout()
plt.show()


In [None]:
# Detailed view: the default SHAP summary plot for the fraud class, drawn
# from the per-sample SHAP values together with the sampled feature values.
print("Generating detailed SHAP Summary Plot...")
plt.figure(figsize=(12, 10))
# show=False keeps the figure open so a title can be added before display.
shap.summary_plot(
    shap_values_fraud,
    features=X_test_sample,
    show=False,
)
detail_title_style = {'fontsize': 14, 'fontweight': 'bold', 'pad': 20}
plt.title("SHAP Summary Plot (Feature Value Impact)", **detail_title_style)
plt.tight_layout()
plt.show()


In [None]:
# Model outputs for the sampled rows: hard labels and the probability of class 1.
y_pred_sample = best_model.predict(X_test_sample)
y_pred_proba_sample = best_model.predict_proba(X_test_sample)[:, 1]

# One row per sampled transaction, keyed by the original test-set index.
results_df = pd.DataFrame(
    {
        'true_label': y_test_sample,
        'predicted_label': y_pred_sample,
        'predicted_proba': y_pred_proba_sample,
    },
    index=X_test_sample.index,
)

# Boolean masks for the four confusion-matrix cells.
actual_fraud = results_df['true_label'] == 1
actual_legit = results_df['true_label'] == 0
flagged = results_df['predicted_label'] == 1
cleared = results_df['predicted_label'] == 0

true_positives = results_df[actual_fraud & flagged]
false_positives = results_df[actual_legit & flagged]
false_negatives = results_df[actual_fraud & cleared]

print("Case Distribution:")
print(f"True Positives (Correctly identified fraud): {len(true_positives)}")
print(f"False Positives (Legitimate flagged as fraud): {len(false_positives)}")
print(f"False Negatives (Missed fraud): {len(false_negatives)}")
print(f"True Negatives: {len(results_df[actual_legit & cleared])}")

# For each error type, take the first matching row as the representative case.
# `*_idx` is the original index label; `*_sample_idx` is the row position inside
# X_test_sample (and therefore inside the SHAP value matrix).
if true_positives.empty:
    tp_sample_idx = None
    print("\nNo True Positives found in sample")
else:
    tp_idx = true_positives.index[0]
    tp_sample_idx = X_test_sample.index.get_loc(tp_idx)
    print(f"\nSelected True Positive case: Index {tp_idx} (Sample index: {tp_sample_idx})")
    print(f"  Predicted probability: {true_positives['predicted_proba'].iloc[0]:.4f}")

if false_positives.empty:
    fp_sample_idx = None
    print("\nNo False Positives found in sample")
else:
    fp_idx = false_positives.index[0]
    fp_sample_idx = X_test_sample.index.get_loc(fp_idx)
    print(f"\nSelected False Positive case: Index {fp_idx} (Sample index: {fp_sample_idx})")
    print(f"  Predicted probability: {false_positives['predicted_proba'].iloc[0]:.4f}")

if false_negatives.empty:
    fn_sample_idx = None
    print("\nNo False Negatives found in sample")
else:
    fn_idx = false_negatives.index[0]
    fn_sample_idx = X_test_sample.index.get_loc(fn_idx)
    print(f"\nSelected False Negative case: Index {fn_idx} (Sample index: {fn_sample_idx})")
    print(f"  Predicted probability: {false_negatives['predicted_proba'].iloc[0]:.4f}")


In [None]:
# SHAP Force Plot for True Positive (Correctly identified fraud)
if tp_sample_idx is not None:
    print("="*60)
    print("SHAP Force Plot: TRUE POSITIVE (Correctly Identified Fraud)")
    print("="*60)
    print(f"True Label: Fraud (1)")
    print(f"Predicted Label: Fraud (1)")
    print(f"Predicted Probability: {y_pred_proba_sample[tp_sample_idx]:.4f}")
    print("\nThis plot shows how each feature pushed the prediction toward fraud:")

    # Pick the scalar base value for class 1 (fraud). Depending on the SHAP
    # version, expected_value may be a list, a NumPy array with one entry per
    # class, or a plain scalar; checking only for `list` would pass a whole
    # per-class array to force_plot instead of the class-1 scalar.
    expected_value = explainer.expected_value
    if isinstance(expected_value, (list, np.ndarray)):
        base_value = expected_value[1] if len(expected_value) > 1 else expected_value[0]
    else:
        base_value = expected_value

    # Create force plot
    shap.force_plot(
        base_value,
        shap_values_fraud[tp_sample_idx],
        X_test_sample.iloc[tp_sample_idx],
        matplotlib=True,
        show=False
    )
    plt.title("SHAP Force Plot: True Positive (Correctly Identified Fraud)", 
              fontsize=12, fontweight='bold', pad=10)
    plt.tight_layout()
    plt.show()

    # Print top contributing features, ranked by absolute SHAP value so that
    # strong negative contributions are listed alongside strong positive ones.
    feature_contributions = pd.DataFrame({
        'feature': X_test_sample.columns,
        'shap_value': shap_values_fraud[tp_sample_idx],
        'feature_value': X_test_sample.iloc[tp_sample_idx].values
    }).sort_values('shap_value', key=abs, ascending=False)

    print("\nTop 10 Features Contributing to Fraud Prediction:")
    print(feature_contributions.head(10).to_string(index=False))
else:
    print("No True Positive case available for visualization")


In [None]:
# SHAP Force Plot for False Positive (Legitimate flagged as fraud)
if fp_sample_idx is not None:
    print("="*60)
    print("SHAP Force Plot: FALSE POSITIVE (Legitimate Flagged as Fraud)")
    print("="*60)
    print(f"True Label: Legitimate (0)")
    print(f"Predicted Label: Fraud (1)")
    print(f"Predicted Probability: {y_pred_proba_sample[fp_sample_idx]:.4f}")
    print("\nThis plot shows why a legitimate transaction was incorrectly flagged:")

    # Pick the scalar base value for class 1 (fraud). Depending on the SHAP
    # version, expected_value may be a list, a NumPy array with one entry per
    # class, or a plain scalar; checking only for `list` would pass a whole
    # per-class array to force_plot instead of the class-1 scalar.
    expected_value = explainer.expected_value
    if isinstance(expected_value, (list, np.ndarray)):
        base_value = expected_value[1] if len(expected_value) > 1 else expected_value[0]
    else:
        base_value = expected_value

    # Create force plot
    shap.force_plot(
        base_value,
        shap_values_fraud[fp_sample_idx],
        X_test_sample.iloc[fp_sample_idx],
        matplotlib=True,
        show=False
    )
    plt.title("SHAP Force Plot: False Positive (Legitimate Flagged as Fraud)", 
              fontsize=12, fontweight='bold', pad=10)
    plt.tight_layout()
    plt.show()

    # Print top contributing features, ranked by absolute SHAP value so that
    # strong negative contributions are listed alongside strong positive ones.
    feature_contributions = pd.DataFrame({
        'feature': X_test_sample.columns,
        'shap_value': shap_values_fraud[fp_sample_idx],
        'feature_value': X_test_sample.iloc[fp_sample_idx].values
    }).sort_values('shap_value', key=abs, ascending=False)

    print("\nTop 10 Features Contributing to Incorrect Fraud Prediction:")
    print(feature_contributions.head(10).to_string(index=False))
else:
    print("No False Positive case available for visualization")


In [None]:
# SHAP Force Plot for False Negative (Missed fraud)
if fn_sample_idx is not None:
    print("="*60)
    print("SHAP Force Plot: FALSE NEGATIVE (Missed Fraud)")
    print("="*60)
    print(f"True Label: Fraud (1)")
    print(f"Predicted Label: Legitimate (0)")
    print(f"Predicted Probability: {y_pred_proba_sample[fn_sample_idx]:.4f}")
    print("\nThis plot shows why a fraudulent transaction was missed:")

    # Pick the scalar base value for class 1 (fraud). Depending on the SHAP
    # version, expected_value may be a list, a NumPy array with one entry per
    # class, or a plain scalar; checking only for `list` would pass a whole
    # per-class array to force_plot instead of the class-1 scalar.
    expected_value = explainer.expected_value
    if isinstance(expected_value, (list, np.ndarray)):
        base_value = expected_value[1] if len(expected_value) > 1 else expected_value[0]
    else:
        base_value = expected_value

    # Create force plot
    shap.force_plot(
        base_value,
        shap_values_fraud[fn_sample_idx],
        X_test_sample.iloc[fn_sample_idx],
        matplotlib=True,
        show=False
    )
    plt.title("SHAP Force Plot: False Negative (Missed Fraud)", 
              fontsize=12, fontweight='bold', pad=10)
    plt.tight_layout()
    plt.show()

    # Print top contributing features, ranked by absolute SHAP value so that
    # strong negative contributions are listed alongside strong positive ones.
    feature_contributions = pd.DataFrame({
        'feature': X_test_sample.columns,
        'shap_value': shap_values_fraud[fn_sample_idx],
        'feature_value': X_test_sample.iloc[fn_sample_idx].values
    }).sort_values('shap_value', key=abs, ascending=False)

    print("\nTop 10 Features Contributing to Incorrect Legitimate Prediction:")
    print(feature_contributions.head(10).to_string(index=False))
else:
    print("No False Negative case available for visualization")


## 5. Interpretation: SHAP vs Built-in Feature Importance


In [None]:
# Global SHAP importance: average magnitude of each feature's SHAP value
# across the sampled rows, ranked from largest to smallest.
per_feature_magnitude = np.abs(shap_values_fraud).mean(axis=0)
mean_abs_shap = pd.DataFrame(
    {
        'feature': X_test_sample.columns,
        'mean_abs_shap': per_feature_magnitude,
    }
)
mean_abs_shap = mean_abs_shap.sort_values(by='mean_abs_shap', ascending=False)

print("Top 20 Features by Mean Absolute SHAP Value:")
print("=" * 60)
top_20_shap_text = mean_abs_shap.head(20).to_string(index=False)
print(top_20_shap_text)


In [None]:
# Compare SHAP importance with built-in feature importance.
# The outer merge keeps features present in either table; a feature missing
# from one side gets an importance of 0 there.
comparison = pd.merge(
    feature_importance.rename(columns={'importance': 'rf_importance'}),
    mean_abs_shap.rename(columns={'mean_abs_shap': 'shap_importance'}),
    on='feature',
    how='outer'
).fillna(0)

# Normalize both to 0-1 scale for comparison (each column divided by its own maximum)
comparison['rf_importance_norm'] = comparison['rf_importance'] / comparison['rf_importance'].max()
comparison['shap_importance_norm'] = comparison['shap_importance'] / comparison['shap_importance'].max()

# Get top 20 for visualization.
# An outer merge returns rows ordered by the join key (feature name), so
# `comparison.head(20)` would pick the first 20 features alphabetically.
# Rank by SHAP importance explicitly, then flip to ascending order so the
# most important feature is drawn at the top of the horizontal bar chart.
comparison_top20 = (
    comparison
    .sort_values('shap_importance', ascending=False)
    .head(20)
    .sort_values('shap_importance', ascending=True)
)

# Create comparison plot: two bars per feature, offset around a shared y position
fig, ax = plt.subplots(figsize=(12, 10))
y_pos = np.arange(len(comparison_top20))

ax.barh(y_pos - 0.2, comparison_top20['rf_importance_norm'].values, 0.4, 
        label='Random Forest Built-in', alpha=0.8, color='steelblue')
ax.barh(y_pos + 0.2, comparison_top20['shap_importance_norm'].values, 0.4, 
        label='SHAP Mean |Value|', alpha=0.8, color='coral')

ax.set_yticks(y_pos)
ax.set_yticklabels(comparison_top20['feature'].values)
ax.set_xlabel('Normalized Importance', fontsize=12, fontweight='bold')
ax.set_title('Feature Importance Comparison: Random Forest vs SHAP (Top 20)', 
             fontsize=14, fontweight='bold')
ax.legend(fontsize=11)
ax.grid(axis='x', alpha=0.3)
plt.tight_layout()
plt.show()

# The correlation is computed over all features, not only the plotted top 20.
print("\nComparison Summary:")
print("="*60)
print(f"Correlation between RF importance and SHAP importance: {comparison['rf_importance_norm'].corr(comparison['shap_importance_norm']):.4f}")


In [None]:
# Report the five highest-ranked features under each importance measure
# and list the features that appear in both rankings.
heavy_rule = "=" * 60
light_rule = "-" * 60

print(heavy_rule)
print("TOP 5 DRIVERS OF FRAUD PREDICTIONS")
print(heavy_rule)
print("\nBased on SHAP Analysis (Mean Absolute SHAP Values):")
print(light_rule)

top_5_shap = mean_abs_shap.head(5)
shap_ranking = zip(top_5_shap['feature'], top_5_shap['mean_abs_shap'])
for rank, (feature_name, shap_score) in enumerate(shap_ranking, start=1):
    print(f"{rank}. {feature_name:30s} - Mean |SHAP|: {shap_score:.4f}")

print("\nBased on Random Forest Built-in Importance:")
print(light_rule)
top_5_rf = feature_importance.head(5)
rf_ranking = zip(top_5_rf['feature'], top_5_rf['importance'])
for rank, (feature_name, rf_score) in enumerate(rf_ranking, start=1):
    print(f"{rank}. {feature_name:30s} - Importance: {rf_score:.4f}")

# Features ranked in the top five by both measures.
common_top5 = set(top_5_shap['feature']) & set(top_5_rf['feature'])
print(f"\nCommon features in both top 5: {len(common_top5)}")
if len(common_top5) > 0:
    print(f"Features: {', '.join(common_top5)}")


## 6. Surprising or Counterintuitive Findings


In [None]:
# Contrast the two importance measures and look at how variable each
# feature's SHAP values are across the sample.
banner = "=" * 60
print(banner)
print("SURPRISING OR COUNTERINTUITIVE FINDINGS")
print(banner)

# Flag features whose normalized importances differ by more than 0.1
# in either direction.
rf_norm = comparison['rf_importance_norm']
shap_norm = comparison['shap_importance_norm']
comparison['importance_diff'] = (rf_norm - shap_norm).abs()
comparison['shap_higher'] = shap_norm > rf_norm + 0.1
comparison['rf_higher'] = rf_norm > shap_norm + 0.1

print("\n1. Features with Higher SHAP Importance than RF Importance:")
print("   (These features may have complex interactions)")
shap_higher_features = (
    comparison.loc[comparison['shap_higher']]
    .sort_values('shap_importance', ascending=False)
    .head(10)
)
for _, row in shap_higher_features.iterrows():
    name, shap_part, rf_part = row['feature'], row['shap_importance_norm'], row['rf_importance_norm']
    print(f"   - {name:30s} | SHAP: {shap_part:.3f} | RF: {rf_part:.3f}")

print("\n2. Features with Higher RF Importance than SHAP Importance:")
print("   (These features may have more consistent effects)")
rf_higher_features = (
    comparison.loc[comparison['rf_higher']]
    .sort_values('rf_importance', ascending=False)
    .head(10)
)
for _, row in rf_higher_features.iterrows():
    name, rf_part, shap_part = row['feature'], row['rf_importance_norm'], row['shap_importance_norm']
    print(f"   - {name:30s} | RF: {rf_part:.3f} | SHAP: {shap_part:.3f}")

# Spread of each feature's SHAP values relative to its mean magnitude.
print("\n3. Features with High Variability in SHAP Values:")
print("   (These features have context-dependent effects)")
shap_std = pd.DataFrame(
    {
        'feature': X_test_sample.columns,
        'shap_std': shap_values_fraud.std(axis=0),
        'shap_mean_abs': np.abs(shap_values_fraud).mean(axis=0),
    }
)
# Ratio of standard deviation to mean |SHAP|; the small constant guards
# against division by zero for features whose SHAP values are all zero.
shap_std['cv'] = shap_std['shap_std'] / (shap_std['shap_mean_abs'] + 1e-10)
high_variability = shap_std.nlargest(10, 'cv')
for _, row in high_variability.iterrows():
    name, ratio, magnitude = row['feature'], row['cv'], row['shap_mean_abs']
    print(f"   - {name:30s} | CV: {ratio:.3f} | Mean |SHAP|: {magnitude:.4f}")


## 7. Business Recommendations

This section translates the SHAP findings above into actionable recommendations for fraud prevention.


In [None]:
# Summary statistics for the top SHAP features, split by true label.
wide_rule = "=" * 80
print(wide_rule)
print("BUSINESS RECOMMENDATIONS BASED ON SHAP ANALYSIS")
print(wide_rule)

# Split the sampled rows once by true label, then summarise each feature.
fraud_rows = X_test_sample[y_test_sample == 1]
legitimate_rows = X_test_sample[y_test_sample == 0]

top_features_analysis = []
for feature in top_5_shap['feature'].head(5):
    fraud_values = fraud_rows[feature]
    legitimate_values = legitimate_rows[feature]

    feature_stats = {
        'feature': feature,
        'fraud_mean': fraud_values.mean(),
        'fraud_median': fraud_values.median(),
        'legitimate_mean': legitimate_values.mean(),
        'legitimate_median': legitimate_values.median(),
        # Positive when the feature is higher on average for fraud rows.
        'difference': fraud_values.mean() - legitimate_values.mean(),
    }
    top_features_analysis.append(feature_stats)

analysis_df = pd.DataFrame(top_features_analysis)
print("\nFeature Statistics for Top 5 Fraud Drivers:")
print(analysis_df.to_string(index=False))


### Recommendation 1: Time-Based Verification

**SHAP Insight**: `time_since_signup` is one of the top drivers of fraud predictions.

**Recommendation**: 
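Transactions occurring within a short window after signup should receive additional verification steps. A **24-hour** window is the candidate threshold evaluated in the code cell below; a 48-hour window is a possible wider alternative.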

**Implementation**:
- Calculate time between signup and purchase
- Flag transactions with `time_since_signup < threshold` for manual review
- Apply stricter verification (2FA, phone verification) for new accounts

**Expected Impact**: Reduce false negatives by catching fraudsters who act quickly after account creation.


In [None]:
# Share of fraud vs. legitimate rows that fall below a fixed time_since_signup cutoff.
if 'time_since_signup' not in X_test_sample.columns:
    print("'time_since_signup' feature not found in dataset")
else:
    signup_gap = X_test_sample['time_since_signup']
    fraud_time = signup_gap[y_test_sample == 1]
    legitimate_time = signup_gap[y_test_sample == 0]

    # Fixed candidate cutoffs, interpreted as hours.
    # NOTE(review): the data is read from a "processed" file; if the column was
    # scaled during preprocessing these values are not hours -- confirm.
    threshold_24h = 24  # 24 hours
    threshold_48h = 48  # 48 hours

    # Fraction of each class that falls below the 24-hour cutoff.
    fraud_below_24h = (fraud_time < threshold_24h).mean()
    legitimate_below_24h = (legitimate_time < threshold_24h).mean()

    print(f"Fraud transactions within 24 hours of signup: {fraud_below_24h:.2%}")
    print(f"Legitimate transactions within 24 hours of signup: {legitimate_below_24h:.2%}")
    print(f"\nRecommended threshold: {threshold_24h} hours")
    print(f"Rationale: {fraud_below_24h:.1%} of fraud cases occur within this window")


### Recommendation 2: Transaction Velocity Monitoring

**SHAP Insight**: Transaction frequency features (`transactions_last_24h`, `transactions_last_7d`, `transactions_last_30d`) are key fraud indicators.

**Recommendation**: 
Implement real-time transaction velocity monitoring with automatic flagging for suspicious patterns.

**Implementation**:
- Monitor number of transactions per user in rolling time windows (24h, 7d, 30d)
- Set dynamic thresholds based on user's historical behavior
- Flag accounts with sudden spikes in transaction frequency
- Combine with amount-based rules (e.g., multiple high-value transactions in short time)

**Expected Impact**: Catch fraudsters who make multiple fraudulent transactions quickly, reducing both false negatives and financial losses.


In [None]:
# Compare transaction-count features between fraud and legitimate rows.
velocity_features = ['transactions_last_24h', 'transactions_last_7d', 'transactions_last_30d']
# Keep only the velocity columns that actually exist in the sample.
available_velocity = [name for name in velocity_features if name in X_test_sample.columns]

if not available_velocity:
    print("Transaction velocity features not found in dataset")
else:
    print("Transaction Velocity Analysis:")
    print("=" * 60)
    fraud_mask = y_test_sample == 1
    legitimate_mask = y_test_sample == 0
    for feature in available_velocity:
        fraud_vel = X_test_sample[fraud_mask][feature]
        legitimate_vel = X_test_sample[legitimate_mask][feature]

        print(f"\n{feature}:")
        print(f"  Fraud - Mean: {fraud_vel.mean():.2f}, Median: {fraud_vel.median():.2f}, Max: {fraud_vel.max():.2f}")
        print(f"  Legitimate - Mean: {legitimate_vel.mean():.2f}, Median: {legitimate_vel.median():.2f}, Max: {legitimate_vel.max():.2f}")

        # Candidate cutoff: the value that 95% of legitimate rows stay at or below,
        # and the share of fraud rows that exceed it.
        threshold = legitimate_vel.quantile(0.95)
        fraud_above_threshold = (fraud_vel > threshold).mean()
        print(f"  Suggested threshold (95th percentile of legitimate): {threshold:.2f}")
        print(f"  {fraud_above_threshold:.1%} of fraud cases exceed this threshold")


### Recommendation 3: Geolocation-Based Risk Scoring

**SHAP Insight**: Country and IP-based features contribute significantly to fraud predictions.

**Recommendation**: 
Implement geolocation-based risk scoring with country-level fraud rate monitoring.

**Implementation**:
- Map IP addresses to countries (already implemented)
- Calculate country-specific fraud rates
- Flag transactions from high-risk countries
- Combine with device fingerprinting (device_id) for additional verification
- Monitor for VPN/proxy usage patterns

**Expected Impact**: Reduce fraud from known high-risk regions while minimizing false positives through multi-factor verification.


In [None]:
# Analyze country-based patterns (if country features exist).
# The match is case-insensitive, so a single lower-cased test covers both
# 'country' and 'Country' spellings.
country_features = [col for col in X_test_sample.columns if 'country' in col.lower()]

if country_features:
    print("Country-Based Risk Analysis:")
    print("="*60)
    for feature in country_features[:5]:  # Limit to first 5 country features
        # Accept every numeric dtype, including boolean indicator columns.
        # A whitelist of only int64/float64 silently skips one-hot encoded
        # columns that were read back as bool, producing no output for them.
        if pd.api.types.is_numeric_dtype(X_test_sample[feature]):
            # Cast to float so mean/median behave uniformly across
            # int, float and bool columns.
            country_column = X_test_sample[feature].astype(float)
            fraud_country = country_column[y_test_sample == 1]
            legitimate_country = country_column[y_test_sample == 0]

            print(f"\n{feature}:")
            print(f"  Fraud - Mean: {fraud_country.mean():.2f}, Median: {fraud_country.median():.2f}")
            print(f"  Legitimate - Mean: {legitimate_country.mean():.2f}, Median: {legitimate_country.median():.2f}")
else:
    print("Country features not found in dataset")
    print("Note: Country information may be encoded in other features")


### Recommendation 4: Device and Browser Pattern Analysis

**SHAP Insight**: Device and browser features show significant importance in fraud detection.

**Recommendation**: 
Monitor device and browser usage patterns for anomaly detection.

**Implementation**:
- Track device_id usage patterns (multiple accounts from same device)
- Monitor browser type and version for suspicious patterns
- Flag accounts with frequent device/browser changes
- Implement device fingerprinting for persistent tracking

**Expected Impact**: Identify fraudsters using multiple accounts or compromised devices.


### Recommendation 5: Purchase Value and User Profile Analysis

**SHAP Insight**: Purchase value and user demographics (age, sex) contribute to fraud predictions.

**Recommendation**: 
Implement value-based and profile-based risk scoring.

**Implementation**:
- Flag transactions significantly above user's historical average purchase value
- Monitor age-related patterns (very young or very old users making high-value transactions)
- Combine purchase value with transaction velocity for enhanced detection
- Use user profile consistency checks (e.g., age vs. purchase patterns)

**Expected Impact**: Catch fraudsters making unusually high-value transactions or using inconsistent profiles.


In [None]:
# Compare value-related columns between fraud and legitimate rows.
# Any column whose lower-cased name contains 'value' is selected; this also
# covers names containing 'purchase_value'.
value_features = [name for name in X_test_sample.columns if 'value' in name.lower()]

if not value_features:
    print("Purchase value features not found in dataset")
else:
    print("Purchase Value Analysis:")
    print("=" * 60)
    fraud_mask = y_test_sample == 1
    legitimate_mask = y_test_sample == 0
    for feature in value_features[:3]:  # only the first three matches are reported
        # Only int64/float64 columns are summarised; other dtypes are skipped.
        if X_test_sample[feature].dtype not in ['int64', 'float64']:
            continue

        fraud_value = X_test_sample[fraud_mask][feature]
        legitimate_value = X_test_sample[legitimate_mask][feature]

        print(f"\n{feature}:")
        print(f"  Fraud - Mean: ${fraud_value.mean():.2f}, Median: ${fraud_value.median():.2f}, Max: ${fraud_value.max():.2f}")
        print(f"  Legitimate - Mean: ${legitimate_value.mean():.2f}, Median: ${legitimate_value.median():.2f}, Max: ${legitimate_value.max():.2f}")

        # Cutoff at the 95th percentile of legitimate rows, and the share of
        # fraud rows above it.
        threshold_95 = legitimate_value.quantile(0.95)
        fraud_above_threshold = (fraud_value > threshold_95).mean()
        print(f"  95th percentile threshold: ${threshold_95:.2f}")
        print(f"  {fraud_above_threshold:.1%} of fraud cases exceed this threshold")


## 8. Summary and Next Steps

### Key Findings:
1. **Top 5 Fraud Drivers** (from SHAP analysis):
   - Identified through mean absolute SHAP values
   - These features have the strongest impact on fraud predictions

2. **SHAP vs Built-in Importance**:
   - Generally aligned, but SHAP reveals feature interactions
   - Some features show context-dependent effects

3. **Individual Prediction Insights**:
   - True Positives: Correctly identified fraud cases
   - False Positives: Legitimate transactions incorrectly flagged
   - False Negatives: Missed fraud cases requiring attention

### Business Recommendations Summary:
1. **Time-Based Verification**: Flag transactions within 24-48 hours of signup
2. **Transaction Velocity Monitoring**: Real-time monitoring of transaction frequency
3. **Geolocation Risk Scoring**: Country-based fraud rate monitoring
4. **Device/Browser Analysis**: Pattern-based anomaly detection
5. **Value and Profile Analysis**: Purchase value and user profile consistency checks

### Next Steps:
- Implement recommended rules in production system
- A/B test the impact of new verification steps
- Monitor false positive rates to ensure good user experience
- Continuously update risk thresholds based on new fraud patterns
