# Pet Insurance Claims SHAP Analysis Tutorial

This notebook provides an interactive tutorial for using SHAP to explain GPT model decisions on pet insurance claims.

## 1. Setup and Imports

In [1]:
import sys
sys.path.append('..')

import warnings
warnings.filterwarnings('ignore')

from src.shap_explainer import PetClaimExplainer
from src.semantic_analyzer import SemanticAnalyzer
from src.visualization import ClaimVisualizer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

print("Setup complete!")

Setup complete!


## 2. Initialize Components

In [3]:
# Initialize explainer (using GPT-2 as example).
# NOTE: the recorded output of this cell warns that `score.weight` of
# GPT2ForSequenceClassification is newly initialized for the stock `gpt2`
# checkpoint, so predictions are not meaningful until the model is fine-tuned.
explainer = PetClaimExplainer(model_name='gpt2')
print("✓ SHAP Explainer initialized")

# Initialize semantic analyzer
analyzer = SemanticAnalyzer()
print("✓ Semantic Analyzer initialized")

# Initialize visualizer.
# NOTE(review): in the recorded run this constructor raised
# OSError("'seaborn' is not a valid package style ..."), so `visualizer` was
# never bound and the plotting cells fail with NameError. Presumably
# ClaimVisualizer selects the 'seaborn' matplotlib style, which recent
# matplotlib releases no longer provide under that name — confirm in
# src/visualization.
visualizer = ClaimVisualizer()
print("✓ Visualizer initialized")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ SHAP Explainer initialized
✓ Semantic Analyzer initialized


OSError: 'seaborn' is not a valid package style, path of style file, URL of style file, or library style name (library styles are listed in `style.available`)

## 3. Single Claim Analysis

In [4]:
# Example claim
claim_text = "Emergency surgery required for my 3-year-old dog after eating chocolate. Severe poisoning symptoms. Total cost $3,500."

# Generate explanation
explanation = explainer.explain_claim(claim_text)

# Display results
print(f"Claim: {claim_text}")
print(f"\nDecision: {explanation['prediction']}")
print(f"Confidence: {explanation['confidence']:.1%}")
print(f"\nMost influential factors:")
for token, importance in explanation['influential_tokens'][:5]:
    impact = "positive" if importance > 0 else "negative"
    print(f"  • '{token}': {impact} impact ({importance:.3f})")

ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).

In [5]:
# Visualize the explanation
fig = visualizer.plot_single_explanation(claim_text, explanation)
plt.show()

NameError: name 'visualizer' is not defined

## 4. Semantic Feature Analysis

In [None]:
# Extract semantic features
features = analyzer.extract_features(claim_text)

# Display features
print("Semantic Features:")
for feature, value in features.items():
    if isinstance(value, bool):
        print(f"  {feature}: {'Yes' if value else 'No'}")
    elif isinstance(value, (int, float)):
        print(f"  {feature}: {value:.2f}" if isinstance(value, float) else f"  {feature}: {value}")

## 5. Batch Analysis

In [None]:
# Sample dataset
sample_claims = [
    "Emergency surgery for dog who ate chocolate. Bill: $4000",
    "Routine dental cleaning for cat. Cost: $300",
    "Dog hit by car, multiple fractures. Emergency treatment $5500",
    "Annual wellness checkup and vaccinations. $150",
    "Cat diagnosed with diabetes, needs insulin. Monthly cost $200",
    "Preventive flea and tick medication. $75"
]

# Process all claims
explanations = explainer.explain_batch(sample_claims)

# Create results dataframe
results = pd.DataFrame({
    'claim': sample_claims,
    'prediction': [exp['prediction'] for exp in explanations],
    'confidence': [exp['confidence'] for exp in explanations]
})

print(results)

In [None]:
# Visualize batch summary
fig = visualizer.plot_batch_summary(explanations, sample_claims)
plt.show()

## 6. Pattern Discovery

In [None]:
# Identify patterns
predictions = [exp['prediction'] for exp in explanations]
patterns = analyzer.identify_patterns(sample_claims, predictions)

# Display key differences
print("Key Differences between Approved and Rejected Claims:")
for feature, diff in patterns['key_differences'].items():
    print(f"\n{feature}:")
    print(f"  Approved: {diff['approved_value']:.2f}")
    print(f"  Rejected: {diff['rejected_value']:.2f}")
    print(f"  Difference: {diff['difference']:.2f}")

In [None]:
# Generate business rules
rules = analyzer.generate_business_rules(patterns)

print("Generated Business Rules:")
for i, rule in enumerate(rules, 1):
    print(f"{i}. {rule}")

## 7. Interactive Claim Testing

In [None]:
def test_claim(claim_text):
    """Interactive function to test any claim"""
    # Get explanation
    explanation = explainer.explain_claim(claim_text)
    features = analyzer.extract_features(claim_text)
    
    # Display results
    print(f"\n{'='*60}")
    print(f"CLAIM: {claim_text}")
    print(f"{'='*60}")
    print(f"\nDECISION: {explanation['prediction']} (Confidence: {explanation['confidence']:.1%})")
    
    print("\nKEY FACTORS:")
    for token, imp in explanation['influential_tokens'][:3]:
        arrow = "↑" if imp > 0 else "↓"
        print(f"  {arrow} {token} ({imp:+.3f})")
    
    print("\nSEMANTIC ANALYSIS:")
    print(f"  Emergency: {'Yes' if features['is_emergency'] else 'No'}")
    print(f"  Preventive: {'Yes' if features['is_preventive'] else 'No'}")
    print(f"  Severity Score: {features['severity_score']:.2f}")
    if features['cost_amount']:
        print(f"  Cost: ${features['cost_amount']:,.0f}")

# Test your own claims: replace the text below with any claim description
test_claim("My puppy needs emergency stomach surgery after swallowing a toy. Vet bill is $2800.")

In [None]:
# Try another example: a routine, low-cost grooming claim
test_claim("Annual teeth cleaning and nail trim for my healthy 5-year-old cat. Cost $125.")

## 8. Advanced Analysis: Decision Rules Extraction

In [None]:
# Extract decision rules from multiple explanations
decision_rules = explainer.get_decision_rules(explanations)

print("Approval Indicators (with >10% support):")
for token, stats in sorted(decision_rules['approval_indicators'].items(), 
                         key=lambda x: x[1]['avg_importance'], reverse=True)[:5]:
    print(f"  '{token}': avg importance = {stats['avg_importance']:.3f}, "
          f"frequency = {stats['frequency']}, support = {stats['support']:.1%}")

print("\nRejection Indicators (with >10% support):")
for token, stats in sorted(decision_rules['rejection_indicators'].items(), 
                         key=lambda x: x[1]['avg_importance'])[:5]:
    print(f"  '{token}': avg importance = {stats['avg_importance']:.3f}, "
          f"frequency = {stats['frequency']}, support = {stats['support']:.1%}")

## 9. Export Results

In [None]:
# Save detailed results
results_detailed = results.copy()

# Add semantic features
for i, claim in enumerate(sample_claims):
    features = analyzer.extract_features(claim)
    results_detailed.loc[i, 'is_emergency'] = features['is_emergency']
    results_detailed.loc[i, 'is_preventive'] = features['is_preventive']
    results_detailed.loc[i, 'severity_score'] = features['severity_score']
    results_detailed.loc[i, 'cost_amount'] = features['cost_amount']

# Save to CSV
results_detailed.to_csv('claim_analysis_results.csv', index=False)
print("Results saved to 'claim_analysis_results.csv'")

# Display summary statistics
print("\nSummary Statistics:")
print(results_detailed.groupby('prediction')[['confidence', 'severity_score']].agg(['mean', 'std']))

## 10. Next Steps

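1. **Fine-tune the model**: Train on your actual claims data — the stock `gpt2` checkpoint used in this tutorial has a newly initialized classification head, so its predictions are only placeholders until you do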
2. **Customize semantic features**: Add domain-specific patterns
3. **Deploy to production**: Integrate with your claims system
4. **Monitor and improve**: Track accuracy and update rules

For more information, see the project README and documentation.