# Chapter 19: Deploying Experiment-Trained Models - Safe Retraining Pipelines and Governance

This notebook contains all code examples from Chapter 19, demonstrating:

1. **SafeExperimentRetrainer Pipeline (Section 4.2)**
   - Temporal separation with 14-day buffer
   - Cross-experiment validation
   - Automated contamination checks

2. **ContaminationMonitor (Section 4.3)**
   - Holdout vs production performance tracking
   - Distribution shift detection
   - Alerting for degradation

3. **Fairness Strategies (Section 5.1)**
   - Strategy 1: Stratified sampling (fixes bias at source)
   - Strategy 2: Bias auditing (detects disparate impact)
   - Strategy 3: Fairness constraints (policy enforcement)

## Setup: Install Required Packages

In [None]:
# Install required packages (uncomment if needed)
# !pip install pandas numpy scikit-learn scipy fairlearn matplotlib seaborn

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import warnings
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from scipy.stats import ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns

# For fairness constraints (Strategy 3 in Section 5.1).
# fairlearn is treated as optional: the import is attempted once and the outcome is
# recorded in FAIRLEARN_AVAILABLE, which the Strategy 3 cell checks before running.
try:
    from fairlearn.reductions import ExponentiatedGradient, DemographicParity
    FAIRLEARN_AVAILABLE = True
except ImportError:
    FAIRLEARN_AVAILABLE = False
    print("Warning: fairlearn not installed. Fairness constraints example will be skipped.")
    print("To install: pip install fairlearn")

# Set random seed for reproducibility.
# This seeds NumPy's global generator, which the np.random.* calls in this notebook use.
np.random.seed(42)

print("All libraries imported successfully!")

## Generate Synthetic Experiment Data

Create synthetic experiment data to demonstrate the pipeline patterns.

In [None]:
def generate_experiment_data(n_users=10000, n_experiments=5):
    """
    Generate synthetic experiment data for demonstrating retraining pipeline.

    Experiments run back to back starting 2024-01-01, each lasting two weeks.
    Every user appears once in every experiment with freshly drawn features
    and a 50/50 treatment assignment. The outcome is a Bernoulli draw whose
    probability is a logistic function of the features, plus a fixed 0.1
    uplift for treated users (capped at 1.0).

    All randomness comes from NumPy's global generator, so seed it with
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n_users: Number of unique users
        n_experiments: Number of experiments to generate

    Returns:
        DataFrame with one row per (experiment, user) and the columns:
        - user_id, experiment_id, treatment, outcome
        - experiment_start_date, experiment_end_date
        - feature1, feature2 (features for model training)
        - experiment_status (always 'completed')
        The columns are present even when no rows are generated
        (n_users or n_experiments is 0), so downstream column access
        does not fail on an empty result.
    """
    # Fixed schema: guarantees the same columns, in the same order, for both
    # the populated and the empty case.
    columns = [
        'user_id',
        'experiment_id',
        'experiment_start_date',
        'experiment_end_date',
        'treatment',
        'feature1',
        'feature2',
        'outcome',
        'experiment_status',
    ]

    true_effect = 0.1  # 10% treatment effect (additive on the probability)
    base_date = datetime(2024, 1, 1)
    data = []

    for exp_id in range(1, n_experiments + 1):
        # Each experiment runs for 2 weeks
        exp_start = base_date + timedelta(weeks=(exp_id - 1) * 2)
        exp_end = exp_start + timedelta(weeks=2)

        for user_id in range(n_users):
            # NOTE: the order of the random draws below (feature1, feature2,
            # treatment, outcome) determines the seeded output; keep it.

            # User features
            feature1 = np.random.randn()
            feature2 = np.random.randn()

            # Treatment assignment (50/50 split)
            treatment = np.random.choice([0, 1])

            # Outcome depends on features and treatment
            base_probability = 1 / (1 + np.exp(-(0.5 * feature1 + 0.3 * feature2)))

            if treatment == 1:
                outcome_prob = base_probability + true_effect
            else:
                outcome_prob = base_probability

            # Cap at 1.0: the additive uplift can push the probability past 1
            outcome = np.random.binomial(1, min(outcome_prob, 1.0))

            data.append({
                'user_id': user_id,
                'experiment_id': exp_id,
                'experiment_start_date': exp_start,
                'experiment_end_date': exp_end,
                'treatment': treatment,
                'feature1': feature1,
                'feature2': feature2,
                'outcome': outcome,
                'experiment_status': 'completed'
            })

    return pd.DataFrame(data, columns=columns)

# Generate experiment data
# (1000 users x 5 experiments; the generator emits one row per user per experiment)
experiment_data = generate_experiment_data(n_users=1000, n_experiments=5)

# Summarize the generated data: row count, distinct experiments and users,
# and the span of experiment end dates.
print(f"Generated {len(experiment_data)} experiment records")
print(f"\nExperiments: {experiment_data['experiment_id'].nunique()}")
print(f"Users: {experiment_data['user_id'].nunique()}")
print(f"Date range: {experiment_data['experiment_end_date'].min()} to {experiment_data['experiment_end_date'].max()}")
print("\nSample data:")
print(experiment_data.head())

## Section 4.1: Mock Database for SafeExperimentRetrainer

Create a simple database mock to demonstrate the pipeline.

In [None]:
class MockDatabase:
    """
    In-memory stand-in for the experiment database used by SafeExperimentRetrainer.

    Holds the full experiment DataFrame plus the collection of permanent-holdout
    user ids, and answers every query by filtering that DataFrame.
    In production, this would be replaced with actual database connections.
    """
    def __init__(self, experiment_data, holdout_users):
        self.experiment_data = experiment_data
        self.holdout_users = holdout_users

    def query(self, query_string, **params):
        """
        Return the rows a real query would have selected.

        The query text itself is not interpreted. The filters are applied to a
        copy of the stored data, so the original frame is left untouched:
        - if a 'cutoff' parameter is supplied, keep rows whose
          experiment_end_date is strictly before it
        - drop rows belonging to holdout users
        - keep only rows whose experiment_status is 'completed'
        """
        records = self.experiment_data.copy()

        # Optional temporal filter
        if 'cutoff' in params:
            ended_before_cutoff = records['experiment_end_date'] < params['cutoff']
            records = records[ended_before_cutoff]

        # Permanent holdout users never reach training
        belongs_to_holdout = records['user_id'].isin(self.holdout_users)
        records = records[~belongs_to_holdout]

        # Only finished experiments
        is_completed = records['experiment_status'] == 'completed'
        return records[is_completed]

# Create permanent holdout (5% of users)
all_users = experiment_data['user_id'].unique()
# int() truncates, so the holdout is at most 5% of the distinct users
n_holdout = int(len(all_users) * 0.05)
# replace=False: each holdout user id is drawn at most once
permanent_holdout_users = np.random.choice(all_users, n_holdout, replace=False)

# Create mock database
# `db` is the module-level name that SafeExperimentRetrainer looks up when querying.
db = MockDatabase(experiment_data, permanent_holdout_users)

print(f"Mock database created with {len(experiment_data)} records")
print(f"Permanent holdout: {len(permanent_holdout_users)} users ({len(permanent_holdout_users)/len(all_users)*100:.1f}%)")

## Section 4.2: SafeExperimentRetrainer Implementation

Complete implementation from Chapter 19, Section 4.2

In [None]:
class ContaminationError(Exception):
    """Raised when contamination risk is detected."""
    pass

class ValidationError(Exception):
    """Raised when model fails validation."""
    pass

class SafeExperimentRetrainer:
    """
    Production-grade pipeline for safely retraining models on experiment data.

    Combines:
    - Temporal separation (Pattern 1 from Section 3.1)
    - Diverse experiment training with validation (Pattern 2 from Section 3.2)

    NOTE: propensity weighting (Chapter 18) is part of the chapter's design
    but is not applied anywhere in this class.

    Attributes:
        holdout_rate: Stored for reference only; no method of this class
            reads it (holdout exclusion is done by the data source).
        temporal_buffer: timedelta built from ``temporal_buffer_days``.
        min_acceptable_score: Minimum cross-experiment validation score.
        database: Optional data source exposing ``query(sql, **params)``.
            When None, the module-level ``db`` object is used.
    """

    def __init__(self, holdout_rate=0.05, temporal_buffer_days=14, min_acceptable_score=0.6,
                 database=None):
        self.holdout_rate = holdout_rate
        self.temporal_buffer = timedelta(days=temporal_buffer_days)
        self.min_acceptable_score = min_acceptable_score
        self.database = database

    def get_safe_training_data(self, current_deployment_date):
        """
        Retrieve experiment data that is safe to train on.

        Safety checks:
        1. Temporal: Data must predate current deployment by buffer period
        2. Holdout: Exclude users in permanent holdout group
        3. Completeness: Experiments must have fully ended

        Args:
            current_deployment_date: Date the cutoff is measured back from.

        Returns:
            The records returned by the data source, after validation.

        Raises:
            ContaminationError: If no records are returned, or if any record
                ends on or after the cutoff date.
        """
        cutoff_date = current_deployment_date - self.temporal_buffer

        # Prefer the injected data source; fall back to the notebook-level
        # `db` so existing callers that rely on the global keep working.
        source = self.database if self.database is not None else db

        # Query database (in production this would be SQL query)
        data = source.query("""
            SELECT * FROM experiments
            WHERE experiment_end_date < :cutoff
            AND user_id NOT IN (SELECT user_id FROM permanent_holdout)
            AND experiment_status = 'completed'
        """, cutoff=cutoff_date)

        # Validation (defence in depth: re-check what the query should
        # already have guaranteed)
        self._validate_temporal_safety(data, current_deployment_date)
        self._validate_no_future_leakage(data)

        return data

    def _validate_temporal_safety(self, data, deployment_date):
        """
        Ensure every record ended strictly before the buffer cutoff.

        Raises:
            ContaminationError: If ``data`` is empty, or its latest
                experiment_end_date is on or after
                ``deployment_date - temporal_buffer``.
        """
        if len(data) == 0:
            raise ContaminationError("No training data available after temporal filtering.")

        max_date = data['experiment_end_date'].max()
        buffer_date = deployment_date - self.temporal_buffer

        if max_date >= buffer_date:
            raise ContaminationError(
                f"Training data includes recent experiments: "
                f"{max_date} >= {buffer_date}. "
                f"Risk of feedback loops."
            )

    def _validate_no_future_leakage(self, data):
        """
        Check for suspicious patterns indicating contamination.

        Emits a warning (does not raise) when the difference in mean outcome
        between treatment groups exceeds 0.5. Skipped when fewer than two
        treatment groups are present.
        """
        # Example: Check if treatment effects are suspiciously large
        treatment_means = data.groupby('treatment')['outcome'].mean()
        if len(treatment_means) < 2:
            return

        # groupby sorts by treatment value, so the last diff is
        # (highest treatment value) - (the one before it)
        ate = treatment_means.diff().iloc[-1]

        if ate > 0.5:  # Threshold based on domain knowledge
            warnings.warn(
                f"Unusually large treatment effect ({ate:.2%}). "
                f"Possible contamination or cherry-picked experiments."
            )

    def retrain_with_validation(self, model, current_deployment_date):
        """
        Retrain model with counterfactual cross-validation.

        The safe experiments are shuffled and split in half by experiment id:
        the model is fitted on one half and scored on the other, so the
        validation score reflects generalization across experiments.

        Args:
            model: Scikit-learn compatible model with fit/score methods
            current_deployment_date: Current date for temporal filtering

        Returns:
            Tuple of (trained_model, validation_score)

        Raises:
            ContaminationError: Propagated from get_safe_training_data.
            ValidationError: If fewer than two experiments are available
                (no cross-experiment split is possible), or if the validation
                score is below ``min_acceptable_score``.
        """
        # Get safe training data
        all_data = self.get_safe_training_data(current_deployment_date)

        print(f"Retrieved {len(all_data)} safe training records")
        print(f"Experiments used: {sorted(all_data['experiment_id'].unique())}")

        # Split by experiment ID (not users) - implements Pattern 2 from Section 3.2
        experiment_ids = all_data['experiment_id'].unique()

        # With a single experiment the training half would be empty
        # (1 // 2 == 0) and the model would be fitted on zero rows.
        if len(experiment_ids) < 2:
            raise ValidationError(
                f"Cross-experiment validation requires at least 2 experiments; "
                f"found {len(experiment_ids)}."
            )

        np.random.shuffle(experiment_ids)

        split_at = len(experiment_ids) // 2
        train_exp_ids = experiment_ids[:split_at]
        val_exp_ids = experiment_ids[split_at:]

        train_data = all_data[all_data['experiment_id'].isin(train_exp_ids)]
        val_data = all_data[all_data['experiment_id'].isin(val_exp_ids)]

        print(f"\nTrain experiments: {sorted(train_exp_ids)}")
        print(f"Validation experiments: {sorted(val_exp_ids)}")
        print(f"Train samples: {len(train_data)}, Validation samples: {len(val_data)}")

        # Prepare features and labels
        X_train = train_data[['feature1', 'feature2']]
        y_train = train_data['outcome']

        X_val = val_data[['feature1', 'feature2']]
        y_val = val_data['outcome']

        # Train
        model.fit(X_train, y_train)

        # Validate on separate experiments
        val_score = model.score(X_val, y_val)

        print(f"\nCross-experiment validation score: {val_score:.3f}")

        if val_score < self.min_acceptable_score:
            raise ValidationError(
                f"Model fails cross-experiment validation: {val_score:.3f} < {self.min_acceptable_score}"
            )

        return model, val_score

print("SafeExperimentRetrainer class defined successfully!")

## Section 4.2: Demonstrate SafeExperimentRetrainer Usage

In [None]:
# Initialize retrainer
# (min_acceptable_score is set to 0.5 here, below the constructor default of 0.6)
retrainer = SafeExperimentRetrainer(holdout_rate=0.05, temporal_buffer_days=14, min_acceptable_score=0.5)

# Create a simple model
my_model = LogisticRegression(random_state=42)

# Simulate current deployment date (4 weeks after last experiment)
# With the 14-day buffer the cutoff lands 2 weeks after the last experiment ended,
# so every generated experiment ends before it.
latest_exp_date = experiment_data['experiment_end_date'].max()
current_date = latest_exp_date + timedelta(weeks=4)

print(f"Current deployment date: {current_date}")
print(f"Temporal buffer: 14 days")
print(f"Will use experiments ending before: {current_date - timedelta(days=14)}\n")

# Both failure modes of the pipeline are reported rather than propagated,
# so the notebook keeps running either way.
try:
    new_model, val_score = retrainer.retrain_with_validation(
        model=my_model,
        current_deployment_date=current_date
    )
    print(f"\n✅ Retraining successful! Validation score: {val_score:.3f}")
except ContaminationError as e:
    print(f"\n❌ Training aborted due to contamination risk: {e}")
except ValidationError as e:
    print(f"\n❌ Training aborted due to validation failure: {e}")

## Section 4.3: ContaminationMonitor Implementation

Post-deployment monitoring from Chapter 19, Section 4.3

In [None]:
class ContaminationMonitor:
    """
    Monitor deployed models for signs of contamination in production.
    Compares production (95%) vs permanent holdout (5%) performance.
    """

    def check_holdout_vs_production(self, holdout_metrics, production_metrics):
        """
        Compare holdout vs production performance at current point in time.

        Expected: Production (trained ML model) should outperform holdout (baseline)
        Red flag: Production underperforms holdout → model learned patterns that don't work

        Args:
            holdout_metrics: Mapping with a 'conversion_rate' entry for the holdout group.
            production_metrics: Mapping with a 'conversion_rate' entry for production.

        Side effects:
            Prints both rates. Calls ``self.alert`` when production is below
            holdout; otherwise prints the relative improvement (or a note that
            it is undefined when the holdout rate is 0).
        """
        holdout_conversion = holdout_metrics['conversion_rate']
        prod_conversion = production_metrics['conversion_rate']

        print(f"Holdout (baseline V0) conversion: {holdout_conversion:.3f}")
        print(f"Production (ML model) conversion: {prod_conversion:.3f}")

        # Production should be better than holdout (or at worst, equal)
        if prod_conversion < holdout_conversion:
            # For non-negative rates the holdout rate is strictly positive
            # in this branch, so the division is safe.
            degradation = (holdout_conversion - prod_conversion) / holdout_conversion
            self.alert(
                f"CONTAMINATION: Production underperforms holdout by {degradation:.1%}. "
                f"Holdout (baseline): {holdout_conversion:.3f}, "
                f"Production (ML model): {prod_conversion:.3f}. "
                f"Model is worse than doing nothing!"
            )
        elif holdout_conversion == 0:
            # A zero baseline has no meaningful relative improvement; report
            # the healthy state without dividing by zero.
            print(
                "✅ Production matches or outperforms holdout "
                "(holdout conversion is 0, so relative improvement is undefined)"
            )
        else:
            improvement = (prod_conversion - holdout_conversion) / holdout_conversion
            print(f"✅ Production outperforms holdout by {improvement:.1%}")

    def check_distribution_shift(self, training_features, production_features, alpha=0.01):
        """
        Detect if production data distribution diverges from training data.

        Runs a two-sample Kolmogorov-Smirnov test per column of
        ``training_features`` against the same-named column of
        ``production_features``.

        Red flag: Large distribution shifts may indicate the model is changing
        user behavior in unexpected ways (contamination feedback loop).

        Args:
            training_features: DataFrame of features used for training.
            production_features: DataFrame containing at least the same columns.
            alpha: Significance level; a column triggers an alert when its
                KS p-value is below this. Defaults to 0.01. No correction for
                testing multiple columns is applied.

        Side effects:
            Prints one line per column and calls ``self.alert`` for each
            column whose p-value is below ``alpha``.
        """
        print("\nChecking distribution shift...")

        for feature in training_features.columns:
            stat, p_value = ks_2samp(
                training_features[feature],
                production_features[feature]
            )

            # end="" so the status marker lands on the same line
            print(f"  {feature}: KS statistic={stat:.3f}, p-value={p_value:.4f}", end="")

            if p_value < alpha:
                print(" ⚠️")
                self.alert(
                    f"Distribution shift detected in '{feature}' "
                    f"(KS statistic: {stat:.3f}, p-value: {p_value:.4f}). "
                    f"Model may be influencing user behavior."
                )
            else:
                print(" ✅")

    def alert(self, message):
        """Send alert to on-call team (this implementation only prints it)."""
        print(f"\n[ALERT] {message}")
        # In production: Send to PagerDuty, Slack, email, etc.

print("ContaminationMonitor class defined successfully!")

## Section 4.3: Demonstrate ContaminationMonitor Usage

In [None]:
# Generate synthetic production data
def generate_production_data(n_samples=1000, distribution_shift=False):
    """
    Build a two-column DataFrame of synthetic production features.

    Both columns start as standard-normal draws of length ``n_samples``
    (feature1 is drawn first, then feature2). When ``distribution_shift`` is
    true, feature1 has 0.5 added to it and feature2 is scaled by 1.5 to
    simulate drift; otherwise the draws are returned unchanged.
    """
    first_draw = np.random.randn(n_samples)
    second_draw = np.random.randn(n_samples)

    if distribution_shift:
        # Simulated contamination: shifted mean on one feature,
        # inflated variance on the other
        first_draw = first_draw + 0.5
        second_draw = second_draw * 1.5

    columns = {'feature1': first_draw, 'feature2': second_draw}
    return pd.DataFrame(columns)

# Create monitor
monitor = ContaminationMonitor()

# Scenario 1: Healthy system (production outperforms holdout, no distribution shift)
print("=" * 60)
print("Scenario 1: Healthy System")
print("=" * 60)

# Conversion rates are hard-coded illustrative values, not measured from the data above
holdout_metrics = {'conversion_rate': 0.15}  # Baseline
production_metrics = {'conversion_rate': 0.18}  # ML model improved

monitor.check_holdout_vs_production(holdout_metrics, production_metrics)

# Check distribution (no shift)
# No random_state is passed to sample(), so the sampled rows are not pinned by this cell.
training_features = experiment_data[['feature1', 'feature2']].sample(1000)
production_features = generate_production_data(1000, distribution_shift=False)

monitor.check_distribution_shift(training_features, production_features)

# Scenario 2: Contaminated system (production underperforms, distribution shift)
print("\n" + "=" * 60)
print("Scenario 2: Contaminated System")
print("=" * 60)

holdout_metrics = {'conversion_rate': 0.15}  # Baseline
production_metrics = {'conversion_rate': 0.12}  # ML model degraded!

monitor.check_holdout_vs_production(holdout_metrics, production_metrics)

# Check distribution (with shift)
# The same training sample from Scenario 1 is reused as the reference distribution.
production_features_shifted = generate_production_data(1000, distribution_shift=True)

monitor.check_distribution_shift(training_features, production_features_shifted)

## Section 5.1: Stratified Sampling for Fairness (Strategy 1)

From Chapter 19, Section 5.1: stratified assignment keeps the experiment sample proportionally representative of each demographic group.

In [None]:
def assign_to_experiment_with_stratification(users, strata_columns):
    """
    Flag half of the users for the experiment, stratified by the given columns.

    The values of ``strata_columns`` are joined with '_' into a single
    'strata_key' per user, and a 50/50 stratified split on that key decides
    who is in the experiment. Both distributions are printed so the match
    can be checked by eye, along with the largest per-stratum gap.

    Note: the input DataFrame is modified in place; it gains the
    'strata_key' and 'in_experiment' columns.

    Args:
        users: DataFrame with user attributes
        strata_columns: List of columns to stratify by (e.g., ['location_type', 'industry'])

    Returns:
        The same ``users`` DataFrame, now carrying the 'in_experiment' flag
    """
    # One combined key per user, e.g. "tech_hub_finance"
    strata_as_text = users[strata_columns].astype(str)
    users['strata_key'] = strata_as_text.agg('_'.join, axis=1)

    # Stratified 50/50 split; only the first half (the experiment arm) is kept
    selected, _ = train_test_split(
        users,
        test_size=0.5,
        stratify=users['strata_key'],
        random_state=42
    )
    users['in_experiment'] = users.index.isin(selected.index)

    # Share of each stratum in the whole population
    pop_dist = users['strata_key'].value_counts(normalize=True).sort_index()
    print("Population distribution:")
    print(pop_dist)

    # Share of each stratum among the users selected for the experiment
    experiment_rows = users[users['in_experiment']]
    exp_dist = experiment_rows['strata_key'].value_counts(normalize=True).sort_index()
    print("\nExperiment distribution:")
    print(exp_dist)

    # Largest absolute gap between the two distributions
    max_diff = (pop_dist - exp_dist).abs().max()
    print(f"\nMaximum distribution difference: {max_diff:.4f}")

    return users

# Example: Job recommendation experiment
# 10,000 synthetic users: location_type is drawn with a 30/70 split; industry is
# drawn from four values with no explicit probabilities given.
users = pd.DataFrame({
    'user_id': range(10000),
    'location_type': np.random.choice(['tech_hub', 'non_tech_hub'], 10000, p=[0.3, 0.7]),
    'industry': np.random.choice(['tech', 'healthcare', 'finance', 'retail'], 10000)
})

# Ensure experiment includes proportional representation
# (stratifies on every location_type x industry combination)
users = assign_to_experiment_with_stratification(
    users, 
    strata_columns=['location_type', 'industry']
)

print(f"\n✅ Result: Experiment data will be representative across location_type × industry combinations")
print(f"Training a model on this data reduces bias compared to convenience sampling")

## Section 5.1: Bias Auditing Before Deployment (Strategy 2)

Measure model performance across subgroups to detect disparate impact.

In [None]:
# Generate synthetic validation data with location groups
np.random.seed(42)
validation_data = pd.DataFrame({
    'feature1': np.random.randn(1000),
    'feature2': np.random.randn(1000),
    'location': np.random.choice(['tech_hub', 'non_tech_hub'], 1000, p=[0.3, 0.7])
})

# Baseline outcome: depends only on the two features, identically for every location
validation_data['outcome'] = (
    (validation_data['feature1'] + validation_data['feature2'] > 0).astype(int)
)

# Add location bias (tech_hub users more likely to convert):
# a tech_hub row is additionally set to 1 whenever its uniform draw exceeds 0.3
tech_hub_mask = validation_data['location'] == 'tech_hub'
validation_data.loc[tech_hub_mask, 'outcome'] = (
    (validation_data.loc[tech_hub_mask, 'outcome'] | (np.random.rand(tech_hub_mask.sum()) > 0.3)).astype(int)
)

# Train a simple model
# (location is deliberately not a feature; the model is fitted and audited on the same rows)
model = LogisticRegression(random_state=42)
X = validation_data[['feature1', 'feature2']]
y = validation_data['outcome']
model.fit(X, y)

# Check model performance across demographic groups
print("Bias Auditing: Model Performance Across Groups")
print("=" * 60)

min_acceptable_score = 0.55

# Per-group accuracy, computed once here and reused for the ratio below
group_scores = {}

for group in ['tech_hub', 'non_tech_hub']:
    group_data = validation_data[validation_data['location'] == group]
    X_group = group_data[['feature1', 'feature2']]
    y_group = group_data['outcome']
    
    group_score = model.score(X_group, y_group)
    group_scores[group] = group_score
    print(f"{group}: Accuracy = {group_score:.3f}", end="")
    
    if group_score < min_acceptable_score:
        print(f" ❌ BELOW THRESHOLD ({min_acceptable_score})")
        print(f"   [ERROR] Model underperforms on {group}")
    else:
        print(f" ✅")

# Calculate disparate impact
# NOTE(review): this is a ratio of per-group accuracies; the 80% rule is usually
# stated over selection rates - confirm this is the intended metric.
tech_score = group_scores['tech_hub']
non_tech_score = group_scores['non_tech_hub']

disparate_impact = non_tech_score / tech_score if tech_score > 0 else 0
print(f"\nDisparate Impact Ratio: {disparate_impact:.3f}")
print(f"  (Ratio of non_tech_hub / tech_hub performance)")
print(f"  Ideal: 1.0 (equal performance), Concerning: <0.8 (80% rule)")

if disparate_impact < 0.8:
    print(f"  ⚠️ WARNING: Model shows potential bias against non_tech_hub users")
else:
    print(f"  ✅ Model performance is reasonably balanced")

## Section 5.1: Fairness Constraints During Training (Strategy 3)

**IMPORTANT**: This strategy imposes policy constraints on the model, which may reduce performance.
Use only when you have a policy requirement for equal treatment across groups.

In [None]:
# Runs only when the optional fairlearn import succeeded; otherwise prints install instructions.
if FAIRLEARN_AVAILABLE:
    print("Fairness Constraints: Enforcing Demographic Parity")
    print("=" * 60)
    print("\nScenario: Job recommendation data shows different acceptance rates by location")
    print("Question: Is this difference due to:")
    print("  A) Tech hub users genuinely prefer different jobs (legitimate pattern)")
    print("  B) Your experiment had biased job offerings (data collection problem)")
    print("\nFairness constraints assume B and enforce equal treatment.\n")
    
    # Prepare data
    # Reuses validation_data from the bias-auditing cell. Both models below are fitted
    # and evaluated on these same rows (no held-out split in this cell).
    X_train = validation_data[['feature1', 'feature2']]
    y_train = validation_data['outcome']
    sensitive_features = validation_data['location']
    
    # Train unconstrained model
    unconstrained_model = RandomForestClassifier(n_estimators=50, random_state=42, max_depth=5)
    unconstrained_model.fit(X_train, y_train)
    unconstrained_accuracy = unconstrained_model.score(X_train, y_train)
    
    # Calculate unconstrained selection rates by group
    # (mean of the 0/1 predictions within each location group)
    predictions_unconstrained = unconstrained_model.predict(X_train)
    tech_rate_unconstrained = predictions_unconstrained[sensitive_features == 'tech_hub'].mean()
    non_tech_rate_unconstrained = predictions_unconstrained[sensitive_features == 'non_tech_hub'].mean()
    
    print(f"Unconstrained Model:")
    print(f"  Overall Accuracy: {unconstrained_accuracy:.3f}")
    print(f"  Tech hub selection rate: {tech_rate_unconstrained:.3f}")
    print(f"  Non-tech hub selection rate: {non_tech_rate_unconstrained:.3f}")
    print(f"  Difference: {abs(tech_rate_unconstrained - non_tech_rate_unconstrained):.3f}\n")
    
    # Train with fairness constraints
    # The base estimator uses the same hyperparameters as the unconstrained model,
    # so the comparison isolates the effect of the constraint.
    constraint = DemographicParity()  # Policy: Equal recommendation rates across groups
    mitigator = ExponentiatedGradient(
        estimator=RandomForestClassifier(n_estimators=50, random_state=42, max_depth=5),
        constraints=constraint
    )
    
    # This enforces: P(recommend job | tech_hub) ≈ P(recommend job | non_tech_hub)
    # Regardless of what the data shows about actual acceptance rates
    mitigator.fit(
        X_train, 
        y_train, 
        sensitive_features=sensitive_features
    )
    
    # Evaluate constrained model
    # Accuracy is computed by hand as the share of predictions matching y_train.
    predictions_constrained = mitigator.predict(X_train)
    constrained_accuracy = (predictions_constrained == y_train).mean()
    tech_rate_constrained = predictions_constrained[sensitive_features == 'tech_hub'].mean()
    non_tech_rate_constrained = predictions_constrained[sensitive_features == 'non_tech_hub'].mean()
    
    print(f"Constrained Model (Demographic Parity):")
    print(f"  Overall Accuracy: {constrained_accuracy:.3f}")
    print(f"  Tech hub selection rate: {tech_rate_constrained:.3f}")
    print(f"  Non-tech hub selection rate: {non_tech_rate_constrained:.3f}")
    print(f"  Difference: {abs(tech_rate_constrained - non_tech_rate_constrained):.3f}\n")
    
    # Show trade-off
    # fairness_gain > 0 means the gap between group selection rates shrank under the constraint.
    accuracy_loss = unconstrained_accuracy - constrained_accuracy
    fairness_gain = abs(tech_rate_unconstrained - non_tech_rate_unconstrained) - abs(tech_rate_constrained - non_tech_rate_constrained)
    
    print(f"Trade-off:")
    print(f"  Accuracy loss: {accuracy_loss:.3f} ({accuracy_loss/unconstrained_accuracy*100:.1f}%)")
    print(f"  Fairness gain (reduced disparity): {fairness_gain:.3f}\n")
    
    print("Critical Questions Before Using This:")
    print("1. Is the data difference due to bias or reality?")
    print("2. Is this social engineering? (Yes, explicitly)")
    print("3. What's the cost? (Lower model performance)")
    print("\nRecommendation: Use Strategy 1 (Stratified Sampling) to fix bias at source.")
    print("Only use Strategy 3 when you have a policy requirement for equal treatment.")
else:
    print("Fairlearn not available. Skipping fairness constraints example.")
    print("To run this example: pip install fairlearn")

## Visualization: Timeline of Safe Retraining

In [None]:
# Visualize the temporal separation pattern
# Draw a single horizontal timeline with one labelled marker per deployment event.
fig, ax = plt.subplots(figsize=(14, 6))

# Timeline data
# NOTE(review): `weeks` is not referenced again in this cell.
weeks = np.arange(1, 11)
# Each event is (week, label, marker/box colour)
events = [
    (1, "Deploy V0\nRun Exp 1 (V0)", 'green'),
    (3, "Deploy V1\nRun Exp 2 (V1)", 'blue'),
    (5, "Deploy V2\nRun Exp 3 (V2)", 'blue'),
    (7, "Train V3 on Exp 1 data\nDeploy V3, Run Exp 4", 'green'),
    (9, "Train V4 on Exp 2 data\nDeploy V4", 'green'),
]

# Plot timeline
ax.axhline(y=0, color='black', linewidth=2)

# Marker on the axis line, label in a tinted box above it
for week, event, color in events:
    ax.plot(week, 0, 'o', markersize=15, color=color)
    ax.text(week, 0.3, event, ha='center', fontsize=9, 
            bbox=dict(boxstyle='round', facecolor=color, alpha=0.3))

# Add annotations
# Double-headed arrow below the axis spanning week 1 to week 7
# (empty annotation text: only the arrow is drawn)
ax.annotate('', xy=(7, -0.5), xytext=(1, -0.5),
            arrowprops=dict(arrowstyle='<->', color='red', lw=2))
ax.text(4, -0.7, 'V3 trains on V0 data\n(2 generations back)', 
        ha='center', fontsize=10, color='red', weight='bold')

ax.set_xlim(0, 11)
ax.set_ylim(-1, 1)
ax.set_xlabel('Week', fontsize=12)
ax.set_title('Temporal Separation Pattern (Version Skipping)\nPrevents Feedback Loops', 
             fontsize=14, weight='bold')
# The y axis carries no meaning here, so its ticks are hidden
ax.set_yticks([])
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

print("Key insight: Model V3 trains on data from V0 (not V2), breaking the contamination chain.")