# Chapter 13: Deployment and CI/CD for AutoML Models

This notebook accompanies Chapter 13 of the AutoML book, demonstrating practical deployment patterns for AutoML models. We'll cover:

1. Model preparation and packaging
2. FastAPI serving with health checks
3. Shadow deployment implementation
4. MLflow model registry integration
5. Prometheus metrics collection
6. Drift detection with Evidently
7. Input sanitization and security
8. Continuous learning pipeline

**Note**: Many of these examples are designed to run in a containerized environment. The notebook demonstrates the code patterns that you would deploy in production.

## 1. Model Preparation for Deployment

Before deploying, we need to save the model with comprehensive metadata.

In [None]:
import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any

class ModelPackager:
    """
    Packages trained AutoGluon models for deployment.

    Creates:
    - Model metadata (features, classes, metrics)
    - Requirements file for dependencies
    - Validation checks for deployment readiness

    Attributes:
        model_path: Directory that holds the saved model artifacts.
        metadata: Metadata produced by the last ``create_metadata`` call
            (empty until then).
    """

    # Single source of truth for the AutoGluon version: it is recorded in the
    # metadata and pinned in requirements.txt, so the two cannot drift apart.
    AUTOGLUON_VERSION = '1.5.0'

    def __init__(self, model_path: str):
        self.model_path = Path(model_path)
        self.metadata = {}

    def create_metadata(self, predictor, test_accuracy: float) -> Dict:
        """
        Build deployment metadata and write it to ``metadata.json``.

        Args:
            predictor: Trained predictor exposing ``label``, ``eval_metric``,
                ``feature_metadata_in.get_features()`` and, optionally,
                ``class_labels``.
            test_accuracy: Accuracy measured on the held-out test set.

        Returns:
            The metadata dictionary (also stored on ``self.metadata``).
        """
        # Query the feature list once; it feeds both 'features' and
        # 'feature_count'.
        features = list(predictor.feature_metadata_in.get_features())

        # class_labels may be missing or None; both cases map to an empty list
        # instead of raising on list(None).
        class_labels = getattr(predictor, 'class_labels', None)
        eval_metric = predictor.eval_metric

        self.metadata = {
            'model_type': 'tabular_classification',
            'target': predictor.label,
            'classes': list(class_labels) if class_labels is not None else [],
            'features': features,
            'eval_metric': eval_metric.name if hasattr(eval_metric, 'name') else str(eval_metric),
            'test_accuracy': test_accuracy,
            'training_date': datetime.now().isoformat(),
            'autogluon_version': self.AUTOGLUON_VERSION,
            'model_path': str(self.model_path),
            'feature_count': len(features)
        }

        # Make sure the target directory exists before writing into it.
        self.model_path.mkdir(parents=True, exist_ok=True)
        metadata_path = self.model_path / 'metadata.json'
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(self.metadata, f, indent=2)

        print(f"Metadata saved to {metadata_path}")
        return self.metadata

    def create_requirements(self) -> str:
        """
        Write a pinned ``requirements.txt`` next to the model directory.

        Returns:
            The requirements text that was written.
        """
        requirements = "\n".join([
            f"autogluon.tabular=={self.AUTOGLUON_VERSION}",
            "fastapi==0.104.1",
            "uvicorn[standard]==0.24.0",
            "pydantic==2.5.0",
            "pandas==2.1.3",
            "numpy==1.26.2",
            "python-multipart==0.0.6",
            "prometheus-client==0.19.0",
            "evidently==0.4.0",
        ])

        # The file lives in the parent of the model directory.
        req_path = self.model_path.parent / 'requirements.txt'
        req_path.parent.mkdir(parents=True, exist_ok=True)
        with open(req_path, 'w', encoding='utf-8') as f:
            f.write(requirements)

        print(f"Requirements saved to {req_path}")
        return requirements

    def validate_deployment_readiness(self) -> Dict[str, bool]:
        """
        Check whether the expected artifacts exist on disk.

        Returns:
            Mapping of check name to pass/fail. The overall verdict is printed
            but not returned.
        """
        checks = {
            'model_exists': self.model_path.exists(),
            'metadata_exists': (self.model_path / 'metadata.json').exists(),
            # Either pickle file counts as a saved predictor.
            'predictor_exists': (self.model_path / 'predictor.pkl').exists() or
                               (self.model_path / 'learner.pkl').exists(),
        }

        all_passed = all(checks.values())
        print(f"Deployment readiness: {'PASSED' if all_passed else 'FAILED'}")
        for check, passed in checks.items():
            print(f"  {check}: {'✓' if passed else '✗'}")

        return checks

# No packaging is performed in this cell: create_metadata() needs a trained
# predictor, so the cell only confirms that the class has been defined.
print("ModelPackager class defined for deployment preparation")

## 2. FastAPI Serving with Comprehensive Health Checks

Health checks should verify not just that the model is loaded, but that it can actually make predictions.

In [None]:
from typing import Dict, List, Optional
import pandas as pd
import time
from datetime import datetime
from dataclasses import dataclass

@dataclass
class HealthStatus:
    """Result of a model-server health check.

    Built by ``ModelServer.health_check``, which starts from an "unhealthy"
    status and upgrades the fields as each check passes.
    """
    # Overall verdict: "healthy" or "unhealthy"
    status: str
    # True once a predictor object is present on the server
    model_loaded: bool
    # True once the test prediction succeeded and returned a valid class
    can_predict: bool
    # Metadata of the loaded model (the server's ``metadata`` dict)
    model_info: Optional[Dict] = None
    # Human-readable reason when a check failed
    error: Optional[str] = None
    # Duration of the test prediction in milliseconds
    latency_ms: Optional[float] = None

class ModelServer:
    """
    Production model server with comprehensive health checks.

    The health check does more than confirm that a predictor object exists:
    it runs a test prediction and validates the result, so a model that loads
    but cannot predict is reported as unhealthy.

    Attributes:
        model_path: Location the predictor is loaded from.
        predictor: Loaded predictor, or None before ``load_model``.
        metadata: Model metadata; an optional 'classes' entry lists the labels
            a prediction may take.
    """

    # Standard test input for health checks
    TEST_INPUT = {
        "age": 35,
        "workclass": "Private",
        "education": "Bachelors",
        "education_num": 13,
        "marital_status": "Never-married",
        "occupation": "Tech-support",
        "relationship": "Not-in-family",
        "race": "White",
        "sex": "Male",
        "capital_gain": 0,
        "capital_loss": 0,
        "hours_per_week": 40,
        "native_country": "United-States"
    }

    # Labels accepted when the metadata does not provide a 'classes' list.
    DEFAULT_CLASSES = ('<=50K', '>50K')

    def __init__(self, model_path: str):
        self.model_path = model_path
        self.predictor = None
        self.metadata = {}

    def load_model(self):
        """Load the AutoGluon predictor (simulated in this notebook)."""
        # In production, this would be:
        # from autogluon.tabular import TabularPredictor
        # self.predictor = TabularPredictor.load(self.model_path)
        print(f"Model would be loaded from {self.model_path}")
        self.predictor = "mock_predictor"  # Simulated for notebook

    def health_check(self) -> HealthStatus:
        """
        Comprehensive health check.

        Verifies:
        1. Model is loaded
        2. Model can make predictions (catches corrupted weights)
        3. Prediction is valid

        Returns:
            A HealthStatus whose ``status`` is "healthy" only when all three
            checks pass; otherwise ``error`` names the first failing check.
        """
        status = HealthStatus(
            status="unhealthy",
            model_loaded=False,
            can_predict=False
        )

        # Check 1: Model loaded
        if self.predictor is None:
            status.error = "Model not loaded"
            return status

        status.model_loaded = True
        status.model_info = self.metadata

        # Check 2: Make test prediction. Only the prediction is guarded, so an
        # error in the validation step below is not misreported as a
        # prediction failure.
        try:
            # perf_counter is monotonic, so a wall-clock adjustment cannot
            # produce a negative or inflated latency.
            start = time.perf_counter()
            # In production: self.predictor.predict(pd.DataFrame([self.TEST_INPUT]))
            prediction = ">50K"  # Simulated
            status.latency_ms = (time.perf_counter() - start) * 1000
        except Exception as e:
            status.error = f"Prediction failed: {e}"
            return status

        # Check 3: Validate prediction against the model's own class list when
        # the metadata provides one; otherwise fall back to the defaults.
        valid_classes = self.metadata.get('classes') or self.DEFAULT_CLASSES
        if prediction not in valid_classes:
            status.error = f"Invalid prediction: {prediction}"
            return status

        status.can_predict = True
        status.status = "healthy"
        return status

    def readiness_check(self) -> bool:
        """
        Kubernetes readiness probe.

        Separate from liveness - indicates ready for traffic.

        Returns:
            True when ``health_check`` reports "healthy".
        """
        return self.health_check().status == "healthy"

# Build a server, load the (simulated) model and report its health
server = ModelServer("/app/models/adult_income_model")
server.load_model()
health = server.health_check()
print(
    f"\nHealth Status: {health.status}",
    f"Model Loaded: {health.model_loaded}",
    f"Can Predict: {health.can_predict}",
    sep="\n",
)

## 3. Shadow Deployment Implementation

Shadow deployments run candidate models against production traffic without affecting users.

In [None]:
from collections import deque
from dataclasses import dataclass, field
from typing import Any, Callable
import random
import uuid

@dataclass
class ShadowComparison:
    """Record of production vs shadow prediction comparison."""
    # Identifier of the request the two predictions belong to
    request_id: str
    # When the comparison was recorded
    timestamp: datetime
    # Prediction returned to the caller
    production_prediction: Any
    # Prediction of the candidate model (logged only)
    shadow_prediction: Any
    # Time spent in the production model, in milliseconds
    production_latency_ms: float
    # Time spent in the shadow model, in milliseconds
    shadow_latency_ms: float
    # True when both models returned equal predictions
    predictions_match: bool

class ShadowDeploymentManager:
    """
    Manages shadow deployment for safe model validation.

    The production model's answer is always what the caller receives. The
    shadow (candidate) model is run on a sampled share of requests and its
    result is only recorded for comparison. A failing shadow model is counted
    and logged but never affects the production response.

    Attributes:
        sample_rate: Share of requests (0.0-1.0) also sent to the shadow model.
        comparisons: Bounded history of the most recent comparisons.
    """

    def __init__(self, sample_rate: float = 1.0):
        self.sample_rate = sample_rate
        self.comparisons: deque = deque(maxlen=10000)
        self.production_model = None
        self.shadow_model = None
        self._comparison_count = 0
        self._discrepancy_count = 0
        self._shadow_error_count = 0

    def predict_with_shadow(
        self,
        features: Dict,
        production_predict: Callable,
        shadow_predict: Callable
    ) -> Dict:
        """
        Make the production prediction and, for sampled requests, compare it
        with the shadow model.

        In this notebook the shadow call runs inline after the production
        call; a real service would hand it to a background task so it adds no
        latency to the response.

        Args:
            features: Input features passed unchanged to both models.
            production_predict: Callable producing the user-facing prediction.
            shadow_predict: Callable producing the candidate prediction.

        Returns:
            Dict with the production ``prediction``, the generated
            ``request_id`` and the production ``latency_ms``.
        """
        request_id = str(uuid.uuid4())

        # Production prediction (what user sees). perf_counter is monotonic,
        # so latencies are immune to wall-clock adjustments.
        start = time.perf_counter()
        production_result = production_predict(features)
        production_latency = (time.perf_counter() - start) * 1000

        # Shadow prediction (sampled)
        if random.random() < self.sample_rate:
            self._record_shadow(
                request_id,
                features,
                production_result,
                production_latency,
                shadow_predict,
            )

        return {
            "prediction": production_result,
            "request_id": request_id,
            "latency_ms": production_latency
        }

    def _record_shadow(
        self,
        request_id: str,
        features: Dict,
        production_result: Any,
        production_latency: float,
        shadow_predict: Callable
    ) -> None:
        """Run the shadow model and log the comparison; never raises for shadow failures."""
        start = time.perf_counter()
        try:
            shadow_result = shadow_predict(features)
        except Exception:
            # The candidate model must not be able to break production
            # traffic: count and log the failure, then carry on.
            import logging
            self._shadow_error_count += 1
            logging.getLogger(__name__).exception(
                "Shadow prediction failed for request %s", request_id
            )
            return
        shadow_latency = (time.perf_counter() - start) * 1000

        comparison = ShadowComparison(
            request_id=request_id,
            timestamp=datetime.now(),
            production_prediction=production_result,
            shadow_prediction=shadow_result,
            production_latency_ms=production_latency,
            shadow_latency_ms=shadow_latency,
            predictions_match=(production_result == shadow_result)
        )
        self.comparisons.append(comparison)
        self._comparison_count += 1

        if not comparison.predictions_match:
            self._discrepancy_count += 1

    def get_metrics(self) -> Dict:
        """
        Get shadow deployment metrics.

        Returns:
            Dict with the comparison and discrepancy counts, the number of
            shadow failures, the agreement rate formatted as a percentage
            ("0.00%" when nothing has been compared yet) and the sample rate.
        """
        if self._comparison_count > 0:
            agreement_rate = (
                (self._comparison_count - self._discrepancy_count) /
                self._comparison_count
            )
        else:
            agreement_rate = 0

        return {
            "total_comparisons": self._comparison_count,
            "discrepancies": self._discrepancy_count,
            "agreement_rate": f"{agreement_rate:.2%}",
            "sample_rate": self.sample_rate,
            "shadow_errors": self._shadow_error_count
        }

# Shadow every request so the small demo set yields comparisons
shadow_mgr = ShadowDeploymentManager(sample_rate=1.0)


def prod_model(features):
    """Simulated production model: '>50K' only for education_num above 12."""
    if features.get('education_num', 0) > 12:
        return ">50K"
    return "<=50K"


def shadow_model(features):
    """Simulated candidate model: same rule with the cut-off lowered to 11."""
    if features.get('education_num', 0) > 11:
        return ">50K"
    return "<=50K"


# Inputs chosen to cover agreement on both classes plus one disagreement
test_cases = [
    {"education_num": 16, "age": 45},  # Both predict >50K
    {"education_num": 9, "age": 25},   # Both predict <=50K
    {"education_num": 12, "age": 35},  # Discrepancy: boundary case
]

for features in test_cases:
    result = shadow_mgr.predict_with_shadow(
        features,
        production_predict=prod_model,
        shadow_predict=shadow_model,
    )
    print(f"Features: {features} -> {result['prediction']}")

print(f"\nShadow Metrics: {shadow_mgr.get_metrics()}")

## 4. MLflow Model Registry Integration

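Load models from the MLflow Model Registry so that a new model version can be picked up without redeploying the service. The registry calls are simulated in this notebook; the real MLflow calls are shown in the docstrings and comments.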

In [None]:
from typing import Optional

class MLflowModelManager:
    """
    Loads models from the MLflow Model Registry by stage.

    The registry calls are simulated here; docstrings and comments show the
    real MLflow calls that would replace the placeholders.
    """

    def __init__(self, tracking_uri: str, model_name: str):
        self.tracking_uri = tracking_uri
        self.model_name = model_name
        # Nothing is loaded until load_production_model() is called.
        self.model = None
        self.model_version = None
        self.model_stage = None

    def load_production_model(self):
        """
        Load the model currently in the Production stage.

        In production:
        ```python
        import mlflow
        mlflow.set_tracking_uri(self.tracking_uri)

        model_uri = f"models:/{self.model_name}/Production"
        self.model = mlflow.pyfunc.load_model(model_uri)
        ```
        """
        # Simulated result: version "3" in the Production stage
        self.model_stage, self.model_version = "Production", "3"
        print(f"Loaded {self.model_name} v{self.model_version} ({self.model_stage})")

    def load_staging_model(self) -> bool:
        """Load the Staging-stage model for shadow comparison; True on success."""
        try:
            # model_uri = f"models:/{self.model_name}/Staging"
            # self.shadow_model = mlflow.pyfunc.load_model(model_uri)
            print("Loaded staging model for shadow comparison")
        except Exception:
            print("No staging model available")
            return False
        return True

    def check_for_new_version(self) -> bool:
        """
        Report whether the registry holds a newer Production version.

        Enables hot-reloading without container restart.
        """
        # In production: query MLflow client for latest version
        # client = MlflowClient()
        # versions = client.get_latest_versions(self.model_name, stages=["Production"])
        latest_version = 4  # Simulated
        current_version = int(self.model_version) if self.model_version else 0

        if latest_version <= current_version:
            return False

        print(f"New version available: v{latest_version} (current: v{current_version})")
        return True

    def get_model_info(self) -> Dict:
        """Describe the loaded model: name, version, stage and tracking URI."""
        return dict(
            model_name=self.model_name,
            version=self.model_version,
            stage=self.model_stage,
            tracking_uri=self.tracking_uri,
        )

# Point the manager at the registry, load Production, then poll for updates
mlflow_mgr = MLflowModelManager(
    "http://mlflow-server:5000",
    "adult-income-predictor",
)

mlflow_mgr.load_production_model()
print("Model Info:", mlflow_mgr.get_model_info())
mlflow_mgr.check_for_new_version()

## 5. Prometheus Metrics Collection

Collect ML-specific metrics for production monitoring.

In [None]:
from collections import defaultdict

class PrometheusMetricsCollector:
    """
    Collects Prometheus-compatible metrics for ML serving.

    This is a simplified in-memory stand-in for the ``prometheus_client``
    library: counters are plain dicts and histograms are plain lists.

    NOTE: the sample lists grow without bound; a long-running service should
    use ``prometheus_client`` or a bounded buffer instead.
    """

    def __init__(self, model_name: str, model_version: str):
        self.model_name = model_name
        self.model_version = model_version

        # Counters
        self.prediction_counts = defaultdict(int)
        self.error_counts = defaultdict(int)

        # Histograms (simplified as lists)
        self.latencies = []
        self.confidences = defaultdict(list)
        self.feature_values = defaultdict(list)

    @staticmethod
    def _escape_label(value) -> str:
        """Escape a label value for the Prometheus text exposition format."""
        return (
            str(value)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
        )

    @staticmethod
    def _quantile(samples: list, q: float) -> float:
        """
        Return the q-quantile of ``samples`` (0.0 when there are none).

        The index is clamped to the last element, so small sample sets yield
        their maximum rather than a placeholder value.
        """
        if not samples:
            return 0.0
        ordered = sorted(samples)
        return ordered[min(int(len(ordered) * q), len(ordered) - 1)]

    def record_prediction(
        self,
        prediction: str,
        latency_seconds: float,
        confidence: float,
        features: Dict
    ):
        """
        Record metrics for a prediction.

        Args:
            prediction: Predicted class label.
            latency_seconds: Time the prediction took, in seconds.
            confidence: Model confidence for the predicted class.
            features: Input features; only numeric values are recorded.
        """
        # Count predictions by class
        self.prediction_counts[prediction] += 1

        # Record latency
        self.latencies.append(latency_seconds)

        # Record confidence
        self.confidences[prediction].append(confidence)

        # Record feature distributions (numerical only). bool is a subclass
        # of int but is a flag, not a measurement, so it is skipped.
        for feature, value in features.items():
            if isinstance(value, (int, float)) and not isinstance(value, bool):
                self.feature_values[feature].append(value)

    def record_error(self, error_type: str):
        """Record a prediction error."""
        self.error_counts[error_type] += 1

    def get_prometheus_metrics(self) -> str:
        """
        Generate Prometheus exposition format.

        In production, use prometheus_client library.

        Returns:
            Prediction counters per class, plus a latency summary (p50/p99)
            once at least one latency has been recorded.
        """
        import statistics

        model = self._escape_label(self.model_name)
        version = self._escape_label(self.model_version)

        # Prediction counts
        lines = [
            "# HELP ml_predictions_total Total predictions",
            "# TYPE ml_predictions_total counter",
        ]
        for cls, count in self.prediction_counts.items():
            lines.append(
                f'ml_predictions_total{{model="{model}",'
                f'version="{version}",class="{self._escape_label(cls)}"}} {count}'
            )

        # Latency summary
        if self.latencies:
            p50 = statistics.median(self.latencies)
            p99 = self._quantile(self.latencies, 0.99)

            lines.append("# HELP ml_prediction_latency_seconds Prediction latency")
            lines.append("# TYPE ml_prediction_latency_seconds summary")
            lines.append(f'ml_prediction_latency_seconds{{quantile="0.5"}} {p50:.4f}')
            lines.append(f'ml_prediction_latency_seconds{{quantile="0.99"}} {p99:.4f}')

        return "\n".join(lines)

    def get_summary(self) -> Dict:
        """
        Get metrics summary.

        Returns:
            Dict with total and per-class prediction counts, average and p99
            latency in milliseconds (0 when nothing was recorded) and the
            error counts. The p99 uses the same rule as the Prometheus output.
        """
        import statistics

        return {
            "total_predictions": sum(self.prediction_counts.values()),
            "predictions_by_class": dict(self.prediction_counts),
            "avg_latency_ms": statistics.mean(self.latencies) * 1000 if self.latencies else 0,
            "p99_latency_ms": self._quantile(self.latencies, 0.99) * 1000,
            "error_counts": dict(self.error_counts)
        }

# Collector for the demo model
metrics = PrometheusMetricsCollector("adult-income", "1.2.0")

# Feed the collector 100 synthetic predictions
import random
for _ in range(100):
    prediction = random.choice([">50K", "<=50K"])
    latency = random.uniform(0.01, 0.05)
    confidence = random.uniform(0.6, 0.99)
    features = {"age": random.randint(20, 60), "education_num": random.randint(8, 16)}

    metrics.record_prediction(
        prediction=prediction,
        latency_seconds=latency,
        confidence=confidence,
        features=features,
    )

# Two errors of different types
metrics.record_error("validation_error")
metrics.record_error("timeout")

print("Metrics Summary:")
for key, value in metrics.get_summary().items():
    print(f"  {key}: {value}")

# Only the first 500 characters of the exposition text are shown
prometheus_excerpt = metrics.get_prometheus_metrics()[:500]
print("\nPrometheus Format (excerpt):")
print(prometheus_excerpt)

## 6. Drift Detection with Evidently

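Detect data drift that could degrade model performance. The example below implements a simplified, Evidently-inspired Population Stability Index (PSI) check with NumPy and pandas; it does not call Evidently itself.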

In [None]:
import numpy as np

class SimpleDriftDetector:
    """
    Simplified drift detection (Evidently-inspired).

    Compares each numeric feature of a current sample against a reference
    sample using the Population Stability Index (PSI).

    In production, use:
    - Evidently: evidently.ai
    - WhyLabs: whylabs.ai
    - Great Expectations: greatexpectations.io

    Attributes:
        reference_data: Reference (e.g. training-time) sample.
        feature_columns: Columns to monitor.
        reference_stats: Summary statistics per numeric monitored column;
            columns absent from this dict are skipped by ``check_drift``.
    """

    # PSI at or above this value counts as drift.
    PSI_DRIFT_THRESHOLD = 0.2
    # Stand-in for empty buckets so the logarithm stays defined.
    _EMPTY_BUCKET_PROP = 0.0001

    def __init__(self, reference_data: pd.DataFrame, feature_columns: List[str]):
        self.reference_data = reference_data
        self.feature_columns = feature_columns
        self.reference_stats = self._compute_stats(reference_data)

    def _compute_stats(self, df: pd.DataFrame) -> Dict:
        """Compute mean/std/min/max for every numeric monitored column."""
        stats = {}
        for col in self.feature_columns:
            series = df[col]
            # Accept every numeric dtype (int32, float32, nullable ints, ...),
            # not just int64/float64; booleans are flags and are left out.
            if not pd.api.types.is_numeric_dtype(series) or pd.api.types.is_bool_dtype(series):
                continue
            stats[col] = {
                'mean': series.mean(),
                'std': series.std(),
                'min': series.min(),
                'max': series.max()
            }
        return stats

    def compute_psi(self, expected: np.ndarray, actual: np.ndarray, buckets: int = 10) -> float:
        """
        Compute Population Stability Index (PSI).

        PSI < 0.1: No significant change
        0.1 <= PSI < 0.2: Moderate change
        PSI >= 0.2: Significant change (drift detected)

        Args:
            expected: Reference values.
            actual: Current values.
            buckets: Number of percentile buckets derived from ``expected``.

        Returns:
            The PSI as a plain Python float.

        Raises:
            ValueError: If either input is empty. An empty sample would
                otherwise produce NaN, which silently reads as "no drift".
        """
        expected = np.asarray(expected)
        actual = np.asarray(actual)
        if expected.size == 0 or actual.size == 0:
            raise ValueError("PSI requires non-empty expected and actual samples")

        # Create buckets based on expected distribution; the outer edges are
        # opened up so out-of-range current values still land in a bucket.
        breakpoints = np.percentile(expected, np.linspace(0, 100, buckets + 1))
        breakpoints[0] = -np.inf
        breakpoints[-1] = np.inf

        expected_counts = np.histogram(expected, breakpoints)[0]
        actual_counts = np.histogram(actual, breakpoints)[0]

        # Normalize to proportions
        expected_props = expected_counts / len(expected)
        actual_props = actual_counts / len(actual)

        # Avoid division by zero
        expected_props = np.where(expected_props == 0, self._EMPTY_BUCKET_PROP, expected_props)
        actual_props = np.where(actual_props == 0, self._EMPTY_BUCKET_PROP, actual_props)

        # PSI formula
        psi = np.sum((actual_props - expected_props) * np.log(actual_props / expected_props))
        return float(psi)

    def check_drift(self, current_data: pd.DataFrame) -> Dict:
        """
        Check for drift between reference and current data.

        Args:
            current_data: Current sample; must contain every numeric
                monitored column.

        Returns:
            Dict with ``drift_detected`` (bool), ``drifted_features`` (columns
            at or above the drift threshold) and ``feature_psi`` (PSI per
            checked column, rounded to 4 decimals).
        """
        results = {
            'drift_detected': False,
            'drifted_features': [],
            'feature_psi': {}
        }

        for col in self.feature_columns:
            # Non-numeric columns have no reference stats and are not checked.
            if col not in self.reference_stats:
                continue

            ref_values = self.reference_data[col].values
            cur_values = current_data[col].values

            psi = self.compute_psi(ref_values, cur_values)
            results['feature_psi'][col] = round(psi, 4)

            if psi >= self.PSI_DRIFT_THRESHOLD:
                results['drifted_features'].append(col)
                results['drift_detected'] = True

        return results

# Seed NumPy so the synthetic samples below are reproducible
np.random.seed(42)

# Reference sample (stands in for the training distribution)
reference = pd.DataFrame({
    'age': np.random.normal(38, 13, 1000).astype(int),
    'education_num': np.random.normal(10, 2.5, 1000).astype(int),
    'hours_per_week': np.random.normal(40, 12, 1000).astype(int)
})

# Current sample: 'age' is drawn from a shifted, wider distribution
current = pd.DataFrame({
    'age': np.random.normal(45, 15, 500).astype(int),  # Shifted!
    'education_num': np.random.normal(10, 2.5, 500).astype(int),
    'hours_per_week': np.random.normal(40, 12, 500).astype(int)
})

# Compare the current sample against the reference
detector = SimpleDriftDetector(
    reference_data=reference,
    feature_columns=['age', 'education_num', 'hours_per_week'],
)
drift_results = detector.check_drift(current)

print("Drift Detection Results:")
print(f"  Drift Detected: {drift_results['drift_detected']}")
print(f"  Drifted Features: {drift_results['drifted_features']}")
print("  PSI Scores:")
for feature, psi in drift_results['feature_psi'].items():
    # Map the score onto the three PSI bands
    if psi >= 0.2:
        status = "DRIFT"
    elif psi < 0.1:
        status = "OK"
    else:
        status = "WATCH"
    print(f"    {feature}: {psi:.4f} ({status})")

## 7. Input Sanitization and Security

Protect ML endpoints from DoS attacks and malformed inputs.

In [None]:
from dataclasses import dataclass
from typing import Tuple

@dataclass
class SanitizationConfig:
    """Configuration for input sanitization.

    Holds the limits only; enforcing them is left to the code that consumes
    this config.
    """
    # Largest accepted request body, in bytes
    max_request_size_bytes: int = 1_000_000  # 1MB
    # Largest number of records accepted in one batch
    max_batch_size: int = 100
    # Upper bound on the length of a string value
    max_string_length: int = 1000
    # Allowed requests per minute (presumably per client - TODO confirm)
    rate_limit_per_minute: int = 1000

class InputSanitizer:
    """
    Sanitizes and validates ML model inputs.

    Enforces batch-size limits, coerces and clamps numeric fields, truncates
    over-long strings, drops fields that are not part of the schema and flags
    values far outside the training distribution.
    """

    # Expected schema for Adult Income model
    SCHEMA = {
        'age': {'type': int, 'min': 17, 'max': 90},
        'education_num': {'type': int, 'min': 1, 'max': 16},
        'hours_per_week': {'type': int, 'min': 1, 'max': 99},
        'capital_gain': {'type': int, 'min': 0, 'max': 100000},
        'capital_loss': {'type': int, 'min': 0, 'max': 5000},
        'workclass': {'type': str, 'max_length': 50},
        'education': {'type': str, 'max_length': 50},
        'marital_status': {'type': str, 'max_length': 50},
        'occupation': {'type': str, 'max_length': 50},
        'relationship': {'type': str, 'max_length': 50},
        'race': {'type': str, 'max_length': 50},
        'sex': {'type': str, 'max_length': 10},
        'native_country': {'type': str, 'max_length': 50}
    }

    def __init__(self, config: Optional["SanitizationConfig"] = None):
        # Explicit None check: a caller-supplied config is always honoured.
        self.config = config if config is not None else SanitizationConfig()

    def validate_batch_size(self, batch: List[Dict]) -> Tuple[bool, str]:
        """
        Check if batch size is within limits.

        Returns:
            ``(True, "")`` when acceptable, otherwise ``(False, reason)``.
        """
        if len(batch) > self.config.max_batch_size:
            return False, f"Batch size {len(batch)} exceeds max {self.config.max_batch_size}"
        return True, ""

    def sanitize_features(self, features: Dict) -> Tuple[Dict, List[str]]:
        """
        Sanitize input features.

        Integer fields are coerced with ``int()`` and clamped to the schema
        range; string fields are coerced with ``str()`` and truncated. Fields
        that are missing or cannot be coerced are left out of the result, and
        fields outside the schema are dropped.

        Returns:
            - Sanitized features dict
            - List of warnings/modifications made
        """
        sanitized = {}
        warnings = []

        for name, spec in self.SCHEMA.items():
            if name not in features:
                warnings.append(f"Missing field: {name}")
                continue

            value = features[name]

            # Type validation and coercion
            if spec['type'] is int:
                try:
                    value = int(value)
                except (ValueError, TypeError, OverflowError):
                    # OverflowError: int(float('inf')) - must not crash the
                    # request handler.
                    warnings.append(f"Invalid int for {name}: {value}")
                    continue

                # Range clamping
                if 'min' in spec and value < spec['min']:
                    warnings.append(f"{name} clamped from {value} to {spec['min']}")
                    value = spec['min']
                if 'max' in spec and value > spec['max']:
                    warnings.append(f"{name} clamped from {value} to {spec['max']}")
                    value = spec['max']

            elif spec['type'] is str:
                value = str(value)
                # Fields without their own limit fall back to the configured
                # global maximum.
                max_len = spec.get('max_length', self.config.max_string_length)
                if len(value) > max_len:
                    warnings.append(f"{name} truncated from {len(value)} to {max_len} chars")
                    value = value[:max_len]

            sanitized[name] = value

        # Remove unexpected fields (security). They never reach `sanitized`;
        # the names are sorted so the warning text is deterministic.
        unexpected = sorted(str(key) for key in set(features) - set(self.SCHEMA))
        if unexpected:
            warnings.append(f"Removed unexpected fields: {unexpected}")

        return sanitized, warnings

    def detect_adversarial(
        self,
        features: Dict,
        training_stats: Dict,
        threshold_std: float = 5.0
    ) -> Tuple[bool, List[str]]:
        """
        Basic detection of potentially adversarial inputs.

        Checks if values are far outside training distribution.

        Args:
            features: Input features to inspect.
            training_stats: Per-field dict with 'mean' and 'std' entries.
            threshold_std: Z-score above which a value is flagged.

        Returns:
            ``(is_suspicious, descriptions)``, one description per flagged
            field.
        """
        suspicious = []

        for name, stats in training_stats.items():
            if name not in features:
                continue

            value = features[name]
            if not isinstance(value, (int, float)):
                continue

            # A zero std leaves the z-score undefined, so the field is skipped.
            if stats['std'] > 0:
                z_score = abs(value - stats['mean']) / stats['std']
                if z_score > threshold_std:
                    suspicious.append(
                        f"{name}: z-score={z_score:.2f} (value={value})"
                    )

        return len(suspicious) > 0, suspicious

# Sanitizer with the default limits
sanitizer = InputSanitizer()

# Three inputs: well-formed, out-of-range, and hostile
test_inputs = [
    # Normal input
    {
        "age": 35, "education_num": 13, "hours_per_week": 40,
        "workclass": "Private", "education": "Bachelors",
    },

    # Out of range values
    {
        "age": 150, "education_num": -5, "hours_per_week": 200,
        "workclass": "Private",
    },

    # Injection attempt
    {
        "age": 35, "workclass": "Private; DROP TABLE users;--",
        "malicious_field": "<script>alert('xss')</script>",
    },
]

for i, input_data in enumerate(test_inputs):
    sanitized, warnings = sanitizer.sanitize_features(input_data)
    print(f"\nTest {i + 1}:")
    print(f"  Input: {input_data}")
    print(f"  Sanitized: {sanitized}")
    # The warnings line is shown only when something was changed or missing
    if warnings:
        print(f"  Warnings: {warnings}")

## 8. Continuous Learning Pipeline

Collect feedback and determine when retraining is needed.

In [None]:
from collections import deque
from datetime import timedelta

@dataclass
class LabeledPrediction:
    """A prediction with ground truth feedback."""
    # Identifier used to match later feedback to this prediction
    prediction_id: str
    # Input features the prediction was made from
    features: Dict
    # Label the model predicted
    prediction: str
    # True label; None until feedback has been received
    ground_truth: Optional[str]
    # When the record was created
    timestamp: datetime

class ContinuousLearningPipeline:
    """
    Implements continuous learning from production feedback.

    Predictions are parked until their ground truth arrives, then moved to a
    bounded store of labeled samples. That store drives accuracy monitoring
    and the retraining decision.

    NOTE(review): predictions that never receive feedback stay in
    ``pending_feedback`` indefinitely, and ``last_retrain`` is initialised but
    never read here - confirm whether callers are expected to manage both.
    """

    def __init__(
        self,
        accuracy_threshold: float = 0.80,
        drift_threshold: float = 0.2,
        min_samples_for_retrain: int = 1000
    ):
        self.accuracy_threshold = accuracy_threshold
        self.drift_threshold = drift_threshold
        self.min_samples = min_samples_for_retrain

        self.pending_feedback: Dict[str, LabeledPrediction] = {}
        self.labeled_data: deque = deque(maxlen=100000)
        self.last_retrain = None

    def record_prediction(self, prediction_id: str, features: Dict, prediction: str):
        """Record a prediction awaiting feedback."""
        self.pending_feedback[prediction_id] = LabeledPrediction(
            prediction_id=prediction_id,
            features=features,
            prediction=prediction,
            ground_truth=None,
            timestamp=datetime.now()
        )

    def receive_feedback(self, prediction_id: str, ground_truth: str) -> bool:
        """
        Receive ground truth for a prediction.

        Returns:
            True when the prediction was pending and is now labeled, False
            when the id is unknown (or was already labeled).
        """
        if prediction_id not in self.pending_feedback:
            return False

        labeled = self.pending_feedback.pop(prediction_id)
        labeled.ground_truth = ground_truth
        self.labeled_data.append(labeled)
        return True

    def _recent_labeled(self, window_hours: int) -> list:
        """Return labeled samples recorded within the last ``window_hours``."""
        cutoff = datetime.now() - timedelta(hours=window_hours)
        # Compare against None explicitly so a falsy but valid label (e.g. an
        # empty string) is still counted.
        return [lp for lp in self.labeled_data
                if lp.timestamp > cutoff and lp.ground_truth is not None]

    @staticmethod
    def _accuracy_of(samples: list) -> float:
        """Return the share of correct predictions (0.0 for no samples)."""
        if not samples:
            return 0.0
        correct = sum(1 for lp in samples if lp.prediction == lp.ground_truth)
        return correct / len(samples)

    def get_accuracy(self, window_hours: int = 24) -> float:
        """
        Calculate accuracy over recent labeled predictions.

        Returns:
            Accuracy in [0, 1], or 0.0 when the window holds no labeled
            samples.
        """
        return self._accuracy_of(self._recent_labeled(window_hours))

    def check_retrain_needed(self, drift_score: float) -> Dict:
        """
        Determine if model retraining should be triggered.

        Checks multiple signals:
        - Accuracy below threshold
        - Data drift above threshold
        - Sufficient new samples collected

        Args:
            drift_score: Current drift score (e.g. PSI) for the input data.

        Returns:
            Dict with ``should_retrain``, the list of ``reasons``, the
            ``current_accuracy``, the ``drift_score`` and ``new_samples``.
        """
        recent = self._recent_labeled(24)
        current_accuracy = self._accuracy_of(recent)
        new_samples = len(self.labeled_data)

        reasons = []

        # Require recent evidence: with no recent samples the accuracy is a
        # 0.0 placeholder, not a measurement, and must not raise an alarm.
        if recent and current_accuracy < self.accuracy_threshold and new_samples > 100:
            reasons.append(
                f"Accuracy {current_accuracy:.2%} below threshold {self.accuracy_threshold:.2%}"
            )

        if drift_score > self.drift_threshold:
            reasons.append(
                f"Drift score {drift_score:.3f} above threshold {self.drift_threshold}"
            )

        if new_samples >= self.min_samples:
            reasons.append(
                f"{new_samples} new samples available (threshold: {self.min_samples})"
            )

        return {
            'should_retrain': len(reasons) > 0,
            'reasons': reasons,
            'current_accuracy': current_accuracy,
            'drift_score': drift_score,
            'new_samples': new_samples
        }

    def select_for_active_learning(self, predictions: List[Tuple[str, float]], n: int = 100) -> List[str]:
        """
        Select samples for human labeling using uncertainty sampling.

        Prioritizes predictions where model is least confident.

        Args:
            predictions: ``(prediction_id, confidence)`` pairs.
            n: Maximum number of ids to return.

        Returns:
            Up to ``n`` prediction ids, least confident first.
        """
        # Sort by confidence (ascending - least confident first)
        sorted_preds = sorted(predictions, key=lambda x: x[1])
        return [pred_id for pred_id, _ in sorted_preds[:n]]

# Pipeline with a stricter accuracy bar and a small retrain batch for the demo
pipeline = ContinuousLearningPipeline(
    accuracy_threshold=0.85,
    min_samples_for_retrain=50  # Lower for demo
)

# Generate 100 predictions, each followed immediately by its feedback
import random
for i in range(100):
    pred_id = f"pred_{i}"
    features = {"age": random.randint(20, 60)}
    prediction = random.choice([">50K", "<=50K"])

    pipeline.record_prediction(pred_id, features, prediction)

    # Simulate delayed feedback (80% correct)
    if random.random() < 0.8:
        ground_truth = prediction  # Correct
    elif prediction == "<=50K":
        ground_truth = ">50K"  # Wrong
    else:
        ground_truth = "<=50K"  # Wrong

    pipeline.receive_feedback(pred_id, ground_truth)

# Ask the pipeline for a retraining verdict at a moderate drift score
retrain_check = pipeline.check_retrain_needed(drift_score=0.15)

print("Continuous Learning Pipeline Status:")
print(f"  Current Accuracy: {pipeline.get_accuracy():.2%}")
print(f"  Labeled Samples: {len(pipeline.labeled_data)}")
print(f"  Should Retrain: {retrain_check['should_retrain']}")
if retrain_check['reasons']:
    print("  Reasons:")
    for reason in retrain_check['reasons']:
        print(f"    - {reason}")

## Summary

This notebook demonstrated key deployment patterns for AutoML models:

1. **Model Packaging**: Creating metadata and requirements for deployment
2. **Health Checks**: Verifying model can actually predict, not just load
3. **Shadow Deployment**: Safe validation of new models without user impact
4. **MLflow Integration**: Dynamic model loading from registry
5. **Prometheus Metrics**: ML-specific monitoring for production
6. **Drift Detection**: Using PSI to detect distribution changes
7. **Input Sanitization**: Protecting endpoints from attacks
8. **Continuous Learning**: Feedback collection and retraining triggers

### Production Tools Mentioned

- **Serving**: FastAPI, TensorFlow Serving, TorchServe, Triton, BentoML
- **Orchestration**: Kubernetes, Istio, Helm
- **Monitoring**: Prometheus, Grafana, Datadog
- **Drift Detection**: Evidently, WhyLabs, Great Expectations
- **Model Registry**: MLflow, Weights & Biases

These patterns scale from simple deployments to enterprise ML platforms.