In [None]:
# Refresh the apt package index, then install the zstd package non-interactively (-y).
# NOTE(review): presumably zstd is needed by the Ollama installer in the next cell — confirm.
!sudo apt-get update
!sudo apt-get install -y zstd

In [None]:
# Download the Ollama install script from ollama.com and pipe it straight into sh.
# curl flags: -f fail on HTTP errors, -sS quiet but still report errors, -L follow redirects.
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
# Mount Google Drive at /content/drive; the paths used by the cells below live under it.
# force_remount=True remounts even when a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# import os

# BASE_DIR = "/content/drive/MyDrive/anki_mindmap_LLM"

# OLLAMA_DIR = f"{BASE_DIR}/ollama"
# MODELS_DIR = f"{OLLAMA_DIR}/models"

# # os.makedirs(MODELS_DIR, exist_ok=True)

# print("✅ Ollama model directory ready")
# print("MODELS:", MODELS_DIR)


In [None]:
# Set OLLAMA_MODELS for this kernel and the shell commands it spawns, pointing at a folder on Drive.
# NOTE(review): presumably so pulled models persist across Colab sessions — confirm. Must run before `ollama serve`.
%env OLLAMA_MODELS=/content/drive/MyDrive/anki_mindmap_LLM/ollama/models


In [None]:
# Start the Ollama server in the background; stdout and stderr are redirected to ollama.log.
# NOTE(review): nothing here waits for the server to become ready, so on "Run All"
# the next cell may execute before it is listening — consider adding a readiness check.
!nohup /usr/local/bin/ollama serve > ollama.log 2>&1 &

In [None]:
# Show the models available to Ollama; this only works once the server started above is up.
!ollama list


In [None]:
# Optional dependency: the configuration cell imports bloom_filter2 and falls back to
# set-based deduplication when it is missing.
# NOTE(review): the version is not pinned; consider `%pip install` with an explicit version.
!pip install bloom-filter2

In [None]:
from pathlib import Path

# Sanity check: confirm the input CSV on the mounted Drive is visible before
# the pipeline cells run. This is the same path load_config() tries first.
path = Path("/content/drive/MyDrive/anki_mindmap_LLM/input/metadata.csv")

print("Exists:", path.exists())
print("Is file:", path.is_file())
print("Parent exists:", path.parent.exists())
# Listing the parent folder helps spot a misnamed or misplaced file.
print("Parent contents:", list(path.parent.glob("*")))

In [None]:
# ============================================================
# CELL 1: CONFIGURATION & CONSTANTS (FIXED VERSION)
# ============================================================
import json
import time
import subprocess
import requests
import pandas as pd
import sys
import hashlib
import re
import os
import logging
from pathlib import Path
from typing import Dict, List, Set, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from enum import Enum
from collections import defaultdict
from threading import Lock
from functools import lru_cache
from types import MappingProxyType

# Setup logging before anything else.
# force=True replaces whatever handlers are already attached to the root
# logger. Without it, basicConfig() silently does nothing when the root logger
# is pre-configured (as notebook kernels commonly do), so neither the INFO
# level nor the file handler below would take effect. It also keeps re-running
# this cell from depending on earlier kernel state.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('/tmp/anki_pipeline.log')
    ],
    force=True,
)
logger = logging.getLogger("anki-pipeline")

# Optional bloom filter (graceful degradation if not available).
# BLOOM_AVAILABLE records whether the import succeeded; the warning below names
# set-based deduplication as the fallback.
# NOTE(review): presumably later cells branch on BLOOM_AVAILABLE — confirm at the use site.
try:
    from bloom_filter2 import BloomFilter
    BLOOM_AVAILABLE = True
except ImportError:
    BLOOM_AVAILABLE = False
    logger.warning("bloom_filter2 not installed - using set-based deduplication")

# ============================================================
# CONFIG - IMMUTABLE
# ============================================================
def load_config():
    """Build the pipeline configuration dictionary.

    CSV resolution: the first *existing* path among, in order, the
    ``ANKI_CSV_PATH`` environment variable, the Drive default and
    ``./metadata.csv`` is used. When none of them exists the Drive default is
    returned anyway, so the caller sees a stable path in later error messages.

    The base directory comes from ``ANKI_BASE_DIR`` (default: the Drive
    project folder); ``OUT_DIR`` is ``<base>/output`` and ``CACHE_DIR`` is
    ``<base>/output/cache``.

    Side effects:
        Creates ``OUT_DIR`` (with parents) and ``CACHE_DIR`` if missing.

    Returns:
        dict: a plain, mutable dictionary of settings, including ``CACHE_DIR``.
    """
    default_csv = "/content/drive/MyDrive/anki_mindmap_LLM/input/metadata.csv"

    # The explicit environment override is tried first. Previously it was the
    # last candidate, so it could never win once any default file existed.
    csv_candidates = [
        os.getenv("ANKI_CSV_PATH", ""),
        default_csv,
        "./metadata.csv",
    ]
    CSV_FILE = next(
        (loc for loc in csv_candidates if loc and Path(loc).exists()),
        default_csv,  # nothing found: keep the historical fallback
    )

    # Determine base directory
    BASE_DIR = Path(os.getenv("ANKI_BASE_DIR", "/content/drive/MyDrive/anki_mindmap_LLM"))

    config = {
        # Core paths
        "OLLAMA_URL": "http://127.0.0.1:11434",
        "CSV_FILE": CSV_FILE,
        "BASE_DIR": BASE_DIR,
        "OUT_DIR": BASE_DIR / "output",

        # Processing limits
        "BATCH_SIZE": 5,
        "MAX_WORKERS": 4,
        "MAX_REELS": 1000,
        "CONFIDENCE_THRESHOLD": 0.65,
        "FINGERPRINT_BATCH_SIZE": 100,
        "CACHE_VERSION": "v4_fixed",

        # Circuit breaker settings
        "CIRCUIT_BREAKER_THRESHOLD": 5,
        "CIRCUIT_BREAKER_TIMEOUT": 60,

        # PRODUCTION SAFETY SWITCHES
        "ENABLE_ENRICHMENT": True,
        "ENABLE_REJECTION_LEARNING": True,
        "ENABLE_TRADEOFFS": True,
        "ENABLE_PROMPT_ROUTING": True,
        "ENABLE_FOUNDATION_EXPANSION": True,

        # PHASE 1-3: ADVANCED CONTENT PROCESSING
        "ENABLE_CONTENT_FILTERING": True,
        "ENABLE_TRANSCRIPT_NORMALIZATION": True,
        "ENABLE_HYBRID_ROUTING": True,

        # ENRICHMENT CONTROLS
        "MAX_ENRICHMENTS_PER_CONCEPT": {
            "foundation": 1,
            "intermediate": 2,
            "advanced": 2
        },
        "MAX_RETRIES": 3,

        # COMPLETION THRESHOLDS
        "MIN_CARDS_FOR_FULL": 3,
        "MIN_CARDS_FOR_PARTIAL": 2,

        # NORMALIZATION
        "NORMALIZE_TECH_SCORES": True,

        # Content density thresholds
        "DENSE_CONTENT_MIN_WORDS": 150,
        "LIGHT_CONTENT_MAX_WORDS": 100,

        # Validation
        "MIN_TRANSCRIPT_LENGTH": 80,
        "MIN_CATEGORY_CONFIDENCE": 70,
    }

    # Create directories
    config["OUT_DIR"].mkdir(parents=True, exist_ok=True)

    # Create cache directory (its parent, OUT_DIR, exists at this point)
    CACHE_DIR = config["OUT_DIR"] / "cache"
    CACHE_DIR.mkdir(exist_ok=True)
    config["CACHE_DIR"] = CACHE_DIR

    return config

# Load the configuration once and expose it through a read-only mapping view.
# MappingProxyType only blocks item assignment on CONFIG itself: the underlying
# _CONFIG dict and nested values (e.g. MAX_ENRICHMENTS_PER_CONCEPT) stay mutable.
_CONFIG = load_config()
CONFIG = MappingProxyType(_CONFIG)
logger.info(f"Loaded configuration from {CONFIG['CSV_FILE']}")

# File paths: every persisted artifact lives directly under OUT_DIR.
PROGRESS_FILE = CONFIG["OUT_DIR"] / "processed.txt"
CARD_FINGERPRINTS_FILE = CONFIG["OUT_DIR"] / "card_fingerprints.json"
BLOOM_FILE = CONFIG["OUT_DIR"] / "card_bloom.bin"
REJECTION_MEMORY_FILE = CONFIG["OUT_DIR"] / "rejection_memory.json"
PROMPT_VERSION_FILE = CONFIG["OUT_DIR"] / "prompt_version_stats.json"
ROUTING_METRICS_FILE = CONFIG["OUT_DIR"] / "routing_metrics.json"
TERMINAL_REJECTIONS_FILE = CONFIG["OUT_DIR"] / "terminal_rejections.json"
CONFIDENCE_CALIBRATION_FILE = CONFIG["OUT_DIR"] / "confidence_calibration.json"
ERROR_LOG_FILE = CONFIG["OUT_DIR"] / "error_log.json"

# ============================================================
# CIRCUIT BREAKER PATTERN
# ============================================================
class CircuitBreaker:
    """Failure-counting guard that temporarily blocks calls to a flaky dependency.

    States:
        CLOSED    - calls pass through; each success lowers the failure count by one.
        OPEN      - calls are refused until ``reset_timeout`` seconds have
                    passed since the most recent failure.
        HALF_OPEN - entered from OPEN after the timeout; a success closes the
                    breaker and clears the failure count.

    All state changes happen while holding ``self.lock``.
    """

    def __init__(self, name, failure_threshold=5, reset_timeout=60):
        self.name = name
        self.failure_threshold = failure_threshold
        self.reset_timeout = reset_timeout
        self.lock = Lock()
        self.state = "CLOSED"  # CLOSED, OPEN, HALF_OPEN
        self.failures = 0
        self.last_failure_time = 0

    def is_open(self):
        """Return True while calls must be refused.

        Note that this is not a pure query: once the timeout has elapsed it
        moves an OPEN breaker to HALF_OPEN and returns False.
        """
        with self.lock:
            if self.state != "OPEN":
                return False

            elapsed = time.time() - self.last_failure_time
            if elapsed <= self.reset_timeout:
                return True

            # Timeout has passed: let a trial call through.
            self.state = "HALF_OPEN"
            logger.info(f"Circuit breaker {self.name} moving to HALF_OPEN")
            return False

    def record_failure(self):
        """Count one failure and open the breaker once the threshold is reached."""
        with self.lock:
            self.failures += 1
            self.last_failure_time = time.time()

            threshold_reached = self.failures >= self.failure_threshold
            if threshold_reached and self.state != "OPEN":
                self.state = "OPEN"
                logger.warning(f"Circuit breaker {self.name} OPENED after {self.failures} failures")

    def record_success(self):
        """Count one success: decay failures when CLOSED, close when HALF_OPEN."""
        with self.lock:
            if self.state == "CLOSED":
                # Decay failures slowly
                self.failures = max(0, self.failures - 1)
            elif self.state == "HALF_OPEN":
                self.state = "CLOSED"
                self.failures = 0
                logger.info(f"Circuit breaker {self.name} CLOSED after successful trial")

    def call(self, func, *args, **kwargs):
        """Invoke ``func(*args, **kwargs)`` under breaker protection.

        Raises:
            RuntimeError: if the breaker is open (``func`` is not called).
            Exception: whatever ``func`` raises, after recording the failure.
        """
        if self.is_open():
            raise RuntimeError(f"Circuit breaker {self.name} is OPEN")

        try:
            outcome = func(*args, **kwargs)
            self.record_success()
        except Exception:
            self.record_failure()
            raise
        return outcome

# Initialize circuit breakers
# One module-level breaker named "ollama", configured from the CONFIG
# circuit-breaker settings (failure count threshold and reset timeout in seconds).
# NOTE(review): presumably wrapped around calls to the Ollama HTTP API — confirm at the call sites.
OLLAMA_CIRCUIT_BREAKER = CircuitBreaker(
    "ollama",
    failure_threshold=CONFIG["CIRCUIT_BREAKER_THRESHOLD"],
    reset_timeout=CONFIG["CIRCUIT_BREAKER_TIMEOUT"]
)

# ============================================================
# ERROR TRACKING
# ============================================================
class ErrorTracker:
    """Track and categorize errors for observability.

    Keeps the most recent 1000 error entries in memory together with a
    per-type counter, and persists the last 500 entries plus the counters to
    ``error_file`` as JSON. Public methods take ``self.lock``; helpers whose
    name ends in ``_locked`` expect the caller to already hold it.
    """

    def __init__(self, error_file: Path):
        self.error_file = error_file
        self.errors = []
        self.error_counts = defaultdict(int)
        self.lock = Lock()
        # Records accepted since the last write attempt (drives the periodic save).
        self._unsaved = 0
        self._load()

    def _load(self):
        """Load existing errors; a missing or unreadable file leaves the tracker empty."""
        if self.error_file.exists():
            try:
                with open(self.error_file, 'r') as f:
                    data = json.load(f)
                    self.errors = data.get("errors", [])
                    self.error_counts = defaultdict(int, data.get("counts", {}))
            except Exception as e:
                logger.error(f"Failed to load error tracker: {e}")

    def record(self, error_type: str, message: str, context: Optional[Dict] = None):
        """Record an error.

        Args:
            error_type: category key used for the per-type counter.
            message: human-readable description.
            context: optional extra data stored with the entry.
        """
        with self.lock:
            error_entry = {
                "type": error_type,
                "message": message,
                "context": context or {},
                "timestamp": time.time(),
                "iso_timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
            }

            self.errors.append(error_entry)
            self.error_counts[error_type] += 1

            # Keep only last 1000 errors
            if len(self.errors) > 1000:
                self.errors = self.errors[-1000:]

            # Periodic save every 50 records. A dedicated counter is used
            # because len(self.errors) stays at 1000 once the cap is hit,
            # which would make a modulo check fire on every record.
            self._unsaved += 1
            if self._unsaved >= 50:
                # The lock is already held here; calling save() would try to
                # take the non-reentrant Lock a second time and deadlock.
                self._write_locked()

    def _write_locked(self):
        """Write the JSON snapshot to disk; the caller must hold ``self.lock``."""
        self._unsaved = 0
        try:
            with open(self.error_file, 'w') as f:
                json.dump({
                    "errors": self.errors[-500:],  # Keep only recent errors
                    "counts": dict(self.error_counts),
                    "total_errors": len(self.errors)
                }, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to save error tracker: {e}")

    def save(self):
        """Save errors to disk (failures are logged, not raised)."""
        with self.lock:
            self._write_locked()

    def get_stats(self) -> Dict:
        """Get error statistics: in-memory total, per-type counts and the last 20 entries."""
        with self.lock:
            return {
                "total_errors": len(self.errors),
                "error_counts": dict(self.error_counts),
                "recent_errors": self.errors[-20:] if self.errors else []
            }

# Initialize error tracker
# Constructing it immediately loads any existing error_log.json from OUT_DIR.
error_tracker = ErrorTracker(ERROR_LOG_FILE)

# ============================================================
# ATOMIC FILE OPERATIONS
# ============================================================
def atomic_write(data, filepath: Path):
    """Write data atomically to prevent corruption.

    The content is written to a uniquely named temporary file next to
    ``filepath`` and then moved over the target with ``Path.replace``, so
    readers never observe a half-written file.

    Args:
        data: ``dict``/``list`` values are serialized as indented JSON;
            anything else is written as ``str(data)``.
        filepath: destination path (a plain string is accepted as well).

    Raises:
        Exception: any error from serialization or the filesystem is logged
            and re-raised after the temporary file has been removed.
    """
    import uuid  # only needed here, for the unique temp-file name

    target = Path(filepath)
    # A per-call unique name means concurrent writers to the same target never
    # share a temp file, and an unrelated sibling that happens to be called
    # "<name>.tmp" is never overwritten or renamed away.
    temp_path = target.with_name(f"{target.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(temp_path, 'w') as f:
            if isinstance(data, (dict, list)):
                json.dump(data, f, indent=2)
            else:
                f.write(str(data))
        temp_path.replace(target)
    except Exception as e:
        logger.error(f"Atomic write failed: {e}")
        # Clean up temp file
        if temp_path.exists():
            temp_path.unlink()
        raise

def atomic_read(filepath: Path, default=None):
    """Read data with corruption recovery.

    Files with a ``.json`` suffix are parsed as JSON; every other file is
    returned as text.

    Args:
        filepath: file to read.
        default: value returned when the file is missing or unreadable.

    Returns:
        The parsed JSON value, the file text, or ``default``.

    On a JSON, decoding or I/O error the file is renamed so that its suffix
    becomes ``.corrupted`` (best effort) and ``default`` is returned.
    """
    if not filepath.exists():
        return default

    try:
        with open(filepath, 'r') as f:
            if filepath.suffix == '.json':
                return json.load(f)
            return f.read()
    # UnicodeDecodeError is included because undecodable bytes are a form of
    # corruption too, and it is not a subclass of JSONDecodeError or IOError.
    except (json.JSONDecodeError, UnicodeDecodeError, IOError) as e:
        logger.warning(f"File {filepath} corrupted, attempting recovery: {e}")
        # Try to backup corrupted file
        backup_path = filepath.with_suffix('.corrupted')
        try:
            filepath.rename(backup_path)
        except OSError as rename_error:
            # Still best effort, but no longer silent (and no bare except that
            # would also swallow KeyboardInterrupt/SystemExit).
            logger.warning(f"Could not back up {filepath} to {backup_path}: {rename_error}")
        return default

# ============================================================
# INITIALIZATION STATUS
# ============================================================
# Startup banner: echo the key settings so a run's log shows which paths and limits were in effect.
logger.info("=" * 60)
logger.info("ANKI GENERATION PIPELINE - INITIALIZED")
logger.info("=" * 60)
logger.info(f"Output directory: {CONFIG['OUT_DIR']}")
logger.info(f"CSV file: {CONFIG['CSV_FILE']}")
logger.info(f"Max reels: {CONFIG['MAX_REELS']}")
logger.info(f"Circuit breaker: {CONFIG['CIRCUIT_BREAKER_THRESHOLD']} failures / {CONFIG['CIRCUIT_BREAKER_TIMEOUT']}s timeout")
logger.info("=" * 60)

In [None]:
# ============================================================
# CELL 2: DATA TYPES & SCHEMAS (COMPLETE VERSION)
# ============================================================

# ============================================================
# ENUM DEFINITIONS (MUST BE AT TOP LEVEL)
# ============================================================
# Completion level labels.
# NOTE(review): presumably tied to the MIN_CARDS_FOR_FULL / MIN_CARDS_FOR_PARTIAL
# thresholds in CONFIG — confirm at the use site.
class CompletionState(Enum):
    FULL = "full"
    PARTIAL = "partial"
    INCOMPLETE = "incomplete"

# Broad rejection / error categories; the values read like status strings.
# NOTE(review): confirm where these values are stored or compared.
class RejectionType(Enum):
    SEMANTIC = "rejected_semantic"
    STRUCTURAL = "rejected_structural"
    MECHANICAL = "error_mechanical"
    TERMINAL = "terminal_rejected"

class RejectionReason(Enum):
    """Typed rejection reasons for analytics"""
    LOW_TECHNICAL_SIGNAL = "low_technical_signal"
    INSUFFICIENT_SOURCE = "insufficient_source_material"
    MOTIVATIONAL_CONTENT = "motivational_content"
    PROMOTIONAL_CTA = "promotional_cta"
    DUPLICATE_CONTENT = "duplicate_content"
    INVALID_SCHEMA = "invalid_schema"
    NO_CARDS_GENERATED = "no_cards_generated"
    ALL_CARDS_DUPLICATES = "all_cards_duplicates"
    TOO_SHORT = "too_short"
    LOW_CONFIDENCE_CATEGORY = "low_confidence_category"
    REPEATEDLY_FAILED = "repeatedly_failed_enrichment"
    HANDLED_BY_LOGIC = "handled_by_logic"

# Content density labels.
# NOTE(review): presumably derived from the DENSE_CONTENT_MIN_WORDS /
# LIGHT_CONTENT_MAX_WORDS thresholds in CONFIG — confirm.
class ContentDensity(Enum):
    DENSE = "dense"
    LIGHT = "light"
    SKIP = "skip"

# Identifiers for the available prompt variants (A/B/C prefixes in the values).
class PromptStrategy(Enum):
    STRICT_ADVANCED = "A_STRICT"
    FOUNDATION_AWARE = "B_FOUNDATION"
    DSA_FOCUSED = "C_DSA"

class ContentIntent(Enum):
    """What the content is trying to achieve"""
    TUTORIAL = "tutorial"
    PROBLEM_SOLVING = "problem_solving"
    OVERVIEW = "overview"
    MOTIVATIONAL = "motivational"
    COMPARISON = "comparison"
    WARNING = "warning"
    BEST_PRACTICE = "best_practice"

# Card kinds; the values match the "type" strings that validate_card() dispatches on.
class CardType(Enum):
    BASIC = "basic"
    CLOZE = "cloze"
    TRADEOFF = "tradeoff"

# ============================================================
# DATA CLASSES
# ============================================================
@dataclass
class QualityDimensions:
    """Quality scores for one extracted concept, each rounded to two decimals."""
    correctness_score: float
    richness_score: float
    combined_score: float

    @classmethod
    def calculate(cls, atoms: Dict, cards: List[Dict]) -> 'QualityDimensions':
        """
        Calculate quality dimensions with proper validation.

        Correctness (0-1): 0.4 for a non-blank definition, 0.4 for at least
        three non-blank technical points, 0.2 for at least one non-blank
        solution.
        Richness (0-1): up to 0.5 from the card count (``count / 3`` capped at
        0.5), 0.3 when any related concept has a name, 0.2 when
        ``has_tradeoffs`` is truthy.
        Combined: ``0.6 * correctness + 0.4 * richness``.

        Malformed input is scored as absent instead of raising: ``None``
        values, non-string list items, non-dict related concepts, and a bare
        string where a list is expected (previously counted character by
        character) all contribute nothing.

        Args:
            atoms: extracted concept data; may be ``None``.
            cards: generated cards; may be ``None``.
        """
        atoms = atoms or {}

        def count_non_blank(value) -> int:
            # Only real sequences of strings count.
            if not isinstance(value, (list, tuple)):
                return 0
            return sum(1 for item in value if isinstance(item, str) and item.strip())

        definition = atoms.get("definition")
        has_definition = isinstance(definition, str) and bool(definition.strip())
        has_tech_points = count_non_blank(atoms.get("technical_points")) >= 3
        has_solutions = count_non_blank(atoms.get("solutions")) >= 1

        correctness = (
            (0.4 if has_definition else 0) +
            (0.4 if has_tech_points else 0) +
            (0.2 if has_solutions else 0)
        )

        card_count = len(cards) if cards else 0
        related_concepts = atoms.get("related_concepts")
        if not isinstance(related_concepts, (list, tuple)):
            related_concepts = []
        has_related = any(
            isinstance(rc, dict) and rc.get("name") for rc in related_concepts
        )
        has_tradeoffs = atoms.get("has_tradeoffs", False)

        richness = (
            min(card_count / 3, 0.5) +
            (0.3 if has_related else 0) +
            (0.2 if has_tradeoffs else 0)
        )
        richness = min(max(richness, 0), 1.0)  # Clamp between 0-1

        combined = correctness * 0.6 + richness * 0.4

        return cls(
            correctness_score=round(correctness, 2),
            richness_score=round(richness, 2),
            combined_score=round(combined, 2)
        )

@dataclass
class ModelConfig:
    """Generation settings for one Ollama model.

    ``gpu_layers`` is passed to Ollama as the ``num_gpu`` option. A negative
    value (the default, -1) sends no explicit value (``None``); ``0`` and
    positive values are sent as given.
    """
    name: str
    gpu_layers: int = -1
    temperature: float = 0.1
    num_predict: int = 2000
    top_p: float = 0.9
    timeout: int = 600

    def to_ollama_payload(self, prompt: str) -> Dict:
        """Convert to Ollama API payload (non-streaming).

        Args:
            prompt: the full prompt text to send.

        Returns:
            A dict with ``model``, ``prompt``, ``stream`` and ``options``.
        """
        return {
            "model": self.name,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": self.temperature,
                "top_p": self.top_p,
                "num_predict": self.num_predict,
                # ">= 0" rather than "> 0": an explicit 0 is a real setting
                # and was previously dropped as if it were unset.
                # NOTE(review): 0 is expected to mean "no GPU layers" — confirm against the Ollama docs.
                "num_gpu": self.gpu_layers if self.gpu_layers >= 0 else None
            }
        }

@dataclass
class ReelMetadata:
    """Structured metadata for a reel"""
    reel_id: str
    caption: str
    transcript: str
    category: str = ""
    category_confidence: int = 0
    source_url: str = ""
    created_at: str = ""

    @classmethod
    def from_dict(cls, data: Dict) -> 'ReelMetadata':
        """Create from raw dictionary data.

        Alternate keys are accepted for three fields: ``id`` for ``reel_id``,
        ``url`` for ``source_url`` and ``timestamp`` for ``created_at``.

        Missing values (absent key, ``None`` or a float NaN such as an empty
        CSV cell loaded through pandas) become ``""`` for text fields and ``0``
        for ``category_confidence``, instead of the literal strings
        ``"None"``/``"nan"`` or a crash in ``int()``.

        Raises:
            ValueError: if ``category_confidence`` is present but not numeric.
        """
        def is_missing(value) -> bool:
            # NaN is the only float that is not equal to itself.
            return value is None or (isinstance(value, float) and value != value)

        def first_text(*keys: str) -> str:
            # First key holding a real value wins; later keys are fallbacks.
            for key in keys:
                value = data.get(key)
                if not is_missing(value):
                    return str(value)
            return ""

        def to_int(value) -> int:
            if is_missing(value) or value == "":
                return 0
            try:
                return int(value)
            except ValueError:
                # Accept decimal strings such as "85.5" (truncated to 85);
                # genuinely non-numeric text still raises ValueError.
                return int(float(value))

        return cls(
            reel_id=first_text("reel_id", "id"),
            caption=first_text("caption"),
            transcript=first_text("transcript"),
            category=first_text("category"),
            category_confidence=to_int(data.get("category_confidence", 0)),
            source_url=first_text("source_url", "url"),
            created_at=first_text("created_at", "timestamp")
        )

@dataclass
class ProcessingResult:
    """Outcome record for one processed reel."""
    reel_id: str
    status: str
    reason: str = ""
    atoms: Optional[Dict] = None
    cards: List[Dict] = None
    confidence: float = 0.0
    tech_score: int = 0
    topic_class: str = ""
    prompt_strategy: str = ""
    processing_time: float = 0.0
    error_details: Optional[Dict] = None

    def to_dict(self) -> Dict:
        """Build a flat summary dict for serialization.

        ``atoms`` and ``cards`` are not included; the cards are represented by
        ``card_count`` only, and ``timestamp`` is the time of this call.
        """
        copied_fields = (
            "reel_id", "status", "reason", "confidence", "tech_score",
            "topic_class", "prompt_strategy", "processing_time",
        )
        summary = {field_name: getattr(self, field_name) for field_name in copied_fields}
        if self.cards:
            summary["card_count"] = len(self.cards)
        else:
            summary["card_count"] = 0
        summary["error_details"] = self.error_details
        summary["timestamp"] = time.time()
        return summary

# ============================================================
# SCHEMA CLASSES WITH VALIDATION
# ============================================================
class BaseSchema:
    """Minimal key-based schema: subclasses list ``required`` and ``optional`` field names."""
    required: Set[str] = set()
    optional: Set[str] = set()

    @classmethod
    def validate(cls, data: dict) -> Tuple[bool, List[str]]:
        """
        Check ``data`` against the schema and return ``(is_valid, errors)``.

        Only presence and non-None-ness of the required fields are checked;
        value types are not inspected.
        """
        if not isinstance(data, dict):
            return False, ["Data must be a dictionary"]

        problems = []

        # Required keys that are absent altogether
        absent = cls.required.difference(data.keys())
        if absent:
            problems.append(f"Missing required fields: {absent}")

        # Required keys that are present but explicitly None
        problems.extend(
            f"Field '{name}' cannot be None"
            for name in cls.required
            if name in data and data[name] is None
        )

        return not problems, problems

    @classmethod
    def sanitize(cls, data: dict) -> dict:
        """Return a copy of ``data`` restricted to the schema's known fields (values untouched)."""
        known_fields = cls.required.union(cls.optional)
        return {name: data[name] for name in known_fields if name in data}

# Concrete schemas. Each one only declares field names: BaseSchema.validate()
# checks that the `required` keys are present and not None, and
# BaseSchema.sanitize() drops every key outside `required` and `optional`.

# Extracted concept ("atoms") record.
class AtomsSchema(BaseSchema):
    required = {
        "concept", "category", "definition",
        "technical_points", "solutions", "impact", "has_tradeoffs",
    }
    optional = {
        "valid", "reject_reason", "related_concepts", 
        "learning_key", "topic_class", "tech_score",
        "was_enriched", "prompt_version", "routing_reason"
    }

# Card with a front and a back.
class BasicCardSchema(BaseSchema):
    required = {"type", "front", "back"}
    optional = {"concept_source", "tags", "quality", "priority", "reel_id"}

# Card carrying a single "cloze" field.
class ClozeCardSchema(BaseSchema):
    required = {"type", "cloze"}
    optional = {"concept_source", "tags", "quality", "priority", "reel_id"}

# Card carrying a "tradeoffs" field; "front" is optional here.
class TradeoffCardSchema(BaseSchema):
    required = {"type", "tradeoffs"}
    optional = {"front", "concept_source", "tags", "quality", "priority", "reel_id"}

# Wrapper object holding a "cards" entry plus optional summary fields.
class CardsContainerSchema(BaseSchema):
    required = {"cards"}
    optional = {"count", "confidence", "completion_state"}

class EnrichmentRequestSchema(BaseSchema):
    """Schema for enrichment requests"""
    required = {"original_atoms", "reason", "attempt_number"}
    optional = {"strategy_hint", "max_retries", "timeout"}

# ============================================================
# GLOBAL STATE (PROPERLY MANAGED)
# ============================================================
from threading import Lock

# Thread-safe metrics with locks
class MetricsTracker:
    """Thread-safe metrics tracking.

    Holds named counters and, per timing key, the most recent 1000 duration
    samples. Public methods take ``self.lock``; ``_stats_locked`` expects the
    caller to already hold it.
    """

    def __init__(self):
        self.counts = defaultdict(int)
        self.timings = defaultdict(list)
        self.lock = Lock()
        self.start_time = time.time()

    def increment(self, key: str, value: int = 1):
        """Increment a counter"""
        with self.lock:
            self.counts[key] += value

    def record_timing(self, key: str, duration: float):
        """Record a timing measurement"""
        with self.lock:
            self.timings[key].append(duration)
            # Keep only last 1000 measurements
            if len(self.timings[key]) > 1000:
                self.timings[key] = self.timings[key][-1000:]

    def _stats_locked(self, key: str) -> Dict:
        """Summarize one timing series; the caller must hold ``self.lock``."""
        # .get() avoids creating an empty series for unknown keys in the defaultdict.
        values = self.timings.get(key)
        if not values:
            return {"count": 0}
        return {
            "count": len(values),
            "mean": sum(values) / len(values),
            "min": min(values),
            "max": max(values),
            "p95": sorted(values)[int(len(values) * 0.95)] if len(values) > 1 else values[0]
        }

    def get_stats(self, key: str) -> Dict:
        """Get statistics for a key (``{"count": 0}`` when nothing was recorded)."""
        with self.lock:
            return self._stats_locked(key)

    def to_dict(self) -> Dict:
        """Export all metrics: counters, per-key timing summaries, uptime and a timestamp."""
        with self.lock:
            return {
                "counts": dict(self.counts),
                # Use the lock-free helper: get_stats() would try to take the
                # non-reentrant Lock held here and block forever.
                "timings": {k: self._stats_locked(k) for k in self.timings},
                "uptime": time.time() - self.start_time,
                "timestamp": time.time()
            }

    def reset(self):
        """Reset all metrics (for testing)"""
        with self.lock:
            self.counts.clear()
            self.timings.clear()
            self.start_time = time.time()

# Initialize global trackers (two independent MetricsTracker instances).
ROUTING_METRICS = MetricsTracker()
PROCESSING_METRICS = MetricsTracker()

# Each mutable global below is paired with its own Lock. Nothing enforces the
# pairing: code that touches the data is expected to hold the matching lock.

# Model capability cache with lock
MODEL_CAPABILITY_CACHE = {}
MODEL_CAPABILITY_LOCK = Lock()

# Prompt version stats with lock
PROMPT_VERSION_STATS = {}
PROMPT_VERSION_LOCK = Lock()

# Confidence calibration with lock.
# Buckets are keyed by confidence range; no bucket exists for values below 0.5.
# NOTE(review): whether the ranges are inclusive at the upper or lower edge is
# decided by the code that fills them — confirm there.
CONFIDENCE_CALIBRATION = {
    "buckets": {
        "0.5-0.6": {"total": 0, "accepted": 0},
        "0.6-0.7": {"total": 0, "accepted": 0},
        "0.7-0.8": {"total": 0, "accepted": 0},
        "0.8-0.9": {"total": 0, "accepted": 0},
        "0.9-1.0": {"total": 0, "accepted": 0}
    },
    "total_checks": 0,
    "calibration_factor": 1.0
}
CONFIDENCE_CALIBRATION_LOCK = Lock()

# Enrichment budget tracking: a counter and a float timestamp per key.
# NOTE(review): presumably keyed by concept, with budgets reset after
# ENRICHMENT_BUDGET_RESET_DAYS days — confirm at the use site.
ENRICHMENT_BUDGET = defaultdict(int)
ENRICHMENT_TIMESTAMPS = defaultdict(float)
ENRICHMENT_BUDGET_LOCK = Lock()
ENRICHMENT_BUDGET_RESET_DAYS = 7

# Performance monitoring: one sample list per pipeline stage.
# NOTE(review): MAX_PERFORMANCE_SAMPLES is presumably the cap applied to each list — confirm.
PERFORMANCE_SAMPLES = {
    "extract_atoms": [],
    "generate_cards": [],
    "total_processing": []
}
PERFORMANCE_LOCK = Lock()
MAX_PERFORMANCE_SAMPLES = 1000

# ============================================================
# VALIDATION UTILITIES
# ============================================================
class ValidationError(Exception):
    """Raised when data does not satisfy a schema (see validate_and_sanitize)."""

def validate_and_sanitize(data: dict, schema_class) -> dict:
    """
    Run ``schema_class.validate`` on ``data`` and, when it passes, return
    ``schema_class.sanitize(data)``.

    Raises:
        ValidationError: when validation reports any error; the message
            contains the list of errors.
    """
    passed, problems = schema_class.validate(data)
    if passed:
        return schema_class.sanitize(data)
    raise ValidationError(f"Schema validation failed: {problems}")

def validate_card(card: dict) -> Tuple[bool, List[str]]:
    """
    Validate a card based on its type.

    The ``type`` key selects the schema (``basic``, ``cloze`` or ``tradeoff``);
    a card without a ``type`` key is treated as ``basic``.

    Returns:
        ``(is_valid, errors)``. A non-dict card or an unknown type is reported
        as a validation error instead of raising.
    """
    # Guard before the first .get(): anything that is not a dict would raise
    # AttributeError here. The message matches BaseSchema.validate().
    if not isinstance(card, dict):
        return False, ["Data must be a dictionary"]

    card_type = card.get("type", "basic")

    if card_type == "basic":
        schema = BasicCardSchema
    elif card_type == "cloze":
        schema = ClozeCardSchema
    elif card_type == "tradeoff":
        schema = TradeoffCardSchema
    else:
        return False, [f"Unknown card type: {card_type}"]

    return schema.validate(card)

def validate_atoms(atoms: dict) -> Tuple[bool, List[str]]:
    """
    Validate atoms with additional semantic checks.

    Order of checks:
      1. ``AtomsSchema.validate`` (required keys present and not None).
      2. An explicit ``valid: False`` flag rejects the atoms outright.
      3. ``concept`` and ``definition`` must be non-blank strings and
         ``technical_points`` must be a list with at least two non-blank
         string entries.

    Values of the wrong type are reported as validation errors instead of
    raising. In particular, a bare string in ``technical_points`` is no longer
    counted character by character.

    Returns:
        ``(is_valid, errors)``.
    """
    # Schema validation
    is_valid, errors = AtomsSchema.validate(atoms)
    if not is_valid:
        return False, errors

    # Semantic validation
    if not atoms.get("valid", True):
        return False, ["Atoms marked as invalid"]

    def stripped_text(value) -> str:
        # Non-string values count as empty text.
        return value.strip() if isinstance(value, str) else ""

    if not stripped_text(atoms.get("concept", "")):
        errors.append("Concept cannot be empty")

    if not stripped_text(atoms.get("definition", "")):
        errors.append("Definition cannot be empty")

    tech_points = atoms.get("technical_points", [])
    if not isinstance(tech_points, (list, tuple)):
        tech_points = []
    usable_points = [p for p in tech_points if isinstance(p, str) and p.strip()]
    if len(usable_points) < 2:
        errors.append("Need at least 2 technical points")

    return len(errors) == 0, errors

# ============================================================
# SERIALIZATION UTILITIES
# ============================================================
class JSONEncoder(json.JSONEncoder):
    """JSON encoder that understands the pipeline's own value types."""

    def default(self, obj):
        """Convert objects that ``json`` cannot serialize natively.

        Checked in order: Enum members (their ``value``), the pipeline
        dataclasses (their attribute dict), ``Path`` objects (as strings) and
        anything exposing ``to_dict()``. Everything else is delegated to the
        base class, which raises ``TypeError``.
        """
        if isinstance(obj, Enum):
            return obj.value

        dataclass_types = (ReelMetadata, ProcessingResult, ModelConfig)
        if isinstance(obj, dataclass_types):
            return obj.__dict__

        if isinstance(obj, Path):
            return str(obj)

        if hasattr(obj, 'to_dict'):
            return obj.to_dict()

        return super().default(obj)

def serialize_result(result: ProcessingResult) -> str:
    """Render ``result.to_dict()`` as an indented JSON string using the pipeline's JSONEncoder."""
    summary = result.to_dict()
    return json.dumps(summary, cls=JSONEncoder, indent=2)

def deserialize_result(data: dict) -> "ProcessingResult":
    """Deserialize processing result from dictionary.

    Keys that are not ``ProcessingResult`` fields are ignored. This matters
    for dictionaries produced by ``ProcessingResult.to_dict()``, which adds
    the derived ``card_count`` and ``timestamp`` entries; passing those to the
    constructor raised ``TypeError``.

    Raises:
        TypeError: if a required field (``reel_id``, ``status``) is missing.
    """
    known_fields = ProcessingResult.__dataclass_fields__
    accepted = {key: value for key, value in data.items() if key in known_fields}
    return ProcessingResult(**accepted)

# ============================================================
# STATISTICS AND ANALYTICS
# ============================================================
class PipelineStatistics:
    """Collect and analyze pipeline statistics.

    All counters live in ``self.stats`` and are guarded by ``self.lock``.
    Public methods take the lock; ``_summary_locked`` expects the caller to
    already hold it.
    """

    def __init__(self):
        self.stats = {
            "reels_processed": 0,
            "reels_successful": 0,
            "reels_failed": 0,
            "reels_rejected": 0,
            "cards_generated": 0,
            "duplicates_filtered": 0,
            "enrichment_attempts": 0,
            "enrichment_successes": 0,
            "avg_confidence": 0.0,
            "avg_processing_time": 0.0,
            "by_topic_class": defaultdict(int),
            "by_content_density": defaultdict(int),
            "by_rejection_reason": defaultdict(int),
            "timestamp": time.time()
        }
        self.lock = Lock()

    def update(self, result: "ProcessingResult"):
        """Update statistics with processing result.

        ``status == "success"`` updates the success counters and the running
        averages (computed over successful reels only), ``"rejected"`` updates
        the rejection counters, and any other status counts as a failure.
        """
        with self.lock:
            self.stats["reels_processed"] += 1

            if result.status == "success":
                self.stats["reels_successful"] += 1
                self.stats["cards_generated"] += len(result.cards or [])
                # Incremental mean: old_mean * (n - 1) + new_value, divided by n.
                self.stats["avg_confidence"] = (
                    (self.stats["avg_confidence"] * (self.stats["reels_successful"] - 1) + result.confidence)
                    / self.stats["reels_successful"]
                )
                self.stats["avg_processing_time"] = (
                    (self.stats["avg_processing_time"] * (self.stats["reels_successful"] - 1) + result.processing_time)
                    / self.stats["reels_successful"]
                )

                if result.topic_class:
                    self.stats["by_topic_class"][result.topic_class] += 1
            elif result.status == "rejected":
                self.stats["reels_rejected"] += 1
                if result.reason:
                    self.stats["by_rejection_reason"][result.reason] += 1
            else:
                self.stats["reels_failed"] += 1

    def record_enrichment(self, successful: bool):
        """Record enrichment attempt"""
        with self.lock:
            self.stats["enrichment_attempts"] += 1
            if successful:
                self.stats["enrichment_successes"] += 1

    def record_duplicates(self, count: int):
        """Record duplicates filtered"""
        with self.lock:
            self.stats["duplicates_filtered"] += count

    def record_content_density(self, density: str):
        """Record content density classification"""
        with self.lock:
            self.stats["by_content_density"][density] += 1

    def _summary_locked(self) -> Dict:
        """Build the summary dict; the caller must hold ``self.lock``."""
        # Copy the nested counters too, so the snapshot does not keep changing
        # (or get mutated by the caller) after the lock is released.
        summary = {
            key: (value.copy() if isinstance(value, dict) else value)
            for key, value in self.stats.items()
        }
        summary["success_rate"] = (
            self.stats["reels_successful"] / self.stats["reels_processed"] * 100
            if self.stats["reels_processed"] > 0 else 0
        )
        summary["enrichment_success_rate"] = (
            self.stats["enrichment_successes"] / self.stats["enrichment_attempts"] * 100
            if self.stats["enrichment_attempts"] > 0 else 0
        )
        summary["cards_per_reel"] = (
            self.stats["cards_generated"] / self.stats["reels_successful"]
            if self.stats["reels_successful"] > 0 else 0
        )
        summary["timestamp"] = time.time()

        return summary

    def get_summary(self) -> Dict:
        """Get summary statistics: raw counters plus derived rates (percentages) and cards per reel."""
        with self.lock:
            return self._summary_locked()

    def save(self, filepath: Path):
        """Save statistics to file via atomic_write."""
        with self.lock:
            # Use the lock-free helper: get_summary() would try to take the
            # non-reentrant Lock held here and block forever.
            atomic_write(self._summary_locked(), filepath)

# Initialize statistics tracker
# Module-level singleton; re-running this cell replaces it and discards all counters collected so far.
STATISTICS = PipelineStatistics()

# ============================================================
# INITIALIZATION LOGGING
# ============================================================
# Log the values of the main enums so a run's log records which vocabularies were active.
logger.info("=" * 60)
logger.info("DATA TYPES & SCHEMAS INITIALIZED")
logger.info("=" * 60)
logger.info(f"CompletionState: {[s.value for s in CompletionState]}")
logger.info(f"RejectionType: {[r.value for r in RejectionType]}")
logger.info(f"ContentDensity: {[d.value for d in ContentDensity]}")
logger.info(f"PromptStrategy: {[s.value for s in PromptStrategy]}")
logger.info("=" * 60)

In [None]:
# ============================================================
# CELL 3: CONTENT PROCESSING & CLASSIFICATION (COMPLETE)
# ============================================================

import re
from typing import List, Tuple, Dict, Set
from collections import Counter
import json

logger = logging.getLogger("anki-pipeline.content")

# ============================================================
# CONTENT INTENT PATTERNS
# ============================================================
# Regexes searched (case-insensitively) in the lowercased caption+transcript by
# detect_content_intent; every pattern that matches adds 0.3 to its intent.
#
# Contractions are spelled out explicitly. Transcripts may render "don't" as
# "don't", "don’t", "dont" or "don t", so the apostrophe is matched with an
# optional single non-word character (\W?) instead of a greedy ".*", which
# would match almost any line containing the first word.
CONTENT_INTENT_PATTERNS = {
    ContentIntent.TUTORIAL: [
        r"step\s+\d+", r"first\s+you", r"then\s+you", r"next\s+you",
        # "let's walk" / "lets walk" / "let us walk"
        r"\blet(?:\W?s|\s+us)\s+walk", r"walk.*through", r"follow\s+along",
        r"tutorial", r"guide", r"how\s+to", r"implement"
    ],
    ContentIntent.PROBLEM_SOLVING: [
        r"problem", r"solution", r"solve", r"issue",
        r"challenge", r"debug", r"fix", r"error",
        r"bug", r"exception", r"troubleshoot"
    ],
    ContentIntent.OVERVIEW: [
        r"overview", r"introduction", r"what\s+is",
        r"basics\s+of", r"fundamentals", r"explained",
        r"understanding", r"about\s+[a-z]+\s+in"
    ],
    ContentIntent.MOTIVATIONAL: [
        r"you\s+must", r"trust\s+me", r"important\s+to",
        r"should\s+know", r"every\s+developer",
        r"stop\s+scrolling", r"no\s+one\s+tells",
        r"secret", r"hidden", r"underrated"
    ],
    ContentIntent.COMPARISON: [
        # \b keeps "vs" from matching inside words such as "devs".
        r"\bvs\.", r"versus", r"compared\s+to",
        r"difference\s+between", r"\bvs\s+",
        r"rather\s+than", r"instead\s+of",
        r"alternative\s+to", r"choice\s+between"
    ],
    ContentIntent.WARNING: [
        r"warning", r"danger", r"avoid", r"never",
        # "don't use" / "do not use", "shouldn't" / "should not"
        r"\b(?:don\W?t|do\s+not)\s+use", r"\bshould(?:n\W?t|\s+not)\b", r"mistake",
        r"pitfall", r"gotcha", r"common\s+error"
    ],
    ContentIntent.BEST_PRACTICE: [
        r"best\s+practice", r"recommended", r"should\s+use",
        r"proper\s+way", r"correct\s+approach",
        r"industry\s+standard", r"production\s+ready"
    ]
}

# ============================================================
# MECHANISM MARKERS FOR DEEP CONTENT DETECTION
# ============================================================
# Phrases that suggest the content explains *why/how* something works.
# technical_score / technical_score_detailed test each marker with a plain
# substring check against the lowercased text; each hit is worth 2 points and
# the total bonus is capped at 4.
# NOTE(review): because matching is by substring, short markers also fire
# inside longer words (e.g. "internal" in "internally", "breaks" in "outbreaks").
MECHANISM_MARKERS = [
    # Contract/guarantee markers
    "contract", "invariant", "guarantee", "violates", "breaks", "ensures",
    "depends on", "requires", "only if", "undefined behavior", "happens-before",
    
    # Concurrency markers
    "race condition", "memory model", "visibility", "ordering", "consistency",
    "deadlock", "livelock", "starvation", "thread safety", "atomicity",
    
    # Architecture markers
    "semantics", "specification", "constraint", "precondition", "postcondition",
    "internal", "mechanism", "how it works", "under the hood", "implementation",
    
    # Performance markers
    "latency", "throughput", "bottleneck", "overhead", "scalability",
    "optimization", "efficiency", "performance characteristics",
    
    # System design markers
    "trade-off", "tradeoff", "compromise", "sacrifice", "balance",
    "pros and cons", "advantages disadvantages", "strengths weaknesses"
]

# ============================================================
# TOPIC CLASSIFICATION CONFIGURATION
# ============================================================
# Per-depth-class configuration used by classify_topic and
# classify_topic_with_confidence.
#
# - "keywords": tested by substring against the lowercased caption+transcript.
# - "min_transcript_words": baseline word count used to adjust confidence.
# - Insertion order matters: classify_topic walks the classes in REVERSE
#   order (advanced -> intermediate -> foundation) and returns the first hit.
# NOTE(review): "threshold_adjustment", "expected_depth", "expected_cards" and
# "confidence_boost" are not read by the classifiers in this cell; presumably
# they are consumed by later cells - confirm before changing them.
TOPIC_CLASSES = {
    "foundation": {
        "keywords": [
            # Core Java
            "immutability", "string", "array", "oops", "collection",
            "hashmap", "equals", "hashcode", "sorting", "basics",
            "fundamentals", "introduction", "inheritance", "polymorphism",
            "encapsulation", "abstraction", "list", "set", "queue",
            
            # Basic concepts
            "variable", "method", "class", "object", "constructor",
            "static", "final", "interface", "abstract class",
            "exception", "try catch", "finally", "throw",
            
            # Common patterns
            "singleton", "factory", "builder", "observer"
        ],
        "threshold_adjustment": -3,
        "expected_depth": "intermediate",
        "expected_cards": 2,
        "confidence_boost": 1.2,
        "min_transcript_words": 80
    },
    "intermediate": {
        "keywords": [
            # Concurrency
            "concurrency", "thread", "lock", "synchronization", "volatile",
            "atomic", "concurrent", "deadlock", "race condition",
            "memory model", "jvm", "garbage collection", "classloader",
            
            # Advanced Java
            "generics", "annotations", "reflection", "serialization",
            "stream api", "lambda", "functional interface", "optional",
            
            # Spring basics
            "spring", "@autowired", "@component", "@service", "@repository",
            "dependency injection", "bean", "application context"
        ],
        "threshold_adjustment": -1,
        "expected_depth": "deep",
        "expected_cards": 3,
        "confidence_boost": 1.0,
        "min_transcript_words": 120
    },
    "advanced": {
        "keywords": [
            # Distributed systems
            "distributed", "saga", "circuit breaker", "event sourcing",
            "cqrs", "microservice", "kubernetes", "docker", "kafka",
            "consistency model", "partition tolerance", "cap theorem",
            
            # System design
            "load balancing", "caching strategy", "database sharding",
            "message queue", "api gateway", "service mesh",
            
            # Cloud/DevOps
            "azure", "aws", "gcp", "terraform", "helm", "jenkins",
            "ci/cd", "monitoring", "observability"
        ],
        "threshold_adjustment": 0,
        "expected_depth": "very_deep",
        "expected_cards": 4,
        "confidence_boost": 0.9,
        "min_transcript_words": 200
    }
}

# ============================================================
# TECHNICAL KEYWORDS FOR SCORING (ENHANCED)
# ============================================================
# Keyword lists grouped by technical area. technical_score and
# technical_score_detailed count, per category, how many keywords occur in the
# lowercased text (plain substring test) and credit at most 3 points per
# category.
# NOTE(review): substring matching means short entries also match inside other
# words (e.g. "base" in "database", "sql" in "nosql", "index" in "indexes").
TECHNICAL_KEYWORDS = {
    "java": [
        "heap", "stack", "garbage collection", "jvm", "bytecode",
        "classloader", "reflection", "synchronized", "volatile",
        "thread", "immutable", "serialization", "generics",
        "annotation", "enum", "interface", "abstract class",
        "try-with-resources", "autocloseable", "stream",
        "optional", "lambda", "method reference"
    ],
    "spring": [
        "dependency injection", "bean", "autowired", "aop",
        "transactional", "repository", "service", "controller",
        "configuration", "component scan", "spring boot",
        "starter", "actuator", "security", "jpa", "hibernate"
    ],
    "microservices": [
        "saga", "event sourcing", "cqrs", "api gateway",
        "service mesh", "circuit breaker", "bulkhead", "rate limiting",
        "idempotency", "distributed transaction", "event bus",
        "service discovery", "config server"
    ],
    "algorithms": [
        "time complexity", "space complexity", "big o", "recursion",
        "dynamic programming", "backtracking", "greedy", "divide and conquer",
        "sorting algorithm", "search algorithm", "graph algorithm",
        "tree traversal", "hash table", "linked list"
    ],
    "system_design": [
        "scalability", "availability", "consistency", "partition tolerance",
        "load balancer", "caching", "sharding", "replication",
        "eventual consistency", "cap theorem", "acid", "base",
        "vertical scaling", "horizontal scaling"
    ],
    "databases": [
        "index", "transaction", "isolation level", "acid compliance",
        "normalization", "denormalization", "join", "foreign key",
        "primary key", "nosql", "sql", "orm", "migration"
    ],
    "testing": [
        "unit test", "integration test", "mock", "stub", "spy",
        "test coverage", "tdd", "bdd", "junit", "testng",
        "mockito", "assertj", "hamcrest"
    ]
}

# Foundation-specific keywords for better scoring.
# Counted by substring in technical_score / technical_score_detailed and
# credited with up to +4 points, on top of the TECHNICAL_KEYWORDS categories
# (several entries appear in both lists, so they are counted in both places).
FOUNDATION_ENHANCEMENT_KEYWORDS = [
    "string", "array", "list", "map", "set", "queue", "stack",
    "equals", "hashcode", "immutable", "mutable", "inheritance",
    "polymorphism", "encapsulation", "abstraction", "interface",
    "abstract class", "constructor", "static", "final", "volatile",
    "transient", "synchronized", "exception", "try", "catch", "finally",
    "stream", "lambda", "functional interface", "optional", "generic"
]

# ============================================================
# PREPROCESSING UTILITIES
# ============================================================
class TextPreprocessor:
    """Text preprocessing utilities with a small, bounded tokenization cache.

    The cache maps an MD5 digest of the input to its token tuple and evicts
    the oldest entry (insertion order) once ``_cache_size`` entries exist.
    """

    # Fenced code: ```lang\n...\n``` (optional lowercase language tag).
    _FENCED_CODE_RE = re.compile(r'```(?:[a-z]+)?\n(.*?)\n```', re.DOTALL)
    # Inline code: `...`
    _INLINE_CODE_RE = re.compile(r'`([^`]+)`')
    _WHITESPACE_RE = re.compile(r'\s+')
    _TOKEN_RE = re.compile(r'\b\w+\b')
    # Conversational fillers, applied one after another in this order.
    _FILLER_RES = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'\bso\b', r'\bwell\b', r'\bright\b', r'\bokay\b',
            r'\byou know\b', r'\bi mean\b', r'\blike\b',
            r'\bactually\b', r'\bbasically\b', r'\bliterally\b',
            r'\bjust\b', r'\breally\b', r'\bvery\b', r'\bquite\b'
        )
    )

    def __init__(self):
        self._cache = {}
        self._cache_size = 10000

    def _normalize_cache_key(self, text: str) -> str:
        """Create cache key for text (hex MD5 digest; not used for security)."""
        return hashlib.md5(text.encode()).hexdigest()

    def tokenize(self, text: str) -> List[str]:
        """Tokenize text into lowercase word tokens.

        Returns a fresh list on every call, so callers may mutate the result
        without corrupting the cached entry.
        """
        cache_key = self._normalize_cache_key(f"tokenize:{text}")
        cached = self._cache.get(cache_key)
        if cached is not None:
            return list(cached)

        tokens = tuple(self._TOKEN_RE.findall(text.lower()))

        if len(self._cache) >= self._cache_size:
            # Remove oldest entry (FIFO); dicts preserve insertion order.
            self._cache.pop(next(iter(self._cache)))

        self._cache[cache_key] = tokens
        return list(tokens)

    def remove_fillers(self, text: str) -> str:
        """Remove conversational fillers and collapse the leftover whitespace."""
        for filler_re in self._FILLER_RES:
            text = filler_re.sub(' ', text)

        return self._WHITESPACE_RE.sub(' ', text).strip()

    def extract_code_snippets(self, text: str) -> List[str]:
        """Extract code snippets from text.

        Returns the bodies of fenced blocks first, followed by inline
        backtick spans. Fenced blocks are removed before the inline scan;
        otherwise their backticks pair up with each other and with later
        inline code, producing bogus snippets and hiding real ones.
        """
        code_blocks = self._FENCED_CODE_RE.findall(text)
        remainder = self._FENCED_CODE_RE.sub(' ', text) if code_blocks else text
        inline_code = self._INLINE_CODE_RE.findall(remainder)

        return code_blocks + inline_code

    def has_code_example(self, text: str) -> bool:
        """Check if text contains at least one fenced or inline code snippet."""
        return len(self.extract_code_snippets(text)) > 0

# Initialize preprocessor
preprocessor = TextPreprocessor()

# ============================================================
# CONTENT INTENT CLASSIFICATION
# ============================================================
def detect_content_intent(caption: str, transcript: str) -> List[Tuple[ContentIntent, float]]:
    """
    Detect content intent with confidence scores.

    Scoring per intent (capped at 1.0):
      - +0.3 for every regex in CONTENT_INTENT_PATTERNS[intent] found in the text
      - +0.5 for every intent keyword contained in the text (substring test)
      - +0.2 length bonus: TUTORIAL when the transcript has > 200 words,
        OVERVIEW when it has < 150 words

    Args:
        caption: Reel caption.
        transcript: Reel transcript; its word count drives the length bonus.

    Returns:
        List of (intent, confidence) sorted by confidence, highest first.
        Intents scoring 0 are omitted. When nothing scores at least 0.3 an
        (OVERVIEW, 0.2) fallback is appended unless OVERVIEW is already listed,
        so the result is never empty and never contains OVERVIEW twice.
    """
    combined_text = f"{caption} {transcript}".lower()
    word_count = len(transcript.split())

    # Exact keyword matching (stronger signal than the regex patterns).
    # NOTE(review): substring test - "vs" also matches "devs", "fix" matches "prefix".
    intent_keywords = {
        ContentIntent.TUTORIAL: ["tutorial", "guide", "walkthrough", "step-by-step"],
        ContentIntent.PROBLEM_SOLVING: ["problem", "solution", "solve", "fix"],
        ContentIntent.OVERVIEW: ["overview", "introduction", "basics", "fundamentals"],
        ContentIntent.MOTIVATIONAL: ["must know", "important", "secret", "hidden"],
        ContentIntent.COMPARISON: ["vs", "versus", "compared", "difference"],
        ContentIntent.WARNING: ["warning", "avoid", "never", "danger"],
        ContentIntent.BEST_PRACTICE: ["best practice", "recommended", "should use"]
    }

    intent_scores = []

    for intent, patterns in CONTENT_INTENT_PATTERNS.items():
        score = 0.0

        for pattern in patterns:
            if re.search(pattern, combined_text, re.IGNORECASE):
                score += 0.3

        for keyword in intent_keywords.get(intent, ()):
            if keyword in combined_text:
                score += 0.5

        # Length-based adjustments
        if intent == ContentIntent.TUTORIAL and word_count > 200:
            score += 0.2
        elif intent == ContentIntent.OVERVIEW and word_count < 150:
            score += 0.2

        if score > 0:
            intent_scores.append((intent, min(score, 1.0)))

    intent_scores.sort(key=lambda pair: pair[1], reverse=True)

    # If no strong intent detected, make sure OVERVIEW is available as fallback.
    if not intent_scores or intent_scores[0][1] < 0.3:
        already_listed = any(intent == ContentIntent.OVERVIEW for intent, _ in intent_scores)
        if not already_listed:
            intent_scores.append((ContentIntent.OVERVIEW, 0.2))

    return intent_scores

# ============================================================
# TOPIC CLASSIFICATION
# ============================================================
def classify_topic(caption: str, category: str, transcript: str = "") -> str:
    """
    Classify topic depth based on keywords and context.

    Stage 1 - keywords: TOPIC_CLASSES is walked from most to least advanced;
    the first class whose keywords occur in the lowercased caption+transcript
    wins (one hit suffices, "foundation" needs two).

    Stage 2 - fallback heuristics when no class matched: combines transcript
    length (in characters), trade-off/decision phrases and depth markers.

    Args:
        caption: Reel caption.
        category: Accepted for interface compatibility; not used by the
            classification.
        transcript: Reel transcript (optional).

    Returns: 'foundation', 'intermediate', or 'advanced'
    """
    text = f"{caption} {transcript}".lower()

    # Check each class in reverse order (advanced first)
    for topic_class, config in reversed(list(TOPIC_CLASSES.items())):
        matches = sum(1 for kw in config["keywords"] if kw in text)
        # Foundation keywords are generic words, so require a stronger signal.
        required_matches = 2 if topic_class == "foundation" else 1
        if matches >= required_matches:
            return topic_class

    # Decision/comparison/tradeoff signals
    tradeoff_signals = ["vs", "versus", "instead", "trade-off", "tradeoff",
                       "compare", "comparison", "difference between", "when to", "choose"]
    decision_signals = ["decide", "decision", "which", "what if", "scenario",
                       "use case", "depends on", "context", "situation"]

    has_tradeoff = any(signal in text for signal in tradeoff_signals)
    has_decision = any(signal in text for signal in decision_signals)

    # Count technical depth markers
    depth_markers = ["internal", "mechanism", "how it works", "under the hood",
                    "implementation", "architecture", "design pattern", "best practice"]
    depth_count = sum(1 for marker in depth_markers if marker in text)

    # NOTE: character length, not word count.
    transcript_len = len(transcript.strip())

    # Advanced signals: long transcript + tradeoffs or several depth markers
    if transcript_len > 400 and (has_tradeoff or depth_count >= 2):
        return "advanced"

    # Intermediate signals: medium length + decisions OR good depth
    if transcript_len > 200 and (has_decision or has_tradeoff or depth_count >= 1):
        return "intermediate"

    # Longer transcripts without any signal default to intermediate (not foundation)
    return "intermediate" if transcript_len > 250 else "foundation"

def classify_topic_with_confidence(caption: str, category: str, transcript: str = "") -> Tuple[str, float]:
    """
    Classify the topic and attach a heuristic confidence.

    The base confidence depends on the chosen class and on whether at least
    two of that class's keywords occur in the text; it is then nudged up for
    long transcripts (capped at 0.95) or down for very short ones (floored at
    0.3), relative to the class's ``min_transcript_words``.

    Returns:
        (topic_class, confidence rounded to two decimals)
    """
    label = classify_topic(caption, category, transcript)
    class_config = TOPIC_CLASSES.get(label, {})

    # How many of the winning class's keywords actually occur in the text
    haystack = f"{caption} {transcript}".lower()
    hit_count = 0
    for keyword in class_config.get("keywords", []):
        if keyword in haystack:
            hit_count += 1

    # (confidence with >= 2 hits, confidence otherwise); foundation is the default row
    strong, weak = {
        "advanced": (0.7, 0.5),
        "intermediate": (0.8, 0.6),
    }.get(label, (0.9, 0.7))
    confidence = strong if hit_count >= 2 else weak

    # Transcript length relative to what the class expects
    n_words = len(transcript.split())
    baseline_words = class_config.get("min_transcript_words", 100)

    if n_words >= baseline_words * 1.5:
        confidence = min(confidence + 0.1, 0.95)
    elif n_words < baseline_words * 0.5:
        confidence = max(confidence - 0.2, 0.3)

    return label, round(confidence, 2)

# ============================================================
# CONTENT DENSITY CLASSIFICATION
# ============================================================
def _compute_density_signals(text: str, word_count: int) -> Tuple[Dict[str, bool], Dict[str, bool]]:
    """Evaluate every dense/light heuristic once; returns (dense_signals, light_signals)."""

    def mentions_any(markers) -> bool:
        return any(marker in text for marker in markers)

    # Dense indicators (tutorial/problem-solving content)
    dense_signals = {
        "code_present": preprocessor.has_code_example(text) or mentions_any([
            "code", "function", "method", "class", "variable",
            "algorithm", "implementation", "syntax", "example"
        ]),
        "problem_solving": mentions_any([
            "question", "problem", "solve", "solution", "approach",
            "step 1", "step 2", "algorithm", "complexity"
        ]),
        "concrete_example": mentions_any([
            "for example", "let's say", "consider", "here is",
            "output", "result", "returns"
        ]),
        "mechanism": mentions_any([
            "how it works", "internally", "mechanism", "under the hood",
            "behind the scenes", "what happens"
        ]),
        "structured": "step" in text and word_count > 150,
        "long_form": word_count > 300,
        "has_numbers": bool(re.search(r'\b\d+\b', text))  # Contains numbers
    }

    # Light indicators (overview/motivational content)
    light_signals = {
        "overview": mentions_any([
            "introduction", "overview", "basics", "fundamentals",
            "what is", "definition", "concept"
        ]),
        "motivational": mentions_any([
            "you must know", "important to", "should learn",
            "every developer", "trust me", "secret"
        ]),
        "list_based": mentions_any([
            "top 10", "top 5", "things to", "tips", "must know",
            "reasons why", "benefits of"
        ]),
        "short_form": word_count < 150,
        "cta_present": mentions_any([
            "comment link", "link in bio", "check out", "dm for",
            "go ahead", "share with", "follow me"
        ])
    }

    return dense_signals, light_signals


def _route_content_density(caption: str, transcript: str,
                           dense_signals: Dict[str, bool],
                           light_signals: Dict[str, bool],
                           word_count: int) -> ContentDensity:
    """Apply the threshold rules to precomputed signals and record the outcome in STATISTICS."""
    dense_score = sum(1 for hit in dense_signals.values() if hit)
    light_score = sum(1 for hit in light_signals.values() if hit)

    # Track metrics
    STATISTICS.record_content_density("analyzed")

    # Short text that is mostly a call-to-action is not worth processing.
    if light_signals["cta_present"] and word_count < 100:
        logger.debug("Content density: SKIP (promotional CTA)")
        STATISTICS.record_content_density("skip")
        return ContentDensity.SKIP

    if dense_score >= 4:
        logger.debug(f"Content density: DENSE (score: {dense_score})")
        STATISTICS.record_content_density("dense")
        return ContentDensity.DENSE
    elif dense_score >= 3 and light_score <= 2:
        logger.debug(f"Content density: DENSE (score: {dense_score}, light: {light_score})")
        STATISTICS.record_content_density("dense")
        return ContentDensity.DENSE
    elif light_score >= 3 or word_count < CONFIG.get("LIGHT_CONTENT_MAX_WORDS", 100):
        logger.debug(f"Content density: LIGHT (score: {light_score}, words: {word_count})")
        STATISTICS.record_content_density("light")
        return ContentDensity.LIGHT
    elif dense_score >= 2:
        # Borderline case - let the primary intent break the tie
        intent_scores = detect_content_intent(caption, transcript)
        if intent_scores and intent_scores[0][0] in [ContentIntent.TUTORIAL, ContentIntent.PROBLEM_SOLVING]:
            logger.debug(f"Content density: DENSE (intent: {intent_scores[0][0].value})")
            STATISTICS.record_content_density("dense")
            return ContentDensity.DENSE

    # Default: if unclear, treat as LIGHT (safer)
    logger.debug("Content density: LIGHT (default)")
    STATISTICS.record_content_density("light")
    return ContentDensity.LIGHT


def classify_content_density(caption: str, transcript: str, category: str = "") -> ContentDensity:
    """
    PHASE 3: Classify content density for multi-track routing.

    Args:
        caption: Reel caption.
        transcript: Reel transcript; word counts are taken from it alone.
        category: Accepted for interface compatibility; not used.

    Returns:
    - DENSE: Tutorial/problem-solving with concrete examples → Full pipeline
    - LIGHT: Foundation/motivational content → Reference-only cards  
    - SKIP: Pure CTA/promotional → Don't process

    Side effects: records "analyzed" plus the chosen density in STATISTICS.
    """
    text = f"{caption} {transcript}".lower()
    word_count = len(transcript.split())
    dense_signals, light_signals = _compute_density_signals(text, word_count)
    return _route_content_density(caption, transcript, dense_signals, light_signals, word_count)


def classify_content_density_with_score(caption: str, transcript: str, category: str = "") -> Tuple[ContentDensity, Dict]:
    """
    Classify content density with detailed scoring.

    The reported scores and signal names are the same ones the classification
    itself is based on, so the details explain the returned density.

    Returns:
        (density, details) where details holds ``dense_score``, ``light_score``,
        ``confidence`` (share of fired signals supporting the decision; 0.8 for
        SKIP; 0.5 when no signal fired), ``word_count`` and the fired
        ``dense_signals`` / ``light_signals``.

    Side effects: same STATISTICS updates as one classify_content_density call.
    """
    text = f"{caption} {transcript}".lower()
    word_count = len(transcript.split())

    dense_signals, light_signals = _compute_density_signals(text, word_count)
    density = _route_content_density(caption, transcript, dense_signals, light_signals, word_count)

    dense_score = sum(1 for hit in dense_signals.values() if hit)
    light_score = sum(1 for hit in light_signals.values() if hit)

    # Calculate confidence
    total_signals = dense_score + light_score
    if total_signals == 0:
        confidence = 0.5  # Default confidence
    elif density == ContentDensity.DENSE:
        confidence = dense_score / total_signals
    elif density == ContentDensity.LIGHT:
        confidence = light_score / total_signals
    else:
        confidence = 0.8  # High confidence for SKIP

    return density, {
        "dense_score": dense_score,
        "light_score": light_score,
        "confidence": round(confidence, 2),
        "word_count": word_count,
        "dense_signals": {k: v for k, v in dense_signals.items() if v},
        "light_signals": {k: v for k, v in light_signals.items() if v}
    }

# ============================================================
# TEXT NORMALIZATION
# ============================================================
def normalize_learning_key(text: str) -> str:
    """
    Reduce a caption to a canonical key so that variants of the same caption
    map to the same memory entry.

    Steps, in order: lowercase; drop ``#hashtags``; drop "part N" markers;
    replace punctuation/emoji with spaces; collapse whitespace; drop a small
    set of stop words.

    Returns "" for empty/None input.
    """
    if not text:
        return ""

    stop_words = ['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for']

    cleaned = re.sub(r'#\w+', '', text.lower())                              # hashtags
    cleaned = re.sub(r'\bpart\s*\d+\b', '', cleaned, flags=re.IGNORECASE)    # "Part 1", "Part 2"
    cleaned = re.sub(r'[^\w\s]', ' ', cleaned)                               # punctuation -> space
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()                           # single spaces

    kept = []
    for word in cleaned.split():
        if word in stop_words:
            continue
        kept.append(word)

    return ' '.join(kept)

def create_concept_signature(text: str) -> str:
    """
    Build an order-independent deduplication signature for a concept.

    Starts from normalize_learning_key(text), drops generic question/domain
    words and words of two characters or fewer, sorts the rest alphabetically
    and joins the first five with underscores. Returns 'empty' when nothing
    is left.
    """
    generic_words = {
        'what', 'why', 'how', 'when', 'where', 'which',
        'java', 'spring', 'microservice', 'system', 'design',
        'interview', 'question', 'answer', 'explain', 'difference'
    }

    ranked = sorted(
        word
        for word in normalize_learning_key(text).split()
        if len(word) > 2 and word not in generic_words
    )

    top_words = ranked[:5]
    if not top_words:
        return 'empty'
    return '_'.join(top_words)

# ============================================================
# TECHNICAL SCORING
# ============================================================
@lru_cache(maxsize=10000)
def _technical_score_cached(text: str) -> int:
    """Score a non-empty string; memoized because the same texts are scored repeatedly."""
    text_lower = text.lower()
    score = 0

    # Keyword matching (base score), at most 3 points per category
    for category, keywords in TECHNICAL_KEYWORDS.items():
        matches = sum(1 for kw in keywords if kw in text_lower)
        score += min(matches, 3)

    # Foundation-specific keyword bonus, up to +4
    foundation_matches = sum(1 for kw in FOUNDATION_ENHANCEMENT_KEYWORDS if kw in text_lower)
    score += min(foundation_matches, 4)

    # Mechanism marker bonus: 2 per marker, up to +4
    mechanism_count = sum(1 for marker in MECHANISM_MARKERS if marker in text_lower)
    if mechanism_count > 0:
        score += min(mechanism_count * 2, 4)

    # Code presence bonus
    if preprocessor.has_code_example(text):
        score += 3

    # Length normalization
    word_count = len(text.split())
    if word_count < 50:
        score = int(score * 0.7)
    elif word_count > 200:
        score = int(score * 1.2)

    # Clamp: any non-empty text scores at least 1
    return min(max(score, 1), 10)


def technical_score(text: str) -> int:
    """
    Calculate technical density score.

    Returns 0 for empty input and a value in 1..10 otherwise.

    Non-string input (e.g. a dict or list) is serialized with ``json.dumps``
    before scoring. The normalization happens *before* the cache lookup: the
    LRU cache hashes its arguments, so unhashable values must never reach it.

    ``technical_score.cache_info()`` and ``technical_score.cache_clear()``
    remain available and refer to the underlying LRU cache (maxsize 10000).
    """
    if not text:
        return 0

    if not isinstance(text, str):
        text = json.dumps(text, ensure_ascii=False)

    return _technical_score_cached(text)


# Keep the lru_cache introspection API reachable from the public name.
technical_score.cache_info = _technical_score_cached.cache_info
technical_score.cache_clear = _technical_score_cached.cache_clear

def technical_score_detailed(text: str) -> Dict:
    """
    Calculate technical score with breakdown.

    Non-string input is serialized with ``json.dumps`` first, mirroring
    technical_score.

    Returns:
        Dict with the same four keys for every input:
            - ``total``: 0 for empty input, otherwise clamped to 1..10
            - ``breakdown``: points per contributing source (keyword category,
              "foundation", "mechanism", "code"); sources with 0 points are omitted
            - ``word_count``: whitespace-separated words in the text
            - ``length_factor``: 0.7 (< 50 words), 1.2 (> 200 words) or 1.0;
              1.0 for empty input, where no scaling is applied
    """
    if not text:
        return {"total": 0, "breakdown": {}, "word_count": 0, "length_factor": 1.0}

    if not isinstance(text, str):
        text = json.dumps(text, ensure_ascii=False)

    text_lower = text.lower()
    breakdown = {}

    # Keyword matching, at most 3 points per category
    for category, keywords in TECHNICAL_KEYWORDS.items():
        matches = sum(1 for kw in keywords if kw in text_lower)
        if matches > 0:
            breakdown[category] = min(matches, 3)

    # Foundation keywords, up to 4 points
    foundation_matches = sum(1 for kw in FOUNDATION_ENHANCEMENT_KEYWORDS if kw in text_lower)
    if foundation_matches > 0:
        breakdown["foundation"] = min(foundation_matches, 4)

    # Mechanism markers: 2 per marker, up to 4 points
    mechanism_count = sum(1 for marker in MECHANISM_MARKERS if marker in text_lower)
    if mechanism_count > 0:
        breakdown["mechanism"] = min(mechanism_count * 2, 4)

    # Code presence
    if preprocessor.has_code_example(text):
        breakdown["code"] = 3

    # Length adjustment
    word_count = len(text.split())
    length_factor = 1.0
    if word_count < 50:
        length_factor = 0.7
    elif word_count > 200:
        length_factor = 1.2

    total = int(sum(breakdown.values()) * length_factor)
    total = min(max(total, 1), 10)

    return {
        "total": total,
        "breakdown": breakdown,
        "word_count": word_count,
        "length_factor": length_factor
    }

# ============================================================
# CONTENT QUALITY ASSESSMENT
# ============================================================
def assess_content_quality(caption: str, transcript: str, category: str = "") -> Dict:
    """
    Run every classifier on a reel and fold the results into one 0-100 score.

    Weighting: technical depth up to 40, content density up to 30 (DENSE 30,
    LIGHT 15, SKIP 0), primary intent up to 20 (scaled by its confidence),
    transcript length up to 10.

    Returns:
        Dict with the rounded ``quality_score`` and the individual
        classification results used to compute it.
    """
    # Classifier order is kept fixed (the density classifier records statistics).
    topic_class, topic_confidence = classify_topic_with_confidence(caption, category, transcript)
    density, density_details = classify_content_density_with_score(caption, transcript, category)
    intent_scores = detect_content_intent(caption, transcript)
    tech_score_details = technical_score_detailed(f"{caption} {transcript}")

    # Technical depth contribution (40%)
    tech_points = (tech_score_details["total"] / 10) * 40

    # Content density contribution (30%); SKIP gets 0
    if density == ContentDensity.DENSE:
        density_points = 30
    elif density == ContentDensity.LIGHT:
        density_points = 15
    else:
        density_points = 0

    # Intent contribution (20%)
    intent_points = 0
    if intent_scores:
        primary_intent, intent_confidence = intent_scores[0]
        if primary_intent in [ContentIntent.TUTORIAL, ContentIntent.PROBLEM_SOLVING]:
            intent_weight = 20
        elif primary_intent in [ContentIntent.BEST_PRACTICE, ContentIntent.COMPARISON]:
            intent_weight = 15
        else:
            intent_weight = 10
        intent_points = intent_weight * intent_confidence

    # Transcript length contribution (10%)
    word_count = len(transcript.split())
    if word_count > 300:
        length_points = 10
    elif word_count > 150:
        length_points = 7
    elif word_count > 80:
        length_points = 5
    else:
        length_points = 2

    # Summed in the same order as the contributions above, then capped at 100
    quality_score = tech_points + density_points + intent_points + length_points
    quality_score = min(quality_score, 100)

    return {
        "quality_score": round(quality_score, 1),
        "topic_class": topic_class,
        "topic_confidence": topic_confidence,
        "content_density": density.value,
        "density_details": density_details,
        "primary_intent": intent_scores[0][0].value if intent_scores else "unknown",
        "intent_confidence": intent_scores[0][1] if intent_scores else 0.0,
        "technical_score": tech_score_details["total"],
        "technical_breakdown": tech_score_details["breakdown"],
        "transcript_word_count": word_count,
        "has_code": preprocessor.has_code_example(transcript),
        "assessment_timestamp": time.time()
    }

# ============================================================
# CONTENT FILTERING
# ============================================================
def should_process_reel(reel: Dict) -> Tuple[bool, str, Dict]:
    """
    PHASE 1: Pre-filter to identify reels that are worth processing.

    Args:
        reel: Mapping with optional "transcript", "caption", "category" and
            "category_confidence" entries. Missing or non-string text fields
            (None, NaN from a DataFrame row, ...) are treated as empty text.
            ``category_confidence`` may be an int, a float or a numeric string
            ("85", "85.5"); anything unparsable counts as 0.

    Returns:
        (should_process, reason, quality_assessment). ``reason`` is "" when the
        reel passes, otherwise the value of the matching rejection reason.
    """
    def _text_field(key: str) -> str:
        value = reel.get(key, "")
        return value.strip() if isinstance(value, str) else ""

    transcript = _text_field("transcript")
    caption = _text_field("caption")

    # Truncate toward zero like int() does, but also accept decimal strings,
    # which int() alone rejects.
    raw_confidence = reel.get("category_confidence", 0)
    try:
        category_confidence = int(float(raw_confidence)) if raw_confidence else 0
    except (TypeError, ValueError, OverflowError):
        category_confidence = 0

    # Quick quality assessment
    quality = assess_content_quality(caption, transcript, reel.get("category", ""))

    # NOTE(review): this function uses RejectionReason while the schema cell logs
    # an enum named RejectionType - confirm RejectionReason is defined.

    # Filter 1: Content density SKIP
    if quality["content_density"] == "skip":
        return False, RejectionReason.PROMOTIONAL_CTA.value, quality

    # Filter 2: Too short
    if quality["transcript_word_count"] < CONFIG.get("MIN_TRANSCRIPT_LENGTH", 80):
        return False, RejectionReason.TOO_SHORT.value, quality

    # Filter 3: Low category confidence
    if category_confidence < CONFIG.get("MIN_CATEGORY_CONFIDENCE", 70):
        return False, RejectionReason.LOW_CONFIDENCE_CATEGORY.value, quality

    # Filter 4: Poor quality score
    if quality["quality_score"] < 30:
        return False, RejectionReason.LOW_TECHNICAL_SIGNAL.value, quality

    # Filter 5: Pure motivational with no technical content
    if (quality["primary_intent"] == "motivational" and
        quality["technical_score"] < 3 and
        not quality["has_code"]):
        return False, RejectionReason.MOTIVATIONAL_CONTENT.value, quality

    # All checks passed
    return True, "", quality

# ============================================================
# INITIALIZATION LOGGING
# ============================================================
# Summarize the classification configuration once when the cell runs, so the
# log records which keyword categories, topic classes and intents were active.
logger.info("=" * 60)
logger.info("CONTENT PROCESSING & CLASSIFICATION INITIALIZED")
logger.info("=" * 60)
logger.info(f"Technical keyword categories: {list(TECHNICAL_KEYWORDS.keys())}")
logger.info(f"Mechanism markers: {len(MECHANISM_MARKERS)} markers")
logger.info(f"Topic classes: {list(TOPIC_CLASSES.keys())}")
logger.info(f"Content intents: {[i.value for i in ContentIntent]}")
logger.info("=" * 60)

In [None]:
# ============================================================
# CELL 4: MEMORY & LEARNING SYSTEMS (COMPLETE)
# ============================================================

import hashlib
import json
import pickle
import re
import time
from collections import Counter, defaultdict, deque
from datetime import datetime, timedelta, timezone
from pathlib import Path
from threading import Lock
from typing import Dict, List, Set, Optional, Tuple, Any

logger = logging.getLogger("anki-pipeline.memory")

# ============================================================
# BLOOM FILTER ENHANCEMENTS
# ============================================================
class EnhancedBloomFilter:
    """
    Lock-guarded wrapper around ``BloomFilter`` that also keeps usage statistics.

    When ``BLOOM_AVAILABLE`` is false the instance runs in fallback mode:
    ``add`` is a no-op, membership tests always return ``False`` and
    ``get_stats`` reports zeros. All bookkeeping attributes exist in both
    modes so callers can read them unconditionally.
    """

    def __init__(self, max_elements: int = 2_000_000, error_rate: float = 0.001):
        """
        Args:
            max_elements: Capacity passed to ``BloomFilter`` and used as the
                denominator of the occupancy estimate.
            error_rate: Target false-positive rate passed to ``BloomFilter``.
        """
        # Bookkeeping is initialised before the availability check so that the
        # fallback instance has the same attributes as a working one.
        self.max_elements = max_elements
        self.error_rate = error_rate
        self.lock = Lock()

        # Statistics
        self.checks = 0
        self.hits = 0
        self.false_positives_suspected = 0

        # Capacity tracking
        self.estimated_count = 0
        self.last_rebuild = time.time()

        self.filter = None
        self.available = bool(BLOOM_AVAILABLE)

        if not self.available:
            logger.warning("Bloom filter not available - using fallback")
            return

        self.filter = BloomFilter(max_elements=max_elements, error_rate=error_rate)

        logger.info(f"Bloom filter initialized: max={max_elements:,}, error_rate={error_rate}")

    def _usable(self) -> bool:
        """True when a backing filter exists.

        Uses an identity check rather than truthiness so the result does not
        depend on how the wrapped filter object evaluates in a boolean context.
        """
        return self.available and self.filter is not None

    def add(self, item: str):
        """Add item to bloom filter and bump the estimated element count."""
        if not self._usable():
            return

        with self.lock:
            self.filter.add(item)
            self.estimated_count += 1

    def __contains__(self, item: str) -> bool:
        """Check if item might be in bloom filter (always False in fallback mode)."""
        if not self._usable():
            return False

        with self.lock:
            self.checks += 1
            result = item in self.filter
            if result:
                self.hits += 1
            return result

    def get_stats(self) -> Dict:
        """
        Get bloom filter statistics.

        Returns:
            Dict with counters, derived rates and a ``needs_rebuild`` flag.
            ``needs_rebuild`` is set when occupancy exceeds 80%, the suspected
            false-positive rate exceeds 1%, or the last rebuild is older than
            30 days.
        """
        if not self.available:
            return {
                "available": False,
                "estimated_count": 0,
                "checks": 0,
                "hit_rate": 0.0,
                "false_positive_rate": 0.0,
                "occupancy": 0.0,
                "needs_rebuild": False
            }

        with self.lock:
            hit_rate = self.hits / self.checks if self.checks > 0 else 0.0
            fp_rate = self.false_positives_suspected / self.checks if self.checks > 0 else 0.0
            # Guard against a zero capacity so statistics never raise.
            occupancy = self.estimated_count / self.max_elements if self.max_elements else 0.0
            age_seconds = time.time() - self.last_rebuild

            needs_rebuild = (
                occupancy > 0.8 or  # High occupancy
                fp_rate > 0.01 or   # High false positive rate
                (age_seconds > 30 * 24 * 3600)  # 30 days old
            )

            return {
                "available": True,
                "estimated_count": self.estimated_count,
                "max_elements": self.max_elements,
                "checks": self.checks,
                "hits": self.hits,
                "hit_rate": round(hit_rate, 4),
                "false_positive_rate": round(fp_rate, 4),
                "occupancy": round(occupancy, 4),
                "needs_rebuild": needs_rebuild,
                "days_since_rebuild": round(age_seconds / (24 * 3600), 1)
            }

    def record_false_positive(self):
        """Record suspected false positive (no-op in fallback mode)."""
        if not self.available:
            return

        with self.lock:
            self.false_positives_suspected += 1

    def rebuild_if_needed(self) -> bool:
        """
        Reset rebuild bookkeeping when ``get_stats`` reports ``needs_rebuild``.

        Returns:
            True if the bookkeeping was reset, False otherwise.

        NOTE: the underlying filter is NOT recreated here; only the statistics
        are adjusted (count scaled by 0.9, timer and false-positive counter
        reset). A real rebuild would need the original items, which this class
        does not store.
        """
        stats = self.get_stats()
        if not stats["needs_rebuild"]:
            return False

        logger.warning(f"Bloom filter needs rebuild: occupancy={stats['occupancy']:.1%}, "
                      f"FP rate={stats['false_positive_rate']:.3%}")

        # In production, we would create a new filter and migrate
        # For now, just reset statistics
        with self.lock:
            self.estimated_count = int(self.estimated_count * 0.9)  # Estimate
            self.last_rebuild = time.time()
            self.false_positives_suspected = 0

        return True

# ============================================================
# HYBRID DUPLICATE DETECTOR (COMPLETE)
# ============================================================
class HybridDuplicateDetector:
    """
    Hybrid duplicate detection for generated cards.

    - An in-memory set of MD5 fingerprints is the source of truth
    - A bloom filter provides a probabilistic check and statistics
    - ``semantic:``-prefixed fingerprints flag cards sharing the same key terms
    - Fingerprints are persisted in batches via ``save_if_dirty``
    """

    # Words ignored when selecting key terms for the semantic fingerprint.
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at',
        'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were',
    })

    def __init__(self, fingerprints_file: Path, bloom_file: Path):
        """
        Args:
            fingerprints_file: File read via ``atomic_read`` / written via
                ``atomic_write`` holding the exact fingerprint set.
            bloom_file: Pickle file holding the serialized bloom filter.
        """
        self.fingerprints_file = fingerprints_file
        self.bloom_file = bloom_file
        self.lock = Lock()

        # Load fingerprints
        self.fingerprints: Set[str] = self._load_fingerprints()

        # Initialize enhanced bloom filter
        self.bloom = EnhancedBloomFilter()
        self._load_bloom()

        # Batch persistence
        self._dirty = False
        self._dirty_count = 0
        self._last_save = time.time()

        # Statistics
        self.duplicates_filtered = 0
        self.total_checks = 0

        logger.info(f"Duplicate detector initialized: {len(self.fingerprints):,} fingerprints loaded")

    def _load_fingerprints(self) -> Set[str]:
        """
        Load fingerprints from disk.

        Accepts either a bare list or a dict with a ``"fingerprints"`` list.
        Any other shape, or any read error, yields an empty set.
        """
        if not self.fingerprints_file.exists():
            return set()

        try:
            data = atomic_read(self.fingerprints_file, default=[])
            if isinstance(data, list):
                return set(data)
            elif isinstance(data, dict) and "fingerprints" in data:
                return set(data["fingerprints"])
            else:
                logger.warning(f"Unexpected fingerprint format in {self.fingerprints_file}")
                return set()
        except Exception as e:
            logger.error(f"Error loading fingerprints: {e}")
            return set()

    def _load_bloom(self):
        """Load bloom filter from disk; failures are logged and leave the fresh filter in place."""
        if not self.bloom.available or not self.bloom_file.exists():
            return

        try:
            # SECURITY: pickle.load executes arbitrary code from the file. Only
            # safe while bloom_file is written exclusively by this pipeline.
            with open(self.bloom_file, "rb") as f:
                loaded = pickle.load(f)
                if isinstance(loaded, BloomFilter):
                    self.bloom.filter = loaded
                    # Estimate count (BloomFilter doesn't track exact count)
                    self.bloom.estimated_count = len(self.fingerprints)
                    logger.info(f"Bloom filter loaded from disk")
        except Exception as e:
            logger.warning(f"Error loading bloom filter: {e}")

    def _normalize_text(self, text: Any) -> str:
        """
        Normalize text for fingerprinting.

        Non-string input is JSON-encoded (falling back to ``str``), then the
        text is lowercased, punctuation is replaced by spaces and runs of
        whitespace are collapsed.
        """
        if not isinstance(text, str):
            try:
                text = json.dumps(text, ensure_ascii=False)
            except Exception:
                # Best-effort fallback for values json cannot encode; unlike a
                # bare except this lets KeyboardInterrupt/SystemExit propagate.
                text = str(text)

        # Remove punctuation and normalize
        text = re.sub(r'[^\w\s]', ' ', text.lower())
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    @staticmethod
    def _tradeoff_items(card: Dict) -> List[Dict]:
        """Return the card's tradeoff entries, dropping anything that is not a dict."""
        raw = card.get("tradeoffs")
        if not isinstance(raw, (list, tuple)):
            return []
        return [item for item in raw if isinstance(item, dict)]

    def _create_fingerprint(self, card: Dict) -> str:
        """
        Create deterministic fingerprint for a card.

        The fingerprint is an MD5 hex digest of a type-specific string built
        from normalized card text, so punctuation and case differences do not
        change it.
        """
        card_type = card.get("type", "basic")

        if card_type == "basic":
            front = self._normalize_text(card.get("front", ""))
            back = self._normalize_text(card.get("back", ""))

            # Key phrases: leading words only, so long tails do not matter
            front_key = ' '.join(front.split()[:20])  # First 20 words
            back_key = ' '.join(back.split()[:50])    # First 50 words

            content = f"basic:{front_key}:{back_key}"

        elif card_type == "cloze":
            cloze = card.get("cloze", "")
            # Remove cloze markers for content
            clean = re.sub(r'\{\{c\d+::(.*?)\}\}', r'\1', cloze)
            normalized = self._normalize_text(clean)

            # Take key part
            content_key = ' '.join(normalized.split()[:30])
            content = f"cloze:{content_key}"

        elif card_type == "tradeoff":
            front = self._normalize_text(card.get("front", ""))

            # Extract approach names; a missing/None/malformed list yields none
            approaches = []
            for t in self._tradeoff_items(card)[:3]:  # First 3 approaches
                approach = self._normalize_text(t.get("approach", ""))
                if approach:
                    approaches.append(approach[:50])  # First 50 chars

            content = f"tradeoff:{front}:{':'.join(approaches)}"

        else:
            # default=str only kicks in for values json cannot encode, so the
            # output for ordinary cards is unchanged.
            content = f"unknown:{json.dumps(card, sort_keys=True, default=str)}"

        # Create MD5 hash (used as a content key, not for security)
        return hashlib.md5(content.encode()).hexdigest()

    def _create_semantic_fingerprint(self, card: Dict) -> Optional[str]:
        """
        Create semantic fingerprint for similarity detection.

        Hashes the card type plus the (sorted) ten most frequent content words
        longer than three characters, so reordered or re-punctuated text maps
        to the same value.

        Returns:
            MD5 hex digest, or None for unknown card types, cards without any
            content words, or on any internal error.
        """
        try:
            card_type = card.get("type", "basic")

            if card_type == "basic":
                text = f"{card.get('front', '')} {card.get('back', '')}"
            elif card_type == "cloze":
                cloze = card.get("cloze", "")
                # Extract content from cloze
                text = re.sub(r'\{\{c\d+::(.*?)\}\}', r'\1', cloze)
            elif card_type == "tradeoff":
                text = card.get("front", "")
                for t in self._tradeoff_items(card):
                    text += f" {t.get('approach', '')}"
            else:
                return None

            # Normalize
            normalized = self._normalize_text(text)

            # Keep words that are neither stop words nor very short
            content_words = [
                w for w in normalized.split()
                if w not in self._STOP_WORDS and len(w) > 3
            ]

            # Take top 10 content words
            word_counter = Counter(content_words)
            top_words = [word for word, _ in word_counter.most_common(10)]

            if not top_words:
                return None

            # Sort for consistency
            top_words.sort()
            semantic_key = f"{card_type}:" + ":".join(top_words)

            return hashlib.md5(semantic_key.encode()).hexdigest()

        except Exception as e:
            logger.debug(f"Semantic fingerprint failed: {e}")
            return None

    def is_duplicate(self, card: Dict) -> Tuple[bool, Optional[str]]:
        """
        Check if card is a duplicate.

        Returns:
            ``(is_duplicate, match_kind)`` where ``match_kind`` is ``"exact"``,
            ``"bloom_confirmed"``, ``"semantic"`` or ``None``.

        The exact set is consulted first, so the bloom check that follows only
        confirms a hit if the fingerprint was added in between; otherwise a
        bloom hit is recorded as a suspected false positive.
        """
        # Create fingerprints
        exact_fp = self._create_fingerprint(card)
        semantic_fp = self._create_semantic_fingerprint(card)

        # Track metrics
        PROCESSING_METRICS.increment("duplicate_checks")

        # First check: Exact fingerprint
        with self.lock:
            # Counted under the lock, like duplicates_filtered, so concurrent
            # callers do not lose increments.
            self.total_checks += 1
            if exact_fp in self.fingerprints:
                self.duplicates_filtered += 1
                PROCESSING_METRICS.increment("exact_duplicates_found")
                return True, "exact"

        # Second check: Bloom filter (fast probabilistic)
        if self.bloom.available and exact_fp in self.bloom:
            # Might be a false positive, check exact set again
            with self.lock:
                if exact_fp in self.fingerprints:
                    self.duplicates_filtered += 1
                    PROCESSING_METRICS.increment("bloom_confirmed_duplicates")
                    return True, "bloom_confirmed"
                else:
                    # False positive suspected
                    self.bloom.record_false_positive()
                    PROCESSING_METRICS.increment("bloom_false_positives")

        # Third check: Semantic similarity (if enabled)
        if semantic_fp and CONFIG.get("ENABLE_SEMANTIC_DEDUP", True):
            with self.lock:
                # Check if we've seen similar content before
                semantic_key = f"semantic:{semantic_fp}"
                if semantic_key in self.fingerprints:
                    self.duplicates_filtered += 1
                    PROCESSING_METRICS.increment("semantic_duplicates_found")
                    return True, "semantic"

        return False, None

    def add_fingerprint(self, fingerprint: str, semantic_fp: Optional[str] = None):
        """
        Add fingerprint to tracking systems.

        Args:
            fingerprint: Exact fingerprint as produced by ``_create_fingerprint``.
            semantic_fp: Optional semantic fingerprint; stored with a
                ``semantic:`` prefix in the same set and bloom filter.
        """
        with self.lock:
            self.fingerprints.add(fingerprint)
            self.bloom.add(fingerprint)

            if semantic_fp:
                semantic_key = f"semantic:{semantic_fp}"
                self.fingerprints.add(semantic_key)
                self.bloom.add(semantic_key)

            self._dirty = True
            self._dirty_count += 1

    def save_if_dirty(self, force: bool = False):
        """
        Save fingerprints if enough changes accumulated, or unconditionally when forced.

        Without ``force`` nothing is written unless the detector is dirty and
        at least ``CONFIG["FINGERPRINT_BATCH_SIZE"]`` (default 100) additions
        are pending. Errors are logged, not raised.
        """
        with self.lock:
            if not force:
                if not self._dirty:
                    return
                if self._dirty_count < CONFIG.get("FINGERPRINT_BATCH_SIZE", 100):
                    return

            try:
                # Save fingerprints
                data = {
                    "fingerprints": list(self.fingerprints),
                    "count": len(self.fingerprints),
                    "duplicates_filtered": self.duplicates_filtered,
                    "total_checks": self.total_checks,
                    "last_updated": time.time(),
                    "version": "v2"
                }
                atomic_write(data, self.fingerprints_file)

                # Save bloom filter if available. Written to a sibling temp
                # file and renamed so an interrupted write cannot leave a
                # truncated pickle behind.
                if self.bloom.available and self.bloom.filter is not None:
                    bloom_path = Path(self.bloom_file)
                    tmp_path = bloom_path.with_name(bloom_path.name + ".tmp")
                    with open(tmp_path, "wb") as f:
                        pickle.dump(self.bloom.filter, f)
                    tmp_path.replace(bloom_path)

                # Rebuild bloom filter if needed
                if self.bloom.rebuild_if_needed():
                    logger.info("Bloom filter rebuilt")

                self._dirty = False
                self._dirty_count = 0
                self._last_save = time.time()

                logger.debug(f"Saved {len(self.fingerprints):,} fingerprints")

            except Exception as e:
                logger.error(f"Error saving fingerprints: {e}")

    def get_stats(self) -> Dict:
        """Get detector statistics, including the nested bloom filter stats."""
        bloom_stats = self.bloom.get_stats() if self.bloom is not None else {}

        return {
            "fingerprints_count": len(self.fingerprints),
            "duplicates_filtered": self.duplicates_filtered,
            "total_checks": self.total_checks,
            "duplicate_rate": self.duplicates_filtered / self.total_checks if self.total_checks > 0 else 0,
            "bloom_stats": bloom_stats,
            "last_save": self._last_save,
            "dirty": self._dirty,
            "dirty_count": self._dirty_count
        }

    def cleanup_old_fingerprints(self, max_age_days: int = 90):
        """Placeholder: only logs that cleanup is not implemented; ``max_age_days`` is unused."""
        # Note: Bloom filters don't support deletion
        # In production, we would periodically create a new filter
        logger.info("Fingerprint cleanup not implemented for bloom filters")

# ============================================================
# TERMINAL REJECTION TRACKER (COMPLETE)
# ============================================================
class TerminalRejectionTracker:
    """
    Track reels that are terminally rejected by logic.

    Entries live in ``terminal_reels`` keyed by reel id and are persisted with
    ``atomic_write``. All access to the mapping goes through ``self.lock``,
    which is a plain (non-reentrant) ``Lock``: methods that already hold it
    must call the ``*_locked`` helpers instead of the public methods.
    """

    def __init__(self, file_path: Path):
        """
        Args:
            file_path: File the rejection mapping is loaded from and saved to.
        """
        self.file_path = file_path
        self.terminal_reels: Dict[str, Dict] = self._load()
        self.lock = Lock()
        self._dirty = False

        logger.info(f"Terminal rejection tracker: {len(self.terminal_reels):,} reels tracked")

    def _load(self) -> Dict[str, Dict]:
        """
        Load terminal rejections from disk.

        Accepts the wrapped format (dict with a ``"terminal_reels"`` key) or a
        bare mapping (old format). Anything else, or a read error, yields ``{}``.
        """
        if not self.file_path.exists():
            return {}

        try:
            data = atomic_read(self.file_path, default={})
            if isinstance(data, dict) and "terminal_reels" in data:
                return data["terminal_reels"]
            elif isinstance(data, dict):
                return data  # Old format
            else:
                logger.warning(f"Unexpected terminal rejections format")
                return {}
        except Exception as e:
            logger.error(f"Error loading terminal rejections: {e}")
            return {}

    def mark_terminal(self, reel_id: str, reason: str = "",
                     stage: str = "stage1", rejection_type: str = "STRUCTURAL",
                     details: Dict = None):
        """
        Mark a reel as terminally rejected (overwrites any existing entry).

        Args:
            reel_id: Identifier of the reel.
            reason: Free-form rejection reason.
            stage: Pipeline stage label stored with the entry.
            rejection_type: Rejection category stored with the entry.
            details: Optional extra data; stored as ``{}`` when omitted.
        """
        with self.lock:
            # One clock reading feeds both fields so they always agree.
            now = time.time()
            # Same "naive ISO + Z" text that datetime.utcnow() produced, built
            # from an aware datetime because utcnow() is deprecated.
            iso_now = datetime.fromtimestamp(now, tz=timezone.utc).replace(tzinfo=None).isoformat() + "Z"

            self.terminal_reels[reel_id] = {
                "reason": reason,
                "stage": stage,
                "rejection_type": rejection_type,
                "details": details or {},
                "timestamp": now,
                "iso_timestamp": iso_now
            }
            self._dirty = True

            # Track metrics
            ROUTING_METRICS.increment(f"terminal_rejection_{rejection_type}")
            ROUTING_METRICS.increment(f"terminal_rejection_stage_{stage}")

            logger.debug(f"Reel {reel_id} marked terminal: {reason}")

    def is_terminal(self, reel_id: str) -> bool:
        """Check if reel is terminally rejected"""
        with self.lock:
            return reel_id in self.terminal_reels

    def get_rejection_details(self, reel_id: str) -> Optional[Dict]:
        """Get the stored rejection entry for a reel, or None if it is not tracked."""
        with self.lock:
            return self.terminal_reels.get(reel_id)

    def _compute_stats_locked(self) -> Dict:
        """Build the statistics dict. Caller MUST already hold ``self.lock``."""
        reasons = defaultdict(int)
        stages = defaultdict(int)
        types = defaultdict(int)

        now = time.time()
        recent_count = 0
        old_count = 0

        for entry in self.terminal_reels.values():
            reasons[entry.get("reason", "unknown")] += 1
            stages[entry.get("stage", "unknown")] += 1
            types[entry.get("rejection_type", "unknown")] += 1

            # Age analysis; entries without a timestamp count as "just now"
            age_hours = (now - entry.get("timestamp", now)) / 3600
            if age_hours < 24:
                recent_count += 1
            elif age_hours > 30 * 24:  # 30 days
                old_count += 1

        return {
            "total": len(self.terminal_reels),
            "recent_24h": recent_count,
            "old_30d": old_count,
            "by_reason": dict(sorted(reasons.items(), key=lambda x: x[1], reverse=True)[:10]),
            "by_stage": dict(stages),
            "by_type": dict(types)
        }

    def get_stats(self) -> Dict:
        """
        Get statistics about terminal rejections.

        Returns:
            Dict with the total, counts younger than 24h / older than 30 days,
            the ten most frequent reasons, and counts by stage and type.
        """
        with self.lock:
            return self._compute_stats_locked()

    def save(self):
        """Save terminal rejections to disk if anything changed; errors are logged, not raised."""
        if not self._dirty:
            return

        with self.lock:
            try:
                data = {
                    "terminal_reels": self.terminal_reels,
                    "count": len(self.terminal_reels),
                    # Must not call get_stats() here: it would try to take the
                    # non-reentrant lock this method already holds and hang.
                    "stats": self._compute_stats_locked(),
                    "last_updated": time.time(),
                    "version": "v2"
                }
                atomic_write(data, self.file_path)
                self._dirty = False
                logger.debug(f"Saved {len(self.terminal_reels):,} terminal rejections")
            except Exception as e:
                logger.error(f"Failed to save terminal rejections: {e}")

    def cleanup_old_rejections(self, max_age_days: int = 90):
        """
        Cleanup rejections older than max_age_days.

        Entries without a timestamp are treated as timestamp 0 and therefore
        removed.
        """
        with self.lock:
            now = time.time()
            max_age_seconds = max_age_days * 24 * 3600

            to_remove = [
                reel_id
                for reel_id, entry in self.terminal_reels.items()
                if now - entry.get("timestamp", 0) > max_age_seconds
            ]

            for reel_id in to_remove:
                del self.terminal_reels[reel_id]

            if to_remove:
                self._dirty = True
                logger.info(f"Cleaned up {len(to_remove)} old terminal rejections (> {max_age_days} days)")

    def export_for_analysis(self) -> List[Dict]:
        """Export rejections as a list of flat dicts, each with a ``reel_id`` plus the entry fields."""
        with self.lock:
            return [
                {
                    "reel_id": reel_id,
                    **details
                }
                for reel_id, details in self.terminal_reels.items()
            ]

# ============================================================
# REJECTION MEMORY WITH LEARNING (COMPLETE)
# ============================================================
class RejectionMemory:
    """
    Learning system that remembers rejection patterns and successful strategies.
    Key format: concept::category::topic_class
    """
    
    def __init__(self, memory_file: Path):
        self.memory_file = memory_file
        self.memory = self._load()
        self.lock = Lock()
        self._dirty = False
        
        # Learning parameters
        self.min_attempts_for_learning = 3
        self.success_decay_factor = 0.95  # Decay old successes
        self.rejection_decay_factor = 0.9  # Decay old rejections
        
        logger.info(f"Rejection memory: {len(self.memory):,} concepts tracked")
    
    def _load(self) -> Dict:
        """Load rejection memory from disk"""
        if not self.memory_file.exists():
            return {}
        
        try:
            data = atomic_read(self.memory_file, default={})
            if isinstance(data, dict):
                return data
            else:
                logger.warning(f"Unexpected memory format")
                return {}
        except Exception as e:
            logger.error(f"Error loading rejection memory: {e}")
            return {}
    
    def _make_key(self, concept: str, category: str = "", topic_class: str = "") -> str:
        """Create memory key"""
        return f"{concept}::{category}::{topic_class}"
    
    def _make_canonical_key(self, concept: str, category: str = "") -> str:
        """Create canonical key (without topic class)"""
        return f"{concept}::{category or 'unknown'}"
    
    def _get_or_create_entry(self, concept: str, category: str = "", topic_class: str = "") -> Dict:
        """Get or create memory entry"""
        key = self._make_key(concept, category, topic_class)
        
        if key not in self.memory:
            self.memory[key] = {
                "concept": concept,
                "category": category,
                "topic_class": topic_class,
                "rejections": [],
                "successful_strategies": [],  # Now a list to track multiple successes
                "first_seen_ts": time.time(),
                "first_success_ts": None,
                "last_success_ts": None,
                "attempts_until_first_success": None,
                "total_attempts": 0,
                "success_count": 0,
                "rejection_count": 0,
                "success_by_strategy": defaultdict(int),
                "rejection_by_reason": defaultdict(int),
                "topic_class_history": [],  # Track topic class changes
                "confidence_history": [],  # Track confidence over time
                "metadata": {}
            }
            self._dirty = True
        
        return self.memory[key]
    
    def record_rejection(self, concept: str, score: int, reason: str,
                        category: str = "", topic_class: str = "",
                        details: Dict = None):
        """
        Record a rejection with details.
        """
        with self.lock:
            entry = self._get_or_create_entry(concept, category, topic_class)
            
            rejection_record = {
                "score": score,
                "reason": reason,
                "details": details or {},
                "timestamp": time.time(),
                "topic_class": topic_class
            }
            
            entry["rejections"].append(rejection_record)
            entry["rejection_count"] += 1
            entry["total_attempts"] += 1
            entry["rejection_by_reason"][reason] += 1
            
            # Track topic class history
            if topic_class and (not entry["topic_class_history"] or 
                              entry["topic_class_history"][-1]["topic_class"] != topic_class):
                entry["topic_class_history"].append({
                    "topic_class": topic_class,
                    "timestamp": time.time()
                })
            
            self._dirty = True
            
            # Apply decay to old records
            self._apply_decay(entry)
            
            # Log for debugging
            if entry["rejection_count"] % 10 == 0:
                logger.debug(f"Concept '{concept}' has {entry['rejection_count']} rejections")
    
    def record_success(self, concept: str, strategy: str, confidence: float,
                      category: str = "", topic_class: str = "",
                      delta_score: int = 0, details: Dict = None):
        """
        Record a successful processing.
        """
        with self.lock:
            entry = self._get_or_create_entry(concept, category, topic_class)
            
            success_record = {
                "strategy": strategy,
                "confidence": confidence,
                "delta_score": delta_score,
                "topic_class": topic_class,
                "details": details or {},
                "timestamp": time.time()
            }
            
            entry["successful_strategies"].append(success_record)
            entry["success_count"] += 1
            entry["total_attempts"] += 1
            entry["last_success_ts"] = time.time()
            entry["success_by_strategy"][strategy] += 1
            
            # Track confidence history
            entry["confidence_history"].append({
                "confidence": confidence,
                "strategy": strategy,
                "timestamp": time.time()
            })
            
            # Keep only recent confidence history
            if len(entry["confidence_history"]) > 20:
                entry["confidence_history"] = entry["confidence_history"][-20:]
            
            # Record first success
            if entry["first_success_ts"] is None:
                entry["first_success_ts"] = time.time()
                entry["attempts_until_first_success"] = entry["rejection_count"]
                logger.info(f"Concept '{concept}' succeeded after {entry['attempts_until_first_success']} attempts")
            
            self._dirty = True
            
            # Apply decay to old records
            self._apply_decay(entry)
    
    def _apply_decay(self, entry: Dict):
        """Apply decay to old records to give more weight to recent data"""
        now = time.time()
        decay_window = 30 * 24 * 3600  # 30 days
        
        # Decay old successes
        recent_successes = []
        for success in entry.get("successful_strategies", []):
            age = now - success.get("timestamp", 0)
            if age < decay_window:
                recent_successes.append(success)
        
        if len(recent_successes) < len(entry.get("successful_strategies", [])):
            entry["successful_strategies"] = recent_successes
        
        # Decay old rejections
        recent_rejections = []
        for rejection in entry.get("rejections", []):
            age = now - rejection.get("timestamp", 0)
            if age < decay_window:
                recent_rejections.append(rejection)
        
        if len(recent_rejections) < len(entry.get("rejections", [])):
            entry["rejections"] = recent_rejections
    
    def get_best_strategy(self, concept: str, category: str = "", 
                         topic_class: str = "") -> Optional[Dict]:
        """
        Get the best strategy for a concept based on historical success.
        Returns strategy details or None if no successful strategy.
        """
        with self.lock:
            key = self._make_key(concept, category, topic_class)
            entry = self.memory.get(key)
            
            if not entry or not entry.get("successful_strategies"):
                # Try canonical key (without topic class)
                canonical_key = self._make_canonical_key(concept, category)
                for stored_key, stored_entry in self.memory.items():
                    if stored_key.startswith(canonical_key + "::"):
                        if stored_entry.get("successful_strategies"):
                            entry = stored_entry
                            break
            
            if not entry or not entry.get("successful_strategies"):
                return None
            
            # Find most recent successful strategy
            strategies = entry.get("successful_strategies", [])
            if not strategies:
                return None
            
            # Sort by recency and confidence
            strategies.sort(key=lambda x: (
                x.get("timestamp", 0),
                x.get("confidence", 0)
            ), reverse=True)
            
            best_strategy = strategies[0]
            
            # Calculate success rate for this strategy
            strategy_name = best_strategy.get("strategy", "")
            total_with_strategy = entry["success_by_strategy"].get(strategy_name, 0)
            total_attempts = entry.get("total_attempts", 0)
            
            success_rate = total_with_strategy / total_attempts if total_attempts > 0 else 0
            
            return {
                "strategy": strategy_name,
                "confidence": best_strategy.get("confidence", 0),
                "success_rate": success_rate,
                "last_used": best_strategy.get("timestamp", 0),
                "total_uses": total_with_strategy,
                "topic_class": best_strategy.get("topic_class", ""),
                "is_recent": (time.time() - best_strategy.get("timestamp", 0)) < (7 * 24 * 3600)
            }
    
    def should_skip(self, concept: str, category: str = "", 
                   topic_class: str = "", max_rejections: int = 5) -> bool:
        """
        Decide whether a concept has been rejected often enough to give up on it.

        A concept is skipped only while it has never succeeded AND either
        its lifetime rejection count has reached ``max_rejections`` or at
        least three of its recorded rejections are younger than 24 hours.
        A concept without a memory entry is never skipped.
        """
        with self.lock:
            record = self.memory.get(self._make_key(concept, category, topic_class))
            if not record:
                return False

            total_rejections = record.get("rejection_count", 0)
            never_succeeded = not (record.get("success_count", 0) > 0)

            # Lifetime threshold reached without a single success
            if total_rejections >= max_rejections and never_succeeded:
                return True

            # Burst of rejections within the last day
            now = time.time()
            day_seconds = 24 * 3600
            recent_count = sum(
                1
                for item in record.get("rejections", [])
                if now - item.get("timestamp", 0) < day_seconds
            )

            return recent_count >= 3 and never_succeeded
    
    def get_concept_stats(self, concept: str, category: str = "", 
                         topic_class: str = "") -> Optional[Dict]:
        """
        Summarise the recorded history of a single concept.

        Returns None when no memory entry exists under the key derived from
        (concept, category, topic_class). Otherwise returns a dict with the
        raw counters and timestamps of the entry, the overall success rate,
        and the most frequent rejection reason / most successful strategy
        ("none" when nothing was recorded for them).
        """
        with self.lock:
            record = self.memory.get(self._make_key(concept, category, topic_class))
            if not record:
                return None

            def _most_frequent(counts):
                # Name with the highest count; the first one wins on ties
                if not counts:
                    return "none"
                return max(counts, key=lambda name: counts[name])

            if record["total_attempts"] > 0:
                success_rate = record["success_count"] / record["total_attempts"]
            else:
                success_rate = 0

            reasons = record.get("rejection_by_reason", {})
            top_reason = _most_frequent(reasons)

            strategies = record.get("success_by_strategy", {})
            top_strategy = _most_frequent(strategies)

            summary = {
                "concept": concept,
                "category": category,
                "topic_class": topic_class,
                "total_attempts": record["total_attempts"],
                "success_count": record["success_count"],
                "rejection_count": record["rejection_count"],
                "success_rate": success_rate,
                "first_seen": record["first_seen_ts"],
                "first_success": record["first_success_ts"],
                "last_success": record["last_success_ts"],
                "attempts_until_first_success": record["attempts_until_first_success"],
                "most_common_rejection_reason": top_reason,
                "most_successful_strategy": top_strategy,
                # Copies, so callers cannot mutate the stored counters
                "rejection_by_reason": dict(reasons),
                "success_by_strategy": dict(strategies)
            }
            return summary
    
    def get_learning_velocity(self) -> Dict:
        """
        Aggregate learning metrics over every entry in memory.

        Returns a dict with the number of concepts tracked, how many have at
        least one success ("learned") or only rejections so far ("learning"),
        the mean number of attempts needed for a first success (rounded to
        two decimals), the success rate per topic class, and the fraction of
        tracked concepts that have been learned.
        """
        with self.lock:
            entries = list(self.memory.values())

            learned = [e for e in entries if e.get("success_count", 0) > 0]
            in_progress = [
                e for e in entries
                if not (e.get("success_count", 0) > 0) and e.get("rejection_count", 0) > 0
            ]

            # Mean attempts until the first success (entries without the field are ignored)
            first_success_attempts = [
                e["attempts_until_first_success"]
                for e in learned
                if e.get("attempts_until_first_success")
            ]
            if first_success_attempts:
                avg_attempts = sum(first_success_attempts) / len(first_success_attempts)
            else:
                avg_attempts = 0

            # Success and attempt totals per topic class
            totals_by_topic = {}
            for e in entries:
                bucket = totals_by_topic.setdefault(
                    e.get("topic_class", "unknown"), {"success": 0, "total": 0}
                )
                bucket["success"] += e.get("success_count", 0)
                bucket["total"] += e.get("total_attempts", 0)

            # Topics without any attempt are left out to avoid dividing by zero
            success_rates = {
                topic: bucket["success"] / bucket["total"]
                for topic, bucket in totals_by_topic.items()
                if bucket["total"] > 0
            }

            concept_total = len(self.memory)
            return {
                "total_concepts": concept_total,
                "concepts_learned": len(learned),
                "concepts_learning": len(in_progress),
                "avg_attempts_until_success": round(avg_attempts, 2),
                "success_rates_by_topic": success_rates,
                "learning_efficiency": len(learned) / concept_total if self.memory else 0
            }
    
    def save(self, prune: bool = True):
        """
        Persist the rejection memory to ``self.memory_file``.

        Does nothing when there are no unsaved changes. Persistence is
        best-effort: any failure is logged and swallowed, and ``_dirty`` then
        stays set so a later call retries.

        Args:
            prune: When True, stale never-successful entries are removed via
                ``_prune_old_entries()`` before writing.
        """
        if not self._dirty:
            return

        try:
            # Prune old entries if requested
            if prune:
                with self.lock:
                    self._prune_old_entries()

            # get_learning_velocity() acquires self.lock itself. It is called
            # outside our own critical section so that a non-reentrant lock
            # cannot block on itself; with a re-entrant lock this is equally safe.
            learning_stats = self.get_learning_velocity()

            with self.lock:
                data = {
                    "memory": self.memory,
                    "count": len(self.memory),
                    "learning_stats": learning_stats,
                    "last_updated": time.time(),
                    "version": "v2"
                }
                # Write while holding the lock so the memory cannot change mid-serialisation
                atomic_write(data, self.memory_file)
                self._dirty = False
                concept_count = len(self.memory)

            logger.debug(f"Saved rejection memory: {concept_count:,} concepts")
        except Exception as e:
            logger.error(f"Failed to save rejection memory: {e}")
    
    def _prune_old_entries(self, ttl_days: int = 90):
        """
        Drop entries first seen more than ``ttl_days`` ago that never succeeded.

        Entries without a ``first_seen_ts`` are treated as brand new and kept.
        Marks the memory dirty when anything was removed. Does not take
        ``self.lock`` itself.
        """
        now = time.time()
        max_age_seconds = ttl_days * 24 * 60 * 60

        def _is_stale(record):
            # Both facts are computed up front, then combined
            age_seconds = now - record.get("first_seen_ts", now)
            succeeded = record.get("success_count", 0) > 0
            return age_seconds > max_age_seconds and not succeeded

        # Collect first: the dict must not change size while it is iterated
        stale_keys = [key for key, record in self.memory.items() if _is_stale(record)]

        for key in stale_keys:
            del self.memory[key]

        if not stale_keys:
            return

        logger.info(f"Pruned {len(stale_keys)} stale memory entries (>{ttl_days} days old, no success)")
        self._dirty = True
    
    def export_for_analysis(self) -> List[Dict]:
        """
        Export per-concept statistics for every entry in memory.

        Returns:
            One ``get_concept_stats()`` dict per entry. Entries for which
            ``get_concept_stats()`` finds nothing (it returns None) are left
            out, so the list contains dicts only.
        """
        # Snapshot the identifying fields under the lock, then release it.
        # get_concept_stats() acquires self.lock itself, so it must not be
        # called while we hold it unless the lock is re-entrant.
        with self.lock:
            identities = [
                (
                    entry.get("concept", ""),
                    entry.get("category", ""),
                    entry.get("topic_class", ""),
                )
                for entry in self.memory.values()
            ]

        exported = []
        for concept, category, topic_class in identities:
            stats = self.get_concept_stats(concept, category, topic_class)
            if stats is not None:
                exported.append(stats)
        return exported

# ============================================================
# INITIALIZE GLOBAL INSTANCES
# ============================================================
logger.info("=" * 60)
logger.info("INITIALIZING MEMORY & LEARNING SYSTEMS")
logger.info("=" * 60)

# Module-level duplicate detector, constructed from the fingerprint and bloom file paths
duplicate_detector = HybridDuplicateDetector(CARD_FINGERPRINTS_FILE, BLOOM_FILE)

# Module-level tracker for terminal rejections
terminal_rejections = TerminalRejectionTracker(TERMINAL_REJECTIONS_FILE)

# Rejection learning is optional: `rejection_memory` stays None when
# CONFIG["ENABLE_REJECTION_LEARNING"] is falsy (the key defaults to enabled).
rejection_memory = None
if CONFIG.get("ENABLE_REJECTION_LEARNING", True):
    rejection_memory = RejectionMemory(REJECTION_MEMORY_FILE)
    logger.info("Rejection memory enabled")
else:
    logger.info("Rejection memory disabled")

# Log a one-line summary of each subsystem (nothing is initialized here)
logger.info(f"Duplicate detector: {duplicate_detector.get_stats()['fingerprints_count']:,} fingerprints")
logger.info(f"Terminal rejections: {terminal_rejections.get_stats()['total']:,} reels")
# NOTE(review): this is a truthiness test, not `is not None` — if RejectionMemory
# defines __len__ or __bool__, an empty memory would skip the summary. Confirm.
if rejection_memory:
    stats = rejection_memory.get_learning_velocity()
    logger.info(f"Rejection memory: {stats['total_concepts']:,} concepts, "
                f"{stats['concepts_learned']:,} learned")

logger.info("=" * 60)

In [None]:
# ============================================================
# CELL 5: LLM & MODEL MANAGEMENT (COMPLETE)
# ============================================================

import time
import subprocess
import requests
from typing import Dict, List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import json

# Rebinds the module-level `logger` (first bound to "anki-pipeline" in the
# configuration cell) to a child logger. Notebook cells share one namespace,
# so functions defined in earlier cells that look up the global `logger`
# also log under "anki-pipeline.llm" once this cell has run.
logger = logging.getLogger("anki-pipeline.llm")

# ============================================================
# MODEL CONFIGURATION
# ============================================================

# Primary model configurations
PIPELINE_MODELS = {
    stage: ModelConfig(
        name=model_name,
        temperature=temperature,
        num_predict=num_predict,
        timeout=timeout
    )
    # One row per pipeline stage:
    # (stage, model name, temperature, num_predict, timeout)
    for stage, model_name, temperature, num_predict, timeout in (
        ('extract',           'mistral:7b-instruct-v0.2', 0.1,  2000, 300),
        ('extract_retry',     'mistral:7b-instruct-v0.2', 0.15, 1500, 180),
        ('generate_basic',    'mistral:7b-instruct',      0.2,  1500, 180),
        ('generate_cloze',    'mistral:7b-instruct',      0.2,  800,  120),
        ('generate_tradeoff', 'qwen2.5:7b-instruct',      0.2,  1200, 150),
        ('validate',          'qwen2.5:7b-instruct',      0.05, 500,  60),
        ('normalize',         'mistral:7b-instruct-v0.2', 0.1,  1000, 120),
    )
}

# Model fallback chains
MODEL_FALLBACKS = {
    # Each key is a preferred model tag; the list holds substitutes in the order they are tried.
    'mistral:7b-instruct-v0.2': ['mistral:7b-instruct', 'mistral:latest'],
    'llama3.1:8b-instruct-q8_0': [
        'mistral:7b-instruct', 'qwen2.5:7b-instruct',
        'llama3.1:8b-instruct-q4_0', 'llama3.1:latest',
    ],
    'qwen2.5:7b-instruct-q4_K_M': [
        'qwen2.5:7b-instruct', 'mistral:7b-instruct', 'llama3.1:8b-instruct-q4_0',
    ],
    'qwen2.5:7b-instruct': ['qwen2.5:latest', 'mistral:7b-instruct', 'phi3:mini'],
    'mistral:7b-instruct': [
        'mistral:latest', 'qwen2.5:7b-instruct', 'llama3.1:8b-instruct-q4_0',
    ],
}

# Model capability profiles
MODEL_CAPABILITIES = {
    family: {
        'context_window': context_window,
        'supports_json': True,
        'max_tokens': max_tokens,
        'good_at': list(strengths)
    }
    # One row per model family:
    # (family substring, context window, max tokens, strengths)
    for family, context_window, max_tokens, strengths in (
        ('mistral', 32768, 8000, ('reasoning', 'instruction_following')),
        ('qwen',    32768, 8000, ('coding', 'technical_content')),
        ('llama',   8192,  4000, ('general_knowledge', 'summarization')),
        ('phi',     4096,  2000, ('fast_inference', 'small_tasks')),
    )
}

# ============================================================
# HTTP SESSION MANAGEMENT
# ============================================================

class ResilientSession:
    """
    Thin wrapper around ``requests.Session`` that retries transient failures.

    Timeouts, connection errors and HTTP 5xx responses are retried up to
    ``max_retries`` times, sleeping ``retry_delays[attempt]`` seconds between
    attempts. Responses with any other status (including 4xx) are returned to
    the caller unchanged. Simple counters are kept for ``get_stats()``.
    """
    
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            "Connection": "keep-alive",
            "User-Agent": "Anki-Pipeline/1.0"
        })
        
        # Retry configuration
        self.max_retries = 3
        # Seconds to wait after attempt 1, 2, 3, ...; the last entry is reused
        # when there are more retries than entries
        self.retry_delays = [1, 2, 5, 10]
        
        # Statistics. This session is used from thread pools elsewhere in the
        # file, so every counter update happens under _stats_lock.
        self._stats_lock = Lock()
        self.requests_made = 0        # attempts sent, whether or not a response came back
        self.failures = 0             # attempts that timed out, could not connect, or got a 5xx
        self.total_latency = 0.0      # summed latency of attempts that received a response
        self._responses_received = 0  # attempts that received any response
        
        logger.info("Resilient HTTP session initialized")
    
    def request_with_retry(self, method: str, url: str, **kwargs) -> requests.Response:
        """
        Send an HTTP request, retrying transient failures.

        Args:
            method: HTTP method name, passed to ``requests.Session.request``.
            url: Target URL.
            **kwargs: Forwarded to ``requests.Session.request``. A ``timeout``
                of 30 seconds is applied when the caller gives none.

        Returns:
            The first response whose status code is below 500.

        Raises:
            requests.exceptions.Timeout, requests.exceptions.ConnectionError,
            requests.exceptions.HTTPError: The error of the last attempt once
                every attempt has failed. For 5xx failures the ``HTTPError``
                carries the offending response in ``.response``.
        """
        # Add timeout if not specified
        kwargs.setdefault('timeout', 30)
        
        total_attempts = self.max_retries + 1
        last_error = None
        
        for attempt in range(total_attempts):
            with self._stats_lock:
                self.requests_made += 1
            
            try:
                start_time = time.time()
                response = self.session.request(method, url, **kwargs)
                latency = time.time() - start_time
                
                with self._stats_lock:
                    self._responses_received += 1
                    self.total_latency += latency
                
                # Treat server errors as retryable. The response is attached so
                # that callers inspecting e.response do not find None.
                if response.status_code >= 500:
                    raise requests.exceptions.HTTPError(
                        f"Server error: {response.status_code}",
                        response=response
                    )
                
                # Success
                PROCESSING_METRICS.record_timing("http_request", latency)
                return response
                
            except (requests.exceptions.Timeout, 
                    requests.exceptions.ConnectionError,
                    requests.exceptions.HTTPError) as e:
                
                last_error = e
                with self._stats_lock:
                    self.failures += 1
                
                if attempt < self.max_retries:
                    delay = self.retry_delays[min(attempt, len(self.retry_delays) - 1)]
                    logger.warning(f"Request failed (attempt {attempt + 1}/{total_attempts}): {e}. Retrying in {delay}s...")
                    time.sleep(delay)
                else:
                    logger.error(f"Request failed after {total_attempts} attempts: {e}")
        
        # All retries failed
        error_tracker.record(
            "http_request_failed",
            f"Failed after {total_attempts} attempts",
            {"url": url, "method": method, "last_error": str(last_error)}
        )
        
        raise last_error or requests.exceptions.RequestException("Request failed")
    
    def get_stats(self) -> Dict:
        """
        Return a snapshot of the session counters.

        ``success_rate`` is the share of attempts that did not fail and always
        lies in [0, 1]; ``avg_latency`` is averaged over the attempts that
        actually received a response.
        """
        with self._stats_lock:
            attempts = self.requests_made
            failed = self.failures
            latency_sum = self.total_latency
            responses = self._responses_received
        
        avg_latency = latency_sum / responses if responses > 0 else 0
        success_rate = (attempts - failed) / attempts if attempts > 0 else 0
        
        return {
            "requests_made": attempts,
            "failures": failed,
            "success_rate": success_rate,
            "avg_latency": avg_latency,
            "total_latency": latency_sum
        }

# Shared module-level HTTP session: ModelManager calls it directly and
# LLMInterface stores a reference to it. Constructing it creates the
# underlying requests.Session and logs one info line.
session = ResilientSession()

# ============================================================
# GPU & HARDWARE DETECTION
# ============================================================

def detect_hardware() -> Dict:
    """
    Probe the machine for GPU and memory information.

    Detection is best-effort: a probe that fails leaves its fields at their
    defaults instead of raising.

    Returns:
        Dict with ``gpu_available``, ``gpu_memory_gb``, ``cpu_cores``,
        ``total_memory_gb`` and ``platform``; ``gpu_name`` is added when
        nvidia-smi reports one. On multi-GPU machines the memory and the name
        both describe the first GPU listed.
    """
    hardware_info = {
        "gpu_available": False,
        "gpu_memory_gb": 0,
        "cpu_cores": os.cpu_count() or 1,
        "total_memory_gb": 0,
        "platform": sys.platform
    }
    
    try:
        # Check for NVIDIA GPU
        result = subprocess.run(
            ["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader,nounits"],
            capture_output=True,
            text=True,
            timeout=5
        )
        
        if result.returncode == 0:
            hardware_info["gpu_available"] = True
            # One line per GPU; use the first
            memory_mb = int(result.stdout.strip().split('\n')[0])
            hardware_info["gpu_memory_gb"] = memory_mb / 1024
            
            # Get GPU name
            result = subprocess.run(
                ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"],
                capture_output=True,
                text=True,
                timeout=5
            )
            if result.returncode == 0:
                # Also one line per GPU: report the same (first) GPU whose
                # memory was read above rather than a newline-joined string
                gpu_names = result.stdout.strip().splitlines()
                if gpu_names:
                    hardware_info["gpu_name"] = gpu_names[0].strip()
    
    except (subprocess.TimeoutExpired, OSError, ValueError):
        # No GPU, nvidia-smi missing or not executable (FileNotFoundError and
        # PermissionError are both OSError), or output that could not be parsed
        pass
    
    try:
        # Get system memory (Linux/Mac)
        if sys.platform == "linux":
            with open('/proc/meminfo', 'r') as f:
                for line in f:
                    if 'MemTotal' in line:
                        kb = int(line.split()[1])
                        hardware_info["total_memory_gb"] = kb / (1024 * 1024)
                        break
        elif sys.platform == "darwin":  # macOS
            result = subprocess.run(
                ["sysctl", "-n", "hw.memsize"],
                capture_output=True,
                text=True,
                timeout=5  # same bound as the nvidia-smi probes; never hang here
            )
            if result.returncode == 0:
                bytes_mem = int(result.stdout.strip())
                hardware_info["total_memory_gb"] = bytes_mem / (1024 ** 3)
    
    except Exception as e:
        logger.debug(f"Could not detect system memory: {e}")
    
    logger.info(f"Hardware detected: {json.dumps(hardware_info, indent=2)}")
    return hardware_info

# Probe the hardware once, when this cell runs. LLMInterface._prepare_payload
# reads the "gpu_available" and "gpu_memory_gb" fields of this dict.
HARDWARE_INFO = detect_hardware()

# ============================================================
# MODEL MANAGEMENT
# ============================================================

class ModelManager:
    """
    Track which Ollama models are installed, responsive and fit for a purpose.

    Caches the installed-model list for ``cache_ttl`` seconds, keeps per-model
    health counters, and selects a usable model from a preferred name plus an
    ordered list of fallbacks.
    """
    
    # Number of health checks that must end in a raised error before a model
    # is excluded from selection
    UNHEALTHY_FAILURE_THRESHOLD = 3
    
    def __init__(self):
        self.available_models_cache = None
        self.cache_timestamp = 0
        self.cache_ttl = 30  # seconds
        self.lock = Lock()  # guards the model-list cache
        
        # Model health tracking
        self.model_health = defaultdict(lambda: {"success": 0, "failures": 0})
        self.unhealthy_models = set()
        self.health_lock = Lock()  # guards model_health and unhealthy_models
        
        logger.info("Model manager initialized")
    
    def get_available_models(self, force_refresh: bool = False) -> List[str]:
        """
        Return the names of the models installed in Ollama.

        A non-empty cached list younger than ``cache_ttl`` is reused unless
        ``force_refresh`` is set. When the request fails, a stale cached list
        is returned if there is one, otherwise an empty list.
        """
        now = time.time()
        
        with self.lock:
            if (not force_refresh and 
                self.available_models_cache and 
                now - self.cache_timestamp < self.cache_ttl):
                return self.available_models_cache
        
        try:
            # The network call is made without holding the lock
            response = session.request_with_retry(
                "GET",
                f"{CONFIG['OLLAMA_URL']}/api/tags",
                timeout=10
            )
            response.raise_for_status()
            
            models_data = response.json()
            models = [m["name"] for m in models_data.get("models", [])]
            
            with self.lock:
                self.available_models_cache = models
                self.cache_timestamp = now
            
            logger.debug(f"Found {len(models)} available models")
            return models
            
        except Exception as e:
            logger.error(f"Failed to get available models: {e}")
            
            # Return cache if available, even if stale
            with self.lock:
                if self.available_models_cache:
                    logger.warning("Using stale model cache")
                    return self.available_models_cache
            
            return []
    
    def check_model_health(self, model_name: str) -> Tuple[bool, Optional[str]]:
        """
        Probe a model with a one-token generation request.

        Returns:
            ``(True, None)`` when the model answered with HTTP 200, otherwise
            ``(False, reason)``.

        A model is added to ``unhealthy_models`` once
        ``UNHEALTHY_FAILURE_THRESHOLD`` checks have ended in a raised error
        (non-200 responses count as failures but do not exclude the model).
        Nothing in this class removes a model from that set again, so the
        exclusion lasts for the lifetime of the manager.
        """
        # Skip models that are already excluded
        with self.health_lock:
            if model_name in self.unhealthy_models:
                return False, "Marked as unhealthy"
        
        try:
            # Simple test prompt
            test_payload = {
                "model": model_name,
                "prompt": "test",
                "stream": False,
                "options": {"num_predict": 1}
            }
            
            start_time = time.time()
            response = session.request_with_retry(
                "POST",
                f"{CONFIG['OLLAMA_URL']}/api/generate",
                json=test_payload,
                timeout=15
            )
            latency = time.time() - start_time
            
            if response.status_code == 200:
                with self.health_lock:
                    self.model_health[model_name]["success"] += 1
                
                logger.debug(f"Model {model_name} health check passed ({latency:.2f}s)")
                return True, None
            
            error_msg = response.text
            logger.warning(f"Model {model_name} health check failed: {response.status_code} - {error_msg}")
            
            with self.health_lock:
                self.model_health[model_name]["failures"] += 1
            
            return False, f"HTTP {response.status_code}: {error_msg}"
                
        except Exception as e:
            logger.warning(f"Model {model_name} health check error: {e}")
            
            with self.health_lock:
                self.model_health[model_name]["failures"] += 1
                if self.model_health[model_name]["failures"] >= self.UNHEALTHY_FAILURE_THRESHOLD:
                    self.unhealthy_models.add(model_name)
                    logger.error(f"Model {model_name} marked as unhealthy")
            
            return False, str(e)
    
    def get_model_capability(self, model_name: str) -> Dict:
        """
        Estimate a model's capabilities from its name.

        The first family in ``MODEL_CAPABILITIES`` whose name occurs in
        ``model_name`` supplies the profile. Its ``context_window`` and
        ``max_tokens`` are also published as ``estimated_context`` and
        ``estimated_max_tokens``, the keys the rest of the pipeline reads.
        Unknown models get conservative defaults (4096 / 2000).

        Returns:
            A new capability dict; it is also stored in
            ``MODEL_CAPABILITY_CACHE`` under ``model_name``.
        """
        model_lower = model_name.lower()
        capabilities = {
            "supports_generate": True,
            "supports_json": True,
            "estimated_context": 4096,
            "estimated_max_tokens": 2000,
            "family": "unknown"
        }
        
        # Determine model family
        for family, profile in MODEL_CAPABILITIES.items():
            if family in model_lower:
                capabilities.update(profile)
                capabilities["family"] = family
                # Profiles are keyed context_window / max_tokens, while consumers
                # read estimated_context / estimated_max_tokens. Without this
                # mapping every model would keep the 4096 / 2000 defaults.
                # NOTE(review): confirm the request payload asks Ollama for a
                # matching context size (num_ctx), otherwise long prompts may
                # still be cut server-side.
                if "context_window" in profile:
                    capabilities["estimated_context"] = profile["context_window"]
                if "max_tokens" in profile:
                    capabilities["estimated_max_tokens"] = profile["max_tokens"]
                break
        
        # Check for chat-only models
        chat_hints = ["chat", "instruct-chat", "assistant"]
        if any(hint in model_lower for hint in chat_hints):
            capabilities["supports_generate"] = True  # Most instruct models support generate
        
        # Check for embedding models
        if "embed" in model_lower:
            capabilities["supports_generate"] = False
        
        # Cap the estimates based on model size hints
        if "7b" in model_lower:
            capabilities["estimated_context"] = min(capabilities["estimated_context"], 32768)
            capabilities["estimated_max_tokens"] = min(capabilities["estimated_max_tokens"], 8000)
        elif "13b" in model_lower:
            capabilities["estimated_context"] = min(capabilities["estimated_context"], 16384)
            capabilities["estimated_max_tokens"] = min(capabilities["estimated_max_tokens"], 4000)
        
        # Cache the capability
        with MODEL_CAPABILITY_LOCK:
            MODEL_CAPABILITY_CACHE[model_name] = capabilities
        
        return capabilities
    
    @staticmethod
    def _supports_purpose(capabilities: Dict, purpose: str) -> bool:
        """Tell whether a capability dict satisfies a purpose: "extract" needs JSON output, everything else needs text generation."""
        if purpose == "extract":
            return bool(capabilities.get("supports_json", True))
        return bool(capabilities.get("supports_generate", True))
    
    def select_best_model(self, preferred: str, fallbacks: List[str], 
                         purpose: str = "general") -> Optional[str]:
        """
        Select the best available model for a specific purpose.

        Candidates are tried in order (``preferred`` first, then ``fallbacks``).
        The first one that is installed, passes a health check and supports
        ``purpose`` wins.

        Args:
            preferred: Model name to use when possible.
            fallbacks: Substitute model names, in order of preference.
            purpose: "extract" requires JSON support; any other value
                (including the default "general") requires plain generation.

        Returns:
            The selected model name, or None when no candidate qualifies.
        """
        available_models = set(self.get_available_models())
        
        already_tried = set()
        for candidate in [preferred, *(fallbacks or [])]:
            # A health check costs a real generation request: never probe a name twice
            if candidate in already_tried:
                continue
            already_tried.add(candidate)
            
            if candidate not in available_models:
                continue
            
            is_healthy, _error = self.check_model_health(candidate)
            if not is_healthy:
                continue
            
            capabilities = self.get_model_capability(candidate)
            if not self._supports_purpose(capabilities, purpose):
                if candidate == preferred:
                    logger.warning(f"Preferred model {preferred} doesn't support {purpose}")
                continue
            
            if candidate == preferred:
                logger.debug(f"Selected preferred model: {preferred}")
            else:
                logger.info(f"Using fallback model: {candidate} (instead of {preferred})")
            return candidate
        
        logger.error(f"No suitable model found for {purpose}. Preferred: {preferred}, Fallbacks: {fallbacks}")
        return None
    
    def warmup_models(self, models_to_warm: Optional[List[str]] = None):
        """
        Health-check several models in parallel so they are loaded before use.

        Args:
            models_to_warm: Model names to probe; defaults to every distinct
                model named in ``PIPELINE_MODELS``.

        Returns:
            A list of ``(model, healthy, error)`` tuples in completion order;
            empty when there is nothing to warm up.
        """
        if models_to_warm is None:
            # dict.fromkeys removes duplicates while keeping a stable order
            models_to_warm = list(dict.fromkeys(cfg.name for cfg in PIPELINE_MODELS.values()))
        
        if not models_to_warm:
            # ThreadPoolExecutor rejects max_workers=0
            logger.info("No models to warm up")
            return []
        
        logger.info(f"Warming up {len(models_to_warm)} models...")
        
        warmup_results = []
        with ThreadPoolExecutor(max_workers=min(4, len(models_to_warm))) as executor:
            future_to_model = {
                executor.submit(self.check_model_health, model): model 
                for model in models_to_warm
            }
            
            for future in as_completed(future_to_model):
                model = future_to_model[future]
                try:
                    healthy, error = future.result(timeout=30)
                    warmup_results.append((model, healthy, error))
                except Exception as e:
                    warmup_results.append((model, False, str(e)))
        
        # Log results
        healthy_count = sum(1 for _, healthy, _ in warmup_results if healthy)
        logger.info(f"Warmup complete: {healthy_count}/{len(warmup_results)} models healthy")
        
        for model, healthy, error in warmup_results:
            if healthy:
                logger.info(f"  ✅ {model}")
            else:
                logger.warning(f"  ❌ {model}: {error}")
        
        return warmup_results
    
    def get_model_stats(self) -> Dict:
        """Return per-model health counters, the excluded models and the size of the cached model list."""
        with self.health_lock:
            stats = {}
            for model, health in self.model_health.items():
                total = health["success"] + health["failures"]
                success_rate = health["success"] / total if total > 0 else 0
                stats[model] = {
                    "success": health["success"],
                    "failures": health["failures"],
                    "success_rate": success_rate,
                    "unhealthy": model in self.unhealthy_models
                }
            
            return {
                "model_stats": stats,
                "unhealthy_models": list(self.unhealthy_models),
                "cache_size": len(self.available_models_cache) if self.available_models_cache else 0
            }

# Shared module-level ModelManager; LLMInterface._truncate_prompt_if_needed
# looks up model capabilities through this instance.
model_manager = ModelManager()

# ============================================================
# LLM INTERFACE WITH CIRCUIT BREAKER
# ============================================================

class LLMInterface:
    """
    Text-generation front end for Ollama, protected by a circuit breaker.

    Every call goes through ``OLLAMA_CIRCUIT_BREAKER`` and the shared HTTP
    session. Token counts and response times are accumulated for
    ``get_stats()``; because ``batch_generate`` calls ``generate`` from
    several threads, those counters are only touched under ``self.lock``.
    """
    
    def __init__(self):
        self.circuit_breaker = OLLAMA_CIRCUIT_BREAKER
        self.session = session
        self.lock = Lock()  # guards the counters and response_times below
        
        # Token tracking
        self.total_tokens = 0
        self.total_requests = 0
        
        # Performance tracking
        self.response_times = []
        self.max_response_time_history = 1000
        
        logger.info("LLM interface initialized with circuit breaker")
    
    def _truncate_prompt_if_needed(self, prompt: str, model_config: ModelConfig) -> str:
        """
        Shorten a prompt that would not fit the model's estimated context.

        Prompts estimated above 80% of the context are cut down to roughly 70%
        of it, preferably at a sentence end or line break close to the cut,
        and a note telling the model about the truncation is appended.
        """
        prompt_len = len(prompt)
        
        # Get model capabilities
        capabilities = model_manager.get_model_capability(model_config.name)
        max_context = capabilities.get("estimated_context", 4096)
        
        # Rough estimate: 1 token ≈ 4 chars for English
        estimated_tokens = prompt_len // 4
        
        if estimated_tokens > max_context * 0.8:  # 80% of context
            logger.warning(f"Prompt may be too long: {estimated_tokens} tokens "
                          f"(model context: {max_context})")
            
            # Calculate how much to truncate
            target_chars = int(max_context * 0.7 * 4)  # 70% of context
            if prompt_len > target_chars:
                logger.warning(f"Truncating prompt from {prompt_len} to {target_chars} chars")
                
                # Prefer a sentence end or line break within the last 10% of the cut
                truncated = prompt[:target_chars]
                last_period = truncated.rfind('.')
                last_newline = truncated.rfind('\n')
                
                if last_period > target_chars * 0.9:  # Good break point
                    prompt = truncated[:last_period + 1]
                elif last_newline > target_chars * 0.9:
                    prompt = truncated[:last_newline]
                else:
                    prompt = truncated
                
                # Add instruction
                prompt += "\n\n[Content truncated due to length. Provide concise answer.]"
        
        return prompt
    
    def _prepare_payload(self, prompt: str, model_config: ModelConfig) -> Dict:
        """
        Build the request body for Ollama's generate endpoint.

        The prompt is truncated when necessary. When a GPU was detected and the
        config leaves ``gpu_layers`` at -1, ``num_gpu`` is set from the
        detected GPU memory.
        """
        # Truncate if needed
        prompt = self._truncate_prompt_if_needed(prompt, model_config)
        
        # Create payload
        payload = model_config.to_ollama_payload(prompt)
        
        # Add GPU layers if available
        if HARDWARE_INFO["gpu_available"] and model_config.gpu_layers == -1:
            # Auto-configure GPU layers based on available memory
            gpu_memory_gb = HARDWARE_INFO["gpu_memory_gb"]
            if gpu_memory_gb >= 16:
                num_gpu = 100  # Use all layers
            elif gpu_memory_gb >= 8:
                num_gpu = 50   # Half layers
            elif gpu_memory_gb >= 4:
                num_gpu = 25   # Quarter layers
            else:
                num_gpu = None  # too little memory: leave the option unset
            
            if num_gpu is not None:
                # Do not assume the payload already carries an "options" dict
                payload.setdefault("options", {})["num_gpu"] = num_gpu
        
        return payload
    
    def generate(self, prompt: str, model_config: ModelConfig, 
                max_retries: int = 3) -> str:
        """
        Generate text using LLM with circuit breaker protection.
        
        Args:
            prompt: The prompt to send
            model_config: Model configuration
            max_retries: Recorded in the error context only. Retries are
                performed by the HTTP session; this method does not loop.
            
        Returns:
            Generated text (empty string when the reply has no "response" field)
            
        Raises:
            RuntimeError: If Ollama rejects the prompt as too long or does not
                know the model.
            Exception: Any other failure from the session or the circuit
                breaker is recorded, logged and re-raised unchanged.
        """
        start_time = time.time()
        
        def _call_ollama():
            """Inner function for circuit breaker protection"""
            payload = self._prepare_payload(prompt, model_config)
            
            try:
                response = self.session.request_with_retry(
                    "POST",
                    f"{CONFIG['OLLAMA_URL']}/api/generate",
                    json=payload,
                    timeout=model_config.timeout
                )
                response.raise_for_status()
                
                result = response.json()
                
                # Track tokens
                tokens_used = (result.get("prompt_eval_count") or 0) + (result.get("eval_count") or 0)
                with self.lock:
                    self.total_tokens += tokens_used
                    self.total_requests += 1
                
                return result.get("response", "")
                
            except requests.exceptions.HTTPError as e:
                # An HTTPError can be raised without a response attached, in
                # which case e.response is None and there is no status to read
                status_code = getattr(e.response, "status_code", None)
                if status_code in (400, 404):
                    error_detail = e.response.text
                    logger.error(f"HTTP {status_code} from Ollama: {error_detail}")
                    
                    # Check for common errors
                    detail_lower = error_detail.lower()
                    if "context" in detail_lower or "too long" in detail_lower:
                        raise RuntimeError(f"Prompt too long for model {model_config.name}") from e
                    elif "model" in detail_lower and "not found" in detail_lower:
                        raise RuntimeError(f"Model {model_config.name} not found. "
                                          f"Pull it with: ollama pull {model_config.name}") from e
                
                raise
        
        # Use circuit breaker
        try:
            response_text = self.circuit_breaker.call(_call_ollama)
            
            # Record success
            latency = time.time() - start_time
            with self.lock:
                self.response_times.append(latency)
                if len(self.response_times) > self.max_response_time_history:
                    self.response_times = self.response_times[-self.max_response_time_history:]
            
            PROCESSING_METRICS.record_timing("llm_generation", latency)
            PROCESSING_METRICS.increment("llm_requests")
            
            logger.debug(f"LLM call succeeded: {model_config.name}, {latency:.2f}s")
            return response_text
            
        except Exception as e:
            latency = time.time() - start_time
            PROCESSING_METRICS.increment("llm_failures")
            
            error_tracker.record(
                "llm_generation_failed",
                str(e),
                {
                    "model": model_config.name,
                    "prompt_length": len(prompt),
                    "latency": latency,
                    "retries": max_retries
                }
            )
            
            logger.error(f"LLM generation failed after {latency:.2f}s: {e}")
            raise
    
    def batch_generate(self, prompts: List[str], model_config: ModelConfig,
                      max_concurrent: int = 3) -> List[str]:
        """
        Generate multiple prompts concurrently.

        Returns:
            One string per prompt, in the order of ``prompts``. A prompt whose
            generation failed yields ``"ERROR: <message>"`` instead of raising.
        """
        if not prompts:
            return []
        
        # Pre-sized so each result lands at its prompt's index without sorting
        results = [""] * len(prompts)
        
        # ThreadPoolExecutor rejects max_workers <= 0
        with ThreadPoolExecutor(max_workers=max(1, max_concurrent)) as executor:
            future_to_idx = {
                executor.submit(self.generate, prompt, model_config): i
                for i, prompt in enumerate(prompts)
            }
            
            for future in as_completed(future_to_idx):
                idx = future_to_idx[future]
                try:
                    results[idx] = future.result()
                except Exception as e:
                    logger.error(f"Batch generation failed for prompt {idx}: {e}")
                    results[idx] = f"ERROR: {str(e)}"
        
        return results
    
    def get_stats(self) -> Dict:
        """Return request/token totals, average and p95 response time, and the circuit breaker state."""
        with self.lock:
            samples = sorted(self.response_times)
            total_requests = self.total_requests
            total_tokens = self.total_tokens
        
        avg_response_time = sum(samples) / len(samples) if samples else 0
        # int(n * 0.95) is always a valid index for n >= 1, so one sample is enough
        p95_response_time = samples[int(len(samples) * 0.95)] if samples else 0
        
        return {
            "total_requests": total_requests,
            "total_tokens": total_tokens,
            "avg_response_time": avg_response_time,
            "p95_response_time": p95_response_time,
            "response_time_samples": len(samples),
            "circuit_breaker_state": self.circuit_breaker.state
        }

# Shared module-level LLMInterface; its constructor binds it to
# OLLAMA_CIRCUIT_BREAKER and to the global `session`.
llm_interface = LLMInterface()

# ============================================================
# JSON EXTRACTION UTILITIES
# ============================================================

def _find_balanced_json_span(text: str, start: int) -> Optional[str]:
    """Return the bracket-balanced substring opening at ``text[start]``, or None if it never closes."""
    opener = text[start]
    closer = '}' if opener == '{' else ']'
    depth = 0
    in_string = False
    escaped = False

    for pos in range(start, len(text)):
        char = text[pos]
        if in_string:
            # Inside a string literal only a backslash escape or the closing quote matters;
            # brackets there must not affect the depth count.
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char == opener:
            depth += 1
        elif char == closer:
            depth -= 1
            if depth == 0:
                return text[start:pos + 1]

    return None


def _parse_json_candidate(candidate: str, lenient: bool):
    """Parse one candidate span, repairing common formatting slips; raises json.JSONDecodeError on failure."""
    try:
        # Well-formed JSON must never be touched by the repair regexes below:
        # they are not string-aware and would rewrite e.g. a ",}" inside a value.
        return json.loads(candidate)
    except json.JSONDecodeError:
        pass

    # Remove trailing commas before } or ]
    repaired = re.sub(r',(\s*[}\]])', r'\1', candidate)
    # Remove control characters (raw newlines/tabs are invalid inside JSON strings)
    repaired = re.sub(r'[\x00-\x1F\x7F]', '', repaired)

    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        if not lenient:
            raise

    # More aggressive, lenient-only repairs
    repaired = re.sub(r'([{\[,])\s*,', r'\1', repaired)  # Remove leading commas
    repaired = re.sub(r'":\s*,', '": null,', repaired)   # Fix empty values
    repaired = re.sub(r',\s*"}', '"}', repaired)         # Remove trailing commas
    return json.loads(repaired)


def extract_json(text: str, lenient: bool = False) -> Dict:
    """
    Extract JSON from LLM response with robust error handling.

    Candidates are bracket-balanced spans found by a string-aware scan. Every
    ``{...}`` span is tried in order of appearance (outermost first); only if
    no object parses are ``[...]`` spans tried. Each candidate is parsed as-is
    first and repaired (trailing commas, control characters) only if that
    fails, so valid JSON is never altered.

    Args:
        text: Text potentially containing JSON
        lenient: Whether to try lenient parsing if strict fails

    Returns:
        Parsed JSON as dictionary, or empty dict if failed. When the text
        contains no parseable object but does contain a parseable array, the
        parsed list is returned instead. Bare scalars are not returned.
    """
    if not text:
        return {}

    text = text.strip()
    last_error = "no JSON object or array found"

    # Objects take priority over arrays, regardless of position in the text
    for opener in ('{', '['):
        start = text.find(opener)
        while start != -1:
            candidate = _find_balanced_json_span(text, start)
            if candidate is not None:
                try:
                    return _parse_json_candidate(candidate, lenient)
                except (ValueError, RecursionError) as e:
                    # Keep scanning: a later (or nested) span may still be valid
                    last_error = e
            start = text.find(opener, start + 1)

    if lenient:
        logger.warning(f"JSON parsing failed even with lenient mode: {last_error}")
    else:
        logger.warning(f"JSON parsing failed: {last_error}")
    logger.debug(f"Problematic text: {text[:500]}...")
    return {}

def validate_json_structure(data: Dict, expected_structure: Dict) -> Tuple[bool, List[str]]:
    """
    Validate JSON structure against expected format.

    Args:
        data: JSON data to validate
        expected_structure: Dictionary mapping keys to expected type names
            ("list", "str", "bool", "int", "float", "dict"). Any other value
            only checks that the key is present.

    Returns:
        Tuple of (is_valid, list_of_errors)
    """
    if not isinstance(data, dict):
        return False, ["Data is not a dictionary"]

    # type name -> (accepted Python types, wording used in the error message)
    type_rules = {
        "list": (list, "a list"),
        "str": (str, "a string"),
        "bool": (bool, "a boolean"),
        "int": (int, "an integer"),
        "float": ((int, float), "a number"),
        "dict": (dict, "a dictionary"),
    }

    errors = []

    for key, expected_type in expected_structure.items():
        if key not in data:
            errors.append(f"Missing required key: {key}")
            continue

        rule = type_rules.get(expected_type) if isinstance(expected_type, str) else None
        if rule is None:
            continue

        accepted_types, description = rule
        value = data[key]

        # bool is a subclass of int in Python, so True/False would otherwise
        # pass the "int" and "float" checks.
        is_unexpected_bool = isinstance(value, bool) and expected_type != "bool"

        if is_unexpected_bool or not isinstance(value, accepted_types):
            errors.append(f"Key '{key}' should be {description}, got {type(value).__name__}")

    return len(errors) == 0, errors

# ============================================================
# MODEL VALIDATION & FALLBACK
# ============================================================

def validate_and_fix_models() -> Tuple[List[str], Dict]:
    """
    Check every stage in PIPELINE_MODELS and switch to a fallback model where needed.

    For each stage the model manager is asked for the best available model
    among the configured one and its MODEL_FALLBACKS entry. When a different
    model is chosen, the stage's config object is updated in place.

    Returns:
        Tuple of (missing_models, fixed_configs): ``missing_models`` holds a
        ``(stage, original_model)`` pair for every stage with no usable model;
        ``fixed_configs`` maps each re-pointed stage to a dict with the keys
        ``original``, ``selected`` and ``reason``.
    """
    logger.info("Validating pipeline models...")

    unresolved = []
    substitutions = {}

    for stage, config in PIPELINE_MODELS.items():
        configured_name = config.name

        # The stage name decides which purpose is passed to the model manager
        purpose = "generate"
        if "extract" in stage:
            purpose = "extract"

        candidates = MODEL_FALLBACKS.get(configured_name, [])
        chosen = model_manager.select_best_model(configured_name, candidates, purpose)

        if not chosen:
            unresolved.append((stage, configured_name))
            logger.error(f"  ❌ {stage}: {configured_name} - NO SUITABLE MODEL FOUND")
            continue

        if chosen == configured_name:
            logger.info(f"  ✅ {stage}: {configured_name}")
            continue

        # A fallback won: re-point the stage config and record the swap
        config.name = chosen
        substitutions[stage] = {
            "original": configured_name,
            "selected": chosen,
            "reason": "fallback_used"
        }
        logger.info(f"  {stage}: {configured_name} → {chosen}")

    if unresolved:
        logger.error(f"Missing models for stages: {unresolved}")

    return unresolved, substitutions

def check_ollama_health() -> bool:
    """
    Probe the Ollama server's ``/api/tags`` endpoint.

    Returns:
        True when the request completes with HTTP 200; False for any other
        status or when the request (or building its URL) raises.
    """
    try:
        tags_endpoint = f"{CONFIG['OLLAMA_URL']}/api/tags"
        reply = session.request_with_retry("GET", tags_endpoint, timeout=5)
        healthy = reply.status_code == 200
    except Exception as e:
        logger.error(f"Ollama health check failed: {e}")
        healthy = False

    return healthy

# ============================================================
# INITIALIZATION
# ============================================================

def initialize_llm_system():
    """
    Bring up the LLM layer: health check, model validation, warm-up, status logging.

    Raises:
        RuntimeError: If the Ollama server does not pass the health check.
            Missing models and failed warm-ups are only logged.
    """
    divider = "=" * 60
    for banner_line in (divider, "INITIALIZING LLM SYSTEM", divider):
        logger.info(banner_line)

    # The server being down is the only condition treated as fatal here
    server_ok = check_ollama_health()
    if not server_ok:
        logger.error("❌ Ollama server not responding!")
        raise RuntimeError("Ollama server not available")

    logger.info("✅ Ollama server is healthy")

    # Swap in fallbacks where possible; missing models are reported but not fatal
    missing_models, fixes = validate_and_fix_models()
    if missing_models:
        logger.error(f"❌ Missing models: {missing_models}")

    warmup_results = model_manager.warmup_models()

    # A critical stage counts as healthy when some warm-up entry names its
    # model (entry[0]) and carries a truthy second element (entry[1]).
    critical_failed = []
    for stage in ("extract", "generate_basic"):
        if stage not in PIPELINE_MODELS:
            continue

        model = PIPELINE_MODELS[stage].name
        warmed_up = False
        for entry in warmup_results:
            if entry[0] == model and entry[1]:
                warmed_up = True
                break

        if not warmed_up:
            critical_failed.append((stage, model))

    if critical_failed:
        logger.error(f"❌ Critical models failed warmup: {critical_failed}")
        logger.error("Pipeline may not function correctly")

    logger.info(f"Hardware: GPU={HARDWARE_INFO.get('gpu_available', False)}, "
                f"Memory={HARDWARE_INFO.get('gpu_memory_gb', 0):.1f}GB GPU, "
                f"{HARDWARE_INFO.get('total_memory_gb', 0):.1f}GB System")

    logger.info(f"Circuit breaker: {OLLAMA_CIRCUIT_BREAKER.state}")

    for banner_line in (divider, "LLM SYSTEM READY", divider):
        logger.info(banner_line)

# Run initialization when this cell executes.
# Any failure (including the RuntimeError raised when Ollama is unreachable) is
# logged rather than re-raised, so the rest of the cell still gets defined.
try:
    initialize_llm_system()
except Exception as e:
    logger.error(f"LLM system initialization failed: {e}")
    # Continue anyway, some features may work

# ============================================================
# CONVENIENCE FUNCTION
# ============================================================

def llm(prompt: str, model_config: ModelConfig, **kwargs) -> str:
    """
    Convenience function for LLM generation.

    This is the main function other cells should use.

    Args:
        prompt: Prompt text to send to the model.
        model_config: Model configuration forwarded to the interface.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``llm_interface.generate``.

    Returns:
        Whatever ``llm_interface.generate`` returns for this prompt.
    """
    return llm_interface.generate(prompt, model_config, **kwargs)

In [None]:
# ============================================================
# CELL 6: PROMPT MANAGEMENT & GENERATION (COMPLETE)
# ============================================================

import json
import time
from typing import Dict, List, Optional, Tuple, Any
from datetime import datetime
from collections import defaultdict, deque
import hashlib

logger = logging.getLogger("anki-pipeline.prompts")

# ============================================================
# PROGRESS TRACKER (COMPLETE)
# ============================================================

class ProgressTracker:
    """
    Track processed reels in an append-only text file with buffered writes.

    The progress file holds one reel id per line (for comma-separated lines
    only the first column is used). New ids are collected in an in-memory
    buffer and appended to the file once the buffer is full or the flush
    interval has elapsed. Before each append the current file is copied to a
    ``.progress.bak`` sibling.
    """

    def __init__(self, progress_file: Path):
        """
        Args:
            progress_file: Path of the progress file; loaded immediately if it exists.
        """
        self.progress_file = progress_file
        self.processed: Set[str] = self._load()
        self.lock = Lock()
        self._buffer: List[str] = []
        self._buffer_size = CONFIG.get("PROGRESS_BUFFER_SIZE", 50)
        self._last_flush = time.time()
        self.flush_interval = 30  # seconds

        # Statistics
        self.loaded_count = len(self.processed)
        logger.info(f"Progress tracker initialized: {self.loaded_count:,} reels already processed")

    def _backup_path(self) -> Path:
        """Path of the rolling backup written next to the progress file."""
        return self.progress_file.with_suffix('.progress.bak')

    @staticmethod
    def _read_ids(path: Path) -> Set[str]:
        """Read reel ids from ``path``: one per line, first column of CSV-style lines."""
        ids = set()
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                reel_id = line.split(',', 1)[0].strip()
                if reel_id:
                    ids.add(reel_id)
        return ids

    def _load(self) -> Set[str]:
        """
        Load processed reels from file.

        Returns:
            The set of ids found in the progress file. If that file cannot be
            read, the backup is tried; an empty set is returned when the
            progress file is missing or neither file is readable.
        """
        if not self.progress_file.exists():
            return set()

        try:
            loaded_set = self._read_ids(self.progress_file)
            logger.debug(f"Loaded {len(loaded_set):,} processed reels from {self.progress_file}")
            return loaded_set
        except Exception as e:
            logger.error(f"Error loading progress file {self.progress_file}: {e}")

        # Try backup if available
        backup_file = self._backup_path()
        if backup_file.exists():
            logger.info(f"Trying backup file: {backup_file}")
            try:
                loaded_set = self._read_ids(backup_file)
                logger.info(f"Recovered {len(loaded_set):,} reels from backup")
                return loaded_set
            except Exception as backup_error:
                logger.error(f"Error loading backup file {backup_file}: {backup_error}")

        return set()

    def mark_processed(self, reel_id: str):
        """Mark a reel as processed (buffered write); empty or already-known ids are ignored."""
        with self.lock:
            if reel_id and reel_id not in self.processed:
                self.processed.add(reel_id)
                self._buffer.append(reel_id)

                # Check if we should flush
                should_flush = (
                    len(self._buffer) >= self._buffer_size or
                    (time.time() - self._last_flush) >= self.flush_interval
                )

                if should_flush:
                    self.flush()

    def flush(self):
        """
        Append buffered ids to the progress file.

        mark_processed() calls this while already holding ``self.lock`` (a
        non-reentrant Lock), so this method must not acquire the lock itself.
        On a write error the buffer is kept, so the ids are retried on the next
        flush; a duplicated line is harmless because ids are loaded into a set.
        """
        if not self._buffer:
            return

        import shutil

        pending = list(self._buffer)

        # Best-effort backup. The file is COPIED, never renamed: renaming it away
        # would make the append below start a fresh file holding only this batch.
        if self.progress_file.exists():
            try:
                shutil.copy2(self.progress_file, self._backup_path())
            except Exception as e:
                logger.warning(f"Could not back up progress file before flush: {e}")

        try:
            # Append new entries
            with open(self.progress_file, 'a', encoding='utf-8') as f:
                f.write('\n'.join(pending) + '\n')

            # Drop only what was written
            del self._buffer[:len(pending)]
            self._last_flush = time.time()
            logger.debug(f"Progress flushed: {len(pending)} entries")

        except Exception as e:
            logger.error(f"Error flushing progress: {e}")

    def is_processed(self, reel_id: str) -> bool:
        """Check if reel has been processed"""
        with self.lock:
            return reel_id in self.processed

    def get_stats(self) -> Dict:
        """Get progress statistics (processed count, pending buffer size, last flush time, loaded count)."""
        with self.lock:
            return {
                "total_processed": len(self.processed),
                "buffer_size": len(self._buffer),
                "last_flush": self._last_flush,
                "loaded_count": self.loaded_count
            }

    def save_backup(self):
        """Copy the progress file to a timestamped ``.progress.<epoch>.bak`` sibling, if it exists."""
        try:
            if self.progress_file.exists():
                backup_file = self.progress_file.with_suffix(f'.progress.{int(time.time())}.bak')
                import shutil
                shutil.copy2(self.progress_file, backup_file)
                logger.debug(f"Progress backup created: {backup_file}")
        except Exception as e:
            logger.error(f"Error creating progress backup: {e}")

# Module-level progress tracker; constructing it loads PROGRESS_FILE if that file exists
progress_tracker = ProgressTracker(PROGRESS_FILE)

# ============================================================
# CACHE MANAGEMENT
# ============================================================

class PipelineCache:
    """
    Cache for intermediate pipeline results with versioning and invalidation.

    Each entry is one JSON file named after the MD5 digest of
    ``reel_id::stage::cache_version``. The stored payload is the caller's dict
    plus ``_cache_*``, ``_reel_id`` and ``_stage`` metadata keys; ``get``
    returns the payload including those metadata keys.
    """

    def __init__(self, cache_dir: Path):
        """
        Args:
            cache_dir: Directory holding the cache files; created (with
                parents) if it does not exist.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Cache statistics
        self.hits = 0
        self.misses = 0
        self.writes = 0
        self.lock = Lock()

        # Version for cache invalidation
        self.cache_version = CONFIG.get("CACHE_VERSION", "v1")

        logger.info(f"Pipeline cache initialized at {cache_dir}, version: {self.cache_version}")

    def _get_cache_key(self, reel_id: str, stage: str) -> str:
        """Generate cache key with version"""
        # Deterministic key; MD5 is used only to derive a file name
        key_data = f"{reel_id}::{stage}::{self.cache_version}"
        return hashlib.md5(key_data.encode()).hexdigest()

    def _get_cache_file(self, reel_id: str, stage: str) -> Path:
        """Get cache file path"""
        cache_key = self._get_cache_key(reel_id, stage)
        return self.cache_dir / f"{cache_key}.json"

    def _record_miss(self):
        """Count one cache miss."""
        with self.lock:
            self.misses += 1

    def get(self, reel_id: str, stage: str) -> Optional[Dict]:
        """
        Get cached result.

        Returns:
            The stored dict (metadata keys included) when a non-expired entry
            exists, otherwise None. Expired entries are deleted.
        """
        cache_file = self._get_cache_file(reel_id, stage)

        if not cache_file.exists():
            self._record_miss()
            return None

        try:
            data = atomic_read(cache_file, default=None)
            if data:
                # Check if cache is still valid
                cache_timestamp = data.get("_cache_timestamp", 0)
                cache_ttl = data.get("_cache_ttl", 7 * 24 * 3600)  # Default 7 days

                if time.time() - cache_timestamp < cache_ttl:
                    with self.lock:
                        self.hits += 1
                    return data

                logger.debug(f"Cache expired for {reel_id}::{stage}")
                cache_file.unlink(missing_ok=True)

        except Exception as e:
            logger.warning(f"Error reading cache {cache_file}: {e}")

        self._record_miss()
        return None

    def set(self, reel_id: str, stage: str, data: Dict, ttl: int = 7 * 24 * 3600):
        """
        Set cached result with TTL.

        Args:
            reel_id: Reel the result belongs to.
            stage: Pipeline stage that produced the result.
            data: Result dict; empty/None values are not cached.
            ttl: Lifetime in seconds (default 7 days).
        """
        if not data:
            return

        try:
            # Add cache metadata (on a copy, so the caller's dict is untouched)
            data_with_meta = data.copy()
            data_with_meta["_cache_timestamp"] = time.time()
            data_with_meta["_cache_ttl"] = ttl
            data_with_meta["_cache_version"] = self.cache_version
            data_with_meta["_reel_id"] = reel_id
            data_with_meta["_stage"] = stage

            cache_file = self._get_cache_file(reel_id, stage)
            atomic_write(data_with_meta, cache_file)

            with self.lock:
                self.writes += 1

            logger.debug(f"Cached {reel_id}::{stage}")

        except Exception as e:
            logger.error(f"Error writing cache for {reel_id}::{stage}: {e}")

    def _read_entry_meta(self, cache_file: Path) -> Optional[Dict]:
        """Return the stored dict of one cache file, or None if it is unreadable or not a dict."""
        try:
            data = atomic_read(cache_file, default=None)
        except Exception:
            return None
        return data if isinstance(data, dict) else None

    @staticmethod
    def _reel_matches(stored_reel_id, wanted: str) -> bool:
        """Compare a stored reel id with the requested one; '*' in ``wanted`` enables glob matching."""
        if stored_reel_id is None:
            return False
        if "*" in wanted:
            from fnmatch import fnmatchcase
            return fnmatchcase(str(stored_reel_id), wanted)
        return str(stored_reel_id) == wanted

    def invalidate(self, reel_id: str = None, stage: str = None):
        """
        Invalidate cache entries.

        Args:
            reel_id: With ``stage``: drop that single entry. Alone: drop every
                entry of the reel (a ``*`` makes it a glob pattern).
            stage: Alone: drop every entry of that stage.

        With neither argument, every file in the cache directory is removed.
        File names are hashes, so reel/stage-wide invalidation matches on the
        ``_reel_id`` / ``_stage`` metadata stored inside each entry.
        """
        try:
            if reel_id and stage:
                # Invalidate specific entry
                cache_file = self._get_cache_file(reel_id, stage)
                cache_file.unlink(missing_ok=True)
                logger.debug(f"Invalidated cache for {reel_id}::{stage}")

            elif reel_id or stage:
                for cache_file in self.cache_dir.glob("*.json"):
                    meta = self._read_entry_meta(cache_file)
                    if meta is None:
                        continue

                    if reel_id:
                        matched = self._reel_matches(meta.get("_reel_id"), reel_id)
                    else:
                        matched = meta.get("_stage") == stage

                    if matched:
                        cache_file.unlink(missing_ok=True)

                if reel_id:
                    logger.debug(f"Invalidated all cache for reel {reel_id}")
                else:
                    logger.debug(f"Invalidated all cache for stage {stage}")

            else:
                # Invalidate all cache
                for cache_file in self.cache_dir.glob("*"):
                    if cache_file.is_file():
                        cache_file.unlink(missing_ok=True)
                logger.info("Invalidated all cache")

        except Exception as e:
            logger.error(f"Error invalidating cache: {e}")

    def get_stats(self) -> Dict:
        """Get cache statistics (hit/miss/write counters plus file count and total size on disk)."""
        # Filesystem scan happens outside the lock; the lock only guards counters
        cache_files = list(self.cache_dir.glob("*.json"))
        total_bytes = 0
        for cache_file in cache_files:
            try:
                total_bytes += cache_file.stat().st_size
            except OSError:
                # File removed between glob() and stat()
                continue
        cache_size_mb = total_bytes / (1024 * 1024)

        with self.lock:
            total = self.hits + self.misses
            hit_rate = self.hits / total if total > 0 else 0

            return {
                "hits": self.hits,
                "misses": self.misses,
                "writes": self.writes,
                "hit_rate": hit_rate,
                "cache_files": len(cache_files),
                "cache_size_mb": round(cache_size_mb, 2),
                "cache_dir": str(self.cache_dir),
                "cache_version": self.cache_version
            }

    def cleanup_old_entries(self, max_age_days: int = 30):
        """
        Cleanup old cache entries.

        Deletes every ``*.json`` entry whose ``_cache_timestamp`` is older
        than ``max_age_days``. Entries that cannot be read, or that carry no
        timestamp, are deleted as well.
        """
        try:
            now = time.time()
            max_age_seconds = max_age_days * 24 * 3600
            deleted = 0

            for cache_file in self.cache_dir.glob("*.json"):
                try:
                    # The whole entry is loaded just to read its timestamp
                    data = atomic_read(cache_file, default={})
                    cache_timestamp = data.get("_cache_timestamp", 0)

                    if now - cache_timestamp > max_age_seconds:
                        cache_file.unlink()
                        deleted += 1
                except Exception:
                    # If can't read, delete anyway
                    cache_file.unlink(missing_ok=True)
                    deleted += 1

            if deleted > 0:
                logger.info(f"Cleaned up {deleted} old cache entries (> {max_age_days} days)")

        except Exception as e:
            logger.error(f"Error cleaning up cache: {e}")

# Module-level cache instance rooted at CONFIG["CACHE_DIR"]; used by the wrappers below
pipeline_cache = PipelineCache(CONFIG["CACHE_DIR"])

# Convenience functions
def get_cached_result(reel_id: str, stage: str) -> Optional[Dict]:
    """
    Get cached result (convenience wrapper).

    Delegates to ``pipeline_cache.get`` and returns its result unchanged.
    """
    return pipeline_cache.get(reel_id, stage)

def save_cached_result(reel_id: str, stage: str, data: Dict, ttl: int = 7 * 24 * 3600):
    """
    Save result to cache (convenience wrapper).

    Delegates to ``pipeline_cache.set``; ``ttl`` is in seconds and defaults
    to 7 days.
    """
    pipeline_cache.set(reel_id, stage, data, ttl)

# ============================================================
# PROMPT VERSION LIFECYCLE MANAGEMENT
# ============================================================

class PromptVersionManager:
    """
    Manage prompt versions with auto-deprecation and success tracking.

    Per-version statistics are kept as plain dicts and lists so that the same
    structure can be written to disk and loaded back without changing shape.
    """

    # Number of most recent attempts remembered per version
    _RECENT_WINDOW = 20
    # Statistics are written to disk every this many recorded attempts
    _SAVE_EVERY = 10

    def __init__(self, stats_file: Path):
        """
        Args:
            stats_file: File the statistics are loaded from and saved to.
        """
        self.stats_file = stats_file
        self.stats = self._load_stats()
        self.lock = Lock()
        self._dirty = False

        # Deprecation thresholds
        self.min_attempts_for_deprecation = 8
        self.deprecation_threshold = 0.35  # 35% success rate
        self.recovery_threshold = 0.45     # 45% success rate to recover
        self.recovery_check_interval = 50   # Check every 50 attempts

        logger.info(f"Prompt version manager initialized: {len(self.stats):,} versions tracked")

    @staticmethod
    def _new_version_stats(details: Optional[Dict]) -> Dict:
        """Build the statistics record for a version that has not been seen before."""
        return {
            "attempts": 0,
            "successes": 0,
            "mechanical_failures": 0,
            "deprecated": False,
            "success_rate": 0.0,
            "last_failure_stage": None,
            "last_success_timestamp": None,
            "first_seen": time.time(),
            "stage_stats": {},        # stage -> {"attempts": n, "successes": n}
            "recent_attempts": [],    # newest last, at most _RECENT_WINDOW (success, timestamp) pairs
            "metadata": details or {}
        }

    @classmethod
    def _normalize_entry(cls, entry: Dict) -> Dict:
        """Fill in missing fields of a loaded record and coerce its containers to plain dict/list."""
        normalized = cls._new_version_stats(None)
        normalized.update(entry)

        stage_stats = {}
        for stage, counts in dict(normalized.get("stage_stats") or {}).items():
            counts = counts if isinstance(counts, dict) else {}
            stage_stats[stage] = {
                "attempts": counts.get("attempts", 0),
                "successes": counts.get("successes", 0)
            }
        normalized["stage_stats"] = stage_stats

        recent = list(normalized.get("recent_attempts") or [])
        normalized["recent_attempts"] = recent[-cls._RECENT_WINDOW:]
        normalized["metadata"] = normalized.get("metadata") or {}
        return normalized

    def _load_stats(self) -> Dict:
        """Load prompt version statistics; returns {} when the file is missing or unreadable."""
        if not self.stats_file.exists():
            return {}

        try:
            data = atomic_read(self.stats_file, default={})
            versions = data.get("versions", {})
            return {
                version: self._normalize_entry(entry)
                for version, entry in versions.items()
                if isinstance(entry, dict)
            }
        except Exception as e:
            logger.error(f"Error loading prompt version stats: {e}")
            return {}

    def _save_stats(self):
        """Save prompt version statistics if anything changed; the caller is expected to hold ``self.lock``."""
        if not self._dirty:
            return

        try:
            data = {
                "versions": self.stats,
                "last_updated": time.time(),
                "total_versions": len(self.stats)
            }
            atomic_write(data, self.stats_file)
            self._dirty = False
            logger.debug(f"Saved prompt version stats: {len(self.stats):,} versions")
        except Exception as e:
            logger.error(f"Error saving prompt version stats: {e}")

    def record_result(self, version: str, success: bool, stage: str = "unknown",
                     details: Dict = None):
        """
        Record prompt version result.

        Args:
            version: Prompt version identifier
            success: Whether the prompt was successful
            stage: Which pipeline stage; the special value "mechanical" only
                increments the mechanical-failure counter
            details: Additional details about the attempt (stored as metadata
                when the version is first seen)
        """
        with self.lock:
            stats = self.stats.get(version)
            if stats is None:
                stats = self.stats[version] = self._new_version_stats(details)

            self._dirty = True

            # Mechanical failures don't count against prompt quality
            if stage == "mechanical":
                stats["mechanical_failures"] += 1
                return

            now = time.time()

            # Record attempt, keeping only the most recent window
            stats["attempts"] += 1
            recent = stats["recent_attempts"]
            recent.append((success, now))
            if len(recent) > self._RECENT_WINDOW:
                del recent[:-self._RECENT_WINDOW]

            # Update stage statistics (stages are created on first use)
            stage_entry = stats["stage_stats"].setdefault(stage, {"attempts": 0, "successes": 0})
            stage_entry["attempts"] += 1
            if success:
                stats["successes"] += 1
                stage_entry["successes"] += 1
                stats["last_success_timestamp"] = now
            else:
                stats["last_failure_stage"] = stage

            # Calculate success rate
            stats["success_rate"] = stats["successes"] / stats["attempts"]

            # Auto-deprecation logic
            if (stats["attempts"] >= self.min_attempts_for_deprecation and
                stats["success_rate"] < self.deprecation_threshold and
                not stats["deprecated"]):

                stats["deprecated"] = True
                logger.warning(f"🚫 Auto-deprecated prompt version '{version}' "
                              f"(success rate: {stats['success_rate']:.1%} after {stats['attempts']} attempts)")

            # Recovery logic for deprecated prompts
            if (stats["deprecated"] and
                stats["attempts"] % self.recovery_check_interval == 0):

                if stats["success_rate"] >= self.recovery_threshold:
                    stats["deprecated"] = False
                    logger.info(f"♻️  Un-deprecated prompt version '{version}' "
                               f"(success rate recovered: {stats['success_rate']:.1%})")

            # Periodic save
            if stats["attempts"] % self._SAVE_EVERY == 0:
                self._save_stats()

    def is_deprecated(self, version: str) -> bool:
        """Check if a prompt version is deprecated (unknown versions are not)."""
        with self.lock:
            if version not in self.stats:
                return False

            return self.stats[version].get("deprecated", False)

    def get_version_stats(self, version: str) -> Optional[Dict]:
        """
        Get statistics for a specific version.

        Returns:
            A copy of the version's record with an added
            ``recent_success_rate`` (over the remembered recent attempts) when
            any exist, or None for an unknown version.
        """
        with self.lock:
            if version not in self.stats:
                return None

            stats = self.stats[version].copy()

            # Success rate over the remembered recent attempts
            recent_attempts = list(stats.get("recent_attempts", []))
            if recent_attempts:
                recent_successes = sum(1 for success, _ in recent_attempts if success)
                stats["recent_success_rate"] = recent_successes / len(recent_attempts)

            # Hand out copies so callers cannot mutate the tracked state
            stats["recent_attempts"] = recent_attempts
            stats["stage_stats"] = {
                stage: dict(counts) for stage, counts in stats["stage_stats"].items()
            }

            return stats

    def get_best_version_for_stage(self, stage: str, min_attempts: int = 10) -> Optional[str]:
        """
        Get the best prompt version for a specific stage.

        Only non-deprecated versions with at least ``min_attempts`` attempts
        in that stage are considered; returns None when there is none.
        """
        with self.lock:
            best_version = None
            best_success_rate = -1

            for version, stats in self.stats.items():
                stage_stat = stats["stage_stats"].get(stage, {"attempts": 0, "successes": 0})

                if (stage_stat["attempts"] >= min_attempts and
                    not stats["deprecated"]):

                    success_rate = stage_stat["successes"] / stage_stat["attempts"]

                    if success_rate > best_success_rate:
                        best_success_rate = success_rate
                        best_version = version

            if best_version:
                logger.debug(f"Best version for {stage}: {best_version} ({best_success_rate:.1%})")

            return best_version

    def get_overall_stats(self) -> Dict:
        """Get overall statistics aggregated across all versions and stages."""
        with self.lock:
            total_versions = len(self.stats)
            active_versions = sum(1 for v in self.stats.values() if not v["deprecated"])
            deprecated_versions = total_versions - active_versions

            # Calculate aggregate success rate
            total_attempts = sum(v["attempts"] for v in self.stats.values())
            total_successes = sum(v["successes"] for v in self.stats.values())
            overall_success_rate = total_successes / total_attempts if total_attempts > 0 else 0

            # Success rate by stage
            stage_totals = {}
            for version_stats in self.stats.values():
                for stage, counts in version_stats["stage_stats"].items():
                    totals = stage_totals.setdefault(stage, {"attempts": 0, "successes": 0})
                    totals["attempts"] += counts["attempts"]
                    totals["successes"] += counts["successes"]

            stage_success_rates = {
                stage: totals["successes"] / totals["attempts"]
                for stage, totals in stage_totals.items()
                if totals["attempts"] > 0
            }

            return {
                "total_versions": total_versions,
                "active_versions": active_versions,
                "deprecated_versions": deprecated_versions,
                "overall_success_rate": overall_success_rate,
                "total_attempts": total_attempts,
                "total_successes": total_successes,
                "stage_success_rates": stage_success_rates,
                "top_versions": self._get_top_versions(5)
            }

    def _get_top_versions(self, n: int = 5) -> List[Dict]:
        """Get top n versions by success rate (only versions with at least 10 attempts)."""
        versions_with_stats = []

        for version, stats in self.stats.items():
            if stats["attempts"] >= 10:  # Only consider versions with enough attempts
                versions_with_stats.append({
                    "version": version,
                    "success_rate": stats["success_rate"],
                    "attempts": stats["attempts"],
                    "deprecated": stats["deprecated"],
                    "last_success": stats.get("last_success_timestamp")
                })

        # Sort by success rate, then by attempts
        versions_with_stats.sort(key=lambda x: (x["success_rate"], x["attempts"]), reverse=True)

        return versions_with_stats[:n]

    def save(self):
        """Save statistics to disk"""
        with self.lock:
            self._save_stats()

# Module-level prompt version manager backed by PROMPT_VERSION_FILE; used by the wrappers below
prompt_version_manager = PromptVersionManager(PROMPT_VERSION_FILE)

# Convenience functions
def record_prompt_version_result(version: str, success: bool, stage: str = "unknown", details: Optional[Dict] = None) -> None:
    """Record prompt version result (convenience wrapper).

    Forwards all arguments unchanged to
    ``prompt_version_manager.record_result``.

    Args:
        version: Prompt version identifier.
        success: Whether the attempt succeeded.
        stage: Pipeline stage label; defaults to "unknown".
        details: Optional extra context passed through to the manager.
    """
    prompt_version_manager.record_result(version, success, stage, details)

def is_prompt_version_deprecated(version: str) -> bool:
    """Check if prompt version is deprecated (convenience wrapper).

    Returns whatever ``prompt_version_manager.is_deprecated(version)`` returns.
    """
    return prompt_version_manager.is_deprecated(version)

# ============================================================
# CONFIDENCE CALIBRATION SYSTEM
# ============================================================

class ConfidenceCalibrator:
    """
    Calibrate confidence scores based on historical accuracy.

    Outcomes are grouped into fixed confidence buckets ("0.5-0.6" ... "0.9-1.0").
    ``calibration_data`` is a dict that may contain:

    * ``"buckets"``: per-bucket ``total`` / ``accepted`` counters plus an
      optional ``by_topic_class`` mapping with the same counters per topic class
    * ``"overall"``: ``total`` / ``accepted`` counters across all buckets
    * ``"calibration_factor"``: global multiplier applied to raw confidences

    All public methods take ``self.lock``; the underscore-prefixed helpers
    expect the caller to hold it where shared state is touched.
    """

    def __init__(self, calibration_file: Path):
        self.calibration_file = calibration_file
        self.calibration_data = self._load_calibration()
        self.lock = Lock()
        self._dirty = False

        # Calibration parameters
        self.min_samples_per_bucket = 10
        self.calibration_update_interval = 10  # Update every 10 samples

        logger.info("Confidence calibrator initialized")

    @staticmethod
    def _default_calibration() -> Dict:
        """Return a private copy of the CONFIDENCE_CALIBRATION defaults.

        track_outcome() mutates ``calibration_data`` in place, so handing out
        the module-level constant itself would silently rewrite the defaults.
        """
        import copy  # local import: only this helper needs it

        try:
            return copy.deepcopy(dict(CONFIDENCE_CALIBRATION))
        except TypeError:
            # Defaults hold something that cannot be deep-copied; a shallow
            # copy still keeps top-level writes away from the shared constant.
            return dict(CONFIDENCE_CALIBRATION)

    @staticmethod
    def _bucket_midpoint(bucket_name: str) -> float:
        """Return the midpoint of a "low-high" bucket name (e.g. "0.7-0.8" -> 0.75)."""
        low, high = map(float, bucket_name.split('-'))
        return (low + high) / 2

    def _load_calibration(self) -> Dict:
        """Load calibration data, falling back to a copy of the defaults."""
        if not self.calibration_file.exists():
            return self._default_calibration()  # Default from CELL 2

        try:
            data = atomic_read(self.calibration_file, default={})
            calibration = data.get("calibration")
            if isinstance(calibration, dict):
                return calibration
            # Missing or malformed payload: start from the defaults instead.
            return self._default_calibration()
        except Exception as e:
            logger.error(f"Error loading confidence calibration: {e}")
            return self._default_calibration()

    def _save_calibration(self):
        """Save calibration data (no-op when nothing changed since the last save)."""
        if not self._dirty:
            return

        try:
            data = {
                "calibration": self.calibration_data,
                "last_updated": time.time(),
                "version": "v2"
            }
            atomic_write(data, self.calibration_file)
            self._dirty = False
            logger.debug("Saved confidence calibration")
        except Exception as e:
            # Best effort: keep _dirty set so a later save() retries.
            logger.error(f"Error saving confidence calibration: {e}")

    def _get_bucket(self, confidence: float) -> Optional[str]:
        """Get bucket for confidence score; None when outside [0.5, 1.0]."""
        if 0.5 <= confidence < 0.6:
            return "0.5-0.6"
        elif 0.6 <= confidence < 0.7:
            return "0.6-0.7"
        elif 0.7 <= confidence < 0.8:
            return "0.7-0.8"
        elif 0.8 <= confidence < 0.9:
            return "0.8-0.9"
        elif 0.9 <= confidence <= 1.0:
            return "0.9-1.0"
        return None

    def track_outcome(self, confidence: float, accepted: bool,
                     topic_class: str = "", details: Dict = None):
        """
        Track confidence outcome for calibration.

        Args:
            confidence: Predicted confidence score
            accepted: Whether the card was actually accepted
            topic_class: Topic classification for stratified calibration
            details: Additional context (accepted for API compatibility; not stored)
        """
        bucket = self._get_bucket(confidence)
        if not bucket:
            # Scores outside the tracked buckets are not recorded.
            return

        with self.lock:
            buckets = self.calibration_data.setdefault("buckets", {})
            bucket_data = buckets.setdefault(
                bucket, {"total": 0, "accepted": 0, "by_topic_class": {}}
            )

            bucket_data["total"] += 1
            if accepted:
                bucket_data["accepted"] += 1

            # Track by topic class if provided
            if topic_class:
                # setdefault (rather than relying on a defaultdict) so that
                # buckets whose by_topic_class is a plain dict also accept
                # topic classes they have not seen before.
                by_topic = bucket_data.setdefault("by_topic_class", {})
                topic_stats = by_topic.setdefault(topic_class, {"total": 0, "accepted": 0})
                topic_stats["total"] += 1
                if accepted:
                    topic_stats["accepted"] += 1

            # Track overall statistics
            overall = self.calibration_data.setdefault("overall", {"total": 0, "accepted": 0})
            overall["total"] += 1
            if accepted:
                overall["accepted"] += 1

            # Update calibration factor periodically
            if bucket_data["total"] % self.calibration_update_interval == 0:
                self._update_calibration_factor()

            self._dirty = True

    def _update_calibration_factor(self):
        """Update global calibration factor based on all buckets.

        Only buckets with at least ``min_samples_per_bucket`` samples
        contribute; each is weighted by its sample count.
        """
        if "buckets" not in self.calibration_data:
            return

        total_samples = 0
        total_discrepancy = 0.0

        for bucket_name, bucket_data in self.calibration_data["buckets"].items():
            if bucket_data["total"] >= self.min_samples_per_bucket:
                expected_accuracy = self._bucket_midpoint(bucket_name)
                actual_accuracy = bucket_data["accepted"] / bucket_data["total"]

                # Weight discrepancy by number of samples
                weight = bucket_data["total"]
                total_discrepancy += (actual_accuracy - expected_accuracy) * weight
                total_samples += weight

        if total_samples > 0:
            avg_discrepancy = total_discrepancy / total_samples
            self.calibration_data["calibration_factor"] = 1.0 - avg_discrepancy * 0.5

            logger.debug(f"Updated calibration factor: {self.calibration_data['calibration_factor']:.3f}")

    def calibrate_confidence(self, raw_confidence: float, topic_class: str = "") -> float:
        """
        Calibrate confidence score based on historical accuracy.

        Args:
            raw_confidence: Raw confidence score (0-1)
            topic_class: Optional topic class for stratified calibration

        Returns:
            Calibrated confidence score, clamped to [0, 1]
        """
        with self.lock:
            # Apply global calibration factor
            calibration_factor = self.calibration_data.get("calibration_factor", 1.0)
            calibrated = raw_confidence * calibration_factor

            # Apply topic-specific adjustment if available
            if topic_class and "buckets" in self.calibration_data:
                bucket = self._get_bucket(raw_confidence)
                if bucket and bucket in self.calibration_data["buckets"]:
                    bucket_data = self.calibration_data["buckets"][bucket]

                    if ("by_topic_class" in bucket_data and
                        topic_class in bucket_data["by_topic_class"]):

                        topic_stats = bucket_data["by_topic_class"][topic_class]
                        # Require a handful of samples before trusting the topic rate.
                        if topic_stats["total"] >= 5:
                            topic_accuracy = topic_stats["accepted"] / topic_stats["total"]
                            expected_accuracy = self._bucket_midpoint(bucket)

                            adjustment = (topic_accuracy - expected_accuracy) * 0.3
                            calibrated = min(max(calibrated + adjustment, 0), 1)

            return min(max(calibrated, 0), 1)  # Clamp to [0, 1]

    def get_calibration_stats(self) -> Dict:
        """Get calibration statistics.

        Returns a shallow copy of ``calibration_data`` extended with
        ``avg_calibration_error`` (when any bucket has enough samples),
        ``calibration_factor`` and ``total_samples``.
        """
        with self.lock:
            stats = self.calibration_data.copy()

            # Calculate calibration quality
            if "buckets" in stats:
                calibration_quality = []
                for bucket_name, bucket_data in stats["buckets"].items():
                    if bucket_data["total"] >= self.min_samples_per_bucket:
                        expected = self._bucket_midpoint(bucket_name)
                        actual = bucket_data["accepted"] / bucket_data["total"]
                        calibration_quality.append(abs(actual - expected))

                if calibration_quality:
                    stats["avg_calibration_error"] = sum(calibration_quality) / len(calibration_quality)

            # Add metadata
            stats["calibration_factor"] = stats.get("calibration_factor", 1.0)
            stats["total_samples"] = stats.get("overall", {}).get("total", 0)

            return stats

    def save(self):
        """Save calibration data"""
        with self.lock:
            self._save_calibration()

# Initialize confidence calibrator
# Module-level instance, constructed from CONFIDENCE_CALIBRATION_FILE when this
# cell runs; the convenience wrappers below delegate to it.
confidence_calibrator = ConfidenceCalibrator(CONFIDENCE_CALIBRATION_FILE)

# Convenience functions
def track_confidence_outcome(confidence: float, accepted: bool, topic_class: str = "", details: Optional[Dict] = None) -> None:
    """Track confidence outcome (convenience wrapper).

    Forwards all arguments unchanged to ``confidence_calibrator.track_outcome``.

    Args:
        confidence: Predicted confidence score.
        accepted: Whether the card was actually accepted.
        topic_class: Optional topic class for stratified tracking.
        details: Optional extra context passed through to the calibrator.
    """
    confidence_calibrator.track_outcome(confidence, accepted, topic_class, details)

def calibrate_confidence(raw_confidence: float, topic_class: str = "") -> float:
    """Calibrate confidence score (convenience wrapper).

    Returns the result of
    ``confidence_calibrator.calibrate_confidence(raw_confidence, topic_class)``.
    """
    return confidence_calibrator.calibrate_confidence(raw_confidence, topic_class)

# ============================================================
# STAGE 0: TRANSCRIPT NORMALIZATION
# ============================================================

def normalize_transcript(transcript: str, caption: str) -> str:
    """
    PHASE 2: Rewrite a spoken Instagram Reel transcript as declarative prose.

    The original transcript is returned unchanged whenever normalization is
    disabled, the text is too short, it already looks structured, the model
    output fails a sanity check, or any error occurs.

    Args:
        transcript: Raw transcript text
        caption: Video caption for context

    Returns:
        Normalized transcript, or the original transcript as a fallback
    """
    normalization_enabled = CONFIG.get("ENABLE_TRANSCRIPT_NORMALIZATION", True)
    if not normalization_enabled:
        return transcript

    too_short = not transcript or len(transcript.strip()) < 50
    if too_short:
        return transcript

    # Count conversational markers to decide whether normalization is needed
    lowered = transcript.lower()
    filler_markers = ("so ", "well ", "let's ", "hello ", " follow ", " follow @")
    filler_count = sum(lowered.count(marker) for marker in filler_markers)

    # Long transcripts with almost no fillers are treated as already structured
    if filler_count < 2 and len(transcript.split()) > 100:
        return transcript

    logger.debug(f"Normalizing transcript ({len(transcript)} chars, {filler_count} fillers)")

    prompt = f"""You are a technical content normalizer for educational flashcards. Transform spoken Instagram Reel transcripts into structured technical prose.

INPUT TRANSCRIPT (spoken/conversational):
{transcript}

TOPIC/CAPTION:
{caption}

CRITICAL RULES:
1. Convert speech → declarative statements
2. Remove ALL non-technical content:
   - Filler: "so", "well", "hello", "let's see", "right?", "okay"
   - Self-reference: "I have created", "I will show", "comment link", "DM for"
   - CTAs: "go ahead", "link in bio", "check out", "share with your friends"
3. Extract ONLY what was explicitly said or clearly implied
4. Preserve: ALL technical terms, code examples, specific numbers/metrics
5. Format as neutral technical explanation

OUTPUT FORMAT (JSON):
{{
  "normalized_content": "Technical explanation in declarative prose",
  "key_concepts": ["term1", "term2"],
  "has_code_example": true/false,
  "content_type": "explanation|warning|comparison|definition"
}}

Return ONLY valid JSON:"""

    try:
        started_at = time.time()
        response_text = llm(prompt, PIPELINE_MODELS['normalize'])
        result = extract_json(response_text, lenient=True)
        latency = time.time() - started_at

        PROCESSING_METRICS.record_timing("transcript_normalization", latency)
        ROUTING_METRICS.increment("transcript_normalization_attempts")

        normalized = result.get("normalized_content", "").strip()
        content_type = result.get("content_type", "explanation")

        if not normalized or len(normalized) < 50:
            logger.debug("Normalization skipped: no technical content extracted")
            ROUTING_METRICS.increment("transcript_normalization_skipped")
            return transcript

        # Sanity-check the output length against the input length
        original_words = len(transcript.split())
        normalized_words = len(normalized.split())
        if original_words > 0:
            ratio = normalized_words / original_words
        else:
            ratio = 0

        # Too much content dropped: keep the original
        if ratio < 0.5:
            logger.warning(f"Normalization suspicious: {ratio:.1%} of original length")
            ROUTING_METRICS.increment("transcript_normalization_suspicious")
            return transcript

        # Output much longer than input: likely invented content
        if ratio > 2.0:
            logger.warning(f"Normalization rejected: {ratio:.1%} of original (hallucination risk)")
            ROUTING_METRICS.increment("transcript_normalization_rejected")
            return transcript

        if normalized.lower().strip() == lowered.strip():
            logger.debug("Normalization unchanged")
            return transcript

        logger.info(f"✨ Normalized: {original_words}w → {normalized_words}w ({ratio:.0%}, {content_type}, {latency:.2f}s)")
        ROUTING_METRICS.increment("transcript_normalization_successful")

        return normalized

    except Exception as e:
        # Any failure (model call, parsing, unexpected shape) falls back to the input
        logger.error(f"Normalization error: {e}")
        ROUTING_METRICS.increment("transcript_normalization_errors")
        return transcript

# ============================================================
# INITIALIZATION LOGGING
# ============================================================

# Startup summary, one INFO line per component. Each line queries a
# module-level object (progress_tracker, pipeline_cache, prompt_version_manager,
# confidence_calibrator), so this block raises if any of them is not defined yet.
logger.info("=" * 60)
logger.info("PROMPT MANAGEMENT INITIALIZED")
logger.info("=" * 60)
logger.info(f"Progress tracker: {progress_tracker.get_stats()['total_processed']:,} reels")
logger.info(f"Pipeline cache: {pipeline_cache.get_stats()['cache_files']:,} entries")
logger.info(f"Prompt versions: {len(prompt_version_manager.stats):,} versions tracked")
logger.info(f"Confidence calibrator: {confidence_calibrator.get_calibration_stats().get('total_samples', 0):,} samples")
logger.info("=" * 60)

In [None]:
# ============================================================
# CELL 7: MAIN PIPELINE & ORCHESTRATION (COMPLETE)
# ============================================================

import time
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Optional, Tuple, Any
import pandas as pd
import math

# NOTE(review): this rebinds the module-level `logger`. Notebook cells share one
# global namespace and functions look up `logger` at call time, so functions
# defined in earlier cells will presumably also log under this name once this
# cell has run. Confirm that is intended.
logger = logging.getLogger("anki-pipeline.orchestration")

# ============================================================
# CORE PIPELINE COMPONENTS
# ============================================================

def compact_atoms_for_generation(atoms: Dict, max_chars: int = 2000) -> Dict:
    """
    Compact atoms before Stage-2 to prevent context overflow.

    Text fields are truncated and list fields are capped. Fields that are
    missing or explicitly ``None`` (e.g. ``"definition": null`` in model output)
    are treated as empty instead of raising, and ``related_concepts`` entries
    that are not dicts are skipped.

    The ``max_chars`` limit is best-effort: if the JSON form is still too long
    after the per-field caps, ``technical_points`` and ``related_concepts`` are
    trimmed further, but no additional truncation is attempted beyond that.

    Args:
        atoms: Full atoms dictionary
        max_chars: Target character limit for the JSON-serialized result

    Returns:
        Compacted atoms for generation
    """
    related_concepts = [
        {
            "name": (rc.get("name") or "")[:100],
            "why_relevant": (rc.get("why_relevant") or "")[:150],
            "key_points": (rc.get("key_points") or [])[:2]
        }
        # Cap first, then drop malformed entries, so well-formed input
        # produces exactly the same result as before.
        for rc in (atoms.get("related_concepts") or [])[:2]
        if isinstance(rc, dict)
    ]

    compacted = {
        "concept": (atoms.get("concept") or "")[:200],
        "category": atoms.get("category", ""),
        "definition": (atoms.get("definition") or "")[:300],
        "technical_points": (atoms.get("technical_points") or [])[:4],
        "solutions": (atoms.get("solutions") or [])[:2],
        "impact": (atoms.get("impact") or [])[:2],
        "has_tradeoffs": atoms.get("has_tradeoffs", False),
        "related_concepts": related_concepts
    }

    # Add topic class if available
    if "topic_class" in atoms:
        compacted["topic_class"] = atoms["topic_class"]

    # Trim progressively while the serialized form exceeds max_chars
    if len(json.dumps(compacted)) > max_chars:
        compacted["technical_points"] = compacted["technical_points"][:2]
        compacted["related_concepts"] = compacted["related_concepts"][:1]

        if len(json.dumps(compacted)) > max_chars:
            compacted["related_concepts"] = []

    return compacted

def validate_cloze(card: Dict, topic_class: str) -> bool:
    """
    Topic-aware cloze validation.

    Foundation: 2 clozes minimum, 60 chars minimum
    Intermediate/Advanced: 3 clozes minimum, 90 chars minimum

    A card that is not a dict, or whose ``cloze`` field is missing, ``None``
    or not a string, is reported as invalid instead of raising.

    Args:
        card: Card dictionary; the text is read from its ``"cloze"`` key
        topic_class: ``"foundation"`` selects the relaxed thresholds; any other
            value selects the stricter ones

    Returns:
        True when the card meets both the deletion-count and length minimums
    """
    if not isinstance(card, dict):
        return False

    # `or ""` mirrors quality_score_fallback: an explicit None counts as empty.
    cloze = card.get("cloze") or ""
    if not isinstance(cloze, str):
        return False

    # Count every deletion marker (not the number of distinct cN indices)
    cloze_count = len(re.findall(r'\{\{c\d+::', cloze))

    if topic_class == "foundation":
        min_deletions, min_length = 2, 60
    else:
        min_deletions, min_length = 3, 90

    return cloze_count >= min_deletions and len(cloze) >= min_length

# ============================================================
# STAGE 2: CARD GENERATION PROMPTS
# ============================================================

def stage2a_basic_prompt(atoms: Dict) -> str:
    """Build the Stage-2a prompt asking for 2-3 BASIC interview cards.

    The atoms are compacted first; the concept and category of the compacted
    atoms are interpolated into the example card of the prompt.
    """
    compact = compact_atoms_for_generation(atoms)
    atoms_json = json.dumps(compact, indent=2)
    concept_label = compact.get('concept', 'concept')
    domain = compact.get('category', 'java')

    return f"""Generate 2-3 BASIC interview cards.

Input:
{atoms_json}

CRITICAL RULES:
1. Generate 2–3 cards for the PRIMARY concept
2. Each card must be interview-ready with clear front/back
3. Use bullet points for clarity in back
4. Include concrete examples or code snippets
5. Focus on understanding, not just facts

OUTPUT JSON:
{{
  "cards": [
    {{
      "type": "basic",
      "concept_source": "primary",
      "front": "Clear question about {concept_label}",
      "back": "Detailed answer with bullet points and examples",
      "tags": ["domain:{domain}", "difficulty:medium"]
    }}
  ]
}}

Return ONLY valid JSON."""

def stage2b_cloze_prompt(atoms: Dict) -> str:
    """Build the Stage-2b prompt asking for 1-2 CLOZE deletion cards.

    The atoms are compacted first; the category of the compacted atoms is
    interpolated into the example card's domain tag.
    """
    compact = compact_atoms_for_generation(atoms)
    atoms_json = json.dumps(compact, indent=2)
    domain = compact.get('category', 'java')

    return f"""Generate 1-2 CLOZE deletion cards.

Input:
{atoms_json}

CRITICAL RULES:
1. Each card MUST have at least 2-3 deletions (topic-aware)
2. Prefer mechanisms, definitions, and key concepts for cloze
3. Length 60–180 characters (topic-aware)
4. Avoid filler text - focus on core technical content
5. Make deletions meaningful (key terms, not obvious fillers)

FORMAT:
Use {{{{c1:: }}}}, {{{{c2:: }}}}, {{{{c3:: }}}} syntax

OUTPUT JSON:
{{
  "cards": [
    {{
      "type": "cloze",
      "concept_source": "primary",
      "cloze": "Text with {{{{c1::term}}}} and {{{{c2::another}}}} deletion...",
      "tags": ["domain:{domain}", "difficulty:medium"]
    }}
  ]
}}

Return ONLY valid JSON."""

def stage2c_tradeoff_prompt(atoms: Dict) -> str:
    """Build the Stage-2c prompt asking for ONE TRADEOFF card.

    Returns an empty string when the atoms do not carry a truthy
    ``has_tradeoffs`` flag, so callers can skip the model call.
    """
    wants_tradeoff = atoms.get("has_tradeoffs")
    if not wants_tradeoff:
        return ""

    compact = compact_atoms_for_generation(atoms)
    atoms_json = json.dumps(compact, indent=2)
    concept_label = compact.get('concept', 'this concept')
    domain = compact.get('category', 'java')

    return f"""Generate ONE TRADEOFF card.

Input:
{atoms_json}

RULES:
1. Compare exactly 2–3 approaches
2. Each approach MUST have 2 PROS and 2 CONS
3. Focus on engineering tradeoffs (performance, complexity, maintainability)
4. If no explicit alternative exists, infer the most common real-world alternative

OUTPUT JSON:
{{
  "cards": [
    {{
      "type": "tradeoff",
      "concept_source": "primary",
      "front": "What are the trade-offs when implementing {concept_label}?",
      "tradeoffs": [
        {{
          "approach": "Approach name",
          "pros": ["Performance benefit...", "Simplicity..."],
          "cons": ["Memory overhead...", "Complexity..."]
        }}
      ],
      "tags": ["domain:{domain}", "difficulty:senior"]
    }}
  ]
}}

Return ONLY valid JSON."""

def generate_basic_cards(atoms: Dict) -> List[Dict]:
    """Generate basic cards from atoms.

    Sends the Stage-2a prompt to the model configured under
    ``PIPELINE_MODELS['generate_basic']`` and parses the reply leniently.

    Args:
        atoms: Extracted atoms for one reel

    Returns:
        A list of card dicts. Always a list: an unusable reply (no output,
        ``"cards": null``, a non-list ``cards`` value) or any error yields
        ``[]``, and entries that are not dicts are dropped.
    """
    try:
        prompt = stage2a_basic_prompt(atoms)
        raw = llm(prompt, PIPELINE_MODELS['generate_basic'])
        output = extract_json(raw, lenient=True)

        if not output:
            logger.debug("Basic card generation returned no output")
            return []

        # The model may answer with a bare list or with {"cards": [...]}
        if isinstance(output, list):
            candidates = output
        elif isinstance(output, dict):
            # `or []` covers an explicit `"cards": null`, which would otherwise
            # be returned to the caller as None.
            candidates = output.get("cards") or []
        else:
            return []

        if not isinstance(candidates, list):
            logger.debug("Basic card generation returned a non-list 'cards' value")
            return []

        # Honour the List[Dict] contract: callers read cards with .get()
        cards = [card for card in candidates if isinstance(card, dict)]
        if cards and isinstance(output, dict):
            logger.debug(f"Generated {len(cards)} basic cards")
        return cards

    except Exception as e:
        logger.error(f"Basic generation failed: {e}")
        PROCESSING_METRICS.increment("card_generation_errors")
        return []

def generate_cloze_cards(atoms: Dict) -> List[Dict]:
    """Generate cloze cards from atoms.

    Skips the model call when the atoms do not look suitable for cloze
    deletion: foundation topics need a definition of at least five words, an
    "is/are" statement or a cause/effect phrase; other topics need at least
    two technical points. Generated cards are filtered through
    ``validate_cloze``.

    Malformed entries in the model reply (non-dict cards, a missing or
    non-string ``cloze``) are rejected individually, so one bad card no longer
    discards the valid ones in the same reply.

    Args:
        atoms: Extracted atoms for one reel

    Returns:
        The list of cards that passed validation; ``[]`` on unsuitable
        content, an unusable reply, or any error.
    """
    try:
        topic_class = atoms.get("topic_class", "intermediate")
        # Model output may carry explicit nulls for these fields
        tech_points = atoms.get("technical_points") or []
        definition = str(atoms.get("definition") or "")

        # Check if content is suitable for cloze
        if topic_class == "foundation":
            combined_text = f"{definition} {' '.join(str(point) for point in tech_points)}".lower()
            has_definition = len(definition.split()) >= 5
            has_is_statement = " is " in combined_text or " are " in combined_text
            has_cause_effect = any(
                word in combined_text
                for word in ["because", "therefore", "causes", "leads to", "results in"]
            )
            has_suitable_content = has_definition or has_is_statement or has_cause_effect
        else:
            has_suitable_content = len(tech_points) >= 2

        if not has_suitable_content:
            logger.debug(f"Skipping cloze generation - unsuitable content for {topic_class}")
            return []

        prompt = stage2b_cloze_prompt(atoms)
        raw = llm(prompt, PIPELINE_MODELS['generate_cloze'])
        output = extract_json(raw, lenient=True)

        if not output:
            logger.debug("Cloze card generation returned no output")
            return []

        # Extract cards
        cards = output.get("cards") if isinstance(output, dict) else output
        if not isinstance(cards, list):
            logger.debug("Cloze card generation returned no card list")
            return []

        # Validate each cloze card
        valid_cards = []
        for card in cards:
            cloze_text = card.get("cloze") if isinstance(card, dict) else None
            # Check the type here so a null/non-string cloze is rejected
            # instead of raising and aborting the whole batch.
            if isinstance(cloze_text, str) and validate_cloze(card, topic_class):
                valid_cards.append(card)
            else:
                preview = cloze_text[:50] if isinstance(cloze_text, str) else ""
                logger.debug(f"Rejected invalid cloze card: {preview}...")

        if valid_cards:
            logger.debug(f"Generated {len(valid_cards)} valid cloze cards")

        return valid_cards

    except Exception as e:
        logger.error(f"Cloze generation failed: {e}")
        PROCESSING_METRICS.increment("cloze_generation_errors")
        return []

def generate_tradeoff_cards(atoms: Dict) -> List[Dict]:
    """Generate tradeoff cards from atoms.

    Nothing is generated unless the atoms carry a truthy ``has_tradeoffs``
    flag and ``CONFIG["ENABLE_TRADEOFFS"]`` (default True) is enabled.

    A card is kept only when it compares at least two approaches and every
    approach lists at least two pros and two cons. Malformed entries (non-dict
    cards or approaches, null/non-list ``tradeoffs``/``pros``/``cons``) are
    rejected individually, so one bad card no longer discards the valid ones
    in the same reply.

    Args:
        atoms: Extracted atoms for one reel

    Returns:
        The list of cards that passed validation; ``[]`` when tradeoffs are
        not applicable, the reply is unusable, or any error occurs.
    """
    if not atoms.get("has_tradeoffs") or not CONFIG.get("ENABLE_TRADEOFFS", True):
        return []

    def _has_two(items) -> bool:
        """True when ``items`` is a list with at least two entries."""
        return isinstance(items, list) and len(items) >= 2

    try:
        prompt = stage2c_tradeoff_prompt(atoms)
        if not prompt:
            return []

        raw = llm(prompt, PIPELINE_MODELS['generate_tradeoff'])
        output = extract_json(raw, lenient=True)

        if not output:
            logger.debug("Tradeoff card generation returned no output")
            return []

        # Extract cards
        cards = output.get("cards") if isinstance(output, dict) else output
        if not isinstance(cards, list):
            logger.debug("Tradeoff card generation returned no card list")
            return []

        # Validate tradeoff cards
        valid_cards = []
        for card in cards:
            tradeoffs = card.get("tradeoffs") if isinstance(card, dict) else None
            if not _has_two(tradeoffs):
                logger.debug("Rejected tradeoff card - fewer than 2 approaches")
                continue

            balanced = all(
                isinstance(approach, dict)
                and _has_two(approach.get("pros"))
                and _has_two(approach.get("cons"))
                for approach in tradeoffs
            )
            if balanced:
                valid_cards.append(card)
            else:
                logger.debug("Rejected tradeoff card - insufficient pros/cons")

        if valid_cards:
            logger.debug(f"Generated {len(valid_cards)} tradeoff cards")

        return valid_cards

    except Exception as e:
        logger.error(f"Tradeoff generation failed: {e}")
        PROCESSING_METRICS.increment("tradeoff_generation_errors")
        return []

def generate_adjacent_card(atoms: Dict) -> Optional[Dict]:
    """Ask the model for a single "adjacent" basic card on the same concept.

    The reply may be a bare card, a list of cards, or a ``{"cards": [...]}``
    wrapper; the first card is returned in each case. Returns None when the
    reply is empty or unrecognized, or when any error occurs.
    """
    try:
        concept = atoms.get("concept", "")
        definition = atoms.get("definition", "")
        key_points = atoms.get("technical_points", [])[:3]

        prompt = f"""Generate ONE adjacent basic card for a foundation concept.

Original concept: {concept}
Definition: {definition}
Key points: {key_points}

Rules:
1. Must directly relate to the SAME concept
2. Focus on practical application, common mistake, or edge case
3. Keep it simple and interview-ready
4. No new concepts - only extensions of the original

OUTPUT JSON:
{{
  "type": "basic",
  "concept_source": "adjacent",
  "front": "Question about practical application of {concept}",
  "back": "Answer with practical insight, common mistake, or edge case",
  "tags": ["domain:foundation", "type:adjacent"]
}}

Return ONLY valid JSON:"""

        response_text = llm(prompt, PIPELINE_MODELS['generate_basic'])
        output = extract_json(response_text, lenient=True)

        if not output:
            return None

        # A list reaching this point is non-empty (empty ones returned above)
        if isinstance(output, list):
            return output[0]

        if isinstance(output, dict):
            # Bare card takes precedence over a {"cards": [...]} wrapper
            if "type" in output:
                return output
            if "cards" in output:
                wrapped = output["cards"]
                return wrapped[0] if wrapped else None

        return None

    except Exception as e:
        logger.error(f"Adjacent card generation failed: {e}")
        return None

# ============================================================
# QUALITY & COMPLETION ASSESSMENT
# ============================================================

def determine_completion_state(card_count: int, has_tradeoffs: bool, tradeoff_count: int,
                              duplicates_filtered: int = 0, topic_class: str = "intermediate") -> Tuple[CompletionState, str]:
    """
    Classify a reel's output as INCOMPLETE, PARTIAL or FULL.

    Thresholds come from CONFIG["MIN_CARDS_FOR_PARTIAL"] and
    CONFIG["MIN_CARDS_FOR_FULL"]. A non-foundation reel that has tradeoffs but
    produced no tradeoff card is capped at PARTIAL while tradeoffs are enabled.

    Returns (state, reason) for observability.
    """
    # Below the partial threshold: nothing usable was produced
    if card_count < CONFIG["MIN_CARDS_FOR_PARTIAL"]:
        reason = "duplicates_filtered" if duplicates_filtered > 0 else "low_card_count"
        return CompletionState.INCOMPLETE, reason

    # Between the two thresholds
    if not card_count >= CONFIG["MIN_CARDS_FOR_FULL"]:
        return CompletionState.PARTIAL, "partial_cards"

    # Enough cards for FULL, unless an expected tradeoff card is missing
    tradeoff_missing = (
        topic_class != "foundation"
        and has_tradeoffs
        and tradeoff_count == 0
        and CONFIG.get("ENABLE_TRADEOFFS", True)
    )
    if tradeoff_missing:
        return CompletionState.PARTIAL, "missing_tradeoff"

    return CompletionState.FULL, "complete"

def quality_score_fallback(card: Dict) -> int:
    """
    Heuristic quality score for a card, in the range 0-100.

    Points come from three sources: keyword hits in the card text, a bonus
    specific to the card type (basic / cloze / tradeoff), and the phrasing of
    the front. A card with no front, back or cloze text scores 0.
    """
    card_type = card.get("type", "basic")

    front = card.get("front") or ""
    back = card.get("back") or ""
    cloze = card.get("cloze") or ""

    full_text = f"{front} {back} {cloze}".strip()
    if not full_text:
        return 0

    haystack = full_text.lower()

    # Keyword coverage (each list is capped separately)
    tech_hits = sum(
        1
        for keywords in TECHNICAL_KEYWORDS.values()
        for kw in keywords
        if kw in haystack
    )
    foundation_hits = sum(1 for kw in FOUNDATION_ENHANCEMENT_KEYWORDS if kw in haystack)

    score = min(tech_hits * 6, 30) + min(foundation_hits * 3, 15)

    # Card type specific scoring
    if card_type == "basic":
        if len(back) >= 150:
            score += 15
        bullet_count = sum(back.count(marker) for marker in ("•", "-", "*"))
        score += min(bullet_count * 3, 25)
        if any(fence in back for fence in ("`", "```")):
            score += 10

    elif card_type == "cloze":
        deletions = cloze.count("{{c")
        score += min(deletions * 8, 25)
        if len(cloze) >= 120:
            score += 15

    elif card_type == "tradeoff":
        tradeoffs = card.get("tradeoffs", [])
        if len(tradeoffs) >= 2:
            score += 20
            # Extra credit when every approach lists 2+ pros and 2+ cons
            if all(
                len(option.get("pros", [])) >= 2 and len(option.get("cons", [])) >= 2
                for option in tradeoffs
            ):
                score += 15

    # Question quality
    question_words = ["what", "how", "why", "explain", "compare", "difference"]
    front_lowered = front.lower()
    if any(word in front_lowered for word in question_words):
        score += 20
    if len(front.split()) >= 5:
        score += 10

    # Clamp to 0-100
    return min(max(score, 0), 100)

def assign_priority(quality: int) -> str:
    """Map a quality score to a priority label.

    Scores of 85 and above are "P0", 70 and above are "P1", everything else
    is "P2".
    """
    # Thresholds are checked from highest to lowest; first match wins.
    for threshold, label in ((85, "P0"), (70, "P1")):
        if quality >= threshold:
            return label
    return "P2"

def calculate_confidence(atoms: Dict, cards: List[Dict], quality_dims: QualityDimensions) -> float:
    """
    Topic-aware confidence for a reel's generated cards.

    Blends card count (relative to the topic's expected count), average
    fallback quality and the combined quality-dimension score, then applies,
    in order: a correctness floor, foundation boosts and floor, an
    incomplete-state penalty, a per-topic floor, and finally calibration.
    The result is capped at 1.0.
    """
    topic_class = atoms.get("topic_class", "intermediate")
    is_foundation = topic_class == "foundation"
    expected_cards = TOPIC_CLASSES.get(topic_class, {}).get("expected_cards", 3)

    final_count = len(cards)
    avg_quality = sum(quality_score_fallback(c) for c in cards) / max(final_count, 1)

    # Base confidence calculation
    card_factor = min(final_count / expected_cards, 1.0)
    quality_factor = avg_quality / 100

    confidence = (
        card_factor * 0.4 +
        quality_factor * 0.3 +
        quality_dims.combined_score * 0.3
    )

    correctness = quality_dims.correctness_score

    # CRITICAL FIX #7: Confidence MUST align with correctness
    if correctness >= 0.9 and final_count >= 1:
        confidence = max(confidence, 0.6)
        logger.debug(f"High correctness ({correctness:.2f}) → confidence floor 0.6")

    # Adaptive foundation confidence boost
    if is_foundation:
        if correctness >= 0.95 and quality_dims.richness_score >= 0.75:
            confidence *= 1.35
            logger.debug(f"Foundation quality boost: {confidence:.2f} (correctness={correctness:.2f}, richness={quality_dims.richness_score:.2f})")
        elif correctness >= 0.9:
            confidence *= 1.2
            logger.debug(f"Foundation moderate boost: {confidence:.2f}")

    # Confidence floor prevents penalizing high-quality foundational cards
    if final_count >= 1 and avg_quality >= 70 and is_foundation:
        confidence = max(confidence, 0.65)
        logger.debug(f"Foundation quality floor: {confidence:.2f}")

    # Incomplete completion state reduces confidence
    if atoms.get("_completion_state_obj") == CompletionState.INCOMPLETE:
        confidence *= 0.85 if is_foundation else 0.6
        logger.debug(f"Confidence reduced due to incomplete state ({topic_class}): {confidence:.2f}")

    # Topic-aware confidence floor (unknown topic classes use 0.50)
    floor_by_topic = {
        "foundation": 0.45,
        "intermediate": 0.50,
        "advanced": 0.55
    }
    floor = floor_by_topic.get(topic_class, 0.50)
    if confidence < floor:
        logger.debug(f"Applying confidence floor: {confidence:.2f} → {floor:.2f} ({topic_class})")
        confidence = floor

    # Apply calibration, then cap the result
    return min(calibrate_confidence(confidence, topic_class), 1.0)

# ============================================================
# STAGE 1: ATOM EXTRACTION WITH RETRY
# ============================================================

def extract_atoms_with_retry(reel: Dict) -> Dict:
    """
    Stage 1: extract knowledge "atoms" from a reel via the LLM, with adaptive
    prompt routing and a single optional enrichment call.

    Flow (every step can short-circuit with a non-"extracted" status):
      1. Skip reels already marked terminal, or served from the stage-1 cache.
      2. Optionally normalize the transcript, classify the topic and pick a
         prompt strategy (possibly overridden by rejection memory).
      3. Call the extraction model, parse and validate the JSON.
      4. Score the result and, if it looks weak, try one enrichment call
         (subject to a per-concept budget).
      5. Reject on low technical signal, otherwise annotate, cache and return.

    Args:
        reel: Mapping with (optional) keys "reel_id" or "id", "caption",
            "transcript" and "category". Missing keys default to "".

    Returns:
        On success, the atoms dict with "status" == "extracted" plus pipeline
        metadata (tech_score, topic_class, prompt_strategy, ...). Otherwise a
        small dict with "reel_id", "status" and "reason"; the statuses emitted
        here are "terminal_rejected", "rejected", "error",
        RejectionType.STRUCTURAL.value and RejectionType.SEMANTIC.value.
        This function never raises: unexpected exceptions are recorded in
        error_tracker and reported as status "error".
    """
    reel_id = str(reel.get("reel_id", reel.get("id", "")))
    caption = str(reel.get("caption", ""))
    transcript = str(reel.get("transcript", ""))
    category = str(reel.get("category", ""))

    logger.info(f"🎬 Processing reel {reel_id}: {caption[:50]}...")

    try:
        # Check terminal rejections FIRST
        if terminal_rejections.is_terminal(reel_id):
            logger.info(f"  ⏭️ Skipping - terminally rejected")
            return {
                "reel_id": reel_id,
                "status": "terminal_rejected",
                "reason": "Previously rejected by logic - no retry allowed"
            }

        # Check cache first
        cached = get_cached_result(reel_id, "stage1_atoms")
        if cached and cached.get("status") == "extracted":
            logger.info(f"  ⏭️ Using cached atoms")
            return cached

        # PHASE 2: Apply transcript normalization
        if CONFIG.get("ENABLE_TRANSCRIPT_NORMALIZATION", True):
            original_transcript = transcript
            transcript = normalize_transcript(transcript, caption)

            # A change in length is the signal that normalization altered the text.
            if len(transcript) != len(original_transcript):
                ROUTING_METRICS.increment("content_signal::transcript_normalized")

        # Classify topic
        topic_class, topic_confidence = classify_topic_with_confidence(caption, category, transcript)
        logger.info(f"  📊 Topic: {topic_class} (confidence: {topic_confidence})")

        # Create normalized concept key upfront
        normalized_concept = normalize_learning_key(caption)

        # Get learned strategy from rejection memory
        learned_override = None
        if rejection_memory:
            best_strategy = rejection_memory.get_best_strategy(
                concept=normalized_concept,
                category=category,
                topic_class=topic_class
            )
            if best_strategy:
                learned_override = best_strategy.get("strategy")
                logger.info(f"  🧠 Learned strategy: {learned_override}")

        # Select prompt strategy
        strategy, routing_reason = select_prompt_strategy(
            caption, transcript, category, learned_override
        )
        logger.info(f"  🎯 Strategy: {strategy.value} ({routing_reason})")

        # Build prompt (anything that is not STRICT_ADVANCED / DSA_FOCUSED
        # falls back to the foundation prompt)
        if strategy == PromptStrategy.STRICT_ADVANCED:
            prompt = stage1_prompt_a_strict(caption, transcript)
        elif strategy == PromptStrategy.DSA_FOCUSED:
            prompt = stage1_prompt_c_dsa(caption, transcript)
        else:
            prompt = stage1_prompt_b_foundation(caption, transcript)

        # Call LLM
        start_time = time.time()
        raw = llm(prompt, PIPELINE_MODELS['extract'])
        extraction_time = time.time() - start_time

        PROCESSING_METRICS.record_timing("atom_extraction", extraction_time)
        PROCESSING_METRICS.increment("atom_extraction_attempts")

        atoms = extract_json(raw, lenient=True)

        # Validate atoms
        is_valid, errors = validate_atoms(atoms)
        if not is_valid:
            logger.error(f"  ❌ Invalid atoms schema: {errors}")
            return {
                "reel_id": reel_id,
                "status": "error",
                "reason": f"Invalid schema: {errors[:3]}"
            }

        # The model itself may flag the content as not worth extracting; that
        # verdict is treated as final for this reel.
        if not atoms.get("valid", True):
            reject_reason = atoms.get("reject_reason", "Unknown")
            logger.info(f"  ❌ Rejected by logic: {reject_reason}")

            terminal_rejections.mark_terminal(
                reel_id,
                reason=f"handled_by_logic: {reject_reason}",
                stage="stage1",
                rejection_type="SEMANTIC_TERMINAL"
            )

            return {
                "reel_id": reel_id,
                "status": "rejected",
                "reason": reject_reason,
                "handled_by_logic": True
            }

        # Use caption as canonical learning_key, not LLM output
        atoms["learning_key"] = normalized_concept

        # Calculate technical score
        tech_score_val = technical_score(
            f"{atoms.get('concept','')} {atoms.get('definition','')} "
            f"{' '.join(atoms.get('technical_points', []))}"
        )

        # Topic-aware thresholds
        min_score = 4
        if CONFIG.get("NORMALIZE_TECH_SCORES", True):
            adjustment = TOPIC_CLASSES.get(topic_class, {}).get("threshold_adjustment", 0)
            min_score += adjustment

        logger.info(f"  📊 Tech score: {tech_score_val} | threshold: {min_score}")

        # Foundation topics: disable tradeoffs only if truly basic
        if topic_class == "foundation":
            has_comparison = any(word in transcript.lower() for word in ["vs", "versus", "instead", "rather than", "compared"])
            if not has_comparison:
                atoms["has_tradeoffs"] = False
            else:
                atoms["has_tradeoffs"] = (
                    atoms.get("has_tradeoffs", False) and
                    len(atoms.get("related_concepts", [])) >= 1
                )

        # Structural impossibility gate
        char_length = len(transcript.strip())
        if char_length < 80 and topic_class != "foundation":
            logger.info(f"  🚫 Structural impossibility: transcript too short ({char_length} chars)")
            terminal_rejections.mark_terminal(
                reel_id,
                reason="insufficient_source_material",
                stage="stage1",
                rejection_type="STRUCTURAL"
            )
            return {
                "reel_id": reel_id,
                "status": RejectionType.STRUCTURAL.value,
                "reason": "insufficient_source_material",
                "skip_enrichment": True
            }

        # Check if below threshold. Only the foundation/"valuable" case changes
        # state here (low_density_but_valid); the other branches just log, and
        # the actual rejection happens in the final threshold check below.
        if tech_score_val < min_score:
            if topic_class == "foundation":
                is_valuable = (
                    len(atoms.get("related_concepts", [])) > 0 or
                    len(atoms.get("technical_points", [])) >= 3 or
                    atoms.get("definition", "") != ""
                )

                if is_valuable:
                    logger.info(f"  ✅ Foundation topic - pedagogically valuable (accepted despite score)")
                    atoms["low_density_but_valid"] = True
                else:
                    logger.info(f"  ❌ Foundation topic - insufficient pedagogical value (rejected)")
            else:
                logger.info(f"  ⚠️ Below depth threshold (rejected unless enriched)")
        else:
            logger.info(f"  ✅ Passed depth threshold (accepted)")

        # Enrichment logic
        has_rejection_history = (
            rejection_memory and
            rejection_memory.get_rejection_count(normalized_concept, category, topic_class) > 0
        )

        # Predictive confidence check
        would_have_low_confidence = (
            len(atoms.get("technical_points", [])) < 3 or
            len(atoms.get("solutions", [])) < 1 or
            (topic_class != "foundation" and not atoms.get("has_tradeoffs", False))
        )

        should_enrich = (
            (tech_score_val < min_score or has_rejection_history or would_have_low_confidence) and
            CONFIG.get("ENABLE_ENRICHMENT", True) and
            not atoms.get("low_density_but_valid", False)
        )

        enrichment_attempted = False
        if should_enrich:
            concept = atoms.get("learning_key")
            enrichment_key = f"{concept}::{category or 'unknown'}"

            # Topic-aware enrichment budget
            max_enrichments = CONFIG["MAX_ENRICHMENTS_PER_CONCEPT"].get(
                topic_class,
                CONFIG["MAX_ENRICHMENTS_PER_CONCEPT"].get("intermediate", 2)
            )

            # Check per-concept enrichment budget with temporal spacing.
            # NOTE(review): ENRICHMENT_BUDGET is indexed without a prior insert,
            # so it is presumably a defaultdict(int) - confirm at its definition.
            with ENRICHMENT_BUDGET_LOCK:
                now = time.time()
                last_enrichment = ENRICHMENT_TIMESTAMPS.get(enrichment_key, 0)
                days_since_last = (now - last_enrichment) / (24 * 60 * 60)

                if days_since_last > ENRICHMENT_BUDGET_RESET_DAYS:
                    if ENRICHMENT_BUDGET[enrichment_key] > 0:
                        logger.info(f"  🔄 Resetting enrichment budget ({days_since_last:.1f} days since last attempt)")
                    ENRICHMENT_BUDGET[enrichment_key] = 0

                concept_enrichments = ENRICHMENT_BUDGET[enrichment_key]
                already_enriched = concept_enrichments >= max_enrichments

            if already_enriched:
                logger.info(f"  ⏭️ Skipping enrichment - concept budget exhausted ({concept_enrichments}/{max_enrichments} for {topic_class})")
                logger.info(f"     Budget will reset in {max(0, ENRICHMENT_BUDGET_RESET_DAYS - days_since_last):.1f} days")
            else:
                # Check rejection memory
                if rejection_memory:
                    if rejection_memory.should_skip(concept, category, topic_class):
                        logger.info(f"  ⏭️ Skipping - too many rejections")

                        terminal_rejections.mark_terminal(
                            reel_id,
                            reason="repeatedly_failed_enrichment",
                            stage="stage1_enrichment",
                            rejection_type="SEMANTIC_TERMINAL"
                        )

                        return {
                            "reel_id": reel_id,
                            "status": RejectionType.SEMANTIC.value,
                            "reason": "repeatedly_failed"
                        }

                    strategy_hint = rejection_memory.get_best_strategy(concept, category, topic_class)
                else:
                    strategy_hint = None

                enrichment_reason = (
                    f"low_score ({tech_score_val})" if tech_score_val < min_score
                    else "predicted_low_confidence" if would_have_low_confidence
                    else "rejection_history"
                )
                logger.info(f"  🔄 Enriching (reason: {enrichment_reason})")

                # Increment using canonical key. The budget is charged before
                # the call, so a failed enrichment still consumes one attempt.
                with ENRICHMENT_BUDGET_LOCK:
                    ENRICHMENT_BUDGET[enrichment_key] += 1
                    ENRICHMENT_TIMESTAMPS[enrichment_key] = time.time()

                ROUTING_METRICS.increment("enriched")
                enrichment_attempted = True

                enrichment_start = time.time()

                try:
                    enrichment_prompt = create_enrichment_prompt(atoms, strategy_hint.get("strategy") if strategy_hint else None)
                    enriched_raw = llm(enrichment_prompt, PIPELINE_MODELS['extract_retry'])
                    enriched_atoms = extract_json(enriched_raw, lenient=True)

                    if validate_atoms(enriched_atoms)[0] and enriched_atoms.get("valid", True):
                        new_score = technical_score(
                            f"{enriched_atoms.get('concept','')} {enriched_atoms.get('definition','')} "
                            f"{' '.join(enriched_atoms.get('technical_points', []))}"
                        )

                        logger.info(f"  ✨ Enriched score: {new_score} (was: {tech_score_val})")

                        score_improvement = new_score - tech_score_val
                        if new_score >= min_score and score_improvement > 1:
                            atoms = enriched_atoms
                            # The enriched dict is fresh LLM output: re-apply the
                            # canonical caption-derived key so downstream
                            # rejection-memory lookups keep using the same key
                            # as every other record for this concept.
                            atoms["learning_key"] = normalized_concept
                            tech_score_val = new_score
                            atoms["was_enriched"] = True
                            atoms["_enrichment_attempts"] = 1
                            atoms["_elapsed_enrichment"] = round(time.time() - enrichment_start, 2)
                            atoms["_delta_score"] = score_improvement
                            logger.info(f"  ✅ Enrichment successful! (Δ={score_improvement})")
                        elif new_score >= min_score:
                            logger.info(f"  ⚠️ Enrichment marginal (Δ={score_improvement}) - not learning from noise")
                        else:
                            logger.info(f"  ⚠️ Still below threshold")

                except Exception as e:
                    # Enrichment is best-effort: fall through with the original atoms.
                    logger.error(f"  ⚠️ Enrichment failed: {e}")

        # Final threshold check - SEMANTIC rejection
        if tech_score_val < min_score and not atoms.get("low_density_but_valid", False):
            if rejection_memory:
                concept_key = atoms.get("learning_key", normalized_concept)
                rejection_memory.record_rejection(
                    concept_key,
                    tech_score_val,
                    f"Low technical signal ({tech_score_val})",
                    category=category,
                    topic_class=topic_class
                )

            logger.info(f"  ❌ Final rejection: Low technical signal ({tech_score_val})")
            return {
                "reel_id": reel_id,
                "status": RejectionType.SEMANTIC.value,
                "reason": f"Low technical signal ({tech_score_val})"
            }

        # Success
        atoms["reel_id"] = reel_id
        atoms["status"] = "extracted"
        atoms["tech_score"] = tech_score_val
        atoms["topic_class"] = topic_class
        atoms["prompt_strategy"] = strategy.value
        atoms["extraction_time"] = extraction_time
        atoms["enrichment_attempted"] = enrichment_attempted

        # generate_cards_from_atoms() reads caption/transcript/category from the
        # atoms dict. Carry the source fields along so it does not see empty
        # strings; setdefault keeps any value the model already supplied.
        atoms.setdefault("caption", caption)
        atoms.setdefault("transcript", transcript)
        atoms.setdefault("category", category)

        # Model-aware prompt versioning
        model_name = PIPELINE_MODELS['extract'].name.replace(":", "_").replace(".", "_")
        atoms["prompt_version"] = f"stage1_{strategy.value}_{model_name}_{CONFIG['CACHE_VERSION']}"
        atoms["routing_reason"] = routing_reason

        ROUTING_METRICS.increment(f"reason::{routing_reason}")

        # Cache result
        save_cached_result(reel_id, "stage1_atoms", atoms)

        logger.info(f"  ✅ Atoms extracted successfully (score: {tech_score_val})")
        return atoms

    except Exception as e:
        logger.error(f"  ❌ Atom extraction failed: {e}")
        error_tracker.record(
            "atom_extraction_error",
            str(e),
            {"reel_id": reel_id, "caption": caption[:100]}
        )

        return {
            "reel_id": reel_id,
            "status": "error",
            "reason": str(e)
        }

# ============================================================
# STAGE 2: CARD GENERATION
# ============================================================

def generate_cards_from_atoms(atoms: Dict) -> Dict:
    """
    Stage 2: turn extracted atoms into Anki cards, with density-aware routing.

    Steps: cache lookup -> content-density routing (dense: basic + cloze +
    tradeoff, light: basic only, otherwise rejected) -> optional foundation
    expansion -> duplicate filtering -> quality / completion / confidence
    scoring -> fingerprint registration -> caching and bookkeeping.

    Args:
        atoms: Stage-1 output. Reads "reel_id", "caption", "transcript",
            "category", "topic_class", "tech_score", "was_enriched",
            "has_tradeoffs", "prompt_version", "prompt_strategy",
            "learning_key"/"concept" and "_delta_score" when present.
            NOTE: this dict is mutated ("completion_state",
            "completion_reason" and "_completion_state_obj" are written).

    Returns:
        On success a dict with "status" == "success", the unique "cards" and
        summary fields (counts, confidence, quality scores, auto_approved).
        Otherwise {"reel_id", "status": "rejected" | "error", "reason"}.
        This function never raises: failures are recorded in error_tracker
        and reported as status "error".
    """
    reel_id = atoms.get("reel_id", "")
    prompt_recorded = False

    try:
        # Check cache
        cached = get_cached_result(reel_id, "stage2_cards")
        if cached and cached.get("status") == "success":
            logger.info(f"  ⏭️ Using cached cards")
            return cached

        logger.info(f"  🎴 Generating cards...")

        # PHASE 3: Determine content density and route accordingly
        caption = atoms.get("caption", "")
        transcript = atoms.get("transcript", "")
        category = atoms.get("category", "")

        # Bind every card list up front. The LIGHT route only produces basic
        # cards, but the summary log, completion-state call and result dict
        # below read all three lists; leaving cloze/tradeoff unbound raised
        # UnboundLocalError and turned every LIGHT reel into an "error".
        basic_cards = []
        cloze_cards = []
        tradeoff_cards = []

        if CONFIG.get("ENABLE_HYBRID_ROUTING", True):
            content_density = classify_content_density(caption, transcript, category)
            ROUTING_METRICS.increment(f"content_density::{content_density.value}")

            logger.info(f"  📚 Content density: {content_density.value}")

            if content_density == ContentDensity.DENSE:
                logger.info(f"    → Full card generation (basic + cloze + tradeoff)")
                basic_cards = generate_basic_cards(atoms)
                cloze_cards = generate_cloze_cards(atoms)
                tradeoff_cards = generate_tradeoff_cards(atoms)
                all_cards = basic_cards + cloze_cards + tradeoff_cards

            elif content_density == ContentDensity.LIGHT:
                logger.info(f"    → Reference cards only (basic)")
                basic_cards = generate_basic_cards(atoms)
                # all_cards aliases basic_cards here, exactly as before.
                all_cards = basic_cards
                for card in all_cards:
                    card["content_density"] = "light"

            else:
                logger.info(f"    → Content too sparse, skipping")
                return {
                    "reel_id": reel_id,
                    "status": "rejected",
                    "reason": "Content too sparse for card generation"
                }
        else:
            logger.info(f"    → Full card generation (hybrid routing disabled)")
            basic_cards = generate_basic_cards(atoms)
            cloze_cards = generate_cloze_cards(atoms)
            tradeoff_cards = generate_tradeoff_cards(atoms)
            all_cards = basic_cards + cloze_cards + tradeoff_cards
            content_density = ContentDensity.DENSE

        logger.info(f"    Generated: Basic={len(basic_cards)}, Cloze={len(cloze_cards)}, Tradeoff={len(tradeoff_cards)}")

        # Foundation Expansion (only for DENSE content)
        topic_class = atoms.get("topic_class", "intermediate")
        if (content_density == ContentDensity.DENSE and
            topic_class == "foundation" and
            len(all_cards) <= 2 and
            CONFIG.get("ENABLE_FOUNDATION_EXPANSION", True)):

            temp_dims = QualityDimensions.calculate(atoms, all_cards)
            was_enriched = atoms.get("was_enriched", False)
            tech_score = atoms.get("tech_score", 0)
            min_score_for_expansion = 3

            # Expand only content that is correct but thin, was not already
            # enriched in stage 1, and carries a minimum technical score.
            can_expand = (
                temp_dims.correctness_score >= 0.9 and
                temp_dims.richness_score < 0.7 and
                not was_enriched and
                tech_score >= min_score_for_expansion
            )

            if can_expand:
                logger.info(f"  🌱 Attempting foundation expansion...")
                adjacent_card = generate_adjacent_card(atoms)
                if adjacent_card:
                    all_cards.append(adjacent_card)
                    logger.info(f"    ✅ Added adjacent card")
            elif was_enriched:
                logger.info(f"    ⏭️ Skipping expansion - content was enriched")
            elif tech_score < min_score_for_expansion:
                logger.info(f"    ⏭️ Skipping expansion - tech score too low ({tech_score} < {min_score_for_expansion})")

        if not all_cards:
            logger.info(f"  ❌ No valid cards generated")
            return {
                "reel_id": reel_id,
                "status": "rejected",
                "reason": "No valid cards generated"
            }

        # Filter duplicates but DEFER fingerprint registration
        unique_cards = []
        duplicates_found = 0
        pending_fingerprints = []

        for card in all_cards:
            is_dup, fp_type = duplicate_detector.is_duplicate(card)
            if not is_dup:
                unique_cards.append(card)
                fingerprint = duplicate_detector._create_fingerprint(card)
                semantic_fp = duplicate_detector._create_semantic_fingerprint(card)
                pending_fingerprints.append((fingerprint, semantic_fp))
            else:
                duplicates_found += 1
                logger.debug(f"    Duplicate card filtered: {fp_type}")

        if duplicates_found > 0:
            logger.info(f"  🔄 Filtered {duplicates_found} duplicates")

        final_count = len(unique_cards)

        if final_count == 0:
            logger.info(f"  ❌ All cards were duplicates")
            return {
                "reel_id": reel_id,
                "status": "rejected",
                "reason": "All cards were duplicates"
            }

        # Quality scoring
        quality_dims = QualityDimensions.calculate(atoms, unique_cards)
        logger.info(f"  📊 Quality: correctness={quality_dims.correctness_score:.2f}, richness={quality_dims.richness_score:.2f}")

        # Completion state with explicit reason
        completion_state, completion_reason = determine_completion_state(
            final_count,
            atoms.get("has_tradeoffs", False),
            len(tradeoff_cards),
            duplicates_found,
            topic_class=atoms.get("topic_class", "intermediate")
        )

        atoms["completion_state"] = completion_state.value
        atoms["completion_reason"] = completion_reason
        # Stored on atoms so calculate_confidence() can read it back.
        atoms["_completion_state_obj"] = completion_state

        logger.info(f"  🎯 Completion: {completion_state.value} ({completion_reason})")

        # Confidence calculation
        confidence = calculate_confidence(atoms, unique_cards, quality_dims)
        logger.info(f"  💪 Confidence: {confidence:.2f}")

        # Assign priorities and reel_id to cards
        for card in unique_cards:
            card["quality"] = quality_score_fallback(card)
            card["priority"] = assign_priority(card["quality"])
            card["reel_id"] = reel_id
            card["topic_class"] = topic_class
            card["confidence"] = confidence

        # NOW register fingerprints after cards passed all quality gates
        for fingerprint, semantic_fp in pending_fingerprints:
            duplicate_detector.add_fingerprint(fingerprint, semantic_fp)

        # Explicit approval logic
        auto_approved = (
            confidence >= 0.85 and
            quality_dims.correctness_score >= 0.9
        )

        if auto_approved:
            logger.info(f"  ✅ Auto-approved (high confidence)")

        # Note: the *_count fields are pre-deduplication counts; final_count
        # is the number of cards actually returned.
        result = {
            "reel_id": reel_id,
            "status": "success",
            "cards": unique_cards,
            "basic_count": len(basic_cards),
            "cloze_count": len(cloze_cards),
            "tradeoff_count": len(tradeoff_cards),
            "duplicates_filtered": duplicates_found,
            "final_count": final_count,
            "confidence": round(confidence, 2),
            "completion_state": completion_state.value,
            "completion_reason": completion_reason,
            "correctness_score": quality_dims.correctness_score,
            "richness_score": quality_dims.richness_score,
            "attempted_enrichment": atoms.get("was_enriched", False),
            "topic_class": atoms.get("topic_class", "intermediate"),
            "prompt_strategy": atoms.get("prompt_strategy", "unknown"),
            "prompt_version": atoms.get("prompt_version", "unknown"),
            "auto_approved": auto_approved
        }

        # Cache result
        save_cached_result(reel_id, "stage2_cards", result)

        # Record prompt version success ONLY if cards meet minimum threshold
        prompt_version = atoms.get("prompt_version")
        if prompt_version and not prompt_recorded:
            if final_count >= CONFIG["MIN_CARDS_FOR_PARTIAL"]:
                record_prompt_version_result(prompt_version, True, stage="complete")
            else:
                record_prompt_version_result(prompt_version, False, stage="generation")
            # Prevents the except-handler below from recording a second result.
            prompt_recorded = True

        # Record success in rejection memory if enriched
        if atoms.get("was_enriched") and rejection_memory:
            concept_key = atoms.get("learning_key", atoms.get("concept", ""))
            delta_score = atoms.get("_delta_score", 0)
            rejection_memory.record_success(
                concept_key,
                atoms.get("prompt_strategy", ""),
                confidence=confidence,
                category=atoms.get("category", ""),
                topic_class=atoms.get("topic_class", ""),
                delta_score=delta_score
            )

        logger.info(f"  ✅ Generated {final_count} cards")
        return result

    except Exception as e:
        logger.error(f"  ❌ Card generation failed: {e}")

        # Record prompt version failure
        prompt_version = atoms.get("prompt_version")
        if prompt_version and not prompt_recorded:
            record_prompt_version_result(prompt_version, False, stage="mechanical")

        error_tracker.record(
            "card_generation_error",
            str(e),
            {"reel_id": reel_id, "atoms_concept": atoms.get("concept", "")}
        )

        return {
            "reel_id": reel_id,
            "status": "error",
            "reason": str(e)
        }

# ============================================================
# SINGLE REEL PROCESSING
# ============================================================

def process_single_reel(reel: Dict) -> Dict:
    """
    Run one reel through the full pipeline: stage 1 (atom extraction), then
    stage 2 (card generation).

    Args:
        reel: Reel record; the id is taken from "reel_id", falling back to "id".

    Returns:
        ProcessingResult.to_dict() for the reel. The status is "success" when
        stage 2 succeeds; otherwise it is the status reported by whichever
        stage stopped the reel, or "error" if an exception escaped. This
        function does not raise.
    """
    # Function-scope import so the error handler below cannot itself fail with
    # NameError (which would mask the original exception) if the notebook's
    # import cell does not provide `traceback`.
    import traceback

    start_time = time.time()
    reel_id = str(reel.get("reel_id", reel.get("id", "")))

    # Create ProcessingResult
    result = ProcessingResult(
        reel_id=reel_id,
        status="pending",
        processing_time=0.0
    )

    try:
        # Stage 1: Extract atoms
        atoms = extract_atoms_with_retry(reel)

        if atoms.get("status") != "extracted":
            result.status = atoms.get("status", "error")
            result.reason = atoms.get("reason", "Unknown error")
            result.processing_time = time.time() - start_time

            # Track rejection.
            # NOTE(review): only the literal "rejected" status is counted here;
            # other stage-1 outcomes (errors, terminal/structural/semantic
            # rejections) bypass STATISTICS - confirm that is intended.
            if result.status == "rejected":
                STATISTICS.update(result)

            return result.to_dict()

        logger.info(f"  ✅ Atoms extracted (score: {atoms.get('tech_score')}, class: {atoms.get('topic_class')})")

        # Stage 2: Generate cards
        card_result = generate_cards_from_atoms(atoms)

        result.processing_time = time.time() - start_time

        if card_result.get("status") == "success":
            result.status = "success"
            result.cards = card_result.get("cards", [])
            result.confidence = card_result.get("confidence", 0)
            result.tech_score = atoms.get("tech_score", 0)
            result.topic_class = atoms.get("topic_class", "")
            result.prompt_strategy = atoms.get("prompt_strategy", "")

            logger.info(f"  ✅ {card_result['final_count']} cards (confidence: {card_result['confidence']:.2f}, time: {result.processing_time:.1f}s)")
        else:
            result.status = card_result.get("status", "error")
            result.reason = card_result.get("reason", "Unknown error")
            logger.info(f"  ❌ {result.status}: {result.reason}")

        # Update statistics
        STATISTICS.update(result)

        return result.to_dict()

    except Exception as e:
        result.status = "error"
        result.reason = str(e)
        result.processing_time = time.time() - start_time
        result.error_details = {"traceback": traceback.format_exc()}

        logger.error(f"  ❌ Processing failed: {e}")
        error_tracker.record(
            "reel_processing_error",
            str(e),
            {"reel_id": reel_id, "processing_time": result.processing_time}
        )

        return result.to_dict()

# ============================================================
# BATCH PROCESSING
# ============================================================

def process_batch(reels: List[Dict]) -> List[Dict]:
    """
    Process reels in parallel with a thread pool.

    Args:
        reels: Reel records to run through process_single_reel.

    Returns:
        Exactly one result dict per input reel, in completion order (not input
        order). A reel whose worker raised gets a
        {"reel_id", "status": "error", "reason"} placeholder.
    """
    logger.info(f"📦 Processing batch of {len(reels)} reels")
    results = []

    with ThreadPoolExecutor(max_workers=CONFIG["MAX_WORKERS"]) as executor:
        future_to_reel = {
            executor.submit(process_single_reel, reel): reel
            for reel in reels
        }

        for future in as_completed(future_to_reel):
            reel = future_to_reel[future]
            reel_id = str(reel.get("reel_id", reel.get("id", "")))

            # Step 1: collect the worker's outcome. Only a failure of the
            # worker itself produces an error placeholder.
            try:
                result = future.result()
            except Exception as e:
                logger.error(f"❌ Error processing reel {reel_id}: {e}")
                error_tracker.record(
                    "batch_processing_error",
                    str(e),
                    {"reel_id": reel_id}
                )

                results.append({
                    "reel_id": reel_id,
                    "status": "error",
                    "reason": str(e)
                })
                continue

            results.append(result)

            # Step 2: bookkeeping. Kept in its own try-block so that a failed
            # save cannot append a second (bogus) error entry for a reel whose
            # real result is already in `results`.
            try:
                # Mark as processed BEFORE flushing, so the mark for this reel
                # (including the last one of the batch) is part of the flush.
                if result.get("reel_id"):
                    progress_tracker.mark_processed(result["reel_id"])

                # Periodic saves
                duplicate_detector.save_if_dirty()
                progress_tracker.flush()
            except Exception as e:
                logger.error(f"❌ Error saving progress after reel {reel_id}: {e}")
                error_tracker.record(
                    "batch_processing_error",
                    str(e),
                    {"reel_id": reel_id, "phase": "persistence"}
                )

    return results

# ============================================================
# CSV LOADING & DIAGNOSTICS
# ============================================================

def diagnose_csv_file(csv_path: str) -> Dict:
    """
    Diagnose CSV file issues and provide actionable feedback.

    The file is expected to be pipe-delimited ("|") with a header row. Field
    counts are computed with a plain split on "|", so quoted fields containing
    a pipe are reported as inconsistent.

    Args:
        csv_path: Path of the file to inspect.

    Returns:
        Dict with keys:
          - "exists" (bool), "readable" (bool), "line_count" (int)
          - "issues" (list of str): human-readable problems found
          - "sample_lines" (list of str): up to the first 5 non-blank data lines
          - "encoding" (str): first encoding that decoded the file (if readable)
          - "header" (str) and "expected_fields" (int): when the file has at
            least 2 lines
        Never raises; unexpected errors are appended to "issues".
    """
    diagnosis = {
        "exists": False,
        "readable": False,
        "line_count": 0,
        "issues": [],
        "sample_lines": []
    }

    try:
        csv_file = Path(csv_path)
        diagnosis["exists"] = csv_file.exists()

        if not diagnosis["exists"]:
            diagnosis["issues"].append(f"File not found: {csv_path}")
            return diagnosis

        # Try to read with different encodings, decoding STRICTLY: with
        # errors='ignore' a decode can never fail, so the first candidate would
        # always "win" and the fallbacks would be dead code.
        # latin-1 maps every byte, so it must come last as the catch-all.
        encodings = ['utf-8', 'cp1252', 'latin-1']
        lines = []

        for encoding in encodings:
            try:
                with open(csv_path, 'r', encoding=encoding) as f:
                    lines = f.readlines()
            except UnicodeDecodeError:
                continue
            diagnosis["readable"] = True
            diagnosis["encoding"] = encoding
            break

        if not diagnosis["readable"]:
            diagnosis["issues"].append("Cannot read file with any standard encoding")
            return diagnosis

        diagnosis["line_count"] = len(lines)

        if len(lines) < 2:
            diagnosis["issues"].append("File has fewer than 2 lines (need header + data)")
            return diagnosis

        # Analyze header
        header = lines[0].strip()
        diagnosis["header"] = header

        # Count fields in header
        header_fields = len(header.split('|'))
        diagnosis["expected_fields"] = header_fields

        # Sample first few data lines (file line numbers are 1-based, header is line 1)
        for i, line in enumerate(lines[1:6], start=2):
            if line.strip():
                diagnosis["sample_lines"].append(f"Line {i}: {line.strip()[:100]}")

        # Check for consistent field counts
        bad_lines = []
        for i, line in enumerate(lines[1:21], start=2):  # Check first 20 data lines
            if line.strip():
                field_count = len(line.strip().split('|'))
                if field_count != header_fields:
                    bad_lines.append((i, field_count, line.strip()[:80]))

        if bad_lines:
            diagnosis["issues"].append(f"Found {len(bad_lines)} lines with inconsistent field counts")
            for line_num, count, preview in bad_lines[:3]:
                diagnosis["issues"].append(f"Line {line_num}: has {count} fields (expected {header_fields}): {preview}...")

    except Exception as e:
        diagnosis["issues"].append(f"Error reading file: {e}")

    return diagnosis

def load_csv_file(csv_path: str) -> pd.DataFrame:
    """Read the pipe-delimited CSV into a DataFrame, retrying leniently.

    The file is first passed through ``diagnose_csv_file``: a file reported
    as non-existent is fatal, while any reported issues are only logged as a
    warning. Parsing is attempted strictly first; if pandas raises a
    ``ParserError`` the read is retried with the python engine and malformed
    lines skipped.

    Args:
        csv_path: Location of the ``|``-separated CSV file.

    Returns:
        The parsed DataFrame. After the lenient retry it may contain fewer
        rows than the file, because bad lines are skipped.

    Raises:
        FileNotFoundError: The diagnosis reports that the file does not exist.
        Exception: Any other read failure (including a failed lenient retry)
            is logged and re-raised unchanged.
    """
    logger.info(f"Loading CSV: {csv_path}")

    # Diagnose up front so structural problems show up in the log even when
    # pandas manages to parse the file.
    report = diagnose_csv_file(csv_path)
    if not report["exists"]:
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    detected_issues = report["issues"]
    if detected_issues:
        logger.warning(f"CSV issues detected: {detected_issues}")

    # Keyword arguments for the second, forgiving attempt.
    lenient_options = dict(
        sep="|",
        on_bad_lines='skip',
        engine='python',
        quoting=1,  # numeric value of csv.QUOTE_ALL
        encoding='utf-8',
    )

    try:
        # Attempt 1: strict parsing with pandas defaults.
        strict_frame = pd.read_csv(csv_path, sep="|", encoding='utf-8')
        logger.info(f"✅ CSV loaded: {len(strict_frame)} rows, {len(strict_frame.columns)} columns")
        return strict_frame

    except pd.errors.ParserError as parse_err:
        logger.warning(f"Parser error: {parse_err}")

        # Attempt 2: drop the lines pandas cannot tokenize.
        try:
            lenient_frame = pd.read_csv(csv_path, **lenient_options)
            logger.info(f"✅ CSV recovered with lenient parsing: {len(lenient_frame)} rows")
            return lenient_frame
        except Exception as retry_err:
            logger.error(f"Lenient parsing also failed: {retry_err}")
            raise

    except Exception as load_err:
        logger.error(f"Unexpected error loading CSV: {load_err}")
        raise

def normalize_reel(reel: dict) -> dict:
    """Return a copy of ``reel`` in which every value is a plain string.

    Conversion rules, applied per value:
      * ``None`` and float NaN become the empty string.
      * Floats holding a whole number are rendered without a fractional
        part (``12.0`` -> ``"12"``); other floats use ``str()``.
      * Everything else is passed through ``str()``.

    Args:
        reel: Mapping of field name to raw value.

    Returns:
        A new dict with the same keys, in the same order, and string values.
    """
    def _as_text(raw) -> str:
        # Missing values collapse to "".
        if raw is None:
            return ""
        if not isinstance(raw, float):
            return str(raw)
        if math.isnan(raw):
            return ""
        # Whole-number floats are printed as integers.
        return str(int(raw)) if raw.is_integer() else str(raw)

    return {field: _as_text(raw) for field, raw in reel.items()}

# ============================================================
# MAIN PIPELINE EXECUTION
# ============================================================

def main():
    """Run the end-to-end Anki card generation pipeline.

    Steps, in order:
      1. Exit with status 1 if the Ollama health check fails, the configured
         CSV is missing, or it cannot be loaded.
      2. Normalize every CSV row and keep the reels that have a non-empty
         ``reel_id`` and are not reported as processed by the progress tracker.
      3. If ``ENABLE_CONTENT_FILTERING`` is on, run ``should_process_reel`` on
         each reel; rejected reels are marked as processed.
      4. Process at most ``CONFIG["MAX_REELS"]`` reels in batches of
         ``CONFIG["BATCH_SIZE"]``, checkpointing tracker state every 5th batch.
      5. Persist all tracker state and routing metrics, write
         ``anki_cards.json`` (plus ``needs_review.json`` when any successful
         reel falls in the review band) into ``CONFIG["OUT_DIR"]`` and log
         the summary.

    Returns:
        None. Returns early when nothing is left to process or nothing passes
        the content filter.

    Raises:
        SystemExit: On the fatal start-up conditions listed in step 1.
    """
    start_time = time.time()

    # Successful reels with review_min <= confidence < review_max are exported
    # for manual review.
    review_min_confidence = 0.65
    review_max_confidence = 0.85
    # Tracker state is checkpointed every N batches.
    checkpoint_every = 5

    def _save_tracker_state():
        """Persist the state shared by the intermediate and the final save."""
        duplicate_detector.save_if_dirty(force=True)
        progress_tracker.flush()
        if rejection_memory:
            rejection_memory.save()

    logger.info("=" * 60)
    logger.info("STARTING ANKI GENERATION PIPELINE")
    logger.info("=" * 60)

    # Check Ollama health
    if not check_ollama_health():
        logger.error("❌ Ollama server not responding!")
        logger.error("Please start Ollama: ollama serve")
        sys.exit(1)

    # Load CSV
    csv_file = CONFIG["CSV_FILE"]
    if not Path(csv_file).exists():
        logger.error(f"❌ CSV not found: {csv_file}")
        sys.exit(1)

    try:
        df = load_csv_file(csv_file)
    except Exception as e:
        logger.error(f"❌ Failed to load CSV: {e}")
        sys.exit(1)

    logger.info(f"📄 Loaded {len(df)} reels from CSV")

    # Normalize and filter unprocessed reels. Rows without a reel_id are
    # dropped silently.
    unprocessed = []
    for _, row in df.iterrows():
        reel_dict = normalize_reel(row.to_dict())
        reel_id = str(reel_dict.get("reel_id", ""))

        if reel_id and not progress_tracker.is_processed(reel_id):
            unprocessed.append(reel_dict)

    logger.info(f"♻️ {len(progress_tracker.processed):,} already processed")
    logger.info(f"🆕 {len(unprocessed):,} remaining")

    if not unprocessed:
        logger.info("✅ All reels already processed!")
        return

    # PHASE 1: Apply pre-filtering
    if CONFIG.get("ENABLE_CONTENT_FILTERING", True):
        logger.info("🔍 Applying content quality filters...")
        filtered_reels = []
        skip_stats = defaultdict(int)

        for reel in unprocessed:
            # The third element (quality assessment) is not used here.
            is_worthy, skip_reason, _ = should_process_reel(reel)
            if is_worthy:
                filtered_reels.append(reel)
                continue

            skip_stats[skip_reason] += 1
            reel_id = str(reel.get("reel_id", reel.get("id", "")))
            if reel_id:
                progress_tracker.mark_processed(reel_id)

        logger.info(f"✅ {len(filtered_reels)} reels passed quality filters")
        logger.info(f"⏭️ {len(unprocessed) - len(filtered_reels)} reels skipped:")
        for reason, count in sorted(skip_stats.items(), key=lambda x: x[1], reverse=True):
            logger.info(f"   - {reason}: {count}")

        if not filtered_reels:
            logger.info("⚠️ No reels passed quality filters!")
            # Skipped reels were just marked as processed; flush here because
            # this early return bypasses the final save below.
            progress_tracker.flush()
            return
    else:
        logger.info("⏭️ Content filtering disabled - processing all reels")
        filtered_reels = unprocessed

    # Limit to MAX_REELS
    to_process = filtered_reels[:CONFIG["MAX_REELS"]]

    if len(to_process) < len(filtered_reels):
        logger.info(f"⚠️ Processing first {CONFIG['MAX_REELS']} reels (use MAX_REELS config to adjust)")

    # Process in batches
    # NOTE(review): results are only held in memory until the loop finishes,
    # while tracker state is checkpointed along the way. If process_batch marks
    # reels as processed, an interruption would lose their cards — confirm.
    all_results = []
    batch_size = CONFIG["BATCH_SIZE"]

    # Ceiling division: number of batches needed to cover to_process.
    total_batches = (len(to_process) - 1) // batch_size + 1

    for batch_num, offset in enumerate(range(0, len(to_process), batch_size), start=1):
        batch = to_process[offset:offset + batch_size]

        logger.info(f"\n📦 Batch {batch_num}/{total_batches} ({len(batch)} reels)")

        all_results.extend(process_batch(batch))

        # Intermediate save
        if batch_num % checkpoint_every == 0:
            logger.info("💾 Intermediate save...")
            _save_tracker_state()

    # Final save
    logger.info("\n💾 Final save...")
    _save_tracker_state()
    terminal_rejections.save()
    prompt_version_manager.save()
    confidence_calibrator.save()

    # Save routing metrics (best effort: a failure is logged, not raised)
    try:
        with open(ROUTING_METRICS_FILE, 'w') as f:
            json.dump(dict(ROUTING_METRICS.counts), f, indent=2)
        logger.info(f"💾 Routing metrics saved to: {ROUTING_METRICS_FILE}")
    except Exception as e:
        logger.error(f"⚠️ Failed to save routing metrics: {e}")

    # Generate output
    successful = [r for r in all_results if r.get("status") == "success"]

    all_cards = []
    for result in successful:
        all_cards.extend(result.get("cards", []))

    # Make sure the output directory exists so the run's results are not lost
    # to a missing folder at the very last step.
    out_dir = Path(CONFIG["OUT_DIR"])
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save cards
    output_file = out_dir / "anki_cards.json"
    with open(output_file, 'w') as f:
        json.dump(all_cards, f, indent=2)

    # Needs review
    needs_review = [
        r for r in successful
        if review_min_confidence <= r.get("confidence", 0) < review_max_confidence
    ]
    if needs_review:
        review_file = out_dir / "needs_review.json"
        with open(review_file, 'w') as f:
            json.dump(needs_review, f, indent=2)

    # Generate summary
    generate_summary(all_results, successful, all_cards, needs_review, start_time)

    logger.info("\n✅ PIPELINE COMPLETE")
    logger.info("=" * 60)

def generate_summary(all_results: List[Dict], successful: List[Dict], 
                    all_cards: List[Dict], needs_review: List[Dict], start_time: float):
    """Log the end-of-run report for one pipeline execution.

    Everything is emitted through ``logger.info``; nothing is returned or
    written to disk by this function.

    Args:
        all_results: One result dict per processed reel; ``status`` is read
            to count ``"error"`` and ``"rejected"`` outcomes.
        successful: The subset of ``all_results`` that succeeded; confidence,
            correctness, richness, topic class and completion state are read
            from these dicts.
        all_cards: Flat list of generated cards; ``type``, ``quality`` and
            ``priority`` are read from each card.
        needs_review: Reels in the medium-confidence band; only the count and
            truthiness are used.
        start_time: ``time.time()`` value captured when the run began.
    """
    elapsed = time.time() - start_time

    # Basic statistics
    total_processed = len(all_results)
    successful_count = len(successful)
    error_count = sum(1 for r in all_results if r.get("status") == "error")
    rejected_count = sum(1 for r in all_results if r.get("status") == "rejected")

    # Guard against a zero (or negative, after a clock adjustment) interval so
    # the report never dies on a division error.
    throughput = total_processed / elapsed if elapsed > 0 else 0.0

    # Card statistics
    total_cards = len(all_cards)
    basic_cards = sum(1 for c in all_cards if c.get("type") == "basic")
    cloze_cards = sum(1 for c in all_cards if c.get("type") == "cloze")
    tradeoff_cards = sum(1 for c in all_cards if c.get("type") == "tradeoff")

    total_duplicates = sum(r.get("duplicates_filtered", 0) for r in successful)

    # Quality statistics (all zero when no reel succeeded)
    if successful:
        card_divisor = max(total_cards, 1)
        reel_divisor = max(successful_count, 1)
        avg_quality = sum(c.get("quality", 0) for c in all_cards) / card_divisor
        avg_confidence = sum(r.get("confidence", 0) for r in successful) / reel_divisor
        avg_correctness = sum(r.get("correctness_score", 0) for r in successful) / reel_divisor
        avg_richness = sum(r.get("richness_score", 0) for r in successful) / reel_divisor
    else:
        avg_quality = avg_confidence = avg_correctness = avg_richness = 0

    # Priority distribution
    p0_cards = sum(1 for c in all_cards if c.get("priority") == "P0")
    p1_cards = sum(1 for c in all_cards if c.get("priority") == "P1")
    p2_cards = sum(1 for c in all_cards if c.get("priority") == "P2")

    # Confidence breakdown (the medium band is supplied via needs_review)
    high_confidence_count = sum(1 for r in successful if r.get("confidence", 0) >= 0.85)
    low_confidence_count = sum(1 for r in successful if r.get("confidence", 0) < 0.65)

    auto_approved_count = sum(1 for r in successful if r.get("auto_approved", False))

    # Topic class distribution
    topic_class_dist = defaultdict(int)
    for r in successful:
        topic_class_dist[r.get("topic_class", "unknown")] += 1

    # Completion states
    completion_states = defaultdict(int)
    for r in successful:
        completion_states[r.get("completion_state", "unknown")] += 1

    # Enrichment statistics: attempts over all reels, successes over the
    # successful ones only.
    enrichment_attempts = sum(1 for r in all_results if r.get("attempted_enrichment"))
    enrichment_successes = sum(1 for r in successful if r.get("attempted_enrichment"))

    # Print report
    logger.info("\n" + "=" * 60)
    logger.info("FINAL STATS")
    logger.info("=" * 60)
    logger.info(f"⏱️  Time: {elapsed:.1f}s")
    logger.info(f"🎬 Processed: {total_processed} reels")
    logger.info(f"✅ Success: {successful_count} reels")
    logger.info(f"❌ Errors: {error_count} reels")
    logger.info(f"🚫 Rejected: {rejected_count} reels")
    logger.info(f"🎴 Cards Generated: {total_cards}")
    logger.info(f"   Basic: {basic_cards}, Cloze: {cloze_cards}, Tradeoff: {tradeoff_cards}")
    logger.info(f"🔄 Duplicates filtered: {total_duplicates}")
    logger.info(f"⭐ Avg Quality: {avg_quality:.1f}/100")
    logger.info(f"📊 Avg Confidence: {avg_confidence:.2f}")
    logger.info(f"🚀 Throughput: {throughput:.2f} reels/s")
    logger.info("📊 QUALITY DIMENSIONS:")
    logger.info(f"   Correctness: {avg_correctness:.2f}/1.0")
    logger.info(f"   Richness: {avg_richness:.2f}/1.0")
    logger.info("🎯 COMPLETION STATES:")
    for state, count in sorted(completion_states.items()):
        pct = (count / successful_count * 100) if successful_count > 0 else 0
        logger.info(f"   {state}: {count} ({pct:.1f}%)")
    logger.info("📌 Priority Distribution:")
    logger.info(f"   P0 (≥85): {p0_cards} cards")
    logger.info(f"   P1 (70-84): {p1_cards} cards")
    logger.info(f"   P2 (<70): {p2_cards} cards")
    logger.info("📊 Confidence Breakdown:")
    logger.info(f"   High (≥0.85): {high_confidence_count} reels — Auto-approved ✅")
    logger.info(f"   Medium (0.65-0.84): {len(needs_review)} reels — Review needed ⚠️")
    logger.info(f"   Low (<0.65): {low_confidence_count} reels — Consider re-prompt 🔄")
    logger.info(f"   Auto-approved: {auto_approved_count} reels 🎯")
    logger.info("🧠 ADAPTIVE LEARNING STATS")
    logger.info("-" * 60)
    logger.info(f"🔄 Enrichment attempts: {enrichment_attempts}")
    logger.info(f"✅ Enrichment successes: {enrichment_successes}")
    if enrichment_attempts > 0:
        success_rate = (enrichment_successes / enrichment_attempts) * 100
        logger.info(f"📈 Success rate: {success_rate:.1f}%")

    # Get more detailed statistics (rejection memory is optional)
    if rejection_memory:
        mem_stats = rejection_memory.get_learning_velocity()
        logger.info("🧠 Rejection memory:")
        logger.info(f"   Concepts tracked: {mem_stats['total_concepts']}")
        logger.info(f"   Concepts learned: {mem_stats['concepts_learned']}")
        logger.info(f"   Concepts still learning: {mem_stats['concepts_learning']}")
        logger.info(f"   Avg attempts to success: {mem_stats['avg_attempts_until_success']:.1f}")

    term_stats = terminal_rejections.get_stats()
    if term_stats['total'] > 0:
        logger.info(f"🚫 Terminal Rejections: {term_stats['total']} reels")

    logger.info("📚 Topic Class Distribution:")
    for topic_class, count in sorted(topic_class_dist.items()):
        logger.info(f"   {topic_class}: {count} reels")

    # System statistics
    cache_stats = pipeline_cache.get_stats()
    cache_lookups = cache_stats['hits'] + cache_stats['misses']
    logger.info(f"💾 Cache: {cache_stats['hit_rate']:.1%} hit rate ({cache_stats['hits']}/{cache_lookups})")

    llm_stats = llm_interface.get_stats()
    logger.info(f"🤖 LLM: {llm_stats['total_requests']} requests, {llm_stats['avg_response_time']:.2f}s avg")

    logger.info("\n📁 Output Files:")
    logger.info(f"   Cards: {CONFIG['OUT_DIR']}/anki_cards.json")
    if needs_review:
        logger.info(f"   Review needed: {CONFIG['OUT_DIR']}/needs_review.json")
    logger.info(f"   Progress: {PROGRESS_FILE}")
    logger.info(f"   Cache: {CONFIG['CACHE_DIR']}")
    logger.info(f"   Metrics: {ROUTING_METRICS_FILE}")

    logger.info("\n✅ SUMMARY COMPLETE")
    logger.info("=" * 60)

# ============================================================
# EXECUTION GUARD
# ============================================================

def _save_state_best_effort(full: bool) -> list:
    """Persist pipeline state, continuing past individual save failures.

    Each component is saved in its own try/except so that one failing save
    cannot prevent the remaining ones from running.

    Args:
        full: When True, save every component plus the routing metrics (used
            on Ctrl-C). When False, save only the duplicate detector, the
            rejection memory and the prompt versions (used after an
            unexpected error).

    Returns:
        Labels of the components whose save raised. A routing-metrics failure
        is logged but not reported here, since that write is best effort.
    """
    # NOTE(review): the reduced set used after an unexpected error leaves out
    # progress, terminal rejections and the confidence calibrator, matching
    # the previous behavior. Confirm whether that omission is intentional.
    steps = [("duplicate detector", lambda: duplicate_detector.save_if_dirty(force=True))]
    if full:
        steps.append(("progress", lambda: progress_tracker.flush()))
    if rejection_memory:
        steps.append(("rejection memory", lambda: rejection_memory.save()))
    if full:
        steps.append(("terminal rejections", lambda: terminal_rejections.save()))
    steps.append(("prompt versions", lambda: prompt_version_manager.save()))
    if full:
        steps.append(("confidence calibrator", lambda: confidence_calibrator.save()))

    failed = []
    for label, save_step in steps:
        try:
            save_step()
        except Exception as save_err:
            failed.append(label)
            logger.error(f"⚠️ Failed to save {label}: {save_err}")

    if full:
        # Routing metrics are non-critical: report a failure, never raise.
        try:
            with open(ROUTING_METRICS_FILE, 'w') as f:
                json.dump(dict(ROUTING_METRICS.counts), f, indent=2)
        except Exception as metrics_err:
            logger.error(f"⚠️ Failed to save routing metrics: {metrics_err}")

    return failed


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        logger.info("\n\n⚠️  Interrupted - saving state...")
        failed_saves = _save_state_best_effort(full=True)

        if failed_saves:
            logger.error(f"State only partially saved (failed: {', '.join(failed_saves)}). Exiting.")
            sys.exit(1)

        logger.info("State saved. Exiting.")
        sys.exit(0)
    except Exception as e:
        # exc_info=True sends the traceback through the logging handlers, so
        # no separate traceback import is needed here.
        logger.error(f"\n\n❌ ERROR: {e}", exc_info=True)

        # Emergency save
        _save_state_best_effort(full=False)

        sys.exit(1)