# Monitoring LLM Evaluation Results

This section monitors the quality and relevance of the LLM evaluation results to detect potential issues such as:
- Inconsistent scoring patterns
- Evaluation drift over time
- Outlier scores that may indicate model issues

In [None]:
import os
import warnings
from datetime import datetime
from urllib.parse import quote_plus

import mlflow
import numpy as np
import pandas as pd
import pendulum
from evidently import Dataset
from evidently import Report
from evidently.presets import DataDriftPreset
from sqlalchemy import Boolean, Column, Float, Integer, String, DateTime
from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import sessionmaker

# Name of the MLflow experiment whose runs are monitored.
EXPERIMENT_NAME = "deck_royale"

# NOTE: the intent is to hide NumPy's divide-by-zero / invalid-value noise,
# but this filter silences *every* Python warning for the whole process.
warnings.filterwarnings('ignore')
np.seterr(all='ignore')  # Suppress numpy floating-point error reporting

# Database setup for Grafana.
# Credentials may be overridden through the environment so real secrets never
# have to live in the notebook; the defaults match the local development setup.
USER = os.environ.get("MONITORING_DB_USER", "admin")
PASSWORD = os.environ.get("MONITORING_DB_PASSWORD", "admin")
# Percent-encode both parts so characters such as '@' or ':' in a credential
# cannot corrupt the URI.
MONITORING_DB_URI = (
    f"postgresql+psycopg2://{quote_plus(USER)}:{quote_plus(PASSWORD)}"
    "@127.0.0.1:5432/monitoring_db"
)

# Create database table for LLM evaluation monitoring
# Shared declarative base: ORM models that subclass it are registered on
# Base.metadata, which create_monitoring_db() hands to create_all().
Base = declarative_base()

class LLMEvaluationTable(Base):
    """ORM mapping for the ``llm_evaluation_monitoring`` table.

    Each row is one snapshot of aggregate LLM evaluation metrics. The
    monitoring script in this file inserts one row per execution so the
    values can be charted in Grafana.
    """
    __tablename__ = "llm_evaluation_monitoring"
    
    # Surrogate primary key.
    id = Column(Integer, primary_key=True)
    # Time of the snapshot, stored as a float; the writer in this file passes
    # pendulum.now().timestamp().
    # NOTE(review): a DateTime column may be easier to chart, but changing the
    # type requires a migration of any existing table - confirm before changing.
    timestamp = Column(Float)
    # Mean of each MLflow ``metrics.eval_*`` column across the monitored runs.
    avg_overall_score = Column(Float)
    avg_defense_score = Column(Float)
    avg_attack_score = Column(Float)
    avg_synergy_score = Column(Float)
    avg_versatility_score = Column(Float)
    avg_difficulty_score = Column(Float)
    # Variance of the overall score, used as an instability indicator.
    score_variance = Column(Float)
    # Number of individual metric scores below the low-score threshold
    # (3.0 in get_llm_monitoring_metrics), summed over all eval metrics.
    low_score_count = Column(Integer)
    # Number of MLflow runs included in the snapshot.
    total_evaluations = Column(Integer)
    # Health label derived from the average overall score:
    # 'HEALTHY', 'CAUTION', 'CRITICAL' or 'UNKNOWN'.
    system_health_status = Column(String)

def create_monitoring_db():
    """Create the monitoring tables in the database at ``MONITORING_DB_URI``.

    Calls ``Base.metadata.create_all`` so every model registered on ``Base``
    gets its table created, then prints a confirmation line.

    Raises:
        Whatever SQLAlchemy raises when the database cannot be reached or the
        DDL fails; nothing is caught here.
    """
    engine = create_engine(MONITORING_DB_URI)
    try:
        Base.metadata.create_all(engine)
    finally:
        # The engine only lives for this call; release its pooled connections
        # instead of leaving them open until garbage collection.
        engine.dispose()
    print("✅ LLM Evaluation monitoring database created")

def _numeric_scores(runs_df, col_name):
    """Return column *col_name* of *runs_df* as a numeric Series without NaNs.

    Entries that cannot be parsed as numbers are coerced to NaN and dropped.
    A missing column yields an empty float Series.
    """
    if col_name not in runs_df.columns:
        return pd.Series(dtype=float)
    return pd.to_numeric(runs_df[col_name], errors='coerce').dropna()


def get_llm_monitoring_metrics(runs_df):
    """Aggregate monitoring metrics from a DataFrame of MLflow runs.

    Args:
        runs_df: DataFrame whose ``metrics.eval_*`` columns hold the
            evaluation scores (the shape returned by ``mlflow.search_runs``).
            Missing columns, missing values and non-numeric entries are
            tolerated and simply ignored.

    Returns:
        dict with:
          * ``avg_<metric>_score`` - mean of each ``metrics.<metric>`` column
            that has at least one usable value (key absent otherwise);
            a non-finite mean is reported as 0.0.
          * ``score_variance`` - sample variance of ``metrics.eval_overall``,
            or 0.0 with fewer than two values or a non-finite result.
          * ``low_score_count`` - number of individual scores below 3.0,
            summed over all eval metrics.
          * ``total_evaluations`` - number of rows in *runs_df*.
          * ``system_health_status`` - ``'HEALTHY'`` (avg overall >= 7.0),
            ``'CAUTION'`` (>= 5.0), ``'CRITICAL'`` (below 5.0) or
            ``'UNKNOWN'`` when no overall score is available at all.
    """
    eval_metrics = ['eval_overall', 'eval_defense', 'eval_attack', 'eval_synergy', 'eval_versatility', 'eval_difficulty']
    low_threshold = 3.0

    metrics = {}
    low_score_count = 0

    # Single pass per metric: the average and the low-score tally use the same
    # cleaned values, so both treat malformed data identically.
    for metric in eval_metrics:
        values = _numeric_scores(runs_df, f'metrics.{metric}')
        if values.empty:
            continue
        mean_val = float(values.mean())
        metrics[f'avg_{metric}_score'] = mean_val if np.isfinite(mean_val) else 0.0
        low_score_count += int((values < low_threshold).sum())

    # Variance of the overall score is the instability indicator; it is only
    # defined for two or more observations.
    overall_scores = _numeric_scores(runs_df, 'metrics.eval_overall')
    variance = float(overall_scores.var()) if len(overall_scores) > 1 else 0.0
    metrics['score_variance'] = variance if np.isfinite(variance) else 0.0

    metrics['low_score_count'] = low_score_count
    metrics['total_evaluations'] = len(runs_df)

    # No overall score means health cannot be judged; report that explicitly
    # instead of treating "no data" as an average of 0 (a false CRITICAL).
    avg_overall = metrics.get('avg_eval_overall_score')
    if avg_overall is None:
        status = 'UNKNOWN'
    elif avg_overall >= 7.0:
        status = 'HEALTHY'
    elif avg_overall >= 5.0:
        status = 'CAUTION'
    else:
        status = 'CRITICAL'
    metrics['system_health_status'] = status

    return metrics

# Monitoring run: make sure the monitoring table exists, pull the evaluation
# runs from MLflow, store one aggregate snapshot in PostgreSQL for Grafana and
# finally run an Evidently drift report on the evaluation scores.
try:
    create_monitoring_db()

    # Retrieve MLflow experiment data
    mlflow.set_tracking_uri("http://localhost:5001")
    experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

    # Create experiment if it doesn't exist
    if experiment is None:
        print(f"⚠️ Experiment '{EXPERIMENT_NAME}' not found. Creating new experiment...")
        experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
        experiment = mlflow.get_experiment(experiment_id)

    print(f"✅ Monitoring MLflow experiment: {experiment.name} (ID: {experiment.experiment_id})")
    # search_runs returns the newest runs first by default. Ask for
    # chronological order so the reference/current split used for drift
    # analysis below really is "older runs" vs "newer runs".
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["attributes.start_time ASC"],
    )

    if len(runs) > 0:
        print(f"🔍 Monitoring {len(runs)} evaluation runs...")

        # Extract monitoring metrics
        monitoring_metrics = get_llm_monitoring_metrics(runs)

        # Display current monitoring status
        print("\n📊 LLM Evaluation Monitoring Report:")
        print(f"   • Total evaluations: {monitoring_metrics['total_evaluations']}")
        print(f"   • Average overall score: {monitoring_metrics.get('avg_eval_overall_score', 0):.2f}")
        print(f"   • Score variance: {monitoring_metrics['score_variance']:.3f}")
        print(f"   • Low scores detected: {monitoring_metrics['low_score_count']}")
        print(f"   • System status: {monitoring_metrics['system_health_status']}")

        # Create monitoring record - convert NumPy types to Python types
        timestamp = pendulum.now().timestamp()
        monitoring_record = LLMEvaluationTable(
            timestamp=timestamp,
            avg_overall_score=float(monitoring_metrics.get('avg_eval_overall_score', 0)),
            avg_defense_score=float(monitoring_metrics.get('avg_eval_defense_score', 0)),
            avg_attack_score=float(monitoring_metrics.get('avg_eval_attack_score', 0)),
            avg_synergy_score=float(monitoring_metrics.get('avg_eval_synergy_score', 0)),
            avg_versatility_score=float(monitoring_metrics.get('avg_eval_versatility_score', 0)),
            avg_difficulty_score=float(monitoring_metrics.get('avg_eval_difficulty_score', 0)),
            score_variance=float(monitoring_metrics['score_variance']),
            low_score_count=int(monitoring_metrics['low_score_count']),
            total_evaluations=int(monitoring_metrics['total_evaluations']),
            system_health_status=str(monitoring_metrics['system_health_status']),
        )

        # Log metrics to database for Grafana
        engine = create_engine(MONITORING_DB_URI)
        try:
            Session = sessionmaker(bind=engine)
            # The context manager closes the session even when commit() fails.
            with Session() as session:
                session.add(monitoring_record)
                session.commit()
            # Never echo the password into the notebook output.
            safe_db_uri = engine.url.render_as_string(hide_password=True)
        finally:
            engine.dispose()

        print("✅ Monitoring metrics logged to database for Grafana visualization")
        print(f"   Database URI: {safe_db_uri}")
        print("   Table: llm_evaluation_monitoring")

        # Generate Evidently report for detailed analysis
        score_columns = ['metrics.eval_overall', 'metrics.eval_defense', 'metrics.eval_attack',
                         'metrics.eval_synergy', 'metrics.eval_versatility', 'metrics.eval_difficulty']
        missing_columns = [col for col in score_columns if col not in runs.columns]

        if missing_columns:
            # Selecting absent columns would raise KeyError and be reported as
            # a setup failure even though the metrics were already stored.
            print(f"⚠️ Skipping drift analysis; missing metric columns: {missing_columns}")
        elif len(runs) >= 2:
            # Create a simple dataset for drift analysis on evaluation scores
            eval_data = runs[score_columns].dropna()

            if len(eval_data) >= 2:
                # Split data into reference (older half) and current (newer half)
                split_idx = len(eval_data) // 2
                reference_data = eval_data.iloc[:split_idx]
                current_data = eval_data.iloc[split_idx:]

                # Create Evidently datasets
                reference_dataset = Dataset.from_pandas(reference_data)
                current_dataset = Dataset.from_pandas(current_data)

                # Create report to detect evaluation drift
                report = Report([DataDriftPreset()])
                report.run(current_data=current_dataset, reference_data=reference_dataset)

                print("\n📈 Evidently Report Generated:")
                print(f"   • Reference evaluations: {len(reference_data)}")
                print(f"   • Current evaluations: {len(current_data)}")
                print("   • Drift analysis completed")
    else:
        print("❌ No evaluation runs found in MLflow experiment")

except Exception as e:
    # Top-level boundary of the notebook cell: report and continue rather than
    # dumping a traceback. The failure may come from either backing service.
    print(f"❌ Error setting up monitoring: {e}")
    print("Make sure PostgreSQL and the MLflow tracking server are running and accessible")

✅ LLM Evaluation monitoring database created
✅ Monitoring MLflow experiment: deck_royale (ID: 591958987431704571)
🔍 Monitoring 6 evaluation runs...

📊 LLM Evaluation Monitoring Report:
   • Total evaluations: 6
   • Average overall score: 6.83
   • Score variance: 0.167
   • Low scores detected: 0
   • System status: CAUTION
✅ Monitoring metrics logged to database for Grafana visualization
   Database URI: postgresql+psycopg2://admin:admin@127.0.0.1:5432/monitoring_db
   Table: llm_evaluation_monitoring

📈 Evidently Report Generated:
   • Reference evaluations: 3
   • Current evaluations: 3
   • Drift analysis completed
