# MLflow Experiment Analysis

This notebook helps you analyze your MLflow experiments and visualize model performance.

In [None]:
import mlflow
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Point the MLflow client at the "../mlruns" store (the path is resolved
# relative to the current working directory) before any query is issued.
TRACKING_URI = "../mlruns"
mlflow.set_tracking_uri(TRACKING_URI)

print("✓ Imports successful")

## 1. Load Experiments

In [None]:
# Query the tracking store for its experiments and print a header followed by
# one "name (ID: ...)" line per experiment.
experiments = mlflow.search_experiments()

listing = ["Available Experiments:"]
listing.extend(
    f"  - {exp.name} (ID: {exp.experiment_id})" for exp in experiments
)
print("\n".join(listing))

In [None]:
# Name of the experiment to analyze — edit this to target a different one.
experiment_name = "model_validation"

# A falsy lookup result is treated as "no such experiment"; the run-loading
# cell that follows checks the same `experiment` variable.
experiment = mlflow.get_experiment_by_name(experiment_name)

if not experiment:
    print(f"✗ Experiment '{experiment_name}' not found")
else:
    print(f"✓ Loaded experiment: {experiment_name}")
    print(f"  Experiment ID: {experiment.experiment_id}")

## 2. Load Runs

In [None]:
# Fetch every run of the selected experiment, newest first.
#
# `runs` is always bound after this cell: when no experiment was loaded it is
# an empty DataFrame, so the analysis cells below (which all read `runs`)
# fall through to their "no data" branches instead of raising NameError.
if experiment:
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"]
    )

    print(f"Total runs: {len(runs)}")
    print(f"\nColumns: {list(runs.columns)}")
else:
    runs = pd.DataFrame()
    print("No experiment loaded; continuing with an empty runs table")

# Preview the first few runs. This must be the last top-level expression of
# the cell: a notebook only renders the value of the final expression, so a
# bare `runs.head()` nested inside the `if` body would display nothing.
runs.head()

## 3. Summary Statistics

In [None]:
# Every column whose name carries the "metrics." prefix is treated as a metric.
metric_cols = [name for name in runs.columns if name.startswith('metrics.')]

# Print descriptive statistics for those columns, or say that none exist.
if not metric_cols:
    print("No metrics found")
else:
    print("Metric Summary:\n")
    metric_frame = runs[metric_cols]
    print(metric_frame.describe())

## 4. Visualizations

In [None]:
# Histogram of the per-run latency metric; skipped with a message when the
# column is absent from the runs table.
if 'metrics.latency' not in runs.columns:
    print("No latency data available")
else:
    histogram_options = dict(
        x='metrics.latency',
        nbins=30,
        title='Response Latency Distribution',
        labels={'metrics.latency': 'Latency (seconds)'},
        template='plotly_white',
    )
    fig = px.histogram(runs, **histogram_options)
    fig.show()

In [None]:
# Quality scores over time.
#
# Only the quality metric is required. The model (colour) and temperature
# (hover) parameters are optional extras: Plotly Express raises when asked to
# use a column the frame does not contain, so each one is passed only when it
# is actually present in the runs table.
if 'metrics.quality_score' in runs.columns:
    color_col = 'params.model' if 'params.model' in runs.columns else None
    hover_cols = [col for col in ['params.temperature'] if col in runs.columns]

    fig = px.scatter(
        runs,
        x='start_time',
        y='metrics.quality_score',
        color=color_col,
        title='Quality Score Over Time',
        labels={'metrics.quality_score': 'Quality Score', 'start_time': 'Time'},
        template='plotly_white',
        hover_data=hover_cols or None
    )
    fig.update_traces(marker=dict(size=10))
    fig.show()
else:
    print("No quality score data available")

In [None]:
# Side-by-side box plots for whichever of the three score metrics exist in
# the runs table.
metrics_to_plot = ['metrics.coherence_score', 'metrics.relevance_score', 'metrics.safety_score']
available_metrics = [m for m in metrics_to_plot if m in runs.columns]

if not available_metrics:
    print("No quality metrics available")
else:
    # One box trace per metric, labelled with a prettified column name
    # (prefix removed, underscores to spaces, title-cased).
    box_traces = [
        go.Box(
            y=runs[metric],
            name=metric.replace('metrics.', '').replace('_', ' ').title(),
        )
        for metric in available_metrics
    ]

    fig = go.Figure(data=box_traces)
    fig.update_layout(
        title='Quality Metrics Distribution',
        yaxis_title='Score',
        template='plotly_white',
        showlegend=True
    )
    fig.show()

In [None]:
# Latency vs quality scatter.
#
# Latency and quality are required; token count (marker size), temperature
# (colour) and model (hover) are optional and only used when the matching
# column exists, so a run set that never logged them still gets a plot.
if 'metrics.latency' in runs.columns and 'metrics.quality_score' in runs.columns:
    plot_data = runs

    size_col = None
    if 'metrics.total_tokens' in runs.columns:
        size_col = 'metrics.total_tokens'
        # Marker sizes must be real numbers; runs that did not log a token
        # count are left out of this plot rather than breaking it.
        plot_data = runs.dropna(subset=[size_col])
        skipped = len(runs) - len(plot_data)
        if skipped:
            print(f"Note: {skipped} run(s) without total_tokens omitted from plot")

    color_col = 'params.temperature' if 'params.temperature' in runs.columns else None
    hover_cols = [col for col in ['params.model'] if col in runs.columns]

    fig = px.scatter(
        plot_data,
        x='metrics.latency',
        y='metrics.quality_score',
        color=color_col,
        size=size_col,
        title='Latency vs Quality Score',
        labels={
            'metrics.latency': 'Latency (seconds)',
            'metrics.quality_score': 'Quality Score',
            'params.temperature': 'Temperature'
        },
        template='plotly_white',
        hover_data=hover_cols or None
    )
    fig.show()
else:
    print("Insufficient data for latency vs quality plot")

## 5. Model Comparison

In [None]:
# Compare models if multiple models were tested.
#
# Mean and standard deviation are reported per model for each comparison
# metric that exists in the runs table. Aggregating a column that is not
# there raises KeyError, so absent metrics are skipped.
if 'params.model' in runs.columns:
    comparison_metrics = [
        'metrics.latency',
        'metrics.quality_score',
        'metrics.tokens_per_second',
    ]
    present_metrics = [m for m in comparison_metrics if m in runs.columns]

    if present_metrics:
        model_comparison = runs.groupby('params.model').agg(
            {metric: ['mean', 'std'] for metric in present_metrics}
        ).round(3)

        print("Model Comparison:\n")
        print(model_comparison)
    else:
        print("No comparison metrics available")

In [None]:
# Model performance radar chart.
#
# Each axis is the per-model mean of one metric. "Speed" is derived from
# latency as 1 - latency/10 and clipped to [0, 1], because the radial axis is
# fixed to that range and a mean latency above 10 would otherwise go negative.
# Axes whose metric column is missing are left out instead of raising KeyError.
radar_axes = {
    'Quality': 'metrics.quality_score',
    'Coherence': 'metrics.coherence_score',
    'Relevance': 'metrics.relevance_score',
    'Speed': 'metrics.latency',
}

# Missing model names are not counted as a model of their own.
if 'params.model' in runs.columns and runs['params.model'].nunique() > 1:
    available_axes = {
        label: col for label, col in radar_axes.items() if col in runs.columns
    }

    if available_axes:
        models = runs['params.model'].dropna().unique()
        categories = list(available_axes)

        # sort=False keeps models in order of first appearance.
        per_model = runs.groupby('params.model', sort=False)[
            list(available_axes.values())
        ].mean()

        if 'Speed' in available_axes:
            # Normalized speed score; clip() leaves NaN means as NaN.
            per_model['metrics.latency'] = (
                1 - per_model['metrics.latency'] / 10
            ).clip(0, 1)

        fig = go.Figure()

        for model in models:
            model_means = per_model.loc[model]
            values = [model_means[col] for col in available_axes.values()]

            fig.add_trace(go.Scatterpolar(
                r=values,
                theta=categories,
                fill='toself',
                name=model
            ))

        fig.update_layout(
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 1]
                )
            ),
            title='Model Performance Comparison',
            template='plotly_white'
        )
        fig.show()
    else:
        print("No metrics available for model radar chart")

## 6. Temperature Analysis

In [None]:
# Effect of temperature on quality.
#
# Quality statistics are always reported per temperature value. Mean latency
# is added only when that metric exists, since aggregating a missing column
# raises KeyError.
if 'params.temperature' in runs.columns and 'metrics.quality_score' in runs.columns:
    temp_aggregations = {'metrics.quality_score': ['mean', 'std', 'count']}
    if 'metrics.latency' in runs.columns:
        temp_aggregations['metrics.latency'] = 'mean'

    temp_analysis = runs.groupby('params.temperature').agg(temp_aggregations).round(3)

    print("Temperature Effect:\n")
    print(temp_analysis)

## 7. Best Performing Runs

In [None]:
# Top 10 runs by quality score.
#
# The report shows the columns listed below, restricted to those that exist
# in the runs table; selecting a missing column would raise KeyError.
if 'metrics.quality_score' in runs.columns:
    report_columns = [
        'run_id',
        'params.model',
        'params.temperature',
        'metrics.quality_score',
        'metrics.latency',
        'metrics.tokens_per_second'
    ]
    present_columns = [col for col in report_columns if col in runs.columns]

    top_runs = runs.nlargest(10, 'metrics.quality_score')[present_columns]

    print("Top 10 Runs by Quality:\n")
    print(top_runs)

## 8. Export Results

In [None]:
# Write the whole runs table to a CSV file, leaving the DataFrame's row index
# out of the output, then confirm where it was written.
output_file = '../experiment_results.csv'
runs.to_csv(path_or_buf=output_file, index=False)
print(f"✓ Results exported to: {output_file}")

## 9. Custom Analysis

Add your own analysis here!

In [None]:
# Your custom analysis code here
