# =============================================================================
# EVALUATE BENCHMARK NOTEBOOK
# =============================================================================
# Purpose:
#     Load the baseline model, the trained agent (or its saved best action), and the test data.
#     Apply the baseline, manual, and agent-selected optimizations.
#     Run a comprehensive benchmark and visualize the results.
# =============================================================================

# Import Configuration and Libraries

In [None]:
import os
import warnings
import logging
import json

# Suppress warnings
os.environ["GYM_DISABLE_WARNINGS"] = "true"
warnings.filterwarnings("ignore", module="gymnasium")
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("gymnasium").setLevel(logging.ERROR)
logging.getLogger("stable_baselines3").setLevel(logging.ERROR)


import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import gymnasium as gym # Might be needed if loading agent requires env instance
from gymnasium import spaces # Might be needed if loading agent requires env instance
import copy
from sklearn.metrics import accuracy_score
from stable_baselines3 import PPO # Needed if loading agent directly
from codecarbon import EmissionsTracker
from ptflops import get_model_complexity_info
import torch_pruning as tp
import matplotlib.pyplot as plt
import seaborn as sns
from builtins import print as builtin_print # To avoid conflict

print("Libraries imported.")

# Configuration Class

In [None]:
class Config:
    """Static settings read by the evaluation cells (paths, shapes, batch sizes)."""

    # --- Input sequence layout ---
    SEQUENCE_LENGTH = 30
    INPUT_DIM = 4

    # --- Network hyper-parameters (must match the saved baseline) ---
    HIDDEN_DIM = 256
    N_LAYERS = 2
    OUTPUT_DIM = 1
    DROPOUT = 0.2

    # --- Benchmark settings ---
    EVAL_BATCH_SIZE = 64
    CODECARBON_BATCHES = 10

    # --- Artifacts loaded from disk ---
    PROCESSED_DATA_PATH = '/kaggle/working/processed_data.pt'
    BASELINE_MODEL_PATH = '/kaggle/working/baseline_model.pth'
    AGENT_SAVE_PATH = "/kaggle/working/sustainable_ai_agent_ppo.zip"
    BEST_ACTION_PATH = "/kaggle/working/best_action.json"

    # --- Device: CUDA when available, otherwise CPU ---
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Echo the resolved device and artifact paths so a run's log records which inputs it used.
print(f"Device for potential agent loading: {Config.DEVICE}")
print(f"Loading data from: {Config.PROCESSED_DATA_PATH}")
print(f"Loading baseline model from: {Config.BASELINE_MODEL_PATH}")
print(f"Loading agent from: {Config.AGENT_SAVE_PATH}")
print(f"Loading best action from: {Config.BEST_ACTION_PATH}")

# Helper Functions and Model Definition
# (Copied again for standalone execution)

In [None]:
def count_parameters(model):
    """Return the total number of elements across parameters that require gradients."""
    total = 0
    for param in model.parameters():
        # Parameters with requires_grad=False are excluded from the count.
        if param.requires_grad:
            total += param.numel()
    return total

def evaluate_model(model, loader, codecarbon_batches=10):
    """Measure accuracy, energy/CO2, parameter count and FLOPs of ``model`` on CPU.

    The model is deep-copied and the copy is moved to the CPU and put in eval
    mode, so the instance passed in is never modified.

    Args:
        model: Module whose forward pass returns one score per sample; scores
            above 0.5 are counted as the positive class.
        loader: Iterable yielding ``(inputs, labels)`` batches. It is iterated
            once fully for accuracy and once partially for the energy probe.
        codecarbon_batches: Number of leading batches replayed while the
            CodeCarbon tracker is running.

    Returns:
        dict with keys ``accuracy``, ``energy_kwh``, ``co2_eq_kg``, ``flops``
        and ``params``. Energy/CO2 stay 0 when CodeCarbon fails or records
        nothing; FLOPs are 0 when ptflops cannot analyse the model.
    """
    model_cpu = copy.deepcopy(model).cpu()
    device = torch.device("cpu")
    model_cpu.eval()  # Ensure model is in evaluation mode

    # Accuracy
    y_true, y_pred = [], []
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model_cpu(inputs.to(device))
            preds = (outputs > 0.5).float()
            y_pred.extend(preds.cpu().numpy())
            y_true.extend(labels.cpu().numpy())
    accuracy = accuracy_score(y_true, y_pred)

    # Energy and CO2 with CodeCarbon (best effort: failures only produce a warning)
    energy_kwh = 0
    co2_eq_kg = 0
    try:
        tracker = EmissionsTracker(log_level="error", output_dir="/kaggle/working/", tracking_mode="process")
        tracker.start()
        try:
            with torch.no_grad():
                for i, (inputs, _) in enumerate(loader):
                    if i >= codecarbon_batches:
                        break
                    model_cpu(inputs.to(device))
        finally:
            # Stop the tracker even when a forward pass raises, so a failed
            # measurement never leaves a tracker running behind it.
            tracker.stop()
        emissions_data = tracker.final_emissions_data
        if emissions_data:
            energy_kwh = emissions_data.energy_consumed or 0
            co2_eq_kg = emissions_data.emissions or 0
        else:
            builtin_print("Warning: CodeCarbon tracker did not record final emissions data.")
    except Exception as e:
        builtin_print(f"Warning: CodeCarbon measurement failed - {e}")

    # Params
    params = count_parameters(model_cpu)

    # FLOPs with ptflops
    flops = 0
    try:
        # Create a dummy input on CPU for ptflops
        dummy_input = torch.randn(1, Config.SEQUENCE_LENGTH, Config.INPUT_DIM).to(device)
        macs, _ = get_model_complexity_info(
            model_cpu, (Config.SEQUENCE_LENGTH, Config.INPUT_DIM),
            input_constructor=lambda x: {'x': dummy_input},  # Feed the tensor as the forward() keyword 'x'
            as_strings=False, print_per_layer_stat=False, verbose=False)
        # One multiply-accumulate is counted as two floating point operations.
        flops = macs * 2
    except (KeyError, AttributeError, RuntimeError, TypeError):
        # Deliberately silent: when ptflops cannot analyse the model
        # (presumably the quantized variants) FLOPs are reported as 0.
        flops = 0

    return {
        "accuracy": accuracy, "energy_kwh": energy_kwh, "co2_eq_kg": co2_eq_kg,
        "flops": flops, "params": params,
    }

def apply_l1_pruning(model, amount):
    """Return a pruned CPU copy of ``model`` using L1 magnitude importance.

    GRU modules are passed to the pruner as ignored layers. If pruning raises,
    the error is printed and the (unpruned) copy is returned instead.
    """
    pruned = copy.deepcopy(model).cpu()
    pruned.eval()

    # Collect every GRU module so the pruner leaves it untouched.
    protected = [module for module in pruned.modules() if isinstance(module, nn.GRU)]
    sample_batch = torch.randn(1, Config.SEQUENCE_LENGTH, Config.INPUT_DIM)

    try:
        pruner_options = dict(
            example_inputs=sample_batch,
            importance=tp.importance.MagnitudeImportance(p=1),
            pruning_ratio=amount,
            ignored_layers=protected,
        )
        tp.pruner.MagnitudePruner(pruned, **pruner_options).step()
    except Exception as err:
        # Best effort: report the problem and fall through with the unpruned copy.
        builtin_print(f"Error during pruning: {err}")
    return pruned

def apply_dynamic_quantization(model):
    """Return a dynamically quantized (qint8) CPU copy of ``model``.

    GRU and Linear modules are targeted. If quantization raises, the error is
    printed and a plain CPU copy of the input model is returned instead.
    """
    try:
        cpu_copy = copy.deepcopy(model).cpu()
        cpu_copy.eval()
        target_layers = {nn.GRU, nn.Linear}
        return torch.quantization.quantize_dynamic(
            cpu_copy, target_layers, dtype=torch.qint8
        )
    except Exception as err:
        builtin_print(f"Error during dynamic quantization: {err}")
        # Fall back to an untouched copy of the caller's model.
        return copy.deepcopy(model).cpu()


class WeatherGRU(nn.Module):
    """Batch-first GRU followed by a linear head and a sigmoid.

    The prediction is computed from the GRU output at the last time step only.
    """

    def __init__(self, input_dim, hidden_dim, n_layers, output_dim, dropout):
        super().__init__()
        # Dropout is only handed to the GRU when there is more than one layer.
        gru_dropout = dropout if n_layers > 1 else 0
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            dropout=gru_dropout,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map ``x`` to sigmoid scores taken from the final time step."""
        step_outputs, _ = self.gru(x)
        final_step = step_outputs[:, -1, :]
        logits = self.fc(final_step)
        return self.sigmoid(logits)

# Progress message emitted once the definitions above have been executed.
print("Helper functions and WeatherGRU class defined.")

# Load Baseline Model and Test Data

In [None]:
# Load processed test data.
# map_location pins the tensors to the CPU so a file saved from a GPU session
# still loads on a CPU-only machine (the evaluation itself runs on CPU).
# NOTE(review): torch.load unpickles its input; only load artifacts produced by
# the companion notebooks, never files from an untrusted source.
try:
    processed_data = torch.load(Config.PROCESSED_DATA_PATH, map_location=torch.device('cpu'))
    X_test_tensor = processed_data['X_test']
    y_test_tensor = processed_data['y_test']
    print(f"Processed test data loaded from {Config.PROCESSED_DATA_PATH}")
except FileNotFoundError:
    print(f"Error: Processed data file not found at {Config.PROCESSED_DATA_PATH}. Run train_baseline.ipynb first.")
    raise
except Exception as e:
    print(f"Error loading processed data: {e}")
    raise

# Create Test DataLoader (unshuffled so every evaluation sees the same batch order)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=Config.EVAL_BATCH_SIZE, shuffle=False)
print("Test DataLoader created.")

# Initialize the architecture, then load the trained baseline weights into it
baseline_model = WeatherGRU(
    input_dim=Config.INPUT_DIM,
    hidden_dim=Config.HIDDEN_DIM,
    n_layers=Config.N_LAYERS,
    output_dim=Config.OUTPUT_DIM,
    dropout=Config.DROPOUT
)
try:
    baseline_model.load_state_dict(torch.load(Config.BASELINE_MODEL_PATH, map_location=torch.device('cpu')))
    baseline_model.eval()
    print(f"Baseline model state loaded from {Config.BASELINE_MODEL_PATH}")
except FileNotFoundError:
    print(f"Error: Baseline model file not found at {Config.BASELINE_MODEL_PATH}. Run train_baseline.ipynb first.")
    raise
except Exception as e:
    print(f"Error loading baseline model state: {e}")
    raise

# Keep a clean copy on CPU. Module.cpu() returns the same object, so an explicit
# deepcopy is required for this to be an independent copy of baseline_model.
baseline_model_cpu = copy.deepcopy(baseline_model).cpu()

# Determine Best Action from Agent

In [None]:
# Result of this cell: a dict with 'pruning_amount' and 'quantized', or None
# when neither the JSON file nor the saved agent could provide one.
best_solution_info = None

# Try loading from JSON first (more efficient)
try:
    with open(Config.BEST_ACTION_PATH, 'r') as f:
        loaded_action_info = json.load(f)
    # Validate keys needed for benchmark function
    if (isinstance(loaded_action_info, dict)
            and 'pruning_amount' in loaded_action_info
            and 'quantized' in loaded_action_info):
        print(f"Best action info loaded from {Config.BEST_ACTION_PATH}:")
        print(f"  Pruning: {loaded_action_info['pruning_amount']*100:.0f}%")
        print(f"  Quantized: {loaded_action_info['quantized']}")
        # Commit only after the values were formatted successfully, so a
        # malformed entry can never leak into the benchmark and the agent
        # fallback below still runs.
        best_solution_info = loaded_action_info
    else:
        print(f"JSON file {Config.BEST_ACTION_PATH} does not contain required keys ('pruning_amount', 'quantized').")
except FileNotFoundError:
    print(f"Info: Best action JSON file not found at {Config.BEST_ACTION_PATH}. Will try loading agent.")
except Exception as e:
    print(f"Error loading best action JSON: {e}. Will try loading agent.")
    best_solution_info = None  # Make sure the fallback below is taken

# If JSON loading failed or file not found, load agent and predict
if best_solution_info is None:
    print(f"\n--- Loading trained agent from {Config.AGENT_SAVE_PATH} to predict best action ---")
    try:
        # We need a dummy env instance to load the agent if not passed explicitly.
        # NOTE(review): the pruning levels and spaces below are assumed to mirror
        # the training environment, which is not visible in this notebook.
        dummy_pruning_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
        dummy_action_space = spaces.Discrete(len(dummy_pruning_levels) * 2)
        # Bounds are created as float32 to match the declared dtype of the Box.
        dummy_obs_space = spaces.Box(
            low=np.array([0.0, -1.0, 0.0, 0.0], dtype=np.float32),
            high=np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float32),
            dtype=np.float32,
        )

        # Minimal mock environment class just for loading if SustainableAIAgentEnv not defined
        class MockEnv(gym.Env):
            def __init__(self):
                super().__init__()
                self.action_space = dummy_action_space
                self.observation_space = dummy_obs_space

            def step(self, action): pass  # Never stepped here; only the spaces and reset() are used

            def reset(self, seed=None, options=None):
                super().reset(seed=seed)
                # Return a plausible initial state matching the space
                return np.array([0.6, 0.0, 0.0, 0.0], dtype=np.float32), {}

            def render(self): pass

            def close(self): pass

        mock_env_instance = MockEnv()

        # Load agent - ensure device matches where it was trained or use 'auto'/'cpu'
        agent = PPO.load(Config.AGENT_SAVE_PATH, env=mock_env_instance, device='cpu')  # Load to CPU for prediction
        print("Agent loaded successfully.")

        # Get initial observation for prediction (use mock env reset)
        obs, _ = mock_env_instance.reset()

        # Predict the best action
        action, _ = agent.predict(obs, deterministic=True)
        action_item = action.item()
        print(f"Agent predicted best action: {action_item}")

        # Decode action: the index within the level list selects the pruning
        # amount; actions in the upper half additionally enable quantization.
        pruning_amount = dummy_pruning_levels[action_item % len(dummy_pruning_levels)]
        apply_quant = (action_item >= len(dummy_pruning_levels))

        best_solution_info = {
            'pruning_amount': pruning_amount,
            'quantized': apply_quant
        }
        print(f"  Decoded Action -> Pruning: {pruning_amount*100:.0f}%, Quantized: {apply_quant}")

    except FileNotFoundError:
        print(f"Error: Agent file not found at {Config.AGENT_SAVE_PATH}. Cannot evaluate agent solution.")
    except Exception as e:
        print(f"Error loading or predicting from agent: {e}. Cannot evaluate agent solution.")

# Run Full Benchmark Comparison

In [None]:
def run_full_benchmark(baseline_model_cpu, test_loader, best_action_info):
    """Benchmark the baseline, two manual optimizations and the agent's choice.

    Each variant is built from ``baseline_model_cpu`` and evaluated on
    ``test_loader``. A variant whose construction or evaluation raises is
    recorded as an empty result and therefore shows up as zeros.

    Returns:
        DataFrame indexed by variant name with the columns ``accuracy``,
        ``params_m``, ``flops_g``, ``energy_mwh`` and ``co2_eq_kg``.
    """
    results = {}
    print("\n--- Starting Full Benchmark Evaluation ---")

    def _measure(candidate):
        # Single place where the evaluation settings are applied.
        return evaluate_model(candidate, test_loader, Config.CODECARBON_BATCHES)

    # (banner, result key, model factory, success message, failure prefix)
    fixed_stages = (
        ("--- 1. Evaluating Baseline Model ---", 'Baseline',
         lambda: copy.deepcopy(baseline_model_cpu),  # fresh copy, no side effects
         "  Baseline evaluation complete.", "  Error evaluating baseline: "),
        ("--- 2. Evaluating Manual Pruning (50%) ---", 'Manual Pruning (50%)',
         lambda: apply_l1_pruning(baseline_model_cpu, 0.5),
         "  Manual pruning evaluation complete.", "  Error evaluating manual pruning: "),
        ("--- 3. Evaluating Manual Quantization ---", 'Manual Quantization',
         lambda: apply_dynamic_quantization(baseline_model_cpu),
         "  Manual quantization evaluation complete.", "  Error evaluating manual quantization: "),
    )
    for banner, key, build_model, done_msg, fail_prefix in fixed_stages:
        print(banner)
        try:
            results[key] = _measure(build_model())
            print(done_msg)
        except Exception as exc:
            print(f"{fail_prefix}{exc}")
            results[key] = {}

    # 4. Agent-selected strategy: optional pruning followed by optional quantization
    print("--- 4. Evaluating Agent Optimized Model ---")
    if not (best_action_info and 'pruning_amount' in best_action_info and 'quantized' in best_action_info):
        print("  Skipping agent evaluation: Best action info not available or invalid.")
        results['Agent Optimized'] = {}
    else:
        pruning_amount_agent = best_action_info['pruning_amount']
        apply_quant_agent = best_action_info['quantized']
        print(f"  Applying Agent's strategy: Pruning {pruning_amount_agent*100:.0f}%, Quantized: {apply_quant_agent}")
        try:
            candidate = copy.deepcopy(baseline_model_cpu)  # Start fresh
            if pruning_amount_agent > 0:
                candidate = apply_l1_pruning(candidate, pruning_amount_agent)
            if apply_quant_agent:
                candidate = apply_dynamic_quantization(candidate)
            results['Agent Optimized'] = _measure(candidate)
            print("  Agent optimized evaluation complete.")
        except Exception as exc:
            print(f"  Error evaluating agent optimized model: {exc}")
            results['Agent Optimized'] = {}

    # --- Format Results ---
    # One row per variant; metrics missing because of errors become 0.
    df_results = pd.DataFrame(results).T.fillna(0)

    # Derived display units: (new column, source column, conversion)
    unit_conversions = (
        ('flops_g', 'flops', lambda col: col / 1e9),
        ('params_m', 'params', lambda col: col / 1e6),
        ('energy_mwh', 'energy_kwh', lambda col: col * 1000),
    )
    for target, source, convert in unit_conversions:
        df_results[target] = convert(df_results[source]).round(6) if source in df_results else 0

    # Guarantee every reported column exists and is numeric.
    final_columns = ['accuracy', 'params_m', 'flops_g', 'energy_mwh', 'co2_eq_kg']
    for col in final_columns:
        if col not in df_results:
            df_results[col] = 0.0
        df_results[col] = pd.to_numeric(df_results[col], errors='coerce').fillna(0.0)

    print("\n--- Benchmark Evaluation Finished ---")
    return df_results[final_columns]

# Execute the benchmark
benchmark_results = run_full_benchmark(baseline_model_cpu, test_loader, best_solution_info)

# Display the results table
print("\n\n--- FINAL COMPREHENSIVE BENCHMARK RESULTS ---")
# Use markdown format for better display in Jupyter/Kaggle
try:
    print(benchmark_results.to_markdown())
except ImportError:
    # to_markdown relies on the optional 'tabulate' package; fall back to the
    # plain-text table so the results are still shown when it is not installed.
    print(benchmark_results.to_string())

# Visualize Results

In [None]:
print("\n--- Generating Visualizations ---")

sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Comprehensive Model Performance Comparison', fontsize=18, weight='bold')

# One panel per metric: (column, palette, panel title, y-axis label).
# Panels are assigned to the 2x2 grid in row-major order.
panel_specs = [
    ('accuracy', 'Greens_r', 'Model Accuracy', 'Accuracy Score'),
    ('params_m', 'Blues_r', 'Model Size (Millions of Parameters)', 'Parameters (M)'),
    ('flops_g', 'Oranges_r', 'Computational Cost (GFLOPs)', 'GFLOPs'),
    ('energy_mwh', 'Reds_r', 'Energy Consumption (mWh)', 'Energy (mWh)'),
]
metrics_plot_info = [
    (spec[0], panel_axis, spec[1], spec[2], spec[3])
    for panel_axis, spec in zip(axes.flat, panel_specs)
]

for metric, ax, palette, title, ylabel in metrics_plot_info:
    if metric not in benchmark_results:
        continue
    try:
        sns.barplot(x=benchmark_results.index, y=metric, data=benchmark_results, ax=ax, palette=palette)
        ax.set_title(title, fontsize=12, weight='bold')
        ax.set_ylabel(ylabel)
        ax.tick_params(axis='x', rotation=15)

        # Only the accuracy panel gets numeric labels above its bars.
        if metric == 'accuracy':
            for index, value in enumerate(benchmark_results[metric]):
                ax.text(index, value, f'{value:.4f}', ha='center', va='bottom', fontsize=9)

        # Choose y-limits that stay sensible when all values coincide or are zero.
        highest = benchmark_results[metric].max()
        lowest = benchmark_results[metric].min()
        if highest > lowest:
            ax.set_ylim(bottom=min(0, lowest * 1.1), top=highest * 1.1)
        elif highest > 0:
            # All values identical and positive
            ax.set_ylim(bottom=0, top=highest * 1.2)
        else:
            # All values identical and zero or negative
            ax.set_ylim(bottom=lowest * 1.1 if lowest < 0 else -0.1, top=0.1)
    except Exception as e:
        builtin_print(f"Warning: Could not plot {metric}: {e}")
        ax.set_title(f"{title}\n(Plotting Error)", fontsize=12, weight='bold')


plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room for the figure title
plt.show()

# Trade-off scatter plot: accuracy against computational cost (GFLOPs)
print("\n--- Generating Pareto Plot (Accuracy vs. FLOPs) ---")
plt.figure(figsize=(10, 7))
if not ('flops_g' in benchmark_results and 'accuracy' in benchmark_results):
    builtin_print("Skipping Pareto plot: 'flops_g' or 'accuracy' column missing.")
else:
    try:
        # The row index (model variant name) drives both colour and marker shape.
        variant_names = benchmark_results.index
        sns.scatterplot(
            data=benchmark_results,
            x='flops_g',
            y='accuracy',
            hue=variant_names,
            style=variant_names,
            s=150,
            palette='viridis',
        )
        plt.title('Accuracy vs. Computational Cost Trade-off', fontsize=14, weight='bold')
        plt.xlabel('Computational Cost (GFLOPs)', fontsize=11)
        plt.ylabel('Accuracy', fontsize=11)
        # Legend is anchored outside the axes so it does not cover points.
        plt.legend(title='Model Version', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.grid(True)

        # x-axis starts slightly below zero so points with 0 FLOPs stay visible.
        max_flops = benchmark_results['flops_g'].max()
        plt.xlim(left=-0.001, right=max_flops * 1.1 if max_flops > 0 else 0.001)

        # y-axis hugs the observed accuracy range with a small margin.
        min_acc = benchmark_results['accuracy'].min()
        max_acc = benchmark_results['accuracy'].max()
        plt.ylim(bottom=min_acc - 0.01, top=max_acc + 0.01)

        plt.tight_layout()
        plt.show()
    except Exception as e:
        builtin_print(f"Warning: Could not plot Pareto plot: {e}")