# =============================================================================
# REINFORCEMENT LEARNING AGENT TRAINING NOTEBOOK
# =============================================================================
## Purpose:
    - Define the custom RL Environment (`SustainableAIAgentEnv`) with fail-safe reward mechanisms.
    - Implement a Proximal Policy Optimization (PPO) agent with entropy regularization.
    - Conduct a comparative benchmark against a Random Search strategy.
    - Save the best-performing policy and metrics for final evaluation.
# =============================================================================

# === Clone Repository & Install Dependencies ===

In [1]:
# Use if run on Kaggle
!rm -rf Sustainable_AI_Agent_Project
!git clone https://github.com/trongjhuongwr/Sustainable_AI_Agent_Project.git
%cd Sustainable_AI_Agent_Project

Cloning into 'Sustainable_AI_Agent_Project'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (51/51), done.[K
remote: Total 68 (delta 26), reused 54 (delta 15), pack-reused 0 (from 0)[K
Receiving objects: 100% (68/68), 1.16 MiB | 15.23 MiB/s, done.
Resolving deltas: 100% (26/26), done.
/kaggle/working/Sustainable_AI_Agent_Project


In [2]:
!pip install -q --extra-index-url https://download.pytorch.org/whl/cu121 -r /kaggle/working/Sustainable_AI_Agent_Project/requirements.txt

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m117.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━

In [3]:
# Uninstall torchvision to prevent import conflicts with ptflops/pytorch
!pip uninstall -y torchvision
print("Torchvision uninstalled.")

Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Torchvision uninstalled.


# 1. Import Libraries and Configuration

In [4]:
import os
import warnings
import logging
import json
import copy
import random

# Suppress specific warnings for cleaner output
os.environ["GYM_DISABLE_WARNINGS"] = "true"
warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", module="gymnasium")
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("gymnasium").setLevel(logging.ERROR)
logging.getLogger("stable_baselines3").setLevel(logging.ERROR)

import torch
import torch.nn as nn
import torch.nn.utils.prune as prune
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
import gymnasium as gym
from gymnasium import spaces
from sklearn.metrics import accuracy_score
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import CheckpointCallback
from codecarbon import EmissionsTracker
from ptflops import get_model_complexity_info
import torch_pruning as tp
from tqdm.notebook import tqdm
from builtins import print as builtin_print

print("Libraries imported successfully.")

2025-12-22 13:24:48.063452: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766409888.253697      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766409888.305716      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Libraries imported successfully.


# 2. Configuration Class

In [5]:
# Defines hyperparameters, file paths, and environment parameters used throughout the notebook.
class Config:
    # --- Input/Output Paths ---
    PROCESSED_DATA_PATH = '/kaggle/input/baseline-model-saa/processed_data.pt'  # Input: Path to saved processed data tensors
    BASELINE_MODEL_PATH = '/kaggle/input/baseline-model-saa/baseline_model.pth' # Input: Path to the trained baseline model state dictionary
    AGENT_SAVE_PATH = "/kaggle/working/sustainable_ai_agent_ppo.zip"            # Output: Path to save/load the trained PPO agent
    BEST_ACTION_SAVE_PATH = "/kaggle/working/best_action.json"                  # Output: Path to save information about the best discovered action
    TENSORBOARD_LOG_PATH = "/kaggle/working/ppo_tensorboard/"                   # Output: Directory for TensorBoard logs

    # --- Data Parameters (consistent with baseline training) ---
    SEQUENCE_LENGTH = 30
    INPUT_DIM = 4

    # --- Model Architecture Parameters (must match baseline) ---
    HIDDEN_DIM = 256
    N_LAYERS = 2
    OUTPUT_DIM = 1
    DROPOUT = 0.2

    # --- RL Agent Training Parameters ---
    TOTAL_TIMESTEPS = 30000    # Total number of environment steps for training
    TIMESTEPS_PER_CHUNK = 500  # Save agent state every N steps
    SEED = 42                  # For reproducibility

    # --- RL Environment Parameters (Reward shaping and constraints) ---
    ACCURACY_PENALTY_THRESHOLD = 0.98   # Threshold below baseline accuracy triggering heavy penalty (e.g., 0.95 = 5% drop allowed)
    ACC_REWARD_SCALE = 20.0             # Scaling factor for accuracy-based reward/penalty
    FLOPS_REWARD_SCALE = 2.0            # Scaling factor for inference FLOPs reduction reward
    PARAMS_REWARD_SCALE = 1.0           # Scaling factor for parameter reduction reward (training energy proxy)
    INACTION_PENALTY = -1.0             # Penalty for choosing action 0 (no optimization)
    ENV_ERROR_REWARD = -10.0            # Heavy penalty if an environment step fails (e.g., optimization error)

    # --- Evaluation Parameters (within the environment) ---
    EVAL_BATCH_SIZE = 64      # Batch size used for evaluation within the environment
    CODECARBON_BATCHES = 10   # Number of batches used for CodeCarbon energy measurement during env init

    # --- Computation Device ---
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True 
    torch.backends.cudnn.benchmark = False

seed_everything(Config.SEED)

print(f"Configuration loaded. Using device: {Config.DEVICE}")
print(f"Seed set to: {Config.SEED}")
print(f"Loading processed data from: {Config.PROCESSED_DATA_PATH}")
print(f"Loading baseline model from: {Config.BASELINE_MODEL_PATH}")
print(f"Agent will be saved to: {Config.AGENT_SAVE_PATH}")

Configuration loaded. Using device: cuda
Seed set to: 42
Loading processed data from: /kaggle/input/baseline-model-saa/processed_data.pt
Loading baseline model from: /kaggle/input/baseline-model-saa/baseline_model.pth
Agent will be saved to: /kaggle/working/sustainable_ai_agent_ppo.zip


# 3. Utility Functions and Model Definition

In [6]:
def count_parameters(model):
    """Returns the total number of trainable parameters."""
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

class WeatherGRU(nn.Module):
    """
    GRU Architecture (Must be identical to Baseline for state dict loading).
    """
    def __init__(self, config):
        super(WeatherGRU, self).__init__()
        self.gru = nn.GRU(
            input_size=config.INPUT_DIM,
            hidden_size=config.HIDDEN_DIM,
            num_layers=config.N_LAYERS,
            batch_first=True,
            dropout=config.DROPOUT if config.N_LAYERS > 1 else 0
        )
        self.fc = nn.Linear(config.HIDDEN_DIM, config.OUTPUT_DIM)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out, _ = self.gru(x)
        out = self.fc(out[:, -1, :])
        return self.sigmoid(out)

# 4. Optimization Primitives (Pruning & Quantization)

In [7]:
def count_parameters(model):
    """
    Counts effective (non-zero) parameters. 
    Crucial for rewarding unstructured pruning (sparsity).
    """
    total_params = 0
    for p in model.parameters():
        if p.requires_grad:
            # Only count non-zero weights (Effective Sparsity)
            total_params += torch.count_nonzero(p).item()
    return total_params

def apply_pruning(model, amount):
    """
    Applies Unstructured L1 Pruning to the Linear layer weights.
    This creates sparsity (zeros) without changing tensor shapes, preventing crashes.
    """
    if amount <= 0: return model
    
    model_copy = copy.deepcopy(model)
    for name, module in model_copy.named_modules():
        if isinstance(module, nn.Linear):
            # Prune 'amount' % of weights with lowest L1 magnitude
            prune.l1_unstructured(module, name='weight', amount=amount)
            # Make pruning permanent (remove mask) for cleaner export
            prune.remove(module, 'weight')
    return model_copy

def apply_quantization(model):
    """
    Applies dynamic quantization (INT8) to reduce model size and theoretical latency.
    """
    model_copy = copy.deepcopy(model)
    model_copy.to('cpu')
    model_copy.eval()
    # Quantize Linear and GRU layers
    quantized_model = torch.quantization.quantize_dynamic(
        model_copy, {nn.Linear, nn.GRU}, dtype=torch.qint8
    )
    return quantized_model

# 5. Custom RL Environment with Fail-Safe Mechanism

In [8]:
class SustainableAIAgentEnv(gym.Env):
    def __init__(self, baseline_model, val_loader, config):
        super(SustainableAIAgentEnv, self).__init__()
        self.baseline_model = baseline_model
        self.val_loader = val_loader
        self.config = config
        
        # Calculate Baseline Metrics using the FIXED count_parameters
        # We assume baseline is unpruned and FP32
        self.baseline_metrics = self._evaluate_performance(self.baseline_model)
        # Force baseline FLOPs to a fixed high value if not set, to ensure relative reduction works
        if self.baseline_metrics['flops'] < 1000: self.baseline_metrics['flops'] = 1e6 
            
        print(f"Baseline Metrics: {self.baseline_metrics}")
        
        self.pruning_levels = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
        self.action_space = spaces.Discrete(len(self.pruning_levels) * 2)
        self.observation_space = spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)
        
    def _evaluate_performance(self, model):
        model.eval()

        is_quantized = any("quantized" in str(type(m)).lower() for m in model.modules())
        
        if is_quantized:
            device = torch.device("cpu")
        else:
            device = torch.device(self.config.DEVICE)
            
        model.to(device)
        
        # 1. Accuracy
        y_true, y_pred = [], []
        with torch.no_grad():
            for X, y in self.val_loader:
                X = X.to(device)
                preds = (model(X) > 0.5).float()
                y_true.extend(y.cpu().numpy())
                y_pred.extend(preds.cpu().numpy())
        accuracy = accuracy_score(y_true, y_pred)
        
        # 2. Parameters (Non-zero count)
        params = count_parameters(model)
        
        # 3. Robust FLOPs/Energy Proxy
        baseline_p = self.baseline_metrics['params'] if hasattr(self, 'baseline_metrics') else params
        if baseline_p == 0: baseline_p = 1
        
        param_ratio = params / baseline_p
        
        # Check Quantization
        # PyTorch Dynamic Quantization swaps nn.Linear with nn.quantized.dynamic.Linear
        is_quantized = any("quantized" in str(type(m)).lower() for m in model.modules())
        quant_factor = 0.25 if is_quantized else 1.0
        
        # Baseline FLOPs
        baseline_f = self.baseline_metrics['flops'] if hasattr(self, 'baseline_metrics') else 1e6
        
        flops = baseline_f * param_ratio * quant_factor
        return {"accuracy": accuracy, "params": params, "flops": flops}

    def step(self, action):
        pruning_idx = action % len(self.pruning_levels)
        quantization_idx = action // len(self.pruning_levels)
        
        pruning_rate = self.pruning_levels[pruning_idx]
        use_quantization = bool(quantization_idx)
        
        current_model = copy.deepcopy(self.baseline_model)
        # Apply Fixed Pruning
        current_model = apply_pruning(current_model, pruning_rate)
        # Apply Quantization
        if use_quantization:
            current_model = apply_quantization(current_model)
            
        metrics = self._evaluate_performance(current_model)
        
        # Rewards
        acc_drop = metrics['accuracy'] - self.baseline_metrics['accuracy']
        flops_reduction = 1.0 - (metrics['flops'] / self.baseline_metrics['flops'])
        params_reduction = 1.0 - (metrics['params'] / self.baseline_metrics['params'])
        
        if metrics['accuracy'] < (self.baseline_metrics['accuracy'] * self.config.ACCURACY_PENALTY_THRESHOLD):
            acc_reward = -10.0
        else:
            acc_reward = acc_drop * self.config.ACC_REWARD_SCALE
            
        eff_reward = (flops_reduction * self.config.FLOPS_REWARD_SCALE) + \
                     (params_reduction * self.config.PARAMS_REWARD_SCALE)
        
        total_reward = acc_reward + eff_reward
        
        obs = np.array([metrics['accuracy'], acc_drop, params_reduction, flops_reduction], dtype=np.float32)
        info = {
            "pruning_rate": pruning_rate, 
            "quantization": use_quantization, 
            "accuracy": metrics['accuracy'],
            "reward": total_reward
        }
        
        return obs, total_reward, True, False, info

    def reset(self, seed=None):
        super().reset(seed=seed)
        obs = np.array([self.baseline_metrics['accuracy'], 0.0, 0.0, 0.0], dtype=np.float32)
        return obs, {}

# 6. Main Execution Pipeline

In [9]:
# A. Load Data & Model
try:
    processed_data = torch.load(Config.PROCESSED_DATA_PATH)
    # Use Validation set for RL Agent feedback (Test set is for final eval only)
    val_dataset = TensorDataset(processed_data['X_val'], processed_data['y_val'])
    val_loader = DataLoader(val_dataset, batch_size=1024, shuffle=False) # Large batch for faster step

    baseline_model = WeatherGRU(Config)
    baseline_model.load_state_dict(torch.load(Config.BASELINE_MODEL_PATH))
    print("Baseline artifacts loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: {e}. Please check Config paths.")

# B. Initialize Environment
env = SustainableAIAgentEnv(baseline_model, val_loader, Config)

# C. Train PPO Agent
print("\n--- Starting PPO Agent Training ---")

checkpoint_callback = CheckpointCallback(
    save_freq=5000,
    save_path='/kaggle/working/checkpoints/',
    name_prefix='ppo_agent_sustainable'
)

# ent_coef=0.01 encourages exploration (prevents getting stuck in local optima)
agent = PPO("MlpPolicy", env, verbose=1, seed=Config.SEED, ent_coef=0.01, device=Config.DEVICE)
agent.learn(total_timesteps=Config.TOTAL_TIMESTEPS, callback=checkpoint_callback)
agent.save(Config.AGENT_SAVE_PATH)
print("PPO Training Completed.")

# D. Benchmark: Random Search (Requirement: Comparison with other methods)
print("\n--- Running Random Search Benchmark ---")
best_random_reward = -float('inf')
best_random_action = None

for _ in range(50): # 50 Random Trials
    action = env.action_space.sample()
    _, reward, _, _, info = env.step(action)
    
    if reward > best_random_reward:
        best_random_reward = reward
        best_random_action = info

print(f"Random Search Best Reward: {best_random_reward:.4f}")
print(f"Random Search Strategy: {best_random_action}")

# E. Save Best Agent Action
# Extract best action from PPO (Predict on initial state)
obs, _ = env.reset()
action, _ = agent.predict(obs, deterministic=True)
_, _, _, _, best_agent_info = env.step(action)

print("\n--- Comparison ---")
print(f"Agent Best Reward: {best_agent_info['reward'] if 'reward' in best_agent_info else 'N/A'}")
print(f"Agent Strategy: Pruning={best_agent_info['pruning_rate']}, Quant={best_agent_info['quantization']}")

# Save for evaluation notebook
with open(Config.BEST_ACTION_SAVE_PATH, 'w') as f:
    json.dump({
        "action_code": int(action),
        "pruning_rate": float(best_agent_info['pruning_rate']),
        "quantization": bool(best_agent_info['quantization'])
    }, f)
print(f"Best action saved to {Config.BEST_ACTION_SAVE_PATH}")

Baseline artifacts loaded successfully.
Baseline Metrics: {'accuracy': 0.6608695652173913, 'params': 596225, 'flops': 1000000.0}

--- Starting PPO Agent Training ---
Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | 1.48     |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 63       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 1.99        |
| time/                   |             |
|    fps                  | 29          |
|    iterations           | 2           |
|    time_elapsed         | 139         |
|    total_timesteps      | 4096        |
| train/             