# Deep Reinforcement Learning Laboratory

In this laboratory session we will work on getting more advanced versions of Deep Reinforcement Learning algorithms up and running. Deep Reinforcement Learning is **hard**, and getting agents to stably train can be frustrating and requires quite a bit of subtlety in analysis of intermediate results. We will start by refactoring (a bit) my implementation of `REINFORCE` on the [Cartpole environment](https://gymnasium.farama.org/environments/classic_control/cart_pole/).

### Import the packages

In [None]:
# %%capture
# !apt install python-opengl
# !apt install ffmpeg
# !apt install xvfb
# !pip install pyvirtualdisplay
# !pip install pyglet==1.5.1
# !pip install gym_pygame

In [None]:
import numpy as np

from collections import deque

import matplotlib.pyplot as plt
%matplotlib inline

import imageio
from PIL import Image

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Gym
import gymnasium as gym

import wandb
from wandb.integration.sb3 import WandbCallback
import copy
import random
import cv2

from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.vec_env import VecTransposeImage, DummyVecEnv, VecFrameStack, VecNormalize, SubprocVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack

## Exercise 1: Improving my `REINFORCE` Implementation (warm up)

In this exercise we will refactor a bit and improve some aspects of my `REINFORCE` implementation.

### The CartPole environment

In [None]:
# Virtual display: start an off-screen display of 1400x900 before any
# environment is rendered in this notebook.
from pyvirtualdisplay import Display
from IPython.display import Video, display

# visible=0 keeps the display hidden.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

In [None]:
# Gymnasium id of the environment used for the REINFORCE experiments.
env_id = "CartPole-v1"

# Environment instance used for training.
env = gym.make(env_id)

# Second, independent instance of the same environment.
eval_env = gym.make(env_id)

# Length of the observation vector and number of discrete actions; they size
# the input and output layers of the policy network.
s_size = env.observation_space.shape[0]
a_size = env.action_space.n

In [None]:
# Show the observation dimensionality and one randomly sampled observation.
print("_____OBSERVATION SPACE_____ \n")
print("The State Space is: ", s_size)
print("Sample observation", env.observation_space.sample()) # Get a random observation

In [None]:
# Show the number of discrete actions and one randomly sampled action.
print("\n _____ACTION SPACE_____ \n")
print("The Action Space is: ", a_size)
print("Action Space Sample", env.action_space.sample()) # Take a random action

### Reinforce Architecture

> ⚠️ **Disclaimer**  
> L'implementazione riportata di seguito si basa su una combinazione di risorse pubblicamente disponibili e suggerite dal corso **Deep Reinforcement Learning with Hugging Face**. In particolare, il codice e i concetti sono stati ispirati e adattati da:
>
> - [Esempio ufficiale PyTorch REINFORCE](https://github.com/pytorch/examples/blob/main/reinforcement_learning/reinforce.py)  
> - [Implementazione REINFORCE di Udacity](https://github.com/udacity/deep-reinforcement-learning/blob/master/reinforce/REINFORCE.ipynb)  
> - [Pull Request di ottimizzazione di Chris1nexus](https://github.com/huggingface/deep-rl-class/pull/95)
>
> In particolare, l'efficiente calcolo del **reward-to-go** è stato adattato dal lavoro di [Chris1nexus](https://github.com/Chris1nexus), il cui codice e spiegazioni sono ben documentati nella [pull request](https://github.com/huggingface/deep-rl-class/pull/95) al corso Hugging Face.
>
> Queste fonti sono state selezionate tra quelle raccomandate nel modulo dedicato alla policy gradient del corso, e hanno fornito una base solida sia dal punto di vista teorico che implementativo.

<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit6/reinforce.png" alt="Reinforce"/>


In [None]:
class Policy(nn.Module):
    """Two-layer MLP policy that outputs a categorical action distribution.

    :param s_size: Size of the observation vector (input features)
    :param a_size: Number of discrete actions (output features)
    :param h_size: Width of the single hidden layer
    """

    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        """
        Map a batch of states to action probabilities.

        :param x: Tensor of shape (batch, s_size); softmax is taken over dim 1,
            so unbatched input is not supported
        :return: Tensor of shape (batch, a_size) whose rows sum to 1
        """
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.softmax(x, dim=1)

    def act(self, state):
        """
        Sample an action for a single, unbatched state.

        :param state: Observation as a NumPy array (or other array-like)
        :return: Tuple of (action as int, log-probability tensor of shape (1,))
        """
        # Use the device the weights actually live on instead of a module-level
        # `device` global, so the policy works wherever it has been moved.
        param_device = next(self.parameters()).device
        state = torch.as_tensor(state, dtype=torch.float32, device=param_device).unsqueeze(0)
        # Sampling happens on the CPU; the log-prob keeps its autograd graph.
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

### Reinforce Training

In [None]:
class ReinforceTrainer:
    """Train a policy with REINFORCE using per-episode standardized returns.

    One gradient step is taken after every episode. The policy weights are
    saved whenever the moving average over the last 100 episode scores improves.

    :param policy: Policy exposing `act(state) -> (action, log_prob)`
    :param optimizer: Optimizer over the policy parameters
    :param env: Gymnasium environment (5-tuple `step`, 2-tuple `reset`)
    :param gamma: Discount factor
    :param max_t: Maximum number of steps per episode
    :param project_name: If truthy, a Weights & Biases run is started in this
        project and per-episode metrics are logged to it
    """

    def __init__(self, policy, optimizer, env, gamma=0.99, max_t=1000, project_name=None):
        self.policy = policy
        self.optimizer = optimizer
        self.env = env
        self.gamma = gamma
        self.max_t = max_t
        self.project_name = project_name

        self.scores = []                       # total reward of every episode so far
        self.scores_deque = deque(maxlen=100)  # window for the moving average
        self.best_score = -float('inf')

        if self.project_name:
            wandb.init(project=self.project_name)

    def compute_returns(self, rewards):
        """
        Compute the standardized discounted reward-to-go for each time step.

        :param rewards: Sequence of per-step rewards of one episode
        :return: float32 tensor with one entry per reward; zeros when fewer
            than two rewards are given
        """
        # Accumulate back-to-front so each return reuses the next one:
        # G_t = r_t + gamma * G_{t+1}.
        returns = []
        running_return = 0.0
        for reward in reversed(rewards):
            running_return = reward + self.gamma * running_return
            returns.append(running_return)
        returns.reverse()
        returns = torch.tensor(returns, dtype=torch.float32)

        if returns.numel() < 2:
            # The sample std of fewer than two values is undefined (NaN) and
            # would poison the policy weights; the centered return is zero.
            return torch.zeros_like(returns)

        eps = np.finfo(np.float32).eps.item()
        return (returns - returns.mean()) / (returns.std() + eps)

    def reinforce(self, num_episodes=500, print_every=50, save_path="best_reinforce.pt"):
        """
        Run the training loop and plot the reward curve at the end.

        :param num_episodes: Number of episodes to train for
        :param print_every: Print the moving-average score every this many episodes
        :param save_path: File the best policy `state_dict` is written to
        :return: List with the total reward of every episode
        """
        for i_episode in range(1, num_episodes + 1):
            saved_log_probs = []
            rewards = []
            # Gymnasium reset returns a tuple (observation, info), we only need the observation
            state, _ = self.env.reset()

            for _ in range(self.max_t):
                action, log_prob = self.policy.act(state)
                saved_log_probs.append(log_prob)
                # Gymnasium step returns (observation, reward, terminated, truncated, info)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                rewards.append(reward)
                done = terminated or truncated # In Gymnasium, done is terminated or truncated
                state = next_state # Update state for the next iteration
                if done:
                    break

            total_reward = sum(rewards)
            self.scores.append(total_reward)
            self.scores_deque.append(total_reward)

            returns = self.compute_returns(rewards)

            # Minimizing -log_prob * return ascends the policy-gradient objective.
            policy_loss = [-log_prob * R for log_prob, R in zip(saved_log_probs, returns)]
            policy_loss = torch.cat(policy_loss).sum()

            self.optimizer.zero_grad()
            policy_loss.backward()
            self.optimizer.step()

            avg_score = np.mean(self.scores_deque)

            if self.project_name:
                wandb.log({
                    'episode': i_episode,
                    'reward': total_reward,
                    'avg_reward': avg_score
                })

            # Checkpoint on every improvement of the moving average.
            if avg_score > self.best_score:
                self.best_score = avg_score
                torch.save(self.policy.state_dict(), save_path)
                if self.project_name:
                    wandb.run.summary["best_avg_reward"] = avg_score

            if i_episode % print_every == 0:
                print(f"Episode {i_episode}\tAverage Score: {avg_score:.2f}")

        self.plot_rewards(self.scores) # Use self.scores which is accumulated over all episodes

        return self.scores

    def plot_rewards(self, reward_list, window=100):
        """
        Plot per-episode rewards and, when enough data exists, their moving average.

        :param reward_list: Sequence of per-episode total rewards
        :param window: Width of the moving-average window
        """
        plt.figure(figsize=(12, 6))
        plt.plot(reward_list, label='Reward per Episode')

        if len(reward_list) >= window:
            moving_avg = np.convolve(reward_list, np.ones(window)/window, mode='valid')
            # 'valid' mode drops the first window-1 points, so shift the x axis.
            plt.plot(range(window - 1, len(reward_list)), moving_avg, label=f'{window}-Episode Moving Avg', linewidth=2)

        plt.xlabel('Episode')
        plt.ylabel('Reward')
        plt.title('Training Progress')
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """
    Roll out the policy for several episodes and summarize the episode returns.

    :param env: The evaluation environment (Gymnasium-style `reset`/`step`)
    :param max_steps: Upper bound on the number of steps taken in one episode
    :param n_eval_episodes: How many episodes to run
    :param policy: Object with an `act(state)` method returning `(action, extra)`
    :return: Tuple of (mean_reward, std_reward) over the evaluated episodes
    """

    def run_episode():
        # reset() yields (observation, info); only the observation is needed.
        obs, _ = env.reset()
        episode_return = 0
        steps_left = max_steps
        while steps_left > 0:
            steps_left -= 1
            action, _ = policy.act(obs)
            obs, reward, terminated, truncated, _ = env.step(action)
            episode_return += reward
            if terminated or truncated:
                break
        return episode_return

    returns_per_episode = [run_episode() for _ in range(n_eval_episodes)]
    return np.mean(returns_per_episode), np.std(returns_per_episode)


In [None]:
def record_video(env, policy, out_path, fps=30):
    """
    Play one episode with the agent and write the rendered frames to a file.

    The first frame is captured right after the reset, then one frame after
    every step until the episode terminates or is truncated. The saved file is
    also displayed inline.

    :param env: the environment with render_mode='rgb_array'
    :param policy: the agent, must implement policy.act(state)
    :param out_path: full path to output .mp4 or .gif
    :param fps: frames per second
    """
    state, _ = env.reset()
    frames = [env.render()]

    while True:
        action, _ = policy.act(state)
        state, _, terminated, truncated, _ = env.step(action)
        frames.append(env.render())
        if terminated or truncated:
            break

    frame_arrays = [np.array(frame) for frame in frames]
    imageio.mimsave(out_path, frame_arrays, fps=fps)
    print(f"Video saved to {out_path}")
    display(Video(out_path, embed=True))

In [None]:
# Hyperparameters for the CartPole REINFORCE experiments.
# NOTE(review): "gamma" is 1.0 here, but the training cells visible in this
# notebook build their trainers with a literal gamma=0.99 and do not read this
# key — confirm which discount factor is intended.
cartpole_hyperparameters = {
    "h_size": 16,
    "n_training_episodes": 600,
    "n_evaluation_episodes": 10,
    "max_t": 1000,
    "gamma": 1.0,
    "lr": 1e-2,
    "env_id": env_id,
    "state_space": s_size,
    "action_space": a_size,
}

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Re-create the environment with rgb_array rendering so frames can be captured
# from it; this rebinds the module-level `env`.
env = gym.make("CartPole-v1", render_mode="rgb_array")
# NOTE(review): state_dim / action_dim are not used in this cell; the policy is
# sized from cartpole_hyperparameters instead.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
policy = Policy(cartpole_hyperparameters["state_space"], cartpole_hyperparameters["action_space"], cartpole_hyperparameters["h_size"]).to(device)
# NOTE(review): lr and gamma are literals here rather than read from
# cartpole_hyperparameters ("lr" matches at 1e-2, "gamma" there is 1.0).
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

# No project_name is passed, so this run does not log to Weights & Biases.
trainer = ReinforceTrainer(policy, optimizer, env, gamma=0.99, max_t=cartpole_hyperparameters['max_t'])

# Train, printing the moving-average score every 10 episodes.
scores = trainer.reinforce(cartpole_hyperparameters['n_training_episodes'], print_every=10)


In [None]:
# Record one episode of the trained policy and show it inline.
record_video(env, policy, "Cart_Pole.mp4")

-----
## Exercise 2: `REINFORCE` with a Value Baseline (warm up)


In questa variante dell’algoritmo REINFORCE, invece di aggiornare la policy solo in base al ritorno totale \( G_t \), si introduce una **baseline** per ridurre la varianza della stima del gradiente. Una scelta comune è usare una **value function** \( V(s_t) \), appresa tramite regressione sui ritorni osservati.

L’aggiornamento della policy diventa:

\[
\theta \leftarrow \theta + \alpha \, (G_t - V(s_t)) \, \nabla_\theta \log \pi(a_t | s_t)
\]

Dove:
- \( G_t \) è il ritorno cumulato dall’istante \( t \)
- \( V(s_t) \) è il valore stimato dello stato corrente
- \( A_t = G_t - V(s_t) \) è il **vantaggio**: quanto il risultato effettivo è migliore del previsto

In pratica:
- Addestriamo una seconda rete (`value_net`) per approssimare \( V(s) \)
- Usiamo \( A_t \) come pesatura per il gradiente della policy (invece di usare direttamente \( G_t \))
- Questo riduce la varianza dell’aggiornamento, stabilizzando il training

**Nota:** La rete del valore viene ottimizzata separatamente minimizzando l’errore quadratico tra \( V(s_t) \) e \( G_t \).




In [None]:
class ValueNetwork(nn.Module):
    """State-value estimator: one hidden ReLU layer followed by a scalar head.

    :param state_size: Number of input features per state
    :param hidden_size: Width of the hidden layer
    """

    def __init__(self, state_size, hidden_size=128):
        super().__init__()
        layers = [
            nn.Linear(state_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        """Return the network output for `state`; the last dimension has size 1."""
        value = self.net(state)
        return value



In [None]:
class ReinforceWithBaselineTrainer(ReinforceTrainer):
    """REINFORCE trainer that subtracts a learned state-value baseline.

    The policy is updated with the advantage (return minus predicted value),
    while the value network is regressed onto the same returns with a summed
    squared error. The returns come from the inherited `compute_returns`.

    :param policy: Policy exposing `act(state) -> (action, log_prob)`
    :param value_network: Module mapping a batch of states to one value each
    :param policy_optimizer: Optimizer over the policy parameters
    :param value_optimizer: Optimizer over the value-network parameters
    :param env: Gymnasium environment
    :param gamma: Discount factor
    :param max_t: Maximum number of steps per episode
    :param project_name: Optional Weights & Biases project to log to
    """

    def __init__(self, policy, value_network, policy_optimizer, value_optimizer,
                 env, gamma=0.99, max_t=1000, project_name=None):
        super().__init__(policy, policy_optimizer, env, gamma, max_t, project_name)
        self.value_network = value_network
        self.value_optimizer = value_optimizer

    def reinforce(self, num_episodes=500, print_every=50, save_path="best_reinforce_baseline.pt"):
        """
        Run the training loop and plot the reward curve at the end.

        :param num_episodes: Number of episodes to train for
        :param print_every: Print the moving-average score every this many episodes
        :param save_path: File the best policy `state_dict` is written to
        :return: List with the total reward of every episode
        """
        # Follow the value network's own device rather than a module-level global.
        value_device = next(self.value_network.parameters()).device

        for i_episode in range(1, num_episodes + 1):
            saved_log_probs = []
            rewards = []
            states = []

            state, _ = self.env.reset()

            for _ in range(self.max_t):
                action, log_prob = self.policy.act(state)
                saved_log_probs.append(log_prob)
                states.append(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                rewards.append(reward)
                done = terminated or truncated
                state = next_state
                if done:
                    break

            returns = self.compute_returns(rewards)
            total_reward = sum(rewards)
            self.scores.append(total_reward)
            self.scores_deque.append(total_reward)

            # One batched forward pass over the whole episode instead of one
            # call per step.
            state_batch = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=value_device)
            baselines = self.value_network(state_batch).reshape(-1)
            targets = returns.to(device=value_device, dtype=baselines.dtype)

            log_probs = torch.stack(saved_log_probs).reshape(-1)
            # The baseline is a constant for the policy update: detach it so no
            # policy gradient flows into the value network.
            advantages = (targets - baselines).detach().to(log_probs.device)
            policy_loss = -(log_probs * advantages).sum()
            # Sum of squared errors over the episode.
            value_loss = nn.functional.mse_loss(baselines, targets, reduction="sum")

            self.optimizer.zero_grad()
            policy_loss.backward()
            self.optimizer.step()

            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()

            avg_score = np.mean(self.scores_deque)

            if self.project_name:
                wandb.log({
                    'episode': i_episode,
                    'reward': total_reward,
                    'avg_reward': avg_score,
                    'policy_loss': policy_loss.item(),
                    'value_loss': value_loss.item()
                })

            # Checkpoint the policy on every improvement of the moving average.
            if avg_score > self.best_score:
                self.best_score = avg_score
                torch.save(self.policy.state_dict(), save_path)
                if self.project_name:
                    wandb.run.summary["best_avg_reward"] = avg_score

            if i_episode % print_every == 0:
                print(f"Episode {i_episode}\tAverage Score: {avg_score:.2f}")

        self.plot_rewards(self.scores)
        return self.scores


**First Things First**: Recall from the slides on Deep Reinforcement Learning that we can **subtract** any function that doesn't depend on the current action from the q-value without changing the (maximum of our) objective function $J$:  

$$ \nabla J(\boldsymbol{\theta}) \propto \sum_{s} \mu(s) \sum_a \left( q_{\pi}(s, a) - b(s) \right) \nabla \pi(a \mid s, \boldsymbol{\theta}) $$

In `REINFORCE` this means we can subtract from our target $G_t$:

$$ \boldsymbol{\theta}_{t+1} \triangleq \boldsymbol{\theta}_t + \alpha (G_t - b(S_t)) \frac{\nabla \pi(A_t \mid s, \boldsymbol{\theta})}{\pi(A_t \mid s, \boldsymbol{\theta})} $$

Since we are only interested in the **maximum** of our objective, we can also **rescale** our target by any function that also doesn't depend on the action. A **simple baseline** which is even independent of the state -- that is, it is **constant** for each episode -- is to just **standardize rewards within the episode**. So, we **subtract** the average return and **divide** by the standard deviation of returns:

$$ \boldsymbol{\theta}_{t+1} \triangleq \boldsymbol{\theta}_t + \alpha \left(\frac{G_t - \bar{G}}{\sigma_G}\right) \frac{\nabla \pi(A_t \mid s, \boldsymbol{\theta})}{\pi(A_t \mid s, \boldsymbol{\theta})} $$

This baseline is **already** implemented in my implementation of `REINFORCE`. Experiment with and without this standardization baseline and compare the performance. We are going to do something more interesting.

In [None]:
# Seed every source of randomness used by the comparison below.
seed = 123
state, _ = env.reset(seed=seed)
env.action_space.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


state_size = env.observation_space.shape[0]
action_size = env.action_space.n
hidden_size = 128

# Both trainers start from identical policy weights: policy2 is a deep copy.
policy1 = Policy(cartpole_hyperparameters["state_space"], cartpole_hyperparameters["action_space"], cartpole_hyperparameters["h_size"]).to(device)
policy2 = copy.deepcopy(policy1)

policy_optimizer1 = torch.optim.Adam(policy1.parameters(), lr=1e-2)
policy_optimizer2 = torch.optim.Adam(policy2.parameters(), lr=1e-2)

# Value network and its optimizer, used only by the baseline trainer.
value_net = ValueNetwork(state_size, hidden_size).to(device)
value_optimizer = torch.optim.Adam(value_net.parameters(), lr=1e-2)


trainer_no_baseline = ReinforceTrainer(policy1, policy_optimizer1, env, gamma=0.99, max_t=1000)
trainer_with_baseline = ReinforceWithBaselineTrainer(policy2, value_net, policy_optimizer2, value_optimizer,
                                                    env, gamma=0.99, max_t=1000)


# No trailing comma after the call: it would wrap the score list in a 1-tuple
# and make the two results incomparable.
scores_no_baseline = trainer_no_baseline.reinforce(cartpole_hyperparameters['n_training_episodes'], print_every=10)
scores_with_baseline = trainer_with_baseline.reinforce(cartpole_hyperparameters['n_training_episodes'], print_every=10)


**The Real Exercise**: Standard practice is to use the state-value function $v(s)$ as a baseline. This is intuitively appealing -- we are more interested in updating our policy for returns that estimate the current **value** worse. Our new update becomes:

$$ \boldsymbol{\theta}_{t+1} \triangleq \boldsymbol{\theta}_t + \alpha (G_t - \tilde{v}(S_t \mid \mathbf{w})) \frac{\nabla \pi(A_t \mid s, \boldsymbol{\theta})}{\pi(A_t \mid s, \boldsymbol{\theta})} $$

where $\tilde{v}(s \mid \mathbf{w})$ is a **deep neural network** with parameters $w$ that estimates $v_\pi(s)$. What neural network? Typically, we use the **same** network architecture as that of the Policy.

**Your Task**: Modify your implementation to fit a second, baseline network to estimate the value function and use it as **baseline**.

In [None]:
# Your code here.

-----
## Exercise 3: Going Deeper

As usual, pick **AT LEAST ONE** of the following exercises to complete.

### Exercise 3.1: Solving Lunar Lander with `REINFORCE` (easy)

Use my (or even better, improve on my) implementation of `REINFORCE` to solve the [Lunar Lander Environment](https://gymnasium.farama.org/environments/box2d/lunar_lander/). This environment is a little bit harder than Cartpole, but not much. Make sure you perform the same types of analyses we did during the lab session to quantify and qualify the performance of your agents.

### Exercise 3.2: Solving Cartpole and Lunar Lander with `Deep Q-Learning` (harder)

On policy Deep Reinforcement Learning tends to be **very unstable**. Write an implementation (or adapt an existing one) of `Deep Q-Learning` to solve our two environments (Cartpole and Lunar Lander). To do this you will need to implement a **Replay Buffer** and use a second, slow-moving **target Q-Network** to stabilize learning.

### Exercise 3.3: Solving the OpenAI CarRacing environment (hardest)

Use `Deep Q-Learning` -- or even better, an off-the-shelf implementation of **Proximal Policy Optimization (PPO)** -- to train an agent to solve the [OpenAI CarRacing](https://github.com/andywu0913/OpenAI-GYM-CarRacing-DQN) environment. This will be the most *fun*, but also the most *difficult*. Some tips:

1. Make sure you use the `continuous=False` argument to the environment constructor. This ensures that the action space is **discrete** (we haven't seen how to work with continuous action spaces).
2. Your Q-Network will need to be a CNN. A simple one should do, with two convolutional + maxpool layers, followed by two dense layers. You will **definitely** want to use a GPU to train your agents.
3. The observation space of the environment is a single **color image** (a single frame of the game). Most implementations stack multiple frames (e.g. 3) after converting them to grayscale images as an observation.



#### Modifiche all'Ambiente e Preprocessamento delle Osservazioni

In [None]:
class CarRacingPreprocessing(gym.ObservationWrapper):
    """Turn RGB frames into cropped, single-channel float32 frames in [0, 1].

    :param env: Environment to wrap
    :param height: Number of rows kept, counted from the top of the frame
    """

    def __init__(self, env, height=84):
        super().__init__(env)
        self.height = height
        # Declared output: (height, 96, 1) grayscale, float32, values in [0, 1].
        # The width of 96 is fixed here.
        frame_shape = (self.height, 96, 1)
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=frame_shape, dtype=np.float32
        )

    def observation(self, obs):
        """Convert one RGB frame to the declared grayscale format."""
        grayscale = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        # Keep only the top rows; this removes the dashboard below them.
        cropped = grayscale[:self.height, :]
        # Add the trailing channel axis.
        with_channel = cropped[:, :, np.newaxis]
        # Scale pixel values by 1/255.
        return with_channel.astype(np.float32) / 255.0

#### Features Extractor

In [None]:

class CNN(BaseFeaturesExtractor):
    """Convolutional feature extractor: three strided convolutions and a linear head.

    :param observation_space: Observation space of the environment
    :param features_dim: Size of the feature vector returned by `forward`
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super().__init__(observation_space, features_dim)

        # NOTE(review): the first axis is taken as the channel count, which
        # assumes a channel-first observation space — confirm against the
        # environment wrappers.
        in_channels = observation_space.shape[0]

        # Each convolution halves the spatial size (stride 2). For a
        # (C, 84, 96) input the conv arithmetic gives
        # (32, 41, 47) -> (64, 21, 24) -> (64, 11, 12).
        conv_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=5, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # Push one sampled observation through the conv stack to find the
        # flattened size instead of hard-coding it.
        probe = torch.as_tensor(observation_space.sample()[None]).float()
        with torch.no_grad():
            flat_features = self.cnn(probe).shape[1]

        self.linear = nn.Sequential(
            nn.Linear(flat_features, features_dim),
            nn.ReLU(),
        )

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Return a `features_dim`-sized feature vector per observation."""
        conv_out = self.cnn(observations)
        return self.linear(conv_out)

In [4]:
def make_env():
    """Return a zero-argument factory that builds one preprocessed CarRacing env.

    The environment is only created when the returned callable is invoked.
    """
    def _init():
        try:
            base_env = gym.make("CarRacing-v3", render_mode="rgb_array", continuous=False)
            return CarRacingPreprocessing(base_env, height=84)
        except Exception as e:
            # Print the failure, then let it propagate.
            print(f"Error initializing environment: {e}")
            raise
    return _init


# Build 8 environments in subprocesses, stack the last 4 observations of each,
# then normalize observations and rewards (rewards clipped at 10.0).
# NOTE(review): CarRacingPreprocessing emits float32 (H, W, 1) frames, while the
# CNN extractor reads observation_space.shape[0] as the channel count — confirm
# the stacked observation reaching the model is actually channel-first.
n_envs = 8
env = SubprocVecEnv([make_env() for _ in range(n_envs)])
env = VecFrameStack(env, n_stack=4)
env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_reward=10.0)

In [5]:
# Single source of truth for the PPO hyperparameters: the same values are
# logged to W&B and passed to the model, so the tracked config cannot drift
# from what is actually trained.
ppo_config = {
    "policy_type": "CnnPolicy",
    "total_timesteps": 1_000_000,
    "learning_rate": 1e-4,
    "n_steps": 2048,
    "batch_size": 64,
    "n_epochs": 10,
    "gamma": 0.99,
    "gae_lambda": 0.95,
    "clip_range": 0.2,
    "ent_coef": 0.04,
}

# Initialize W&B
wandb.init(
    project="ppo-carracing",
    config=ppo_config,
    sync_tensorboard=True,  # sync TensorBoard logs to W&B
    monitor_gym=True,       # also log the environment when possible
    save_code=True,
)

# Policy kwargs: use the custom CNN as feature extractor
policy_kwargs = dict(
    features_extractor_class=CNN,
    features_extractor_kwargs=dict(features_dim=256),
)

# Initialize the model
model = PPO(
    ppo_config["policy_type"],
    env,
    policy_kwargs=policy_kwargs,
    learning_rate=ppo_config["learning_rate"],
    n_steps=ppo_config["n_steps"],
    batch_size=ppo_config["batch_size"],
    n_epochs=ppo_config["n_epochs"],
    gamma=ppo_config["gamma"],
    gae_lambda=ppo_config["gae_lambda"],
    clip_range=ppo_config["clip_range"],
    ent_coef=ppo_config["ent_coef"],
    verbose=1,
    tensorboard_log="./ppo_tb/",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

[34m[1mwandb[0m: Currently logged in as: [33mvincenzo-civale[0m ([33mvincenzo-civale-universi-degli-studi-di-firenze[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Using cuda device


In [None]:
# W&B callback (can be combined with other callbacks such as EvalCallback)
wandb_callback = WandbCallback(
    gradient_save_freq=100,
    model_save_path="./models/",
    verbose=2,
)

# Start training
print("Starting training...")
model.learn(
    total_timesteps=1_000_000,
    callback=wandb_callback
)

In [None]:
def record_agent_video(model, steps=1000, save_path="carracing_eval.mp4", n_stack=4):
    """
    Run the trained agent for one episode and save the rendered frames as a video.

    The evaluation environment is built with the same observation pipeline as
    training (`make_env` preprocessing, frame stacking, and the model's own
    observation-normalization statistics), because feeding the model raw RGB
    frames does not match the observation space it was trained on. Rewards are
    the raw environment rewards.

    :param model: Trained Stable-Baselines3 model
    :param steps: Maximum number of environment steps to record
    :param save_path: Output video path
    :param n_stack: Number of stacked frames; must match the training setup
    :return: Tuple of (save_path, total_reward)
    """
    eval_env = VecFrameStack(DummyVecEnv([make_env()]), n_stack=n_stack)
    # Reuse the statistics collected during training, if the model was trained
    # behind a VecNormalize wrapper; normalize_obs does not update them.
    obs_normalizer = model.get_vec_normalize_env()

    images = []
    total_reward = 0.0
    try:
        obs = eval_env.reset()
        for _ in range(steps):
            images.append(eval_env.get_images()[0])

            policy_obs = obs if obs_normalizer is None else obs_normalizer.normalize_obs(obs)
            action, _ = model.predict(policy_obs, deterministic=True)

            obs, rewards, dones, _ = eval_env.step(action)
            total_reward += float(rewards[0])
            # The vectorized env resets automatically on done, so stop here to
            # keep the video to a single episode.
            if dones[0]:
                break
    finally:
        eval_env.close()

    imageio.mimsave(save_path, images, fps=30)
    return save_path, total_reward

# Record up to 1000 steps of the trained agent and report the episode reward.
video_path, total_reward = record_agent_video(model, steps=1000)
print("Total reward:", total_reward)

# Log to W&B
# NOTE(review): this run goes to project "carracing-ppo-baseline", not the
# "ppo-carracing" project used for training — confirm that is intended.
wandb.init(project="carracing-ppo-baseline", name="eval-video")
wandb.log({"evaluation_reward": total_reward})
wandb.log({"agent_play": wandb.Video(video_path, fps=30, format="mp4")})
wandb.finish()