# Bipedal Walker (easy)
[Documentation](https://gymnasium.farama.org/environments/box2d/bipedal_walker/)  
This environment is part of the Box2D environments which contains general information about the environment.

[leaderboard](https://github.com/openai/gym/wiki/Leaderboard#bipedalwalker-v2)

## Description
This is a simple 4-joint walker robot environment. There are two versions:

- Normal, with slightly uneven terrain.
- Hardcore, with ladders, stumps, pitfalls.
ls.

To solve the normal version, you need to get 300 points in 1600 time steps. To solve the hardcore version, you need 300 points in 2000 time steps.

In [1]:
# import standard libraries
import os
import base64
import random
import time
import copy
from collections import namedtuple, deque

# import third-party libraries
import gymnasium as gym
import numpy as np
import pandas as pd
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import imageio
import cv2
from IPython.display import clear_output, display, HTML
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
from matplotlib.figure import Figure
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas

## Hardware infos

In [2]:
!nvidia-smi -L

GPU 0: NVIDIA GeForce GTX 1070 (UUID: GPU-19cef1c2-e216-e824-c98e-660394f8a4bb)


In [3]:
print(f"torch.version.cuda=[{torch.version.cuda}]")
print(f"torch.cuda.is_available({torch.cuda.is_available()})")     

torch.version.cuda=[11.7]
torch.cuda.is_available(True)


In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

## Create directories

In [5]:
os.makedirs('img', exist_ok=True)
os.makedirs('saves', exist_ok=True)
print('./img :')
!ls -al img
print('./saves :')
!ls -al saves

./img :
total 29524
drwxr-xr-x 1 thiba 197609        0 Jun 21 21:11 .
drwxr-xr-x 1 thiba 197609        0 Jun 22 14:35 ..
drwxr-xr-x 1 thiba 197609        0 Jun 21 21:11 .ipynb_checkpoints
-rw-r--r-- 1 thiba 197609 12930388 May  6 09:05 lunar_lander.gif
-rw-r--r-- 1 thiba 197609 17291191 Jun 21 21:09 v3_bipedal_walker_easy.gif
./saves :
total 6780
drwxr-xr-x 1 thiba 197609      0 Jun 22 11:43 .
drwxr-xr-x 1 thiba 197609      0 Jun 22 14:35 ..
drwxr-xr-x 1 thiba 197609      0 Jun 21 21:11 .ipynb_checkpoints
-rw-r--r-- 1 thiba 197609 457239 Jun 22 14:33 bipedal_walker_easy_actor_model.pth
-rw-r--r-- 1 thiba 197609 882535 Jun 22 14:33 bipedal_walker_easy_critic0_model.pth
-rw-r--r-- 1 thiba 197609 882535 Jun 22 14:33 bipedal_walker_easy_critic1_model.pth
-rw-r--r-- 1 thiba 197609 882535 Jun 22 14:33 bipedal_walker_easy_critic2_model.pth
-rw-r--r-- 1 thiba 197609   1038 Jun 22 14:33 bipedal_walker_easy_normalizer.pkl
-rw-r--r-- 1 thiba 197609 846941 Jun 21 20:38 v3_bipedal_walker_easy_actor

## Constants
### Hyperparameters

In [6]:
BUFFER_SIZE = 1_000_000
BATCH_SIZE = 128
GAMMA = 0.98
TAU = 0.02
LR_ACTOR = 0.0001
LR_CRITIC = 0.0003
WEIGHT_DECAY = 0.00
POLICY_NOISE = 0.2
NOISE_CLIP = 0.5
POLICY_FREQ = 2
START_LEARNING = 10000 # steps in the beginning to let the agent explore
N_TOTAL_EPISODES = 1500
N_CRITICS = 3

### Paths

In [7]:
PATH = "./saves/bipedal_walker_easy"
PATH_IMG = "./img/bipedal_walker_easy"

## Utils

In [8]:
def print_infos(ep_len_mean, ep_rew_mean, ep_len, ep_rew, min_rew, ep_min_rew, episodes, fps, time_elapsed, total_timesteps, actor_loss, critic_loss_arr, learning_rate, n_updates):
    print(f"- rollout/")
    print(f"    - ep_len_mean     : {ep_len_mean}")
    print(f"    - ep_rew_mean     : {ep_rew_mean}")
    print(f"    - ep_len          : {ep_len}")
    print(f"    - ep_rew          : {ep_rew}")
    print(f"    - max_rew         : {min_rew}")
    print(f"    - ind_max_rew     : {ep_min_rew}")
    print(f"")
    print(f"- time/")
    print(f"    - episodes        : {episodes}")
    print(f"    - fps             : {fps}")
    print(f"    - time_elapsed    : {time_elapsed}")
    print(f"    - total_timesteps : {total_timesteps}")
    print(f"")
    print(f"- train/")
    print(f"    - actor_loss      : {actor_loss}")
    for i, c in enumerate(critic_loss_arr):
        print(f"    - critic_loss{i+1}    : {c}")
    print(f"    - learning_rate   : {learning_rate}")
    print(f"    - n_updates       : {n_updates}")

In [9]:
def save_gif(img_list, path):
    # Convert the list of frames to a numpy array
    resized_img_array = []
    for img in img_list:
        img_pil = Image.fromarray(img)
        # Make sure width and height are divisible by 16
        img_resized_pil = img_pil.resize((608, 400))
        img_resized = np.array(img_resized_pil)
        resized_img_array.append(img_resized)
    
    # Create gif video
    fps = 20
    imageio.mimsave(path, resized_img_array, 'GIF', duration=int(1000 * 1/fps), loop=0)

In [10]:
def plot_history(reward_history, rolling_window=20, x_label = 'Episode', y_label = 'Total Points', lower_limit=None, upper_limit=None, plot_rw=True, plot_rm=True):
    """
    Function to plot reward history and its rolling mean with some optional arguments.

    Args:
        reward_history (list): A list of rewards for each episode.
        rolling_window (int): The number of episodes for computing the rolling mean.
        lower_limit (int): Starting episode index for plotting.
        upper_limit (int): Ending episode index for plotting.
        plot_rw (bool): A flag for plotting raw reward history.
        plot_rm (bool): A flag for plotting rolling mean reward history.

    Returns:
        None
    """
    
    # If lower_limit and upper_limit are not provided, use the whole reward_history
    if lower_limit is None or upper_limit is None:
        rh = reward_history
        xs = [x for x in range(len(reward_history))]
    else:
        rh = reward_history[lower_limit:upper_limit]
        xs = [x for x in range(lower_limit,upper_limit)]
   
    # Create a DataFrame and calculate the rolling mean
    df = pd.DataFrame(rh)
    rollingMean = df.rolling(rolling_window).mean()

    # Plot the results
    plt.figure(figsize=(10,7), facecolor='white')
    
    if plot_rw:
        plt.plot(xs, rh, linewidth=1, color='cyan')
    if plot_rm:
        plt.plot(xs, rollingMean, linewidth=2, color='magenta')

    text_color = 'black'
        
    ax = plt.gca()
    ax.set_facecolor('black')
    plt.grid()

    plt.xlabel(x_label, color=text_color, fontsize=30)
    plt.ylabel(y_label, color=text_color, fontsize=30)
    yNumFmt = mticker.StrMethodFormatter('{x:,}')
    ax.yaxis.set_major_formatter(yNumFmt)
    ax.tick_params(axis='x', colors=text_color)
    ax.tick_params(axis='y', colors=text_color)
    plt.show()

In [11]:
def print_hyperparameters():
    print("----------------------------------")
    print(f"| {BUFFER_SIZE=} \t\t |")
    print(f"| {BATCH_SIZE=} \t\t |")
    print(f"| {GAMMA=} \t\t\t |")
    print(f"| {TAU=} \t\t\t |")
    print(f"| {LR_ACTOR=} \t\t |")
    print(f"| {LR_CRITIC=} \t\t |")
    print(f"| {WEIGHT_DECAY=} \t\t |")
    print(f"| {POLICY_NOISE=} \t\t |")
    print(f"| {NOISE_CLIP=} \t\t |")
    print(f"| {POLICY_FREQ=} \t\t |")
    print(f"| {START_LEARNING=} \t\t |")
    print(f"| {N_TOTAL_EPISODES=} \t |")
    print(f"| {N_CRITICS=} \t\t\t |")
    print("----------------------------------")

## Experience Replay

In [12]:
Experience = namedtuple('Experience', field_names=['state', 'action', 'reward', 'next_state', 'done'])

In [13]:
class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        e = Experience(state, action, reward, next_state, done)
        self.memory.append(e)

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)

## Normalizers

### States normalizer

In [14]:
class Normalizer():
    def __init__(self, size):
        self.n = np.zeros(size)
        self.mean = np.zeros(size)
        self.mean_diff = np.zeros(size)
        self.var = np.zeros(size)

    def observe(self, x):
        self.n += 1.0
        last_mean = self.mean.copy()
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min = 1e-2) 

    def normalize(self, inputs):
        obs_mean = self.mean
        obs_std = np.sqrt(self.var)
        return (inputs - obs_mean) / obs_std

    def observe2d(self, x):
        x = np.atleast_2d(x)  # Make sure x is at least 2-D
        batch_size, _ = x.shape
        self.n += batch_size
        for instance in x:
            last_mean = self.mean.copy()
            self.mean += (instance - self.mean) / self.n
            self.mean_diff += (instance - last_mean) * (instance - self.mean)
        self.var = (self.mean_diff / self.n).clip(min=1e-2)

    def save(self):
        with open(f"{PATH}_normalizer.pkl", 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self):
        with open(f"{PATH}_normalizer.pkl", 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)

### Reward normalizer

In [15]:
class RewardNormalizer():
    def __init__(self):
        self.n = 0
        self.mean = 0
        self.mean_diff = 0
        self.var = 1e-2  # Start with small variance to avoid division by zero

    def observe(self, x):
        self.n += 1
        last_mean = self.mean
        self.mean += (x - self.mean) / self.n
        self.mean_diff += (x - last_mean) * (x - self.mean)
        self.var = (self.mean_diff / self.n).clip(min = 1e-2) 

    def normalize(self, reward):
        reward_std = np.sqrt(self.var)
        return (reward - self.mean) / reward_std

    def observe2d(self, x):
        x = np.atleast_2d(x)  # Make sure x is at least 2-D
        batch_size, _ = x.shape
        self.n += batch_size
        for instance in x:
            last_mean = self.mean.copy()
            self.mean += (instance - self.mean) / self.n
            self.mean_diff += (instance - last_mean) * (instance - self.mean)
        self.var = (self.mean_diff / self.n).clip(min=1e-2)

    def save(self):
        with open(f"{PATH}_reward_normalizer.pkl", 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self):
        with open(f"{PATH}_reward_normalizer.pkl", 'rb') as f:
            tmp_dict = pickle.load(f)
        self.__dict__.update(tmp_dict)

## Networks

### Actor Network

In [16]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size, seed, fc1_units=400, fc2_units=256):
        super(Actor, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

### Critic Network

In [17]:
class Critic(nn.Module):
    def __init__(self, state_size, action_size, seed, fcs1_units=512, fc2_units=400):
        super(Critic, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, fcs1_units)
        self.fc2 = nn.Linear(fcs1_units+action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, state, action):
        xs = F.relu(self.fcs1(state))
        x = torch.cat((xs, action), dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)

## DDPG Agent

In [18]:
class Agent:
    def __init__(self, state_size, action_size, reward_normalizer, n_critics = 3, random_seed = 0):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Actor Network 
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        self.actor_loss_history = []

        # Critic Network (w/ Target Network)
        self.n_critics = n_critics
        self.critic_local = [Critic(state_size, action_size, random_seed).to(device) for _ in range(self.n_critics)]
        self.critic_target = [Critic(state_size, action_size, random_seed).to(device) for _ in range(self.n_critics)]
        self.critic_optimizer = [optim.Adam(self.critic_local[idx].parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) for idx in range(self.n_critics)]
        self.critic_loss_history = [[] for _ in range(self.n_critics)]

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

        # Reward Normalizer
        self.reward_normalizer = reward_normalizer
        
        self.n_updates = 0

    def step(self, state, action, reward, next_state, done, timestep):
        self.memory.add(state, action, reward, next_state, done)

        if len(self.memory) > BATCH_SIZE and timestep % POLICY_FREQ == 0:
            experiences = self.memory.sample()
            states, actions, rewards, next_states, dones = experiences
            # switch to numpy
            states = states.cpu().numpy()
            next_states = next_states.cpu().numpy()
            rewards = rewards.cpu().numpy()
            # Normalize states
            normalizer.observe2d(states)
            states = normalizer.normalize(states)
            # Normalize next_states
            normalizer.observe2d(next_states)
            next_states = normalizer.normalize(next_states)
            # Normalize rewards
            self.reward_normalizer.observe(rewards)
            rewards = self.reward_normalizer.normalize(rewards)
            # switch to tensor
            states = torch.from_numpy(states).float().to(device)
            next_states = torch.from_numpy(next_states).float().to(device)
            rewards = torch.from_numpy(rewards).float().to(device)
            self.learn((states, actions, rewards, next_states, dones), GAMMA)

    def act(self, state, add_noise=True):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            noise = np.random.normal(0, POLICY_NOISE, size=self.action_size)
            noise = np.clip(noise, -NOISE_CLIP, NOISE_CLIP)
            action += noise
        return np.clip(action, -1, 1)

    def save_model(self):
        torch.save(self.actor_local.state_dict(), f"{PATH}_actor_model.pth")
        for idx, critic_local in enumerate(self.critic_local):
            torch.save(critic_local.state_dict(), f"{PATH}_critic{idx}_model.pth")

    def load_model(self):
        self.actor_local.load_state_dict(torch.load(f"{PATH}_actor_model.pth"))
        for idx in range(self.n_critics):
            self.critic_local[idx].load_state_dict(torch.load(f"{PATH}_critic{idx}_model.pth"))
        
    def get_lr(self):
        for param_group in self.actor_optimizer.param_groups:
            return param_group['lr']

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        noise = torch.normal(torch.zeros(actions.size()), POLICY_NOISE).to(device)
        noise = torch.clamp(noise, -NOISE_CLIP, NOISE_CLIP)
        actions_next = self.actor_target(next_states) + noise
        actions_next = torch.clamp(actions_next, -1, 1)

        Q_targets_next_arr = [self.critic_target[idx](next_states, actions_next) for idx in range(self.n_critics)]

        Q_targets_next = Q_targets_next_arr[0]
        for idx in range(1, self.n_critics):
            Q_targets_next = torch.min(Q_targets_next, Q_targets_next_arr[idx])
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Compute all critic loss
        for idx in range(self.n_critics):
            Q_expected = self.critic_local[idx](states, actions)
            critic_loss = F.mse_loss(Q_expected, Q_targets.detach()) # detach Q_targets here to avoid the "the graph are freed" error
            # Save the critic loss
            self.critic_loss_history[idx].append(critic_loss.item())
            # Update critic loss
            self.critic_optimizer[idx].zero_grad()
            critic_loss.backward()
            self.critic_optimizer[idx].step()

        # ---------------------------- update actor ---------------------------- #
        actions_pred = self.actor_local(states)
        # Compute the average of the critics
        critic_values = [self.critic_local[idx](states, actions_pred) for idx in range(self.n_critics)]
        critic_value = sum(critic_values) / float(self.n_critics)
        actor_loss = -critic_value.mean()
        # Save the loss
        self.actor_loss_history.append(actor_loss.item())
        
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        for idx in range(self.n_critics):
            self.soft_update(self.critic_local[idx], self.critic_target[idx], TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)
        
        self.n_updates += 1

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

## Train

In [19]:
def ddpg(env, agent, normalizer, n_episodes=1000, max_t=1000):
    scores_deque = deque(maxlen=100)
    timesteps_deque = deque(maxlen=100)
    scores = []
    start_time_elapsed = time.time()
    total_timesteps = 0
    min_rew = -1500 
    ep_min_rew = 0
    for i_episode in range(1, n_episodes+1):
        # function to override printlines from previous loop iteration 
        clear_output(wait=True)
        state, _ = env.reset()
        # State normalization
        normalizer.observe(state)
        normalized_state = normalizer.normalize(state)
        score = 0
        timestep = 0
        done = False
        while not done:
            if total_timesteps >= START_LEARNING:
                action = agent.act(normalized_state).squeeze(0)
            else:
                action = env.action_space.sample()  # Sample random action
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = torch.from_numpy(next_state.T).float().squeeze(0).numpy()
            
            # State normalization
            normalizer.observe(next_state)
            normalized_next_state = normalizer.normalize(next_state)
        
            done = terminated or truncated
            agent.step(state, action, reward, next_state, done, total_timesteps)
            state = next_state
            normalized_state = normalized_next_state
            score += reward
            timestep += 1
            total_timesteps += 1
        scores_deque.append(score)
        timesteps_deque.append(timestep)
        scores.append(score)

        time_elapsed = time.time() - start_time_elapsed
        if min_rew <= score:
            min_rew = score
            ep_min_rew = i_episode
            agent.save_model()
            normalizer.save()  
        print_infos(
            int(np.mean(timesteps_deque)),
            np.mean(scores_deque),
            timestep,
            score,
            min_rew,
            ep_min_rew, 
            i_episode,
            int(total_timesteps / time_elapsed),
            int(time_elapsed),
            total_timesteps,
            np.mean(agent.actor_loss_history),
            [np.mean(critic_loss_history) for critic_loss_history in agent.critic_loss_history],
            agent.get_lr(),
            agent.n_updates
        )

        if np.mean(scores_deque) >= 300:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
            break
    return scores

### create environment

In [20]:
print_hyperparameters()

----------------------------------
| BUFFER_SIZE=1000000 		 |
| BATCH_SIZE=128 		 |
| GAMMA=0.98 			 |
| TAU=0.02 			 |
| LR_ACTOR=0.0001 		 |
| LR_CRITIC=0.0003 		 |
| WEIGHT_DECAY=0.0 		 |
| POLICY_NOISE=0.2 		 |
| NOISE_CLIP=0.5 		 |
| POLICY_FREQ=2 		 |
| START_LEARNING=10000 		 |
| N_TOTAL_EPISODES=1500 	 |
| N_CRITICS=3 			 |
----------------------------------


In [None]:
env = gym.make("BipedalWalker-v3", hardcore=False)
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
normalizer = Normalizer(state_size)
reward_normalizer = RewardNormalizer()
agent = Agent(state_size, action_size, reward_normalizer, N_CRITICS, random_seed=0)
scores = ddpg(env, agent, normalizer, N_TOTAL_EPISODES)
env.close()

- rollout/
    - ep_len_mean     : 892
    - ep_rew_mean     : -113.48731746943828
    - ep_len          : 357
    - ep_rew          : -126.18929845913571
    - max_rew         : -28.98261458703377
    - ind_max_rew     : 172

- time/
    - episodes        : 186
    - fps             : 101
    - time_elapsed    : 1195
    - total_timesteps : 120957

- train/
    - actor_loss      : -5.843022902518804
    - critic_loss1    : 0.112986796827377
    - critic_loss2    : 0.112986796827377
    - critic_loss3    : 0.112986796827377
    - learning_rate   : 0.0001
    - n_updates       : 60415


### Plot rewards

In [None]:
# Plot the point history
plot_history(scores)

In [None]:
# Plot the point Actor loss history
# Reshape the data to have N_TOTAL_EPISODES number of chunks
actor_loss_history = np.array(agent.actor_loss_history)[:190000].reshape(N_TOTAL_EPISODES, -1)
# Compute the mean along axis 1 (which represents each chunk of 200 points)
mean_actor_loss_history = actor_loss_history.mean(axis=1)
plot_history(mean_actor_loss_history, y_label="Actor Loss")

In [None]:
for idx in range(agent.n_critics):
    critic_loss_history = np.array(agent.critic_loss_history[idx])[:190000].reshape(N_TOTAL_EPISODES, -1)
    # Compute the mean along axis 1 (which represents each chunk of 200 points)
    mean_critic_loss_history = critic_loss_history.mean(axis=1)
    plot_history(mean_critic_loss_history, y_label=f"Critic Loss {idx}")

## Visualizing
### create environment

In [None]:
env = gym.make("BipedalWalker-v3", hardcore=False, render_mode="rgb_array")
state_size = env.observation_space.shape[0]
action_size = env.action_space.shape[0]
new_normalizer = Normalizer(state_size)
new_normalizer.load()  # Load the saved statistics
agent2 = Agent(state_size, action_size, random_seed=0)
agent2.load_model()

In [None]:
screen_list = []
scores = []
best_score = -1500
for i in range(10):
    state, _ = env.reset()
    #Select an action
    normalized_state = normalizer.normalize(state)
    done = False
    timestep = 0
    episode_reward = 0
    e_screen_list = []
    while not done:
        action = agent2.act(normalized_state, False).squeeze(0)
    
        # Printing env render (rgb_array)
        screen = env.render()
        # Add title to the screen
        screen = cv2.putText(
            np.array(screen),
            f"Iteration=[{i}] Timestep=[{timestep +1}]",
            (25, 25),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            (0, 0, 0),
            1,
            cv2.LINE_AA
        )
        e_screen_list.append(screen)
        
        next_state, reward, terminated, truncated, _ = env.step(action)
        next_state = torch.from_numpy(next_state.T).float().squeeze(0).numpy()
        done = terminated or truncated
    
        #Select an action
        normalized_next_state = normalizer.normalize(next_state)
        state = next_state
        normalized_state = normalized_next_state
        episode_reward += reward
    
        timestep += 1

    if best_score <= episode_reward:
        best_score = episode_reward
        screen_list.append(e_screen_list)
    
    scores.append(episode_reward)

### Scores of evaluation

In [None]:
print(f"Best score: {max(scores)}")
print(f"AVG score: {np.mean(scores)}")

### Selection of the 5 best iterations

In [None]:
n = 5

In [None]:
best_screen_list = screen_list[-n:]
print(f"Number of iter : {len(best_screen_list)}")

In [None]:
selected_screens_list = np.empty((0, 400, 600, 3))
for screen_ep in best_screen_list:
    selected_screens_list = np.concatenate((selected_screens_list, np.array(screen_ep)), axis=0)

selected_screens_list = selected_screens_list.astype(np.uint8)
print(selected_screens_list.shape)

### Save gif

In [None]:
path = f"{PATH_IMG}.gif"
save_gif(list(selected_screens_list), path)

### Embed the video

In [None]:
video = open(path, 'rb').read()
b64_video = base64.b64encode(video)
video_tag = '<img src="data:image/gif;base64,{0}">'.format(b64_video.decode())

display(HTML(video_tag))

In [None]:
# Plot the point scores evaluation
plot_history(scores)