Documentation: https://gymnasium.farama.org/environments/box2d/car_racing/
Example: https://github.com/kvgarimella/dagger

In [None]:
!pip3 install swig
!pip3 install gym[box2d]
!pip3 install -q stable-baselines3[extra]
!pip install torch torchvision gym

In [None]:
import sys
USING_COLAB = 'google.colab' in sys.modules

if USING_COLAB:
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !pip install -U renderlab
    !pip install -U colabgymrender
    !pip install -U moviepy==0.2.3.5
    !pip install imageio==2.4.1
    !pip install --upgrade AutoROM
    !AutoROM --accept-license
    !pip install gymnasium
    !pip install gym[classic_control] > /dev/null 2>&1
    !pip install stable_baselines3

import numpy as np
import gymnasium as gym
import random
import matplotlib.pyplot as plt
from copy import deepcopy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from torch.utils.data import DataLoader
from torch import nn
import torch

from tqdm import tqdm, trange
from renderlab import RenderFrame

seed = 24
data_seed = 700

In [76]:
env_args = {}
directory = './video'
env_args['render_mode'] = 'rgb_array'
env = gym.make("CarRacing-v2", **env_args)
env = RenderFrame(env, directory)

In [74]:
def visualize(env_name="CarRacing-v2", agent=None, video_name="test", env_args={}, rendering=True, max_timesteps=1000):
    def get_action(state):
        if not agent:
            action = env.action_space.sample()
            return action
        else:
            with torch.no_grad():
                action = actor.select_action(state).numpy()
                #action = actor(state_tensor).squeeze().numpy()
            return action

    state = np.asarray(env.reset()[0])

    episode_reward = 0
    step = 0

    while True:
        action = get_action(state)

        next_state, r, done, truncate, info = env.step(action)

        episode_reward += r
        state = next_state
        step += 1

        if rendering:
            env.render()

        if done or step > max_timesteps:
            break

    env.play()

    return episode_reward

  and should_run_async(code)



In [78]:
visualize()


100%|██████████| 1002/1002 [00:09<00:00, 103.50it/s]


-32.98590604026889

We will now try to train the expert in different ways, the first way is using Actor Critic.

In [72]:
class Actor(nn.Module):
    def __init__(self, action_dim):
        super(Actor, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Flatten()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(5184, 128),
            nn.ReLU(),
            nn.Linear(128, action_dim)
        )
        self.tanh = nn.Tanh()

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return self.tanh(x)

    def select_action(self, obs):
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float32)
            obs = obs.permute(2, 0, 1)
        obs = obs.unsqueeze(0)
        with torch.no_grad():
            action = self.forward(obs)
            action = action.squeeze(0)
        return action

In [7]:
class Critic(nn.Module):
    def __init__(self):
        super(Critic, self).__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=5, stride=2),
            nn.ReLU(),
            nn.Flatten()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(5184, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = self.fc_layers(x)
        return x

In [58]:
def update_policy(state, action, reward, next_state, done, actor, critic, actor_optimizer, critic_optimizer, gamma=0.99):
    state = torch.FloatTensor(state).unsqueeze(0).permute(0, 3, 1, 2)
    next_state = torch.FloatTensor(next_state).unsqueeze(0).permute(0, 3, 1, 2)
    action = torch.FloatTensor(action).unsqueeze(0)
    reward = torch.FloatTensor([reward])
    done = torch.FloatTensor([done])

    # Critic update
    value = critic(state)
    next_value = critic(next_state)
    target_value = reward + (1 - done) * gamma * next_value
    critic_loss = (value - target_value.detach()).pow(2).mean()

    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Actor update
    mu = actor(state)
    dist = Normal(mu, torch.tensor([0.1]).expand_as(mu))
    log_prob = dist.log_prob(action).sum(dim=-1, keepdim=True)
    advantage = (target_value - value).detach()
    actor_loss = -(log_prob * advantage).mean()

    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

In [82]:
import time

action_dim = 3

actor = Actor(action_dim)
critic = Critic()
actor_optimizer = optim.Adam(actor.parameters(), lr=1e-4)
critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)

num_episodes = 100

for episode in range(num_episodes):
    start_time = time.time()
    state = np.asarray(env.reset()[0])
    total_reward = 0
    done = False

    while not done and time.time() - start_time < 33:
        with torch.no_grad():
            action = actor.select_action(state).numpy()

        next_state, reward, done, _, _= env.step(action)

        update_policy(state, action, reward, next_state, done, actor, critic, actor_optimizer, critic_optimizer)

        state = next_state
        total_reward += reward


    print(f"Episode: {episode + 1}, Total Reward: {total_reward}")

    episode_reward = visualize(agent=actor)
    print(f"Episode {episode + 1} visualization reward: {episode_reward}")

env.close()


Episode: 1, Total Reward: -34.88120300751901



100%|██████████| 1002/1002 [00:08<00:00, 120.85it/s]


Episode 1 visualization reward: -93.10699300699198
Episode: 2, Total Reward: -33.22105263157915



100%|██████████| 1002/1002 [00:08<00:00, 121.10it/s]


Episode 2 visualization reward: -92.5812030075178
Episode: 3, Total Reward: -28.606493506493642



100%|██████████| 1002/1002 [00:08<00:00, 121.27it/s]


Episode 3 visualization reward: -92.28749999999903
Episode: 4, Total Reward: -28.69933993399353



100%|██████████| 1002/1002 [00:08<00:00, 117.85it/s]


Episode 4 visualization reward: -93.47748344370757
Episode: 5, Total Reward: -27.27799227799239


KeyboardInterrupt: 