<a href="https://colab.research.google.com/github/syntaxDuck/Udemy-Courses/blob/main/A3C_for_Kung_Fu_Partial_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A3C for Kung Fu

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [1]:
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!pip install ale-py
!apt-get install -y swig
!pip install gymnasium[box2d]

zsh:1: command not found: apt-get
zsh:1: no matches found: gymnasium[box2d]


### Importing the libraries

In [2]:
import cv2
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp
import torch.distributions as distributions
from torch.distributions import Categorical
import ale_py
import gymnasium as gym
from gymnasium.spaces import Box
from gymnasium import ObservationWrapper

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [3]:
class Network(nn.Module):
  def __init__(self, action_size):
    super(Network, self).__init__()
    self.conv1 = torch.nn.Conv2d(in_channels=4, out_channels=32, kernel_size=(3,3), stride=2)
    self.conv2 = torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2)
    self.conv3 = torch.nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3,3), stride=2)
    self.flatten = torch.nn.Flatten()
    self.fc1 = torch.nn.Linear(in_features=512, out_features=128)
    self.fc2a = torch.nn.Linear(in_features=128, out_features=action_size)
    self.fc2s = torch.nn.Linear(in_features=128, out_features=1)

  def forward(self, state):
    x = self.conv1(state)
    x = F.relu(x)
    x = self.conv2(x)
    x = F.relu(x)
    x = self.conv3(x)
    x = F.relu(x)
    x = self.flatten(x)
    x = self.fc1(x)
    x = F.relu(x)
    x_a = self.fc2a(x)
    x_s = self.fc2s(x)[0]
    return x_a, x_s


## Part 2 - Training the AI

### Setting up the environment

In [4]:
class PreprocessAtari(ObservationWrapper):

  def __init__(self, env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4):
    super(PreprocessAtari, self).__init__(env)
    self.img_size = (height, width)
    self.crop = crop
    self.dim_order = dim_order
    self.color = color
    self.frame_stack = n_frames
    n_channels = 3 * n_frames if color else n_frames
    obs_shape = {'tensorflow': (height, width, n_channels), 'pytorch': (n_channels, height, width)}[dim_order]
    self.observation_space = Box(0.0, 1.0, obs_shape)
    self.frames = np.zeros(obs_shape, dtype = np.float32)

  def reset(self):
    self.frames = np.zeros_like(self.frames)
    obs, info = self.env.reset()
    self.update_buffer(obs)
    return self.frames, info

  def observation(self, img):
    img = self.crop(img)
    img = cv2.resize(img, self.img_size)
    if not self.color:
      if len(img.shape) == 3 and img.shape[2] == 3:
        img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img.astype('float32') / 255.
    if self.color:
      self.frames = np.roll(self.frames, shift = -3, axis = 0)
    else:
      self.frames = np.roll(self.frames, shift = -1, axis = 0)
    if self.color:
      self.frames[-3:] = img
    else:
      self.frames[-1] = img
    return self.frames

  def update_buffer(self, obs):
    self.frames = self.observation(obs)

def make_env():
  env = gym.make("KungFuMasterDeterministic-v0", render_mode = 'rgb_array')
  env = PreprocessAtari(env, height = 42, width = 42, crop = lambda img: img, dim_order = 'pytorch', color = False, n_frames = 4)
  return env

env = make_env()

state_shape = env.observation_space.shape
number_actions = env.action_space.n
print("State shape:", state_shape)
print("Number actions:", number_actions)
print("Action names:", env.env.env.env.get_action_meanings())

State shape: (4, 42, 42)
Number actions: 14
Action names: ['NOOP', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'DOWNRIGHT', 'DOWNLEFT', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE']


  logger.deprecation(
A.L.E: Arcade Learning Environment (version 0.10.1+6a7e0ae)
[Powered by Stella]


### Initializing the hyperparameters

In [5]:
learning_rate = 1e-4
discount_factor = 0.99
number_environments = 10

### Implementing the A3C class

In [7]:
class Agent():
  def __init__(self, action_size):
    self.device = torch.device("mps" if torch.cuda.is_available() else "cpu")
    self.action_size = action_size
    self.network = Network(action_size).to(self.device)
    self.optimizer = optim.Adam(self.network.parameters(), lr = learning_rate)

  def act(self, state):
    if state.ndim == 3:
      state = [state]

    state = torch.tensor(np.array(state), dtype = torch.float32).to(self.device)
    action_values, _ = self.network(state)
    policy = F.softmax(action_values, dim = -1)
    return np.array([np.random.choice(len(p), p = p) for p in policy.detach().cpu().numpy()])

  def step(self, state, action, reward, next_state, done):
    batch_size = state.shape[0]
    state = torch.tensor(state, dtype = torch.float32, device = self.device)
    next_state = torch.tensor(next_state, dtype = torch.float32, device = self.device)
    reward = torch.tensor(reward, dtype = torch.float32, device = self.device)
    done = torch.tensor(done, dtype = torch.bool, device = self.device).to(dtype=torch.float32)
    action_value, state_value = self.network(state)
    _, next_state_value = self.network(next_state)
    target_state_value = reward + discount_factor * next_state_value * (1 - done)
    advantage = target_state_value - state_value
    probs = F.softmax(action_value, dim = -1)
    logprobs = F.log_softmax(action_value, dim = -1)
    entropy = -torch.sum(probs * logprobs, axis = -1)
    batch_idx = np.arange(batch_size)
    logp_actions = logprobs[batch_idx, action]
    actor_loss = -(logp_actions * advantage.detach()).mean() - 0.001 * entropy.mean()
    critic_loss = F.mse_loss(target_state_value.detach(), state_value)
    total_loss = actor_loss + critic_loss
    self.optimizer.zero_grad()
    total_loss.backward()
    self.optimizer.step()

### Initializing the A3C agent

In [9]:
agent = Agent(number_actions)

### Evaluating our A3C agent on a single episode

In [21]:
def evaluate(agent, env, n_episodes = 1):
  episodes_rewards = []
  for _ in range(n_episodes):
    state, _ = env.reset()
    total_reward = 0
    while True:
      action = agent.act(state)
      state, reward, done, info, _ = env.step(action[0])
      total_reward += reward
      if done:
        break
    episodes_rewards.append(total_reward)
  return episodes_rewards

### Testing multiple agents on multiple environments at the same time

In [12]:
class EnvBatch():
  def __init__(self, n_envs=10):
    self.envs = [make_env() for _ in range(n_envs)]

  def reset(self):
    _states = []
    for env in self.envs:
      _states.append(env.reset()[0])
    return np.array(_states)

  def step(self, actions):
    next_states, rewards, dones, infos, _ = map(np.array, zip(*[env.step(a) for env, a in zip(self.envs, actions)]))
    for i in range(len(self.envs)):
      if dones[i]:
        next_states[i] = self.envs[i].reset()[0]
    return next_states, rewards, dones, infos


### Training the A3C agent

In [24]:
!pip install tqdm
import tqdm

env_batch = EnvBatch(number_environments)
batch_states = env_batch.reset()

with tqdm.trange(0,300001) as progress_bar:
  for i in progress_bar:
    batch_actions = agent.act(batch_states)
    batch_next_states, batch_rewards, batch_dones, _ = env_batch.step(batch_actions)
    batch_rewards *= 0.01
    agent.step(batch_states, batch_actions, batch_rewards, batch_next_states, batch_dones)
    batch_states = batch_next_states
    if i % 1000 == 0:
      print("Average agent reward: ", np.mean(evaluate(agent, env, n_episodes=10)))





  logger.deprecation(
  critic_loss = F.mse_loss(target_state_value.detach(), state_value)
  0%|                                                  | 14/300001 [00:06<29:33:39,  2.82it/s]

Average agent reward:  740.0


  0%|▏                                                | 1024/300001 [00:20<8:09:09, 10.19it/s]

Average agent reward:  440.0


  1%|▎                                                | 2014/300001 [00:35<9:33:35,  8.66it/s]

Average agent reward:  420.0


  1%|▍                                                | 3013/300001 [00:49<7:55:33, 10.41it/s]

Average agent reward:  270.0


  1%|▋                                                | 4022/300001 [01:03<8:32:00,  9.63it/s]

Average agent reward:  830.0


  2%|▊                                                | 5017/300001 [01:16<7:14:54, 11.30it/s]

Average agent reward:  180.0


  2%|▉                                                | 6017/300001 [01:28<6:10:41, 13.22it/s]

Average agent reward:  260.0


  2%|█▏                                               | 7025/300001 [01:41<6:31:16, 12.48it/s]

Average agent reward:  400.0


  3%|█▎                                               | 8018/300001 [01:52<6:00:18, 13.51it/s]

Average agent reward:  280.0


  3%|█▍                                               | 9021/300001 [02:05<6:58:12, 11.60it/s]

Average agent reward:  420.0


  3%|█▌                                              | 10016/300001 [02:19<8:43:14,  9.24it/s]

Average agent reward:  850.0


  4%|█▊                                              | 11026/300001 [02:33<8:02:51,  9.97it/s]

Average agent reward:  660.0


  4%|█▉                                              | 12015/300001 [02:45<5:39:37, 14.13it/s]

Average agent reward:  270.0


  4%|██                                             | 13021/300001 [03:01<10:35:17,  7.53it/s]

Average agent reward:  1090.0


  5%|██▏                                             | 14021/300001 [03:15<7:36:15, 10.45it/s]

Average agent reward:  440.0


  5%|██▍                                             | 15028/300001 [03:28<6:50:45, 11.56it/s]

Average agent reward:  540.0


  5%|██▌                                             | 16020/300001 [03:40<5:56:02, 13.29it/s]

Average agent reward:  460.0


  6%|██▋                                             | 17014/300001 [03:54<8:34:18,  9.17it/s]

Average agent reward:  440.0


  6%|██▉                                             | 18025/300001 [04:08<6:48:56, 11.49it/s]

Average agent reward:  550.0


  6%|███                                             | 19021/300001 [04:21<7:24:17, 10.54it/s]

Average agent reward:  660.0


  7%|███▏                                            | 20015/300001 [04:34<6:50:07, 11.38it/s]

Average agent reward:  230.0


  7%|███▎                                            | 21024/300001 [04:45<5:07:49, 15.10it/s]

Average agent reward:  60.0


  7%|███▌                                            | 22025/300001 [04:58<6:55:24, 11.15it/s]

Average agent reward:  530.0


  8%|███▌                                           | 23012/300001 [05:12<10:42:14,  7.19it/s]

Average agent reward:  390.0


  8%|███▊                                            | 24018/300001 [05:25<6:56:58, 11.03it/s]

Average agent reward:  310.0


  8%|████                                            | 25028/300001 [05:37<6:07:10, 12.48it/s]

Average agent reward:  120.0


  9%|████▏                                           | 26027/300001 [05:49<5:32:19, 13.74it/s]

Average agent reward:  80.0


  9%|████▎                                           | 27026/300001 [06:01<5:24:25, 14.02it/s]

Average agent reward:  270.0


  9%|████▍                                           | 28026/300001 [06:14<5:43:17, 13.20it/s]

Average agent reward:  90.0


 10%|████▋                                           | 29028/300001 [06:26<5:18:53, 14.16it/s]

Average agent reward:  140.0


 10%|████▊                                           | 30022/300001 [06:36<4:53:10, 15.35it/s]

Average agent reward:  170.0


 10%|████▉                                           | 31017/300001 [06:48<5:30:04, 13.58it/s]

Average agent reward:  700.0


 11%|█████                                           | 32018/300001 [07:00<5:56:08, 12.54it/s]

Average agent reward:  740.0


 11%|█████▎                                          | 33024/300001 [07:12<5:07:29, 14.47it/s]

Average agent reward:  330.0


 11%|█████▍                                          | 34022/300001 [07:24<4:55:44, 14.99it/s]

Average agent reward:  430.0


 12%|█████▌                                          | 35028/300001 [07:36<5:06:10, 14.42it/s]

Average agent reward:  170.0


 12%|█████▊                                          | 36018/300001 [07:49<6:18:46, 11.62it/s]

Average agent reward:  650.0


 12%|█████▉                                          | 37020/300001 [08:00<4:36:29, 15.85it/s]

Average agent reward:  240.0


 13%|██████                                          | 38018/300001 [08:12<5:04:55, 14.32it/s]

Average agent reward:  260.0


 13%|██████▏                                         | 39026/300001 [08:24<5:57:03, 12.18it/s]

Average agent reward:  710.0


 13%|██████▍                                         | 40027/300001 [08:35<4:53:12, 14.78it/s]

Average agent reward:  330.0


 14%|██████▌                                         | 41019/300001 [08:47<5:58:46, 12.03it/s]

Average agent reward:  760.0


 14%|██████▋                                         | 42022/300001 [08:59<5:06:16, 14.04it/s]

Average agent reward:  310.0


 14%|██████▉                                         | 43028/300001 [09:11<5:29:21, 13.00it/s]

Average agent reward:  670.0


 15%|███████                                         | 44017/300001 [09:22<4:37:30, 15.37it/s]

Average agent reward:  410.0


 15%|███████▏                                        | 45024/300001 [09:33<4:44:43, 14.93it/s]

Average agent reward:  460.0


 15%|███████▎                                        | 46019/300001 [09:44<4:39:18, 15.16it/s]

Average agent reward:  380.0


 16%|███████▌                                        | 47021/300001 [09:55<4:57:14, 14.18it/s]

Average agent reward:  390.0


 16%|███████▋                                        | 48029/300001 [10:06<4:17:49, 16.29it/s]

Average agent reward:  220.0


 16%|███████▊                                        | 49021/300001 [10:18<5:25:55, 12.83it/s]

Average agent reward:  720.0


 17%|████████                                        | 50017/300001 [10:29<4:35:34, 15.12it/s]

Average agent reward:  200.0


 17%|████████▏                                       | 51019/300001 [10:40<4:21:27, 15.87it/s]

Average agent reward:  80.0


 17%|████████▎                                       | 52022/300001 [10:51<4:18:17, 16.00it/s]

Average agent reward:  150.0


 18%|████████▍                                       | 53029/300001 [11:03<5:16:44, 13.00it/s]

Average agent reward:  510.0


 18%|████████▋                                       | 54029/300001 [11:14<4:53:00, 13.99it/s]

Average agent reward:  250.0


 18%|████████▊                                       | 55022/300001 [11:27<5:50:55, 11.63it/s]

Average agent reward:  950.0


 19%|████████▉                                       | 56017/300001 [11:39<5:36:37, 12.08it/s]

Average agent reward:  730.0


 19%|█████████                                       | 57019/300001 [11:52<5:34:04, 12.12it/s]

Average agent reward:  940.0


 19%|█████████▎                                      | 58029/300001 [12:03<4:52:59, 13.76it/s]

Average agent reward:  380.0


 20%|█████████▍                                      | 59028/300001 [12:14<4:14:05, 15.81it/s]

Average agent reward:  430.0


 20%|█████████▌                                      | 60019/300001 [12:25<4:26:46, 14.99it/s]

Average agent reward:  310.0


 20%|█████████▊                                      | 61023/300001 [12:36<3:44:56, 17.71it/s]

Average agent reward:  120.0


 21%|█████████▉                                      | 62026/300001 [12:48<5:09:23, 12.82it/s]

Average agent reward:  590.0


 21%|██████████                                      | 63020/300001 [13:01<6:16:25, 10.49it/s]

Average agent reward:  980.0


 21%|██████████▏                                     | 64017/300001 [13:14<5:08:05, 12.77it/s]

Average agent reward:  950.0


 22%|██████████▍                                     | 65025/300001 [13:25<4:23:39, 14.85it/s]

Average agent reward:  420.0


 22%|██████████▌                                     | 66017/300001 [13:39<6:36:16,  9.84it/s]

Average agent reward:  460.0


 22%|██████████▋                                     | 67018/300001 [13:52<5:14:22, 12.35it/s]

Average agent reward:  440.0


 23%|██████████▉                                     | 68025/300001 [14:03<4:32:13, 14.20it/s]

Average agent reward:  430.0


 23%|███████████                                     | 69017/300001 [14:14<4:10:17, 15.38it/s]

Average agent reward:  200.0


 23%|███████████▏                                    | 70019/300001 [14:25<4:05:43, 15.60it/s]

Average agent reward:  220.0


 24%|███████████▎                                    | 71023/300001 [14:37<4:27:37, 14.26it/s]

Average agent reward:  380.0


 24%|███████████▌                                    | 72018/300001 [14:48<4:37:08, 13.71it/s]

Average agent reward:  510.0


 24%|███████████▋                                    | 73027/300001 [15:00<4:37:42, 13.62it/s]

Average agent reward:  200.0


 25%|███████████▊                                    | 74018/300001 [15:12<4:48:19, 13.06it/s]

Average agent reward:  710.0


 25%|████████████                                    | 75028/300001 [15:24<5:08:36, 12.15it/s]

Average agent reward:  550.0


 25%|████████████▏                                   | 76021/300001 [15:35<4:01:58, 15.43it/s]

Average agent reward:  320.0


 26%|████████████▎                                   | 77020/300001 [15:46<3:52:28, 15.99it/s]

Average agent reward:  60.0


 26%|████████████▍                                   | 78016/300001 [15:58<6:01:28, 10.24it/s]

Average agent reward:  390.0


 26%|████████████▋                                   | 79023/300001 [16:09<4:04:04, 15.09it/s]

Average agent reward:  170.0


 27%|████████████▊                                   | 80020/300001 [16:20<4:01:18, 15.19it/s]

Average agent reward:  260.0


 27%|████████████▉                                   | 81025/300001 [16:32<4:58:29, 12.23it/s]

Average agent reward:  470.0


 27%|█████████████                                   | 82030/300001 [16:43<3:51:06, 15.72it/s]

Average agent reward:  200.0


 28%|█████████████▎                                  | 83028/300001 [16:54<3:58:50, 15.14it/s]

Average agent reward:  300.0


 28%|█████████████▍                                  | 84024/300001 [17:07<5:04:17, 11.83it/s]

Average agent reward:  960.0


 28%|█████████████▌                                  | 85017/300001 [17:19<4:30:08, 13.26it/s]

Average agent reward:  500.0


 29%|█████████████▊                                  | 86030/300001 [17:31<4:28:48, 13.27it/s]

Average agent reward:  590.0


 29%|█████████████▉                                  | 87018/300001 [17:42<3:58:37, 14.88it/s]

Average agent reward:  300.0


 29%|██████████████                                  | 88024/300001 [17:53<4:01:51, 14.61it/s]

Average agent reward:  410.0


 30%|██████████████▏                                 | 89016/300001 [18:05<4:29:54, 13.03it/s]

Average agent reward:  290.0


 30%|██████████████▍                                 | 90021/300001 [18:17<4:45:39, 12.25it/s]

Average agent reward:  230.0


 30%|██████████████▌                                 | 91024/300001 [18:28<3:39:51, 15.84it/s]

Average agent reward:  180.0


 31%|██████████████▋                                 | 92022/300001 [18:39<3:47:28, 15.24it/s]

Average agent reward:  80.0


 31%|██████████████▉                                 | 93021/300001 [18:49<3:32:34, 16.23it/s]

Average agent reward:  120.0


 31%|███████████████                                 | 94022/300001 [19:01<3:50:31, 14.89it/s]

Average agent reward:  250.0


 32%|███████████████▏                                | 95016/300001 [19:12<4:56:24, 11.53it/s]

Average agent reward:  230.0


 32%|███████████████▎                                | 96022/300001 [19:23<4:19:56, 13.08it/s]

Average agent reward:  610.0


 32%|███████████████▌                                | 97020/300001 [19:35<4:12:18, 13.41it/s]

Average agent reward:  570.0


 33%|███████████████▋                                | 98023/300001 [19:47<4:15:40, 13.17it/s]

Average agent reward:  580.0


 33%|███████████████▊                                | 99025/300001 [19:59<4:18:16, 12.97it/s]

Average agent reward:  710.0


 33%|███████████████▋                               | 100021/300001 [20:11<4:06:03, 13.55it/s]

Average agent reward:  550.0


 34%|███████████████▊                               | 101029/300001 [20:23<4:34:37, 12.08it/s]

Average agent reward:  1090.0


 34%|███████████████▉                               | 102016/300001 [20:35<4:17:09, 12.83it/s]

Average agent reward:  720.0


 34%|████████████████▏                              | 103020/300001 [20:47<4:05:00, 13.40it/s]

Average agent reward:  690.0


 35%|████████████████▎                              | 104029/300001 [20:59<4:01:33, 13.52it/s]

Average agent reward:  470.0


 35%|████████████████▍                              | 105019/300001 [21:10<3:37:40, 14.93it/s]

Average agent reward:  270.0


 35%|████████████████▌                              | 106017/300001 [21:21<4:04:35, 13.22it/s]

Average agent reward:  790.0


 36%|████████████████▊                              | 107025/300001 [21:33<4:21:38, 12.29it/s]

Average agent reward:  560.0


 36%|████████████████▉                              | 108028/300001 [21:45<3:48:19, 14.01it/s]

Average agent reward:  540.0


 36%|█████████████████                              | 109018/300001 [21:57<3:44:39, 14.17it/s]

Average agent reward:  300.0


 37%|█████████████████▏                             | 110019/300001 [22:07<3:01:25, 17.45it/s]

Average agent reward:  180.0


 37%|█████████████████▍                             | 111018/300001 [22:19<4:00:16, 13.11it/s]

Average agent reward:  550.0


 37%|█████████████████▌                             | 112024/300001 [22:32<4:32:35, 11.49it/s]

Average agent reward:  1160.0


 38%|█████████████████▋                             | 113024/300001 [22:44<4:13:58, 12.27it/s]

Average agent reward:  950.0


 38%|█████████████████▊                             | 114016/300001 [22:55<3:35:08, 14.41it/s]

Average agent reward:  460.0


 38%|██████████████████                             | 115030/300001 [23:07<3:28:20, 14.80it/s]

Average agent reward:  280.0


 39%|██████████████████▏                            | 116027/300001 [23:19<3:56:42, 12.95it/s]

Average agent reward:  670.0


 39%|██████████████████▎                            | 117016/300001 [23:31<3:42:30, 13.71it/s]

Average agent reward:  410.0


 39%|██████████████████▍                            | 118020/300001 [23:42<3:39:38, 13.81it/s]

Average agent reward:  210.0


 40%|██████████████████▋                            | 119020/300001 [23:53<3:00:14, 16.74it/s]

Average agent reward:  180.0


 40%|██████████████████▊                            | 120029/300001 [24:05<3:44:32, 13.36it/s]

Average agent reward:  530.0


 40%|██████████████████▉                            | 121016/300001 [24:16<3:27:04, 14.41it/s]

Average agent reward:  140.0


 41%|███████████████████                            | 122019/300001 [24:27<3:18:06, 14.97it/s]

Average agent reward:  200.0


 41%|███████████████████▎                           | 123031/300001 [24:39<3:13:13, 15.26it/s]

Average agent reward:  140.0


 41%|███████████████████▍                           | 124030/300001 [24:50<3:28:47, 14.05it/s]

Average agent reward:  450.0


 42%|███████████████████▌                           | 125022/300001 [25:02<3:18:58, 14.66it/s]

Average agent reward:  260.0


 42%|███████████████████▋                           | 126016/300001 [25:13<3:17:10, 14.71it/s]

Average agent reward:  240.0


 42%|███████████████████▉                           | 127015/300001 [25:25<5:29:48,  8.74it/s]

Average agent reward:  890.0


 43%|████████████████████                           | 128021/300001 [25:37<3:24:45, 14.00it/s]

Average agent reward:  420.0


 43%|████████████████████▏                          | 129023/300001 [25:48<3:18:22, 14.36it/s]

Average agent reward:  270.0


 43%|████████████████████▎                          | 130025/300001 [26:00<3:38:36, 12.96it/s]

Average agent reward:  410.0


 44%|████████████████████▌                          | 131023/300001 [26:11<3:21:14, 13.99it/s]

Average agent reward:  380.0


 44%|████████████████████▋                          | 132024/300001 [26:23<3:30:23, 13.31it/s]

Average agent reward:  660.0


 44%|████████████████████▊                          | 133028/300001 [26:35<3:13:03, 14.41it/s]

Average agent reward:  330.0


 45%|████████████████████▉                          | 134030/300001 [26:47<3:26:18, 13.41it/s]

Average agent reward:  510.0


 45%|█████████████████████▏                         | 135023/300001 [26:58<3:07:43, 14.65it/s]

Average agent reward:  210.0


 45%|█████████████████████▎                         | 136016/300001 [27:10<3:45:51, 12.10it/s]

Average agent reward:  740.0


 46%|█████████████████████▍                         | 137024/300001 [27:21<3:03:43, 14.78it/s]

Average agent reward:  120.0


 46%|█████████████████████▌                         | 138015/300001 [27:34<5:12:44,  8.63it/s]

Average agent reward:  660.0


 46%|█████████████████████▊                         | 139026/300001 [27:46<3:43:24, 12.01it/s]

Average agent reward:  690.0


 47%|█████████████████████▉                         | 140024/300001 [27:58<3:48:32, 11.67it/s]

Average agent reward:  920.0


 47%|██████████████████████                         | 141017/300001 [28:10<3:22:49, 13.06it/s]

Average agent reward:  440.0


 47%|██████████████████████▎                        | 142028/300001 [28:22<3:22:02, 13.03it/s]

Average agent reward:  700.0


 48%|██████████████████████▍                        | 143025/300001 [28:35<3:30:09, 12.45it/s]

Average agent reward:  540.0


 48%|██████████████████████▌                        | 144016/300001 [28:46<3:24:22, 12.72it/s]

Average agent reward:  570.0


 48%|██████████████████████▋                        | 145020/300001 [28:59<3:22:39, 12.75it/s]

Average agent reward:  840.0


 49%|██████████████████████▉                        | 146028/300001 [29:10<2:49:27, 15.14it/s]

Average agent reward:  370.0


 49%|███████████████████████                        | 147023/300001 [29:21<2:37:29, 16.19it/s]

Average agent reward:  320.0


 49%|███████████████████████▏                       | 148025/300001 [29:32<2:58:22, 14.20it/s]

Average agent reward:  630.0


 50%|███████████████████████▎                       | 149028/300001 [29:43<3:00:50, 13.91it/s]

Average agent reward:  430.0


 50%|███████████████████████▌                       | 150019/300001 [29:54<2:42:35, 15.37it/s]

Average agent reward:  280.0


 50%|███████████████████████▋                       | 151018/300001 [30:05<2:52:55, 14.36it/s]

Average agent reward:  630.0


 51%|███████████████████████▊                       | 152026/300001 [30:16<2:33:08, 16.10it/s]

Average agent reward:  140.0


 51%|███████████████████████▉                       | 153033/300001 [30:27<1:44:14, 23.50it/s]

Average agent reward:  160.0


 51%|████████████████████████▏                      | 154021/300001 [30:37<2:34:29, 15.75it/s]

Average agent reward:  360.0


 52%|████████████████████████▎                      | 155020/300001 [30:48<2:44:57, 14.65it/s]

Average agent reward:  510.0


 52%|████████████████████████▍                      | 156017/300001 [30:59<2:32:22, 15.75it/s]

Average agent reward:  440.0


 52%|████████████████████████▌                      | 157029/300001 [31:10<2:33:28, 15.53it/s]

Average agent reward:  470.0


 53%|████████████████████████▊                      | 158020/300001 [31:21<2:38:26, 14.93it/s]

Average agent reward:  540.0


 53%|████████████████████████▉                      | 159032/300001 [31:33<2:49:08, 13.89it/s]

Average agent reward:  710.0


 53%|█████████████████████████                      | 160030/300001 [31:43<2:25:50, 16.00it/s]

Average agent reward:  380.0


 54%|█████████████████████████▏                     | 161022/300001 [31:55<2:51:21, 13.52it/s]

Average agent reward:  910.0


 54%|█████████████████████████▍                     | 162017/300001 [32:06<3:47:03, 10.13it/s]

Average agent reward:  760.0


 54%|█████████████████████████▌                     | 163026/300001 [32:17<2:13:29, 17.10it/s]

Average agent reward:  120.0


 55%|█████████████████████████▋                     | 164024/300001 [32:28<2:20:17, 16.15it/s]

Average agent reward:  390.0


 55%|█████████████████████████▊                     | 165026/300001 [32:39<2:36:20, 14.39it/s]

Average agent reward:  330.0


 55%|██████████████████████████                     | 166018/300001 [32:50<2:41:36, 13.82it/s]

Average agent reward:  1010.0


 56%|██████████████████████████▏                    | 167021/300001 [33:01<2:21:46, 15.63it/s]

Average agent reward:  460.0


 56%|██████████████████████████▎                    | 168026/300001 [33:12<2:24:54, 15.18it/s]

Average agent reward:  490.0


 56%|██████████████████████████▍                    | 169022/300001 [33:24<2:48:53, 12.93it/s]

Average agent reward:  930.0


 57%|██████████████████████████▋                    | 170027/300001 [33:35<2:23:29, 15.10it/s]

Average agent reward:  230.0


 57%|██████████████████████████▊                    | 171026/300001 [33:47<2:31:49, 14.16it/s]

Average agent reward:  840.0


 57%|██████████████████████████▉                    | 172019/300001 [33:58<2:19:40, 15.27it/s]

Average agent reward:  400.0


 58%|███████████████████████████                    | 173030/300001 [34:10<2:57:08, 11.95it/s]

Average agent reward:  700.0


 58%|███████████████████████████▎                   | 174017/300001 [34:21<3:37:36,  9.65it/s]

Average agent reward:  690.0


 58%|███████████████████████████▍                   | 175020/300001 [34:33<2:27:54, 14.08it/s]

Average agent reward:  440.0


 59%|███████████████████████████▌                   | 176031/300001 [34:45<2:29:33, 13.82it/s]

Average agent reward:  930.0


 59%|███████████████████████████▋                   | 177020/300001 [34:56<2:19:41, 14.67it/s]

Average agent reward:  220.0


 59%|███████████████████████████▉                   | 178032/300001 [35:07<2:18:26, 14.68it/s]

Average agent reward:  180.0


 60%|████████████████████████████                   | 179030/300001 [35:18<2:19:39, 14.44it/s]

Average agent reward:  720.0


 60%|████████████████████████████▏                  | 180027/300001 [35:30<2:26:21, 13.66it/s]

Average agent reward:  670.0


 60%|████████████████████████████▎                  | 181024/300001 [35:41<2:14:03, 14.79it/s]

Average agent reward:  420.0


 61%|████████████████████████████▌                  | 182023/300001 [35:52<2:12:42, 14.82it/s]

Average agent reward:  350.0


 61%|████████████████████████████▋                  | 183018/300001 [36:03<2:14:10, 14.53it/s]

Average agent reward:  510.0


 61%|████████████████████████████▊                  | 184028/300001 [36:16<2:45:22, 11.69it/s]

Average agent reward:  1210.0


 62%|████████████████████████████▉                  | 185025/300001 [36:27<2:22:30, 13.45it/s]

Average agent reward:  650.0


 62%|█████████████████████████████▏                 | 186017/300001 [36:39<2:31:58, 12.50it/s]

Average agent reward:  750.0


 62%|█████████████████████████████▎                 | 187023/300001 [36:51<2:22:50, 13.18it/s]

Average agent reward:  650.0


 63%|█████████████████████████████▍                 | 188028/300001 [37:02<2:13:11, 14.01it/s]

Average agent reward:  590.0


 63%|█████████████████████████████▌                 | 189020/300001 [37:14<2:24:49, 12.77it/s]

Average agent reward:  640.0


 63%|█████████████████████████████▊                 | 190023/300001 [37:27<2:42:12, 11.30it/s]

Average agent reward:  1170.0


 64%|█████████████████████████████▉                 | 191031/300001 [37:39<2:13:05, 13.65it/s]

Average agent reward:  930.0


 64%|██████████████████████████████                 | 192031/300001 [37:50<2:07:11, 14.15it/s]

Average agent reward:  800.0


 64%|██████████████████████████████▏                | 193030/300001 [38:02<2:05:37, 14.19it/s]

Average agent reward:  690.0


 65%|██████████████████████████████▍                | 194028/300001 [38:12<1:56:13, 15.20it/s]

Average agent reward:  560.0


 65%|██████████████████████████████▌                | 195024/300001 [38:24<2:01:53, 14.35it/s]

Average agent reward:  720.0


 65%|██████████████████████████████▋                | 196033/300001 [38:35<2:01:36, 14.25it/s]

Average agent reward:  900.0


 66%|██████████████████████████████▊                | 197015/300001 [38:47<3:20:18,  8.57it/s]

Average agent reward:  1370.0


 66%|███████████████████████████████                | 198027/300001 [39:00<2:19:39, 12.17it/s]

Average agent reward:  1480.0


 66%|███████████████████████████████▏               | 199022/300001 [39:12<1:58:49, 14.16it/s]

Average agent reward:  1020.0


 67%|███████████████████████████████▎               | 200020/300001 [39:23<2:03:59, 13.44it/s]

Average agent reward:  840.0


 67%|███████████████████████████████▍               | 201018/300001 [39:35<2:04:16, 13.27it/s]

Average agent reward:  580.0


 67%|███████████████████████████████▋               | 202032/300001 [39:47<2:02:45, 13.30it/s]

Average agent reward:  960.0


 68%|███████████████████████████████▊               | 203026/300001 [39:58<1:56:05, 13.92it/s]

Average agent reward:  970.0


 68%|███████████████████████████████▉               | 204026/300001 [40:10<2:00:08, 13.31it/s]

Average agent reward:  1150.0


 68%|████████████████████████████████               | 205020/300001 [40:21<1:53:27, 13.95it/s]

Average agent reward:  820.0


 69%|████████████████████████████████▎              | 206030/300001 [40:34<2:04:32, 12.58it/s]

Average agent reward:  790.0


 69%|████████████████████████████████▍              | 207027/300001 [40:45<1:45:56, 14.63it/s]

Average agent reward:  510.0


 69%|████████████████████████████████▌              | 208020/300001 [40:57<2:09:40, 11.82it/s]

Average agent reward:  770.0


 70%|████████████████████████████████▋              | 209030/300001 [41:11<2:00:31, 12.58it/s]

Average agent reward:  610.0


 70%|████████████████████████████████▉              | 210025/300001 [41:23<2:09:57, 11.54it/s]

Average agent reward:  970.0


 70%|█████████████████████████████████              | 211027/300001 [41:35<1:52:13, 13.21it/s]

Average agent reward:  680.0


 71%|█████████████████████████████████▏             | 212028/300001 [41:47<2:01:35, 12.06it/s]

Average agent reward:  930.0


 71%|█████████████████████████████████▎             | 213029/300001 [41:59<1:54:22, 12.67it/s]

Average agent reward:  1000.0


 71%|█████████████████████████████████▌             | 214029/300001 [42:12<2:03:50, 11.57it/s]

Average agent reward:  1120.0


 72%|█████████████████████████████████▋             | 215016/300001 [42:24<1:49:02, 12.99it/s]

Average agent reward:  880.0


 72%|█████████████████████████████████▊             | 216019/300001 [42:36<1:48:15, 12.93it/s]

Average agent reward:  610.0


 72%|█████████████████████████████████▉             | 217017/300001 [42:48<2:00:45, 11.45it/s]

Average agent reward:  1020.0


 73%|██████████████████████████████████▏            | 218028/300001 [43:00<1:47:13, 12.74it/s]

Average agent reward:  790.0


 73%|██████████████████████████████████▎            | 219033/300001 [43:12<1:42:14, 13.20it/s]

Average agent reward:  750.0


 73%|██████████████████████████████████▍            | 220023/300001 [43:24<1:46:42, 12.49it/s]

Average agent reward:  540.0


 74%|██████████████████████████████████▋            | 221028/300001 [43:37<1:47:36, 12.23it/s]

Average agent reward:  950.0


 74%|██████████████████████████████████▊            | 222030/300001 [43:48<1:35:28, 13.61it/s]

Average agent reward:  690.0


 74%|██████████████████████████████████▉            | 223031/300001 [44:00<1:35:53, 13.38it/s]

Average agent reward:  790.0


 75%|███████████████████████████████████            | 224022/300001 [44:13<1:51:44, 11.33it/s]

Average agent reward:  970.0


 75%|███████████████████████████████████▎           | 225027/300001 [44:25<1:32:34, 13.50it/s]

Average agent reward:  460.0


 75%|███████████████████████████████████▍           | 226027/300001 [44:37<1:33:05, 13.24it/s]

Average agent reward:  910.0


 76%|███████████████████████████████████▌           | 227032/300001 [44:49<1:44:08, 11.68it/s]

Average agent reward:  610.0


 76%|███████████████████████████████████▋           | 228027/300001 [45:02<1:50:33, 10.85it/s]

Average agent reward:  1210.0


 76%|███████████████████████████████████▉           | 229018/300001 [45:15<1:34:42, 12.49it/s]

Average agent reward:  780.0


 77%|████████████████████████████████████           | 230023/300001 [45:27<1:39:14, 11.75it/s]

Average agent reward:  650.0


 77%|████████████████████████████████████▏          | 231022/300001 [45:40<1:40:14, 11.47it/s]

Average agent reward:  790.0


 77%|████████████████████████████████████▎          | 232021/300001 [45:53<1:34:13, 12.02it/s]

Average agent reward:  810.0


 78%|████████████████████████████████████▌          | 233015/300001 [46:06<2:32:39,  7.31it/s]

Average agent reward:  890.0


 78%|████████████████████████████████████▋          | 234018/300001 [46:20<1:35:48, 11.48it/s]

Average agent reward:  880.0


 78%|████████████████████████████████████▊          | 235019/300001 [46:32<1:25:09, 12.72it/s]

Average agent reward:  360.0


 79%|████████████████████████████████████▉          | 236019/300001 [46:45<1:42:07, 10.44it/s]

Average agent reward:  1250.0


 79%|█████████████████████████████████████▏         | 237025/300001 [46:58<1:31:59, 11.41it/s]

Average agent reward:  830.0


 79%|█████████████████████████████████████▎         | 238017/300001 [47:10<1:27:07, 11.86it/s]

Average agent reward:  600.0


 80%|█████████████████████████████████████▍         | 239025/300001 [47:23<1:27:56, 11.56it/s]

Average agent reward:  760.0


 80%|█████████████████████████████████████▌         | 240023/300001 [47:36<1:32:26, 10.81it/s]

Average agent reward:  750.0


 80%|█████████████████████████████████████▊         | 241014/300001 [47:48<1:44:37,  9.40it/s]

Average agent reward:  620.0


 81%|█████████████████████████████████████▉         | 242027/300001 [48:00<1:20:11, 12.05it/s]

Average agent reward:  980.0


 81%|██████████████████████████████████████         | 243026/300001 [48:13<1:26:46, 10.94it/s]

Average agent reward:  840.0


 81%|██████████████████████████████████████▏        | 244027/300001 [48:25<1:16:47, 12.15it/s]

Average agent reward:  700.0


 82%|██████████████████████████████████████▍        | 245024/300001 [48:39<1:30:23, 10.14it/s]

Average agent reward:  1180.0


 82%|██████████████████████████████████████▌        | 246028/300001 [48:52<1:20:39, 11.15it/s]

Average agent reward:  1010.0


 82%|██████████████████████████████████████▋        | 247025/300001 [49:05<1:13:59, 11.93it/s]

Average agent reward:  760.0


 83%|██████████████████████████████████████▊        | 248025/300001 [49:17<1:12:07, 12.01it/s]

Average agent reward:  750.0


 83%|███████████████████████████████████████        | 249014/300001 [49:32<2:12:13,  6.43it/s]

Average agent reward:  760.0


 83%|███████████████████████████████████████▏       | 250023/300001 [49:45<1:07:00, 12.43it/s]

Average agent reward:  930.0


 84%|███████████████████████████████████████▎       | 251027/300001 [49:57<1:06:54, 12.20it/s]

Average agent reward:  840.0


 84%|███████████████████████████████████████▍       | 252020/300001 [50:10<1:09:37, 11.49it/s]

Average agent reward:  920.0


 84%|███████████████████████████████████████▋       | 253025/300001 [50:23<1:06:07, 11.84it/s]

Average agent reward:  750.0


 85%|███████████████████████████████████████▊       | 254021/300001 [50:37<1:28:41,  8.64it/s]

Average agent reward:  740.0


 85%|███████████████████████████████████████▉       | 255025/300001 [50:50<1:09:36, 10.77it/s]

Average agent reward:  370.0


 85%|█████████████████████████████████████████▊       | 256025/300001 [51:02<59:46, 12.26it/s]

Average agent reward:  680.0


 86%|████████████████████████████████████████▎      | 257030/300001 [51:15<1:03:16, 11.32it/s]

Average agent reward:  620.0


 86%|██████████████████████████████████████████▏      | 258019/300001 [51:26<50:07, 13.96it/s]

Average agent reward:  280.0


 86%|██████████████████████████████████████████▎      | 259026/300001 [51:38<54:17, 12.58it/s]

Average agent reward:  580.0


 87%|██████████████████████████████████████████▍      | 260025/300001 [51:50<50:56, 13.08it/s]

Average agent reward:  820.0


 87%|██████████████████████████████████████████▋      | 261024/300001 [52:03<56:16, 11.54it/s]

Average agent reward:  520.0


 87%|██████████████████████████████████████████▊      | 262027/300001 [52:15<55:45, 11.35it/s]

Average agent reward:  720.0


 88%|██████████████████████████████████████████▉      | 263024/300001 [52:27<45:00, 13.69it/s]

Average agent reward:  560.0


 88%|███████████████████████████████████████████      | 264019/300001 [52:39<43:58, 13.64it/s]

Average agent reward:  410.0


 88%|███████████████████████████████████████████▎     | 265031/300001 [52:51<45:41, 12.76it/s]

Average agent reward:  680.0


 89%|███████████████████████████████████████████▍     | 266028/300001 [53:03<43:03, 13.15it/s]

Average agent reward:  790.0


 89%|███████████████████████████████████████████▌     | 267026/300001 [53:15<42:17, 13.00it/s]

Average agent reward:  880.0


 89%|███████████████████████████████████████████▊     | 268022/300001 [53:27<41:29, 12.85it/s]

Average agent reward:  1090.0


 90%|███████████████████████████████████████████▉     | 269024/300001 [53:39<39:48, 12.97it/s]

Average agent reward:  520.0


 90%|████████████████████████████████████████████     | 270019/300001 [53:50<37:21, 13.38it/s]

Average agent reward:  730.0


 90%|████████████████████████████████████████████▎    | 271020/300001 [54:03<39:26, 12.25it/s]

Average agent reward:  610.0


 91%|████████████████████████████████████████████▍    | 272019/300001 [54:15<36:21, 12.83it/s]

Average agent reward:  660.0


 91%|████████████████████████████████████████████▌    | 273017/300001 [54:27<34:14, 13.13it/s]

Average agent reward:  570.0


 91%|████████████████████████████████████████████▊    | 274029/300001 [54:39<33:24, 12.96it/s]

Average agent reward:  760.0


 92%|████████████████████████████████████████████▉    | 275029/300001 [54:50<31:23, 13.26it/s]

Average agent reward:  600.0


 92%|█████████████████████████████████████████████    | 276026/300001 [55:02<30:54, 12.93it/s]

Average agent reward:  460.0


 92%|█████████████████████████████████████████████▏   | 277026/300001 [55:14<30:41, 12.47it/s]

Average agent reward:  680.0


 93%|█████████████████████████████████████████████▍   | 278024/300001 [55:26<27:12, 13.46it/s]

Average agent reward:  810.0


 93%|█████████████████████████████████████████████▌   | 279024/300001 [55:38<28:26, 12.29it/s]

Average agent reward:  710.0


 93%|█████████████████████████████████████████████▋   | 280023/300001 [55:50<27:05, 12.29it/s]

Average agent reward:  490.0


 94%|█████████████████████████████████████████████▉   | 281022/300001 [56:02<24:30, 12.91it/s]

Average agent reward:  640.0


 94%|██████████████████████████████████████████████   | 282022/300001 [56:14<23:06, 12.96it/s]

Average agent reward:  780.0


 94%|██████████████████████████████████████████████▏  | 283024/300001 [56:26<21:13, 13.33it/s]

Average agent reward:  660.0


 95%|██████████████████████████████████████████████▍  | 284021/300001 [56:39<22:01, 12.09it/s]

Average agent reward:  690.0


 95%|██████████████████████████████████████████████▌  | 285027/300001 [56:50<18:34, 13.43it/s]

Average agent reward:  520.0


 95%|██████████████████████████████████████████████▋  | 286013/300001 [57:02<19:24, 12.01it/s]

Average agent reward:  430.0


 96%|██████████████████████████████████████████████▉  | 287027/300001 [57:15<17:25, 12.41it/s]

Average agent reward:  630.0


 96%|███████████████████████████████████████████████  | 288018/300001 [57:27<18:13, 10.96it/s]

Average agent reward:  790.0


 96%|███████████████████████████████████████████████▏ | 289029/300001 [57:39<14:11, 12.89it/s]

Average agent reward:  590.0


 97%|███████████████████████████████████████████████▎ | 290015/300001 [57:53<24:37,  6.76it/s]

Average agent reward:  890.0


 97%|███████████████████████████████████████████████▌ | 291022/300001 [58:06<11:21, 13.18it/s]

Average agent reward:  870.0


 97%|███████████████████████████████████████████████▋ | 292022/300001 [58:18<09:31, 13.96it/s]

Average agent reward:  300.0


 98%|███████████████████████████████████████████████▊ | 293018/300001 [58:30<08:29, 13.70it/s]

Average agent reward:  500.0


 98%|████████████████████████████████████████████████ | 294018/300001 [58:42<08:26, 11.81it/s]

Average agent reward:  780.0


 98%|████████████████████████████████████████████████▏| 295023/300001 [58:54<06:28, 12.82it/s]

Average agent reward:  700.0


 99%|████████████████████████████████████████████████▎| 296023/300001 [59:07<05:28, 12.12it/s]

Average agent reward:  660.0


 99%|████████████████████████████████████████████████▌| 297027/300001 [59:20<04:38, 10.67it/s]

Average agent reward:  1020.0


 99%|████████████████████████████████████████████████▋| 298017/300001 [59:32<02:43, 12.10it/s]

Average agent reward:  350.0


100%|████████████████████████████████████████████████▊| 299029/300001 [59:45<01:20, 12.10it/s]

Average agent reward:  710.0


100%|█████████████████████████████████████████████████| 300001/300001 [59:57<00:00, 83.39it/s]

Average agent reward:  610.0





## Part 3 - Visualizing the results

In [23]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display

def show_video_of_model(agent, env):
  state, _ = env.reset()
  done = False
  frames = []
  while not done:
    frame = env.render()
    frames.append(frame)
    action = agent.act(state)
    state, reward, done, _, _ = env.step(action[0])
  env.close()
  imageio.mimsave('video.mp4', frames, fps=30)

show_video_of_model(agent, env)

def show_video():
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

show_video()

