<a href="https://colab.research.google.com/github/zzmtsvv/RL-with-gym/blob/main/breakoutv0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install gym
! pip install gym[atari]

In [2]:
! wget http://www.atarimania.com/roms/Roms.rar

--2021-07-21 13:20:39--  http://www.atarimania.com/roms/Roms.rar
Resolving www.atarimania.com (www.atarimania.com)... 195.154.81.199
Connecting to www.atarimania.com (www.atarimania.com)|195.154.81.199|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11128004 (11M) [application/x-rar-compressed]
Saving to: ‘Roms.rar.1’


2021-07-21 13:21:03 (458 KB/s) - ‘Roms.rar.1’ saved [11128004/11128004]



In [3]:
! unzip /content/ROMS.zip

Archive:  /content/ROMS.zip
replace ROMS/128 in 1 Game Select ROM (128 in 1) (Unknown) ~.bin? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
!python -m atari_py.import_roms /content/ROMS

In [5]:
import torch
from torch import nn
from torchvision import transforms as T
import numpy as np
import random
import datetime
import time
import matplotlib.pyplot as plt
import os
from PIL import Image
from copy import deepcopy
from pathlib import Path
from collections import deque, namedtuple
import gym
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from gym.spaces import Box
from skimage import transform

In [6]:
env = gym.make('Breakout-v0')

In [7]:
env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

(210, 160, 3),
 0.0,
 False,
 {'ale.lives': 5}


In [8]:
def preprocess_state(state):
  cropped = Image.fromarray(state).crop((0, 34, 160, 160 + 34))
  composition = T.Compose([T.Grayscale(),
                           T.Resize((96, 96))])
  img = composition(cropped)
  small_img = np.uint8(img)
  return np.expand_dims(small_img, axis=0)


class ReplayMemory(object):
  def __init__(self, capacity):
    self.capacity = capacity
    self.memory = deque(maxlen=capacity)
    self.transition = namedtuple('Transition', ('state', 'action',
                                                'next_state', 'reward'))
    self.position = 0
  
  def __len__(self):
    return len(self.memory)
  
  def push(self, *args):
    self.memory.append(self.transition(*args))
  
  def sample(self, batch_size):
    return random.sample(self.memory, batch_size)

In [9]:
class SkipFrame(gym.Wrapper):
  def __init__(self, env, skip):
    """Return only every `skip`-th frame"""
    super().__init__(env)
    self._skip = skip

  def step(self, action):
    """Repeat action, and sum reward"""
    total_reward = 0.0
    done = False
    for i in range(self._skip):
      # Accumulate reward and repeat the same action
      obs, reward, done, info = self.env.step(action)
      total_reward += reward
      if done:
        break
    return obs, total_reward, done, info


class ResizeObservation(gym.ObservationWrapper):
  def __init__(self, env, shape):
    super().__init__(env)
    if isinstance(shape, int):
      self.shape = (shape, shape)
    else:
      self.shape = tuple(shape)

    obs_shape = self.shape + self.observation_space.shape[2:]
    self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

  def observation(self, observation):
    resize_obs = transform.resize(observation, self.shape)
    # cast float back to uint8
    resize_obs *= 255
    resize_obs = resize_obs.astype(np.uint8)
    return resize_obs


env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)

In [10]:
class ArseNet(nn.Module):
  def __init__(self, input_dim, output_dim):
    super().__init__()
    c, h, w = input_dim
    if h != 84:
      raise ArithmeticError(f"Expected input height: 84, got: {h}")
    if w != 84:
      raise ArithmeticError(f"Expected input width: 84, got: {w}")

    self.online = nn.Sequential(
        nn.Conv2d(in_channels=c, out_channels=32, kernel_size=8, stride=4),
        nn.ReLU(),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2),
        nn.ReLU(),
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(3136, 512),
        nn.ReLU(),
        nn.Linear(512, output_dim)
    )
    self.target = deepcopy(self.online)

    for theta in self.target.parameters():
      theta.requires_grad = False
  
  def forward(self, inputs, model):
    if model == 'online':
      return self.online(inputs)
    elif model == 'target':
      return self.target(inputs)

In [11]:
class Agent():
  def __init__(self, state_dim, action_dim, save_dir, checkpoint=False):
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.memory = deque(maxlen=100000)
    self.batch_size = 32

    self.exploration_rate = 1
    self.exploration_rate_decay = 0.9999995
    self.exploration_rate_min = 0.1
    self.gamma = 0.9

    self.curr_step = 0
    self.burnin = 1e5
    self.learn_every = 3
    self.sync_every = 1e4

    self.save_every = 5e5
    self.save_dir = save_dir

    self.gpu = torch.cuda.is_available()

    self.net = ArseNet(self.state_dim, self.action_dim).float()
    if self.gpu:
      self.net = self.net.to(device='cuda')
    if checkpoint:
      self.load(checkpoint)
    
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=3e-4)
    self.loss_fn = nn.SmoothL1Loss()

  
  def act(self, state):
    '''
    choose epsilon-greedy action given state and update the value of step
    '''
    # exploration
    if np.random.rand() < self.exploration_rate:
      action_idx = np.random.randint(self.action_dim)
    # exploitation
    else:
      torch.cuda.empty_cache()
      state = torch.FloatTensor(state).cuda() if self.gpu else torch.FloatTensor(state)
      state = state.unsqueeze(0)
      action_values = self.net(state, model='online')
      action_idx = torch.argmax(action_values, axis=1).item()
    
    # decrease exploration_rate
    self.exploration_rate *= self.exploration_rate_decay
    if self.exploration_rate < self.exploration_rate_min:
      self.exploration_rate = self.exploration_rate_min
    
    self.curr_step += 1
    return action_idx
  
  def cache(self, state, next_state, action, reward, done):
    '''
    store the experience to self.memory
    '''
    state = torch.FloatTensor(state).cuda() if self.gpu else torch.FloatTensor(state)
    next_state = torch.FloatTensor(next_state).cuda() if self.gpu else torch.FloatTensor(next_state)
    action = torch.LongTensor([action]).cuda() if self.gpu else torch.LongTensor([action])
    reward = torch.DoubleTensor([reward]).cuda() if self.gpu else torch.DoubleTensor([reward])
    done = torch.BoolTensor([done]).cuda() if self.gpu else torch.BoolTensor([done])

    self.memory.append((state, next_state, action, reward, done,))
  
  def recall(self):
    '''
    Take a batch of experience from memory
    '''
    batch = random.sample(self.memory, self.batch_size)
    state, next_state, action, reward, done = map(torch.stack, zip(*batch))
    return state, next_state, action.squeeze(), reward.squeeze(), done.squeeze()
  
  def td_estimate(self, state, action):
    # the predicted optimal Q_online for a given state
    current_Q = self.net(state, model="online")[np.arange(0, self.batch_size), action]
    return current_Q
  
  @torch.no_grad()
  def td_target(self, reward, next_state, done):
    '''
    TD Target - aggregation of current reward and the estimated Q
    in the next state
    '''
    next_state_Q = self.net(state, model='online')
    best_action = torch.argmax(next_state_Q, axis=1)
    next_Q = self.net(next_state, model='target')[np.arange(0, self.batch_size), best_action]
    return (reward + (1 - done.float()) * self.gamma * next_Q).float()
  
  def update_Q_online(self, td_estimate, td_target):
    loss = self.loss_fn(td_estimate, td_target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()
  
  def sync_Q_target(self):
    self.net.target.load_state_dict(self.net.online.state_dict())
  
  def save(self):
    num = int(self.curr_step // self.save_every)
    save_path = (self.save_dir / f"mario_net_{num}.chkpt")
    torch.save(dict(model=self.net.state_dict(),
                    exploration_rate=self.exploration_rate), save_path)
    print(f"ArseNet saved to {save_path} at step {self.curr_step}")
  
  def learn(self):
    if not self.curr_step % self.sync_every:
      self.sync_Q_target()

    # if not self.curr_step % self.save_every:
    #  self.save()

    if self.curr_step < self.burnin:
      return None, None

    if self.curr_step % self.learn_every:
      return None, None

    
    # sample from memory
    state, next_state, action, reward, done = self.recall()

    # TD Estimate and TD Target
    td_est = self.td_estimate(state, action)
    td_trgt = self.td_target(reward, next_state, done)

    loss = self.update_Q_online(td_est, td_trgt)

    return td_est.mean().item(), loss
  
  def save(self):
    save_path = self.save_dir / f"arse_net_{int(self.curr_step // self.save_every)}.chkpt"
    torch.save(
        dict(
            model=self.net.state_dict(),
            exploration_rate=self.exploration_rate
        ),
        save_path
    )
    print(f"ArseNet saved to {save_path} at step {self.curr_step}")
  
  def load(self, load_path):
    if not load_path.exists():
      raise ValueError(f"{load_path} does not exist")

    ckp = torch.load(load_path, map_location=('cuda' if self.gpu else 'cpu'))
    exploration_rate = ckp.get('exploration_rate')
    state_dict = ckp.get('model')

    print(f"Loading model at {load_path} with exploration rate {exploration_rate}")
    self.net.load_state_dict(state_dict)
    self.exploration_rate = exploration_rate

In [12]:
torch.cuda.is_available()

True

In [None]:
save_dir = Path("checkpoints") / datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
save_dir.mkdir(parents=True)

agent = Agent(state_dim=(4, 84, 84), action_dim=env.action_space.n, save_dir=save_dir)

In [14]:
episodes = 1000
rewards = []

In [15]:
for e in range(episodes):
  state = env.reset()

  while True:
    action = agent.act(state)
    next_state, reward, done, info = env.step(action)

    agent.cache(state, next_state, action, reward, done)

    q, loss = agent.learn()
    state = next_state

    if done or info['ale.lives'] == 0:
      break
  
  rewards.append(reward)

RuntimeError: ignored

In [16]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [17]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

In [18]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f72f0db74d0>

In [19]:
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [20]:
env = wrap_env(env)
state = env.reset()
done = False
total_reward = 0
torch.cuda.empty_cache()

In [28]:
torch.cuda.empty_cache()

In [29]:
! nvidia-smi

Wed Jul 21 14:01:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    31W /  70W |  15106MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [33]:
agent.gpu = False

In [34]:
while not done:
    env.render()
    agent.net.eval()
    action = agent.act(state)
    state, reward, done, _ = env.step(action)
    total_reward += reward
env.close()
print(total_reward)
show_video()

5.0
