<a href="https://colab.research.google.com/github/zzmtsvv/RL-with-gym/blob/main/carracingv0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install box2d-py
!pip3 install gym[Box_2D]
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1



In [2]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.distributions import Beta
from torch import optim
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
import gym

In [3]:
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay

In [4]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<pyvirtualdisplay.display.Display at 0x7f140a0fead0>

In [5]:
seed = 421
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.manual_seed(seed)
if torch.cuda.is_available():
  torch.cuda.manual_seed(seed)

transition = np.dtype([('s', np.float64, (4, 96, 96)), ('a', np.float64, (3,)),
                       ('a_logp', np.float64), ('r', np.float64),
                       ('s_', np.float64, (4, 96, 96))])

In [6]:
class Env(gym.Wrapper):
  def __init__(self, env):
    super(Env, self).__init__(env)
    self.env.seed(seed)
    self.reward_threshold = self.env.spec.reward_threshold
  
  def reset(self):
    self.counter = 0
    self.avrg_rew = self.reward_memory()

    self.dead = False

    img_rgb = self.env.reset()
    img_gray = self.rgb2gray(img_rgb)
    self.stack = [img_gray] * 4 # frames for decision
    return np.array(self.stack)
  
  def step(self, action):
    total_reward = 0
    for _ in range(8):
      # repeat action in 8 frames
      img_rgb, reward, dead, _ = self.env.step(action)
      if dead:
        reward += 100
      
      if np.mean(img_rgb[:, :, 1]) > 185.0:
        reward -= 0.05
      total_reward += reward
      done = True if self.avrg_rew(reward) <= -0.1 else False
      if done or dead:
        break
    img_gray = self.rgb2gray(img_rgb)
    self.stack.pop(0)
    self.stack.append(img_gray)
    assert len(self.stack) == 4
    return np.array(self.stack), total_reward, done, dead
  
  def render(self):
    self.env.render()
  
  @staticmethod
  def rgb2gray(rgb, norm=True):
    gray = np.dot(rgb[..., :], [0.299, 0.587, 0.114])
    if norm:
      gray = gray / 128. - 1.
    return gray
  
  @staticmethod
  def reward_memory():
    count = 0
    length = 100
    history = np.zeros(length)

    def memory(reward):
      nonlocal count
      history[count] = reward
      count = (count + 1) % length
      return np.mean(history)
    return memory

In [7]:
class ArseNet(nn.Module):
    """
    Actor-Critic Network for PPO
    """
    def __init__(self):
      super(ArseNet, self).__init__()
      self.cnn_base = nn.Sequential(
          nn.Conv2d(4, 8, kernel_size=4, stride=2),  # (4, 96, 96)
          nn.ReLU(),  # activation
          nn.Conv2d(8, 16, kernel_size=3, stride=2),  # (8, 47, 47)
          nn.ReLU(),  # activation
          nn.Conv2d(16, 32, kernel_size=3, stride=2),  # (16, 23, 23)
          nn.ReLU(),  # activation
          nn.Conv2d(32, 64, kernel_size=3, stride=2),  # (32, 11, 11)
          nn.ReLU(),  # activation
          nn.Conv2d(64, 128, kernel_size=3, stride=1),  # (64, 5, 5)
          nn.ReLU(),  # activation
          nn.Conv2d(128, 256, kernel_size=3, stride=1),  # (128, 3, 3)
          nn.ReLU(),  # activation
      )  # output shape (256, 1, 1)
      self.v = nn.Sequential(nn.Linear(256, 100), nn.ReLU(), nn.Linear(100, 1))
      self.fc = nn.Sequential(nn.Linear(256, 100), nn.ReLU())
      self.alpha_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus())
      self.beta_head = nn.Sequential(nn.Linear(100, 3), nn.Softplus())
      self.apply(self._weights_init)

    @staticmethod
    def _weights_init(w):
      if isinstance(w, nn.Conv2d):
        nn.init.xavier_uniform_(w.weight, gain=nn.init.calculate_gain('relu'))
        nn.init.constant_(w.bias, 0.1)

    def forward(self, x):
      x = self.cnn_base(x)
      x = x.view(-1, 256)
      v = self.v(x)
      x = self.fc(x)
      alpha = self.alpha_head(x) + 1
      beta = self.beta_head(x) + 1
      return (alpha, beta), v

In [8]:
class Agent:
  def __init__(self):
    self.max_grad_norm = 1 / 2
    self.clip_param = .1
    self.epoch = 10
    self.capacity, self.batch_size = 2000, 128

    self.training_step = 0
    self.net = ArseNet().float().to(device)
    self.buffer = np.empty(self.capacity, dtype=transition)
    self.counter = 0

    self.optimizer = optim.Adam(self.net.parameters())
  
  def select_action(self, state):
    state = torch.from_numpy(state).float().to(device).unsqueeze(0)
    with torch.no_grad():
      alpha, beta = self.net(state)[0]
    dist = Beta(alpha, beta)
    action = dist.sample()
    a_logp = dist.log_prob(action).sum(dim=1)

    action = action.squeeze().cpu().numpy()
    return action, a_logp.item()
  
  def save_param(self):
    torch.save(self.net.state_dict(), 'param/ppo_carracing_net_params.pkl')
  
  def store(self, transition):
    self.buffer[self.counter] = transition
    self.counter += 1
    if self.counter == self.capacity:
      self.counter = 0
      return True
    return False
  
  def update(self):
    self.training_step += 1
    
    s = torch.tensor(self.buffer['s'], dtype=torch.float32).to(device)
    a = torch.tensor(self.buffer['a'], dtype=torch.float32).to(device)
    r = torch.tensor(self.buffer['r'], dtype=torch.float32).to(device).view(-1, 1)
    s_ = torch.tensor(self.buffer['s_'], dtype=torch.float32).to(device)

    old_a_logp = torch.tensor(self.buffer['a_logp'], dtype=torch.float32).to(device).view(-1, 1)

    with torch.no_grad():
      target_v = r + 0.99 * self.net(s_)[1]
      diff = target_v - self.net(s)[1]

    for _ in range(self.epoch):
      for index in BatchSampler(SubsetRandomSampler(range(self.capacity)), self.batch_size, False):
        alpha, beta = self.net(s[index])[0]
        dist = Beta(alpha, beta)
        a_logp = dist.log_prob(a[index]).sum(dim=1, keepdim=True)
        ratio = torch.exp(a_logp - old_a_logp[index])

        surr1 = ratio * adv[index]
        surr2 = torch.clamp(ratio, 1.0 - self.clip_param, 1.0 + self.clip_param) * diff[index]
        action_loss = -torch.min(surr1, surr2).mean()
        value_loss = F.smooth_l1_loss(self.net(s[index])[1], target_v[index])
        loss = action_loss + 2. * value_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

  def load_param(self):
    self.net.load_state_dict(torch.load('/content/ppo_carracing_net_params.pkl'))

In [9]:
agent = Agent()
env = Env(gym.make('CarRacing-v0'))
agent.load_param()



In [None]:
training_records, running_score = [], 0
state = env.reset()
episodes = range(1000)

for episode in episodes:
  score = 0
  state = env.reset()

  for t in range(1000):
    action, a_logp = agent.select_action(state)
    state_, reward, done, dead = env.step(action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))

    if agent.store((state, action, a_logp, reward, state_)):
      print('updating...')
      agent.update()
    score += reward
    state = state_
    if done or dead:
      break
    running_score = running_score * 0.99 + score * 0.01
  training_records.append(score)
  
  if not episode % 10:
    print('Ep {}\tLast score: {:.2f}\tMoving average score: {:.2f}'.format(episode, score, running_score))
    # agent.save_param()
  
  if running_score > env.reward_threshold:
    print(f'solved - {running_score}, {score}')
    break

Track generation: 1223..1533 -> 310-tiles track
Track generation: 1223..1533 -> 310-tiles track
Ep 0	Last score: -17.98	Moving average score: -1.87
Track generation: 1163..1458 -> 295-tiles track
Track generation: 1076..1349 -> 273-tiles track
Track generation: 1160..1454 -> 294-tiles track
Track generation: 1196..1499 -> 303-tiles track
Track generation: 963..1208 -> 245-tiles track
Track generation: 1143..1433 -> 290-tiles track
Track generation: 1074..1346 -> 272-tiles track
Track generation: 1228..1539 -> 311-tiles track
Track generation: 1002..1261 -> 259-tiles track
Track generation: 1124..1409 -> 285-tiles track
Ep 10	Last score: -17.96	Moving average score: -0.12
Track generation: 1217..1525 -> 308-tiles track
Track generation: 1028..1289 -> 261-tiles track


In [11]:
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  env = Monitor(env.env, './video', force=True)
  return env

In [12]:
env = wrap_env(env)

In [None]:
state = env.env.reset()
total_reward = 0
done = False
dead = False

for _ in range(1000):
  env.env.render()
  action, _ = agent.select_action(state)

  state, reward, done, dead = env.env.step(action * np.array([2., 1., 1.]) + np.array([-1., 0., 0.]))
  total_reward += reward

show_video()
env.env.close()
print(total_reward)