<a href="https://colab.research.google.com/github/tianhaoz95/ultron/blob/dev%2Ftry-atari-game/notebooks/prototype_atari_with_actor_critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg x11-utils > /dev/null 2>&1

In [69]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (46.1.3)


In [70]:
import gym
import threading
import math
import glob
import io
import base64
import matplotlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
from gym.wrappers import Monitor
from tensorflow import keras
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1021'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1021'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [0]:
def show_video():
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    video = io.open(mp4, 'r+b').read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")

def wrap_env(env):
  env = Monitor(env, './video', force=True)
  return env

In [0]:
class PipelineArgs():
  def __init__(self,
               game_name,
               mode,
               lr):
    self.game_name = game_name
    self.mode = mode
    self.lr = lr

In [0]:
class Worker(threading.Thread):
  # The cumulative episode played
  global_eps = 0
  best_score = 0
  lock = threading.Lock()

  def __init__(self,
               args,
               game_name):
    super(Worker, self).__init__()
    self.args = args
    self.game_name = game_name
    self.env = gym.make(self.game_name)

  def compute_loss(self, done, new_state, memory):
    if done:
      reward_sum = 0
    else:
      reward_sum = self.local_model(
          tf.convert_to_tensor(
              new_state[None, :],
              dtype=tf.float32))[-1].numpy()[0]
    discounted_rewards = []
    for reward in memory.rewards[::-1]:
      reward_sum = reward + self.args.gamma * reward_sum
      discounted_rewards.append(reward_sum)
    discounted_rewards.reverse()
    logits, values = self.local_model(
        tf.convert_to_tensor(np.vstack(memory.states), dtype=tf.float32))
    advantage = tf.convert_to_tensor(
        np.array(discounted_rewards)[:, None], dtype=tf.float32) - values
    # Calculate the loss for value function which mean how off is our
    # predicted value from the true value estimated from the discounted
    # reward.
    value_loss = advantage ** 2
    # Calculate the policy loss
    policy = tf.nn.softmax(logits)
    entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=policy, logits=logits)
    policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=memory.actions, logits=logits)
    policy_loss *= tf.stop_gradient(advantage)
    policy_loss -= 0.01 * entropy
    # Combine the value and policy loss to be a single trainable
    total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
    return total_loss

  def run(self, args):
    # Prepare variables
    total_steps = 1
    eps_steps = 0
    # Check the maximum episode of learning is reached
    while Worker.global_eps < self.args.max_eps:
      current_state = self.env.reset()
      done = False
      time_count = 0
      # Check if the game is over
      while not done:
        # Get the policy and play the game
        logits, _ = self.local_model(
            tf.convert_to_tensor(current_state[None, 1]), dtype=tf.float32)
        probs = tf.nn.softmax(logits)
        action = np.random.choice(self.action_size, p=probs.numpy()[0])
        new_state, reward, done, _ = self.env.step(action)
        # If the game is over, give it a negative reward
        if done:
          reward = -1
        # If the explore time limit has been reached or
        # the game is over, then update the models
        if time_count >= self.args.update_freq or done:
          with tf.GradientTape() as tape:
            total_loss = self.compute_loss()
          self.eps_loss += total_loss
          grads = tape.gradient(
              total_loss, self.local_model.trainable_weights)
          self.opt.apply_gradients(zip(
              grads, self.global_model.trainable_weights))
          self.local_model.set_weights(self.global_model.get_weights())
          time_count = 0
          if done and ep_reward > Worker.best_score:
            with Worker.lock:
              print('saving best model so far')
        else:
          eps_steps += 1
          time_count += 1
          current_state = new_state
          total_steps += 1

In [0]:
class ActorCriticModel(keras.Model):
  def __init__(self, state_size, action_size):
    super(ActorCriticModel, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    self.policy_dense = keras.layers.Dense(100)
    self.policy_logits = keras.layers.Dense(self.action_size, activation='relu')
    self.value_dense = keras.layers.Dense(100, activation='relu')
    self.value = keras.layers.Dense(1)
  
  def call(self, inputs):
    p = self.policy_dense(inputs)
    logits = self.policy_logits(p)
    v = self.value_dense(inputs)
    values = self.value(v)

In [0]:
class BaselineModel():
  def __init__(self):
    print('not implemented')

In [0]:
class MasterAgent():
  def __init__(self, args):
    self.args = args
    self.game_name = args.game_name
    env = gym.make(self.game_name)
    self.action_size = env.observation_space.shape[0]
    self.state_size = env.action_space.n
    self.opt = tf.optimizers.Adam(self.args.lr)
    self.global_model = ActorCriticModel(self.state_size, self.action_size)
  
  def train(self):
    print('not implemented')
  
  def play(self):
    env = wrap_env(gym.make(self.game_name))
    state = env.reset()
    model = self.global_model
    done = False
    try:
      while not done:
        env.render()
        logit, _ = model(
            tf.convert_to_tensor(state[None, :], dtype=tf.float32))
        policy = tf.nn.softmax(logit)
        action = np.argmax(policy)
        state, reward, done, _ = env.step(action)
    except KeyboardInterrupt:
      print("Received Keyboard Interrupt. Shutting down.")
    finally:
      env.close()
      show_video()

In [0]:
def gym_sanity_check():
  print('starting gym environment sanity check')
  env = wrap_env(gym.make('Breakout-ram-v0'))
  inital_observation = observation = env.reset()
  print('observation size: ', len(inital_observation))
  print('sample observation: ', inital_observation)
  for _ in range(20):
      observation = env.reset()
      done = False
      while not done:
          env.render()
          action = env.action_space.sample()
          observation, _, done, _ = env.step(action)
  env.close()
  show_video()

In [0]:
def entry_point(args):
  if args.mode == 'sanity':
    gym_sanity_check()
  if args.mode == 'play':
    master = MasterAgent(args)
    master.play()
  print('Hello World')

In [80]:
entry_point(
    PipelineArgs(
        game_name='Breakout-ram-v0', 
        mode='play',
        lr=0.01
    )
)

TypeError: ignored