<a href="https://colab.research.google.com/github/tianhaoz95/ultron/blob/dev%2Ftry-atari-game/notebooks/prototype_atari_with_actor_critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the Python packages and system packages used by this notebook.
# All output (stdout and stderr) is discarded, so a failed install is silent.
!pip install gym pyvirtualdisplay tqdm > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg x11-utils > /dev/null 2>&1

In [57]:
# Install the remaining dependencies (cmake, gym's Atari extra, GPUtil and a
# pinned pyglet). Every command discards its output except the setuptools
# upgrade, which only merges stderr into stdout and therefore prints below.
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install gputil > /dev/null 2>&1
!pip install pyglet==1.2.4 > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (46.1.3)


In [58]:
import tensorflow as tf
import GPUtil
# Report the TensorFlow version, then probe for an accelerator: try to attach
# to a TPU first, fall back to counting GPUs with GPUtil, and finally report
# CPU if the GPU query itself fails.
print("Tensorflow version " + tf.__version__)
try:
  tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
  print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
  tf.config.experimental_connect_to_cluster(tpu)
  tf.tpu.experimental.initialize_tpu_system(tpu)
  tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
  # A ValueError from the TPU setup above is treated as "no TPU attached".
  print('ERROR: Not connected to a TPU runtime!')
  try:
    GPUs = GPUtil.getGPUs()
    print('GPU count: ' + str(len(GPUs)))
  except Exception:
    # Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit are not swallowed by the fallback.
    print('Using CPU')

Tensorflow version 2.2.0-rc4
ERROR: Not connected to a TPU runtime!
GPU count: 1


In [82]:
import gym
import threading
import math
import glob
import io
import base64
import multiprocessing
import matplotlib
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline
from gym.wrappers import Monitor
from tensorflow import keras
from os import path
from time import sleep
from tqdm.notebook import tqdm
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Create and start an invisible 1400x900 virtual display via pyvirtualdisplay.
# NOTE(review): presumably needed so the gym Monitor can render frames on a
# headless machine - confirm. The name `display` is taken by this object,
# which is why IPython's display module is imported as `ipythondisplay`.
display = Display(visible=0, size=(1400, 900))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1011'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1011'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

In [83]:
# Print the version of the TensorFlow module currently bound to `tf`.
print('using tensorflow', tf.__version__)

using tensorflow 2.2.0-rc4


In [0]:
def show_video():
  """Embed a recorded episode video in the notebook output.

  Searches for ``video/*.mp4`` relative to the current working directory.
  When at least one file matches, the first match in sorted order is
  base64-encoded and rendered inline as a looping HTML ``<video>`` element.
  When nothing matches, prints ``Could not find video`` instead.
  """
  # glob returns matches in arbitrary order; sorting makes the pick
  # deterministic across runs.
  mp4list = sorted(glob.glob('video/*.mp4'))
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode is all that is needed, and the context manager
    # closes the handle instead of leaking it for the life of the kernel.
    with open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else:
    print("Could not find video")

def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` writing to './video'.

  The Monitor is created with `force=True`.
  """
  return Monitor(env, './video', force=True)

In [0]:
class PipelineArgs:
  """Plain settings container passed around the pipeline.

  Every constructor argument is stored unchanged as an attribute of the same
  name:

    game_name:   environment id handed to `gym.make`.
    mode:        selects the branch run by `entry_point`.
    max_eps:     number of episodes after which workers stop.
    update_freq: step count that triggers a model update within an episode.
    gamma:       discount factor used when accumulating rewards.
    model_path:  directory where weight files are read and written.
    lr:          learning rate given to the optimizer.
  """

  def __init__(self, game_name='Acrobot-v1', mode='train', max_eps=100,
               update_freq=20, gamma=0.99, model_path='.', lr=0.001):
    # What to run.
    self.game_name, self.mode = game_name, mode
    # How to learn.
    self.gamma, self.update_freq, self.lr = gamma, update_freq, lr
    # How long to run and where to persist weights.
    self.max_eps, self.model_path = max_eps, model_path

In [0]:
class Recorder:
  """Keeps per-episode rewards and step counts in two parallel lists."""

  def __init__(self):
    self.eps_rewards, self.eps_steps = [], []

  def record(self, eps_reward, eps_step):
    """Append one episode's reward and step count to the histories."""
    histories = (self.eps_rewards, self.eps_steps)
    for history, value in zip(histories, (eps_reward, eps_step)):
      history.append(value)

In [0]:
class Memory:
  """Rollout buffer holding states, actions and rewards in parallel lists."""

  def __init__(self):
    self.clear()

  def store(self, state, action, reward):
    """Append one transition's state, action and reward to the buffers."""
    buffers = (self.states, self.actions, self.rewards)
    for buffer, item in zip(buffers, (state, action, reward)):
      buffer.append(item)

  def clear(self):
    """Rebind all three buffers to fresh empty lists."""
    self.states, self.actions, self.rewards = [], [], []

In [0]:
class Worker(threading.Thread):
  """Actor-critic worker: plays episodes and pushes gradients to a shared model.

  Each worker owns its own gym environment and a local `ActorCriticModel`.
  Gradients are computed on the local model, applied to the shared global
  model through the shared optimizer, and the local model is then refreshed
  from the global weights.

  Class attributes (shared by every worker):
    global_eps: cumulative number of episodes finished.
    best_score: best episode reward seen so far.
    lock:       guards the end-of-episode bookkeeping and weight saving.
  """
  # The cumulative episode played
  global_eps = 0
  best_score = 0
  lock = threading.Lock()

  def __init__(self,
               args,
               action_size,
               state_size,
               global_model,
               opt,
               pbar):
    """Create the worker's environment and local model.

    Args:
      args: settings object; `game_name`, `gamma`, `update_freq`, `max_eps`
        and `model_path` are read from it.
      action_size: number of discrete actions.
      state_size: length of the observation vector.
      global_model: shared model that receives the gradient updates.
      opt: optimizer used to apply gradients to `global_model`.
      pbar: progress bar advanced once per finished episode.
    """
    super(Worker, self).__init__()
    self.args = args
    self.game_name = self.args.game_name
    self.env = gym.make(self.game_name)
    self.action_size = action_size
    self.state_size = state_size
    self.local_model = ActorCriticModel(self.state_size, self.action_size)
    self.eps_loss = 0
    self.opt = opt
    self.global_model = global_model
    self.pbar = pbar
    # Build the local network and start it from the shared weights. Without
    # this the first rollout (and the first gradient applied to the global
    # model) would come from unrelated random weights, ignoring any checkpoint
    # loaded into the global model.
    self.local_model(tf.zeros((1, self.state_size), dtype=tf.float32))
    global_weights = self.global_model.get_weights()
    if global_weights:  # empty when the global model has not been built yet
      self.local_model.set_weights(global_weights)

  def compute_loss(self, done, new_state, memory):
    """Compute the combined actor-critic loss for the buffered transitions.

    Args:
      done: True when the episode ended at `new_state`; the bootstrap value is
        then 0, otherwise it is the local critic's estimate for `new_state`.
      new_state: observation that followed the last buffered transition.
      memory: `Memory` holding the states, actions and rewards to learn from.

    Returns:
      Scalar tensor: mean over the buffered steps of
      `0.5 * value_loss + policy_loss`.
    """
    if done:
      reward_sum = 0.0
    else:
      _, bootstrap_value = self.local_model(
          tf.convert_to_tensor(
              new_state[None, :],
              dtype=tf.float32))
      # The critic output has shape (1, 1); take a plain scalar so the
      # discounted returns below stay one number per step.
      reward_sum = float(bootstrap_value.numpy()[0, 0])
    discounted_rewards = []
    for reward in memory.rewards[::-1]:
      reward_sum = reward + self.args.gamma * reward_sum
      discounted_rewards.append(reward_sum)
    discounted_rewards.reverse()
    past_states = np.vstack(memory.states)
    logits, values = self.local_model(
        tf.convert_to_tensor(past_states, dtype=tf.float32))
    # Keep every per-step quantity at shape (T,). The critic returns (T, 1);
    # mixing (T, 1) with the (T,) cross-entropy terms below would silently
    # broadcast to (T, T) and pair every step with every other step.
    returns = tf.convert_to_tensor(discounted_rewards, dtype=tf.float32)
    advantage = returns - tf.squeeze(values, axis=1)
    # Value loss: squared error between the discounted return and the
    # critic's prediction.
    value_loss = advantage ** 2
    # Policy loss: cross-entropy of the taken actions weighted by the
    # advantage (treated as a constant), minus a small entropy term.
    policy = tf.nn.softmax(logits)
    entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=policy, logits=logits)
    policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=memory.actions, logits=logits)
    policy_loss *= tf.stop_gradient(advantage)
    policy_loss -= 0.01 * entropy
    # Combine the value and policy losses into a single trainable scalar.
    total_loss = tf.reduce_mean((0.5 * value_loss + policy_loss))
    return total_loss

  def compute_and_apply_gradient(self, done, new_state, mem):
    """Run one update: local gradients -> global weights -> local refresh."""
    with tf.GradientTape() as tape:
      total_loss = self.compute_loss(done, new_state, mem)
    self.eps_loss += total_loss
    grads = tape.gradient(
        total_loss, self.local_model.trainable_weights)
    # Gradients are taken w.r.t. the local weights but applied to the global
    # model; the local model then copies the updated global weights.
    self.opt.apply_gradients(zip(
        grads, self.global_model.trainable_weights))
    self.local_model.set_weights(self.global_model.get_weights())

  def run(self):
    """Play episodes until `Worker.global_eps` reaches `args.max_eps`.

    Within an episode the model is updated whenever the step counter reaches
    `args.update_freq` or the episode ends. At the end of each episode the
    global weights are saved ('best.h5' on a new best score, otherwise
    'backup.h5') and the progress bar is advanced.
    """
    mem = Memory()
    # Check the maximum episode of learning is reached
    while Worker.global_eps < self.args.max_eps:
      current_state = self.env.reset()
      mem.clear()
      done = False
      ep_reward = 0
      self.eps_loss = 0
      time_count = 0
      ep_steps = 0
      # Check if the game is over
      while not done:
        # Sample an action from the current policy and play it
        logits, _ = self.local_model(
            tf.convert_to_tensor(current_state[None, :], dtype=tf.float32))
        probs = tf.nn.softmax(logits)
        action = np.random.choice(self.action_size, p=probs.numpy()[0])
        new_state, reward, done, _ = self.env.step(action)
        if done:
          # The terminal step's reward is replaced by a fixed -1.
          reward = -1
        ep_reward += reward
        mem.store(current_state, action, reward)
        # If the explore time limit has been reached or
        # the game is over, then update the models
        if time_count >= self.args.update_freq or done:
          self.compute_and_apply_gradient(done, new_state, mem)
          mem.clear()
          time_count = 0
          if done:
            with Worker.lock:
              # The first finished episode seeds the best score.
              if Worker.global_eps == 0:
                Worker.best_score = ep_reward
              if ep_reward > Worker.best_score:
                print('save best model so far')
                self.global_model.save_weights(path.join(self.args.model_path, 'best.h5'))
                Worker.best_score = ep_reward
              else:
                self.global_model.save_weights(path.join(self.args.model_path, 'backup.h5'))
              Worker.global_eps += 1
              self.pbar.set_description(f'step: {ep_steps}, reward {ep_reward}/{Worker.best_score}')
              self.pbar.update(1)
        ep_steps += 1
        time_count += 1
        current_state = new_state

In [0]:
class ActorCriticModel(keras.Model):
  """Two-headed network producing policy logits and a state-value estimate.

  The policy head and the value head each have their own 100-unit hidden
  layer; both heads read the same input.
  """

  def __init__(self, state_size, action_size):
    """Create the layers.

    Args:
      state_size: length of the observation vector (stored, not used to
        build the layers).
      action_size: number of discrete actions; width of the logits output.
    """
    super(ActorCriticModel, self).__init__()
    self.state_size = state_size
    self.action_size = action_size
    # The non-linearity belongs on the hidden layer. The logits layer must
    # stay linear: a ReLU there clamps every logit to >= 0 and yields zero
    # gradient for the policy head whenever the pre-activations are negative.
    self.policy_dense = keras.layers.Dense(100, activation='relu')
    self.policy_logits = keras.layers.Dense(self.action_size)
    self.value_dense = keras.layers.Dense(100, activation='relu')
    self.value = keras.layers.Dense(1)

  def call(self, inputs):
    """Return `(logits, values)` for a batch of observations."""
    p = self.policy_dense(inputs)
    logits = self.policy_logits(p)
    v = self.value_dense(inputs)
    values = self.value(v)
    return logits, values

In [0]:
class BaselineModel:
  """Placeholder baseline; constructing it only prints a notice."""

  def __init__(self):
    notice = 'not implemented'
    print(notice)

In [0]:
class MasterAgent():
  """Owns the global model and optimizer; drives training and playback."""

  def __init__(self, args):
    """Build the global model for `args.game_name`.

    Reads the action count and observation length from a temporary
    environment, builds the global `ActorCriticModel`, loads 'best.h5' from
    `args.model_path` when that file exists, and creates a progress bar with
    `args.max_eps` steps.
    """
    self.args = args
    self.game_name = self.args.game_name
    env = gym.make(self.game_name)
    try:
      self.action_size = env.action_space.n
      self.state_size = env.observation_space.shape[0]
    finally:
      # This environment is only needed to read the space sizes.
      env.close()
    self.opt = tf.optimizers.Adam(self.args.lr)
    print('state_size: ', self.state_size)
    print('action_size: ', self.action_size)
    self.global_model = ActorCriticModel(self.state_size, self.action_size)
    # Call the model once on a dummy input so its weights exist before any
    # weights are loaded into it.
    random_input = np.random.random((1, self.state_size))
    dummy = tf.convert_to_tensor(random_input, dtype=tf.float32)
    self.global_model(dummy)
    if path.exists(path.join(self.args.model_path, 'best.h5')):
      self.global_model.load_weights(
          path.join(self.args.model_path, 'best.h5'))
    self.pbar = tqdm(total=self.args.max_eps)

  def train_sync(self):
    """Train with a single worker executed in the calling thread."""
    # Reset the shared episode counter, as train_async does; otherwise a
    # second training run in the same kernel would stop immediately.
    Worker.global_eps = 0
    worker = Worker(self.args, self.action_size, self.state_size,
                    self.global_model, self.opt, self.pbar)
    worker.run()
    self.pbar.close()

  def train_async(self):
    """Train with one worker thread per CPU and wait for all of them."""
    Worker.global_eps = 0
    workers = [
      Worker(self.args, self.action_size, self.state_size,
             self.global_model, self.opt, self.pbar)
      for _ in range(multiprocessing.cpu_count())
    ]
    for i, worker in enumerate(workers):
      print("Starting worker {}".format(i))
      worker.start()
    for worker in workers:
      worker.join()
    self.pbar.close()

  def play(self):
    """Play one greedy episode in a recording environment and show the video.

    Weights are reloaded from 'best.h5' under `args.model_path` when that file
    exists. At every step the action with the highest policy probability is
    taken.
    """
    env = wrap_env(gym.make(self.args.game_name))
    try:
      state = env.reset()
      model = self.global_model
      if path.exists(path.join(self.args.model_path, 'best.h5')):
        self.global_model.load_weights(
            path.join(self.args.model_path, 'best.h5'))
      done = False
      while not done:
        state_input = tf.convert_to_tensor(state[None, :], dtype=tf.float32)
        logit, _ = model(state_input)
        policy = tf.nn.softmax(logit)
        action = np.argmax(policy)
        state, reward, done, _ = env.step(action)
    finally:
      # Always close the recording environment, even if a step fails.
      env.close()
    show_video()

In [0]:
def gym_sanity_check(args):
  """Smoke-test the environment named by `args.game_name`.

  Prints the size and contents of the first observation and one sampled
  action, then plays 20 episodes with randomly sampled actions in a
  recording environment, closes it and shows the recorded video.
  """
  print('starting gym environment sanity check')
  env = wrap_env(gym.make(args.game_name))
  first_observation = env.reset()
  sampled_action = env.action_space.sample()
  print('observation size: ', len(first_observation))
  print('sample observation: ', first_observation)
  print('sample action: ', sampled_action)
  episodes_left = 20
  while episodes_left > 0:
    env.reset()
    finished = False
    while not finished:
      # Only the `done` flag of the step result is needed here.
      _, _, finished, _ = env.step(env.action_space.sample())
    episodes_left -= 1
  env.close()
  show_video()

In [0]:
def entry_point(args):
  """Run the pipeline branch selected by `args.mode`.

  Supported modes:
    'sanity'      - random-action environment check.
    'play'        - play one episode with the current model.
    'train_sync'  - train with a single worker in the calling thread.
    'train_async' - train with one worker thread per CPU.

  Any other value runs nothing and prints a message naming the valid modes,
  so a mistyped or unsupported mode is no longer a silent no-op.
  """
  if args.mode == 'sanity':
    gym_sanity_check(args)
  elif args.mode == 'play':
    master = MasterAgent(args)
    master.play()
  elif args.mode == 'train_sync':
    master = MasterAgent(args)
    master.train_sync()
  elif args.mode == 'train_async':
    master = MasterAgent(args)
    master.train_async()
  else:
    print(f"Unknown mode {args.mode!r}; expected one of: "
          "sanity, play, train_sync, train_async")
  print('Hello World')

In [111]:
# Play one episode of Assault (RAM observations) with the current model.
# In 'play' mode no Worker is created, so update_freq is not read; max_eps
# only sets the total of the progress bar built by MasterAgent.
entry_point(
    PipelineArgs(
        game_name='Assault-ram-v0',
        update_freq=20,
        max_eps=1200,
        mode='play'))

state_size:  128
action_size:  7


HBox(children=(IntProgress(value=0, max=1200), HTML(value='')))

Hello World
