## Setup

In [6]:
!apt-get install xvfb
!pip install 'gym==0.10.11'
!pip install 'imageio==2.4.0'
!pip install PILLOW
!pip install 'pyglet==1.3.2'
!pip install pyvirtualdisplay
!pip install tf-agents-nightly
try:
  %%tensorflow_version 2.x
except:
  pass

from __future__ import absolute_import, division, print_function

import base64
import imageio
import IPython
import matplotlib
import matplotlib.pyplot as plt
import PIL.Image
import pyvirtualdisplay

import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# Ask TensorFlow for 2.x behavior through the v1 compatibility API.
tf.compat.v1.enable_v2_behavior()

# Set up a virtual display for rendering OpenAI gym environments.
# The Display is created hidden (visible=0) at 1400x900 and started immediately.
# NOTE(review): binding the name `display` may shadow IPython's built-in
# display() helper in this kernel - confirm no later cell relies on it.
display = pyvirtualdisplay.Display(visible=0, size=(1400, 900)).start()

# Last expression of the cell: echoes the TensorFlow version string in use.
tf.version.VERSION

Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.3).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.


xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


TensorFlow is already loaded. Please restart the runtime to change versions.


## Environment

In [0]:
# Gym environment id; reused below to build the training and evaluation envs.
env_name = "Taxi-v2"
# Load the gym environment through TF-Agents' suite_gym loader; this instance
# is only used for the exploratory cells that follow.
env = suite_gym.load(env_name)

In [11]:
# Start a fresh episode so there is a state to draw.
env.reset()
# render() prints the taxi grid to stdout and returns the output stream object.
# The trailing semicolon stops IPython from echoing that return value
# (the stray `<ipykernel.iostream.OutStream ...>` line) under the grid.
env.render();

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[35mY[0m| : |B: |
+---------+



<ipykernel.iostream.OutStream at 0x7fb41ed22c18>

In [15]:
# Print the specs describing what the environment emits (observation, reward)
# and what it accepts (action), each under its own heading.
env_time_step_spec = env.time_step_spec()
for spec_heading, spec_value in (
    ("Observation Spec:", env_time_step_spec.observation),
    ("Reward Spec:", env_time_step_spec.reward),
    ("Action Spec:", env.action_spec()),
):
  print(spec_heading)
  print(spec_value)

Observation Spec:
BoundedArraySpec(shape=(), dtype=dtype('int64'), name='observation', minimum=0, maximum=499)
Reward Spec:
ArraySpec(shape=(), dtype=dtype('float32'), name='reward')
Action Spec:
BoundedArraySpec(shape=(), dtype=dtype('int64'), name='action', minimum=0, maximum=5)


In [17]:
# reset() starts a new episode and returns the first TimeStep. As the printed
# output shows, a TimeStep bundles step_type, reward, discount and observation.
time_step = env.reset()
print("Time step:")
print(time_step)

# Any integer within the action spec bounds (0..5) is a valid action.
# NOTE(review): action 1 appears to move the taxi up one row (observation
# drops by 100 in the recorded output) - confirm against the Taxi docs.
action = 1

# step() applies the action and returns the resulting TimeStep.
next_time_step = env.step(action)
print("Next time step:")
print(next_time_step)

Time step:
TimeStep(step_type=array(0, dtype=int32), reward=array(0., dtype=float32), discount=array(1., dtype=float32), observation=array(167))
Next time step:
TimeStep(step_type=array(1, dtype=int32), reward=array(-1., dtype=float32), discount=array(1., dtype=float32), observation=array(67))


In [0]:
# Two independent Python environments built from the same gym id.
# The names indicate one is for training and one for evaluation; their use
# is outside this view.
train_py_env = suite_gym.load(env_name)
eval_py_env = suite_gym.load(env_name)

In [0]:
# Wrap each Python environment in a TFPyEnvironment. The specs of train_env
# are what the Q-network and the agent are built from in the cells below.
train_env = tf_py_environment.TFPyEnvironment(train_py_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

## Model

In [0]:
# hyperparameters
num_iterations = 20000
initial_collect_steps = 1000
collect_steps_per_iteration = 1
replay_buffer_max_length = 100000
batch_size = 64
learning_rate = 1e-3
log_interval = 200
num_eval_episodes = 10
eval_interval = 1000

In [0]:
# QNetwork is a neural network model that can learn to predict QValues for all actions, given an observation from the environment.
fc_layer_params = (100,)

q_net = q_network.QNetwork(
    # state space
    train_env.observation_spec(),
    # actiona space
    train_env.action_spec(),
    fc_layer_params = fc_layer_params
)

In [0]:
# learning_rate is set at hyperparameter section
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate = learning_rate)

# What is this?
train_step_counter = tf.Variable(0)

# instantiate agent, dqn_agent is imported from tf_agents.agents.dqn
agent = dqn_agent.DqnAgent(
    # what is this?
    train_env.time_step_spec(),
    # action space
    train_env.action_spec(),
    q_network = q_net,
    optimizer = optimizer,
    # loss function
    td_errors_loss_fn = common.element_wise_squared_loss,
    # what is this?
    train_step_counter = train_step_counter
)

agent.initialize()

## Policy

In [0]:
# Main policy of the agent, intended for evaluation and deployment.
# NOTE(review): presumably the greedy policy over the learned Q-values -
# confirm in the DqnAgent docs.
eval_policy = agent.policy

# Policy the agent exposes for data collection.
# NOTE(review): presumably an exploratory (epsilon-greedy) variant of the
# main policy - confirm in the DqnAgent docs.
collect_policy = agent.collect_policy