In [9]:
import tensorflow as tf
tf.enable_eager_execution()

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import gym

from collections import deque

In [10]:
ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)

In [11]:
n_obs_params = env.observation_space.shape[0]
n_acts = env.action_space.n

n_obs_params, n_acts

(4, 2)

In [12]:
net = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(n_obs_params, )),
    tf.keras.layers.Dense(n_acts, activation=tf.nn.softmax)
])

net.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_2 (Dense)              (None, 16)                80        
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________


Let's see the output of our network given an observation from environment

In [15]:
sample_obs = env.reset()
sample_obs = np.expand_dims(sample_obs, axis=0)
sample_obs = np.asarray(sample_obs, dtype=np.float32)

prob_logits = net(sample_obs)[0]
action_chosen = tf.argmax(prob_logits)

print(prob_logits)
print(action_chosen)

tf.Tensor([0.50605947 0.4939405 ], shape=(2,), dtype=float32)
tf.Tensor(0, shape=(), dtype=int64)


In [18]:
def play_episode(net, render=False):
    
    observations = []
    actions = []
    rewards = []
    
    obs = env.reset()
    done = False
    
    while not done:
        if render: env.render()
        
        observations.append(obs)
        obs = np.expand_dims(obs, axis=0)
        obs = np.asarray(obs, dtype=np.float32)
        logits = net(obs)
        act = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)[0].numpy()
        actions.append(act)
        next_obs, reward, done, info = env.step(act)
        rewards.append(reward)
        obs = next_obs
        
    return observations, actions, rewards

In [19]:
play_episode(net)

Instructions for updating:
Use tf.random.categorical instead.


([array([ 0.04902787, -0.01344601, -0.00413251, -0.00987001]),
  array([ 0.04875895, -0.20850845, -0.00432991,  0.28150621]),
  array([ 0.04458878, -0.013325  ,  0.00130022, -0.01253921]),
  array([ 0.04432228, -0.20846558,  0.00104943,  0.28055367]),
  array([ 0.04015296, -0.01335861,  0.00666051, -0.01179808]),
  array([ 0.03988579, -0.20857545,  0.00642454,  0.28297885]),
  array([ 0.03571428, -0.01354572,  0.01208412, -0.0076709 ]),
  array([ 0.03544337, -0.20883887,  0.0119307 ,  0.28880008]),
  array([ 3.12665919e-02, -1.38890649e-02,  1.77067053e-02, -9.62716415e-05]),
  array([ 0.03098881,  0.18097453,  0.01770478, -0.28714034]),
  array([ 0.0346083 ,  0.37583958,  0.01196197, -0.57418717]),
  array([ 0.04212509,  0.18055198,  0.00047823, -0.27776   ]),
  array([ 0.04573613, -0.01457679, -0.00507697,  0.01507372]),
  array([ 0.0454446 , -0.20962556, -0.0047755 ,  0.30615049]),
  array([ 0.04125209, -0.40467914,  0.00134751,  0.59732353]),
  array([ 0.0331585 , -0.20957607,  0.0

In [20]:
def _convert_rewards_to_advantages(rewards):
    for i in range(-2, -len(rewards)-1, -1):
        rewards[i] += rewards[i+1]

In [21]:
test_list = [1, 2, 3, 4]
_convert_rewards_to_advantages(test_list)

assert test_list == [10, 9, 7, 4]

In [22]:
def train(net, optimizer, observations, actions, advantages, learning_rate=1e-3):
    
    observations = np.array(observations, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    advantages = np.array(advantages, dtype=np.float32)
    
    assert len(observations) == len(actions) == len(advantages)
    
    with tf.GradientTape() as t:
        
        probs_actions = net(observations)
        action_masks = tf.one_hot(actions, n_acts)
        
        log_pi = tf.reduce_sum(action_masks * tf.nn.log_softmax(probs_actions), axis=1)
        
        loss = -tf.reduce_mean( log_pi * advantages )
    
    d_loss_d_w = t.gradient(loss, net.trainable_weights)
    optimizer.apply_gradients(zip(d_loss_d_w, net.trainable_weights))
    
    return loss

In [23]:
observations, actions, rewards = play_episode(net)
_convert_rewards_to_advantages(rewards)

In [24]:
train(net, tf.train.AdamOptimizer(learning_rate=1e-2), observations, actions, rewards)

<tf.Tensor: id=1520, shape=(), dtype=float32, numpy=10.734823>

In [25]:
n_episodes = 2000
n_episodes_before_train = 25

episode_rewards = []

observations = []
actions = []
rewards = []

net = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(n_obs_params, )),
    tf.keras.layers.Dense(n_acts)
])

optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)

for episode in range(n_episodes):
    ep_observations, ep_actions, ep_rewards = play_episode(net)
    observations += ep_observations
    actions += ep_actions
    ep_reward = sum(ep_rewards)
    episode_rewards.append(ep_reward)
    _convert_rewards_to_advantages(ep_rewards)
    rewards += ep_rewards
    
    if (episode + 1) % n_episodes_before_train == 0:
        loss = train(net, optimizer, observations, actions, rewards)
        observations = []
        actions = []
        rewards = []
        
        print('Episode {}: Reward = {}'.format(episode, np.mean(episode_rewards[-n_episodes_before_train:])))
        print('Loss = {}'.format(loss))
        
        if len(episode_rewards) > 100 and np.mean(episode_rewards[-100:]) >= 195:
            net.save_weights('net.h5')
            print('Stopping Criteria Achieved')
            break

Episode 24: Reward = 24.16
Loss = 10.904436111450195
Episode 49: Reward = 24.04
Loss = 10.034758567810059
Episode 74: Reward = 23.6
Loss = 10.494437217712402
Episode 99: Reward = 30.52
Loss = 14.820843696594238
Episode 124: Reward = 44.84
Loss = 20.784473419189453
Episode 149: Reward = 27.56
Loss = 11.766745567321777
Episode 174: Reward = 32.88
Loss = 14.751398086547852
Episode 199: Reward = 28.88
Loss = 13.427520751953125
Episode 224: Reward = 39.52
Loss = 17.536441802978516
Episode 249: Reward = 35.2
Loss = 13.619742393493652
Episode 274: Reward = 48.88
Loss = 21.243408203125
Episode 299: Reward = 38.68
Loss = 16.6977481842041
Episode 324: Reward = 44.56
Loss = 18.12976837158203
Episode 349: Reward = 42.84
Loss = 18.616130828857422
Episode 374: Reward = 55.2
Loss = 23.183975219726562
Episode 399: Reward = 46.36
Loss = 16.62848663330078
Episode 424: Reward = 51.28
Loss = 20.087614059448242
Episode 449: Reward = 44.84
Loss = 18.27011489868164
Episode 474: Reward = 66.64
Loss = 26.13983

The next two cells are to check why the loss increases while training

In [35]:
def train_check_loss(net, optimizer, observations, actions, advantages, sub_observations, sub_actions,
                     sub_advantages, learning_rate=1e-3):
    observations = np.array(observations, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    advantages = np.array(advantages, dtype=np.float32)
    
    sub_observations = np.array(sub_observations, dtype=np.float32)
    sub_actions = np.array(sub_actions, dtype=np.int32)
    sub_advantages = np.array(sub_advantages, dtype=np.float32)
    
    assert len(observations) == len(actions) == len(advantages)
    
    with tf.GradientTape() as t:
        
        probs_actions = net(observations)
        action_masks = tf.one_hot(actions, n_acts)
        
        sub_probs_actions = net(sub_observations)
        sub_action_masks = tf.one_hot(sub_actions, n_acts)
        
        log_pi_sub = tf.reduce_sum(sub_action_masks * tf.nn.log_softmax(sub_probs_actions), axis=1)
        sub_loss = -tf.reduce_mean( log_pi_sub * sub_advantages )
        
        print('sub_loss:', sub_loss.numpy())
        
        log_pi = tf.reduce_sum(action_masks * tf.nn.log_softmax(probs_actions), axis=1)
        loss = -tf.reduce_mean( log_pi * advantages )
    
    d_loss_d_w = t.gradient(loss, net.trainable_weights)
    optimizer.apply_gradients(zip(d_loss_d_w, net.trainable_weights))
    
    return loss

In [36]:
n_episodes = 2000
n_episodes_before_train = 25

episode_rewards = []

observations = []
actions = []
rewards = []

sub_observations = []
sub_actions = []
sub_rewards = []

net = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(n_obs_params, )),
    tf.keras.layers.Dense(n_acts)
])

optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)

for episode in range(n_episodes):
    ep_observations, ep_actions, ep_rewards = play_episode(net)
    observations += ep_observations
    actions += ep_actions
    
    ep_reward = sum(ep_rewards)
    episode_rewards.append(ep_reward)
    _convert_rewards_to_advantages(ep_rewards)
    rewards += ep_rewards

    sub_observations += ep_observations if len(ep_observations) < 10 else ep_observations[:10]
    sub_actions += ep_actions if len(ep_actions) < 10 else ep_actions[:10]
    sub_rewards += ep_rewards if len(ep_rewards) < 10 else ep_rewards[:10]
    
    if (episode + 1) % n_episodes_before_train == 0:
        loss = train_check_loss(net, optimizer, observations, actions, rewards, sub_observations, sub_actions, sub_rewards)
        observations = []
        actions = []
        rewards = []
        
        print('Episode {}: Reward = {}'.format(episode, np.mean(episode_rewards[-n_episodes_before_train:])))
        print('Loss = {}'.format(loss))
        
        if len(episode_rewards) > 100 and np.mean(episode_rewards[-100:]) >= 195:
            net.save_weights('net.h5')
            print('Stopping Criteria Achieved')
            break

sub_loss: 8.132531
Episode 24: Reward = 16.16
Loss = 6.4081268310546875
sub_loss: 9.2115555
Episode 49: Reward = 19.28
Loss = 8.252080917358398
sub_loss: 10.024085
Episode 74: Reward = 21.32
Loss = 9.238167762756348
sub_loss: 10.743457
Episode 99: Reward = 23.36
Loss = 10.934904098510742
sub_loss: 10.735608
Episode 124: Reward = 20.12
Loss = 8.406343460083008
sub_loss: 10.977724
Episode 149: Reward = 22.52
Loss = 10.14055347442627
sub_loss: 11.53183
Episode 174: Reward = 26.52
Loss = 11.65504264831543
sub_loss: 12.560639
Episode 199: Reward = 34.12
Loss = 18.131317138671875
sub_loss: 12.978116
Episode 224: Reward = 28.76
Loss = 11.513986587524414
sub_loss: 13.300332
Episode 249: Reward = 28.68
Loss = 11.460155487060547
sub_loss: 13.863955
Episode 274: Reward = 33.68
Loss = 15.206086158752441
sub_loss: 14.5930605
Episode 299: Reward = 38.92
Loss = 16.90174674987793
sub_loss: 15.300336
Episode 324: Reward = 40.8
Loss = 16.90399742126465
sub_loss: 16.158827
Episode 349: Reward = 46.92
Los