In [None]:
import tensorflow as tf
tf.enable_eager_execution()

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import gym

from collections import deque

In [45]:
ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)

In [46]:
n_obs_params = env.observation_space.shape[0]
n_acts = env.action_space.n

n_obs_params, n_acts

(4, 2)

In [47]:
net = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(n_obs_params, )),
    tf.keras.layers.Dense(n_acts, activation=tf.nn.softmax)
])

net.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 16)                80        
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 34        
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________


Let's see the output of our network given an observation from environment

In [48]:
sample_obs = env.reset()
sample_obs = np.expand_dims(sample_obs, axis=0)

prob_logits = net(sample_obs)[0]
action_chosen = tf.argmax(prob_logits)

print(prob_logits)
print(action_chosen)

tf.Tensor([0.5006045  0.49939552], shape=(2,), dtype=float32)
tf.Tensor(0, shape=(), dtype=int64)


In [49]:
def play_episode(net, render=False):
    
    observations = []
    actions = []
    rewards = []
    
    obs = env.reset()
    done = False
    
    while not done:
        if render: env.render()
        
        observations.append(obs)
        obs = np.expand_dims(obs, axis=0)
        logits = net(obs)
        act = tf.squeeze(tf.multinomial(logits=logits, num_samples=1), axis=1)[0].numpy()
        actions.append(act)
        next_obs, reward, done, info = env.step(act)
        rewards.append(reward)
        obs = next_obs
        
    return observations, actions, rewards

In [50]:
play_episode(net)

([array([ 0.01252563,  0.02398904, -0.00408571,  0.01278471]),
  array([ 0.01300541, -0.17107408, -0.00383001,  0.30417575]),
  array([0.00958393, 0.02410225, 0.0022535 , 0.01028739]),
  array([ 0.01006597, -0.17105195,  0.00245925,  0.30368047]),
  array([ 0.00664493, -0.36620886,  0.00853286,  0.59713797]),
  array([-0.00067925, -0.17120735,  0.02047562,  0.30715498]),
  array([-0.00410339, -0.36661499,  0.02661872,  0.60622441]),
  array([-0.01143569, -0.56209884,  0.03874321,  0.90717129]),
  array([-0.02267767, -0.75772329,  0.05688663,  1.21177542]),
  array([-0.03783213, -0.95353156,  0.08112214,  1.52172836]),
  array([-0.05690277, -1.14953458,  0.11155671,  1.83858979]),
  array([-0.07989346, -1.34569814,  0.1483285 ,  2.16373579]),
  array([-0.10680742, -1.15230641,  0.19160322,  1.92028236])],
 [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

In [51]:
def _convert_rewards_to_advantages(rewards):
    for i in range(-2, -len(rewards)-1, -1):
        rewards[i] += rewards[i+1]

In [52]:
test_list = [1, 2, 3, 4]
_convert_rewards_to_advantages(test_list)

assert test_list == [10, 9, 7, 4]

In [53]:
def train(net, optimizer, observations, actions, advantages, learning_rate=1e-3):
    
    observations = np.array(observations, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    advantages = np.array(advantages, dtype=np.float32)
    
    assert len(observations) == len(actions) == len(advantages)
    
    with tf.GradientTape() as t:
        
        probs_actions = net(observations)
        action_masks = tf.one_hot(actions, n_acts)
        
        log_pi = tf.reduce_sum(action_masks * tf.nn.log_softmax(probs_actions), axis=1)
        #print(log_pi)
        loss = -tf.reduce_mean( log_pi * advantages )
    
    d_loss_d_w = t.gradient(loss, net.trainable_weights)
    optimizer.apply_gradients(zip(d_loss_d_w, net.trainable_weights))
    
    return loss

In [54]:
observations, actions, rewards = play_episode(net)
_convert_rewards_to_advantages(rewards)

In [55]:
train(net, tf.train.AdamOptimizer(learning_rate=1e-2), observations, actions, rewards)

<tf.Tensor: id=7438563, shape=(), dtype=float32, numpy=6.664418>

In [56]:
n_episodes = 2000
n_episodes_before_train = 25

episode_rewards = []

observations = []
actions = []
rewards = []

net = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(n_obs_params, )),
    tf.keras.layers.Dense(n_acts)
])

optimizer = tf.train.AdamOptimizer(learning_rate=1e-2)

for episode in range(n_episodes):
    ep_observations, ep_actions, ep_rewards = play_episode(net)
    observations += ep_observations
    actions += ep_actions
    ep_reward = sum(ep_rewards)
    episode_rewards.append(ep_reward)
    _convert_rewards_to_advantages(ep_rewards)
    rewards += ep_rewards
    
    if (episode + 1) % n_episodes_before_train == 0:
        loss = train(net, optimizer, observations, actions, rewards)
        observations = []
        actions = []
        rewards = []
        
        print('Episode {}: Reward = {}'.format(episode, np.mean(episode_rewards[-n_episodes_before_train:])))
        print('Loss = {}'.format(loss))
        
        if len(episode_rewards) > 100 and np.mean(episode_rewards[-100:]) >= 195:
            net.save('./model/net.h5')
            print('Stopping Criteria Achieved')
            break

Episode 24: Reward = 25.68
Loss = 12.84801959991455
Episode 49: Reward = 30.92
Loss = 13.88710880279541
Episode 74: Reward = 30.96
Loss = 13.513532638549805
Episode 99: Reward = 27.8
Loss = 11.627965927124023
Episode 124: Reward = 30.56
Loss = 12.238093376159668
Episode 149: Reward = 41.68
Loss = 16.866405487060547
Episode 174: Reward = 40.4
Loss = 17.21892547607422
Episode 199: Reward = 41.32
Loss = 19.61188316345215
Episode 224: Reward = 37.0
Loss = 14.223655700683594
Episode 249: Reward = 47.12
Loss = 18.473506927490234
Episode 274: Reward = 48.16
Loss = 18.564058303833008
Episode 299: Reward = 45.88
Loss = 18.801652908325195
Episode 324: Reward = 48.4
Loss = 18.708669662475586
Episode 349: Reward = 60.76
Loss = 24.638866424560547
Episode 374: Reward = 60.28
Loss = 26.26044273376465
Episode 399: Reward = 53.24
Loss = 19.812255859375
Episode 424: Reward = 57.76
Loss = 22.59198570251465
Episode 449: Reward = 67.48
Loss = 23.99311637878418
Episode 474: Reward = 67.04
Loss = 27.09157371