A simple game to be solved with reinforcement learning.

The game takes 21 time steps; actions can be 0, 1 or 2.
During the first 9 steps the reward equals the action. From step 10 onward, action 0 earns a reward of 5 and any other action earns 0.

The game and code are heavily inspired by https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb

In [0]:
# Common imports
import numpy as np
import os
import sys
import tensorflow as tf
import datetime

# To make this notebook's output stable across runs
def reset_graph(seed=42):
    """Start from a fresh default TensorFlow graph with reproducible seeds.

    Args:
        seed: Value handed to both ``np.random.seed`` and
            ``tf.set_random_seed``.
    """
    # NumPy's generator does not depend on the TensorFlow graph, so it can
    # be seeded before the graph is touched.
    np.random.seed(seed)
    # Reset first, then seed: the TensorFlow seed is set on the new default
    # graph created by the reset.
    tf.reset_default_graph()
    tf.set_random_seed(seed)


In [0]:
class SimpleGame():
    """Tiny time-indexed environment exposing ``reset``/``step``/``close``.

    An episode lasts 21 steps. During the first 9 steps the reward equals
    the action that was played; from the 10th step onward only action 0 is
    rewarded (with 5) and every other action yields 0. The intended actions
    are 0, 1 and 2, but the value is not validated.
    """

    def __init__(self):
        # Start in a valid state so that step() also works when the caller
        # has not called reset() yet (previously this raised AttributeError).
        self.t = 0

    def reset(self):
        """Restart the episode and return the initial observation (0.0)."""
        self.t = 0
        return np.array(0.)

    def close(self):
        """Nothing to clean up; kept as a no-op."""
        pass

    def step(self, action):
        """Advance the game by one step.

        Args:
            action: The action played at this step.

        Returns:
            A tuple ``(obs, reward, done, info)`` where ``obs`` is a 0-d
            array holding the time index *before* this step was taken,
            ``reward`` follows the rules in the class docstring, ``done``
            becomes True once 21 steps have been played, and ``info`` is an
            empty dict.
        """
        # NOTE(review): the observation is captured before the counter is
        # advanced, so it lags the new state by one step (the first two
        # decisions of an episode both see 0). Left as is because the
        # recorded experiment results were produced with this behavior.
        obs = np.array(self.t)
        self.t += 1

        if self.t < 10:
            reward = action
        elif action == 0:
            reward = 5
        else:
            reward = 0

        done = self.t > 20
        info = {}
        return obs, reward, done, info
    


We define some convenience functions for discounting and normalizing the rewards.

In [0]:
def discount_rewards(rewards, discount_rate):
    """Return the discounted reward-to-go for every step of one episode.

    Entry ``i`` of the result equals
    ``rewards[i] + discount_rate * rewards[i + 1] + discount_rate**2 * ...``.

    Args:
        rewards: Sequence of per-step rewards of a single episode.
        discount_rate: Factor applied to the accumulated future reward at
            each step going backwards.

    Returns:
        A float NumPy array with the same length as ``rewards``.
    """
    n_steps = len(rewards)
    returns = np.zeros(n_steps)
    running_total = 0
    # Walk from the last step to the first so each entry can reuse the
    # already accumulated future reward.
    idx = n_steps - 1
    while idx >= 0:
        running_total = rewards[idx] + running_total * discount_rate
        returns[idx] = running_total
        idx -= 1
    return returns

def discount_and_normalize_rewards(all_rewards, discount_rate):
    """Discount the rewards of every episode, then standardize them jointly.

    Args:
        all_rewards: Sequence of per-episode reward sequences.
        discount_rate: Discount factor forwarded to ``discount_rewards``.

    Returns:
        A list with one NumPy array per episode. Each array holds that
        episode's discounted rewards, shifted by the mean and divided by the
        standard deviation computed over all steps of all episodes. When
        that standard deviation is exactly zero the values are only
        centered, so every entry is 0.

    Raises:
        ValueError: If ``all_rewards`` is empty (``np.concatenate`` needs at
            least one array).
    """
    all_discounted_rewards = [discount_rewards(rewards, discount_rate) for rewards in all_rewards]
    flat_rewards = np.concatenate(all_discounted_rewards)
    reward_mean = flat_rewards.mean()
    reward_std = flat_rewards.std()
    if reward_std == 0:
        # All discounted rewards are identical (e.g. every game scored 0).
        # Dividing by zero would produce NaN, which would then be fed into
        # the optimizer; fall back to a unit scale so the result is all
        # zeros instead.
        reward_std = 1.0
    return [(discounted_rewards - reward_mean) / reward_std
            for discounted_rewards in all_discounted_rewards]


In [0]:
# Play perfect
def play_perfect():
    """Play one SimpleGame episode with a hand-written policy and print it.

    The policy plays action 2 for the first 9 steps and action 0 afterwards.
    The list of actions, the list of rewards and the reward total are
    printed; nothing is returned.
    """
    env = SimpleGame()
    env.reset()
    n_max_steps = 100  # safety cap; the game reports done long before this
    actions = []
    rewards = []
    for step in range(n_max_steps):
        action = 2 if step < 9 else 0
        # The observation and info dict are not needed by a fixed policy.
        _, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            break
    print(f" Actions: {actions}\n Rewards: {rewards}; total {np.sum(rewards)}")



In [0]:
# Reinforce learning
def reinforce_learn(n_layers=1, n_hidden=4, learning_rate = 0.01, discount_rate = 0.99, activation=tf.nn.elu, verbose_output=True):
    """Train a small policy network on SimpleGame with policy gradients.

    A TensorFlow 1.x graph is built with one input (the observation), one or
    two dense hidden layers and three output logits (one per action). For
    500 iterations, 20 games are played while sampling actions from the
    network; the per-step gradients are then weighted by the discounted,
    normalized rewards, averaged, and applied with Adam.

    Args:
        n_layers: Number of hidden layers; only 1 and 2 are supported.
        n_hidden: Number of units in each hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        discount_rate: Discount factor passed to
            ``discount_and_normalize_rewards``.
        activation: Activation function of the hidden layers.
        verbose_output: If True, print the actions and rewards of the second
            game of iterations 40, 100 and 249. If False, print a single
            CSV line ``n_layers,n_hidden,discount_rate,activation,score``
            after training, where score is the reward total of the last
            game played.

    Raises:
        ValueError: If ``n_layers`` is neither 1 nor 2.
    """
    reset_graph()

    # 1. Specify the network architecture
    n_inputs = 1   # the observation is a single number
    n_outputs = 3  # one logit per action (0, 1, 2)

    initializer = tf.variance_scaling_initializer()

    # 2. Build the neural network
    X = tf.placeholder(tf.float32, shape=[None, n_inputs])
    if n_layers==1:
        hidden = tf.layers.dense(X, n_hidden, activation=activation,
                             kernel_initializer=initializer)
    elif n_layers==2:
        # Despite its name, hidden2 is the first hidden layer; `hidden` is
        # stacked on top of it and feeds the output layer.
        hidden2 = tf.layers.dense(X, n_hidden, activation=activation,
                             kernel_initializer=initializer)
        hidden = tf.layers.dense(hidden2, n_hidden, activation=activation,
                             kernel_initializer=initializer)
    else:
        raise ValueError('not implemented')
    logits = tf.layers.dense(hidden, n_outputs)
    outputs = tf.nn.softmax(logits)

    # 3. Select a random action based on the estimated probabilities
    action = tf.multinomial(tf.log(outputs), num_samples=1)

    # Treat the sampled action as if it were the correct label; the
    # gradients of this loss are later scaled by how good the action
    # turned out to be.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(action, depth=n_outputs), logits=logits)
    optimizer = tf.train.AdamOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(cross_entropy)
    gradients = [grad for grad, variable in grads_and_vars]
    # The gradients are not applied directly. One placeholder per variable
    # lets the reward-weighted mean gradients computed in Python be fed
    # back into the optimizer.
    gradient_placeholders = []
    grads_and_vars_feed = []
    for grad, variable in grads_and_vars:
        gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())
        gradient_placeholders.append(gradient_placeholder)
        grads_and_vars_feed.append((gradient_placeholder, variable))
    training_op = optimizer.apply_gradients(grads_and_vars_feed)

    init = tf.global_variables_initializer()

    env = SimpleGame()
    n_games_per_update = 20  # games played between two weight updates
    n_max_steps = 100        # safety cap on the length of one game
    n_iterations = 500       # number of weight updates

    with tf.Session() as sess:
        init.run()
        for iteration in range(n_iterations):
            all_rewards = []    # one list of per-step rewards per game
            all_gradients = []  # one list of per-step gradient lists per game
            for game in range(n_games_per_update):
                current_rewards = []
                current_gradients = []
                obs = env.reset()
                actions = []
                for step in range(n_max_steps):
                    # A single run both samples the action and evaluates the
                    # gradients for that very sample.
                    action_val, gradients_val = sess.run([action, gradients], feed_dict={X: obs.reshape(1, n_inputs)})
                    # action_val holds one sample for one input row, hence [0][0].
                    obs, reward, done, info = env.step(action_val[0][0])
                    actions.append(action_val[0][0])
                    current_rewards.append(reward)
                    current_gradients.append(gradients_val)

                    if done:
                        break
                # Print a sample game (the second one) at a few fixed iterations.
                if verbose_output and (iteration == 40 or iteration == 100 or iteration == 249) and game == 1:
                    print(f" Iteration: {iteration}, Game: {game}")
                    print(f"  Actions: {actions}\n  Rewards: {current_rewards}; total {np.sum(current_rewards)}")

                all_rewards.append(current_rewards)
                all_gradients.append(current_gradients)

            # From here on all_rewards holds the discounted, normalized
            # scores, which are used as per-step weights for the gradients.
            all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)
            feed_dict = {}
            for var_index, gradient_placeholder in enumerate(gradient_placeholders):
                # Mean over every step of every game of (score * gradient)
                # for this variable. A positive score pushes the weights
                # toward the sampled action, a negative one away from it.
                mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]
                                          for game_index, rewards in enumerate(all_rewards)
                                              for step, reward in enumerate(rewards)], axis=0)
                feed_dict[gradient_placeholder] = mean_gradients
            sess.run(training_op, feed_dict=feed_dict)
        if not verbose_output:
            # current_rewards still refers to the last game of the last
            # iteration, so the reported score is that single game's total.
            print(f"{n_layers},{n_hidden},{discount_rate},{activation.__qualname__},{np.sum(current_rewards)}")

In [8]:
def run_all():
    """Print the perfect-play baseline, then sweep the network settings.

    ``reinforce_learn`` is run once per combination of 1-2 hidden layers,
    1-9 hidden units and five activation functions, with a discount rate of
    1 and non-verbose output, so each run prints one CSV line.
    """
    print("Perfect play")
    play_perfect()
    print("Reinforce learning")

    activations = (tf.nn.elu, tf.nn.relu, tf.nn.sigmoid, tf.nn.leaky_relu, tf.nn.tanh)
    discount_rates = (1,)

    # Flatten the sweep into a list of settings. The nesting order (layers,
    # then units, then activation, then discount rate) fixes the order in
    # which the runs are executed and printed.
    settings = [
        (n_layers, n_hidden, activation, discount_rate)
        for n_layers in (1, 2)
        for n_hidden in range(1, 10)
        for activation in activations
        for discount_rate in discount_rates
    ]
    for n_layers, n_hidden, activation, discount_rate in settings:
        reinforce_learn(
            n_layers=n_layers,
            n_hidden=n_hidden,
            activation=activation,
            discount_rate=discount_rate,
            verbose_output=False,
        )
    
run_all()

Perfect play, n_parameters=1
 Actions: [2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
 Rewards: [2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]; total 78
Reinforce learning
1,1,1,elu,70
1,1,1,relu,50
1,1,1,sigmoid,58
1,1,1,leaky_relu,67
1,1,1,tanh,70
1,2,1,elu,18
1,2,1,relu,18
1,2,1,sigmoid,68
1,2,1,leaky_relu,18
1,2,1,tanh,68
1,3,1,elu,18
1,3,1,relu,18
1,3,1,sigmoid,67
1,3,1,leaky_relu,18
1,3,1,tanh,70
1,4,1,elu,18
1,4,1,relu,69
1,4,1,sigmoid,67
1,4,1,leaky_relu,18
1,4,1,tanh,61
1,5,1,elu,76
1,5,1,relu,69
1,5,1,sigmoid,71
1,5,1,leaky_relu,18
1,5,1,tanh,66
1,6,1,elu,76
1,6,1,relu,72
1,6,1,sigmoid,71
1,6,1,leaky_relu,72
1,6,1,tanh,71
1,7,1,elu,76
1,7,1,relu,72
1,7,1,sigmoid,71
1,7,1,leaky_relu,74
1,7,1,tanh,71
1,8,1,elu,76
1,8,1,relu,69
1,8,1,sigmoid,71
1,8,1,leaky_relu,76
1,8,1,tanh,76
1,9,1,elu,76
1,9,1,relu,72
1,9,1,sigmoid,71
1,9,1,leaky_relu,76
1,9,1,tanh,76
2,1,1,elu,75
2,1,1,relu,60
2,1,1,sigmoid,60
2,1,1,leaky_relu,68
2,1,1,tanh,77
2,2,1,elu,74
2

In [0]:
import pandas as pd
from io import StringIO

In [0]:
data=StringIO('''
n_layer,n_hidden,update_rate,activation,score
1,1,1,elu,70
1,1,1,relu,50
1,1,1,sigmoid,58
1,1,1,leaky_relu,67
1,1,1,tanh,70
1,2,1,elu,18
1,2,1,relu,18
1,2,1,sigmoid,68
1,2,1,leaky_relu,18
1,2,1,tanh,68
1,3,1,elu,18
1,3,1,relu,18
1,3,1,sigmoid,67
1,3,1,leaky_relu,18
1,3,1,tanh,70
1,4,1,elu,18
1,4,1,relu,69
1,4,1,sigmoid,67
1,4,1,leaky_relu,18
1,4,1,tanh,61
1,5,1,elu,76
1,5,1,relu,69
1,5,1,sigmoid,71
1,5,1,leaky_relu,18
1,5,1,tanh,66
1,6,1,elu,76
1,6,1,relu,72
1,6,1,sigmoid,71
1,6,1,leaky_relu,72
1,6,1,tanh,71
1,7,1,elu,76
1,7,1,relu,72
1,7,1,sigmoid,71
1,7,1,leaky_relu,74
1,7,1,tanh,71
1,8,1,elu,76
1,8,1,relu,69
1,8,1,sigmoid,71
1,8,1,leaky_relu,76
1,8,1,tanh,76
1,9,1,elu,76
1,9,1,relu,72
1,9,1,sigmoid,71
1,9,1,leaky_relu,76
1,9,1,tanh,76
2,1,1,elu,75
2,1,1,relu,60
2,1,1,sigmoid,60
2,1,1,leaky_relu,68
2,1,1,tanh,77
2,2,1,elu,74
2,2,1,relu,60
2,2,1,sigmoid,60
2,2,1,leaky_relu,71
2,2,1,tanh,67
2,3,1,elu,78
2,3,1,relu,70
2,3,1,sigmoid,60
2,3,1,leaky_relu,72
2,3,1,tanh,78
2,4,1,elu,72
2,4,1,relu,78
2,4,1,sigmoid,60
2,4,1,leaky_relu,72
2,4,1,tanh,64
2,5,1,elu,18
2,5,1,relu,18
2,5,1,sigmoid,60
2,5,1,leaky_relu,18
2,5,1,tanh,78
2,6,1,elu,18
2,6,1,relu,18
2,6,1,sigmoid,60
2,6,1,leaky_relu,18
2,6,1,tanh,78
2,7,1,elu,18
2,7,1,relu,18
2,7,1,sigmoid,61
2,7,1,leaky_relu,18
2,7,1,tanh,76
2,8,1,elu,78
2,8,1,relu,78
2,8,1,sigmoid,64
2,8,1,leaky_relu,78
2,8,1,tanh,64
2,9,1,elu,18
2,9,1,relu,18
2,9,1,sigmoid,68
2,9,1,leaky_relu,18
2,9,1,tanh,78''')

In [13]:
# Parse the CSV text into a DataFrame (read_csv skips the blank first line
# of the literal by default) and preview the first rows.
df=pd.read_csv(data)
df.head()

Unnamed: 0,n_layer,n_hidden,update_rate,activation,score
0,1,1,1,elu,70
1,1,1,1,relu,50
2,1,1,1,sigmoid,58
3,1,1,1,leaky_relu,67
4,1,1,1,tanh,70


In [0]:
# One row per (activation, n_layer) pair and one column per n_hidden value.
# pivot_table aggregates with the mean by default.
pt = pd.pivot_table(df, values='score', index=['activation', 'n_layer'], columns = ['n_hidden'])

In [18]:
# Print the pivot table as a pipe-delimited text table.
from tabulate import tabulate
print(tabulate(pt, headers='keys', tablefmt='pipe'))

|                   |   1 |   2 |   3 |   4 |   5 |   6 |   7 |   8 |   9 |
|:------------------|----:|----:|----:|----:|----:|----:|----:|----:|----:|
| ('elu', 1)        |  70 |  18 |  18 |  18 |  76 |  76 |  76 |  76 |  76 |
| ('elu', 2)        |  75 |  74 |  78 |  72 |  18 |  18 |  18 |  78 |  18 |
| ('leaky_relu', 1) |  67 |  18 |  18 |  18 |  18 |  72 |  74 |  76 |  76 |
| ('leaky_relu', 2) |  68 |  71 |  72 |  72 |  18 |  18 |  18 |  78 |  18 |
| ('relu', 1)       |  50 |  18 |  18 |  69 |  69 |  72 |  72 |  69 |  72 |
| ('relu', 2)       |  60 |  60 |  70 |  78 |  18 |  18 |  18 |  78 |  18 |
| ('sigmoid', 1)    |  58 |  68 |  67 |  67 |  71 |  71 |  71 |  71 |  71 |
| ('sigmoid', 2)    |  60 |  60 |  60 |  60 |  60 |  60 |  61 |  64 |  68 |
| ('tanh', 1)       |  70 |  68 |  70 |  61 |  66 |  71 |  71 |  76 |  76 |
| ('tanh', 2)       |  77 |  67 |  78 |  64 |  78 |  78 |  76 |  64 |  78 |
