# Lab 3-1: REINFORCE
    In this lab, you need to implement a REINFORCE algorithm with Tensorflow and solve OpenAI Gym CartPole-v0

In [1]:
from cartpole_env import *

import numpy as np
import tensorflow as tf

from collections import namedtuple, deque

# Define the data structure of experience
Experience = namedtuple('Experience', 'state action reward next_state done')

## Implement ```discount``` function to compute discounted reward

In [2]:
def discount(rewards, gamma):
    '''
    param rewards: a rewards numpy array
    param gamma: discount factor
    '''
    discounted_rewards = np.zeros_like(rewards)
    
    # TODO: Calculate discounted rewards
    discounted_rewards = rewards
    s = 0
    for i in reversed(range(len(rewards))):
        #
        # reward = [1, 2, 3]
        # discounted_rewards = [
        #    1 + gamma * 2 + gamma * gamma * 3, 
        #    2 + gamma * 3,
        #    3 ]
        #
        
        s = s * gamma + rewards[i]
        discounted_rewards[i] = s

    return discounted_rewards

## Implement ```do_rollout``` function to collect rollout

In [3]:
def do_rollout(env, policy, render=False):
    '''
    Collect a rollout from env with policy
    
    param env: RL Environment
    param policy: a function parameterized by environment state, return a action
    return a list of (state, action, reward, next_state, done)
    '''
    # Initialize done as False
    done = False
    
    # Reset the environment and get the initial state
    state = env.reset()
    
    # Empty list
    rollout = []
    
    while not done:
        action = policy(state)
        next_state, reward, done, info = env.step(action)
        
        # Render the environment (slow)
        if render:
            env.render()
        
        rollout.append(Experience(state, action, reward, next_state, done))
        state = next_state
        
    return rollout

## Implement ```ReinforceAgent``` following ```TODO```

In [4]:
class ReinforceAgent(object):
    def __init__(self, sess, n_states, n_actions, n_hiddens, lr, gamma):
        '''
        param sess: tf session
        param n_states: dim of states
        param n_actions: dim of actions space
        param n_hiddens: dim of hidden state
        '''
        self.sess = sess
        self.n_states = n_states
        self.n_actions = n_actions
        self.n_hiddens = n_hiddens
        
        # Learning rate
        self.lr = lr
        
        # Discount factor
        self.gamma = gamma
       
        self.state = tf.placeholder(shape=[None, n_states], dtype=tf.float32)
        self.value = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        
        # TODO: Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = self.state
        # n_units = self.n_hiddens
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        self.h = tf.layers.dense(
            inputs=self.state,
            units=n_hiddens,   
            activation=tf.nn.relu,
            kernel_initializer=tf.random_normal_initializer(0., .1), 
            bias_initializer=tf.constant_initializer(0.1), 
            name='h')
        
        # Declare 1-st hidden layer
        # Define a fully-connected layer with:
        # input = 1-st hidden layer
        # n_units = self.n_actions
        # activation = relu
        # weight_initializer = random_normal(0.0, 0.1)
        # bias_initializer = constant (0.1)
        self.policy = tf.layers.dense(
            inputs=self.h,
            units=n_actions,    
            activation=tf.nn.softmax,  
            kernel_initializer=tf.random_normal_initializer(0., .1), 
            bias_initializer=tf.constant_initializer(0.1),
            name='policy')
        
        # TODO: negative log probability
        neg_log = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.action, logits=self.policy)
        
        # TODO: policy gradient loss function
        self.loss = tf.reduce_mean(neg_log * self.value)
        
        # Optimizer
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr)
        self.train_op = self.optimizer.minimize(self.loss)
        
    def layer(self, inputs, n_input, n_units, activation=tf.nn.relu):
        weights = tf.Variable(
            tf.random_normal([n_input, n_units], mean=0.0, stddev=0.1), name="weights")
        bias = tf.Variable(tf.constant(0.1, shape=[n_units]))
        outputs = activation(tf.matmul(inputs, weights) + bias) 
        
        return outputs
    
    def act(self, s):
        '''
        param s: a np.ndarray with shape [n_batches, n_states]
        return a batch of actions with shape [n_batches,]
        '''
        # TODO: Softmax stochastic policy
        policy = sess.run(self.policy, feed_dict={self.state: s})
        action = np.random.choice(self.n_actions, 1, p=policy.reshape(-1))
        
        return action
    
    def train(self, rollout):
        '''
        param rollout: a list o
        '''
        states = np.array([np.asarray(e.state) for e in rollout ])
        actions = np.squeeze(np.array([ e.action for e in rollout ]))
        rewards = np.array([ e.reward for e in rollout ])
        discounted_rewards = discount(rewards, gamma=self.gamma)
        
        self.sess.run(self.train_op, feed_dict={self.state: states,
                                                self.action: actions,
                                                self.value: discounted_rewards})
   
        

In [5]:
LR = 0.001
GAMMA = 0.99

sess = tf.InteractiveSession()
env = CartpoleEnvironment()
agent = ReinforceAgent(sess=sess, 
                       n_states=env.observation_space.shape[0],
                       n_actions=env.action_space.n,
                       n_hiddens=20,
                       lr=LR,
                       gamma=GAMMA)
init = tf.global_variables_initializer()
sess.run(init)

In [6]:
def policy(s):
    return agent.act([s])[0]

In [7]:
def calculate_episode_reward(rollout):
    rewards = [ e.reward for e in rollout ]
    return sum(rewards)

In [None]:
def eval_history_reward(history):
    arr = np.asarray(history)
    return arr.mean()

In [None]:
MAX_ITERATIONS = 100000

episode_reward = 0.0
history_episode_rewards = deque(maxlen=100)

plot_history_episode_rewards = []

for iter in range(MAX_ITERATIONS):
    rollout = do_rollout(env=env, policy=policy, render=False)
    agent.train(rollout=rollout)
    
    episode_reward = calculate_episode_reward(rollout)
    history_episode_rewards.append(episode_reward)
    plot_history_episode_rewards.append(episode_reward)
    mean_rewards = eval_history_reward(history_episode_rewards)
    
    print('Episode %d: Reward = %f, Mean reward (over %d episodes) = %f' % (iter, 
                                                                            episode_reward,
                                                                            len(history_episode_rewards),
                                                                            mean_rewards))
    if mean_rewards > 195.0:
        print('Pass')
        break

Episode 0: Reward = -9.000000, Mean reward (over 1 episodes) = -9.000000
Episode 1: Reward = 4.000000, Mean reward (over 2 episodes) = -2.500000
Episode 2: Reward = 2.000000, Mean reward (over 3 episodes) = -1.000000
Episode 3: Reward = 14.000000, Mean reward (over 4 episodes) = 2.750000
Episode 4: Reward = 9.000000, Mean reward (over 5 episodes) = 4.000000
Episode 5: Reward = 8.000000, Mean reward (over 6 episodes) = 4.666667
Episode 6: Reward = -1.000000, Mean reward (over 7 episodes) = 3.857143
Episode 7: Reward = -2.000000, Mean reward (over 8 episodes) = 3.125000
Episode 8: Reward = -10.000000, Mean reward (over 9 episodes) = 1.666667
Episode 9: Reward = 23.000000, Mean reward (over 10 episodes) = 3.800000
Episode 10: Reward = 3.000000, Mean reward (over 11 episodes) = 3.727273
Episode 11: Reward = -10.000000, Mean reward (over 12 episodes) = 2.583333
Episode 12: Reward = 27.000000, Mean reward (over 13 episodes) = 4.461538
Episode 13: Reward = 26.000000, Mean reward (over 14 epis

Episode 116: Reward = 30.000000, Mean reward (over 100 episodes) = 2.230000
Episode 117: Reward = 17.000000, Mean reward (over 100 episodes) = 2.460000
Episode 118: Reward = 0.000000, Mean reward (over 100 episodes) = 2.510000
Episode 119: Reward = 20.000000, Mean reward (over 100 episodes) = 2.730000
Episode 120: Reward = -2.000000, Mean reward (over 100 episodes) = 2.820000
Episode 121: Reward = -9.000000, Mean reward (over 100 episodes) = 2.530000
Episode 122: Reward = 5.000000, Mean reward (over 100 episodes) = 2.670000
Episode 123: Reward = 26.000000, Mean reward (over 100 episodes) = 3.000000
Episode 124: Reward = 14.000000, Mean reward (over 100 episodes) = 3.250000
Episode 125: Reward = -8.000000, Mean reward (over 100 episodes) = 3.280000
Episode 126: Reward = -2.000000, Mean reward (over 100 episodes) = 3.190000
Episode 127: Reward = 30.000000, Mean reward (over 100 episodes) = 3.490000
Episode 128: Reward = -11.000000, Mean reward (over 100 episodes) = 3.490000
Episode 129: 

Episode 231: Reward = 48.000000, Mean reward (over 100 episodes) = 5.910000
Episode 232: Reward = 4.000000, Mean reward (over 100 episodes) = 6.040000
Episode 233: Reward = -8.000000, Mean reward (over 100 episodes) = 5.650000
Episode 234: Reward = 42.000000, Mean reward (over 100 episodes) = 6.190000
Episode 235: Reward = -1.000000, Mean reward (over 100 episodes) = 6.170000
Episode 236: Reward = -7.000000, Mean reward (over 100 episodes) = 6.130000
Episode 237: Reward = 1.000000, Mean reward (over 100 episodes) = 6.220000
Episode 238: Reward = 35.000000, Mean reward (over 100 episodes) = 6.690000
Episode 239: Reward = -7.000000, Mean reward (over 100 episodes) = 6.550000
Episode 240: Reward = 15.000000, Mean reward (over 100 episodes) = 6.770000
Episode 241: Reward = 13.000000, Mean reward (over 100 episodes) = 6.880000
Episode 242: Reward = 29.000000, Mean reward (over 100 episodes) = 6.890000
Episode 243: Reward = 20.000000, Mean reward (over 100 episodes) = 7.070000
Episode 244: R

Episode 344: Reward = 29.000000, Mean reward (over 100 episodes) = 13.680000
Episode 345: Reward = 3.000000, Mean reward (over 100 episodes) = 13.570000
Episode 346: Reward = 7.000000, Mean reward (over 100 episodes) = 13.370000
Episode 347: Reward = -10.000000, Mean reward (over 100 episodes) = 13.140000
Episode 348: Reward = 10.000000, Mean reward (over 100 episodes) = 13.130000
Episode 349: Reward = -4.000000, Mean reward (over 100 episodes) = 13.100000
Episode 350: Reward = 49.000000, Mean reward (over 100 episodes) = 13.500000
Episode 351: Reward = 8.000000, Mean reward (over 100 episodes) = 13.390000
Episode 352: Reward = -8.000000, Mean reward (over 100 episodes) = 12.950000
Episode 353: Reward = -4.000000, Mean reward (over 100 episodes) = 12.750000
Episode 354: Reward = 4.000000, Mean reward (over 100 episodes) = 12.740000
Episode 355: Reward = 19.000000, Mean reward (over 100 episodes) = 12.860000
Episode 356: Reward = 8.000000, Mean reward (over 100 episodes) = 12.950000
Epi

Episode 451: Reward = 60.000000, Mean reward (over 100 episodes) = 26.290000
Episode 452: Reward = 29.000000, Mean reward (over 100 episodes) = 26.660000
Episode 453: Reward = 2.000000, Mean reward (over 100 episodes) = 26.720000
Episode 454: Reward = 5.000000, Mean reward (over 100 episodes) = 26.730000
Episode 455: Reward = 30.000000, Mean reward (over 100 episodes) = 26.840000
Episode 456: Reward = 56.000000, Mean reward (over 100 episodes) = 27.320000
Episode 457: Reward = 18.000000, Mean reward (over 100 episodes) = 26.930000
Episode 458: Reward = 40.000000, Mean reward (over 100 episodes) = 27.370000
Episode 459: Reward = 34.000000, Mean reward (over 100 episodes) = 27.570000
Episode 460: Reward = 24.000000, Mean reward (over 100 episodes) = 27.670000
Episode 461: Reward = 70.000000, Mean reward (over 100 episodes) = 28.190000
Episode 462: Reward = 23.000000, Mean reward (over 100 episodes) = 28.460000
Episode 463: Reward = 7.000000, Mean reward (over 100 episodes) = 28.370000
Ep

Episode 558: Reward = 93.000000, Mean reward (over 100 episodes) = 37.010000
Episode 559: Reward = 94.000000, Mean reward (over 100 episodes) = 37.610000
Episode 560: Reward = 45.000000, Mean reward (over 100 episodes) = 37.820000
Episode 561: Reward = 27.000000, Mean reward (over 100 episodes) = 37.390000
Episode 562: Reward = 6.000000, Mean reward (over 100 episodes) = 37.220000
Episode 563: Reward = 34.000000, Mean reward (over 100 episodes) = 37.490000
Episode 564: Reward = 120.000000, Mean reward (over 100 episodes) = 38.040000
Episode 565: Reward = 110.000000, Mean reward (over 100 episodes) = 39.130000
Episode 566: Reward = 93.000000, Mean reward (over 100 episodes) = 39.210000
Episode 567: Reward = 39.000000, Mean reward (over 100 episodes) = 39.600000
Episode 568: Reward = 38.000000, Mean reward (over 100 episodes) = 39.850000
Episode 569: Reward = 71.000000, Mean reward (over 100 episodes) = 40.340000
Episode 570: Reward = 36.000000, Mean reward (over 100 episodes) = 40.47000

Episode 667: Reward = 97.000000, Mean reward (over 100 episodes) = 63.780000
Episode 668: Reward = 65.000000, Mean reward (over 100 episodes) = 64.050000
Episode 669: Reward = 50.000000, Mean reward (over 100 episodes) = 63.840000
Episode 670: Reward = 77.000000, Mean reward (over 100 episodes) = 64.250000
Episode 671: Reward = 49.000000, Mean reward (over 100 episodes) = 64.450000
Episode 672: Reward = 121.000000, Mean reward (over 100 episodes) = 65.480000
Episode 673: Reward = 35.000000, Mean reward (over 100 episodes) = 65.460000
Episode 674: Reward = 41.000000, Mean reward (over 100 episodes) = 65.680000
Episode 675: Reward = 56.000000, Mean reward (over 100 episodes) = 65.910000
Episode 676: Reward = 52.000000, Mean reward (over 100 episodes) = 66.010000
Episode 677: Reward = 61.000000, Mean reward (over 100 episodes) = 66.220000
Episode 678: Reward = 81.000000, Mean reward (over 100 episodes) = 66.980000
Episode 679: Reward = 91.000000, Mean reward (over 100 episodes) = 67.60000

Episode 775: Reward = 112.000000, Mean reward (over 100 episodes) = 72.480000
Episode 776: Reward = 53.000000, Mean reward (over 100 episodes) = 72.490000
Episode 777: Reward = 98.000000, Mean reward (over 100 episodes) = 72.860000
Episode 778: Reward = 165.000000, Mean reward (over 100 episodes) = 73.700000
Episode 779: Reward = 29.000000, Mean reward (over 100 episodes) = 73.080000
Episode 780: Reward = 23.000000, Mean reward (over 100 episodes) = 71.310000
Episode 781: Reward = 43.000000, Mean reward (over 100 episodes) = 71.150000
Episode 782: Reward = 46.000000, Mean reward (over 100 episodes) = 70.420000
Episode 783: Reward = 32.000000, Mean reward (over 100 episodes) = 70.470000
Episode 784: Reward = 83.000000, Mean reward (over 100 episodes) = 70.810000
Episode 785: Reward = 134.000000, Mean reward (over 100 episodes) = 71.780000
Episode 786: Reward = 38.000000, Mean reward (over 100 episodes) = 71.530000
Episode 787: Reward = 26.000000, Mean reward (over 100 episodes) = 71.640

Episode 884: Reward = 48.000000, Mean reward (over 100 episodes) = 56.430000
Episode 885: Reward = 50.000000, Mean reward (over 100 episodes) = 55.590000
Episode 886: Reward = 61.000000, Mean reward (over 100 episodes) = 55.820000
Episode 887: Reward = 32.000000, Mean reward (over 100 episodes) = 55.880000
Episode 888: Reward = 101.000000, Mean reward (over 100 episodes) = 56.180000
Episode 889: Reward = 52.000000, Mean reward (over 100 episodes) = 56.290000
Episode 890: Reward = 30.000000, Mean reward (over 100 episodes) = 56.220000
Episode 891: Reward = 97.000000, Mean reward (over 100 episodes) = 56.650000
Episode 892: Reward = 60.000000, Mean reward (over 100 episodes) = 56.530000
Episode 893: Reward = 72.000000, Mean reward (over 100 episodes) = 55.640000
Episode 894: Reward = 67.000000, Mean reward (over 100 episodes) = 55.260000
Episode 895: Reward = 58.000000, Mean reward (over 100 episodes) = 55.400000
Episode 896: Reward = 86.000000, Mean reward (over 100 episodes) = 55.95000

Episode 992: Reward = 154.000000, Mean reward (over 100 episodes) = 58.210000
Episode 993: Reward = 32.000000, Mean reward (over 100 episodes) = 57.810000
Episode 994: Reward = 62.000000, Mean reward (over 100 episodes) = 57.760000
Episode 995: Reward = 79.000000, Mean reward (over 100 episodes) = 57.970000
Episode 996: Reward = 48.000000, Mean reward (over 100 episodes) = 57.590000
Episode 997: Reward = 63.000000, Mean reward (over 100 episodes) = 57.780000
Episode 998: Reward = 48.000000, Mean reward (over 100 episodes) = 58.020000
Episode 999: Reward = 41.000000, Mean reward (over 100 episodes) = 58.230000
Episode 1000: Reward = 59.000000, Mean reward (over 100 episodes) = 57.870000
Episode 1001: Reward = 28.000000, Mean reward (over 100 episodes) = 57.570000
Episode 1002: Reward = 64.000000, Mean reward (over 100 episodes) = 57.800000
Episode 1003: Reward = 37.000000, Mean reward (over 100 episodes) = 57.880000
Episode 1004: Reward = 34.000000, Mean reward (over 100 episodes) = 57.

Episode 1098: Reward = 51.000000, Mean reward (over 100 episodes) = 62.600000
Episode 1099: Reward = 31.000000, Mean reward (over 100 episodes) = 62.500000
Episode 1100: Reward = 79.000000, Mean reward (over 100 episodes) = 62.700000
Episode 1101: Reward = 72.000000, Mean reward (over 100 episodes) = 63.140000
Episode 1102: Reward = 87.000000, Mean reward (over 100 episodes) = 63.370000
Episode 1103: Reward = 140.000000, Mean reward (over 100 episodes) = 64.400000
Episode 1104: Reward = 73.000000, Mean reward (over 100 episodes) = 64.790000
Episode 1105: Reward = 45.000000, Mean reward (over 100 episodes) = 64.440000
Episode 1106: Reward = 47.000000, Mean reward (over 100 episodes) = 64.270000
Episode 1107: Reward = 79.000000, Mean reward (over 100 episodes) = 64.620000
Episode 1108: Reward = 178.000000, Mean reward (over 100 episodes) = 65.530000
Episode 1109: Reward = 49.000000, Mean reward (over 100 episodes) = 65.540000
Episode 1110: Reward = 100.000000, Mean reward (over 100 episo

Episode 1203: Reward = 87.000000, Mean reward (over 100 episodes) = 74.770000
Episode 1204: Reward = 200.000000, Mean reward (over 100 episodes) = 76.040000
Episode 1205: Reward = 62.000000, Mean reward (over 100 episodes) = 76.210000
Episode 1206: Reward = 34.000000, Mean reward (over 100 episodes) = 76.080000
Episode 1207: Reward = 45.000000, Mean reward (over 100 episodes) = 75.740000
Episode 1208: Reward = 47.000000, Mean reward (over 100 episodes) = 74.430000
Episode 1209: Reward = 49.000000, Mean reward (over 100 episodes) = 74.430000
Episode 1210: Reward = 80.000000, Mean reward (over 100 episodes) = 74.230000
Episode 1211: Reward = 70.000000, Mean reward (over 100 episodes) = 74.060000
Episode 1212: Reward = 96.000000, Mean reward (over 100 episodes) = 73.990000
Episode 1213: Reward = 46.000000, Mean reward (over 100 episodes) = 74.140000
Episode 1214: Reward = 56.000000, Mean reward (over 100 episodes) = 74.280000
Episode 1215: Reward = 118.000000, Mean reward (over 100 episod

Episode 1309: Reward = 67.000000, Mean reward (over 100 episodes) = 74.310000
Episode 1310: Reward = 71.000000, Mean reward (over 100 episodes) = 74.220000
Episode 1311: Reward = 85.000000, Mean reward (over 100 episodes) = 74.370000
Episode 1312: Reward = 81.000000, Mean reward (over 100 episodes) = 74.220000
Episode 1313: Reward = 55.000000, Mean reward (over 100 episodes) = 74.310000
Episode 1314: Reward = 126.000000, Mean reward (over 100 episodes) = 75.010000
Episode 1315: Reward = 32.000000, Mean reward (over 100 episodes) = 74.150000
Episode 1316: Reward = 95.000000, Mean reward (over 100 episodes) = 74.390000
Episode 1317: Reward = 67.000000, Mean reward (over 100 episodes) = 74.260000
Episode 1318: Reward = 58.000000, Mean reward (over 100 episodes) = 72.840000
Episode 1319: Reward = 56.000000, Mean reward (over 100 episodes) = 72.770000
Episode 1320: Reward = 38.000000, Mean reward (over 100 episodes) = 72.540000
Episode 1321: Reward = 52.000000, Mean reward (over 100 episode

Episode 1416: Reward = 138.000000, Mean reward (over 100 episodes) = 80.670000
Episode 1417: Reward = 107.000000, Mean reward (over 100 episodes) = 81.070000
Episode 1418: Reward = 93.000000, Mean reward (over 100 episodes) = 81.420000
Episode 1419: Reward = 85.000000, Mean reward (over 100 episodes) = 81.710000
Episode 1420: Reward = 100.000000, Mean reward (over 100 episodes) = 82.330000
Episode 1421: Reward = 79.000000, Mean reward (over 100 episodes) = 82.600000
Episode 1422: Reward = 42.000000, Mean reward (over 100 episodes) = 82.090000
Episode 1423: Reward = 55.000000, Mean reward (over 100 episodes) = 81.700000
Episode 1424: Reward = 110.000000, Mean reward (over 100 episodes) = 82.280000
Episode 1425: Reward = 62.000000, Mean reward (over 100 episodes) = 81.390000
Episode 1426: Reward = 200.000000, Mean reward (over 100 episodes) = 82.420000
Episode 1427: Reward = 96.000000, Mean reward (over 100 episodes) = 82.770000
Episode 1428: Reward = 106.000000, Mean reward (over 100 ep

Episode 1522: Reward = 93.000000, Mean reward (over 100 episodes) = 83.610000
Episode 1523: Reward = 127.000000, Mean reward (over 100 episodes) = 84.330000
Episode 1524: Reward = 73.000000, Mean reward (over 100 episodes) = 83.960000
Episode 1525: Reward = 34.000000, Mean reward (over 100 episodes) = 83.680000
Episode 1526: Reward = 200.000000, Mean reward (over 100 episodes) = 83.680000
Episode 1527: Reward = 53.000000, Mean reward (over 100 episodes) = 83.250000
Episode 1528: Reward = 94.000000, Mean reward (over 100 episodes) = 83.130000
Episode 1529: Reward = 49.000000, Mean reward (over 100 episodes) = 82.400000
Episode 1530: Reward = 37.000000, Mean reward (over 100 episodes) = 81.240000
Episode 1531: Reward = 77.000000, Mean reward (over 100 episodes) = 81.210000
Episode 1532: Reward = 200.000000, Mean reward (over 100 episodes) = 81.210000
Episode 1533: Reward = 76.000000, Mean reward (over 100 episodes) = 81.500000
Episode 1534: Reward = 123.000000, Mean reward (over 100 epis

Episode 1627: Reward = 51.000000, Mean reward (over 100 episodes) = 93.620000
Episode 1628: Reward = 75.000000, Mean reward (over 100 episodes) = 93.430000
Episode 1629: Reward = 76.000000, Mean reward (over 100 episodes) = 93.700000
Episode 1630: Reward = 96.000000, Mean reward (over 100 episodes) = 94.290000
Episode 1631: Reward = 75.000000, Mean reward (over 100 episodes) = 94.270000
Episode 1632: Reward = 74.000000, Mean reward (over 100 episodes) = 93.010000
Episode 1633: Reward = 132.000000, Mean reward (over 100 episodes) = 93.570000
Episode 1634: Reward = 56.000000, Mean reward (over 100 episodes) = 92.900000
Episode 1635: Reward = 88.000000, Mean reward (over 100 episodes) = 92.890000
Episode 1636: Reward = 99.000000, Mean reward (over 100 episodes) = 93.100000
Episode 1637: Reward = 51.000000, Mean reward (over 100 episodes) = 92.740000
Episode 1638: Reward = 84.000000, Mean reward (over 100 episodes) = 92.790000
Episode 1639: Reward = 37.000000, Mean reward (over 100 episode

Episode 1732: Reward = 61.000000, Mean reward (over 100 episodes) = 95.340000
Episode 1733: Reward = 39.000000, Mean reward (over 100 episodes) = 94.410000
Episode 1734: Reward = 84.000000, Mean reward (over 100 episodes) = 94.690000
Episode 1735: Reward = 113.000000, Mean reward (over 100 episodes) = 94.940000
Episode 1736: Reward = 62.000000, Mean reward (over 100 episodes) = 94.570000
Episode 1737: Reward = 102.000000, Mean reward (over 100 episodes) = 95.080000
Episode 1738: Reward = 72.000000, Mean reward (over 100 episodes) = 94.960000
Episode 1739: Reward = 48.000000, Mean reward (over 100 episodes) = 95.070000
Episode 1740: Reward = 108.000000, Mean reward (over 100 episodes) = 95.370000
Episode 1741: Reward = 62.000000, Mean reward (over 100 episodes) = 95.520000
Episode 1742: Reward = 132.000000, Mean reward (over 100 episodes) = 96.350000
Episode 1743: Reward = 200.000000, Mean reward (over 100 episodes) = 96.830000
Episode 1744: Reward = 175.000000, Mean reward (over 100 ep

Episode 1837: Reward = 90.000000, Mean reward (over 100 episodes) = 106.160000
Episode 1838: Reward = 65.000000, Mean reward (over 100 episodes) = 106.090000
Episode 1839: Reward = 100.000000, Mean reward (over 100 episodes) = 106.610000
Episode 1840: Reward = 77.000000, Mean reward (over 100 episodes) = 106.300000
Episode 1841: Reward = 116.000000, Mean reward (over 100 episodes) = 106.840000
Episode 1842: Reward = 169.000000, Mean reward (over 100 episodes) = 107.210000
Episode 1843: Reward = 44.000000, Mean reward (over 100 episodes) = 105.650000
Episode 1844: Reward = 50.000000, Mean reward (over 100 episodes) = 104.400000
Episode 1845: Reward = 48.000000, Mean reward (over 100 episodes) = 104.130000
Episode 1846: Reward = 200.000000, Mean reward (over 100 episodes) = 104.510000
Episode 1847: Reward = 58.000000, Mean reward (over 100 episodes) = 103.460000
Episode 1848: Reward = 59.000000, Mean reward (over 100 episodes) = 103.450000
Episode 1849: Reward = 105.000000, Mean reward (

Episode 1940: Reward = 172.000000, Mean reward (over 100 episodes) = 123.880000
Episode 1941: Reward = 98.000000, Mean reward (over 100 episodes) = 123.700000
Episode 1942: Reward = 77.000000, Mean reward (over 100 episodes) = 122.780000
Episode 1943: Reward = 136.000000, Mean reward (over 100 episodes) = 123.700000
Episode 1944: Reward = 200.000000, Mean reward (over 100 episodes) = 125.200000
Episode 1945: Reward = 200.000000, Mean reward (over 100 episodes) = 126.720000
Episode 1946: Reward = 81.000000, Mean reward (over 100 episodes) = 125.530000
Episode 1947: Reward = 118.000000, Mean reward (over 100 episodes) = 126.130000
Episode 1948: Reward = 125.000000, Mean reward (over 100 episodes) = 126.790000
Episode 1949: Reward = 157.000000, Mean reward (over 100 episodes) = 127.310000
Episode 1950: Reward = 88.000000, Mean reward (over 100 episodes) = 126.540000
Episode 1951: Reward = 117.000000, Mean reward (over 100 episodes) = 127.080000
Episode 1952: Reward = 97.000000, Mean rewar

Episode 2043: Reward = 200.000000, Mean reward (over 100 episodes) = 131.000000
Episode 2044: Reward = 85.000000, Mean reward (over 100 episodes) = 129.850000
Episode 2045: Reward = 123.000000, Mean reward (over 100 episodes) = 129.080000
Episode 2046: Reward = 200.000000, Mean reward (over 100 episodes) = 130.270000
Episode 2047: Reward = 120.000000, Mean reward (over 100 episodes) = 130.290000
Episode 2048: Reward = 122.000000, Mean reward (over 100 episodes) = 130.260000
Episode 2049: Reward = 200.000000, Mean reward (over 100 episodes) = 130.690000
Episode 2050: Reward = 90.000000, Mean reward (over 100 episodes) = 130.710000
Episode 2051: Reward = 78.000000, Mean reward (over 100 episodes) = 130.320000
Episode 2052: Reward = 126.000000, Mean reward (over 100 episodes) = 130.610000
Episode 2053: Reward = 132.000000, Mean reward (over 100 episodes) = 130.440000
Episode 2054: Reward = 164.000000, Mean reward (over 100 episodes) = 130.400000
Episode 2055: Reward = 142.000000, Mean rew

Episode 2146: Reward = 60.000000, Mean reward (over 100 episodes) = 158.160000
Episode 2147: Reward = 98.000000, Mean reward (over 100 episodes) = 157.940000
Episode 2148: Reward = 146.000000, Mean reward (over 100 episodes) = 158.180000
Episode 2149: Reward = 127.000000, Mean reward (over 100 episodes) = 157.450000
Episode 2150: Reward = 80.000000, Mean reward (over 100 episodes) = 157.350000
Episode 2151: Reward = 153.000000, Mean reward (over 100 episodes) = 158.100000
Episode 2152: Reward = 200.000000, Mean reward (over 100 episodes) = 158.840000
Episode 2153: Reward = 151.000000, Mean reward (over 100 episodes) = 159.030000
Episode 2154: Reward = 75.000000, Mean reward (over 100 episodes) = 158.140000
Episode 2155: Reward = 200.000000, Mean reward (over 100 episodes) = 158.720000
Episode 2156: Reward = 172.000000, Mean reward (over 100 episodes) = 158.440000
Episode 2157: Reward = 104.000000, Mean reward (over 100 episodes) = 157.480000
Episode 2158: Reward = 127.000000, Mean rewa

Episode 2249: Reward = 98.000000, Mean reward (over 100 episodes) = 149.940000
Episode 2250: Reward = 168.000000, Mean reward (over 100 episodes) = 150.820000
Episode 2251: Reward = 135.000000, Mean reward (over 100 episodes) = 150.640000
Episode 2252: Reward = 157.000000, Mean reward (over 100 episodes) = 150.210000
Episode 2253: Reward = 200.000000, Mean reward (over 100 episodes) = 150.700000
Episode 2254: Reward = 128.000000, Mean reward (over 100 episodes) = 151.230000
Episode 2255: Reward = 136.000000, Mean reward (over 100 episodes) = 150.590000
Episode 2256: Reward = 120.000000, Mean reward (over 100 episodes) = 150.070000
Episode 2257: Reward = 200.000000, Mean reward (over 100 episodes) = 151.030000
Episode 2258: Reward = 73.000000, Mean reward (over 100 episodes) = 150.490000
Episode 2259: Reward = 152.000000, Mean reward (over 100 episodes) = 151.030000
Episode 2260: Reward = 200.000000, Mean reward (over 100 episodes) = 151.870000
Episode 2261: Reward = 200.000000, Mean re

Episode 2353: Reward = 104.000000, Mean reward (over 100 episodes) = 165.770000
Episode 2354: Reward = 163.000000, Mean reward (over 100 episodes) = 166.120000
Episode 2355: Reward = 200.000000, Mean reward (over 100 episodes) = 166.760000
Episode 2356: Reward = 200.000000, Mean reward (over 100 episodes) = 167.560000
Episode 2357: Reward = 200.000000, Mean reward (over 100 episodes) = 167.560000
Episode 2358: Reward = 151.000000, Mean reward (over 100 episodes) = 168.340000
Episode 2359: Reward = 141.000000, Mean reward (over 100 episodes) = 168.230000
Episode 2360: Reward = 172.000000, Mean reward (over 100 episodes) = 167.950000
Episode 2361: Reward = 141.000000, Mean reward (over 100 episodes) = 167.360000
Episode 2362: Reward = 171.000000, Mean reward (over 100 episodes) = 167.460000
Episode 2363: Reward = 145.000000, Mean reward (over 100 episodes) = 167.620000
Episode 2364: Reward = 200.000000, Mean reward (over 100 episodes) = 167.910000
Episode 2365: Reward = 124.000000, Mean 

Episode 2457: Reward = 142.000000, Mean reward (over 100 episodes) = 164.650000
Episode 2458: Reward = 146.000000, Mean reward (over 100 episodes) = 164.600000
Episode 2459: Reward = 104.000000, Mean reward (over 100 episodes) = 164.230000
Episode 2460: Reward = 140.000000, Mean reward (over 100 episodes) = 163.910000
Episode 2461: Reward = 130.000000, Mean reward (over 100 episodes) = 163.800000
Episode 2462: Reward = 200.000000, Mean reward (over 100 episodes) = 164.090000
Episode 2463: Reward = 115.000000, Mean reward (over 100 episodes) = 163.790000
Episode 2464: Reward = 200.000000, Mean reward (over 100 episodes) = 163.790000
Episode 2465: Reward = 200.000000, Mean reward (over 100 episodes) = 164.550000
Episode 2466: Reward = 112.000000, Mean reward (over 100 episodes) = 163.670000
Episode 2467: Reward = 115.000000, Mean reward (over 100 episodes) = 163.250000
Episode 2468: Reward = 200.000000, Mean reward (over 100 episodes) = 163.250000
Episode 2469: Reward = 200.000000, Mean 

Episode 2560: Reward = 200.000000, Mean reward (over 100 episodes) = 163.550000
Episode 2561: Reward = 200.000000, Mean reward (over 100 episodes) = 164.250000
Episode 2562: Reward = 200.000000, Mean reward (over 100 episodes) = 164.250000
Episode 2563: Reward = 158.000000, Mean reward (over 100 episodes) = 164.680000
Episode 2564: Reward = 200.000000, Mean reward (over 100 episodes) = 164.680000
Episode 2565: Reward = 108.000000, Mean reward (over 100 episodes) = 163.760000
Episode 2566: Reward = 155.000000, Mean reward (over 100 episodes) = 164.190000
Episode 2567: Reward = 143.000000, Mean reward (over 100 episodes) = 164.470000
Episode 2568: Reward = 200.000000, Mean reward (over 100 episodes) = 164.470000
Episode 2569: Reward = 200.000000, Mean reward (over 100 episodes) = 164.470000
Episode 2570: Reward = 118.000000, Mean reward (over 100 episodes) = 163.650000
Episode 2571: Reward = 200.000000, Mean reward (over 100 episodes) = 164.350000
Episode 2572: Reward = 200.000000, Mean 

Episode 2663: Reward = 200.000000, Mean reward (over 100 episodes) = 166.440000
Episode 2664: Reward = 200.000000, Mean reward (over 100 episodes) = 166.440000
Episode 2665: Reward = 200.000000, Mean reward (over 100 episodes) = 167.360000
Episode 2666: Reward = 200.000000, Mean reward (over 100 episodes) = 167.810000
Episode 2667: Reward = 200.000000, Mean reward (over 100 episodes) = 168.380000
Episode 2668: Reward = 159.000000, Mean reward (over 100 episodes) = 167.970000
Episode 2669: Reward = 200.000000, Mean reward (over 100 episodes) = 167.970000
Episode 2670: Reward = 144.000000, Mean reward (over 100 episodes) = 168.230000
Episode 2671: Reward = 120.000000, Mean reward (over 100 episodes) = 167.430000
Episode 2672: Reward = 172.000000, Mean reward (over 100 episodes) = 167.150000
Episode 2673: Reward = 143.000000, Mean reward (over 100 episodes) = 166.580000
Episode 2674: Reward = 151.000000, Mean reward (over 100 episodes) = 166.670000
Episode 2675: Reward = 175.000000, Mean 

Episode 2766: Reward = 200.000000, Mean reward (over 100 episodes) = 171.210000
Episode 2767: Reward = 161.000000, Mean reward (over 100 episodes) = 170.820000
Episode 2768: Reward = 111.000000, Mean reward (over 100 episodes) = 170.340000
Episode 2769: Reward = 200.000000, Mean reward (over 100 episodes) = 170.340000
Episode 2770: Reward = 105.000000, Mean reward (over 100 episodes) = 169.950000
Episode 2771: Reward = 200.000000, Mean reward (over 100 episodes) = 170.750000
Episode 2772: Reward = 161.000000, Mean reward (over 100 episodes) = 170.640000
Episode 2773: Reward = 200.000000, Mean reward (over 100 episodes) = 171.210000
Episode 2774: Reward = 200.000000, Mean reward (over 100 episodes) = 171.700000
Episode 2775: Reward = 200.000000, Mean reward (over 100 episodes) = 171.950000
Episode 2776: Reward = 200.000000, Mean reward (over 100 episodes) = 171.950000
Episode 2777: Reward = 123.000000, Mean reward (over 100 episodes) = 171.180000
Episode 2778: Reward = 143.000000, Mean 

Episode 2869: Reward = 159.000000, Mean reward (over 100 episodes) = 170.250000
Episode 2870: Reward = 133.000000, Mean reward (over 100 episodes) = 170.530000
Episode 2871: Reward = 138.000000, Mean reward (over 100 episodes) = 169.910000
Episode 2872: Reward = 200.000000, Mean reward (over 100 episodes) = 170.300000
Episode 2873: Reward = 162.000000, Mean reward (over 100 episodes) = 169.920000
Episode 2874: Reward = 200.000000, Mean reward (over 100 episodes) = 169.920000
Episode 2875: Reward = 163.000000, Mean reward (over 100 episodes) = 169.550000
Episode 2876: Reward = 147.000000, Mean reward (over 100 episodes) = 169.020000
Episode 2877: Reward = 200.000000, Mean reward (over 100 episodes) = 169.790000
Episode 2878: Reward = 133.000000, Mean reward (over 100 episodes) = 169.690000
Episode 2879: Reward = 200.000000, Mean reward (over 100 episodes) = 169.690000
Episode 2880: Reward = 127.000000, Mean reward (over 100 episodes) = 169.420000
Episode 2881: Reward = 200.000000, Mean 

Episode 2973: Reward = 200.000000, Mean reward (over 100 episodes) = 172.460000
Episode 2974: Reward = 200.000000, Mean reward (over 100 episodes) = 172.460000
Episode 2975: Reward = 200.000000, Mean reward (over 100 episodes) = 172.830000
Episode 2976: Reward = 148.000000, Mean reward (over 100 episodes) = 172.840000
Episode 2977: Reward = 122.000000, Mean reward (over 100 episodes) = 172.060000
Episode 2978: Reward = 152.000000, Mean reward (over 100 episodes) = 172.250000
Episode 2979: Reward = 120.000000, Mean reward (over 100 episodes) = 171.450000
Episode 2980: Reward = 200.000000, Mean reward (over 100 episodes) = 172.180000
Episode 2981: Reward = 137.000000, Mean reward (over 100 episodes) = 171.550000
Episode 2982: Reward = 162.000000, Mean reward (over 100 episodes) = 171.170000
Episode 2983: Reward = 200.000000, Mean reward (over 100 episodes) = 171.420000
Episode 2984: Reward = 145.000000, Mean reward (over 100 episodes) = 171.590000
Episode 2985: Reward = 200.000000, Mean 

Episode 3077: Reward = 117.000000, Mean reward (over 100 episodes) = 157.930000
Episode 3078: Reward = 107.000000, Mean reward (over 100 episodes) = 157.480000
Episode 3079: Reward = 200.000000, Mean reward (over 100 episodes) = 158.280000
Episode 3080: Reward = 162.000000, Mean reward (over 100 episodes) = 157.900000
Episode 3081: Reward = 116.000000, Mean reward (over 100 episodes) = 157.690000
Episode 3082: Reward = 200.000000, Mean reward (over 100 episodes) = 158.070000
Episode 3083: Reward = 200.000000, Mean reward (over 100 episodes) = 158.070000
Episode 3084: Reward = 200.000000, Mean reward (over 100 episodes) = 158.620000
Episode 3085: Reward = 200.000000, Mean reward (over 100 episodes) = 158.620000
Episode 3086: Reward = 166.000000, Mean reward (over 100 episodes) = 158.280000
Episode 3087: Reward = 167.000000, Mean reward (over 100 episodes) = 157.950000
Episode 3088: Reward = 101.000000, Mean reward (over 100 episodes) = 156.960000
Episode 3089: Reward = 200.000000, Mean 

Episode 3181: Reward = 141.000000, Mean reward (over 100 episodes) = 149.670000
Episode 3182: Reward = 94.000000, Mean reward (over 100 episodes) = 148.610000
Episode 3183: Reward = 107.000000, Mean reward (over 100 episodes) = 147.680000
Episode 3184: Reward = 114.000000, Mean reward (over 100 episodes) = 146.820000
Episode 3185: Reward = 98.000000, Mean reward (over 100 episodes) = 145.800000
Episode 3186: Reward = 144.000000, Mean reward (over 100 episodes) = 145.580000
Episode 3187: Reward = 160.000000, Mean reward (over 100 episodes) = 145.510000
Episode 3188: Reward = 139.000000, Mean reward (over 100 episodes) = 145.890000
Episode 3189: Reward = 148.000000, Mean reward (over 100 episodes) = 145.370000
Episode 3190: Reward = 110.000000, Mean reward (over 100 episodes) = 144.470000
Episode 3191: Reward = 118.000000, Mean reward (over 100 episodes) = 144.550000
Episode 3192: Reward = 107.000000, Mean reward (over 100 episodes) = 143.860000
Episode 3193: Reward = 162.000000, Mean re

Episode 3284: Reward = 141.000000, Mean reward (over 100 episodes) = 159.740000
Episode 3285: Reward = 121.000000, Mean reward (over 100 episodes) = 159.970000
Episode 3286: Reward = 200.000000, Mean reward (over 100 episodes) = 160.530000
Episode 3287: Reward = 200.000000, Mean reward (over 100 episodes) = 160.930000
Episode 3288: Reward = 141.000000, Mean reward (over 100 episodes) = 160.950000
Episode 3289: Reward = 173.000000, Mean reward (over 100 episodes) = 161.200000
Episode 3290: Reward = 200.000000, Mean reward (over 100 episodes) = 162.100000
Episode 3291: Reward = 126.000000, Mean reward (over 100 episodes) = 162.180000
Episode 3292: Reward = 136.000000, Mean reward (over 100 episodes) = 162.470000
Episode 3293: Reward = 200.000000, Mean reward (over 100 episodes) = 162.850000
Episode 3294: Reward = 120.000000, Mean reward (over 100 episodes) = 162.920000
Episode 3295: Reward = 117.000000, Mean reward (over 100 episodes) = 162.090000
Episode 3296: Reward = 200.000000, Mean 

Episode 3388: Reward = 138.000000, Mean reward (over 100 episodes) = 168.000000
Episode 3389: Reward = 200.000000, Mean reward (over 100 episodes) = 168.270000
Episode 3390: Reward = 173.000000, Mean reward (over 100 episodes) = 168.000000
Episode 3391: Reward = 123.000000, Mean reward (over 100 episodes) = 167.970000
Episode 3392: Reward = 134.000000, Mean reward (over 100 episodes) = 167.950000
Episode 3393: Reward = 200.000000, Mean reward (over 100 episodes) = 167.950000
Episode 3394: Reward = 200.000000, Mean reward (over 100 episodes) = 168.750000
Episode 3395: Reward = 155.000000, Mean reward (over 100 episodes) = 169.130000
Episode 3396: Reward = 157.000000, Mean reward (over 100 episodes) = 168.700000
Episode 3397: Reward = 125.000000, Mean reward (over 100 episodes) = 167.950000
Episode 3398: Reward = 148.000000, Mean reward (over 100 episodes) = 167.430000
Episode 3399: Reward = 200.000000, Mean reward (over 100 episodes) = 168.110000
Episode 3400: Reward = 200.000000, Mean 

Episode 3492: Reward = 200.000000, Mean reward (over 100 episodes) = 172.010000
Episode 3493: Reward = 200.000000, Mean reward (over 100 episodes) = 172.010000
Episode 3494: Reward = 200.000000, Mean reward (over 100 episodes) = 172.010000
Episode 3495: Reward = 200.000000, Mean reward (over 100 episodes) = 172.460000
Episode 3496: Reward = 200.000000, Mean reward (over 100 episodes) = 172.890000
Episode 3497: Reward = 200.000000, Mean reward (over 100 episodes) = 173.640000
Episode 3498: Reward = 200.000000, Mean reward (over 100 episodes) = 174.160000
Episode 3499: Reward = 200.000000, Mean reward (over 100 episodes) = 174.160000
Episode 3500: Reward = 200.000000, Mean reward (over 100 episodes) = 174.160000
Episode 3501: Reward = 200.000000, Mean reward (over 100 episodes) = 174.160000
Episode 3502: Reward = 118.000000, Mean reward (over 100 episodes) = 174.250000
Episode 3503: Reward = 200.000000, Mean reward (over 100 episodes) = 174.570000
Episode 3504: Reward = 140.000000, Mean 

Episode 3596: Reward = 200.000000, Mean reward (over 100 episodes) = 170.950000
Episode 3597: Reward = 121.000000, Mean reward (over 100 episodes) = 170.160000
Episode 3598: Reward = 200.000000, Mean reward (over 100 episodes) = 170.160000
Episode 3599: Reward = 141.000000, Mean reward (over 100 episodes) = 169.570000
Episode 3600: Reward = 117.000000, Mean reward (over 100 episodes) = 168.740000
Episode 3601: Reward = 164.000000, Mean reward (over 100 episodes) = 168.380000
Episode 3602: Reward = 139.000000, Mean reward (over 100 episodes) = 168.590000
Episode 3603: Reward = 127.000000, Mean reward (over 100 episodes) = 167.860000
Episode 3604: Reward = 131.000000, Mean reward (over 100 episodes) = 167.770000
Episode 3605: Reward = 120.000000, Mean reward (over 100 episodes) = 166.970000
Episode 3606: Reward = 92.000000, Mean reward (over 100 episodes) = 165.890000
Episode 3607: Reward = 115.000000, Mean reward (over 100 episodes) = 165.740000
Episode 3608: Reward = 200.000000, Mean r

Episode 3700: Reward = 137.000000, Mean reward (over 100 episodes) = 160.910000
Episode 3701: Reward = 154.000000, Mean reward (over 100 episodes) = 160.810000
Episode 3702: Reward = 200.000000, Mean reward (over 100 episodes) = 161.420000
Episode 3703: Reward = 133.000000, Mean reward (over 100 episodes) = 161.480000
Episode 3704: Reward = 134.000000, Mean reward (over 100 episodes) = 161.510000
Episode 3705: Reward = 90.000000, Mean reward (over 100 episodes) = 161.210000
Episode 3706: Reward = 125.000000, Mean reward (over 100 episodes) = 161.540000
Episode 3707: Reward = 200.000000, Mean reward (over 100 episodes) = 162.390000
Episode 3708: Reward = 200.000000, Mean reward (over 100 episodes) = 162.390000
Episode 3709: Reward = 117.000000, Mean reward (over 100 episodes) = 162.230000
Episode 3710: Reward = 140.000000, Mean reward (over 100 episodes) = 162.030000
Episode 3711: Reward = 124.000000, Mean reward (over 100 episodes) = 161.650000
Episode 3712: Reward = 200.000000, Mean r

Episode 3804: Reward = 115.000000, Mean reward (over 100 episodes) = 162.050000
Episode 3805: Reward = 173.000000, Mean reward (over 100 episodes) = 162.880000
Episode 3806: Reward = 200.000000, Mean reward (over 100 episodes) = 163.630000
Episode 3807: Reward = 200.000000, Mean reward (over 100 episodes) = 163.630000
Episode 3808: Reward = 112.000000, Mean reward (over 100 episodes) = 162.750000
Episode 3809: Reward = 176.000000, Mean reward (over 100 episodes) = 163.340000
Episode 3810: Reward = 200.000000, Mean reward (over 100 episodes) = 163.940000
Episode 3811: Reward = 163.000000, Mean reward (over 100 episodes) = 164.330000
Episode 3812: Reward = 123.000000, Mean reward (over 100 episodes) = 163.560000
Episode 3813: Reward = 200.000000, Mean reward (over 100 episodes) = 163.560000
Episode 3814: Reward = 153.000000, Mean reward (over 100 episodes) = 163.090000
Episode 3815: Reward = 174.000000, Mean reward (over 100 episodes) = 163.810000
Episode 3816: Reward = 200.000000, Mean 

Episode 3908: Reward = 200.000000, Mean reward (over 100 episodes) = 169.910000
Episode 3909: Reward = 200.000000, Mean reward (over 100 episodes) = 170.150000
Episode 3910: Reward = 200.000000, Mean reward (over 100 episodes) = 170.150000
Episode 3911: Reward = 200.000000, Mean reward (over 100 episodes) = 170.520000
Episode 3912: Reward = 165.000000, Mean reward (over 100 episodes) = 170.940000
Episode 3913: Reward = 138.000000, Mean reward (over 100 episodes) = 170.320000
Episode 3914: Reward = 200.000000, Mean reward (over 100 episodes) = 170.790000
Episode 3915: Reward = 159.000000, Mean reward (over 100 episodes) = 170.640000
Episode 3916: Reward = 200.000000, Mean reward (over 100 episodes) = 170.640000
Episode 3917: Reward = 117.000000, Mean reward (over 100 episodes) = 169.810000
Episode 3918: Reward = 158.000000, Mean reward (over 100 episodes) = 169.390000
Episode 3919: Reward = 175.000000, Mean reward (over 100 episodes) = 170.050000
Episode 3920: Reward = 200.000000, Mean 

Episode 4011: Reward = 200.000000, Mean reward (over 100 episodes) = 168.100000
Episode 4012: Reward = 112.000000, Mean reward (over 100 episodes) = 167.570000
Episode 4013: Reward = 169.000000, Mean reward (over 100 episodes) = 167.880000
Episode 4014: Reward = 166.000000, Mean reward (over 100 episodes) = 167.540000
Episode 4015: Reward = 200.000000, Mean reward (over 100 episodes) = 167.950000
Episode 4016: Reward = 173.000000, Mean reward (over 100 episodes) = 167.680000
Episode 4017: Reward = 200.000000, Mean reward (over 100 episodes) = 168.510000
Episode 4018: Reward = 143.000000, Mean reward (over 100 episodes) = 168.360000
Episode 4019: Reward = 200.000000, Mean reward (over 100 episodes) = 168.610000
Episode 4020: Reward = 200.000000, Mean reward (over 100 episodes) = 168.610000
Episode 4021: Reward = 142.000000, Mean reward (over 100 episodes) = 168.030000
Episode 4022: Reward = 200.000000, Mean reward (over 100 episodes) = 168.630000
Episode 4023: Reward = 155.000000, Mean 

Episode 4115: Reward = 200.000000, Mean reward (over 100 episodes) = 169.290000
Episode 4116: Reward = 200.000000, Mean reward (over 100 episodes) = 169.560000
Episode 4117: Reward = 174.000000, Mean reward (over 100 episodes) = 169.300000
Episode 4118: Reward = 200.000000, Mean reward (over 100 episodes) = 169.870000
Episode 4119: Reward = 200.000000, Mean reward (over 100 episodes) = 169.870000
Episode 4120: Reward = 149.000000, Mean reward (over 100 episodes) = 169.360000
Episode 4121: Reward = 200.000000, Mean reward (over 100 episodes) = 169.940000
Episode 4122: Reward = 200.000000, Mean reward (over 100 episodes) = 169.940000
Episode 4123: Reward = 200.000000, Mean reward (over 100 episodes) = 170.390000
Episode 4124: Reward = 200.000000, Mean reward (over 100 episodes) = 171.250000
Episode 4125: Reward = 200.000000, Mean reward (over 100 episodes) = 171.250000
Episode 4126: Reward = 200.000000, Mean reward (over 100 episodes) = 171.250000
Episode 4127: Reward = 200.000000, Mean 

Episode 4218: Reward = 163.000000, Mean reward (over 100 episodes) = 179.660000
Episode 4219: Reward = 200.000000, Mean reward (over 100 episodes) = 179.660000
Episode 4220: Reward = 160.000000, Mean reward (over 100 episodes) = 179.770000
Episode 4221: Reward = 137.000000, Mean reward (over 100 episodes) = 179.140000
Episode 4222: Reward = 200.000000, Mean reward (over 100 episodes) = 179.140000
Episode 4223: Reward = 200.000000, Mean reward (over 100 episodes) = 179.140000
Episode 4224: Reward = 163.000000, Mean reward (over 100 episodes) = 178.770000
Episode 4225: Reward = 153.000000, Mean reward (over 100 episodes) = 178.300000
Episode 4226: Reward = 200.000000, Mean reward (over 100 episodes) = 178.300000
Episode 4227: Reward = 200.000000, Mean reward (over 100 episodes) = 178.300000
Episode 4228: Reward = 200.000000, Mean reward (over 100 episodes) = 178.580000
Episode 4229: Reward = 166.000000, Mean reward (over 100 episodes) = 178.490000
Episode 4230: Reward = 200.000000, Mean 

Episode 4321: Reward = 200.000000, Mean reward (over 100 episodes) = 182.430000
Episode 4322: Reward = 175.000000, Mean reward (over 100 episodes) = 182.180000
Episode 4323: Reward = 200.000000, Mean reward (over 100 episodes) = 182.180000
Episode 4324: Reward = 168.000000, Mean reward (over 100 episodes) = 182.230000
Episode 4325: Reward = 200.000000, Mean reward (over 100 episodes) = 182.700000
Episode 4326: Reward = 176.000000, Mean reward (over 100 episodes) = 182.460000
Episode 4327: Reward = 134.000000, Mean reward (over 100 episodes) = 181.800000
Episode 4328: Reward = 200.000000, Mean reward (over 100 episodes) = 181.800000
Episode 4329: Reward = 200.000000, Mean reward (over 100 episodes) = 182.140000
Episode 4330: Reward = 130.000000, Mean reward (over 100 episodes) = 181.440000
Episode 4331: Reward = 165.000000, Mean reward (over 100 episodes) = 181.750000
Episode 4332: Reward = 200.000000, Mean reward (over 100 episodes) = 182.070000
Episode 4333: Reward = 200.000000, Mean 

Episode 4425: Reward = 200.000000, Mean reward (over 100 episodes) = 179.650000
Episode 4426: Reward = 157.000000, Mean reward (over 100 episodes) = 179.460000
Episode 4427: Reward = 155.000000, Mean reward (over 100 episodes) = 179.670000
Episode 4428: Reward = 200.000000, Mean reward (over 100 episodes) = 179.670000
Episode 4429: Reward = 200.000000, Mean reward (over 100 episodes) = 179.670000
Episode 4430: Reward = 151.000000, Mean reward (over 100 episodes) = 179.880000
Episode 4431: Reward = 133.000000, Mean reward (over 100 episodes) = 179.560000
Episode 4432: Reward = 200.000000, Mean reward (over 100 episodes) = 179.560000
Episode 4433: Reward = 178.000000, Mean reward (over 100 episodes) = 179.340000
Episode 4434: Reward = 200.000000, Mean reward (over 100 episodes) = 179.340000
Episode 4435: Reward = 200.000000, Mean reward (over 100 episodes) = 179.340000
Episode 4436: Reward = 159.000000, Mean reward (over 100 episodes) = 179.530000
Episode 4437: Reward = 200.000000, Mean 

Episode 4528: Reward = 117.000000, Mean reward (over 100 episodes) = 177.810000
Episode 4529: Reward = 200.000000, Mean reward (over 100 episodes) = 177.810000
Episode 4530: Reward = 200.000000, Mean reward (over 100 episodes) = 178.300000
Episode 4531: Reward = 151.000000, Mean reward (over 100 episodes) = 178.480000
Episode 4532: Reward = 200.000000, Mean reward (over 100 episodes) = 178.480000
Episode 4533: Reward = 135.000000, Mean reward (over 100 episodes) = 178.050000
Episode 4534: Reward = 200.000000, Mean reward (over 100 episodes) = 178.050000
Episode 4535: Reward = 138.000000, Mean reward (over 100 episodes) = 177.430000
Episode 4536: Reward = 151.000000, Mean reward (over 100 episodes) = 177.350000
Episode 4537: Reward = 200.000000, Mean reward (over 100 episodes) = 177.350000
Episode 4538: Reward = 162.000000, Mean reward (over 100 episodes) = 176.970000
Episode 4539: Reward = 200.000000, Mean reward (over 100 episodes) = 177.370000
Episode 4540: Reward = 133.000000, Mean 

Episode 4632: Reward = 200.000000, Mean reward (over 100 episodes) = 170.140000
Episode 4633: Reward = 139.000000, Mean reward (over 100 episodes) = 170.180000
Episode 4634: Reward = 96.000000, Mean reward (over 100 episodes) = 169.140000
Episode 4635: Reward = 134.000000, Mean reward (over 100 episodes) = 169.100000
Episode 4636: Reward = 200.000000, Mean reward (over 100 episodes) = 169.590000
Episode 4637: Reward = 128.000000, Mean reward (over 100 episodes) = 168.870000
Episode 4638: Reward = 156.000000, Mean reward (over 100 episodes) = 168.810000
Episode 4639: Reward = 158.000000, Mean reward (over 100 episodes) = 168.390000
Episode 4640: Reward = 165.000000, Mean reward (over 100 episodes) = 168.710000
Episode 4641: Reward = 106.000000, Mean reward (over 100 episodes) = 167.770000
Episode 4642: Reward = 169.000000, Mean reward (over 100 episodes) = 167.780000
Episode 4643: Reward = 200.000000, Mean reward (over 100 episodes) = 168.080000
Episode 4644: Reward = 161.000000, Mean r

Episode 4736: Reward = 178.000000, Mean reward (over 100 episodes) = 169.070000
Episode 4737: Reward = 151.000000, Mean reward (over 100 episodes) = 169.300000
Episode 4738: Reward = 200.000000, Mean reward (over 100 episodes) = 169.740000
Episode 4739: Reward = 200.000000, Mean reward (over 100 episodes) = 170.160000
Episode 4740: Reward = 200.000000, Mean reward (over 100 episodes) = 170.510000
Episode 4741: Reward = 144.000000, Mean reward (over 100 episodes) = 170.890000
Episode 4742: Reward = 178.000000, Mean reward (over 100 episodes) = 170.980000
Episode 4743: Reward = 200.000000, Mean reward (over 100 episodes) = 170.980000
Episode 4744: Reward = 200.000000, Mean reward (over 100 episodes) = 171.370000
Episode 4745: Reward = 131.000000, Mean reward (over 100 episodes) = 171.230000
Episode 4746: Reward = 200.000000, Mean reward (over 100 episodes) = 171.730000
Episode 4747: Reward = 200.000000, Mean reward (over 100 episodes) = 172.320000
Episode 4748: Reward = 200.000000, Mean 

Episode 4839: Reward = 130.000000, Mean reward (over 100 episodes) = 176.870000
Episode 4840: Reward = 200.000000, Mean reward (over 100 episodes) = 176.870000
Episode 4841: Reward = 200.000000, Mean reward (over 100 episodes) = 177.430000
Episode 4842: Reward = 171.000000, Mean reward (over 100 episodes) = 177.360000
Episode 4843: Reward = 200.000000, Mean reward (over 100 episodes) = 177.360000
Episode 4844: Reward = 200.000000, Mean reward (over 100 episodes) = 177.360000
Episode 4845: Reward = 200.000000, Mean reward (over 100 episodes) = 178.050000
Episode 4846: Reward = 124.000000, Mean reward (over 100 episodes) = 177.290000
Episode 4847: Reward = 200.000000, Mean reward (over 100 episodes) = 177.290000
Episode 4848: Reward = 173.000000, Mean reward (over 100 episodes) = 177.020000
Episode 4849: Reward = 143.000000, Mean reward (over 100 episodes) = 176.450000
Episode 4850: Reward = 200.000000, Mean reward (over 100 episodes) = 176.450000
Episode 4851: Reward = 113.000000, Mean 

Episode 4943: Reward = 200.000000, Mean reward (over 100 episodes) = 177.680000
Episode 4944: Reward = 119.000000, Mean reward (over 100 episodes) = 176.870000
Episode 4945: Reward = 200.000000, Mean reward (over 100 episodes) = 176.870000
Episode 4946: Reward = 133.000000, Mean reward (over 100 episodes) = 176.960000
Episode 4947: Reward = 158.000000, Mean reward (over 100 episodes) = 176.540000
Episode 4948: Reward = 200.000000, Mean reward (over 100 episodes) = 176.810000
Episode 4949: Reward = 168.000000, Mean reward (over 100 episodes) = 177.060000
Episode 4950: Reward = 174.000000, Mean reward (over 100 episodes) = 176.800000
Episode 4951: Reward = 200.000000, Mean reward (over 100 episodes) = 177.670000
Episode 4952: Reward = 124.000000, Mean reward (over 100 episodes) = 176.910000
Episode 4953: Reward = 134.000000, Mean reward (over 100 episodes) = 176.650000
Episode 4954: Reward = 123.000000, Mean reward (over 100 episodes) = 175.880000
Episode 4955: Reward = 148.000000, Mean 

Episode 5047: Reward = 176.000000, Mean reward (over 100 episodes) = 168.930000
Episode 5048: Reward = 123.000000, Mean reward (over 100 episodes) = 168.160000
Episode 5049: Reward = 200.000000, Mean reward (over 100 episodes) = 168.480000
Episode 5050: Reward = 200.000000, Mean reward (over 100 episodes) = 168.740000
Episode 5051: Reward = 110.000000, Mean reward (over 100 episodes) = 167.840000
Episode 5052: Reward = 119.000000, Mean reward (over 100 episodes) = 167.790000
Episode 5053: Reward = 140.000000, Mean reward (over 100 episodes) = 167.850000
Episode 5054: Reward = 135.000000, Mean reward (over 100 episodes) = 167.970000
Episode 5055: Reward = 200.000000, Mean reward (over 100 episodes) = 168.490000
Episode 5056: Reward = 165.000000, Mean reward (over 100 episodes) = 168.140000
Episode 5057: Reward = 144.000000, Mean reward (over 100 episodes) = 167.580000
Episode 5058: Reward = 153.000000, Mean reward (over 100 episodes) = 167.830000
Episode 5059: Reward = 153.000000, Mean 

Episode 5150: Reward = 200.000000, Mean reward (over 100 episodes) = 164.000000
Episode 5151: Reward = 107.000000, Mean reward (over 100 episodes) = 163.970000
Episode 5152: Reward = 127.000000, Mean reward (over 100 episodes) = 164.050000
Episode 5153: Reward = 125.000000, Mean reward (over 100 episodes) = 163.900000
Episode 5154: Reward = 200.000000, Mean reward (over 100 episodes) = 164.550000
Episode 5155: Reward = 130.000000, Mean reward (over 100 episodes) = 163.850000
Episode 5156: Reward = 200.000000, Mean reward (over 100 episodes) = 164.200000
Episode 5157: Reward = 200.000000, Mean reward (over 100 episodes) = 164.760000
Episode 5158: Reward = 200.000000, Mean reward (over 100 episodes) = 165.230000
Episode 5159: Reward = 140.000000, Mean reward (over 100 episodes) = 165.100000
Episode 5160: Reward = 169.000000, Mean reward (over 100 episodes) = 164.790000
Episode 5161: Reward = 154.000000, Mean reward (over 100 episodes) = 165.020000
Episode 5162: Reward = 200.000000, Mean 

Episode 5253: Reward = 176.000000, Mean reward (over 100 episodes) = 161.850000
Episode 5254: Reward = 150.000000, Mean reward (over 100 episodes) = 161.350000
Episode 5255: Reward = 120.000000, Mean reward (over 100 episodes) = 161.250000
Episode 5256: Reward = 200.000000, Mean reward (over 100 episodes) = 161.250000
Episode 5257: Reward = 175.000000, Mean reward (over 100 episodes) = 161.000000
Episode 5258: Reward = 200.000000, Mean reward (over 100 episodes) = 161.000000
Episode 5259: Reward = 200.000000, Mean reward (over 100 episodes) = 161.600000
Episode 5260: Reward = 121.000000, Mean reward (over 100 episodes) = 161.120000
Episode 5261: Reward = 172.000000, Mean reward (over 100 episodes) = 161.300000
Episode 5262: Reward = 121.000000, Mean reward (over 100 episodes) = 160.510000
Episode 5263: Reward = 124.000000, Mean reward (over 100 episodes) = 159.750000
Episode 5264: Reward = 132.000000, Mean reward (over 100 episodes) = 159.490000
Episode 5265: Reward = 200.000000, Mean 

Episode 5357: Reward = 134.000000, Mean reward (over 100 episodes) = 162.790000
Episode 5358: Reward = 162.000000, Mean reward (over 100 episodes) = 162.410000
Episode 5359: Reward = 125.000000, Mean reward (over 100 episodes) = 161.660000
Episode 5360: Reward = 146.000000, Mean reward (over 100 episodes) = 161.910000
Episode 5361: Reward = 146.000000, Mean reward (over 100 episodes) = 161.650000
Episode 5362: Reward = 168.000000, Mean reward (over 100 episodes) = 162.120000
Episode 5363: Reward = 154.000000, Mean reward (over 100 episodes) = 162.420000
Episode 5364: Reward = 162.000000, Mean reward (over 100 episodes) = 162.720000
Episode 5365: Reward = 161.000000, Mean reward (over 100 episodes) = 162.330000
Episode 5366: Reward = 200.000000, Mean reward (over 100 episodes) = 162.730000


In [None]:
import matplotlib.pyplot as plt

def plot(x, y, name):
    fig, ax = plt.subplots()
    ax.plot(x, y)
    
    ax.set(xlabel='Episode', ylabel='Reward', title=name)
    ax.grid()

    fig.savefig("%s.png" % name)
    plt.show()
    
plot(range(episode), plot_history_episode_rewards, 'REINFORCE')

In [None]:
while True:
    do_rollout(env=env, policy=policy, render=True)