In [1]:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime
import matplotlib.pyplot as plt
import math

In [2]:
# Selects the architecture built by q_network and the matching branches of the
# training loop; the visible code handles "Bootstrap" and "DQN".
model_name = "Bootstrap"

In [3]:
# Register a CartPole variant under the id 'CartPole-v3' with a 2000-step
# episode cap.
# NOTE(review): gym presumably rejects registering the same id twice, so
# re-running this cell in a live kernel may fail - confirm for the installed gym.
gym.envs.register(
    id='CartPole-v3',
    entry_point='gym.envs.classic_control:CartPoleEnv',
    max_episode_steps=2000,
    # NOTE(review): the episode rewards logged by the training loop are
    # positive, so a negative threshold looks copied from another environment;
    # nothing in the visible cells reads it - confirm the intended value.
    reward_threshold=-110.0,
)

In [4]:
# Instantiate the environment registered above.
env = gym.make('CartPole-v3')
# Number of discrete actions; used as the width of every Q-value output layer
# and as the range of the random action drawn in epsilon_greedy.
n_outputs = env.action_space.n

[2019-01-01 15:22:57,197] Making new env: CartPole-v3


In [5]:
tf.reset_default_graph()


def _trainable_variables_by_name(scope):
    """Map scope-relative names to the trainable variables found under ``scope``."""
    prefix_len = len(scope.name)
    return {
        v.name[prefix_len:]: v
        for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)
    }


def q_network(X, name_scope, model_name, n_heads=10):
    """Build a fully connected Q-network inside the variable scope ``name_scope``.

    Two architectures are supported, selected by ``model_name``:

    * ``"Bootstrap"``: one shared 24-unit ReLU layer followed by ``n_heads``
      independent heads. Each head lives in its own ``heads_<i>`` sub-scope and
      consists of a 24-unit ReLU layer plus a linear layer with ``n_outputs``
      units.
    * ``"DQN"``: two 24-unit ReLU layers followed by one linear layer with
      ``n_outputs`` units. A histogram summary is attached to each layer.

    The output width ``n_outputs`` is read from the module-level variable of
    that name.

    Args:
        X: input tensor fed to the first fully connected layer.
        name_scope: name of the variable scope the layers are created in.
        model_name: ``"Bootstrap"`` or ``"DQN"``.
        n_heads: number of heads built for ``"Bootstrap"``; ignored for
            ``"DQN"``. Defaults to 10, the value that used to be hard-coded.

    Returns:
        A ``(variables, output)`` tuple. ``variables`` maps every trainable
        variable found under the scope to itself, keyed by its name with the
        scope prefix removed. ``output`` is a list with one tensor per head for
        ``"Bootstrap"`` and a single tensor for ``"DQN"``.

    Raises:
        ValueError: if ``model_name`` is not one of the supported names.
            Previously the function fell through and returned ``None``, which
            only surfaced later as an unpacking error at the call site.
    """
    if model_name not in ("Bootstrap", "DQN"):
        raise ValueError(
            "Unsupported model_name %r; expected 'Bootstrap' or 'DQN'" % (model_name,)
        )

    initializer = tf.contrib.layers.variance_scaling_initializer()

    with tf.variable_scope(name_scope) as scope:
        # Both architectures start with the same first hidden layer. Layers are
        # created in the same order as before so their auto-generated variable
        # names do not change.
        fc1 = fully_connected(X, num_outputs=24, activation_fn=tf.nn.relu, weights_initializer=initializer)

        if model_name == "Bootstrap":
            output = []
            for i in range(n_heads):
                with tf.variable_scope("heads_" + str(i)):
                    fc2 = fully_connected(fc1, num_outputs=24, activation_fn=tf.nn.relu, weights_initializer=initializer)
                    head_output = fully_connected(fc2, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
                    output.append(head_output)
        else:  # "DQN"
            tf.summary.histogram('fc1', fc1)

            fc2 = fully_connected(fc1, num_outputs=24, activation_fn=tf.nn.relu, weights_initializer=initializer)
            tf.summary.histogram('fc2', fc2)

            output = fully_connected(fc2, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
            tf.summary.histogram('output', output)

    # Collected after the scope is fully built so that every layer is included.
    return _trainable_variables_by_name(scope), output

In [6]:
# Exploration schedule: the probability of a random action starts at eps_max
# and decays exponentially towards eps_min as epsilon_greedy is called.
eps_min = 0.05
eps_max = 1.0
eps_decay = 200
steps_done = 0  # number of epsilon_greedy calls made so far


def epsilon_greedy(action):
    """Return either a random action or the supplied greedy ``action``.

    The exploration probability is
    ``eps_min + (eps_max - eps_min) * exp(-steps_done / eps_decay)``, evaluated
    with the current value of the module-level counter ``steps_done``; the
    counter is then incremented, whichever branch is taken.

    Args:
        action: the greedy action. A NumPy array of length one is unwrapped to
            its single element; anything else is returned unchanged.

    Returns:
        An integer drawn uniformly from ``range(n_outputs)`` when exploring,
        otherwise the (possibly unwrapped) ``action``.
    """
    global steps_done

    decay = math.exp(-1. * steps_done / eps_decay)
    eps_threshold = eps_min + (eps_max - eps_min) * decay
    steps_done += 1

    explore = np.random.rand() < eps_threshold
    if explore:
        return np.random.randint(n_outputs)

    # Greedy branch: unwrap a single-element array so callers get a scalar.
    if type(action) is np.ndarray and len(action) == 1:
        return action[0]
    return action

In [7]:
# Experience replay buffer. Because the deque is bounded, appending to a full
# buffer discards the oldest stored transition.
buffer_len = 10000
exp_buffer = deque(maxlen=buffer_len)

In [8]:
def sample_memories(batch_size):
    """Draw up to ``batch_size`` distinct transitions from ``exp_buffer``.

    Transitions are picked uniformly at random without replacement. When the
    buffer holds fewer than ``batch_size`` transitions, all of them are
    returned in random order.

    Args:
        batch_size: maximum number of transitions to return.

    Returns:
        Five 1-D object arrays holding, in order, the five fields of each
        sampled transition (the training loop stores them as
        ``[obs, action, next_obs, reward, done]``).
    """
    perm_batch = np.random.permutation(len(exp_buffer))[:batch_size]
    # Gather only the sampled rows instead of converting the whole buffer on
    # every call.
    batch = [exp_buffer[i] for i in perm_batch]
    # dtype=object is required: each transition mixes arrays with scalars, and
    # recent NumPy releases refuse to build such a ragged array implicitly.
    mem = np.array(batch, dtype=object)
    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]

In [9]:
# Training hyper-parameters and step counters.
num_episodes = 20  # NOTE(review): not read by the visible training loop, which iterates over range(50000)
batch_size = 128  # transitions sampled per update; the training loop also compares global_step against it before training
input_shape = (None, 4,)  # NOTE(review): same value as X_shape and not referenced in the visible cells
learning_rate = 0.001  # passed to tf.train.AdamOptimizer
X_shape = (None, 4,)  # shape of the observation placeholder X
discount_factor = 0.999  # multiplies the max next-state Q-value in the training target

global_step = 0  # incremented once per environment step by the training loop
copy_steps = 100  # the weight-copy op runs when (global_step + 1) % copy_steps == 0
steps_train = 4  # a gradient update runs when global_step % steps_train == 0
start_steps = 2000  # NOTE(review): not referenced in the visible cells

In [10]:
# Directory the TensorBoard summaries are written to.
logdir = 'logs'
# Start from an empty graph; everything defined in the following cells is
# added to this new default graph.
tf.reset_default_graph()

# Placeholder for a batch of environment observations (the network input).
X = tf.placeholder(tf.float32, shape=X_shape)

# Boolean flag fed by the training loop.
# NOTE(review): no layer built by q_network reads this placeholder, so it does
# not appear to change what the graph computes - confirm.
in_training_mode = tf.placeholder(tf.bool)

In [11]:
# we build our Q network, which takes the input X and generates Q values for all the actions in the state
mainQ, mainQ_outputs = q_network(X, 'mainQ',model_name)

# similarly we build our target Q network
targetQ, targetQ_outputs = q_network(X, 'targetQ',model_name)

In [12]:
# define the placeholder for our action values
X_action = tf.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)

[2019-01-01 15:22:58,794] From <ipython-input-12-2e17235474d5>:3: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [13]:
# One assign op per variable: overwrite each 'mainQ' variable with the value of
# the 'targetQ' variable that has the same scope-relative name.
copy_op = [
    tf.assign(main_var, targetQ[relative_name])
    for relative_name, main_var in mainQ.items()
]
# Single op that performs all of the assignments above when run.
copy_target_to_main = tf.group(*copy_op)

In [14]:
# define a placeholder for our output i.e action
y = tf.placeholder(tf.float32, shape=(None,1))

# now we calculate the loss which is the difference between actual value and predicted value
loss = tf.reduce_mean(tf.square(y - Q_action))

# we use adam optimizer for minimizing the loss
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

init = tf.global_variables_initializer()

loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

In [15]:
import pandas as pd

# Per-episode statistics, filled in by the loop below.
number_of_epochs = []      # number of environment steps of each finished episode
rewards_of_episode = []    # total reward of each finished episode

with tf.Session() as sess:
    init.run()
    # One iteration per episode.
    # NOTE(review): the episode count is hard-coded; the `num_episodes` setting
    # defined with the other hyper-parameters is not used here.
    for i in range(50000):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []

        # Step the environment until it reports the end of the episode.
        # NOTE(review): `done` is presumably also set when the 2000-step cap is
        # reached, in which case the target below treats that truncation as a
        # terminal state - confirm.
        while not done:

            #env.render()
            # Q-values of the current observation from the 'mainQ' network.
            if model_name == "Bootstrap":
                # A head is drawn uniformly at every step; the head count is
                # taken from the network instead of being hard-coded.
                # NOTE(review): Bootstrapped DQN usually keeps one head for a
                # whole episode - confirm per-step sampling is intended.
                head = int(np.random.uniform() * len(mainQ_outputs))
                actions = mainQ_outputs[head].eval(feed_dict={X:[obs], in_training_mode:False})
            elif model_name == "DQN":
                actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            # Greedy action for this observation.
            action = np.argmax(actions, axis=-1)
            actions_counter[str(action)] += 1

            # Possibly replace it with a random action (epsilon-greedy).
            action = epsilon_greedy(action)

            # Apply the action and observe the next state, reward and done flag.
            next_obs, reward, done, _ = env.step(action)

            # Store this transition in the replay buffer.
            exp_buffer.append([obs, action, next_obs, reward, done])

            # Every `steps_train` steps, once more than `batch_size` steps have
            # been taken, run one update on a batch sampled from the buffer.
            if global_step % steps_train == 0 and global_step > batch_size:

                # Sample a batch of stored transitions.
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # Turn the object arrays into plain lists of observations so
                # they can be fed to X.
                o_obs = [x for x in o_obs]
                o_next_obs = [x for x in o_next_obs]

                # Q-values of the next states from the 'mainQ' network (for
                # "Bootstrap", from the head drawn at this step).
                if model_name == "Bootstrap":
                    next_act = mainQ_outputs[head].eval(feed_dict={X:o_next_obs, in_training_mode:False})
                elif model_name == "DQN":
                    next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})

                # Training target: reward plus the discounted best next-state
                # value, with the bootstrap term dropped when done is set.
                y_batch = o_rew + discount_factor * np.max(next_act, axis = -1) * (1 - o_done)

                # Evaluate the merged summaries on this batch and log them.
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                file_writer.add_summary(mrg_summary, global_step)

                # Run one optimisation step and record the loss.
                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                episodic_loss.append(train_loss)

            # Periodically overwrite the 'mainQ' weights (used for acting and
            # for the targets) with the trained 'targetQ' weights.
            if (global_step + 1) % copy_steps == 0 and global_step > batch_size:
                copy_target_to_main.run()

            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward

        rewards_of_episode.append(episodic_reward)
        number_of_epochs.append(epoch)
        print('Epoch', epoch, 'Reward', episodic_reward,'Steps',steps_done)

        # Checkpoint the reward history every 100 episodes. This used to sit
        # inside the step loop, which rewrote the file on every single step of
        # those episodes.
        if i%100==0:
            # NOTE(review): the nested list produces multi-level column labels;
            # kept as is so the layout of the written file does not change.
            result = pd.DataFrame(data = rewards_of_episode, columns = [['Rewards']])
            result.to_excel('boot_cartpole_10heads_3.xlsx', sheet_name='Sheet1')

Epoch 22 Reward 22.0 Steps 22
Epoch 30 Reward 30.0 Steps 52
Epoch 36 Reward 36.0 Steps 88
Epoch 19 Reward 19.0 Steps 107
Epoch 14 Reward 14.0 Steps 121
Epoch 27 Reward 27.0 Steps 148
Epoch 26 Reward 26.0 Steps 174
Epoch 40 Reward 40.0 Steps 214
Epoch 14 Reward 14.0 Steps 228
Epoch 11 Reward 11.0 Steps 239
Epoch 21 Reward 21.0 Steps 260
Epoch 15 Reward 15.0 Steps 275
Epoch 10 Reward 10.0 Steps 285
Epoch 17 Reward 17.0 Steps 302
Epoch 15 Reward 15.0 Steps 317
Epoch 8 Reward 8.0 Steps 325
Epoch 13 Reward 13.0 Steps 338
Epoch 11 Reward 11.0 Steps 349
Epoch 13 Reward 13.0 Steps 362
Epoch 21 Reward 21.0 Steps 383
Epoch 11 Reward 11.0 Steps 394
Epoch 16 Reward 16.0 Steps 410
Epoch 10 Reward 10.0 Steps 420
Epoch 10 Reward 10.0 Steps 430
Epoch 10 Reward 10.0 Steps 440
Epoch 16 Reward 16.0 Steps 456
Epoch 9 Reward 9.0 Steps 465
Epoch 10 Reward 10.0 Steps 475
Epoch 10 Reward 10.0 Steps 485
Epoch 9 Reward 9.0 Steps 494
Epoch 8 Reward 8.0 Steps 502
Epoch 11 Reward 11.0 Steps 513
Epoch 10 Reward 10.

Epoch 170 Reward 170.0 Steps 7943
Epoch 201 Reward 201.0 Steps 8144
Epoch 165 Reward 165.0 Steps 8309
Epoch 165 Reward 165.0 Steps 8474
Epoch 208 Reward 208.0 Steps 8682
Epoch 145 Reward 145.0 Steps 8827
Epoch 157 Reward 157.0 Steps 8984
Epoch 158 Reward 158.0 Steps 9142
Epoch 198 Reward 198.0 Steps 9340
Epoch 193 Reward 193.0 Steps 9533
Epoch 178 Reward 178.0 Steps 9711
Epoch 195 Reward 195.0 Steps 9906
Epoch 173 Reward 173.0 Steps 10079
Epoch 204 Reward 204.0 Steps 10283
Epoch 254 Reward 254.0 Steps 10537
Epoch 171 Reward 171.0 Steps 10708
Epoch 260 Reward 260.0 Steps 10968
Epoch 158 Reward 158.0 Steps 11126
Epoch 173 Reward 173.0 Steps 11299
Epoch 181 Reward 181.0 Steps 11480
Epoch 233 Reward 233.0 Steps 11713
Epoch 185 Reward 185.0 Steps 11898
Epoch 254 Reward 254.0 Steps 12152
Epoch 178 Reward 178.0 Steps 12330
Epoch 309 Reward 309.0 Steps 12639
Epoch 188 Reward 188.0 Steps 12827
Epoch 343 Reward 343.0 Steps 13170
Epoch 247 Reward 247.0 Steps 13417
Epoch 201 Reward 201.0 Steps 136

Epoch 394 Reward 394.0 Steps 141502
Epoch 311 Reward 311.0 Steps 141813
Epoch 339 Reward 339.0 Steps 142152
Epoch 281 Reward 281.0 Steps 142433
Epoch 335 Reward 335.0 Steps 142768
Epoch 313 Reward 313.0 Steps 143081
Epoch 307 Reward 307.0 Steps 143388
Epoch 255 Reward 255.0 Steps 143643
Epoch 242 Reward 242.0 Steps 143885
Epoch 262 Reward 262.0 Steps 144147
Epoch 272 Reward 272.0 Steps 144419
Epoch 258 Reward 258.0 Steps 144677
Epoch 282 Reward 282.0 Steps 144959
Epoch 246 Reward 246.0 Steps 145205
Epoch 260 Reward 260.0 Steps 145465
Epoch 254 Reward 254.0 Steps 145719
Epoch 274 Reward 274.0 Steps 145993
Epoch 260 Reward 260.0 Steps 146253
Epoch 243 Reward 243.0 Steps 146496
Epoch 245 Reward 245.0 Steps 146741
Epoch 240 Reward 240.0 Steps 146981
Epoch 243 Reward 243.0 Steps 147224
Epoch 229 Reward 229.0 Steps 147453
Epoch 262 Reward 262.0 Steps 147715
Epoch 244 Reward 244.0 Steps 147959
Epoch 268 Reward 268.0 Steps 148227
Epoch 266 Reward 266.0 Steps 148493
Epoch 255 Reward 255.0 Steps

Epoch 82 Reward 82.0 Steps 251115
Epoch 10 Reward 10.0 Steps 251125
Epoch 11 Reward 11.0 Steps 251136
Epoch 11 Reward 11.0 Steps 251147
Epoch 9 Reward 9.0 Steps 251156
Epoch 9 Reward 9.0 Steps 251165
Epoch 9 Reward 9.0 Steps 251174
Epoch 11 Reward 11.0 Steps 251185
Epoch 11 Reward 11.0 Steps 251196
Epoch 9 Reward 9.0 Steps 251205
Epoch 9 Reward 9.0 Steps 251214
Epoch 9 Reward 9.0 Steps 251223
Epoch 10 Reward 10.0 Steps 251233
Epoch 10 Reward 10.0 Steps 251243
Epoch 11 Reward 11.0 Steps 251254
Epoch 11 Reward 11.0 Steps 251265
Epoch 9 Reward 9.0 Steps 251274
Epoch 10 Reward 10.0 Steps 251284
Epoch 8 Reward 8.0 Steps 251292
Epoch 12 Reward 12.0 Steps 251304
Epoch 10 Reward 10.0 Steps 251314
Epoch 10 Reward 10.0 Steps 251324
Epoch 10 Reward 10.0 Steps 251334
Epoch 10 Reward 10.0 Steps 251344
Epoch 9 Reward 9.0 Steps 251353
Epoch 11 Reward 11.0 Steps 251364
Epoch 8 Reward 8.0 Steps 251372
Epoch 12 Reward 12.0 Steps 251384
Epoch 10 Reward 10.0 Steps 251394
Epoch 10 Reward 10.0 Steps 251404


Epoch 103 Reward 103.0 Steps 282821
Epoch 165 Reward 165.0 Steps 282986
Epoch 127 Reward 127.0 Steps 283113
Epoch 113 Reward 113.0 Steps 283226
Epoch 125 Reward 125.0 Steps 283351
Epoch 119 Reward 119.0 Steps 283470
Epoch 123 Reward 123.0 Steps 283593
Epoch 171 Reward 171.0 Steps 283764
Epoch 114 Reward 114.0 Steps 283878
Epoch 144 Reward 144.0 Steps 284022
Epoch 121 Reward 121.0 Steps 284143
Epoch 146 Reward 146.0 Steps 284289
Epoch 110 Reward 110.0 Steps 284399
Epoch 189 Reward 189.0 Steps 284588
Epoch 118 Reward 118.0 Steps 284706
Epoch 117 Reward 117.0 Steps 284823
Epoch 115 Reward 115.0 Steps 284938
Epoch 105 Reward 105.0 Steps 285043
Epoch 122 Reward 122.0 Steps 285165
Epoch 158 Reward 158.0 Steps 285323
Epoch 109 Reward 109.0 Steps 285432
Epoch 132 Reward 132.0 Steps 285564
Epoch 130 Reward 130.0 Steps 285694
Epoch 108 Reward 108.0 Steps 285802
Epoch 121 Reward 121.0 Steps 285923
Epoch 191 Reward 191.0 Steps 286114
Epoch 108 Reward 108.0 Steps 286222
Epoch 136 Reward 136.0 Steps

Epoch 2000 Reward 2000.0 Steps 479049
Epoch 2000 Reward 2000.0 Steps 481049
Epoch 2000 Reward 2000.0 Steps 483049
Epoch 2000 Reward 2000.0 Steps 485049
Epoch 2000 Reward 2000.0 Steps 487049
Epoch 2000 Reward 2000.0 Steps 489049
Epoch 2000 Reward 2000.0 Steps 491049
Epoch 2000 Reward 2000.0 Steps 493049
Epoch 2000 Reward 2000.0 Steps 495049
Epoch 2000 Reward 2000.0 Steps 497049
Epoch 2000 Reward 2000.0 Steps 499049
Epoch 2000 Reward 2000.0 Steps 501049
Epoch 2000 Reward 2000.0 Steps 503049
Epoch 2000 Reward 2000.0 Steps 505049
Epoch 2000 Reward 2000.0 Steps 507049
Epoch 2000 Reward 2000.0 Steps 509049
Epoch 2000 Reward 2000.0 Steps 511049
Epoch 1416 Reward 1416.0 Steps 512465
Epoch 151 Reward 151.0 Steps 512616
Epoch 12 Reward 12.0 Steps 512628
Epoch 11 Reward 11.0 Steps 512639
Epoch 11 Reward 11.0 Steps 512650
Epoch 13 Reward 13.0 Steps 512663
Epoch 13 Reward 13.0 Steps 512676
Epoch 10 Reward 10.0 Steps 512686
Epoch 11 Reward 11.0 Steps 512697
Epoch 114 Reward 114.0 Steps 512811
Epoch 

Epoch 2000 Reward 2000.0 Steps 681720
Epoch 2000 Reward 2000.0 Steps 683720
Epoch 2000 Reward 2000.0 Steps 685720
Epoch 2000 Reward 2000.0 Steps 687720
Epoch 2000 Reward 2000.0 Steps 689720
Epoch 2000 Reward 2000.0 Steps 691720
Epoch 2000 Reward 2000.0 Steps 693720
Epoch 2000 Reward 2000.0 Steps 695720
Epoch 2000 Reward 2000.0 Steps 697720
Epoch 2000 Reward 2000.0 Steps 699720
Epoch 2000 Reward 2000.0 Steps 701720
Epoch 2000 Reward 2000.0 Steps 703720
Epoch 2000 Reward 2000.0 Steps 705720
Epoch 2000 Reward 2000.0 Steps 707720
Epoch 2000 Reward 2000.0 Steps 709720
Epoch 2000 Reward 2000.0 Steps 711720
Epoch 2000 Reward 2000.0 Steps 713720
Epoch 2000 Reward 2000.0 Steps 715720
Epoch 2000 Reward 2000.0 Steps 717720
Epoch 2000 Reward 2000.0 Steps 719720
Epoch 2000 Reward 2000.0 Steps 721720
Epoch 2000 Reward 2000.0 Steps 723720
Epoch 517 Reward 517.0 Steps 724237
Epoch 2000 Reward 2000.0 Steps 726237
Epoch 2000 Reward 2000.0 Steps 728237
Epoch 2000 Reward 2000.0 Steps 730237
Epoch 2000 Rew

Epoch 2000 Reward 2000.0 Steps 843132
Epoch 2000 Reward 2000.0 Steps 845132
Epoch 2000 Reward 2000.0 Steps 847132
Epoch 2000 Reward 2000.0 Steps 849132
Epoch 2000 Reward 2000.0 Steps 851132
Epoch 2000 Reward 2000.0 Steps 853132
Epoch 2000 Reward 2000.0 Steps 855132
Epoch 2000 Reward 2000.0 Steps 857132
Epoch 2000 Reward 2000.0 Steps 859132
Epoch 2000 Reward 2000.0 Steps 861132
Epoch 2000 Reward 2000.0 Steps 863132
Epoch 2000 Reward 2000.0 Steps 865132
Epoch 2000 Reward 2000.0 Steps 867132
Epoch 2000 Reward 2000.0 Steps 869132
Epoch 2000 Reward 2000.0 Steps 871132
Epoch 2000 Reward 2000.0 Steps 873132
Epoch 2000 Reward 2000.0 Steps 875132
Epoch 2000 Reward 2000.0 Steps 877132
Epoch 2000 Reward 2000.0 Steps 879132
Epoch 2000 Reward 2000.0 Steps 881132
Epoch 2000 Reward 2000.0 Steps 883132
Epoch 2000 Reward 2000.0 Steps 885132
Epoch 2000 Reward 2000.0 Steps 887132
Epoch 2000 Reward 2000.0 Steps 889132
Epoch 2000 Reward 2000.0 Steps 891132
Epoch 2000 Reward 2000.0 Steps 893132
Epoch 2000 R

Epoch 2000 Reward 2000.0 Steps 999571
Epoch 2000 Reward 2000.0 Steps 1001571
Epoch 2000 Reward 2000.0 Steps 1003571
Epoch 2000 Reward 2000.0 Steps 1005571
Epoch 2000 Reward 2000.0 Steps 1007571
Epoch 2000 Reward 2000.0 Steps 1009571
Epoch 2000 Reward 2000.0 Steps 1011571
Epoch 2000 Reward 2000.0 Steps 1013571
Epoch 2000 Reward 2000.0 Steps 1015571
Epoch 2000 Reward 2000.0 Steps 1017571
Epoch 2000 Reward 2000.0 Steps 1019571
Epoch 2000 Reward 2000.0 Steps 1021571
Epoch 2000 Reward 2000.0 Steps 1023571
Epoch 2000 Reward 2000.0 Steps 1025571
Epoch 2000 Reward 2000.0 Steps 1027571
Epoch 2000 Reward 2000.0 Steps 1029571
Epoch 2000 Reward 2000.0 Steps 1031571
Epoch 2000 Reward 2000.0 Steps 1033571
Epoch 2000 Reward 2000.0 Steps 1035571
Epoch 2000 Reward 2000.0 Steps 1037571
Epoch 2000 Reward 2000.0 Steps 1039571
Epoch 2000 Reward 2000.0 Steps 1041571
Epoch 2000 Reward 2000.0 Steps 1043571
Epoch 2000 Reward 2000.0 Steps 1045571
Epoch 2000 Reward 2000.0 Steps 1047571
Epoch 2000 Reward 2000.0 S

Epoch 200 Reward 200.0 Steps 1330857
Epoch 194 Reward 194.0 Steps 1331051
Epoch 203 Reward 203.0 Steps 1331254
Epoch 189 Reward 189.0 Steps 1331443
Epoch 206 Reward 206.0 Steps 1331649
Epoch 183 Reward 183.0 Steps 1331832
Epoch 185 Reward 185.0 Steps 1332017
Epoch 214 Reward 214.0 Steps 1332231
Epoch 214 Reward 214.0 Steps 1332445
Epoch 215 Reward 215.0 Steps 1332660
Epoch 183 Reward 183.0 Steps 1332843
Epoch 183 Reward 183.0 Steps 1333026
Epoch 199 Reward 199.0 Steps 1333225
Epoch 195 Reward 195.0 Steps 1333420
Epoch 182 Reward 182.0 Steps 1333602
Epoch 191 Reward 191.0 Steps 1333793
Epoch 203 Reward 203.0 Steps 1333996
Epoch 208 Reward 208.0 Steps 1334204
Epoch 190 Reward 190.0 Steps 1334394
Epoch 213 Reward 213.0 Steps 1334607
Epoch 204 Reward 204.0 Steps 1334811
Epoch 206 Reward 206.0 Steps 1335017
Epoch 212 Reward 212.0 Steps 1335229
Epoch 247 Reward 247.0 Steps 1335476
Epoch 200 Reward 200.0 Steps 1335676
Epoch 214 Reward 214.0 Steps 1335890
Epoch 232 Reward 232.0 Steps 1336122
E

Epoch 122 Reward 122.0 Steps 1574020
Epoch 113 Reward 113.0 Steps 1574133
Epoch 156 Reward 156.0 Steps 1574289
Epoch 1177 Reward 1177.0 Steps 1575466
Epoch 254 Reward 254.0 Steps 1575720
Epoch 231 Reward 231.0 Steps 1575951
Epoch 2000 Reward 2000.0 Steps 1577951
Epoch 1913 Reward 1913.0 Steps 1579864
Epoch 866 Reward 866.0 Steps 1580730
Epoch 519 Reward 519.0 Steps 1581249
Epoch 2000 Reward 2000.0 Steps 1583249
Epoch 1455 Reward 1455.0 Steps 1584704
Epoch 372 Reward 372.0 Steps 1585076
Epoch 315 Reward 315.0 Steps 1585391
Epoch 271 Reward 271.0 Steps 1585662
Epoch 278 Reward 278.0 Steps 1585940
Epoch 259 Reward 259.0 Steps 1586199
Epoch 289 Reward 289.0 Steps 1586488
Epoch 287 Reward 287.0 Steps 1586775
Epoch 276 Reward 276.0 Steps 1587051
Epoch 259 Reward 259.0 Steps 1587310
Epoch 222 Reward 222.0 Steps 1587532
Epoch 244 Reward 244.0 Steps 1587776
Epoch 190 Reward 190.0 Steps 1587966
Epoch 186 Reward 186.0 Steps 1588152
Epoch 208 Reward 208.0 Steps 1588360
Epoch 224 Reward 224.0 Steps

Epoch 2000 Reward 2000.0 Steps 1913288
Epoch 2000 Reward 2000.0 Steps 1915288
Epoch 2000 Reward 2000.0 Steps 1917288
Epoch 2000 Reward 2000.0 Steps 1919288
Epoch 2000 Reward 2000.0 Steps 1921288
Epoch 1349 Reward 1349.0 Steps 1922637
Epoch 2000 Reward 2000.0 Steps 1924637
Epoch 1575 Reward 1575.0 Steps 1926212
Epoch 2000 Reward 2000.0 Steps 1928212
Epoch 2000 Reward 2000.0 Steps 1930212
Epoch 2000 Reward 2000.0 Steps 1932212
Epoch 1820 Reward 1820.0 Steps 1934032
Epoch 728 Reward 728.0 Steps 1934760
Epoch 256 Reward 256.0 Steps 1935016
Epoch 11 Reward 11.0 Steps 1935027
Epoch 212 Reward 212.0 Steps 1935239
Epoch 221 Reward 221.0 Steps 1935460
Epoch 278 Reward 278.0 Steps 1935738
Epoch 19 Reward 19.0 Steps 1935757
Epoch 169 Reward 169.0 Steps 1935926
Epoch 11 Reward 11.0 Steps 1935937
Epoch 181 Reward 181.0 Steps 1936118
Epoch 166 Reward 166.0 Steps 1936284
Epoch 138 Reward 138.0 Steps 1936422
Epoch 141 Reward 141.0 Steps 1936563
Epoch 196 Reward 196.0 Steps 1936759
Epoch 212 Reward 212

Epoch 1254 Reward 1254.0 Steps 2250332
Epoch 437 Reward 437.0 Steps 2250769
Epoch 419 Reward 419.0 Steps 2251188
Epoch 423 Reward 423.0 Steps 2251611
Epoch 636 Reward 636.0 Steps 2252247
Epoch 2000 Reward 2000.0 Steps 2254247
Epoch 2000 Reward 2000.0 Steps 2256247
Epoch 2000 Reward 2000.0 Steps 2258247
Epoch 2000 Reward 2000.0 Steps 2260247
Epoch 2000 Reward 2000.0 Steps 2262247
Epoch 2000 Reward 2000.0 Steps 2264247
Epoch 2000 Reward 2000.0 Steps 2266247
Epoch 2000 Reward 2000.0 Steps 2268247
Epoch 2000 Reward 2000.0 Steps 2270247
Epoch 2000 Reward 2000.0 Steps 2272247
Epoch 2000 Reward 2000.0 Steps 2274247
Epoch 2000 Reward 2000.0 Steps 2276247
Epoch 2000 Reward 2000.0 Steps 2278247
Epoch 2000 Reward 2000.0 Steps 2280247
Epoch 2000 Reward 2000.0 Steps 2282247
Epoch 2000 Reward 2000.0 Steps 2284247
Epoch 2000 Reward 2000.0 Steps 2286247
Epoch 1651 Reward 1651.0 Steps 2287898
Epoch 2000 Reward 2000.0 Steps 2289898
Epoch 2000 Reward 2000.0 Steps 2291898
Epoch 2000 Reward 2000.0 Steps 22

Epoch 2000 Reward 2000.0 Steps 2549212
Epoch 2000 Reward 2000.0 Steps 2551212
Epoch 2000 Reward 2000.0 Steps 2553212
Epoch 2000 Reward 2000.0 Steps 2555212
Epoch 498 Reward 498.0 Steps 2555710
Epoch 116 Reward 116.0 Steps 2555826
Epoch 220 Reward 220.0 Steps 2556046
Epoch 2000 Reward 2000.0 Steps 2558046
Epoch 2000 Reward 2000.0 Steps 2560046
Epoch 2000 Reward 2000.0 Steps 2562046
Epoch 2000 Reward 2000.0 Steps 2564046
Epoch 2000 Reward 2000.0 Steps 2566046
Epoch 2000 Reward 2000.0 Steps 2568046
Epoch 2000 Reward 2000.0 Steps 2570046
Epoch 2000 Reward 2000.0 Steps 2572046
Epoch 2000 Reward 2000.0 Steps 2574046
Epoch 2000 Reward 2000.0 Steps 2576046
Epoch 2000 Reward 2000.0 Steps 2578046
Epoch 2000 Reward 2000.0 Steps 2580046
Epoch 2000 Reward 2000.0 Steps 2582046
Epoch 2000 Reward 2000.0 Steps 2584046
Epoch 2000 Reward 2000.0 Steps 2586046
Epoch 2000 Reward 2000.0 Steps 2588046
Epoch 2000 Reward 2000.0 Steps 2590046
Epoch 2000 Reward 2000.0 Steps 2592046
Epoch 2000 Reward 2000.0 Steps 

Epoch 2000 Reward 2000.0 Steps 2869477
Epoch 2000 Reward 2000.0 Steps 2871477
Epoch 2000 Reward 2000.0 Steps 2873477
Epoch 2000 Reward 2000.0 Steps 2875477
Epoch 2000 Reward 2000.0 Steps 2877477
Epoch 2000 Reward 2000.0 Steps 2879477
Epoch 2000 Reward 2000.0 Steps 2881477
Epoch 2000 Reward 2000.0 Steps 2883477
Epoch 2000 Reward 2000.0 Steps 2885477
Epoch 2000 Reward 2000.0 Steps 2887477
Epoch 2000 Reward 2000.0 Steps 2889477
Epoch 2000 Reward 2000.0 Steps 2891477
Epoch 2000 Reward 2000.0 Steps 2893477
Epoch 1109 Reward 1109.0 Steps 2894586
Epoch 1025 Reward 1025.0 Steps 2895611
Epoch 2000 Reward 2000.0 Steps 2897611
Epoch 2000 Reward 2000.0 Steps 2899611
Epoch 2000 Reward 2000.0 Steps 2901611
Epoch 2000 Reward 2000.0 Steps 2903611
Epoch 2000 Reward 2000.0 Steps 2905611
Epoch 2000 Reward 2000.0 Steps 2907611
Epoch 798 Reward 798.0 Steps 2908409
Epoch 210 Reward 210.0 Steps 2908619
Epoch 543 Reward 543.0 Steps 2909162
Epoch 267 Reward 267.0 Steps 2909429
Epoch 194 Reward 194.0 Steps 2909

Epoch 258 Reward 258.0 Steps 3154851
Epoch 373 Reward 373.0 Steps 3155224
Epoch 340 Reward 340.0 Steps 3155564
Epoch 281 Reward 281.0 Steps 3155845
Epoch 398 Reward 398.0 Steps 3156243
Epoch 389 Reward 389.0 Steps 3156632
Epoch 311 Reward 311.0 Steps 3156943
Epoch 309 Reward 309.0 Steps 3157252
Epoch 293 Reward 293.0 Steps 3157545
Epoch 310 Reward 310.0 Steps 3157855
Epoch 369 Reward 369.0 Steps 3158224
Epoch 508 Reward 508.0 Steps 3158732
Epoch 328 Reward 328.0 Steps 3159060
Epoch 266 Reward 266.0 Steps 3159326
Epoch 253 Reward 253.0 Steps 3159579
Epoch 336 Reward 336.0 Steps 3159915
Epoch 369 Reward 369.0 Steps 3160284
Epoch 330 Reward 330.0 Steps 3160614
Epoch 440 Reward 440.0 Steps 3161054
Epoch 567 Reward 567.0 Steps 3161621
Epoch 657 Reward 657.0 Steps 3162278
Epoch 596 Reward 596.0 Steps 3162874
Epoch 830 Reward 830.0 Steps 3163704
Epoch 1911 Reward 1911.0 Steps 3165615
Epoch 698 Reward 698.0 Steps 3166313
Epoch 901 Reward 901.0 Steps 3167214
Epoch 923 Reward 923.0 Steps 3168137

Epoch 2000 Reward 2000.0 Steps 3444662
Epoch 2000 Reward 2000.0 Steps 3446662
Epoch 2000 Reward 2000.0 Steps 3448662
Epoch 2000 Reward 2000.0 Steps 3450662
Epoch 2000 Reward 2000.0 Steps 3452662
Epoch 2000 Reward 2000.0 Steps 3454662
Epoch 2000 Reward 2000.0 Steps 3456662
Epoch 2000 Reward 2000.0 Steps 3458662
Epoch 2000 Reward 2000.0 Steps 3460662
Epoch 2000 Reward 2000.0 Steps 3462662
Epoch 2000 Reward 2000.0 Steps 3464662
Epoch 2000 Reward 2000.0 Steps 3466662
Epoch 2000 Reward 2000.0 Steps 3468662
Epoch 2000 Reward 2000.0 Steps 3470662
Epoch 2000 Reward 2000.0 Steps 3472662
Epoch 2000 Reward 2000.0 Steps 3474662
Epoch 2000 Reward 2000.0 Steps 3476662
Epoch 2000 Reward 2000.0 Steps 3478662
Epoch 1622 Reward 1622.0 Steps 3480284
Epoch 785 Reward 785.0 Steps 3481069
Epoch 218 Reward 218.0 Steps 3481287
Epoch 394 Reward 394.0 Steps 3481681
Epoch 362 Reward 362.0 Steps 3482043
Epoch 174 Reward 174.0 Steps 3482217
Epoch 175 Reward 175.0 Steps 3482392
Epoch 118 Reward 118.0 Steps 3482510


KeyboardInterrupt: 

In [None]:
N = 10  # window length, in episodes

# Prefix sums: cumsum[k] is the total reward of the first k episodes.
cumsum = [0]
for x in rewards_of_episode:
    cumsum.append(cumsum[-1] + x)

# Mean reward of every complete window of N consecutive episodes; empty when
# fewer than N episodes have been recorded.
moving_aves = [(cumsum[i] - cumsum[i - N]) / N for i in range(N, len(cumsum))]



In [None]:
# Smoothed learning curve. Entry k of moving_aves is the mean reward of the
# window of consecutive episodes that starts at episode k.
plt.plot(list(range(len(moving_aves))), moving_aves)
plt.xlabel('First episode of window')
plt.ylabel('Mean episode reward over window')
plt.title('Moving average of episode reward')
plt.show()

In [None]:
# Current exploration probability. No variable named `epsilon` is defined in
# this notebook, so the value is recomputed with the same schedule that
# epsilon_greedy applies to the current step counter.
eps_min + (eps_max - eps_min) * math.exp(-1. * steps_done / eps_decay)

In [None]:
# Raw learning curve: total reward of every recorded episode. The x values are
# derived from the plotted list itself so both axes always have equal length,
# even if training was interrupted between the two bookkeeping appends.
plt.plot(list(range(len(rewards_of_episode))), rewards_of_episode)
plt.xlabel('Episode')
plt.ylabel('Total episode reward')
plt.title('Reward per episode')
plt.show()