In [1]:
import tensorflow as tf
tf.enable_eager_execution()

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import gym

from collections import deque

In [2]:
ENV_NAME = 'CartPole-v0'

env = gym.make(ENV_NAME)

In [3]:
n_obs_params = env.observation_space.shape[0]
n_acts = env.action_space.n

n_obs_params, n_acts

(4, 2)

In [4]:
net = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.nn.relu, input_shape=(n_obs_params, )),
    tf.keras.layers.Dense(n_acts, activation=tf.nn.softmax)
])

net.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                80        
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 34        
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________


Let's see the output of our network given an observation from environment

In [5]:
sample_obs = env.reset()
sample_obs = np.expand_dims(sample_obs, axis=0)

prob_logits = net(sample_obs)[0]
action_chosen = tf.argmax(prob_logits)

print(prob_logits)
print(action_chosen)

tf.Tensor([0.49435604 0.50564396], shape=(2,), dtype=float32)
tf.Tensor(1, shape=(), dtype=int64)


In [10]:
def play_episode(net, render=False):
    
    observations = []
    actions = []
    rewards = []
    
    obs = env.reset()
    done = False
    
    while not done:
        if render: env.render()
        
        observations.append(obs)
        obs = np.expand_dims(obs, axis=0)
        act = np.argmax(net(obs)[0].numpy())
        actions.append(act)
        next_obs, reward, done, info = env.step(act)
        rewards.append(reward)
        obs = next_obs
        
    return observations, actions, rewards

In [11]:
play_episode(net, render=True)

([array([ 0.01329557,  0.04096941, -0.01342608,  0.03574862]),
  array([ 0.01411495,  0.23628129, -0.0127111 , -0.26113995]),
  array([ 0.01884058,  0.43158236, -0.0179339 , -0.55780485]),
  array([ 0.02747223,  0.62695142, -0.02909   , -0.85608359]),
  array([ 0.04001126,  0.82245743, -0.04621167, -1.15776979]),
  array([ 0.0564604 ,  1.01815021, -0.06936707, -1.46457698]),
  array([ 0.07682341,  1.21404989, -0.09865861, -1.77809738]),
  array([ 0.10110441,  1.41013457, -0.13422055, -2.09975271]),
  array([ 0.1293071 ,  1.60632573, -0.17621561, -2.43073508])],
 [1, 1, 1, 1, 1, 1, 1, 1, 1],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])

In [12]:
def _convert_rewards_to_advantages(rewards):
    for i in range(-2, -len(rewards)-1, -1):
        rewards[i] += rewards[i+1]

In [13]:
test_list = [1, 2, 3, 4]
_convert_rewards_to_advantages(test_list)

assert test_list == [10, 9, 7, 4]

In [55]:
def train(net, optimizer, observations, actions, advantages, learning_rate=1e-3):
    
    observations = np.array(observations, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    advantages = np.array(advantages, dtype=np.float32)
    
    assert len(observations) == len(actions) == len(advantages)
    
    with tf.GradientTape() as t:
        
        probs_actions = net(observations)
        action_masks = tf.one_hot(actions, n_acts)
        
        log_pi = tf.log(tf.reduce_sum(action_masks * probs_actions, axis=1))
        loss = -tf.reduce_mean( log_pi * advantages )
    
    d_loss_d_w = t.gradient(loss, net.trainable_weights)
    optimizer.apply_gradients(zip(d_loss_d_w, net.trainable_weights))

In [58]:
n_episodes = 200
n_episodes_before_train = 5

observations = []
actions = []
rewards = []

optimizer = tf.train.AdamOptimizer()

for episode in range(n_episodes):
    ep_observations, ep_actions, ep_rewards = play_episode(net)
    observations += ep_observations
    actions += ep_actions
    rewards += ep_rewards
    
    print('Episode {}: Reward = {}'.format(episode, sum(ep_rewards)))
    
    if (episode + 1) % n_episodes_before_train == 0:
        train(net, optimizer, observations, actions, rewards)
        observations = []
        actions = []
        rewards = []

Episode 0: Reward = 9.0
Episode 1: Reward = 10.0
Episode 2: Reward = 9.0
Episode 3: Reward = 9.0
Episode 4: Reward = 8.0
[[-4.24525179e-02 -9.01468471e-03 -1.01846252e-02  2.99019758e-02]
 [-4.26328108e-02  1.86251834e-01 -9.58658569e-03 -2.65976816e-01]
 [-3.89077738e-02  3.81509274e-01 -1.49061223e-02 -5.61667979e-01]
 [-3.12775895e-02  5.76837242e-01 -2.61394810e-02 -8.59009564e-01]
 [-1.97408441e-02  7.72305310e-01 -4.33196723e-02 -1.15979564e+00]
 [-4.29473771e-03  9.67964053e-01 -6.65155873e-02 -1.46574044e+00]
 [ 1.50645431e-02  1.16383445e+00 -9.58303958e-02 -1.77843761e+00]
 [ 3.83412316e-02  1.35989583e+00 -1.31399140e-01 -2.09931111e+00]
 [ 6.55391514e-02  1.55607080e+00 -1.73385367e-01 -2.42955732e+00]
 [ 3.87207828e-02 -3.78581583e-02  1.14900535e-02 -3.40096317e-02]
 [ 3.79636176e-02  1.57097161e-01  1.08098602e-02 -3.23045284e-01]
 [ 4.11055610e-02  3.52063537e-01  4.34895465e-03 -6.12299740e-01]
 [ 4.81468327e-02  5.47124445e-01 -7.89703988e-03 -9.03609693e-01]
 [ 5.908

[[ 4.07275148e-02 -7.03627290e-03  4.35823426e-02  2.87965126e-03]
 [ 4.05867882e-02  1.87434420e-01  4.36399356e-02 -2.75740236e-01]
 [ 4.43354771e-02  3.81907463e-01  3.81251313e-02 -5.54346025e-01]
 [ 5.19736260e-02  5.76473892e-01  2.70382110e-02 -8.34777534e-01]
 [ 6.35031015e-02  7.71216273e-01  1.03426604e-02 -1.11883605e+00]
 [ 7.89274275e-02  9.66201007e-01 -1.20340604e-02 -1.40825677e+00]
 [ 9.82514471e-02  1.16147017e+00 -4.01991941e-02 -1.70467722e+00]
 [ 1.21480852e-01  1.35703099e+00 -7.42927417e-02 -2.00959682e+00]
 [ 1.48621470e-01  1.55284309e+00 -1.14484675e-01 -2.32432675e+00]
 [ 1.79678336e-01  1.74880338e+00 -1.60971209e-01 -2.64992809e+00]
 [ 1.82930194e-02 -2.99836583e-02  2.61992030e-02  7.91760441e-03]
 [ 1.76933445e-02  1.64752960e-01  2.63575558e-02 -2.76385397e-01]
 [ 2.09884048e-02  3.59489143e-01  2.08298471e-02 -5.60639977e-01]
 [ 2.81781871e-02  5.54312646e-01  9.61704832e-03 -8.46688330e-01]
 [ 3.92644405e-02  7.49302089e-01 -7.31671834e-03 -1.13633156e

[[-2.19881115e-03  3.09557449e-02  4.97127846e-02 -4.41963896e-02]
 [-1.57969631e-03  2.25330889e-01  4.88288552e-02 -3.20789367e-01]
 [ 2.92692147e-03  4.19724703e-01  4.24130708e-02 -5.97682536e-01]
 [ 1.13214152e-02  6.14228308e-01  3.04594189e-02 -8.76709878e-01]
 [ 2.36059818e-02  8.08923304e-01  1.29252216e-02 -1.15966320e+00]
 [ 3.97844464e-02  1.00387454e+00 -1.02680437e-02 -1.44826567e+00]
 [ 5.98619357e-02  1.19912112e+00 -3.92333567e-02 -1.74413896e+00]
 [ 8.38443637e-02  1.39466679e+00 -7.41161406e-02 -2.04876328e+00]
 [ 1.11737698e-01  1.59046602e+00 -1.15091406e-01 -2.36342597e+00]
 [ 1.43547013e-01  1.78640914e+00 -1.62359923e-01 -2.68915844e+00]
 [-3.65880392e-02  3.73180546e-02 -1.59093905e-02 -2.54954509e-02]
 [-3.58416773e-02  2.32664496e-01 -1.64193008e-02 -3.23155195e-01]
 [-3.11883856e-02  4.28016365e-01 -2.28824038e-02 -6.20970547e-01]
 [-2.26280596e-02  6.23450279e-01 -3.53018157e-02 -9.20771539e-01]
 [-1.01590538e-02  8.19031060e-01 -5.37172444e-02 -1.22433650e

[<tf.Tensor: id=12288, shape=(4, 16), dtype=float32, numpy=
array([[ 4.01949774e-06, -7.84844719e-03,  3.20916224e-05,
        -1.28671527e-02, -9.63218603e-03, -9.45875887e-03,
        -1.99029280e-04, -2.89555173e-05, -1.31379522e-03,
         1.13732077e-03, -6.19050697e-04,  5.01038230e-05,
        -9.82965250e-03,  4.37911553e-03,  1.34698948e-05,
        -3.72492141e-05],
       [ 5.92521337e-06, -1.48620531e-01,  7.73160082e-06,
        -2.33117148e-01, -1.82776064e-01, -1.77270174e-01,
         8.85382178e-05, -1.30883913e-04, -2.38917377e-02,
         2.13703811e-02, -1.91140361e-02, -1.05821549e-04,
        -1.83844537e-01,  5.06113805e-02,  3.21759326e-05,
         9.70137989e-07],
       [-2.23089523e-06,  9.32180975e-03,  1.34568145e-05,
         1.46985594e-02,  1.15492614e-02,  1.16993934e-02,
        -1.89315557e-04,  2.17583773e-04,  1.52923947e-03,
        -1.38226885e-03,  1.80546695e-03,  1.90086139e-04,
         1.20481001e-02, -4.86405101e-03, -3.58428442e-05,
   

Episode 50: Reward = 11.0
Episode 51: Reward = 10.0
Episode 52: Reward = 11.0
Episode 53: Reward = 9.0
Episode 54: Reward = 10.0
[[ 1.95802487e-02 -1.32624060e-02  4.81808968e-02  2.04859693e-02]
 [ 1.93150006e-02  1.81136653e-01  4.85906154e-02 -2.56614536e-01]
 [ 2.29377337e-02  3.75532418e-01  4.34583239e-02 -5.33583939e-01]
 [ 3.04483827e-02  5.70017099e-01  3.27866450e-02 -8.12262833e-01]
 [ 4.18487228e-02  7.64674962e-01  1.65413897e-02 -1.09445512e+00]
 [ 5.71422242e-02  9.59575176e-01 -5.34771383e-03 -1.38190258e+00]
 [ 7.63337240e-02  1.15476346e+00 -3.29857655e-02 -1.67625296e+00]
 [ 9.94289964e-02  1.35025215e+00 -6.65108263e-02 -1.97902262e+00]
 [ 1.26434043e-01  1.54600787e+00 -1.06091276e-01 -2.29154682e+00]
 [ 1.57354191e-01  1.74193668e+00 -1.51922211e-01 -2.61491990e+00]
 [ 1.92192927e-01  1.93786621e+00 -2.04220608e-01 -2.94992256e+00]
 [-4.51699421e-02 -2.82560140e-02  5.87151293e-03  3.46735641e-02]
 [-4.57350649e-02  1.66781247e-01  6.56498410e-03 -2.56151080e-01]


[<tf.Tensor: id=15656, shape=(4, 16), dtype=float32, numpy=
array([[-1.77367783e-05, -1.13651957e-02, -8.76720369e-05,
        -1.83841754e-02, -1.42789800e-02, -1.40063278e-02,
        -1.12867972e-03, -6.56873104e-04, -2.02692277e-03,
         1.40934542e-03, -1.41032308e-03,  1.76898669e-04,
        -1.39162438e-02,  5.15671493e-03, -1.01656718e-04,
        -2.70957738e-04],
       [ 4.33268724e-06, -1.45871848e-01,  2.14162537e-05,
        -2.27044046e-01, -1.78863123e-01, -1.72977805e-01,
         1.78598872e-04,  1.03941624e-04, -2.52939910e-02,
         1.83786992e-02, -2.31214799e-02, -4.40386393e-05,
        -1.79358482e-01,  5.34007177e-02,  1.60858544e-05,
        -1.62005541e-03],
       [ 8.61527496e-07,  8.22067540e-03,  4.25849066e-06,
         1.28376698e-02,  9.97439213e-03,  9.78061929e-03,
        -3.63738567e-04, -2.11689912e-04,  1.44744583e-03,
        -1.06356153e-03,  2.21245922e-03,  1.94344335e-04,
         1.06498431e-02, -2.85822386e-03, -3.27607922e-05,
   

tf.Tensor(0.47172785, shape=(), dtype=float32)
[<tf.Tensor: id=17364, shape=(4, 16), dtype=float32, numpy=
array([[-4.1431472e-06, -4.7511766e-03,  3.2544129e-05, -8.0566779e-03,
        -6.0488204e-03, -5.9374999e-03, -3.5926481e-05, -9.2356553e-05,
        -9.4466354e-04,  6.0801703e-04, -1.5793043e-03, -1.8393953e-04,
        -6.0747163e-03,  1.9422526e-04,  3.9301049e-05, -9.1421780e-06],
       [ 1.9325485e-06, -1.4201854e-01,  8.1278631e-07, -2.2072381e-01,
        -1.7397906e-01, -1.6857700e-01,  2.1162766e-04,  4.3079104e-05,
        -2.5563136e-02,  1.6709780e-02, -5.6047782e-02, -2.3045943e-05,
        -1.7472391e-01,  1.1348339e-04, -2.2491804e-08, -1.8473636e-04],
       [ 5.4840157e-06,  7.4912277e-03, -5.8555997e-06,  1.1944248e-02,
         9.2366561e-03,  9.1494992e-03, -1.0454138e-04,  1.2224630e-04,
         1.4135907e-03, -9.2627265e-04,  3.1583696e-03,  1.8610255e-04,
         9.2762019e-03, -5.5648346e-05, -5.6846984e-05, -5.1887939e-05],
       [-5.6391254e-06,  2

Episode 84: Reward = 9.0
[[ 4.38655447e-03 -4.95971479e-02  2.27806102e-02  2.34669410e-02]
 [ 3.39461141e-03  1.45190820e-01  2.32499503e-02 -2.61942297e-01]
 [ 6.29842794e-03  3.39973301e-01  1.80111043e-02 -5.47202170e-01]
 [ 1.30978944e-02  5.34837663e-01  7.06706010e-03 -8.34156275e-01]
 [ 2.37946473e-02  7.29862332e-01 -9.61606577e-03 -1.12460828e+00]
 [ 3.83918956e-02  9.25109029e-01 -3.21082324e-02 -1.42029190e+00]
 [ 5.68940751e-02  1.12061322e+00 -6.05140701e-02 -1.72283518e+00]
 [ 7.93063343e-02  1.31637335e+00 -9.49707776e-02 -2.03371835e+00]
 [ 1.05633803e-01  1.51233757e+00 -1.35645136e-01 -2.35421944e+00]
 [ 1.35880560e-01  1.70838761e+00 -1.82729527e-01 -2.68535066e+00]
 [-2.15843040e-02  4.89241183e-02  3.91098261e-02 -1.61906555e-02]
 [-2.06058212e-02  2.43463993e-01  3.87860164e-02 -2.96281964e-01]
 [-1.57365408e-02  4.38012183e-01  3.28603759e-02 -5.76484561e-01]
 [-6.97629806e-03  6.32658482e-01  2.13306844e-02 -8.58637154e-01]
 [ 5.67687163e-03  8.27483475e-01  4.

Episode 97: Reward = 10.0
Episode 98: Reward = 10.0
Episode 99: Reward = 10.0
[[ 2.96634752e-02 -2.60711554e-02  4.25288528e-02 -7.54027255e-03]
 [ 2.91420519e-02  1.68415889e-01  4.23780493e-02 -2.86507338e-01]
 [ 3.25103700e-02  3.62908661e-01  3.66479009e-02 -5.65529108e-01]
 [ 3.97685431e-02  5.57497799e-01  2.53373180e-02 -8.46444786e-01]
 [ 5.09185009e-02  7.52265096e-01  8.40842258e-03 -1.13105357e+00]
 [ 6.59638047e-02  9.47275937e-01 -1.42126493e-02 -1.42108750e+00]
 [ 8.49093199e-02  1.14257085e+00 -4.26343977e-02 -1.71817863e+00]
 [ 1.07760735e-01  1.33815467e+00 -7.69979730e-02 -2.02381849e+00]
 [ 1.34523824e-01  1.53398442e+00 -1.17474340e-01 -2.33930779e+00]
 [ 1.65203527e-01  1.72995293e+00 -1.64260492e-01 -2.66569281e+00]
 [-4.72286195e-02 -1.82234086e-02  4.47124988e-02 -3.06329615e-02]
 [-4.75930870e-02  1.76229775e-01  4.40998413e-02 -3.08880121e-01]
 [-4.40684892e-02  3.70696545e-01  3.79222371e-02 -5.87335527e-01]
 [-3.66545618e-02  5.65267444e-01  2.61755269e-02 -

tf.Tensor(
[-0.6253499  -0.5753206  -0.5257553  -0.47851312 -0.4335394  -0.39081365
 -0.35034806 -0.31218308 -0.27638093 -0.24301676 -0.6227563  -0.5875008
 -0.53745914 -0.48935032 -0.44326332 -0.39948967 -0.35804272 -0.3189637
 -0.28231594 -0.24817571 -0.63538295 -0.59369683 -0.5434177  -0.49543238
 -0.4496814  -0.40614104 -0.36482129 -0.3257621  -0.28902775 -0.25469843
 -0.60952413 -0.5850625  -0.5336345  -0.4845209  -0.43788224 -0.39369914
 -0.3519836  -0.31277406 -0.2761279  -0.6079574  -0.5602969  -0.5092914
 -0.46089303 -0.41506052 -0.3717837  -0.3310801  -0.2929899 ], shape=(47,), dtype=float32)
tf.Tensor(0.43385452, shape=(), dtype=float32)
[<tf.Tensor: id=23246, shape=(4, 16), dtype=float32, numpy=
array([[-2.22525159e-05, -5.56708686e-03,  6.56131087e-05,
        -8.56805407e-03, -6.79168245e-03, -6.55835913e-03,
        -5.57351857e-04, -1.24357102e-04, -1.11986801e-03,
         5.09195612e-04, -2.65337038e-03,  8.93869583e-06,
        -6.81939162e-03,  0.00000000e+00, -1.64

Episode 112: Reward = 10.0
Episode 113: Reward = 9.0
Episode 114: Reward = 10.0
[[ 2.41837967e-02 -3.31624597e-03 -4.11933148e-03 -4.06376123e-02]
 [ 2.41174717e-02  1.91864535e-01 -4.93208366e-03 -3.34617376e-01]
 [ 2.79547609e-02  3.87056321e-01 -1.16244312e-02 -6.28851533e-01]
 [ 3.56958881e-02  5.82338572e-01 -2.42014620e-02 -9.25172627e-01]
 [ 4.73426580e-02  7.77778864e-01 -4.27049138e-02 -1.22536182e+00]
 [ 6.28982335e-02  9.73423898e-01 -6.72121495e-02 -1.53111315e+00]
 [ 8.23667124e-02  1.16928864e+00 -9.78344157e-02 -1.84399235e+00]
 [ 1.05752490e-01  1.36534381e+00 -1.34714261e-01 -2.16538644e+00]
 [ 1.33059368e-01  1.56150115e+00 -1.78021982e-01 -2.49644279e+00]
 [ 3.18743289e-02  3.66925150e-02  1.28477067e-02  6.26118062e-03]
 [ 3.26081775e-02  2.31627882e-01  1.29729304e-02 -2.82340616e-01]
 [ 3.72407362e-02  4.26562399e-01  7.32611818e-03 -5.70903838e-01]
 [ 4.57719825e-02  6.21580839e-01 -4.09195898e-03 -8.61269832e-01]
 [ 5.82036003e-02  8.16758275e-01 -2.13173553e-02

Episode 120: Reward = 10.0
Episode 121: Reward = 10.0
Episode 122: Reward = 9.0
Episode 123: Reward = 9.0
Episode 124: Reward = 8.0
[[-4.41688150e-02  1.39986770e-02  3.75604001e-03  2.99069397e-02]
 [-4.38888445e-02  2.09066570e-01  4.35417891e-03 -2.61588544e-01]
 [-3.97075117e-02  4.04126078e-01 -8.77592247e-04 -5.52894950e-01]
 [-3.16249914e-02  5.99260330e-01 -1.19354911e-02 -8.45854223e-01]
 [-1.96397826e-02  7.94543087e-01 -2.88525764e-02 -1.14226639e+00]
 [-3.74892051e-03  9.90029991e-01 -5.16979061e-02 -1.44385624e+00]
 [ 1.60516798e-02  1.18574870e+00 -8.05750266e-02 -1.75223446e+00]
 [ 3.97666544e-02  1.38168728e+00 -1.15619719e-01 -2.06885219e+00]
 [ 6.74003959e-02  1.57777989e+00 -1.56996757e-01 -2.39494395e+00]
 [ 9.89559963e-02  1.77389085e+00 -2.04895645e-01 -2.73146009e+00]
 [ 1.69512574e-02 -1.48360301e-02  2.09846366e-02  2.85591800e-02]
 [ 1.66545361e-02  1.79978803e-01  2.15558186e-02 -2.57429689e-01]
 [ 2.02541128e-02  3.74786466e-01  1.64072253e-02 -5.43236434e-0

Episode 133: Reward = 10.0
Episode 134: Reward = 10.0
[[-7.11889938e-03 -1.31330984e-02 -4.26218435e-02  4.74677198e-02]
 [-7.38156121e-03  1.82573274e-01 -4.16724868e-02 -2.58352280e-01]
 [-3.73009569e-03  3.78264606e-01 -4.68395352e-02 -5.63882589e-01]
 [ 3.83519661e-03  5.74011385e-01 -5.81171848e-02 -8.70946527e-01]
 [ 1.53154247e-02  7.69873619e-01 -7.55361170e-02 -1.18132067e+00]
 [ 3.07128970e-02  9.65890348e-01 -9.91625339e-02 -1.49669385e+00]
 [ 5.00307046e-02  1.16206813e+00 -1.29096404e-01 -1.81862092e+00]
 [ 7.32720643e-02  1.35836756e+00 -1.65468827e-01 -2.14846873e+00]
 [ 1.00439414e-01  1.55468798e+00 -2.08438203e-01 -2.48735332e+00]
 [-4.80561139e-04  1.49382511e-02 -2.93492153e-02  2.48346273e-02]
 [-1.81796102e-04  2.10468531e-01 -2.88525242e-02 -2.76961923e-01]
 [ 4.02757479e-03  4.05989975e-01 -3.43917608e-02 -5.78603506e-01]
 [ 1.21473745e-02  6.01576626e-01 -4.59638312e-02 -8.81919146e-01]
 [ 2.41789073e-02  7.97291756e-01 -6.36022165e-02 -1.18869030e+00]
 [ 4.012

Episode 144: Reward = 10.0
[[ 5.0057848e-03  1.8337592e-03 -1.6698238e-02  2.9981708e-02]
 [ 5.0424603e-03  1.9719115e-01 -1.6098602e-02 -2.6792258e-01]
 [ 8.9862831e-03  3.9253911e-01 -2.1457054e-02 -5.6563932e-01]
 [ 1.6837064e-02  5.8795542e-01 -3.2769840e-02 -8.6500418e-01]
 [ 2.8596174e-02  7.8350770e-01 -5.0069924e-02 -1.1678078e+00]
 [ 4.4266328e-02  9.7924405e-01 -7.3426083e-02 -1.4757588e+00]
 [ 6.3851207e-02  1.1751822e+00 -1.0294125e-01 -1.7904420e+00]
 [ 8.7354854e-02  1.3712972e+00 -1.3875009e-01 -2.1132684e+00]
 [ 1.1478080e-01  1.5675064e+00 -1.8101546e-01 -2.4454155e+00]
 [-4.6332795e-02 -3.1554755e-02  3.8480062e-02 -1.8350979e-02]
 [-4.6963889e-02  1.6299483e-01  3.8113043e-02 -2.9864898e-01]
 [-4.3703996e-02  3.5755336e-01  3.2140061e-02 -5.7907230e-01]
 [-3.6552928e-02  5.5221051e-01  2.0558616e-02 -8.6145967e-01]
 [-2.5508717e-02  7.4704653e-01  3.3294221e-03 -1.1476082e+00]
 [-1.0567786e-02  9.4212484e-01 -1.9622741e-02 -1.4392452e+00]
 [ 8.2747117e-03  1.1374830e

Episode 157: Reward = 9.0
Episode 158: Reward = 9.0
Episode 159: Reward = 9.0
[[-2.25468562e-03 -4.01610993e-02  1.78097673e-02 -1.05451979e-02]
 [-3.05790757e-03  1.54700965e-01  1.75988637e-02 -2.97556162e-01]
 [ 3.61118655e-05  3.49567682e-01  1.16477404e-02 -5.84637165e-01]
 [ 7.02746538e-03  5.44524550e-01 -4.50024891e-05 -8.73628259e-01]
 [ 1.79179572e-02  7.39647090e-01 -1.75175685e-02 -1.16632533e+00]
 [ 3.27108987e-02  9.34992611e-01 -4.08440754e-02 -1.46444857e+00]
 [ 5.14107496e-02  1.13059032e+00 -7.01330453e-02 -1.76960528e+00]
 [ 7.40225613e-02  1.32643044e+00 -1.05525151e-01 -2.08324528e+00]
 [ 1.00551166e-01  1.52244937e+00 -1.47190064e-01 -2.40660524e+00]
 [ 1.31000161e-01  1.71851468e+00 -1.95322156e-01 -2.74064088e+00]
 [ 2.90601961e-02  1.15289716e-02 -3.97772864e-02 -4.50806990e-02]
 [ 2.92907748e-02  2.07198068e-01 -4.06788997e-02 -3.50043625e-01]
 [ 3.34347375e-02  4.02874231e-01 -4.76797745e-02 -6.55271351e-01]
 [ 4.14922200e-02  5.98626435e-01 -6.07852004e-02 -

Episode 165: Reward = 8.0
Episode 166: Reward = 10.0
Episode 167: Reward = 10.0
Episode 168: Reward = 8.0
Episode 169: Reward = 8.0
[[-3.29983910e-03 -1.53905358e-02 -4.79914434e-02  4.14060131e-02]
 [-3.60764959e-03  1.80385575e-01 -4.71633226e-02 -2.66024113e-01]
 [ 6.18680431e-08  3.76147836e-01 -5.24838045e-02 -5.73201835e-01]
 [ 7.52301840e-03  5.71964860e-01 -6.39478415e-02 -8.81946087e-01]
 [ 1.89623144e-02  7.67894387e-01 -8.15867633e-02 -1.19402754e+00]
 [ 3.43202017e-02  9.63972569e-01 -1.05467312e-01 -1.51112640e+00]
 [ 5.35996519e-02  1.16020226e+00 -1.35689840e-01 -1.83478522e+00]
 [ 7.68036991e-02  1.35653925e+00 -1.72385544e-01 -2.16635418e+00]
 [ 1.78218391e-02  3.95647362e-02  2.44119279e-02 -4.72750850e-02]
 [ 1.86131336e-02  2.34328285e-01  2.34664269e-02 -3.32156956e-01]
 [ 2.32996996e-02  4.29108500e-01  1.68232862e-02 -6.17348373e-01]
 [ 3.18818688e-02  6.23991430e-01  4.47632000e-03 -9.04685616e-01]
 [ 4.43616994e-02  8.19052458e-01 -1.36173917e-02 -1.19595814e+0

Episode 178: Reward = 9.0
Episode 179: Reward = 9.0
[[ 2.34529376e-02  1.90261956e-02 -4.15150560e-02  5.72989509e-03]
 [ 2.38334611e-02  2.14718178e-01 -4.14004587e-02 -2.99757093e-01]
 [ 2.81278249e-02  4.10405040e-01 -4.73956019e-02 -6.05204105e-01]
 [ 3.63359265e-02  6.06156647e-01 -5.94996810e-02 -9.12430823e-01]
 [ 4.84590605e-02  8.02030921e-01 -7.77482986e-02 -1.22320485e+00]
 [ 6.44996762e-02  9.98063445e-01 -1.02212399e-01 -1.53920043e+00]
 [ 8.44609439e-02  1.19425583e+00 -1.32996410e-01 -1.86195123e+00]
 [ 1.08346060e-01  1.39056158e+00 -1.70235425e-01 -2.19279528e+00]
 [ 1.35060987e-02 -4.03570496e-02  4.09620963e-02  4.34010774e-02]
 [ 1.26989577e-02  1.54154316e-01  4.18301187e-02 -2.36081734e-01]
 [ 1.57820452e-02  3.48654449e-01  3.71084847e-02 -5.15282273e-01]
 [ 2.27551330e-02  5.43234706e-01  2.68028397e-02 -7.96044290e-01]
 [ 3.36198285e-02  7.37978756e-01  1.08819539e-02 -1.08017647e+00]
 [ 4.83794026e-02  9.32955384e-01 -1.07215745e-02 -1.36942482e+00]
 [ 6.70385

Episode 188: Reward = 10.0
Episode 189: Reward = 10.0
[[ 3.33747305e-02 -2.33968574e-04 -1.18407765e-02 -6.78956509e-03]
 [ 3.33700515e-02  1.95055783e-01 -1.19765680e-02 -3.03184748e-01]
 [ 3.72711681e-02  3.90346348e-01 -1.80402640e-02 -5.99620640e-01]
 [ 4.50780950e-02  5.85716009e-01 -3.00326757e-02 -8.97930980e-01]
 [ 5.67924120e-02  7.81231880e-01 -4.79912944e-02 -1.19990087e+00]
 [ 7.24170506e-02  9.76940691e-01 -7.19893128e-02 -1.50723004e+00]
 [ 9.19558629e-02  1.17285788e+00 -1.02133915e-01 -1.82149136e+00]
 [ 1.15413025e-01  1.36895525e+00 -1.38563737e-01 -2.14407969e+00]
 [ 1.42792135e-01  1.56514573e+00 -1.81445330e-01 -2.47615242e+00]
 [-3.29085514e-02  8.12077243e-03  2.82765348e-02 -3.70812453e-02]
 [-3.27461362e-02  2.02826068e-01  2.75349114e-02 -3.20710242e-01]
 [-2.86896154e-02  3.97545278e-01  2.11207066e-02 -6.04584098e-01]
 [-2.07387097e-02  5.92365623e-01  9.02902335e-03 -8.90540361e-01]
 [-8.89139809e-03  7.87363887e-01 -8.78178421e-03 -1.18037140e+00]
 [ 6.855

[<tf.Tensor: id=39136, shape=(4, 16), dtype=float32, numpy=
array([[ 3.54586009e-05, -3.13247321e-03,  7.00092933e-05,
        -4.71402239e-03, -3.81041318e-03, -3.67950345e-03,
         4.43464902e-04,  2.68883479e-04, -7.87737779e-04,
         1.07961023e-04, -1.65778154e-03,  0.00000000e+00,
        -3.82206799e-03,  0.00000000e+00,  6.42915547e-05,
        -2.23204552e-04],
       [-3.46514535e-05, -1.06953315e-01, -6.84156548e-05,
        -1.60952806e-01, -1.30100474e-01, -1.25630796e-01,
        -4.33370296e-04, -2.62762886e-04, -2.68960502e-02,
         3.40384920e-03, -5.66023178e-02,  0.00000000e+00,
        -1.30498439e-01,  0.00000000e+00, -6.28280613e-05,
        -1.08129196e-02],
       [-3.35790464e-05,  5.19681489e-03, -6.62983002e-05,
         7.82062579e-03,  6.32152427e-03,  6.10434404e-03,
        -4.19958320e-04, -2.54630868e-04,  1.30686746e-03,
        -1.73385386e-04,  2.75028124e-03,  0.00000000e+00,
         6.34086225e-03,  0.00000000e+00, -6.08836235e-05,
   