# Policy Gradient

In [1]:
import sys

import numpy as np
import tensorflow as tf

In [2]:
bandits = [0.2, 0, -0.2, -5]
num_bandits = len(bandits)

In [3]:
def pull_bandit(bandit):
    result = np.random.randn(1)
    return 1 if result > bandit else -1

In [4]:
tf.reset_default_graph()

weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights, axis=0)

In [5]:
reward_placeholder = tf.placeholder(tf.float32, shape=[1])
action_placeholder = tf.placeholder(tf.int32, shape=[1])
responsible_weight = tf.slice(weights, action_placeholder, size=[1])

In [6]:
# loss = log(policy) * advantage    
# policy is the learned weights while advantage is the reward-baseline
loss = -(tf.log(responsible_weight) * reward_placeholder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)

### Training the network

In [7]:
total_episodes = 1000
total_reward = np.zeros(num_bandits)
e = 0.1  # epsilon - chance of trying different action (E-Greedy Exploration)

In [8]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        # Epsilon greedy exploration 
        # choose random action - exploration or action from network - exploitation
        if np.random.randn(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        # get reward from picking one of the bandits
        reward = pull_bandit(bandits[action])
        # Update the network
        _, _resp_weight, _weights = sess.run([update, responsible_weight, weights],
                                             feed_dict={reward_placeholder: [reward],
                                                        action_placeholder: [action]})
        total_reward[action] += reward
        # Logging
        if i % 50 == 0:
            print(f'Running reward for the {num_bandits} bandits: {total_reward}')
        # increment i
        i += 1
print(f'The agent thinks bandit {np.argmax(_weights) + 1} is the most promising...')
if np.argmax(_weights) == np.argmax(-np.array(bandits)):
    print('...and it was right!')
else:
    print('...and it was wrong!')
        

Running reward for the 4 bandits: [ 1.  0.  0.  0.]
Running reward for the 4 bandits: [-2.  2.  5.  8.]
Running reward for the 4 bandits: [ -4.   1.   9.  33.]
Running reward for the 4 bandits: [ -7.  -3.   7.  66.]
Running reward for the 4 bandits: [-10.   2.   9.  96.]
Running reward for the 4 bandits: [ -11.    2.   11.  123.]
Running reward for the 4 bandits: [ -12.    4.    9.  146.]
Running reward for the 4 bandits: [ -14.    4.    5.  176.]
Running reward for the 4 bandits: [ -12.    8.    7.  206.]
Running reward for the 4 bandits: [ -16.   11.   10.  232.]
Running reward for the 4 bandits: [ -19.   12.   12.  264.]
Running reward for the 4 bandits: [ -23.   12.   12.  294.]
Running reward for the 4 bandits: [ -23.   13.    9.  326.]
Running reward for the 4 bandits: [ -20.   14.   11.  358.]
Running reward for the 4 bandits: [ -21.   10.   14.  386.]
Running reward for the 4 bandits: [ -21.   12.   17.  413.]
Running reward for the 4 bandits: [ -25.    7.   19.  444.]
Running 