# Policy Gradient

### Dependencies

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
bandits = [0.2, 0, -0.2, -5]
num_bandits = len(bandits)

In [3]:
def pull_bandit(bandit):
    result = np.random.randn(1)
    return 1 if result > bandit else -1

### The Model

In [4]:
tf.reset_default_graph()

weights = tf.Variable(tf.ones([num_bandits]))
chosen_action = tf.argmax(weights, axis=0)

In [5]:
reward_placeholder = tf.placeholder(tf.float32, shape=[1])
action_placeholder = tf.placeholder(tf.int32, shape=[1])
responsible_weight = tf.slice(weights, action_placeholder, size=[1])

### Loss function and optimizer

In [6]:
# loss = log(policy) * advantage    
# policy is the learned weights while advantage is the reward-baseline
loss = -(tf.log(responsible_weight) * reward_placeholder)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)

## Training the network

### Hyperparameters

In [7]:
total_episodes = 1000
total_reward = np.zeros(num_bandits)
e = 0.1  # epsilon - chance of trying different action (E-Greedy Exploration)

### Training episodes

In [8]:
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    i = 0
    while i < total_episodes:
        # Epsilon greedy exploration 
        # choose random action - exploration or action from network - exploitation
        if np.random.randn(1) < e:
            action = np.random.randint(num_bandits)
        else:
            action = sess.run(chosen_action)
        # get reward from picking one of the bandits
        reward = pull_bandit(bandits[action])
        # Update the network
        _, _resp_weight, _weights = sess.run([update, responsible_weight, weights],
                                             feed_dict={reward_placeholder: [reward],
                                                        action_placeholder: [action]})
        total_reward[action] += reward
        # Logging
        if i % 50 == 0:
            print(f'Bandit {np.argmax(_weights)+1} looks good: {total_reward}')
        # increment i
        i += 1

print(f'\n\nThe agent thinks bandit {np.argmax(_weights) + 1} is the most promising...')
if np.argmax(_weights) == np.argmax(-np.array(bandits)):
    print('...and it was right!')
else:
    print('...and it was wrong!')

Bandit 1 looks good: [ 0.  0. -1.  0.]
Bandit 4 looks good: [ -7.  -1.   2.  29.]
Bandit 4 looks good: [ -7.   0.   2.  66.]
Bandit 4 looks good: [ -4.  -1.  -1.  97.]
Bandit 4 looks good: [  -5.    0.   -1.  127.]
Bandit 4 looks good: [  -6.    0.    5.  162.]
Bandit 4 looks good: [  -1.    2.    6.  196.]
Bandit 4 looks good: [  -4.    2.    5.  228.]
Bandit 4 looks good: [  -4.    0.    7.  258.]
Bandit 4 looks good: [  -9.    3.   12.  283.]
Bandit 4 looks good: [  -9.    1.   15.  316.]
Bandit 4 looks good: [ -10.    2.   18.  347.]
Bandit 4 looks good: [ -11.    0.   21.  377.]
Bandit 4 looks good: [ -14.   -4.   25.  402.]
Bandit 4 looks good: [ -16.   -6.   29.  430.]
Bandit 4 looks good: [ -18.   -6.   30.  463.]
Bandit 4 looks good: [ -19.  -12.   34.  494.]
Bandit 4 looks good: [ -19.  -14.   32.  522.]
Bandit 4 looks good: [ -19.  -15.   30.  555.]
Bandit 4 looks good: [ -22.  -16.   31.  588.]


The agent thinks bandit 4 is the most promising...
...and it was right!
