# Policy Gradient Algorithm

### Dependencies

In [1]:
import numpy as np
import tensorflow as tf

In [2]:
# One threshold per bandit arm. pull_bandit() pays +1 when a standard-normal
# draw exceeds the arm's threshold, so the lowest threshold is the best arm.
bandits = [
    0.2,
    0,
    -0.2,
    -5,
]
# Number of arms the agent can choose from.
num_bandits = len(bandits)

In [3]:
def pull_bandit(bandit):
    """Pull one bandit arm and report the outcome.

    Draws a single sample from a standard normal distribution and compares
    it with the arm's threshold.

    Args:
        bandit: Threshold value of the arm being pulled.

    Returns:
        1 if the sample is greater than ``bandit``, otherwise -1.
    """
    sample = np.random.randn(1)
    if sample > bandit:
        return 1
    return -1

### The Model

In [4]:
# Start from an empty default graph so re-running this cell does not pile
# duplicate ops onto a graph left over from an earlier run.
tf.reset_default_graph()

# One trainable weight per bandit, all initialised to 1.
weights = tf.Variable(initial_value=tf.ones(shape=[num_bandits]))
# Greedy choice: index of the currently largest weight.
chosen_action = tf.argmax(input=weights, axis=0)

In [5]:
# Fed once per training step: the reward that was received ...
reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[1])
# ... and the index of the bandit that was pulled to get it.
action_placeholder = tf.placeholder(dtype=tf.int32, shape=[1])
# Length-1 slice of `weights` starting at the chosen action's index, i.e. the
# single weight responsible for the action that was taken.
responsible_weight = tf.slice(weights, begin=action_placeholder, size=[1])

### Loss function and optimizer

In [6]:
# Policy-gradient loss: loss = -(log(policy) * advantage).
# The "policy" term is the weight of the chosen action, and the reward fed
# through reward_placeholder is used as the advantage as-is (no baseline is
# subtracted in this graph).
log_policy = tf.log(responsible_weight)
loss = -(log_policy * reward_placeholder)

# Plain gradient descent; `update` is the op that performs one training step.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)

## Training the network

### Hyperparameters

In [7]:
# Number of bandit pulls (training steps) to run.
total_episodes = 1_000
# Running sum of the rewards collected from each bandit.
total_reward = np.zeros(shape=num_bandits)
# Epsilon for e-greedy exploration: the training loop picks a random bandit
# whenever its random draw falls below this value.
e = 0.9

### Training episodes

In [8]:
init = tf.global_variables_initializer()

# The weights are re-initialised on every run of this cell, so reset the reward
# totals as well; otherwise a re-run would add to the previous run's scoreboard.
total_reward.fill(0)

with tf.Session() as sess:
    sess.run(init)
    # Snapshot the initial weights so the summary after the loop is defined
    # even when total_episodes is 0.
    _weights = sess.run(weights)
    for i in range(total_episodes):
        # Epsilon-greedy exploration: draw uniformly from [0, 1) so that `e`
        # is exactly the probability of exploring. (A standard-normal draw
        # would explore ~82% of the time at e = 0.9 and ~50% at e = 0.)
        if np.random.rand() < e:
            # Exploration: pick a bandit uniformly at random.
            action = np.random.randint(num_bandits)
        else:
            # Exploitation: pick the bandit with the largest weight.
            action = sess.run(chosen_action)
        # Get the reward (+1 / -1) from pulling the chosen bandit.
        reward = pull_bandit(bandits[action])
        # One gradient step on the weight of the chosen bandit; also fetch the
        # updated weights for logging and the final summary.
        _, _resp_weight, _weights = sess.run([update, responsible_weight, weights],
                                             feed_dict={reward_placeholder: [reward],
                                                        action_placeholder: [action]})
        total_reward[action] += reward
        # Log the current favourite and the reward totals every 50 episodes.
        if i % 50 == 0:
            print(f'Bandit {np.argmax(_weights)+1} looks good: {total_reward}')

# Compare the agent's favourite with the truly best bandit, which is the one
# with the lowest threshold (hence the argmax over the negated thresholds).
best_bandit = np.argmax(_weights)
print(f'\n\nThe agent thinks bandit {best_bandit + 1} is the most promising...')
if best_bandit == np.argmax(-np.array(bandits)):
    print('...and it was right!')
else:
    print('...and it was wrong!')

Bandit 2 looks good: [-1.  0.  0.  0.]
Bandit 4 looks good: [ -2.  -5.  -1.  11.]
Bandit 4 looks good: [-10.  -7.  -1.  31.]
Bandit 4 looks good: [-11. -10.   1.  51.]
Bandit 4 looks good: [-13.  -8.  -1.  75.]
Bandit 4 looks good: [-15.  -8.  -3.  95.]
Bandit 4 looks good: [ -14.   -9.   -1.  115.]
Bandit 4 looks good: [ -14.   -8.    1.  136.]
Bandit 4 looks good: [ -11.   -8.    4.  156.]
Bandit 4 looks good: [ -18.   -8.    4.  173.]
Bandit 4 looks good: [ -17.   -4.    5.  193.]
Bandit 4 looks good: [ -18.   -6.    5.  214.]
Bandit 4 looks good: [ -22.   -5.   10.  228.]
Bandit 4 looks good: [ -23.   -4.    9.  251.]
Bandit 4 looks good: [ -25.   -6.   10.  272.]
Bandit 4 looks good: [ -33.    3.   12.  287.]
Bandit 4 looks good: [ -29.    1.   14.  301.]
Bandit 4 looks good: [ -32.    3.   21.  321.]
Bandit 4 looks good: [ -36.    5.   25.  331.]
Bandit 4 looks good: [ -32.    7.   27.  351.]


The agent thinks bandit 4 is the most promising...
...and it was right!
