Taken from: https://math.stackexchange.com/questions/2013050/log-of-softmax-function-derivative

The derivation of the softmax score function (aka eligibility vector) is as follows:  
  
First, note that:  
#### $$\pi_\theta(s,a) = softmax =\frac{e^{\phi(s,a)^\intercal\theta}}{\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta}}$$  
The important bit here is that the slide only identifies the proportionality, not the full softmax function which requires the normalization factor.  
  
Continuing the derivation:  
  
Using the log identity $\log(x/y) = \log(x) - \log(y)$ we can write  
#### $$\log(\pi_\theta(s,a)) = \log(e^{\phi(s,a)^\intercal\theta}) - \log(\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta})$$  
  
Now take the gradient:  
#### $$\nabla_\theta\log(\pi_\theta(s,a)) = \nabla_\theta\log(e^{\phi(s,a)^\intercal\theta}) - \nabla_\theta\log(\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta})$$  
  
The left term simplifies as follows:  
  
#### $$left= \nabla_\theta\log(e^{\phi(s,a)^\intercal\theta}) = \nabla_\theta\phi(s,a)^\intercal\theta = \phi(s,a)$$
  
The right term simplifies as follows:  
  
Using the chain rule:  
#### $$\nabla_x\log(f(x)) = \frac{\nabla_xf(x)}{f(x)}$$
  
We can write:  
  
#### $$right = \nabla_\theta\log(\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta}) = \frac{\nabla_\theta\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta}}{\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta}}$$  

Taking the gradient of the numerator we get:  
  
#### $$right = \frac{\sum_{k=1}^N{\phi(s,a_k)}e^{\phi(s,a_k)^\intercal\theta}}{\sum_{k=1}^Ne^{\phi(s,a_k)^\intercal\theta}}$$
Substituting the definition of $\pi_\theta(s,a)$ we can simplify to:  
  
#### $$right = \sum_{k=1}^N{\phi(s,a_k)}\pi_\theta(s,a_k)$$  
Given the definition of Expected Value:  
  
#### $$\mathrm{E}[X] = X \cdot P = x_1p_1+x_2p_2+ ... +x_np_n$$
Which in English is just the sum of each feature times its probability.  
  
####$$X = features = {\phi(s,a)}$$  
####$$P = probabilities =\pi_\theta(s,a)$$  
So now we can write the expected value of the features:  
  
#### $$right = \mathrm{E}_{\pi_\theta}[\phi(s,\cdot)]$$  
Putting it all together:  
#### $$\nabla_\theta\log(\pi_\theta(s,a)) = left - right = \phi(s,a) - \mathrm{E}_{\pi_\theta}[\phi(s,\cdot)]$$

In [1]:
import numpy as np
import gym as gym
from collections import namedtuple, deque
import matplotlib
import matplotlib.pyplot as plt
import time
from IPython import display
import cv2 as cv2
import random
import tensorflow.contrib.eager as tfe
import tensorflow as tf
tf.enable_eager_execution()

from utils.epsilon import Epsilon


  from ._conv import register_converters as _register_converters


Instructions for updating:
Use the retry module or similar alternatives.


# An agent that samples randomly generated weights until it can do 200 episodes

In [None]:
class RandomAgent():
    def __init__(self):
        self._weights = np.random.uniform(-1, 1, 4)
        self.env = gym.make('CartPole-v0')
        
    def get_action(self, s):
        result = np.matmul(self._weights, s)
        if result > 0:
            return 1
        else:
            return 0
    def randomize(self):
        self._weights = np.random.uniform(-1, 1, 4)
    
    def train(self, episodes=1):
        for episode in range(episodes):
            s = self.env.reset()
            steps = 0
            solved = False
            self.randomize() # randomize weights for each episode
            while True:
                action = self.get_action(s)
                s, reward, done, info = self.env.step(action)
                steps += 1
                if steps == 200:
                    solved = True
                if done:
                    print("Episode finished after {} timesteps".format(steps))
                    break
            if solved == True:
                print("Solved")
                break;
    
    def run(self):
        self.env = gym.make('CartPole-v0')
        s = self.env.reset()
        steps = 0
        while True:
            self.env.render()
            action = self.get_action(s)
            s, reward, done, info = self.env.step(action)
            steps += 1
            if done:
                print("Episode finished after {} timesteps".format(steps))
                break
        self.env.close()

agent = RandomAgent()
agent.train(2)

In [None]:
agent.run()

# Policy Gradient

In [159]:
# import tensorflow.contrib.eager as tfe

class LinearModel(tf.keras.Model):
    def __init__(self):
        super(LinearModel, self).__init__()
        self.dense1 = tf.keras.layers.Dense(units=2)
        self.dense2 = tf.keras.layers.Dense(units=4)
        self.dense3 = tf.keras.layers.Dense(units=2, activation=tf.nn.softmax)

    def call(self, input):
        """Run the model."""
        result = self.dense1(input)
        result = self.dense2(result)
        result = self.dense3(result)  # reuse variables from dense2 layer
        return result

model = LinearModel()
# model.summary()

batch = tf.random_uniform([2, 4])
print(batch)

result = model(batch)
print(result)

def loss(model, inputs, targets):
    y = model(inputs)
#     print(tf.squeeze(y))
#     print(tf.squeeze(targets))
    return tf.losses.softmax_cross_entropy(onehot_labels=targets, logits=y)


def grad(model, inputs, targets):
    with tfe.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return tape.gradient(loss_value, model.variables)

optimizer = tf.train.AdamOptimizer(learning_rate=0.01)


# keep results for plotting
train_loss_results = []
train_accuracy_results = []
epoch_loss_avg = tfe.metrics.Mean()
epoch_accuracy = tfe.metrics.Accuracy()

num_epochs = 50

for epoch in range(num_epochs):

    y = tf.constant([[1, 0], [0, 1]], dtype="int32")

    # Optimize the model
    grads = grad(model, batch, y)
    optimizer.apply_gradients(zip(grads, model.variables),
                              global_step=tf.train.get_or_create_global_step())

    # Track progress
    epoch_loss_avg(loss(model, batch, y))  # add current batch loss
    # compare predicted label to actual label
    epoch_accuracy(tf.argmax(model(batch), axis=1, output_type=tf.int32), tf.argmax(y, output_type=tf.int32))

    # end epoch
    train_loss_results.append(epoch_loss_avg.result())
    train_accuracy_results.append(epoch_accuracy.result())

    if epoch % 5 == 0:
        print("Epoch {:03d}: Loss: {:.3f}, Accuracy: {:.3%}".format(epoch,
                                                                epoch_loss_avg.result(),
                                                                epoch_accuracy.result()))
        
model(batch)

tf.Tensor(
[[0.45204008 0.69368565 0.99150026 0.4770708 ]
 [0.17957842 0.9985689  0.41209364 0.45931566]], shape=(2, 4), dtype=float32)
tf.Tensor(
[[0.9263994  0.07360058]
 [0.8764412  0.12355883]], shape=(2, 2), dtype=float32)
Epoch 000: Loss: 0.732, Accuracy: 50.000%
Epoch 005: Loss: 0.696, Accuracy: 50.000%
Epoch 010: Loss: 0.671, Accuracy: 63.636%
Epoch 015: Loss: 0.655, Accuracy: 75.000%
Epoch 020: Loss: 0.640, Accuracy: 80.952%
Epoch 025: Loss: 0.623, Accuracy: 84.615%
Epoch 030: Loss: 0.606, Accuracy: 87.097%
Epoch 035: Loss: 0.588, Accuracy: 88.889%
Epoch 040: Loss: 0.569, Accuracy: 90.244%
Epoch 045: Loss: 0.550, Accuracy: 91.304%


<tf.Tensor: id=525632, shape=(2, 2), dtype=float32, numpy=
array([[0.92259383, 0.07740616],
       [0.106193  , 0.89380693]], dtype=float32)>

# Ok, now we do PG with the continuous states received rather than pixels

In [None]:
class Agent():
    def __init__(self):
        self.env = gym.make('CartPole-v0')
        self.epsilon = Epsilon(start=1.0, end=0.05, update_increment=0.002)
        
        self.episode_durations = []
        
    def getAction(self, s):
        action = self.env.action_space.sample()
        return action
    
    def train(self, episodes=100, chart_function=None):
        self.epsilon.isTraining = True
        # run for 100 episodes:"
        for i in range(episodes):
            s = self.env.reset()
            steps = 0
            while True:
                action = self.getAction(s)
                
                s_1, reward, done, info = self.env.step(action)
                
                if done and steps == 199:
                    reward = 1
                else: 
                    reward = 0
                
                s = s_1
                
                steps += 1
                if done:
                    print("Training episode finished after {} timesteps".format(steps))
                    break
            self.episode_durations.append(steps)
                
    
    def run(self):
        self.env = gym.make('CartPole-v0')
        self.epsilon.isTraining = False
        s = self.env.reset()
        steps = 0
        while True:
            self.env.render()
            action = self.getAction(s)
            s_1, reward, done, info = self.env.step(action)
            steps += 1
            if done:
                print("Episode finished successfully after {} timesteps".format(steps))
                break
        self.env.close()

agent = Agent()

def show_chart(agent):
    plt.figure(figsize=(15,10))
    plt.subplot(2, 2, 1)
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(learner.episode_rewards)
    plt.subplot(2, 2, 2)
    plt.xlabel('Last x Training Cycles')
    plt.ylabel('Loss')
    plt.plot(list(learner.l_tq_squared_error))
    plt.subplot(2, 2, 3)
    plt.xlabel('Episode')
    plt.ylabel('Epsilon')
    plt.plot(list(learner.epsilon_log))
    display.clear_output(wait=True)
    display.display(plt.gcf())

%time agent.train(episodes=1, chart_function=show_chart)
# learner.train(1)


In [None]:
###### learner.train(100, show_chart)
for i in range(3):
    learner.run()
# display.clear_output(wait=True)

# Lessons Learnt
