In [None]:
# policy gradient 

In [2]:
import gym
import numpy as np
import matplotlib
import copy 

#Hyperparameters
NUM_EPISODES = 10000
LEARNING_RATE = 0.000025
GAMMA = 0.99

# Create gym
env = gym.make('Acrobot-v1')
nA = env.action_space.n
np.random.seed(0)

# Init weight
w = np.random.rand(6, 3)

# rewards
episode_rewards = []

# Our policy
def policy(state,w):
    z = state.dot(w)
    exp = np.exp(z)
    return exp/np.sum(exp)

# Vectorized softmax Jacobian
def softmax_grad(softmax):
    s = softmax.reshape(-1,1)
    return np.diagflat(s) - np.dot(s, s.T)

# Main loop 
for e in range(NUM_EPISODES):

    state = env.reset()[None,:]

    grads = []
    rewards = []

    # single episode score
    score = 0

    while True:

        # Uncomment to visualise
        #env.render()

        # Sample from policy and take action in environment
        probs = policy(state,w)
        action = np.random.choice(nA,p=probs[0])
        next_state,reward,done,_ = env.step(action)
        next_state = next_state[None,:]

        # Compute gradient and save with reward in memory for our weight updates
        dsoftmax = softmax_grad(probs)[action,:]
        dlog = dsoftmax / probs[0,action]
        grad = state.T.dot(dlog[None,:])

        grads.append(grad)
        rewards.append(reward)

        score+=reward

        # update
        state = next_state

        if done:
            break

    # Weight update
    for i in range(len(grads)):

        # Loop through everything that happend in the episode and update towards the log policy gradient times **FUTURE** reward
        w += LEARNING_RATE * grads[i] * sum([ r * (GAMMA ** r) for t,r in enumerate(rewards[i:])])
    
    # Append for logging and print
    episode_rewards.append(score) 
    print("EP: " + str(e) + " Score: " + str(score) + "         ",end="\r", flush=False) 
matplotlib.rc('figure', figsize=(10, 5))
plt.plot(np.arange(NUM_EPISODES),episode_rewards)
plt.show()
env.close()

EP: 9999 Score: -77.0          

NameError: name 'plt' is not defined