In [42]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
from time import sleep
from IPython import display
from gym import wrappers
%matplotlib inline

In [43]:
# Constants
GAMMA = 0.9

class PolicyNetwork(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(PolicyNetwork, self).__init__()

        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.softmax(self.linear2(x), dim=1)
        return x 
    
    def get_action(self, state):
        if state.__class__ == ().__class__:
            state = torch.from_numpy(state[0]).float().unsqueeze(0)
        else:
            state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.forward(Variable(state))
        highest_prob_action = np.random.choice(self.num_actions, p=np.squeeze(probs.detach().numpy()))
        log_prob = torch.log(probs.squeeze(0)[highest_prob_action])
        return highest_prob_action, log_prob


def update_policy(policy_network, rewards, log_probs):
    discounted_rewards = []

    for t in range(len(rewards)):
        Gt = 0 
        pw = 0
        for r in rewards[t:]:
            Gt = Gt + GAMMA**pw * r
            pw = pw + 1
        discounted_rewards.append(Gt)
        
    discounted_rewards = torch.tensor(discounted_rewards)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards

    policy_gradient = []
    for log_prob, Gt in zip(log_probs, discounted_rewards):
        policy_gradient.append(-log_prob * Gt)
    
    policy_network.optimizer.zero_grad()
    policy_gradient = torch.stack(policy_gradient).sum()
    policy_gradient.backward()
    policy_network.optimizer.step()

In [44]:
def render_gym(x):
    plt.imshow(x)
    display.display(plt.gcf())
    display.clear_output(wait=True)
    sleep(1)

In [49]:
def main():
    env = gym.make('CartPole-v0', render_mode="rgb_array")
    policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)
    
    max_episode_num = 5000
    max_steps = 1000
    numsteps = []
    avg_numsteps = []
    all_rewards = []

    for episode in range(max_episode_num):
        state = env.reset()
        #img = plt.imshow(env.render())
        log_probs = []
        rewards = []

        for steps in range(max_steps):
            #img.set_data(env.render())
            #display.display(plt.gcf())
            #display.clear_output(wait=True)
            
            action, log_prob = policy_net.get_action(state)
            new_state, reward, done, _, _= env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)

            if done:
                update_policy(policy_net, rewards, log_probs)
                numsteps.append(steps)
                avg_numsteps.append(np.mean(numsteps[-10:]))
                all_rewards.append(np.sum(rewards))
                if episode % 1 == 0:
                    sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))
                break
            
            state = new_state
    
    plt.plot(numsteps)
    plt.plot(avg_numsteps)
    plt.xlabel('Episode')
    plt.show()

In [50]:
main()

  logger.warn(


tensor([ 1.0513,  1.0221,  0.9898,  0.9538,  0.9139,  0.8695,  0.8202,  0.7654,
         0.7045,  0.6369,  0.5617,  0.4782,  0.3854,  0.2823,  0.1678,  0.0405,
        -0.1010, -0.2581, -0.4327, -0.6267, -0.8423, -1.0818, -1.3479, -1.6436,
        -1.9721, -2.3372])
tensor(-7.5652e-08)
episode: 0, total reward: 26.0, average_reward: 26.0, length: 25
tensor([ 1.2154,  1.1237,  1.0218,  0.9087,  0.7829,  0.6432,  0.4879,  0.3154,
         0.1238, -0.0892, -0.3258, -0.5887, -0.8809, -1.2054, -1.5661, -1.9668])
tensor(1.1176e-07)
episode: 1, total reward: 16.0, average_reward: 21.0, length: 15
tensor([ 1.1998,  1.1188,  1.0287,  0.9287,  0.8175,  0.6940,  0.5568,  0.4043,
         0.2348,  0.0466, -0.1626, -0.3950, -0.6532, -0.9402, -1.2590, -1.6132,
        -2.0068])
tensor(4.9086e-08)
episode: 2, total reward: 17.0, average_reward: 19.667, length: 16
tensor([ 1.0039,  0.9827,  0.9593,  0.9332,  0.9042,  0.8720,  0.8362,  0.7964,
         0.7522,  0.7032,  0.6486,  0.5880,  0.5207,  0.445

KeyboardInterrupt: 

In [40]:
class model2(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, learning_rate=3e-4):
        super(PolicyNetwork, self).__init__()

        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.softmax(self.linear2(x), dim=1)
        return x 

In [7]:
for i in model.named_parameters(self):
    print(i)

NameError: name 'self' is not defined

In [20]:
m = torch.distributions.categorical.Categorical(torch.tensor([0.3, 0.7]))

In [21]:
m.sample()

tensor(1)

In [25]:
m.log_prob(m.sample())

tensor(-1.2040)

In [29]:
m = torch.randn(2, 3, requires_grad=True)

In [33]:
m.sum().backward()

In [36]:
m.grad

tensor([[1., 1., 1.],
        [1., 1., 1.]])

In [37]:
m

tensor([[ 0.8112, -0.0583, -0.1627],
        [-0.0245,  1.2620,  0.9236]], requires_grad=True)