In [36]:
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

import random

# Run configuration, held in a plain attribute container.
# argparse.Namespace replaces the previous throwaway class whose name was
# immediately shadowed by its own instance; attribute access is unchanged.
args = argparse.Namespace(
    gamma=0.995,      # discount factor applied to future rewards
    seed=0,           # seed handed to the environment and every RNG below
    render=False,     # when True, the environment is rendered each step
    log_interval=1,   # print an episode summary every N episodes
)

# Create the CartPole environment, then seed it together with the torch,
# NumPy and stdlib random generators (in that order) from args.seed.
env = gym.make('CartPole-v0')
for _seed_fn in (env.seed, torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(args.seed)

# Record kept for every step taken: the log-probability of the sampled
# action and the value estimate produced for the state it was sampled in.
SavedAction = namedtuple('SavedAction', 'log_prob value')


class Policy(nn.Module):
    """Actor-critic network with one shared hidden layer and two heads.

    A 4-feature input goes through a 16-unit tanh layer. The hidden
    activation feeds a 2-way action head (returned as softmax
    probabilities) and a scalar state-value head.

    The instance also carries two per-episode buffers that the training
    code fills and clears: ``saved_actions`` and ``rewards``.
    """

    def __init__(self):
        super().__init__()
        self.affine1 = nn.Linear(4, 16)
        self.action_head = nn.Linear(16, 2)
        self.value_head = nn.Linear(16, 1)

        # Orthogonal weight init with zero biases. The hidden layer uses the
        # gain recommended for tanh; the action head uses a tiny gain so the
        # initial action probabilities are close to uniform.
        gain = nn.init.calculate_gain('tanh')
        nn.init.orthogonal_(self.affine1.weight, gain=gain)
        nn.init.constant_(self.affine1.bias, 0.0)
        nn.init.orthogonal_(self.action_head.weight, gain=0.01)
        nn.init.constant_(self.action_head.bias, 0.0)
        nn.init.orthogonal_(self.value_head.weight, gain=1.0)
        nn.init.constant_(self.value_head.bias, 0.0)

        # Per-episode buffers, appended to while acting and emptied after
        # each parameter update.
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_probabilities, state_value)`` for input ``x``.

        Args:
            x: float tensor whose last dimension has 4 features.

        Returns:
            A tuple of the softmax over the last dimension of the action
            scores and the raw output of the value head.
        """
        # torch.tanh replaces the deprecated F.tanh alias.
        hidden = torch.tanh(self.affine1(x))
        action_scores = self.action_head(hidden)
        state_values = self.value_head(hidden)
        return F.softmax(action_scores, dim=-1), state_values


# Build the network and its optimizer, then print the module layout and the
# freshly initialised action-head weights for inspection.
model = Policy()
optimizer = optim.RMSprop(
    model.parameters(),
    lr=1e-2,
    alpha=0.99,
    eps=1e-5,
)
# Alternative left in the original cell: optim.Adam(model.parameters(), lr=1e-2)

print(model)
print(model.action_head.weight)


def select_action(state):
    """Sample an action from the current policy and record it for training.

    The observation is converted to a float tensor and passed through the
    module-level ``model``. An action is drawn from the resulting
    categorical distribution, and its log-probability together with the
    state-value estimate is appended to ``model.saved_actions``.

    Args:
        state: observation as a NumPy array (it is converted with
            ``torch.from_numpy``).

    Returns:
        The sampled action as a tensor; callers read it with ``.item()``.
    """
    state = torch.from_numpy(state).float()
    # Tensors track gradients themselves, so the deprecated Variable
    # wrapper is not needed.
    probs, state_value = model(state)
    dist = Categorical(probs)
    action = dist.sample()
    model.saved_actions.append(SavedAction(dist.log_prob(action), state_value))
    return action


def finish_episode():
    """Run one actor-critic update from the episode stored on ``model``.

    Steps:
      1. Turn ``model.rewards`` into discounted returns using ``args.gamma``.
      2. Standardise the returns (zero mean, unit standard deviation).
      3. For each saved step, weight the negative log-probability by the
         advantage (return minus the detached value estimate) and add an
         MSE loss between the value estimate and the return.
      4. Back-propagate ``policy_loss + 0.5 * value_loss``, take one
         optimizer step, and clear the episode buffers on ``model``.

    An episode with no recorded rewards is a no-op apart from clearing the
    buffers. A one-step episode skips standardisation, because the sample
    standard deviation of a single value is NaN and would corrupt the
    parameters.
    """
    if not model.rewards:
        # Nothing to learn from; torch.stack on empty lists would raise.
        del model.saved_actions[:]
        return

    # Accumulate returns back-to-front, then flip once. This avoids the
    # quadratic cost of inserting at the head of the list on every step.
    discounted = []
    running = 0.0
    for r in reversed(model.rewards):
        running = r + args.gamma * running
        discounted.append(running)
    discounted.reverse()

    returns = torch.tensor(discounted, dtype=torch.float32)
    if returns.numel() > 1:
        eps = float(np.finfo(np.float32).eps)  # guards against a zero std
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_losses = []
    value_losses = []
    for (log_prob, value), ret in zip(model.saved_actions, returns):
        # value.item() detaches the baseline, so the policy term does not
        # push gradients into the value head.
        advantage = ret - value.item()
        policy_losses.append(-log_prob * advantage)
        # unsqueeze gives the target the same one-element shape the
        # original torch.Tensor([r]) target had.
        value_losses.append(F.mse_loss(value, ret.unsqueeze(0)))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + 0.5 * torch.stack(value_losses).sum()

    print(f'Loss {loss.item()}')

    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]


def main(num_episodes=2, max_steps=10000):
    """Run the training loop on the module-level environment.

    Each episode resets ``env``, then acts with ``select_action`` until the
    environment reports ``done`` or ``max_steps`` is reached. Every chosen
    action is printed and every reward is stored on ``model``. After the
    episode, ``finish_episode`` performs the update, and a summary line is
    printed every ``args.log_interval`` episodes.

    Args:
        num_episodes: number of episodes to run (default 2, as before).
        max_steps: per-episode step cap so a good policy cannot loop
            forever (default 10000, as before).
    """
    for i_episode in range(num_episodes):
        state = env.reset()
        for _ in range(max_steps):
            action = select_action(state).item()

            print(action)

            state, reward, done, _info = env.step(action)
            if args.render:
                env.render()
            model.rewards.append(reward)
            if done:
                break

        # Must be read before finish_episode(), which clears model.rewards.
        episode_reward = np.sum(model.rewards)

        finish_episode()
        if i_episode % args.log_interval == 0:
            print(f'Episode {i_episode}\t Reward : {episode_reward}')


if __name__ == '__main__':
    main()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
Policy(
  (affine1): Linear(in_features=4, out_features=16, bias=True)
  (action_head): Linear(in_features=16, out_features=2, bias=True)
  (value_head): Linear(in_features=16, out_features=1, bias=True)
)


Columns 0 to 9 
1.00000e-03 *
  1.2366  0.5375  2.7454 -0.8997  3.2017 -1.6016 -5.3705 -1.5985  1.8631  2.0755
  2.5014 -3.5695  3.0256 -1.4842 -3.4725  1.1526 -0.3967  3.7408  1.7525  0.9690

Columns 10 to 15 
1.00000e-03 *
 -3.7804  1.9036 -1.7147 -3.4487 -2.2568  0.0374
 -0.3536 -2.9248 -0.4460  0.3271 -2.2740  4.8928
[torch.FloatTensor of size (2,16)]

0
0
1
0
1
1
1
1
0
0
0
1
0
0
0
1
0
0
0
0
0
Loss 10.141735076904297
Episode 0	 Reward : 21.0
0
0
1
1
1
1
0
1
1
1
0
1
0
0
0
1
0
0
0
0
1
1
0
1
0
1
1
Loss 16.725116729736328
Episode 1	 Reward : 27.0
