In [None]:
from copy import deepcopy

import numpy as np

import torch
import torch.optim as optim

from lagom import BaseAlgorithm
from lagom.agents import A2CAgent
from lagom.core.policies import CategoricalMLPPolicy
from lagom.core.utils import Logger
from lagom.runner import Runner
from lagom.envs import EnvSpec

from lagom.utils import set_global_seeds

from engine import GoalEngine

from goal_sampler import LinearGoalSampler
from goal_sampler import SWUCBgGoalSampler


In [5]:
import argparse
import gym
import numpy as np
from itertools import count
from collections import namedtuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical

# Run configuration for the training script, kept in a plain attribute
# namespace so the rest of the file can read `args.<name>`.
#   gamma        - discount factor applied to future rewards in finish_episode()
#   seed         - seed passed to the environment and to torch.manual_seed()
#   render       - when True, main() calls env.render() after every step
#   log_interval - main() prints the episode reward every `log_interval` episodes
args = argparse.Namespace(
    gamma=0.99,
    seed=1,
    render=False,
    log_interval=10,
)



# Create the CartPole-v0 environment, then seed the environment and PyTorch's
# global RNG with the same value (args.seed).
env = gym.make('CartPole-v0')
env.seed(args.seed)
torch.manual_seed(args.seed)


# One record per environment step: the log-probability of the sampled action
# and the value-head output for the state it was sampled in.
SavedAction = namedtuple(
    'SavedAction',
    ('log_prob', 'value'),
)


class Policy(nn.Module):
    """Actor-critic network: one shared hidden layer feeding two heads.

    The action head produces a probability distribution over actions and the
    value head produces a single state-value estimate.

    Args:
        obs_dim: Size of the last dimension of the input observation.
        hidden_dim: Width of the shared hidden layer.
        n_actions: Number of discrete actions (size of the action head).

    The defaults (4, 128, 2) reproduce the original fixed-size network.
    """

    def __init__(self, obs_dim=4, hidden_dim=128, n_actions=2):
        super().__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation under a fixed seed is unchanged.
        self.affine1 = nn.Linear(obs_dim, hidden_dim)
        self.action_head = nn.Linear(hidden_dim, n_actions)
        self.value_head = nn.Linear(hidden_dim, 1)

        # Per-episode buffers; they are filled and cleared by code outside
        # this class (see select_action / main / finish_episode).
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """Return ``(action_probs, state_values)`` for input ``x``.

        ``action_probs`` is the softmax of the action head over the last
        dimension; ``state_values`` is the raw value-head output, whose last
        dimension has size 1.
        """
        hidden = F.relu(self.affine1(x))
        action_probs = F.softmax(self.action_head(hidden), dim=-1)
        state_values = self.value_head(hidden)
        return action_probs, state_values


# Module-level network and optimizer; select_action() and finish_episode()
# read these globals directly rather than taking them as arguments.
model = Policy()
optimizer = optim.RMSprop(model.parameters(), lr=1e-2, alpha=0.99, eps=1e-5)
# Alternative optimizer, left disabled:
#optimizer = optim.Adam(model.parameters(), lr=1e-2)


def select_action(state):
    """Sample an action for a single observation and record it for training.

    Args:
        state: Observation as a NumPy array (it is passed to
            ``torch.from_numpy``) and converted to float32.

    Returns:
        The sampled action as a tensor; the caller converts it with
        ``.item()`` before stepping the environment.

    Side effects:
        Appends ``SavedAction(log_prob, state_value)`` to the global
        ``model.saved_actions`` so finish_episode() can build the loss.
    """
    state = torch.from_numpy(state).float()
    probs, state_value = model(state)
    dist = Categorical(probs)
    action = dist.sample()
    model.saved_actions.append(SavedAction(dist.log_prob(action), state_value))
    # For a 1-D probability vector the sample is a 0-dim tensor. Indexing it
    # with `[0]` (as `action.data[0]` did) only warned on PyTorch 0.4.0 and is
    # an error on later releases, so return the tensor itself.
    return action


def finish_episode():
    """Run one actor-critic update from the episode stored on ``model``.

    Reads ``model.rewards`` and ``model.saved_actions`` (one entry per step),
    computes discounted returns with ``args.gamma``, standardises them, and
    takes a single optimizer step on the sum of:

    * policy loss: ``-log_prob * (return - value)`` per step, with the value
      treated as a constant (no gradient through the baseline), and
    * value loss: smooth L1 between the predicted value and the return.

    Both buffers are emptied before returning. If either buffer is empty the
    function clears them and returns without touching the optimizer.
    """
    saved_actions = model.saved_actions
    if not saved_actions or not model.rewards:
        # Nothing to learn from; torch.stack() below would fail on an empty
        # list, so just reset the buffers.
        del model.rewards[:]
        del model.saved_actions[:]
        return

    # Discounted returns, accumulated from the last step backwards. Appending
    # and reversing once avoids the quadratic cost of insert(0, ...).
    returns = []
    running_return = 0
    for step_reward in reversed(model.rewards):
        running_return = step_reward + args.gamma * running_return
        returns.append(running_return)
    returns.reverse()
    returns = torch.Tensor(returns)

    # Standardise the returns. The (unbiased) std of a single value is NaN,
    # which would turn the loss and then the weights into NaN, so a one-step
    # episode keeps its raw return instead.
    if returns.numel() > 1:
        eps = float(np.finfo(np.float32).eps)
        returns = (returns - returns.mean()) / (returns.std() + eps)

    policy_losses = []
    value_losses = []
    for (log_prob, value), ret in zip(saved_actions, returns):
        # .item() detaches the baseline so the policy term does not
        # back-propagate into the value head.
        advantage = ret - value.item()
        policy_losses.append(-log_prob * advantage)
        # Target is reshaped to (1,) to match the value-head output.
        value_losses.append(F.smooth_l1_loss(value, ret.reshape(1)))

    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()

    del model.rewards[:]
    del model.saved_actions[:]


def main():
    """Train on the global ``env`` until an episode's total reward is 200.

    Each episode: reset the environment, act with select_action() until the
    environment reports ``done`` (or 10000 steps elapse), store every reward
    on ``model.rewards``, then call finish_episode() to update the network.
    The episode reward is printed every ``args.log_interval`` episodes.
    """
    for i_episode in count(1):
        state = env.reset()
        for _step in range(10000):  # Don't infinite loop while learning
            action = select_action(state)

            state, reward, done, _info = env.step(action.item())
            if args.render:
                env.render()
            model.rewards.append(reward)
            if done:
                break

        # Must be read before finish_episode(), which empties model.rewards.
        episode_return = np.sum(model.rewards)

        finish_episode()
        if i_episode % args.log_interval == 0:
            print(f'Episode {i_episode}\t Reward : {episode_return}')

        # NOTE(review): 200 is presumably the maximum reward an episode of
        # this environment can reach - confirm against the env spec.
        if episode_return == 200:
            print('Solved!')
            break


if __name__ == '__main__':
    main()

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
Episode 10	 Reward : 10.0




Episode 20	 Reward : 9.0
Episode 30	 Reward : 12.0
Episode 40	 Reward : 8.0
Episode 50	 Reward : 10.0
Episode 60	 Reward : 8.0
Episode 70	 Reward : 8.0
Episode 80	 Reward : 9.0
Episode 90	 Reward : 12.0
Episode 100	 Reward : 10.0
Episode 110	 Reward : 10.0
Episode 120	 Reward : 10.0
Episode 130	 Reward : 10.0
Episode 140	 Reward : 10.0
Episode 150	 Reward : 8.0
Episode 160	 Reward : 10.0
Episode 170	 Reward : 10.0
Episode 180	 Reward : 9.0
Episode 190	 Reward : 8.0
Episode 200	 Reward : 10.0
Episode 210	 Reward : 10.0
Episode 220	 Reward : 9.0
Episode 230	 Reward : 10.0
Episode 240	 Reward : 11.0
Episode 250	 Reward : 19.0
Episode 260	 Reward : 10.0
Episode 270	 Reward : 118.0
Episode 280	 Reward : 129.0
Solved!
