In [26]:
import torch
from torch import nn
from torch import optim

import gym
import numpy as np

from octopus.policy.evaluate import evaluate_agent
from octopus.policy.loss import reinforce_loss

In [27]:
class Policy(nn.Module):
    """Small feed-forward network that turns a state into action probabilities.

    The network is Linear -> ReLU -> Linear -> Softmax, with the softmax taken
    over the last dimension of the output.
    """

    def __init__(self, n_states, n_hidden, n_actions):
        """Build the network.

        Args:
            n_states: Number of input features of the first linear layer.
            n_hidden: Number of units in the single hidden layer.
            n_actions: Number of outputs the softmax is applied over.
        """
        super().__init__()
        # Layers are created in input-to-output order so that parameter
        # initialisation consumes the RNG in the same sequence as before.
        input_to_hidden = nn.Linear(n_states, n_hidden)
        hidden_to_output = nn.Linear(n_hidden, n_actions)
        self.layers = nn.Sequential(
            input_to_hidden,
            nn.ReLU(),
            hidden_to_output,
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        """Return the softmax-normalised network output for ``x``."""
        action_probs = self.layers(x)
        return action_probs

In [28]:
# Width of the policy network's hidden layer.
n_hidden = 64

# CartPole-v1 environment; its spaces determine the policy's input and
# output sizes.
env = gym.make("CartPole-v1")
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

In [29]:
def learner(policy, optimizier, n_training_episodes, discount_factor=0.99):
    """Train ``policy`` with one REINFORCE update per episode.

    Each episode is rolled out in the module-level ``env`` by sampling an
    action from the policy's output at every step. The log-probabilities of
    the sampled actions and the per-step rewards are passed to
    ``reinforce_loss``; the resulting loss is back-propagated and applied
    through ``optimizier``. The episode index and loss are printed every
    100 episodes.

    Args:
        policy: Callable mapping a state tensor to a 1-D tensor of action
            probabilities (it is sampled with ``torch.multinomial``).
        optimizier: Optimizer exposing ``zero_grad()`` and ``step()``. The
            spelling of the parameter name is kept so existing keyword
            callers continue to work.
        n_training_episodes: Number of episodes to roll out and train on.
        discount_factor: Forwarded unchanged to ``reinforce_loss``.

    Returns:
        None. ``policy`` is updated in place via ``optimizier``.
    """
    for i_episode in range(n_training_episodes):
        rewards = []
        log_prob_selected_actions = []

        observation, _ = env.reset()

        while True:
            action_probs = policy(torch.from_numpy(observation))
            action = torch.multinomial(action_probs, num_samples=1).item()

            observation, reward, terminated, truncated, _ = env.step(action)

            log_prob_selected_actions.append(action_probs[action].log())
            rewards.append(torch.tensor(reward))

            # An episode ends either when the environment terminates or when
            # it is truncated (e.g. by a time limit). Stepping past either
            # signal without a reset is outside the environment's contract
            # and would let the rollout run indefinitely.
            if terminated or truncated:
                break

        loss = reinforce_loss(
            log_probs=log_prob_selected_actions,
            rewards=rewards,
            discount_factor=discount_factor
        )

        optimizier.zero_grad()
        loss.backward()
        optimizier.step()

        if i_episode % 100 == 0:
            # .item() yields a plain Python float without a NumPy round-trip.
            print('Episode {}\tLoss: {:.2f}'.format(i_episode, loss.item()))

In [32]:
# Fresh policy network sized from the environment, optimised with Adam.
model = Policy(n_states=n_states, n_hidden=n_hidden, n_actions=n_actions)
optimizier = optim.Adam(params=model.parameters(), lr=1e-2)

# Run 100 training episodes with the default discount factor.
learner(policy=model, optimizier=optimizier, n_training_episodes=100)

Episode 0	Loss: 6.14


In [33]:
# Evaluate the trained policy on the same environment used for training;
# the cell displays whatever evaluate_agent returns.
# NOTE(review): the recorded output looks like a (mean, std) pair of episode
# returns over the 10 evaluation episodes — confirm against
# octopus.policy.evaluate.
evaluate_agent(model, env, max_steps=1000, n_eval_episodes=10)

(212.4, 92.88724347293336)