In [1]:
import torch
from torch import nn
from torch import optim

import gym
import numpy as np

from octopus.policy.evaluate import evaluate_agent
from octopus.policy.loss import reinforce_loss

In [2]:
class Policy(nn.Module):
    def __init__(self, n_states, n_hidden, n_actions):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_states, n_hidden),
            nn.ReLU(),
            # nn.Linear(n_hidden, n_hidden),
            # nn.ReLU(),
            nn.Linear(n_hidden, n_actions),
            nn.Softmax(dim=-1)
        )
    
    def forward(self, x):
        return self.layers(x)

In [3]:
env = gym.make("CartPole-v1")
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
n_hidden = 64

In [34]:
# def reinforce(policy, optimizier, n_training_episodes, gamma=0.99):
#     for i_episode in range(n_training_episodes):
#         rewards = []
#         selected_actions = []
#         log_prob_selected_actions = []
        
#         state, _ = env.reset()
#         state = torch.from_numpy(state)
#         in_progress = True
        
#         while in_progress:
#             predicted_action = policy(state)
#             action = torch.multinomial(predicted_action, num_samples=1).item()
            
#             next_state, reward, done, truncated, info = env.step(action)
            
#             selected_actions.append(action)
#             log_prob_selected_actions.append(predicted_action[action].log())
#             rewards.append(reward)
            
#             if done: break
            
#             state = torch.from_numpy(next_state)

#         # [1.0, 0.99, 0.9801, 0.970299, 0.96059601,...]
#         discounts = [gamma**k for k in range(len(rewards)+1)]
        
#         # [1.0, 0.99, 0.9801, 0.970299, 0.96059601,..] for each reward equals to 1
#         discounted_rewards = [discount*reward for discount, reward in zip(discounts, rewards)]

#         # a scalar
#         discounted_return = sum(discounted_rewards)
        
#         # calculate discounted return at each time step
#         discounted_return_each_step = []
#         g_value = 0
#         for discounted_reward_each_step in reversed(discounted_rewards):
#             g_value += discounted_reward_each_step
#             discounted_return_each_step.append(g_value)
        
#         discounted_return_each_step = discounted_return_each_step[::-1]
        
#         losses = []
#         for log_prob, g_step in zip(log_prob_selected_actions, discounted_return_each_step):
#             # why not discounted_return at each timestep corresponds to log_pro
#             losses.append(-log_prob * g_step)
        
#         # losses = []
#         # for log_prob in log_prob_selected_actions:
#         #     # why not discounted_return at each timestep corresponds to log_pro
#         #     losses.append(-log_prob * discounted_return)
        
#         loss = sum(losses)
#         optimizier.zero_grad()
#         loss.backward()
#         optimizier.step()
                
#         print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, loss.detach().numpy()))

In [35]:
# class Learner:
#     def __init__(self, model):
#         self.model = model
#         self.optimizier = optim.Adam(model.parameters(), 1e-3)

In [4]:
def learner(policy, optimizier, n_training_episodes, discount_factor=0.99):
    for i_episode in range(n_training_episodes):
        rewards = []
        selected_actions = []
        log_prob_selected_actions = []
        selected_probs = []
        
        state, _ = env.reset()
        state = torch.from_numpy(state)
        in_progress = True
        
        while in_progress:
            predicted_action = policy(state)
            action = torch.multinomial(predicted_action, num_samples=1).item()
            
            next_state, reward, done, truncated, info = env.step(action)
            
            selected_actions.append(action)
            log_prob_selected_actions.append(predicted_action[action].log())
            selected_probs.append(predicted_action[action])
            rewards.append(torch.tensor(reward))
            
            if done: break
            
            state = torch.from_numpy(next_state)

        loss = reinforce_loss(
            log_probs=log_prob_selected_actions,
            rewards=rewards,
            discount_factor=discount_factor
        )
        
        optimizier.zero_grad()
        loss.backward()
        optimizier.step()
                
        print('Episode {}\tLoss: {:.2f}'.format(i_episode, loss.detach().numpy()))

In [5]:
model = Policy(n_states, n_hidden, n_actions)
optimizier = optim.Adam(model.parameters(), lr=1e-2)

learner(model, optimizier, n_training_episodes=500)

Episode 0	Average Score: 5.82
Episode 1	Average Score: 10.81
Episode 2	Average Score: 14.04
Episode 3	Average Score: 12.20
Episode 4	Average Score: 5.73
Episode 5	Average Score: 7.16
Episode 6	Average Score: 9.45
Episode 7	Average Score: 14.82
Episode 8	Average Score: 6.54
Episode 9	Average Score: 20.86
Episode 10	Average Score: 8.44
Episode 11	Average Score: 6.08
Episode 12	Average Score: 21.25
Episode 13	Average Score: 9.49
Episode 14	Average Score: 7.99
Episode 15	Average Score: 9.28
Episode 16	Average Score: 6.89
Episode 17	Average Score: 14.57
Episode 18	Average Score: 9.51
Episode 19	Average Score: 6.68
Episode 20	Average Score: 17.42
Episode 21	Average Score: 25.53
Episode 22	Average Score: 16.06
Episode 23	Average Score: 11.77
Episode 24	Average Score: 5.59
Episode 25	Average Score: 10.56
Episode 26	Average Score: 9.92
Episode 27	Average Score: 18.46
Episode 28	Average Score: 10.76
Episode 29	Average Score: 13.68
Episode 30	Average Score: 11.45
Episode 31	Average Score: 16.70
E

In [4]:
# def reinforce(policy, optimizier, n_training_episodes, gamma=0.99):
#     for i_episode in range(n_training_episodes):
#         rewards = []
#         selected_actions = []
#         log_prob_selected_actions = []
        
#         state, _ = env.reset()
#         state = torch.from_numpy(state)
#         in_progress = True
        
#         while in_progress:
#             predicted_action = policy(state)
#             action = torch.multinomial(predicted_action, num_samples=1).item()
            
#             next_state, reward, done, truncated, info = env.step(action)
            
#             selected_actions.append(action)
#             log_prob_selected_actions.append(predicted_action[action].log())
#             rewards.append(reward)
            
#             if done: break
            
#             state = torch.from_numpy(next_state)

#         # [1.0, 0.99, 0.9801, 0.970299, 0.96059601,...]
#         discounts = [gamma**k for k in range(len(rewards)+1)]
        
#         # [1.0, 0.99, 0.9801, 0.970299, 0.96059601,..] for each reward equals to 1
#         discounted_rewards = [discount*reward for discount, reward in zip(discounts, rewards)]

#         # a scalar
#         discounted_return = sum(discounted_rewards)
        
#         # calculate discounted return at each time step
#         discounted_return_each_step = []
#         g_value = 0
#         for discounted_reward_each_step in reversed(discounted_rewards):
#             g_value += discounted_reward_each_step
#             discounted_return_each_step.append(g_value)
        
#         discounted_return_each_step = discounted_return_each_step[::-1]
        
#         losses = []
#         for log_prob, g_step in zip(log_prob_selected_actions, discounted_return_each_step):
#             # why not discounted_return at each timestep corresponds to log_pro
#             losses.append(-log_prob * g_step)
        
#         # losses = []
#         # for log_prob in log_prob_selected_actions:
#         #     # why not discounted_return at each timestep corresponds to log_pro
#         #     losses.append(-log_prob * discounted_return)
        
#         loss = sum(losses)
#         optimizier.zero_grad()
#         loss.backward()
#         optimizier.step()
                
#         print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, loss.detach().numpy()))

In [5]:
model = Policy(n_states, n_hidden, n_actions)
optimizier = optim.Adam(model.parameters(), lr=1e-2)

In [6]:
reinforce(model, optimizier, n_training_episodes=100)

Episode 0	Average Score: 228.36
Episode 1	Average Score: 39.30
Episode 2	Average Score: 58.70
Episode 3	Average Score: 28.25
Episode 4	Average Score: 46.56
Episode 5	Average Score: 24.80
Episode 6	Average Score: 22.60
Episode 7	Average Score: 7.56
Episode 8	Average Score: 6.41
Episode 9	Average Score: 5.03
Episode 10	Average Score: 3.84
Episode 11	Average Score: 3.28
Episode 12	Average Score: 2.12
Episode 13	Average Score: 2.15
Episode 14	Average Score: 1.80
Episode 15	Average Score: 1.64
Episode 16	Average Score: 1.12
Episode 17	Average Score: 1.07
Episode 18	Average Score: 0.75
Episode 19	Average Score: 0.60
Episode 20	Average Score: 0.76
Episode 21	Average Score: 0.60
Episode 22	Average Score: 0.44
Episode 23	Average Score: 0.34
Episode 24	Average Score: 0.37
Episode 25	Average Score: 0.31
Episode 26	Average Score: 0.26
Episode 27	Average Score: 0.25
Episode 28	Average Score: 0.19
Episode 29	Average Score: 0.14
Episode 30	Average Score: 0.18
Episode 31	Average Score: 0.13
Episode 32

In [9]:
evaluate_agent(model, env, max_steps=1000, n_eval_episodes=10)

(18.3, 5.119570294468081)

In [14]:
evaluate_agent(model, env, max_steps=1000, n_eval_episodes=10)

(9.7, 0.45825756949558405)

In [2]:
rewards = torch.tensor([5, 2, 3, 4])
actions = torch.tensor([0, 1, 2, 3])

In [3]:
log_probs = torch.tensor([-1, -2, -3, -4])