In [24]:
import torch
import gym
from torch import nn
import numpy as np

##### Example 1

In [5]:
rewards = torch.tensor([1, 2, 3, 4])

In [47]:
import torch

`rewards` holds the predicted reward at each time step, from the current time step `0` through time step `3`.

In [48]:
rewards

tensor([1, 2, 3, 4])

Write a function to calculate the discounted reward at each time step, i.e. the reward at time step `t` multiplied by `discount_factor ** t`.

In [49]:
discount_factor = 0.99

In [50]:
def discount_reward(rewards, discount_factor):
    """Scale each reward by the discount factor raised to its time-step index.

    Args:
        rewards: 1-D tensor of rewards; position ``t`` is the reward at time
            step ``t``.
        discount_factor: scalar base of the discount.

    Returns:
        A tensor where element ``t`` equals ``discount_factor ** t * rewards[t]``.
    """
    # Exponents 0, 1, ..., len(rewards) - 1, one per time step.
    exponents = torch.arange(len(rewards))
    per_step_discount = torch.pow(discount_factor, exponents)
    return per_step_discount * rewards

In [51]:
discount_reward(rewards, discount_factor)

tensor([1.0000, 1.9800, 2.9403, 3.8812])

##### Example 2

In [46]:
preds = torch.tensor([[0.1, 0.3, 0.6]])

In [56]:
import torch

`preds` is the probability distribution over the possible actions at the current time step

In [57]:
preds

tensor([[0.1000, 0.3000, 0.6000]])

Sample an action from `preds` using PyTorch's built-in function

In [58]:
action = torch.multinomial(preds, num_samples=1)

In [59]:
action

tensor([[1]])

##### Example 3

In [6]:
preds = torch.tensor([[0.1, 0.3, 0.6], [0.7, 0.2, 0.1]])

In [8]:
discounted_reward = torch.tensor([1.0, 0.5])

- `preds` is the probability distribution over the possible actions at each time step
- `discounted_reward` is the discounted reward at each time step

In [9]:
preds

tensor([[0.1000, 0.3000, 0.6000],
        [0.7000, 0.2000, 0.1000]])

In [10]:
discounted_reward

tensor([1.0000, 0.5000])

In [4]:
def loss_func(preds, discounted_rewards):
    """Return the negated sum of reward-weighted log-probabilities.

    Args:
        preds: tensor of probabilities; its element-wise log is taken.
        discounted_rewards: tensor of weights multiplied element-wise with
            ``log(preds)``. Its shape must be broadcastable against ``preds``,
            otherwise the multiplication raises ``RuntimeError``.

    Returns:
        A scalar tensor equal to ``-sum(discounted_rewards * log(preds))``.
    """
    log_probs = torch.log(preds)
    weighted_log_probs = discounted_rewards * log_probs
    return -1 * weighted_log_probs.sum()

In [11]:
# NOTE(review): this call fails with the RuntimeError shown below. `preds` has
# shape (2, 3) while `discounted_reward` has shape (2,), and the element-wise
# product inside `loss_func` cannot broadcast those two shapes.
loss_func(preds, discounted_reward)

RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1

### Transitions 

##### Example 1

In [2]:
# Width of the policy network's hidden layer.
hidden_size = 150

# CartPole environment; its spaces supply the network's input and output sizes.
env = gym.make('CartPole-v1')
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

In [4]:
# Policy network: maps an observation vector to a probability for each action.
model = nn.Sequential(*[
    nn.Linear(n_observations, hidden_size),  # observation -> hidden units
    nn.LeakyReLU(),
    nn.Linear(hidden_size, n_actions),  # hidden units -> one score per action
    nn.Softmax(dim=-1),  # normalise the scores over the last dimension
])

In [58]:
import torch

Write a function to generate the transitions of one episode

In [59]:
def generate_transitions(model, env):
    """Run one episode with a greedy policy and record every transition.

    At each step the model is evaluated on the current state and the action
    with the highest predicted score is taken (``argmax``, not sampling).

    Args:
        model: callable that takes a tensor built from the state and returns
            a tensor of per-action scores.
        env: environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``; states are
            NumPy arrays.

    Returns:
        A list of ``(state, action, reward, next_state)`` tuples in the order
        they occurred, where ``action`` is the tensor returned by ``argmax``.
    """
    transitions = []
    state, _ = env.reset()

    while True:
        # Only the chosen action is kept, so no autograd graph is needed here.
        with torch.no_grad():
            action_scores = model(torch.from_numpy(state))
        action = torch.argmax(action_scores, dim=-1)
        next_state, reward, terminated, truncated, _ = env.step(action.item())

        transitions.append((state, action, reward, next_state))

        # The episode is over when the environment terminates OR is truncated
        # (e.g. by a time limit); checking only `terminated` would keep
        # stepping an environment that has already ended.
        if terminated or truncated:
            break

        state = next_state

    return transitions

In [60]:
type(model), type(env)

(torch.nn.modules.container.Sequential, gym.wrappers.time_limit.TimeLimit)

In [61]:
transitions = generate_transitions(model, env)

In [62]:
transitions

[(array([-0.04570273, -0.00080109,  0.03115952,  0.04477873], dtype=float32),
  tensor(0),
  1.0,
  array([-0.04571875, -0.19635567,  0.0320551 ,  0.34712765], dtype=float32)),
 (array([-0.04571875, -0.19635567,  0.0320551 ,  0.34712765], dtype=float32),
  tensor(0),
  1.0,
  array([-0.04964587, -0.39191857,  0.03899765,  0.64974385], dtype=float32)),
 (array([-0.04964587, -0.39191857,  0.03899765,  0.64974385], dtype=float32),
  tensor(0),
  1.0,
  array([-0.05748424, -0.5875614 ,  0.05199253,  0.9544474 ], dtype=float32)),
 (array([-0.05748424, -0.5875614 ,  0.05199253,  0.9544474 ], dtype=float32),
  tensor(0),
  1.0,
  array([-0.06923547, -0.7833428 ,  0.07108147,  1.2630016 ], dtype=float32)),
 (array([-0.06923547, -0.7833428 ,  0.07108147,  1.2630016 ], dtype=float32),
  tensor(0),
  1.0,
  array([-0.08490232, -0.9792979 ,  0.09634151,  1.5770723 ], dtype=float32)),
 (array([-0.08490232, -0.9792979 ,  0.09634151,  1.5770723 ], dtype=float32),
  tensor(0),
  1.0,
  array([-0.10448