In [2]:
import torch
import gym
from torch import nn
import numpy as np

### Discounted Rewards

##### Example 1

In [5]:
rewards = torch.tensor([1, 2, 3, 4])

In [47]:
import torch

Given `rewards` is the list of all predicted rewards from the current time step `0` to the next time step `3`

In [48]:
rewards

tensor([1, 2, 3, 4])

Write a function to calcualte the discounted reward at each time step

In [49]:
discount_factor = 0.99

In [50]:
def discount_reward(rewards, discount_factor):
    n_rewards = len(rewards)
    timesteps = torch.arange(0, n_rewards)
    
    # calculate the discount for each time step
    discount = torch.pow(discount_factor, timesteps)
    discounted_rewards = discount * rewards

    return discounted_rewards

In [51]:
discount_reward(rewards, discount_factor)

tensor([1.0000, 1.9800, 2.9403, 3.8812])

##### Example 2

In [46]:
preds = torch.tensor([[0.1, 0.3, 0.6]])

In [56]:
import torch

`preds` is the probability distribution over the possible actions at the current time step

In [57]:
preds

tensor([[0.1000, 0.3000, 0.6000]])

Sample an action from `preds` using PyTorch's built-in function

In [58]:
action = torch.multinomial(preds, num_samples=1)

In [59]:
action

tensor([[1]])

##### Example 3

In [6]:
preds = torch.tensor([[0.1, 0.3, 0.6], [0.7, 0.2, 0.1]])

In [8]:
discounted_reward = torch.tensor([1.0, 0.5])

- `preds` is the probability distribution over the possible actions at each time step
- `discounted_reward` is the distribution of discounted rewards at each time step

In [9]:
preds

tensor([[0.1000, 0.3000, 0.6000],
        [0.7000, 0.2000, 0.1000]])

In [10]:
discounted_reward

tensor([1.0000, 0.5000])

In [4]:
def loss_func(preds, discounted_rewards):
    return -1 * torch.sum(discounted_rewards * torch.log(preds))

In [11]:
loss_func(preds, discounted_reward)

RuntimeError: The size of tensor a (2) must match the size of tensor b (3) at non-singleton dimension 1

### Transitions 

##### Example 1

In [2]:
env = gym.make('CartPole-v1')
n_actions = env.action_space.n
n_observations = env.observation_space.shape[0]
hidden_size = 150

In [3]:
model = nn.Sequential(
    nn.Linear(n_observations, hidden_size),
    nn.LeakyReLU(),
    nn.Linear(hidden_size, n_actions),
    nn.Softmax(dim=-1)
)

In [6]:
import torch

Write a function generate transitions in one episode

**Hint**: `env.step()` returns `next_state`, `reward`, `done`, `truncated` and `info`

In [7]:
def generate_transitions(model, env):
    transitions = []
    state, _ = env.reset()
    
    while True:
        predicted_action = model(torch.from_numpy(state))
        action = torch.argmax(predicted_action, dim=-1)
        next_state, reward, done, truncated, info = env.step(action.item())
        
        transitions.append((
            state, action, reward, next_state
        ))
        
        if done: break
        
        state = next_state
    
    return transitions

In [8]:
type(model), type(env)

(torch.nn.modules.container.Sequential, gym.wrappers.time_limit.TimeLimit)

In [9]:
transitions = generate_transitions(model, env)

In [10]:
transitions

[(array([ 0.00905523,  0.02905998,  0.04682675, -0.00809694], dtype=float32),
  tensor(1),
  1.0,
  array([ 0.00963643,  0.22348018,  0.04666481, -0.28564554], dtype=float32)),
 (array([ 0.00963643,  0.22348018,  0.04666481, -0.28564554], dtype=float32),
  tensor(1),
  1.0,
  array([ 0.01410604,  0.4179066 ,  0.0409519 , -0.5632532 ], dtype=float32)),
 (array([ 0.01410604,  0.4179066 ,  0.0409519 , -0.5632532 ], dtype=float32),
  tensor(1),
  1.0,
  array([ 0.02246417,  0.61243075,  0.02968683, -0.8427583 ], dtype=float32)),
 (array([ 0.02246417,  0.61243075,  0.02968683, -0.8427583 ], dtype=float32),
  tensor(1),
  1.0,
  array([ 0.03471278,  0.80713516,  0.01283167, -1.1259596 ], dtype=float32)),
 (array([ 0.03471278,  0.80713516,  0.01283167, -1.1259596 ], dtype=float32),
  tensor(1),
  1.0,
  array([ 0.05085549,  1.0020866 , -0.00968752, -1.4145904 ], dtype=float32)),
 (array([ 0.05085549,  1.0020866 , -0.00968752, -1.4145904 ], dtype=float32),
  tensor(1),
  1.0,
  array([ 0.07089

### Discounted Return

##### Example 1

In [3]:
rewards = torch.tensor([1, 2, 3, 4, 5])

`rewards` is a list of rewards at each time step to an end of an episode

In [6]:
rewards

tensor([1, 2, 3, 4, 5])

Write a function calculate **the discounted return of an episode**

In [11]:
def calculate_discounted_return(rewards, gamma=0.99):
    total_return = torch.zeros(1)
    discounted_returns = []
    reversed_rewards = reversed(rewards)
    
    for reward in reversed_rewards:
        pass

In [12]:
calculate_discounted_return(rewards)

tensor([5, 4, 3, 2, 1])