In [1]:
from collections import namedtuple, deque
from copy import deepcopy
from itertools import count
import math
import random

from torch.utils.tensorboard import SummaryWriter
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.categorical import Categorical

from env import Env

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

writer = SummaryWriter("runs/deep-sarsa")
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

ENV = Env(4.5)

cpu


In [2]:
BATCH_SIZE = 128
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
TAU = 0.005
LR = 1e-4
NUM_ACTIONS = ENV.n_actions
NUM_STEPS = int(2e4)

In [3]:
Transition = namedtuple('Transition', ('state', 'action', 'reward', 'next_state','next_action'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
    
    def push(self, *args):
        self.memory.append(Transition(*args))
    
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

In [4]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    return layer

In [5]:
class Agent(nn.Module):
    def __init__(self):
        super().__init__()
        self.critic = nn.Sequential(
            #layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            layer_init(nn.Linear(1, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.0),
        )
        self.actor = nn.Sequential(
            #layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            layer_init(nn.Linear(1, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, NUM_ACTIONS), std=0.01),
        )

    def get_value(self, x):
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        logits = self.actor(x)
        probs = Categorical(logits=logits)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action), probs.entropy(), self.critic(x)

In [6]:
agent = Agent().to(device)
optimizer = optim.Adam(agent.parameters(), lr=LR, eps=1e-5)

In [7]:
obs = torch.zeros((NUM_STEPS, 1) + (1,)).to(device)
actions = torch.zeros((NUM_STEPS, 1) + (NUM_ACTIONS,)).to(device)
logprobs = torch.zeros((NUM_STEPS, 1)).to(device)
rewards = torch.zeros((NUM_STEPS, 1)).to(device)
dones = torch.zeros((NUM_STEPS, 1)).to(device)
values = torch.zeros((NUM_STEPS, 1)).to(device)

In [8]:
obs.shape

torch.Size([20000, 1, 1])

In [23]:
next_obs

tensor([-6.9004e-17])

In [14]:
import wandb

wandb.init(
    project="rl ppo",
    sync_tensorboard=True,
    monitor_gym=True,
    save_code=True,
)
writer = SummaryWriter(f"runs")

In [18]:
from icecream import ic

In [26]:
global_step = 0
total_timesteps = 10000
update_epochs = 4
num_minibatch = 4
minibatch_size = int(BATCH_SIZE // num_minibatch)
next_obs = torch.Tensor(ENV.reset()).to(device)
next_done = torch.zeros(1).to(device)
num_updates = total_timesteps // BATCH_SIZE

In [27]:
for update in range(1, num_updates + 1):

    for step in range(0, NUM_STEPS):
        global_step += 1
        ic(next_obs)
        ic(next_obs.shape)
        obs[step] = next_obs
        dones[step] = next_done

        # ALGO LOGIC: action logic
        with torch.no_grad():
            action, logprob, _, value = agent.get_action_and_value(next_obs)
            values[step] = value.flatten()
        actions[step] = action
        logprobs[step] = logprob

        # TRY NOT TO MODIFY: execute the game and log data.
        # next_obs, reward, done, info = ENV.step(action.item())
        next_obs, reward = ENV.step(action.item())
        ic(next_obs)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        # next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)
        next_obs = torch.Tensor(next_obs).to(device)
        ic(next_obs)

    with torch.no_grad():
        next_value = agent.get_value(next_obs).reshape(1, -1)
        returns = torch.zeros_like(rewards).to(device)
        for t in reversed(range(NUM_STEPS)):
            if t == NUM_STEPS - 1:
                nextnonterminal = 1.0 - next_done
                next_return = next_value
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                next_return = returns[t + 1]
            returns[t] = rewards[t] + GAMMA * nextnonterminal * next_return
        advantages = returns - values

    # flatten the batch
    b_obs = obs.reshape((-1,) + (1, ))
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + (NUM_ACTIONS, ))
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    b_inds = np.arange(BATCH_SIZE)
    clipfracs = []
    for epoch in range(update_epochs):
        np.random.shuffle(b_inds)
        for start in range(0, BATCH_SIZE, minibatch_size):
            end = start + minibatch_size
            mb_inds = b_inds[start:end]

            _, newlogprob, entropy, newvalue = agent.get_action_and_value(b_obs[mb_inds], b_actions.long()[mb_inds])
            logratio = newlogprob - b_logprobs[mb_inds]
            ratio = logratio.exp()

            with torch.no_grad():
                # calculate approx_kl http://joschu.net/blog/kl-approx.html
                old_approx_kl = (-logratio).mean()
                approx_kl = ((ratio - 1) - logratio).mean()
                clipfracs += [((ratio - 1.0).abs() > 0.2).float().mean().item()]

            mb_advantages = b_advantages[mb_inds]
        
            # Policy loss
            pg_loss1 = -mb_advantages * ratio
            pg_loss2 = -mb_advantages * torch.clamp(ratio, 1 - 0.2, 1 + 0.2)
            pg_loss = torch.max(pg_loss1, pg_loss2).mean()

            # Value loss
            newvalue = newvalue.view(-1)
            v_loss_unclipped = (newvalue - b_returns[mb_inds]) ** 2
            v_clipped = b_values[mb_inds] + torch.clamp(
                newvalue - b_values[mb_inds],
                -0.2,
                0.2,
            )
            v_loss_clipped = (v_clipped - b_returns[mb_inds]) ** 2
            v_loss_max = torch.max(v_loss_unclipped, v_loss_clipped)
            v_loss = 0.5 * v_loss_max.mean()

            entropy_loss = entropy.mean()
            loss = pg_loss - 0.01 * entropy_loss + v_loss * 0.5

            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(agent.parameters(), 0.5)
            optimizer.step()

    y_pred, y_true = b_values.cpu().numpy(), b_returns.cpu().numpy()
    var_y = np.var(y_true)
    explained_var = np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y


ic| next_obs: tensor([-6.9004e-17])
ic| next_obs.shape: torch.Size([1])
ic| next_obs: 1
ic| next_obs: tensor([-6.9004e-17])
ic| next_obs: tensor([-6.9004e-17])
ic| next_obs.shape: torch.Size([1])
ic| next_obs: 1
ic| next_obs: tensor([-6.9004e-17])
ic| next_obs: tensor([-6.9004e-17])
ic| next_obs.shape: torch.Size([1])
ic| next_obs: 1
ic| next_obs: tensor([-3.5128e+22])
ic| next_obs: tensor([-3.5128e+22])
ic| next_obs.shape: torch.Size([1])
ic| next_obs: 2
ic| next_obs: tensor([-6.9004e-17,  4.5597e-41])
ic| next_obs: tensor([-6.9004e-17,  4.5597e-41])
ic| next_obs.shape: torch.Size([2])


RuntimeError: The expanded size of the tensor (1) must match the existing size (2) at non-singleton dimension 1.  Target sizes: [1, 1].  Tensor sizes: [2]