In [1]:
import argparse
import os
import random
import time
from distutils.util import strtobool

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

In [2]:
# gym_id = "CartPole-v1"
# LEARNING_RATE = 1e-3
# N_ENVS = 4
# N_STEPS = 1000

# # N_UPDATES = 10
# BATCH_SIZE = N_ENVS * N_STEPS

# EXP_NAME = "TESTING"

PARAMS = {
    "GYM_ID": "CartPole-v1",
    "LEARNING_RATE": 1e-3,
    "N_ENVS": 4,
    "N_STEPS": 1000,
    "EXP_NAME": "TESTING",
    "GAE": True,
    "GAMMA": 0.99,
    "GAE_LAMBDA": 0.95
}

PARAMS["BATCH_SIZE"] = PARAMS["N_ENVS"] * PARAMS["N_STEPS"]


In [3]:
F"{PARAMS['GYM_ID']}"

'CartPole-v1'

In [4]:
run_name = f"{PARAMS['GYM_ID']}_{PARAMS['EXP_NAME']}"

In [5]:
writer = SummaryWriter(f"runs/{PARAMS['EXP_NAME']}")

In [6]:
def make_env(gym_id):
    def thunk():
        env = gym.make(gym_id)
        env = gym.wrappers.RecordEpisodeStatistics(env)
        # env = gym.wrappers.RecordVideo(env, "videos", episode_trigger=)
        return env

    return thunk

In [7]:
# envs = gym.vector.SyncVectorEnv([make_env(gym_id)])

envs = gym.vector.SyncVectorEnv(
        [make_env(PARAMS['GYM_ID']) for i in range(PARAMS['N_ENVS'])]
)

In [8]:
observation, _ = envs.reset()
# episodic_return = 0
for _ in range(200):
    action = envs.action_space.sample()
    observation, reward, done, truncated, info = envs .step(action)
    # episodic_return += reward
     
    if done:
        observation, _ = envs.reset()
        episodic_return = 0
        
        print("episodic_return = ", info['episode']['r'])
envs.close()

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [9]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    torch.nn.init.orthogonal_(layer.weight, std)
    torch.nn.init.constant_(layer.bias, bias_const)
    
    return layer

In [10]:
class Agent(nn.Module):
    def __init__(self, envs):
        super().__init__()
        prod_observations = np.array(envs.single_observation_space.shape).prod()
        n_actions = envs.single_action_space.n
        
        self.critic = nn.Sequential(
            layer_init(nn.Linear(prod_observations, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 1), std=1.),
        )
        
        self.actor = nn.Sequential(
            layer_init(nn.Linear(prod_observations, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, n_actions), std=0.01),
        )
    
    def get_value(self, state):
        return self.critic(state)
    
    def get_action_and_value(self, state, action=None):
        logits = self.actor(state)
        
        probs = Categorical(logits=logits)
        
        if action is None:
            action = probs.sample()
        
        return action, probs.log_prob(action), probs.entropy(), self.critic(state)

In [11]:
agent = Agent(envs)

In [12]:
agent

Agent(
  (critic): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
  (actor): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, out_features=64, bias=True)
    (3): Tanh()
    (4): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [13]:
optimizer = optim.Adam(agent.parameters(), lr=PARAMS['LEARNING_RATE'], eps=1e-5)

In [14]:
state, _ = envs.reset()
state = torch.tensor(state)

In [15]:
agent.get_value(state)

tensor([[-0.0019],
        [-0.0194],
        [ 0.0121],
        [-0.0157]], grad_fn=<AddmmBackward0>)

In [16]:
(PARAMS['N_STEPS'], PARAMS['N_ENVS']) + envs.single_observation_space.shape

(1000, 4, 4)

In [17]:
envs.single_observation_space.shape

(4,)

In [18]:
# shape(obs) = [N_STEPS, N_ENV, OBS]
obs = torch.zeros((PARAMS['N_STEPS'], PARAMS['N_ENVS']) + envs.single_observation_space.shape)
actions = torch.zeros((PARAMS['N_STEPS'], PARAMS['N_ENVS']) + envs.single_action_space.shape)
log_probs = torch.zeros((PARAMS['N_STEPS'], PARAMS['N_ENVS']))
rewards = torch.zeros((PARAMS['N_STEPS'], PARAMS['N_ENVS']))
dones = torch.zeros((PARAMS['N_STEPS'], PARAMS['N_ENVS']))
values = torch.zeros((PARAMS['N_ENVS'], PARAMS['N_ENVS']))

In [19]:
obs.shape

torch.Size([1000, 4, 4])

In [20]:
global_step = 0
start_time = time.time()
next_obs, _ = envs.reset()
next_obs = torch.tensor(next_obs)
next_done = torch.tensor(PARAMS['N_ENVS'])

N_UPDATES = 10

In [21]:
for update in range(1, N_UPDATES + 1):
    for step in range(0, PARAMS['N_ENVS']):
        global_step += 1 * PARAMS['N_ENVS']
        obs[step] = next_obs
        dones[step] = next_done
        
        with torch.no_grad():
            action, log_prob, _, value = agent.get_action_and_value(next_obs) 
            values[step] = value.flatten()
        
        actions[step] = action
        log_probs[step] = log_prob
        
        next_obs, reward, done, truncated, info = envs.step(action.cpu().numpy())
        rewards[step] = torch.tensor(reward).view(-1)
        next_obs, next_done = torch.tensor(next_obs), torch.tensor(done)
                
        for item in info:
            if "episode" in item.keys():
                print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
                writer.add_scalar("charts/episodic_return", item["episode"]["r"], global_step)
                writer.add_scalar("charts/episodic_length", item["episode"]["l"], global_step)
                break
        
        with torch.no_grad():
            next_value = agent.get_value(next_obs).reshape(-1, 1)

AttributeError: 'str' object has no attribute 'keys'