In [1]:
# docs and experiment results can be found at https://docs.cleanrl.dev/rl-algorithms/ppo/#ppopy

import argparse
import os
import random
import time
from distutils.util import strtobool

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical
from torch.utils.tensorboard import SummaryWriter

In [58]:
# Hyperparameters for the run, kept in a plain dict so they can be edited in one
# place inside the notebook.
params = dict(
    # --- experiment bookkeeping ---
    # exp_name=os.path.basename(__file__).rstrip(".py"),
    exp_name="clean",
    seed=1,
    torch_deterministic=True,
    cuda=True,
    track=False,
    wandb_project_name="cleanRL",
    wandb_entity=None,
    capture_video=False,
    # --- environment / rollout ---
    env_id="CartPole-v1",
    total_timesteps=500000,
    learning_rate=2.5e-4,
    num_envs=4,
    num_steps=128,
    anneal_lr=True,
    # --- PPO ---
    gamma=0.99,
    gae_lambda=0.95,
    num_minibatches=4,
    update_epochs=4,
    norm_adv=True,
    clip_coef=0.2,
    clip_vloss=True,
    ent_coef=0.01,
    vf_coef=0.5,
    max_grad_norm=0.5,
    # --- additional switches (not read by the cells visible in this notebook) ---
    use_tfboard=True,
    lr_schedule="linear",
    debug=False,
    lr_warmup=0,
    use_sde=False,
    sde_sample_freq=4,
    sde_support_size=100,
    sde_num_atoms=50,
    sde_scaling=0.5,
    use_rnd=False,
    rnd_normalize_reward=False,
    rnd_normalize_observation=False,
    rnd_ignore_done=False,
    no_tensorboard=False,
    no_wandb=False,
)

# Derived sizes: one rollout holds num_envs * num_steps transitions, which is
# then split into num_minibatches equally sized minibatches.
params["batch_size"] = int(params["num_steps"] * params["num_envs"])
params["minibatch_size"] = int(params["batch_size"] // params["num_minibatches"])

In [59]:
# Wrap the hyperparameter dict in a Namespace so values are read as attributes
# (args.seed, args.env_id, ...).
args = argparse.Namespace(**params)

In [60]:
def make_env(env_id, seed, idx, capture_video, run_name):
    """Return a zero-argument factory that builds one wrapped environment.

    Args:
        env_id: id passed to ``gym.make``.
        seed: seed applied to the environment's action and observation spaces.
        idx: index of this environment; video is only recorded for index 0.
        capture_video: when truthy (and ``idx == 0``) wrap the env in
            ``RecordVideo`` writing to ``videos/{run_name}``.
        run_name: name used for the video output directory.

    Returns:
        A callable taking no arguments that creates and returns the environment.
    """

    def build():
        env = gym.wrappers.RecordEpisodeStatistics(gym.make(env_id))
        if capture_video and idx == 0:
            env = gym.wrappers.RecordVideo(env, f"videos/{run_name}")

        # Only the action/observation spaces are seeded here; the environment
        # itself is not seeded inside this factory.
        env.action_space.seed(seed)
        env.observation_space.seed(seed)
        return env

    return build

In [61]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Initialise ``layer`` in place and return it.

    The weight receives an orthogonal initialisation with gain ``std`` and every
    bias entry is set to ``bias_const``.

    Args:
        layer: module exposing ``weight`` and ``bias`` tensors (e.g. ``nn.Linear``).
        std: gain passed to the orthogonal initialiser.
        bias_const: constant written to the bias.

    Returns:
        The same ``layer`` object, so the call can be nested inside a model definition.
    """
    init = torch.nn.init
    init.orthogonal_(layer.weight, gain=std)
    init.constant_(layer.bias, val=bias_const)
    return layer

In [62]:
# class Agent(nn.Module):
#     def __init__(self, n_observations, n_actions):
#         super().__init__()
#         self.critic = nn.Sequential(
#             layer_init(nn.Linear(n_observations, 64)),
#             nn.Tanh(),
#             layer_init(nn.Linear(64, 64)),
#             nn.Tanh(),
#             layer_init(nn.Linear(64, 1), std=1.0),
#         )
#         self.actor = nn.Sequential(
#             layer_init(nn.Linear(n_observations, 64)),
#             nn.Tanh(),
#             layer_init(nn.Linear(64, 64)),
#             nn.Tanh(),
#             layer_init(nn.Linear(64, n_actions), std=0.01),
#         )

#     def get_value(self, x):
#         return self.critic(x)

#     def get_action_and_value(self, x, action=None):
#         logits = self.actor(x)
#         probs = Categorical(logits=logits)
#         if action is None:
#             action = probs.sample()
#         return action, probs.log_prob(action), probs.entropy(), self.critic(x)

In [63]:
class Agent(nn.Module):
    """Actor-critic model made of two separate 64-64 tanh MLPs.

    ``critic`` maps a flattened observation to a single value and ``actor`` maps
    it to one logit per discrete action.
    """

    def __init__(self, envs):
        """Build both networks from the spaces exposed by ``envs``.

        Args:
            envs: object providing ``single_observation_space.shape`` and
                ``single_action_space.n``.
        """
        super().__init__()
        obs_dim = np.array(envs.single_observation_space.shape).prod()
        n_actions = envs.single_action_space.n

        def mlp(out_features, out_std):
            # Layers are created in input -> output order; the critic is built
            # before the actor so random initialisation is drawn in a fixed order.
            return nn.Sequential(
                layer_init(nn.Linear(obs_dim, 64)),
                nn.Tanh(),
                layer_init(nn.Linear(64, 64)),
                nn.Tanh(),
                layer_init(nn.Linear(64, out_features), std=out_std),
            )

        self.critic = mlp(1, 1.0)
        self.actor = mlp(n_actions, 0.01)

    def get_value(self, x):
        """Return the critic's output for ``x``."""
        return self.critic(x)

    def get_action_and_value(self, x, action=None):
        """Evaluate the policy and the critic on ``x``.

        Args:
            x: input fed to both networks.
            action: action(s) to score; when ``None`` one is sampled from the policy.

        Returns:
            Tuple ``(action, log_prob, entropy, value)`` where ``log_prob`` and
            ``entropy`` come from the categorical distribution over the actor's
            logits and ``value`` is the critic's output.
        """
        dist = Categorical(logits=self.actor(x))
        if action is None:
            action = dist.sample()
        return action, dist.log_prob(action), dist.entropy(), self.critic(x)

In [64]:
# env_id = "CartPole-v1"

In [65]:
# env = gym.make(env_id)
# n_observations = env.observation_space.shape[0]
# n_actions = env.action_space.n

In [66]:
# Display the configured environment id.
args.env_id

'CartPole-v1'

In [67]:
# Run identifier: environment id, experiment name, seed and launch timestamp.
# It is reused for the wandb run name, the TensorBoard log dir and the video dir.
run_name = f"{args.env_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
if args.track:
    # Imported lazily so wandb is only required when tracking is enabled.
    import wandb

    wandb.init(
        project=args.wandb_project_name,
        entity=args.wandb_entity,
        sync_tensorboard=True,
        config=vars(args),
        name=run_name,
        monitor_gym=True,
        save_code=True,
    )
# The TensorBoard writer is created unconditionally, whether or not tracking is on.
writer = SummaryWriter(f"runs/{run_name}")
# Log every hyperparameter as one markdown table (one row per args attribute).
writer.add_text(
    "hyperparameters",
    "|param|value|\n|-|-|\n%s" % ("\n".join([f"|{key}|{value}|" for key, value in vars(args).items()])),
)

# TRY NOT TO MODIFY: seeding
# Seeds Python's `random`, NumPy and PyTorch from the single args.seed value.
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.backends.cudnn.deterministic = args.torch_deterministic

# Use the GPU only when it is both available and requested via args.cuda.
device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")

# env setup
# Each environment factory receives a seed drawn from Python's `random`, which was
# seeded just above, so the draws repeat for a given args.seed when this cell is
# run as a whole. The draws are independent, so two environments may get the same seed.
envs = gym.vector.SyncVectorEnv(
    # [make_env(args.env_id, args.seed + i, i, args.capture_video, run_name) for i in range(args.num_envs)]
    [make_env(args.env_id, random.randint(3, 1000), i, args.capture_video, run_name) for i in range(args.num_envs)]
)
assert isinstance(envs.single_action_space, gym.spaces.Discrete), "only discrete action space is supported"

In [68]:
# agent = Agent(n_observations, n_actions)
#optimizer = optim.Adam(agent.parameters(), lr=1e-3, eps=1e-5)

In [69]:
# Build the actor-critic model on the selected device and a single Adam optimizer
# over all of its parameters (actor and critic together).
agent = Agent(envs).to(device)
optimizer = optim.Adam(agent.parameters(), lr=args.learning_rate, eps=1e-5)

In [108]:
# ALGO Logic: Storage setup
# Zero-filled rollout buffers; every buffer is indexed as [step, env, ...].
rollout_shape = (args.num_steps, args.num_envs)
obs = torch.zeros(rollout_shape + envs.single_observation_space.shape, device=device)
actions = torch.zeros(rollout_shape + envs.single_action_space.shape, device=device)
logprobs = torch.zeros(rollout_shape, device=device)
rewards = torch.zeros(rollout_shape, device=device)
dones = torch.zeros(rollout_shape, device=device)
values = torch.zeros(rollout_shape, device=device)

In [109]:
# TRY NOT TO MODIFY: start the game
global_step = 0
start_time = time.time()

# Seed the environments through reset(). With the gym API used in this notebook
# (reset returns (obs, info), step returns five values) there is no other place
# where the environment dynamics get seeded, so without it the initial states and
# therefore the whole rollout would differ between runs despite args.seed.
_state, _ = envs.reset(seed=args.seed)
next_obs = torch.Tensor(_state).to(device)
# No environment starts in a finished state.
next_done = torch.zeros(args.num_envs).to(device)
# Number of rollout/update iterations needed to consume total_timesteps.
num_updates = args.total_timesteps // args.batch_size

In [110]:
# Display how many rollout/update iterations will be run.
num_updates

976

In [111]:
# Main loop: each iteration collects one rollout of num_steps steps from every
# environment, computes advantages/returns and flattens the batch.
for update in range(1, num_updates + 1):

    # TODO: anneal learning rate schedule (not implemented in this cell yet)

    # --- Rollout phase: fill the [step, env] buffers with fresh experience ---
    for step in range(0, args.num_steps):
        global_step += 1 * args.num_envs

        # Store the observation/done flag the action below is conditioned on.
        obs[step] = next_obs
        dones[step] = next_done

        # Acting needs no gradients; they are recomputed during optimisation.
        with torch.no_grad():
            action, log_prob, _entropy, critic_value = agent.get_action_and_value(next_obs)
            values[step] = critic_value.flatten()

        actions[step] = action
        logprobs[step] = log_prob

        next_obs, reward, terminated, truncated, info = envs.step(action.cpu().numpy())
        # An episode is over when it terminates OR is truncated (e.g. by a time
        # limit). Dropping the truncation flag would let the advantage estimate
        # below bootstrap across an episode boundary.
        # NOTE(review): assumes the vector env auto-resets finished sub-envs, so
        # next_obs already belongs to the new episode - confirm for the installed gym.
        done = np.logical_or(terminated, truncated)
        rewards[step] = torch.tensor(reward).to(device).view(-1)
        next_obs, next_done = torch.Tensor(next_obs).to(device), torch.Tensor(done).to(device)

    # --- Generalized Advantage Estimation (GAE), computed backwards in time ---
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_{t+1}) - V(s_t)
    #   A_t     = delta_t + gamma * lambda * (1 - done_{t+1}) * A_{t+1}
    # The (1 - done) factor zeroes both the bootstrap value and the accumulated
    # advantage whenever step t+1 starts a new episode.
    with torch.no_grad():
        # Value of the observation after the last stored step, used to bootstrap.
        next_value = agent.get_value(next_obs).reshape(1, -1)
        advantages = torch.zeros_like(rewards).to(device)
        lastgaelam = 0
        for t in reversed(range(args.num_steps)):
            if t == args.num_steps - 1:
                nextnonterminal = 1.0 - next_done
                nextvalues = next_value
            else:
                nextnonterminal = 1.0 - dones[t + 1]
                nextvalues = values[t + 1]
            delta = rewards[t] + args.gamma * nextvalues * nextnonterminal - values[t]
            advantages[t] = lastgaelam = delta + args.gamma * args.gae_lambda * nextnonterminal * lastgaelam
        # Regression targets for the critic.
        returns = advantages + values

    # flatten the batch: merge the step and env axes into one batch axis
    b_obs = obs.reshape((-1,) + envs.single_observation_space.shape)
    b_logprobs = logprobs.reshape(-1)
    b_actions = actions.reshape((-1,) + envs.single_action_space.shape)
    b_advantages = advantages.reshape(-1)
    b_returns = returns.reshape(-1)
    b_values = values.reshape(-1)

    # TODO: optimize the policy and value network (not implemented in this cell yet)
    

KeyboardInterrupt: 