In [1]:
import gymnasium as gym
import torch
import sys

In [2]:
# Vectorized environment: 4 LunarLander-v3 copies behind one object, so observations,
# actions and rewards are batched with a leading dimension of 4.
env = gym.make_vec('LunarLander-v3', num_envs = 4)

Checking out environments

In [3]:
env.observation_space

Box([[ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]
 [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]
 [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]
 [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]], [[ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]
 [ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]
 [ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]
 [ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]], (4, 8), float32)

In [4]:
obs_sample = env.observation_space.sample()
obs_sample

array([[-0.03253634, -0.88300264, -7.865607  , -5.336766  ,  0.434043  ,
        -7.9357414 ,  0.9586344 ,  0.5768451 ],
       [ 1.093525  , -1.5775923 , -7.3541307 , -9.811705  ,  3.937337  ,
        -3.7853758 ,  0.27572083,  0.804381  ],
       [-0.7951227 ,  1.5547985 ,  2.4960365 ,  3.3656154 , -5.766545  ,
        -0.26347592,  0.30940792,  0.22402176],
       [ 0.05055139, -0.45805088, -3.811204  ,  8.166333  , -3.594789  ,
        -4.828926  ,  0.23119257,  0.9492218 ]], dtype=float32)

In [5]:
obs_sample.shape

(4, 8)

In [6]:
env.action_space

MultiDiscrete([4 4 4 4])

In [7]:
actions_sample = env.action_space.sample()
actions_sample

array([0, 2, 1, 1])

In [8]:
states, info = env.reset()
states

array([[ 0.00305958,  1.4203336 ,  0.30988377,  0.41837493, -0.00353846,
        -0.07019331,  0.        ,  0.        ],
       [-0.00247841,  1.3984988 , -0.2510584 , -0.5520632 ,  0.00287873,
         0.05686847,  0.        ,  0.        ],
       [ 0.00156975,  1.4137702 ,  0.15898359,  0.12667738, -0.00181216,
        -0.03601218,  0.        ,  0.        ],
       [-0.00418701,  1.4193463 , -0.42412192,  0.37449604,  0.00485856,
         0.09607007,  0.        ,  0.        ]], dtype=float32)

In [9]:
# NOTE(review): Gymnasium's step() returns (observations, rewards, terminations, truncations, infos).
# With the names used here, `dones` therefore holds the termination flags and `terminated`
# holds the truncation flags; later cells read these names, so they are left unchanged.
next_states, rewards, dones, terminated, _ = env.step(actions_sample)

In [10]:
next_states

array([[ 6.1192513e-03,  1.4291688e+00,  3.0946472e-01,  3.9266151e-01,
        -7.0069069e-03, -6.9374286e-02,  0.0000000e+00,  0.0000000e+00],
       [-5.0230981e-03,  1.3866826e+00, -2.5699800e-01, -5.2516639e-01,
         5.3823702e-03,  5.0078470e-02,  0.0000000e+00,  0.0000000e+00],
       [ 3.0511855e-03,  1.4160452e+00,  1.4769444e-01,  1.0111109e-01,
        -1.3705736e-03,  8.8333674e-03,  0.0000000e+00,  0.0000000e+00],
       [-8.4383013e-03,  1.4272060e+00, -4.3155876e-01,  3.4928420e-01,
         1.1211531e-02,  1.2707140e-01,  0.0000000e+00,  0.0000000e+00]],
      dtype=float32)

In [11]:
rewards

array([ 0.83746879,  2.80967754,  2.21555632, -0.39298463])

In [12]:
dones # 3rd value returned by env.step(): Gymnasium's "terminations" flags (episode reached a terminal state, success or failure)

array([False, False, False, False])

In [13]:
terminated # 4th value returned by env.step(): Gymnasium's "truncations" flags (episode cut short, e.g. by a time limit), not failures

array([False, False, False, False])

Setting up networks

In [14]:
import torch.nn as nn 
import torch.optim as opt
import random
from torch.distributions.categorical import Categorical
import numpy as np

# Seed every random number generator the notebook uses so runs are repeatable.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

# Always a torch.device, so downstream code sees one consistent type on both
# CPU-only and CUDA machines (never a bare 'cpu' string).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [None]:
class Actor(nn.Module):
    """Policy network: a two-hidden-layer MLP mapping states to a categorical action distribution.

    Args:
        in_features: size of one state vector.
        out_features: number of discrete actions.
        hidden_size: width of both hidden layers.
    """

    def __init__(self, in_features: int, out_features: int, hidden_size: int):
        super().__init__()
        # Kept as attributes so Agent.save_agent can store them and the network can be rebuilt.
        self.in_features = in_features
        self.out_features = out_features
        self.hidden_size = hidden_size
        self.layers = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_features),
            nn.Softmax(-1)  # normalise over the action dimension
        )

    def forward(self, states: torch.Tensor) -> Categorical:
        """Return the action distribution for ``states``.

        Args:
            states: tensor whose last dimension has size ``in_features``.

        Returns:
            A ``Categorical`` over ``out_features`` actions, one per leading index of ``states``.
        """
        action_probs = self.layers(states)
        return Categorical(probs=action_probs)


class Critic(nn.Module):
    """Value network: a two-hidden-layer MLP mapping states to ``out_features`` outputs.

    Args:
        in_features: size of one state vector.
        out_features: number of outputs per state (the notebook uses 1, a scalar state value).
        hidden_size: width of both hidden layers.
    """

    def __init__(self, in_features: int, out_features: int, hidden_size: int):
        super().__init__()
        # Kept as attributes so Agent.save_agent can store them and the network can be rebuilt.
        self.in_features = in_features
        self.out_features = out_features
        self.hidden_size = hidden_size
        self.layers = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, out_features)
        )

    def forward(self, states: torch.Tensor) -> torch.Tensor:
        """Return the raw (unactivated) network output for ``states``.

        Args:
            states: tensor whose last dimension has size ``in_features``.

        Returns:
            Tensor with the same leading dimensions and a last dimension of ``out_features``.
        """
        return self.layers(states)

In [None]:
class Agent():
    """PPO (clipped surrogate objective) agent training an actor and a critic on a vectorized env.

    Args:
        env: vectorized environment exposing ``num_envs``, ``reset()`` and ``step(actions)``.
        actor: policy network; calling it on a batch of states must return a distribution
            offering ``sample()`` and ``log_prob()``.
        critic: value network; calling it on a batch of states must return one value per
            state (trailing dimension of size 1).
        epsilon: clipping range; the probability ratio is clamped to ``[1 - epsilon, 1 + epsilon]``.
        gamma: reward discount factor.
        lam: GAE lambda (decay applied to the advantage trace together with ``gamma``).
        actor_lr: Adam learning rate for the actor.
        critic_lr: Adam learning rate for the critic.

    Attributes:
        all_rewards: per training iteration, the reward summed over all envs and rollout steps.
        all_steps: per training iteration, the number of rollout steps taken.
    """

    def __init__(self, env, actor: 'Actor', critic: 'Critic', epsilon: float, gamma: float, lam: float, actor_lr: float, critic_lr: float):
        self.env = env
        self.actor = actor
        self.critic = critic
        self.epsilon = epsilon
        self.gamma = gamma
        self.lam = lam
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.actor_opt = opt.Adam(self.actor.parameters(), actor_lr)
        self.critic_opt = opt.Adam(self.critic.parameters(), critic_lr)
        # Defined up front so they can be read (e.g. plotted) even before fit() has run.
        self.all_rewards = []
        self.all_steps = []

    def save_agent(self, path: str = 'cartpole_agent.pt') -> None:
        """Save network weights and the hyperparameters needed to rebuild the agent.

        Optimizer state is not saved.

        Args:
            path: destination file passed to ``torch.save``.
        """
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'hyperparameters': {
                'actor_lr': self.actor_lr,
                'critic_lr': self.critic_lr,
                'epsilon': self.epsilon,
                'gamma': self.gamma,
                'lam': self.lam,
                'actor_in_feats': self.actor.in_features,
                'actor_out_feats': self.actor.out_features,
                'critic_in_feats': self.critic.in_features,
                'critic_out_feats': self.critic.out_features,
                'actor_hidden_size': self.actor.hidden_size,
                'critic_hidden_size': self.critic.hidden_size,
            }
        }, path)

    @staticmethod
    def load_agent(env, path: str, device: str) -> 'Agent':
        """Rebuild an agent from a checkpoint written by ``save_agent``.

        Args:
            env: environment to attach to the restored agent.
            path: checkpoint file.
            device: device the networks are created on and the weights are mapped to.

        Returns:
            A new ``Agent`` with restored weights and hyperparameters (fresh optimizers).
        """
        # map_location lets a checkpoint saved on one device (e.g. CUDA) load on another (e.g. CPU).
        chckpt = torch.load(path, map_location=device)
        hparams = chckpt['hyperparameters']

        actor = Actor(hparams['actor_in_feats'], hparams['actor_out_feats'], hparams['actor_hidden_size']).to(device)
        critic = Critic(hparams['critic_in_feats'], hparams['critic_out_feats'], hparams['critic_hidden_size']).to(device)
        actor.load_state_dict(chckpt['actor'])
        critic.load_state_dict(chckpt['critic'])

        return Agent(
            env,
            actor,
            critic,
            hparams['epsilon'],
            hparams['gamma'],
            hparams['lam'],
            hparams['actor_lr'],
            hparams['critic_lr']
        )

    def select_action(self, states: torch.Tensor) -> tuple:
        """Sample one action per state from the current policy.

        Args:
            states: batch of states.

        Returns:
            ``(actions, log_probs)`` where ``log_probs`` are the log-probabilities of the
            sampled actions under the current policy.
        """
        dist = self.actor(states)
        actions = dist.sample()

        return actions, dist.log_prob(actions)

    def get_state_values(self, states: torch.Tensor) -> torch.Tensor:
        """Return the critic's output for a batch of states."""
        return self.critic(states)

    def update_nets(self, actor_loss: torch.Tensor, critic_loss: torch.Tensor) -> None:
        """Take one optimizer step on the actor, then one on the critic.

        Args:
            actor_loss: scalar loss whose graph reaches the actor parameters.
            critic_loss: scalar loss whose graph reaches the critic parameters.
        """
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        self.critic_opt.zero_grad()
        critic_loss.backward()
        self.critic_opt.step()

    @staticmethod
    def _compute_gae(rewards: torch.Tensor, values: torch.Tensor, next_values: torch.Tensor,
                     terminations: torch.Tensor, dones: torch.Tensor, gamma: float, lam: float) -> torch.Tensor:
        """Generalized Advantage Estimation, computed independently for every environment.

        All tensor arguments have shape ``(timesteps, num_envs)``; flags are floats (1.0 = set).

        Args:
            rewards: reward received at each step.
            values: critic value of the state acted in.
            next_values: critic value of the state reached.
            terminations: 1.0 where the step ended the episode in a terminal state
                (no bootstrapping from ``next_values`` there).
            dones: 1.0 where the episode ended for any reason (the advantage trace is
                not carried across that step).
            gamma: discount factor.
            lam: GAE lambda.

        Returns:
            Advantages with the same shape as ``rewards``.
        """
        advantages = torch.zeros_like(rewards)
        trace = torch.zeros_like(rewards[0])
        # One backward sweep: A_t = delta_t + gamma * lam * A_{t+1}, reset at episode ends.
        for t in reversed(range(rewards.shape[0])):
            delta = rewards[t] + gamma * next_values[t] * (1.0 - terminations[t]) - values[t]
            trace = delta + gamma * lam * (1.0 - dones[t]) * trace
            advantages[t] = trace
        return advantages

    def fit(self, train_iters: int, timesteps: int, K: int, bs: int) -> None:
        """Train with PPO.

        Every training iteration resets the env, collects ``timesteps`` steps from all
        parallel envs, computes GAE advantages, then performs ``K`` updates, each on a
        random minibatch of ``bs`` transitions drawn without replacement from the rollout.

        Args:
            train_iters: number of rollout/update iterations.
            timesteps: env steps collected per iteration (per env).
            K: number of minibatch updates per iteration.
            bs: minibatch size.

        Raises:
            ValueError: if ``bs`` is smaller than 1 or larger than ``num_envs * timesteps``.
        """
        total = self.env.num_envs * timesteps
        if bs > total:
            raise ValueError('batch size cannot be greater than number of environments * timesteps')
        if bs < 1:
            raise ValueError('batch size must be at least 1')

        # Use whatever device the policy lives on instead of relying on a notebook global.
        dev = next(self.actor.parameters()).device

        self.all_rewards = []
        self.all_steps = []

        for train_iter in range(train_iters):
            states_buf, actions_buf, logp_buf = [], [], []
            values_buf, rewards_buf, term_buf, done_buf = [], [], [], []
            ep_reward = 0.0
            ep_steps = 0

            states, _ = self.env.reset()
            states = torch.as_tensor(states, dtype=torch.float32, device=dev)

            # NOTE(review): under Gymnasium's next-step autoreset the transition recorded right
            # after a done flag is a reset step whose action is ignored; such transitions are
            # not filtered out here -- confirm against the installed Gymnasium version.
            for _ in range(timesteps):
                # Rollout data are constants for the update phase, so no graph is needed.
                with torch.no_grad():
                    actions, log_probs = self.select_action(states)
                    values = self.get_state_values(states).squeeze(-1)

                next_states, rewards, terminations, truncations, _ = self.env.step(actions.cpu().numpy())
                next_states = torch.as_tensor(next_states, dtype=torch.float32, device=dev)
                rewards = torch.as_tensor(rewards, dtype=torch.float32, device=dev)
                terminations = torch.as_tensor(terminations, dtype=torch.bool, device=dev)
                truncations = torch.as_tensor(truncations, dtype=torch.bool, device=dev)

                ep_reward += rewards.sum().item()

                states_buf.append(states)
                actions_buf.append(actions)
                logp_buf.append(log_probs)
                values_buf.append(values)
                rewards_buf.append(rewards)
                term_buf.append(terminations.float())
                done_buf.append((terminations | truncations).float())

                states = next_states
                ep_steps += 1

            print('finished episode:', train_iter)
            print('total reward:', ep_reward)
            print('number of steps:', ep_steps)
            print('-' * 15)

            self.all_rewards.append(ep_reward)
            self.all_steps.append(ep_steps)

            with torch.no_grad():
                last_values = self.get_state_values(states).squeeze(-1)

            # Everything below is (timesteps, num_envs): time stays separate from the env axis
            # so advantages are accumulated along each env's own trajectory.
            values = torch.stack(values_buf)
            # The state reached at step t is the state acted in at step t + 1.
            next_values = torch.cat([values[1:], last_values.unsqueeze(0)])
            advantages = self._compute_gae(
                torch.stack(rewards_buf), values, next_values,
                torch.stack(term_buf), torch.stack(done_buf),
                self.gamma, self.lam,
            )
            # Critic targets come from the raw (un-normalized) advantages.
            returns = advantages + values

            # Merge the (timesteps, num_envs) axes into one transition axis.
            flat_states = torch.stack(states_buf).flatten(0, 1)
            flat_actions = torch.stack(actions_buf).flatten(0, 1)
            flat_logp = torch.stack(logp_buf).flatten(0, 1)
            flat_adv = advantages.flatten()
            flat_returns = returns.flatten()

            for _ in range(K):
                idx = torch.tensor(random.sample(range(total), bs), device=dev)
                mb_states = flat_states[idx]

                mb_adv = flat_adv[idx]
                if bs > 1:  # std of a single element is undefined
                    mb_adv = (mb_adv - mb_adv.mean()) / (mb_adv.std() + 1e-8)

                # One batched forward pass gives the new log-probs of the old actions.
                new_logp = self.actor(mb_states).log_prob(flat_actions[idx])
                ratio = (new_logp - flat_logp[idx]).exp()
                clipped_ratio = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)

                # Shapes are both (bs,), so the squared error is element-wise.
                value_pred = self.get_state_values(mb_states).squeeze(-1)
                critic_loss = ((flat_returns[idx] - value_pred) ** 2).mean()
                actor_loss = -torch.min(ratio * mb_adv, clipped_ratio * mb_adv).mean()
                self.update_nets(actor_loss, critic_loss)


In [17]:
# 8 = features per observation and 4 = discrete actions per env (see env.observation_space /
# env.action_space above); 64 = hidden layer width. The critic outputs one value per state.
actor = Actor(8, 4, 64).to(device)
critic = Critic(8, 1, 64).to(device)

In [18]:
obs_sample = torch.from_numpy(obs_sample).to(device)
dist = actor.forward(obs_sample)
dist

Categorical(logits: torch.Size([4, 4]))

In [19]:
test_sample = dist.sample()

In [20]:
dist.log_prob(test_sample)

tensor([-1.4147, -1.5126, -1.3745, -1.6495], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

In [21]:
dist.log_prob(test_sample).exp()

tensor([0.2430, 0.2203, 0.2530, 0.1921], device='cuda:0',
       grad_fn=<ExpBackward0>)

In [22]:
critic.forward(obs_sample)

tensor([[-0.6404],
        [-0.5591],
        [-0.4051],
        [-0.8154]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [23]:
# Hyperparameters passed by keyword so each number is self-describing.
agent = Agent(
    env,
    actor,
    critic,
    epsilon=0.2,      # clipping range for the probability ratio
    gamma=0.999,      # discount factor
    lam=0.98,         # GAE lambda
    actor_lr=0.001,
    critic_lr=0.001,
)

In [24]:
# 800 training iterations of 128 steps per env, each followed by 4 updates on minibatches of 256.
agent.fit(train_iters=800, timesteps=128, K=4, bs=256)

finished episode: 0
total reward: -1026.7723753378766
number of steps: 128
---------------
finished episode: 1
total reward: -562.0362175447955
number of steps: 128
---------------
finished episode: 2
total reward: -352.25816712982686
number of steps: 128
---------------
finished episode: 3
total reward: -707.0751132690879
number of steps: 128
---------------
finished episode: 4
total reward: -589.6928397841383
number of steps: 128
---------------
finished episode: 5
total reward: -628.1823560398917
number of steps: 128
---------------
finished episode: 6
total reward: -1020.8369415126723
number of steps: 128
---------------
finished episode: 7
total reward: -1011.7752862939551
number of steps: 128
---------------
finished episode: 8
total reward: -1044.5596212039877
number of steps: 128
---------------
finished episode: 9
total reward: -1126.720282172506
number of steps: 128
---------------
finished episode: 10
total reward: -253.9549851197329
number of steps: 128
---------------
fini

In [1]:
import matplotlib.pyplot as plt

# Requires `agent` to have been trained in this kernel session (run the fit cell first).
# Each point is the reward summed over all parallel envs for one training iteration.
fig, ax = plt.subplots()
ax.plot(agent.all_rewards)
ax.set_xlabel('episode')
ax.set_ylabel('reward')

NameError: name 'agent' is not defined

In [26]:
agent.env.close()

In [27]:
agent.save_agent('lunarlander_agent.pt')

In [8]:
import torch as T 
start = T.arange(0, 200, 32)
start

tensor([  0,  32,  64,  96, 128, 160, 192])

In [9]:
indices = T.randperm(200)
indices

tensor([ 14, 164,  68,  64, 148, 194, 132,  93,  65,  16, 139,  46, 127,  33,
         60, 147,  84,  30,  39, 190, 126,  99,   9,  54,  12, 171, 178,  55,
         81, 125, 195, 168,  49, 150,  36, 112, 143, 116,  24,  77,  41,  11,
        142, 106, 180, 124, 121,  83,  88, 109,  52,  47,  13,  80, 128, 151,
         66, 113, 182, 166,  32,  22, 181, 174, 119, 107, 122,  94, 146,   3,
         27, 193, 196, 189, 101,  69,  10, 154, 131,  58,  34, 117, 158,   1,
         44, 111,   7,  98, 161,  31,  95, 159,   4,  97,   6, 179,  25, 173,
         57, 176, 165, 110,   0, 155,  71,  48,  62,  42, 149,  18,  59,  53,
         19, 197,  75, 185,  70, 140, 183, 120, 198, 138, 141,  17, 103, 162,
        135, 100,  56,  85, 156,  76, 170,  26, 115,  45,  23,  91, 108,  86,
         37,  61, 188, 114, 175,  43,  29, 157,  38, 129, 163, 199,   2,  15,
        137, 136, 192,  35,  90, 145,  21, 105, 104,  20,   8, 123,  82, 160,
         96,  28,  74, 186, 130,  72,   5, 144, 102, 187, 167,  

In [None]:
batches = [indices[i:i+32] for i in start]
batches

In [7]:
for batch in batches:
    print(batch)

tensor([ 91,  18,  74,  50, 100, 118, 178, 122,  93,  61,  55,  13, 128, 198,
         16, 154,  54,  65, 184,  12, 171,  53,  23,  64, 132, 175, 101, 176,
         20, 170, 144, 181])
tensor([ 15, 195,   6, 165, 127,   2,   4,  42,  37, 114,  66, 147, 191, 143,
         89, 194,  83, 193, 157, 167, 106,  35, 174, 152, 185,  36, 109,  21,
         68,  26,  75, 110])
tensor([ 71, 146,   5,  57,  62, 159, 145, 116, 151, 112,  11, 180,  41,  70,
          0,  52,  90,  27, 108,  80,  82, 150,  73, 136, 173, 119,  63, 115,
        111, 129,  10,  72])
tensor([190,  95, 168,  88, 197,  97, 158, 130,   1,  25, 131,  59,  78,  87,
        113,  77, 188,  44,  92,  43,  56,  46,  28,  33, 124,  67,  86,  40,
        186,  14,  69, 156])
tensor([ 39, 133,  81,  49, 135, 117,   3, 126,  24,  34,  94,  76,  47, 155,
        172, 164,  51,  96, 120, 182,  48, 105, 103, 125,  22, 148, 161, 189,
        149, 107, 160, 169])
tensor([137, 141,  19, 153, 139, 196, 140, 121,  45, 177,  79, 138,  29,  3

In [19]:
indices.shape.numel()

200

In [21]:
indices[1].item()

164

In [22]:
len(indices)

200