In [1]:
import gymnasium as gym
import torch
import sys

In [2]:
env = gym.make_vec('LunarLander-v3', num_envs = 4)

In [3]:
env.observation_space

Box([[ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]
 [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]
 [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]
 [ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
   -0.         -0.       ]], [[ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]
 [ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]
 [ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]
 [ 2.5        2.5       10.        10.         6.2831855 10.
   1.         1.       ]], (4, 8), float32)

In [4]:
obs_sample = env.observation_space.sample()
obs_sample

array([[ 0.44748777, -2.312572  ,  1.2642175 ,  9.7698    , -1.2552831 ,
         8.596922  ,  0.21509914,  0.25256023],
       [-0.945145  , -0.9545386 , -2.4618263 , -3.597326  ,  0.59580344,
         3.2782707 ,  0.7986262 ,  0.1646779 ],
       [-0.765997  ,  1.5410225 , -1.4091359 , -0.358053  ,  1.5071638 ,
        -3.1281674 ,  0.03703095,  0.6593081 ],
       [-0.6173986 , -1.6826386 ,  1.0004673 ,  3.242899  , -5.3144646 ,
         4.4133782 ,  0.22535437,  0.23393273]], dtype=float32)

In [5]:
obs_sample.shape

(4, 8)

In [6]:
env.action_space

MultiDiscrete([4 4 4 4])

In [7]:
actions_sample = env.action_space.sample()
actions_sample

array([3, 1, 3, 1])

In [8]:
states, info = env.reset()
states

array([[-1.1797905e-03,  1.4014894e+00, -1.1951701e-01, -4.1913673e-01,
         1.3738930e-03,  2.7072394e-02,  0.0000000e+00,  0.0000000e+00],
       [ 6.7546843e-03,  1.4133019e+00,  6.8415338e-01,  1.0584520e-01,
        -7.8201331e-03, -1.5497103e-01,  0.0000000e+00,  0.0000000e+00],
       [ 4.6447753e-03,  1.4069952e+00,  4.7044951e-01, -1.7444031e-01,
        -5.3753429e-03, -1.0656377e-01,  0.0000000e+00,  0.0000000e+00],
       [-3.4833909e-03,  1.4201725e+00, -3.5283408e-01,  4.1121399e-01,
         4.0430767e-03,  7.9922289e-02,  0.0000000e+00,  0.0000000e+00]],
      dtype=float32)

In [9]:
next_states, rewards, dones, terminated, _ = env.step(actions_sample)

In [10]:
next_states

array([[-2.2675514e-03,  1.3914794e+00, -1.0777588e-01, -4.4488612e-01,
         3.8926295e-04, -1.9693913e-02,  0.0000000e+00,  0.0000000e+00],
       [ 1.3423729e-02,  1.4151100e+00,  6.7246151e-01,  8.0320738e-02,
        -1.3318105e-02, -1.0996882e-01,  0.0000000e+00,  0.0000000e+00],
       [ 9.3564987e-03,  1.4025036e+00,  4.7819933e-01, -1.9966845e-01,
        -1.2321948e-02, -1.3894440e-01,  0.0000000e+00,  0.0000000e+00],
       [-7.0319176e-03,  1.4288518e+00, -3.6050764e-01,  3.8572267e-01,
         9.6268412e-03,  1.1168589e-01,  0.0000000e+00,  0.0000000e+00]],
      dtype=float32)

In [11]:
rewards

array([-1.12174097,  0.73976798, -1.92400182, -0.07033965])

In [12]:
dones # done is when state successfully finished or passed

array([False, False, False, False])

In [13]:
terminated # terminated is when agent failed 

array([False, False, False, False])

In [14]:
from src.ppo import *
from src.tune_hyperparams import return_obj
import optuna
import random
import numpy as np

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [15]:
actor = Actor(8, 4, 256).to(device)
critic = Critic(8, 1, 256).to(device)

In [16]:
obs_sample = torch.from_numpy(obs_sample).to(device)
dist = actor.forward(obs_sample)
dist

Categorical(logits: torch.Size([4, 4]))

In [17]:
test_sample = dist.sample()

In [18]:
dist.log_prob(test_sample)

tensor([-1.4937, -1.5034, -1.2137, -1.6737], device='cuda:0',
       grad_fn=<SqueezeBackward1>)

In [19]:
dist.log_prob(test_sample).exp()

tensor([0.2245, 0.2224, 0.2971, 0.1876], device='cuda:0',
       grad_fn=<ExpBackward0>)

In [20]:
critic.forward(obs_sample)

tensor([[-0.3053],
        [-0.0186],
        [-0.2330],
        [ 0.0091]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [None]:
params = {
    "actor_in_feats": (8, 8, "int"),
    "actor_out_feats": (4, 4, "int"),
    "actor_hs": (64, 256, "int"),
    "critic_in_feats": (8, 8, "int"),
    "critic_out_feats": (1, 1, "int"),
    "critic_hs": (64, 256, "int"),
    "epsilon": (0.1, 0.3, "float"),
    "gamma": (0.9, 0.999, "float"),
    "lambda": (0.9, 1.0, "float"),
    "actor_lr": (1e-5, 1e-3, "float"),
    "critic_lr": (1e-5, 1e-3, "float"),
    "batch_size": (4, 256, "int"),
    "c1": (0.01, 0.2, "int"),
    "c2": (0.01, 0.2, "int")
}

objective = return_obj(env, device, 500, 128, 4, train, params)
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 50)

[I 2025-10-09 22:56:53,234] A new study created in memory with name: no-name-336e2537-7f8a-400a-9e0f-c51ae9886f76
[I 2025-10-09 22:58:43,654] Trial 0 finished with value: -101676.68285359218 and parameters: {'actor_in_feats': 8, 'actor_out_feats': 4, 'actor_hs': 200, 'critic_in_feats': 8, 'critic_out_feats': 1, 'critic_hs': 132, 'epsilon': 0.22043369308674093, 'gamma': 0.9578118925872531, 'lambda': 0.9206397160348497, 'actor_lr': 0.00025512010401112205, 'critic_lr': 0.0009802859534456775, 'batch_size': 241, 'c1': 0, 'c2': 0}. Best is trial 0 with value: -101676.68285359218.
[I 2025-10-09 23:00:42,441] Trial 1 finished with value: -89748.64705340193 and parameters: {'actor_in_feats': 8, 'actor_out_feats': 4, 'actor_hs': 67, 'critic_in_feats': 8, 'critic_out_feats': 1, 'critic_hs': 68, 'epsilon': 0.10679724150952885, 'gamma': 0.9624087880857666, 'lambda': 0.978809555396285, 'actor_lr': 0.00011149556645605471, 'critic_lr': 0.00037268261485302786, 'batch_size': 129, 'c1': 0, 'c2': 0}. Best

In [None]:
study.best_params

{'actor_in_feats': 8,
 'actor_out_feats': 4,
 'actor_hs': 186,
 'critic_in_feats': 8,
 'critic_out_feats': 1,
 'critic_hs': 232,
 'epsilon': 0.2975842809617252,
 'gamma': 0.9985670435741899,
 'lambda': 0.9650146270424367,
 'actor_lr': 0.0002229043296480284,
 'critic_lr': 0.0003966104592736233,
 'batch_size': 137}

In [None]:
actor = Actor(8, 4, study.best_params['actor_hs'])
critic = Critic(8, 1, study.best_params['critic_hs'])

agent = Agent(
    actor, 
    critic, 
    study.best_params['epsilon'], 
    study.best_params['gamma'], 
    study.best_params['lamda'], 
    study.best_params['c1'], 
    study.best_params['c2'],
    study.best_params['actor_lr'],
    study.best_params['critic_lr'], 
    device, 
    study.best_params['batch_size']
)

In [None]:
train(env, agent, 3000, 128, 4)

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize = (15, 5))
plt.plot(agent.all_rewards)
plt.xlabel('episode')
plt.ylabel('reward')

In [None]:
env.close()
eval_env = gym.make('LunarLander-v3')

In [None]:
evaluate(eval_env, agent)