In [1]:
import gymnasium as gym
from gymnasium.wrappers.vector import NumpyToTorch
import torch
import matplotlib.pyplot as plt
from src.ppo import *

In [2]:
# Fix the random seeds up front so repeated runs start from the same random state
# (torch weight initialisation / sampling, action-space sampling, environment resets).
SEED = 0
torch.manual_seed(SEED)

# Number of HalfCheetah copies stepped in parallel.
num_envs = 16
# Build the vector env and wrap it so the env API exchanges torch tensors.
env = NumpyToTorch(gym.make_vec('HalfCheetah-v5', num_envs))

# Seed the action-space sampler and the environments' own RNGs (seeding happens via reset).
# The trailing ';' keeps the reset return value out of the cell output.
env.action_space.seed(SEED)
env.reset(seed=SEED);

In [3]:
# Inspect the observation space exposed by the wrapped vector env.
env.observation_space

Box(-inf, inf, (16, 17), float64)

In [4]:
# Draw one random sample from the observation space to eyeball its layout.
env.observation_space.sample()

array([[-8.53934069e-01, -4.15187798e-01,  2.15971153e+00,
         3.18977664e-01,  8.90886732e-01, -2.87816633e-01,
        -8.90096057e-01,  3.57086495e-02, -4.00794141e-02,
         3.73795318e-02, -5.98736088e-03, -3.38256399e-01,
         8.46463941e-02, -2.71525219e-01, -2.32293166e-01,
        -6.94653659e-01,  5.82397475e-03],
       [-1.90681779e-01, -1.51327776e+00,  2.33368142e+00,
         4.87526861e-01, -2.89424781e-03,  1.20935121e+00,
         1.50221643e+00,  4.59030589e-01, -8.89572036e-01,
        -6.99245821e-01, -3.49327805e+00, -1.75747424e+00,
         9.34470834e-02,  1.01660605e-01, -1.31946997e+00,
        -5.37342227e-01,  1.22197016e+00],
       [ 3.20562220e+00,  1.32939551e+00, -3.35227089e-01,
         5.28397099e-01,  1.31331035e+00,  5.41652722e-01,
         3.35480773e-02,  2.06843330e-01, -2.45977546e-01,
        -9.35136386e-01, -7.50002349e-01, -1.60611459e+00,
         3.27039850e-01, -5.06690855e-01, -1.15816756e-02,
         9.99201001e-01, -1.6

In [5]:
# Shape of a sampled observation batch.
env.observation_space.sample().shape

(16, 17)

In [6]:
# Inspect the action space exposed by the wrapped vector env.
env.action_space

Box(-1.0, 1.0, (16, 6), float32)

In [7]:
# Draw one random sample from the action space to eyeball its layout.
env.action_space.sample()

array([[ 0.2640485 , -0.21387762,  0.31502616, -0.6683145 , -0.67426306,
        -0.4037574 ],
       [-0.85429204, -0.45009747, -0.24800266, -0.52005523, -0.0906904 ,
         0.0472281 ],
       [-0.7820101 , -0.7557616 , -0.3871553 , -0.7653157 , -0.2248515 ,
         0.7055948 ],
       [ 0.47772607,  0.22539389,  0.5294949 ,  0.4373729 ,  0.13195285,
         0.69474113],
       [ 0.6164335 , -0.17133895,  0.3093939 , -0.90942526, -0.4869076 ,
         0.6116383 ],
       [-0.16764264, -0.07907969, -0.2399339 ,  0.78842896,  0.00367948,
        -0.8703105 ],
       [ 0.9854848 , -0.29130217,  0.6343386 ,  0.7737198 , -0.13292608,
        -0.29946578],
       [ 0.81677693,  0.01175282,  0.47978106,  0.4260301 ,  0.21810643,
        -0.4623843 ],
       [-0.06986938,  0.64355445, -0.81893826, -0.16764517,  0.7568629 ,
        -0.89384264],
       [-0.654082  ,  0.05780401,  0.7828481 ,  0.40304184, -0.4847424 ,
        -0.5013304 ],
       [ 0.9014891 ,  0.69990426,  0.20517367,  0.

In [8]:
# Shape of a sampled action batch.
env.action_space.sample().shape

(16, 6)

In [9]:
# Reset the vector env to get an initial observation batch (the second element
# returned by reset is discarded), and draw a random action batch to step with.
sample_state, _ = env.reset()
sample_action = env.action_space.sample()

In [10]:
# The observation space is float64; cast the observations to float32 before
# feeding them to the networks.
sample_state = sample_state.float()

In [11]:
# Gymnasium's step() returns (obs, reward, terminated, truncated, info), in that order.
# Name the two flag tensors accordingly, and derive `done` as "episode ended for
# either reason" so it can be used as an end-of-episode mask.
next_state, reward, terminated, truncated, _ = env.step(sample_action)
done = terminated | truncated

In [12]:
# Show the rewards returned by the step cell above.
reward

tensor([ 0.3689, -0.2075, -0.3452, -0.2118,  0.5567,  0.1170,  0.2776, -0.0058,
        -0.5314, -0.3450, -0.9260, -1.0288, -0.1353, -0.0359, -0.6564,  0.4207],
       dtype=torch.float64)

In [13]:
# Show the `done` flags produced by the step cell above.
done

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False])

In [14]:
# Show the `terminated` flags produced by the step cell above.
terminated

tensor([False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False])

In [15]:
# Size the networks from the batched spaces, whose shapes are
# (num_envs, feature_dim), instead of hard-coding the observation width.
in_features = env.observation_space.shape[1]
out_features = env.action_space.shape[1]
hidden_size = 256
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# NOTE(review): the meaning of the 4th Actor argument (True) is defined in src.ppo
# and is not visible here — confirm against the Actor constructor.
actor = Actor(in_features, out_features, hidden_size, True).to(device)
# The critic is built with an output size of 1.
critic = Critic(in_features, 1, hidden_size).to(device)

  return torch._C._cuda_getDeviceCount() > 0


In [16]:
# Call the module itself rather than .forward(): nn.Module.__call__ runs any
# registered hooks before dispatching to forward, whereas calling .forward()
# directly silently skips them. (Assumes Actor is a torch.nn.Module, as its
# .to(device) usage suggests.)
dist = actor(sample_state.to(device))
dist

Normal(loc: torch.Size([16, 6]), scale: torch.Size([16, 6]))

In [17]:
# Draw one sample from the distribution returned by the actor.
# NOTE(review): these samples are not limited to the Box(-1.0, 1.0) bounds reported
# by env.action_space — confirm the training code clips or squashes them.
sample = dist.sample()
sample

tensor([[ 0.2908, -0.0482, -0.6239,  1.5232, -1.0395,  0.8230],
        [ 1.5601, -0.9407,  1.1642, -0.9061,  1.5443,  2.1815],
        [ 0.7084, -0.4770, -0.5627,  0.1744,  2.4641,  1.5010],
        [ 0.5201,  1.2760, -1.6320,  0.7377, -1.4821, -0.7474],
        [-1.1096, -0.3956,  0.9914, -0.1303,  0.2891,  0.5059],
        [-0.3033, -0.1859,  1.2410,  1.7590,  1.3449,  0.7655],
        [ 0.5111,  0.4649,  0.8023,  0.5425,  1.4005,  2.2546],
        [ 1.0867, -0.3638,  0.5517, -2.6344, -2.3201, -1.6158],
        [ 0.0598, -1.3480, -0.2545, -0.3957,  0.2394,  3.3644],
        [ 0.6493,  0.8463, -0.1774,  0.3587, -0.9915,  0.9768],
        [-0.6906, -0.2653, -0.5988, -0.7193,  0.9595,  1.2417],
        [ 0.6123, -1.2001,  1.3169, -0.9664,  0.3326, -1.1760],
        [-0.6160, -0.2433, -1.2729, -0.2269, -0.3267, -0.0141],
        [ 0.0667, -0.9374,  0.3097,  1.8253, -0.0203,  0.4847],
        [-0.7800, -1.3426, -0.9352, -1.7733, -0.2147, -0.2393],
        [-0.1322, -1.0930,  0.3751, -1.6

In [18]:
# Log-probability of the sampled values under `dist`, evaluated element-wise
# (the result keeps the same shape as `sample`).
dist.log_prob(sample)

tensor([[-0.9459, -0.9191, -1.0798, -2.0587, -1.4179, -1.2809],
        [-2.0428, -1.3020, -1.6739, -1.3357, -2.1714, -3.3712],
        [-1.1280, -1.0017, -1.0412, -0.9319, -4.0684, -2.1004],
        [-1.0234, -1.8241, -2.1461, -1.1839, -1.9583, -1.1730],
        [-1.6085, -0.9743, -1.4791, -0.9304, -0.9749, -1.0637],
        [-0.9853, -0.9257, -1.7694, -2.4472, -1.8862, -1.2383],
        [-1.0193, -1.0632, -1.2969, -1.0594, -1.9565, -3.5329],
        [-1.4434, -0.9621, -1.1060, -4.4161, -3.5057, -2.1683],
        [-0.9189, -1.7306, -0.9375, -1.0029, -0.9598, -6.6987],
        [-1.0949, -1.3353, -0.9255, -0.9798, -1.3742, -1.4285],
        [-1.1995, -0.9387, -1.0615, -1.1854, -1.4226, -1.7308],
        [-1.0707, -1.5540, -1.8744, -1.3961, -0.9880, -1.5806],
        [-1.1472, -0.9340, -1.6409, -0.9475, -0.9586, -0.9191],
        [-0.9190, -1.3007, -0.9883, -2.5671, -0.9191, -1.0529],
        [-1.2708, -1.7252, -1.2958, -2.4963, -0.9344, -0.9401],
        [-0.9383, -1.4464, -1.0181, -2.2

In [19]:
# Exponentiate the log-probabilities to get the corresponding density values.
dist.log_prob(sample).exp()

tensor([[0.3883, 0.3989, 0.3397, 0.1276, 0.2422, 0.2778],
        [0.1297, 0.2720, 0.1875, 0.2630, 0.1140, 0.0343],
        [0.3237, 0.3673, 0.3530, 0.3938, 0.0171, 0.1224],
        [0.3594, 0.1614, 0.1169, 0.3061, 0.1411, 0.3094],
        [0.2002, 0.3774, 0.2278, 0.3944, 0.3772, 0.3452],
        [0.3733, 0.3962, 0.1704, 0.0865, 0.1516, 0.2899],
        [0.3608, 0.3454, 0.2734, 0.3467, 0.1414, 0.0292],
        [0.2361, 0.3821, 0.3309, 0.0121, 0.0300, 0.1144],
        [0.3989, 0.1772, 0.3916, 0.3668, 0.3830, 0.0012],
        [0.3346, 0.2631, 0.3963, 0.3754, 0.2530, 0.2397],
        [0.3013, 0.3911, 0.3459, 0.3056, 0.2411, 0.1771],
        [0.3428, 0.2114, 0.1534, 0.2476, 0.3723, 0.2058],
        [0.3175, 0.3930, 0.1938, 0.3877, 0.3834, 0.3989],
        [0.3989, 0.2723, 0.3722, 0.0768, 0.3989, 0.3489],
        [0.2806, 0.1781, 0.2737, 0.0824, 0.3928, 0.3906],
        [0.3913, 0.2354, 0.3613, 0.1017, 0.3989, 0.3874]],
       grad_fn=<ExpBackward0>)

In [None]:
# Call the module itself rather than .forward() so nn.Module.__call__ can run any
# registered hooks. (Assumes Critic is a torch.nn.Module, as its .to(device)
# usage suggests.)
critic(sample_state.to(device))

tensor([[-0.0248],
        [-0.0377],
        [-0.0276],
        [-0.0370],
        [-0.0354],
        [-0.0278],
        [-0.0337],
        [-0.0258],
        [-0.0375],
        [-0.0426],
        [-0.0321],
        [-0.0416],
        [-0.0296],
        [-0.0289],
        [-0.0301],
        [-0.0455]], grad_fn=<AddmmBackward0>)

In [21]:
# Hyperparameters handed to the Agent. The meanings noted below are inferred from
# the names only — confirm against src.ppo.Agent.
epsilon, gamma, lam = 0.2, 0.99, 0.95  # presumably clip range, discount factor, GAE lambda
c1, c2 = 0.5, 0.01                     # presumably value-loss and entropy coefficients
actor_lr = critic_lr = 3e-4            # same learning rate for both networks
batch_size = 64

# Arguments are passed positionally, in the same order as the original call.
agent = Agent(
    actor, critic,
    epsilon, gamma, lam,
    c1, c2,
    actor_lr, critic_lr,
    device, batch_size,
)

In [None]:
# Launch training. Arguments stay positional because train's signature lives in
# src.ppo and is not visible here; the per-argument notes are guesses to confirm.
train(
    env,
    agent,
    num_envs,
    1000,                     # presumably the number of training iterations ("episode" in the log)
    512,                      # presumably steps per iteration (the log prints "number of steps: 512")
    4,                        # NOTE(review): meaning not visible here — confirm
    'models/halfcheetah.pt',  # presumably where the best model is saved ("new best model... saving...")
    True,                     # NOTE(review): flag meaning not visible here — confirm
)

finished episode: 0
total reward: -4922.101401027502
number of steps: 512
---------------
new best model... saving...
finished episode: 1
total reward: -5027.751447337744
number of steps: 512
---------------
finished episode: 2
total reward: -4962.422722528857
number of steps: 512
---------------
finished episode: 3
total reward: -4533.981784855765
number of steps: 512
---------------
new best model... saving...
finished episode: 4
total reward: -4442.382312998708
number of steps: 512
---------------
new best model... saving...
finished episode: 5
total reward: -4533.637395424885
number of steps: 512
---------------
finished episode: 6
total reward: -4172.444842753126
number of steps: 512
---------------
new best model... saving...
finished episode: 7
total reward: -4424.0309468712485
number of steps: 512
---------------
finished episode: 8
total reward: -3724.611578120898
number of steps: 512
---------------
new best model... saving...
finished episode: 9
total reward: -3907.266617092