In [3]:
# Fetch the VGGT sources into ./vggt.
# The repository is public, so clone over HTTPS: the SSH URL only works on
# machines that have a GitHub SSH key configured. The `test -d` guard skips the
# clone when the checkout already exists, so re-running the cell does not fail.
!test -d vggt || git clone https://github.com/facebookresearch/vggt.git

Cloning into 'vggt'...
remote: Enumerating objects: 1265, done.[K
remote: Total 1265 (delta 0), reused 0 (delta 0), pack-reused 1265 (from 1)[K
Receiving objects: 100% (1265/1265), 64.94 MiB | 9.43 MiB/s, done.
Resolving deltas: 100% (579/579), done.


In [4]:
# Move everything matched by vggt/vggt/* up one level into the cloned vggt/ directory.
# NOTE(review): presumably done so the package's modules resolve as `vggt.<module>` from the
# notebook's working directory -- confirm against the imports used by the project code.
# The `*` glob does not match dotfiles, and a second run reports an error because the
# inner directory is empty by then.
!mv vggt/vggt/* vggt/

In [1]:
# Number of rollout/update iterations performed by the training loop.
NUM_UPDATES = 1000

# Load the ProcTHOR-10K houses through `prior` and keep the training split,
# from which the training loop samples one house per update.
import prior
dataset = prior.load_dataset("procthor-10k")
train_scenes = dataset["train"]


Fetching reference HEAD


    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 17799.17it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 19604.77it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 20369.20it/s]


In [None]:
from ai2thor.controller import Controller
from cons import NUM_ACTIONS, EPISODE_STEPS, DEVICE, FEAT_DIM, TRAIN_EPOCHS
from models import ActorCritic, SlidingWindowTransformerActorCritic
from rl import PPOTrainer, CLIPCuriosity, ThorNavEnv, VGGTCuriosity, ExtrinsicReward
import torch
import os


# PPO training driver: one rollout + one PPO update per randomly sampled ProcTHOR house.
#
# Depends on `NUM_UPDATES` and `train_scenes`, which are defined in the
# dataset-loading cell; that cell must be run before this one.

# --- Run configuration ------------------------------------------------------
# Fraction of the run (counted in updates) during which `is_pretrain=True` is
# passed to `trainer.ppo_update`.
PRETRAIN_FRACTION = 0.05
# A checkpoint is written every this many updates, and after the final update.
CHECKPOINT_EVERY = 10
CHECKPOINT_DIR = "checkpoints"

# Create CLIP curiosity module
clip_curiosity = CLIPCuriosity(
    device=DEVICE,
    buffer_size=EPISODE_STEPS,
    topk=5,
    ema_beta=0.99,
    reward_scale=1.0,
    every_n_steps=1,  # or 2/4 to save CLIP compute
)

# Build actor-critic and PPO trainer
ac = SlidingWindowTransformerActorCritic(feat_dim=FEAT_DIM, num_actions=NUM_ACTIONS, device=DEVICE)
trainer = PPOTrainer(ac)

# Loop-invariant setup, hoisted out of the loop.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
pretrain_cutoff = PRETRAIN_FRACTION * NUM_UPDATES

for upd in range(1, NUM_UPDATES + 1):
    # Sample a random training house (uniform over the split).
    idx = torch.randint(0, len(train_scenes), (1,)).item()
    house = train_scenes[idx]

    # A fresh simulator is started for every update and torn down in `finally`.
    # It is created outside the `try` so that `controller` is always bound when
    # `controller.stop()` runs.
    # NOTE(review): reusing a single Controller via `controller.reset(scene=house)`
    # may be cheaper -- confirm that ThorNavEnv tolerates a reused controller.
    controller = Controller(
        scene=house,
        snapToGrid=False,
        rotateStepDegrees=30,
        renderDepthImage=True,
    )

    try:
        env = ThorNavEnv(controller, clip_curiosity, extrinsic_reward=ExtrinsicReward())

        # Collect one rollout of up to EPISODE_STEPS steps, then update on it.
        buf, ep_reward = trainer.collect_rollout(env, horizon=EPISODE_STEPS)
        trainer.ppo_update(buf, epochs=TRAIN_EPOCHS, is_pretrain=upd < pretrain_cutoff)

        print(f"[Update {upd}/{NUM_UPDATES}] Episode reward: {ep_reward:.3f}, steps: {len(buf)}")

        # Periodic checkpoint of the actor-critic weights. The last update is
        # always saved, even when NUM_UPDATES is not a multiple of the interval.
        if upd % CHECKPOINT_EVERY == 0 or upd == NUM_UPDATES:
            torch.save(ac.state_dict(), f"{CHECKPOINT_DIR}/ac_update_{upd}.pt")

    finally:
        # Always shut the simulator down, including when the rollout or update raises.
        controller.stop()

  from .autonotebook import tqdm as notebook_tqdm


[PPO] Epoch 20/20 Loss=0.3010 Value=0.3010 Corr=0.0757
[Update 1/1000] Episode reward: -26.457, steps: 512




[PPO] Epoch 20/20 Loss=0.5683 Value=0.5683 Corr=-0.0128
[Update 2/1000] Episode reward: -5.154, steps: 512




[PPO] Epoch 20/20 Loss=0.2045 Value=0.2045 Corr=0.0482
[Update 3/1000] Episode reward: -19.142, steps: 512




[PPO] Epoch 20/20 Loss=0.2922 Value=0.2922 Corr=0.5673
[Update 4/1000] Episode reward: -1.599, steps: 512




[PPO] Epoch 20/20 Loss=0.3243 Value=0.3243 Corr=-0.0544
[Update 5/1000] Episode reward: 8.272, steps: 512




[PPO] Epoch 20/20 Loss=0.3039 Value=0.3039 Corr=0.0020
[Update 6/1000] Episode reward: -13.255, steps: 512




[PPO] Epoch 20/20 Loss=0.3359 Value=0.3359 Corr=-0.0403
[Update 7/1000] Episode reward: -12.684, steps: 512




[PPO] Epoch 20/20 Loss=0.2886 Value=0.2886 Corr=0.2477
[Update 8/1000] Episode reward: -3.111, steps: 512




[PPO] Epoch 20/20 Loss=0.1600 Value=0.1600 Corr=0.4637
[Update 9/1000] Episode reward: -0.950, steps: 512




[PPO] Epoch 20/20 Loss=0.2339 Value=0.2339 Corr=0.8578
[Update 10/1000] Episode reward: -6.569, steps: 512




[PPO] Epoch 20/20 Loss=0.0886 Value=0.0886 Corr=-0.0568
[Update 11/1000] Episode reward: -20.861, steps: 512




[PPO] Epoch 20/20 Loss=0.3161 Value=0.3161 Corr=0.2011
[Update 12/1000] Episode reward: -9.729, steps: 512




[PPO] Epoch 20/20 Loss=0.1285 Value=0.1285 Corr=0.4901
[Update 13/1000] Episode reward: -4.852, steps: 512




[PPO] Epoch 20/20 Loss=0.1172 Value=0.1172 Corr=0.7835
[Update 14/1000] Episode reward: -14.034, steps: 512




[PPO] Epoch 20/20 Loss=0.1433 Value=0.1433 Corr=0.6603
[Update 15/1000] Episode reward: -21.258, steps: 512




[PPO] Epoch 20/20 Loss=0.1504 Value=0.1504 Corr=0.8184
[Update 16/1000] Episode reward: -2.244, steps: 512




[PPO] Epoch 20/20 Loss=0.2602 Value=0.2602 Corr=0.6202
[Update 17/1000] Episode reward: -12.832, steps: 512




[PPO] Epoch 20/20 Loss=0.1507 Value=0.1507 Corr=0.1566
[Update 18/1000] Episode reward: 2.624, steps: 512




[PPO] Epoch 20/20 Loss=0.4365 Value=0.4365 Corr=0.3383
[Update 19/1000] Episode reward: 1.142, steps: 512




[PPO] Epoch 20/20 Loss=0.1321 Value=0.1321 Corr=0.7462
[Update 20/1000] Episode reward: -9.213, steps: 512




[PPO] Epoch 20/20 Loss=0.1663 Value=0.1663 Corr=0.7050
[Update 21/1000] Episode reward: 3.610, steps: 512




[PPO] Epoch 20/20 Loss=0.3155 Value=0.3155 Corr=0.0583
[Update 22/1000] Episode reward: 0.242, steps: 512




[PPO] Epoch 20/20 Loss=0.3539 Value=0.3539 Corr=0.3693
[Update 23/1000] Episode reward: -24.771, steps: 512




[PPO] Epoch 20/20 Loss=0.2726 Value=0.2726 Corr=0.3481
[Update 24/1000] Episode reward: -3.322, steps: 512




[PPO] Epoch 20/20 Loss=0.0892 Value=0.0892 Corr=0.4737
[Update 25/1000] Episode reward: -11.080, steps: 512




[PPO] Epoch 20/20 Loss=0.2417 Value=0.2417 Corr=0.0257
[Update 26/1000] Episode reward: -8.914, steps: 512




[PPO] Epoch 20/20 Loss=0.2056 Value=0.2056 Corr=-0.0355
[Update 27/1000] Episode reward: -16.282, steps: 512




[PPO] Epoch 20/20 Loss=0.2330 Value=0.2330 Corr=0.0066
[Update 28/1000] Episode reward: -12.700, steps: 512




[PPO] Epoch 20/20 Loss=0.2667 Value=0.2667 Corr=0.3437
[Update 29/1000] Episode reward: -11.841, steps: 512




[PPO] Epoch 20/20 Loss=0.4044 Value=0.4044 Corr=0.4166
[Update 30/1000] Episode reward: -17.812, steps: 512




[PPO] Epoch 20/20 Loss=0.2279 Value=0.2279 Corr=0.0994
[Update 31/1000] Episode reward: 3.520, steps: 512




[PPO] Epoch 20/20 Loss=0.1960 Value=0.1960 Corr=0.0595
[Update 32/1000] Episode reward: -16.225, steps: 512




[PPO] Epoch 20/20 Loss=0.2919 Value=0.2919 Corr=0.0112
[Update 33/1000] Episode reward: -2.389, steps: 512




[PPO] Epoch 20/20 Loss=0.2985 Value=0.2985 Corr=-0.0600
[Update 34/1000] Episode reward: -2.337, steps: 512




[PPO] Epoch 20/20 Loss=0.2297 Value=0.2297 Corr=-0.0399
[Update 35/1000] Episode reward: -10.293, steps: 512




[PPO] Epoch 20/20 Loss=0.2508 Value=0.2508 Corr=0.0799
[Update 36/1000] Episode reward: -11.315, steps: 512




[PPO] Epoch 20/20 Loss=0.1544 Value=0.1544 Corr=-0.0208
[Update 37/1000] Episode reward: -8.673, steps: 512




[PPO] Epoch 20/20 Loss=0.2097 Value=0.2097 Corr=-0.0060
[Update 38/1000] Episode reward: 3.611, steps: 512




[PPO] Epoch 20/20 Loss=0.1725 Value=0.1725 Corr=0.0603
[Update 39/1000] Episode reward: -12.206, steps: 512




[PPO] Epoch 20/20 Loss=0.1192 Value=0.1192 Corr=-0.0057
[Update 40/1000] Episode reward: -17.037, steps: 512




[PPO] Epoch 20/20 Loss=0.2173 Value=0.2173 Corr=-0.0950
[Update 41/1000] Episode reward: -2.881, steps: 512




[PPO] Epoch 20/20 Loss=0.1292 Value=0.1292 Corr=-0.0889
[Update 42/1000] Episode reward: -6.211, steps: 512




[PPO] Epoch 20/20 Loss=0.2393 Value=0.2393 Corr=0.0315
[Update 43/1000] Episode reward: -14.418, steps: 512




[PPO] Epoch 20/20 Loss=0.3775 Value=0.3775 Corr=0.0238
[Update 44/1000] Episode reward: -15.397, steps: 512




[PPO] Epoch 20/20 Loss=0.1864 Value=0.1864 Corr=0.0358
[Update 45/1000] Episode reward: -11.171, steps: 512




[PPO] Epoch 20/20 Loss=0.2064 Value=0.2064 Corr=0.1907
[Update 46/1000] Episode reward: -7.171, steps: 512




[PPO] Epoch 20/20 Loss=0.1717 Value=0.1717 Corr=0.5850
[Update 47/1000] Episode reward: -10.920, steps: 512




[PPO] Epoch 20/20 Loss=0.1694 Value=0.1694 Corr=-0.0200
[Update 48/1000] Episode reward: -2.889, steps: 512




[PPO] Epoch 20/20 Loss=0.3866 Value=0.3866 Corr=0.0592
[Update 49/1000] Episode reward: -3.602, steps: 512




[PPO] Epoch 4/4 Loss=0.0796 Policy=-0.0065 Value=0.2763 Entropy=1.0428 KL=0.0027 Corr=0.0457
[Update 50/1000] Episode reward: 5.724, steps: 512




[PPO] Epoch 4/4 Loss=0.0999 Policy=-0.0008 Value=0.3052 Entropy=1.0374 KL=0.0110 Corr=-0.0356
[Update 51/1000] Episode reward: -10.037, steps: 512




[PPO] Epoch 4/4 Loss=0.2185 Policy=0.0104 Value=0.5178 Entropy=1.0152 KL=0.0190 Corr=0.0329
[Update 52/1000] Episode reward: -22.370, steps: 512




[PPO] Epoch 4/4 Loss=0.1963 Policy=0.0025 Value=0.4886 Entropy=1.0089 KL=0.0074 Corr=-0.0007
[Update 53/1000] Episode reward: 0.143, steps: 512




[PPO] Epoch 4/4 Loss=0.3313 Policy=0.0011 Value=0.7593 Entropy=0.9879 KL=0.0046 Corr=0.0570
[Update 54/1000] Episode reward: -16.881, steps: 512




[PPO] Epoch 4/4 Loss=0.1247 Policy=0.0001 Value=0.3472 Entropy=0.9792 KL=0.0113 Corr=-0.0327
[Update 55/1000] Episode reward: -12.200, steps: 512




[PPO] Epoch 4/4 Loss=0.2700 Policy=0.0009 Value=0.6329 Entropy=0.9468 KL=0.0045 Corr=-0.0113
[Update 56/1000] Episode reward: 0.716, steps: 512




[PPO] Epoch 4/4 Loss=0.2120 Policy=0.0071 Value=0.5012 Entropy=0.9157 KL=0.0056 Corr=-0.0184
[Update 57/1000] Episode reward: -3.004, steps: 512




[PPO] Epoch 4/4 Loss=0.0575 Policy=-0.0032 Value=0.2170 Entropy=0.9579 KL=0.0127 Corr=0.0349
[Update 58/1000] Episode reward: -10.427, steps: 512




[PPO] Epoch 4/4 Loss=0.1803 Policy=-0.0031 Value=0.4657 Entropy=0.9903 KL=0.0114 Corr=-0.0311
[Update 59/1000] Episode reward: 8.482, steps: 512




[PPO] Epoch 4/4 Loss=0.1818 Policy=0.0033 Value=0.4575 Entropy=1.0041 KL=0.0054 Corr=0.0378
[Update 60/1000] Episode reward: -0.043, steps: 512




[PPO] Epoch 4/4 Loss=0.2086 Policy=0.0075 Value=0.5022 Entropy=1.0007 KL=0.0172 Corr=0.0065
[Update 61/1000] Episode reward: 1.292, steps: 512




[PPO] Epoch 4/4 Loss=0.2541 Policy=0.0121 Value=0.5778 Entropy=0.9385 KL=0.0071 Corr=-0.0051
[Update 62/1000] Episode reward: -7.214, steps: 512




[PPO] Epoch 4/4 Loss=0.3213 Policy=-0.0026 Value=0.7379 Entropy=0.8997 KL=0.0053 Corr=-0.0295
[Update 63/1000] Episode reward: 15.875, steps: 512




[PPO] Epoch 4/4 Loss=0.2942 Policy=-0.0063 Value=0.6857 Entropy=0.8462 KL=0.0108 Corr=-0.0776
[Update 64/1000] Episode reward: 9.644, steps: 512




[PPO] Epoch 4/4 Loss=0.5003 Policy=0.0099 Value=1.0612 Entropy=0.8047 KL=0.0128 Corr=0.0094
[Update 65/1000] Episode reward: -22.087, steps: 512




[PPO] Epoch 4/4 Loss=0.2352 Policy=0.0095 Value=0.5325 Entropy=0.8113 KL=0.0106 Corr=0.0227
[Update 66/1000] Episode reward: -7.131, steps: 512




[PPO] Epoch 4/4 Loss=0.3704 Policy=0.0049 Value=0.8126 Entropy=0.8157 KL=0.0121 Corr=-0.0344
[Update 67/1000] Episode reward: -7.192, steps: 512




[PPO] Epoch 4/4 Loss=0.1127 Policy=-0.0018 Value=0.3059 Entropy=0.7691 KL=0.0117 Corr=-0.0249
[Update 68/1000] Episode reward: -14.797, steps: 512




[PPO] Epoch 4/4 Loss=0.4659 Policy=0.0024 Value=1.0032 Entropy=0.7619 KL=0.0102 Corr=0.0748
[Update 69/1000] Episode reward: 7.260, steps: 512




[PPO] Epoch 4/4 Loss=0.0465 Policy=0.0033 Value=0.1609 Entropy=0.7454 KL=0.0066 Corr=0.0165
[Update 70/1000] Episode reward: -2.950, steps: 512




[PPO] Epoch 4/4 Loss=0.0674 Policy=0.0037 Value=0.2021 Entropy=0.7477 KL=0.0108 Corr=-0.0064
[Update 71/1000] Episode reward: 2.616, steps: 512




[PPO] Epoch 4/4 Loss=0.4584 Policy=0.0042 Value=0.9887 Entropy=0.8036 KL=0.0254 Corr=0.0884
[Update 72/1000] Episode reward: 19.634, steps: 512




[PPO] Epoch 4/4 Loss=0.1093 Policy=0.0041 Value=0.2880 Entropy=0.7744 KL=0.0258 Corr=-0.0121
[Update 73/1000] Episode reward: -0.754, steps: 512




[PPO] Epoch 4/4 Loss=0.3201 Policy=0.0060 Value=0.7007 Entropy=0.7265 KL=0.0105 Corr=-0.0147
[Update 74/1000] Episode reward: 18.025, steps: 512




[PPO] Epoch 4/4 Loss=0.1758 Policy=0.0079 Value=0.4049 Entropy=0.6910 KL=0.0137 Corr=0.0153
[Update 75/1000] Episode reward: 10.465, steps: 512




[PPO] Epoch 4/4 Loss=0.3301 Policy=-0.0037 Value=0.7381 Entropy=0.7039 KL=0.0142 Corr=-0.0343
[Update 76/1000] Episode reward: -15.688, steps: 512




[PPO] Epoch 4/4 Loss=0.6611 Policy=0.0115 Value=1.3707 Entropy=0.7154 KL=0.0149 Corr=-0.0630
[Update 77/1000] Episode reward: 26.395, steps: 512




[PPO] Epoch 4/4 Loss=0.1465 Policy=0.0038 Value=0.3568 Entropy=0.7147 KL=0.0094 Corr=-0.0268
[Update 78/1000] Episode reward: -3.803, steps: 512




[PPO] Epoch 4/4 Loss=0.0377 Policy=0.0017 Value=0.1440 Entropy=0.7205 KL=0.0109 Corr=-0.0353
[Update 79/1000] Episode reward: 6.144, steps: 512




[PPO] Epoch 4/4 Loss=0.1518 Policy=-0.0024 Value=0.3794 Entropy=0.7087 KL=0.0142 Corr=0.0329
[Update 80/1000] Episode reward: 17.249, steps: 512




[PPO] Epoch 4/4 Loss=0.1472 Policy=-0.0064 Value=0.3759 Entropy=0.6868 KL=0.0100 Corr=-0.0782
[Update 81/1000] Episode reward: 3.572, steps: 512




[PPO] Epoch 4/4 Loss=0.2487 Policy=-0.0049 Value=0.5744 Entropy=0.6726 KL=0.0135 Corr=0.0267
[Update 82/1000] Episode reward: 13.013, steps: 512




[PPO] Epoch 4/4 Loss=0.2438 Policy=0.0058 Value=0.5448 Entropy=0.6893 KL=0.0090 Corr=-0.0655
[Update 83/1000] Episode reward: 2.256, steps: 512




[PPO] Epoch 4/4 Loss=0.1607 Policy=0.0084 Value=0.3797 Entropy=0.7516 KL=0.0100 Corr=0.0338
[Update 84/1000] Episode reward: -6.408, steps: 512




[PPO] Epoch 4/4 Loss=0.0876 Policy=0.0072 Value=0.2390 Entropy=0.7815 KL=-0.0000 Corr=-0.0311
[Update 85/1000] Episode reward: -6.449, steps: 512




[PPO] Epoch 4/4 Loss=0.1422 Policy=0.0007 Value=0.3604 Entropy=0.7737 KL=0.0061 Corr=-0.0290
[Update 86/1000] Episode reward: 4.499, steps: 512




[PPO] Epoch 4/4 Loss=0.5093 Policy=0.0034 Value=1.0873 Entropy=0.7555 KL=0.0095 Corr=0.0073
[Update 87/1000] Episode reward: -30.464, steps: 512




[PPO] Epoch 4/4 Loss=0.5060 Policy=0.0021 Value=1.0823 Entropy=0.7437 KL=0.0102 Corr=-0.0195
[Update 88/1000] Episode reward: 17.449, steps: 512




[PPO] Epoch 4/4 Loss=0.1825 Policy=-0.0030 Value=0.4453 Entropy=0.7418 KL=0.0020 Corr=-0.0578
[Update 89/1000] Episode reward: -16.154, steps: 512




[PPO] Epoch 4/4 Loss=0.1929 Policy=0.0057 Value=0.4494 Entropy=0.7486 KL=0.0116 Corr=-0.0987
[Update 90/1000] Episode reward: 9.589, steps: 512




[PPO] Epoch 4/4 Loss=0.2204 Policy=0.0055 Value=0.5030 Entropy=0.7327 KL=-0.0028 Corr=-0.1431
[Update 91/1000] Episode reward: 12.207, steps: 512




[PPO] Epoch 4/4 Loss=0.3209 Policy=0.0009 Value=0.7128 Entropy=0.7266 KL=0.0044 Corr=0.0556
[Update 92/1000] Episode reward: 22.292, steps: 512




[PPO] Epoch 4/4 Loss=0.1530 Policy=-0.0058 Value=0.3878 Entropy=0.7032 KL=0.0035 Corr=-0.0478
[Update 93/1000] Episode reward: 14.017, steps: 512




[PPO] Epoch 4/4 Loss=0.1177 Policy=0.0019 Value=0.3050 Entropy=0.7336 KL=0.0122 Corr=0.0591
[Update 94/1000] Episode reward: 11.695, steps: 512




[PPO] Epoch 4/4 Loss=0.1261 Policy=0.0010 Value=0.3242 Entropy=0.7386 KL=0.0009 Corr=-0.0125
[Update 95/1000] Episode reward: 7.575, steps: 512




[PPO] Epoch 4/4 Loss=0.2919 Policy=-0.0053 Value=0.6640 Entropy=0.6969 KL=0.0195 Corr=-0.0383
[Update 96/1000] Episode reward: 19.641, steps: 512




[PPO] Epoch 4/4 Loss=0.5298 Policy=0.0069 Value=1.1120 Entropy=0.6616 KL=0.0117 Corr=0.0358
[Update 97/1000] Episode reward: -0.835, steps: 512




[PPO] Epoch 4/4 Loss=0.4599 Policy=0.0000 Value=0.9880 Entropy=0.6834 KL=0.0066 Corr=0.0616
[Update 98/1000] Episode reward: -0.202, steps: 512




[PPO] Epoch 4/4 Loss=0.2387 Policy=0.0059 Value=0.5396 Entropy=0.7418 KL=0.0124 Corr=-0.0195
[Update 99/1000] Episode reward: -5.807, steps: 512




[PPO] Epoch 4/4 Loss=0.3141 Policy=0.0061 Value=0.6954 Entropy=0.7936 KL=0.0095 Corr=-0.0536
[Update 100/1000] Episode reward: -1.329, steps: 512




[PPO] Epoch 4/4 Loss=0.3052 Policy=0.0040 Value=0.6819 Entropy=0.7935 KL=0.0158 Corr=-0.0574
[Update 101/1000] Episode reward: -22.179, steps: 512




[PPO] Epoch 4/4 Loss=0.1685 Policy=-0.0033 Value=0.4238 Entropy=0.8016 KL=0.0174 Corr=0.0244
[Update 102/1000] Episode reward: -9.315, steps: 512




[PPO] Epoch 4/4 Loss=0.3036 Policy=0.0072 Value=0.6717 Entropy=0.7896 KL=0.0168 Corr=0.0264
[Update 103/1000] Episode reward: 2.062, steps: 512




[PPO] Epoch 4/4 Loss=0.4346 Policy=0.0047 Value=0.9351 Entropy=0.7523 KL=0.0047 Corr=-0.0838
[Update 104/1000] Episode reward: 17.296, steps: 512




[PPO] Epoch 4/4 Loss=0.5052 Policy=-0.0023 Value=1.0870 Entropy=0.7194 KL=0.0037 Corr=0.0710
[Update 105/1000] Episode reward: 19.378, steps: 512




[PPO] Epoch 4/4 Loss=0.4749 Policy=0.0015 Value=1.0179 Entropy=0.7119 KL=0.0055 Corr=-0.0211
[Update 106/1000] Episode reward: 16.993, steps: 512




[PPO] Epoch 4/4 Loss=0.9168 Policy=-0.0008 Value=1.9058 Entropy=0.7059 KL=0.0107 Corr=-0.0074
[Update 107/1000] Episode reward: -29.345, steps: 512




[PPO] Epoch 4/4 Loss=0.1050 Policy=-0.0043 Value=0.2914 Entropy=0.7275 KL=0.0018 Corr=0.0288
[Update 108/1000] Episode reward: -3.480, steps: 512




[PPO] Epoch 4/4 Loss=0.0509 Policy=-0.0011 Value=0.1828 Entropy=0.7896 KL=0.0034 Corr=0.0145
[Update 109/1000] Episode reward: -11.731, steps: 512




[PPO] Epoch 4/4 Loss=0.1766 Policy=0.0117 Value=0.4119 Entropy=0.8218 KL=0.0079 Corr=0.0406
[Update 110/1000] Episode reward: 10.188, steps: 512




[PPO] Epoch 4/4 Loss=0.3412 Policy=0.0021 Value=0.7610 Entropy=0.8284 KL=0.0049 Corr=-0.0478
[Update 111/1000] Episode reward: 24.174, steps: 512




[PPO] Epoch 4/4 Loss=0.0347 Policy=-0.0024 Value=0.1576 Entropy=0.8339 KL=0.0105 Corr=0.0208
[Update 112/1000] Episode reward: -6.765, steps: 512




[PPO] Epoch 4/4 Loss=0.0392 Policy=0.0095 Value=0.1443 Entropy=0.8483 KL=0.0083 Corr=0.0169
[Update 113/1000] Episode reward: -7.214, steps: 512




[PPO] Epoch 4/4 Loss=0.0195 Policy=0.0009 Value=0.1207 Entropy=0.8362 KL=0.0120 Corr=0.0188
[Update 114/1000] Episode reward: -3.870, steps: 512




[PPO] Epoch 4/4 Loss=0.1383 Policy=0.0021 Value=0.3512 Entropy=0.7871 KL=0.0049 Corr=0.0397
[Update 115/1000] Episode reward: -15.393, steps: 512




[PPO] Epoch 4/4 Loss=0.1427 Policy=-0.0018 Value=0.3653 Entropy=0.7638 KL=0.0069 Corr=0.0341
[Update 116/1000] Episode reward: 2.116, steps: 512




[PPO] Epoch 4/4 Loss=0.3680 Policy=-0.0046 Value=0.8195 Entropy=0.7435 KL=0.0111 Corr=0.0436
[Update 117/1000] Episode reward: 20.645, steps: 512




[PPO] Epoch 4/4 Loss=0.2838 Policy=0.0035 Value=0.6339 Entropy=0.7336 KL=0.0152 Corr=-0.0065
[Update 118/1000] Episode reward: 13.269, steps: 512




[PPO] Epoch 4/4 Loss=0.1343 Policy=0.0014 Value=0.3393 Entropy=0.7336 KL=0.0058 Corr=0.0161
[Update 119/1000] Episode reward: 6.702, steps: 512




[PPO] Epoch 4/4 Loss=0.0675 Policy=-0.0002 Value=0.2072 Entropy=0.7167 KL=0.0087 Corr=0.0439
[Update 120/1000] Episode reward: 1.932, steps: 512




[PPO] Epoch 4/4 Loss=0.0860 Policy=-0.0025 Value=0.2456 Entropy=0.6877 KL=0.0062 Corr=-0.0483
[Update 121/1000] Episode reward: 8.557, steps: 512




[PPO] Epoch 4/4 Loss=0.2163 Policy=0.0055 Value=0.4881 Entropy=0.6650 KL=0.0013 Corr=0.0003
[Update 122/1000] Episode reward: 7.956, steps: 512




[PPO] Epoch 4/4 Loss=0.1729 Policy=0.0034 Value=0.4088 Entropy=0.6974 KL=0.0122 Corr=-0.0108
[Update 123/1000] Episode reward: 0.042, steps: 512




[PPO] Epoch 4/4 Loss=0.1073 Policy=0.0019 Value=0.2854 Entropy=0.7454 KL=0.0147 Corr=0.0152
[Update 124/1000] Episode reward: -5.908, steps: 512




[PPO] Epoch 4/4 Loss=0.0748 Policy=0.0026 Value=0.2202 Entropy=0.7575 KL=0.0015 Corr=-0.0478
[Update 125/1000] Episode reward: 2.234, steps: 512




[PPO] Epoch 4/4 Loss=0.1322 Policy=0.0044 Value=0.3293 Entropy=0.7377 KL=0.0102 Corr=0.0011
[Update 126/1000] Episode reward: -9.595, steps: 512




[PPO] Epoch 4/4 Loss=0.5629 Policy=-0.0102 Value=1.2182 Entropy=0.7212 KL=0.0141 Corr=-0.0206
[Update 127/1000] Episode reward: 20.465, steps: 512




[PPO] Epoch 4/4 Loss=0.8872 Policy=-0.0071 Value=1.8586 Entropy=0.6999 KL=0.0061 Corr=0.0294
[Update 128/1000] Episode reward: 38.104, steps: 512




[PPO] Epoch 4/4 Loss=0.3128 Policy=0.0059 Value=0.6817 Entropy=0.6795 KL=0.0102 Corr=-0.0337
[Update 129/1000] Episode reward: -7.233, steps: 512




[PPO] Epoch 4/4 Loss=0.4660 Policy=0.0062 Value=0.9874 Entropy=0.6782 KL=0.0062 Corr=0.0226
[Update 130/1000] Episode reward: 21.517, steps: 512




[PPO] Epoch 4/4 Loss=0.7491 Policy=-0.0046 Value=1.5748 Entropy=0.6750 KL=0.0011 Corr=-0.0135
[Update 131/1000] Episode reward: 25.294, steps: 512




[PPO] Epoch 4/4 Loss=0.5388 Policy=-0.0032 Value=1.1510 Entropy=0.6690 KL=0.0058 Corr=0.0508
[Update 132/1000] Episode reward: 19.717, steps: 512




[PPO] Epoch 4/4 Loss=0.2256 Policy=0.0014 Value=0.5185 Entropy=0.7019 KL=0.0107 Corr=0.0212
[Update 133/1000] Episode reward: 5.914, steps: 512




[PPO] Epoch 4/4 Loss=0.2922 Policy=0.0049 Value=0.6449 Entropy=0.7033 KL=0.0038 Corr=-0.0053
[Update 134/1000] Episode reward: 18.671, steps: 512




[PPO] Epoch 4/4 Loss=0.3272 Policy=-0.0000 Value=0.7205 Entropy=0.6595 KL=0.0081 Corr=-0.0139
[Update 135/1000] Episode reward: 8.405, steps: 512




[PPO] Epoch 4/4 Loss=0.1809 Policy=0.0044 Value=0.4165 Entropy=0.6364 KL=0.0090 Corr=0.0418
[Update 136/1000] Episode reward: 9.728, steps: 512




[PPO] Epoch 4/4 Loss=0.5671 Policy=0.0002 Value=1.2017 Entropy=0.6793 KL=0.0098 Corr=-0.0170
[Update 137/1000] Episode reward: 8.906, steps: 512




[PPO] Epoch 4/4 Loss=0.2275 Policy=0.0027 Value=0.5215 Entropy=0.7186 KL=0.0142 Corr=-0.0289
[Update 138/1000] Episode reward: 1.855, steps: 512




[PPO] Epoch 4/4 Loss=0.2248 Policy=-0.0040 Value=0.5294 Entropy=0.7178 KL=0.0057 Corr=-0.0044
[Update 139/1000] Episode reward: 12.570, steps: 512




[PPO] Epoch 4/4 Loss=0.1012 Policy=0.0038 Value=0.2670 Entropy=0.7226 KL=-0.0036 Corr=-0.0588
[Update 140/1000] Episode reward: 5.138, steps: 512




[PPO] Epoch 4/4 Loss=0.2678 Policy=0.0073 Value=0.5955 Entropy=0.7436 KL=0.0042 Corr=-0.0627
[Update 141/1000] Episode reward: 12.806, steps: 512




[PPO] Epoch 4/4 Loss=0.1461 Policy=-0.0020 Value=0.3701 Entropy=0.7398 KL=0.0056 Corr=0.0418
[Update 142/1000] Episode reward: -5.127, steps: 512




[PPO] Epoch 4/4 Loss=0.1337 Policy=0.0001 Value=0.3406 Entropy=0.7360 KL=0.0053 Corr=0.0407
[Update 143/1000] Episode reward: -10.033, steps: 512




[PPO] Epoch 4/4 Loss=0.1859 Policy=-0.0060 Value=0.4543 Entropy=0.7053 KL=0.0068 Corr=0.0948
[Update 144/1000] Episode reward: 10.125, steps: 512




[PPO] Epoch 4/4 Loss=0.3950 Policy=-0.0103 Value=0.8754 Entropy=0.6499 KL=0.0036 Corr=0.0132
[Update 145/1000] Episode reward: 24.510, steps: 512




[PPO] Epoch 4/4 Loss=0.1648 Policy=0.0051 Value=0.3852 Entropy=0.6579 KL=0.0032 Corr=0.0612
[Update 146/1000] Episode reward: 6.071, steps: 512




[PPO] Epoch 4/4 Loss=0.3683 Policy=0.0038 Value=0.8024 Entropy=0.7340 KL=0.0111 Corr=0.0380
[Update 147/1000] Episode reward: 7.175, steps: 512




[PPO] Epoch 4/4 Loss=0.1778 Policy=0.0034 Value=0.4226 Entropy=0.7388 KL=0.0096 Corr=0.0593
[Update 148/1000] Episode reward: -8.792, steps: 512




[PPO] Epoch 4/4 Loss=0.1076 Policy=0.0082 Value=0.2732 Entropy=0.7438 KL=0.0024 Corr=0.0154
[Update 149/1000] Episode reward: -3.517, steps: 512




[PPO] Epoch 4/4 Loss=0.2248 Policy=0.0096 Value=0.5049 Entropy=0.7436 KL=0.0086 Corr=0.0544
[Update 150/1000] Episode reward: -13.509, steps: 512




[PPO] Epoch 4/4 Loss=0.0617 Policy=-0.0021 Value=0.2032 Entropy=0.7576 KL=0.0093 Corr=-0.0488
[Update 151/1000] Episode reward: -11.051, steps: 512




[PPO] Epoch 4/4 Loss=0.1264 Policy=0.0030 Value=0.3238 Entropy=0.7705 KL=0.0005 Corr=-0.0765
[Update 152/1000] Episode reward: -8.692, steps: 512




[PPO] Epoch 4/4 Loss=0.2164 Policy=0.0050 Value=0.4978 Entropy=0.7492 KL=0.0132 Corr=-0.0082
[Update 153/1000] Episode reward: 4.505, steps: 512




[PPO] Epoch 4/4 Loss=0.1106 Policy=0.0077 Value=0.2765 Entropy=0.7074 KL=0.0071 Corr=-0.0198
[Update 154/1000] Episode reward: 7.161, steps: 512




[PPO] Epoch 4/4 Loss=0.1851 Policy=-0.0066 Value=0.4515 Entropy=0.6802 KL=0.0107 Corr=0.0110
[Update 155/1000] Episode reward: 10.690, steps: 512




[PPO] Epoch 4/4 Loss=0.5785 Policy=-0.0032 Value=1.2277 Entropy=0.6417 KL=0.0169 Corr=-0.0223
[Update 156/1000] Episode reward: 24.717, steps: 512




[PPO] Epoch 4/4 Loss=0.5546 Policy=0.0017 Value=1.1673 Entropy=0.6160 KL=-0.0006 Corr=0.0000
[Update 157/1000] Episode reward: 17.494, steps: 512




[PPO] Epoch 4/4 Loss=0.2112 Policy=-0.0008 Value=0.4860 Entropy=0.6196 KL=0.0023 Corr=0.0057
[Update 158/1000] Episode reward: 9.980, steps: 512




[PPO] Epoch 4/4 Loss=0.5728 Policy=0.0020 Value=1.2053 Entropy=0.6372 KL=0.0062 Corr=0.0134
[Update 159/1000] Episode reward: 25.806, steps: 512




[PPO] Epoch 4/4 Loss=0.2455 Policy=0.0023 Value=0.5520 Entropy=0.6562 KL=0.0022 Corr=0.0027
[Update 160/1000] Episode reward: 8.138, steps: 512




[PPO] Epoch 4/4 Loss=0.3138 Policy=0.0028 Value=0.6907 Entropy=0.6859 KL=0.0098 Corr=0.0014
[Update 161/1000] Episode reward: -12.639, steps: 512




[PPO] Epoch 4/4 Loss=0.1040 Policy=0.0004 Value=0.2785 Entropy=0.7144 KL=0.0087 Corr=-0.0458
[Update 162/1000] Episode reward: -0.003, steps: 512




[PPO] Epoch 4/4 Loss=0.2041 Policy=0.0019 Value=0.4754 Entropy=0.7115 KL=0.0101 Corr=0.0721
[Update 163/1000] Episode reward: 16.031, steps: 512




[PPO] Epoch 4/4 Loss=0.1858 Policy=-0.0009 Value=0.4418 Entropy=0.6845 KL=0.0005 Corr=0.0067
[Update 164/1000] Episode reward: 11.381, steps: 512




[PPO] Epoch 4/4 Loss=0.4600 Policy=-0.0070 Value=0.9984 Entropy=0.6422 KL=-0.0041 Corr=-0.0227
[Update 165/1000] Episode reward: -0.840, steps: 512




[PPO] Epoch 4/4 Loss=0.4351 Policy=-0.0061 Value=0.9416 Entropy=0.5931 KL=0.0064 Corr=0.1222
[Update 166/1000] Episode reward: 9.782, steps: 512




[PPO] Epoch 4/4 Loss=0.3109 Policy=0.0032 Value=0.6749 Entropy=0.5950 KL=0.0151 Corr=0.0285
[Update 167/1000] Episode reward: 10.286, steps: 512




[PPO] Epoch 4/4 Loss=0.3577 Policy=0.0022 Value=0.7785 Entropy=0.6757 KL=0.0181 Corr=-0.0186
[Update 168/1000] Episode reward: 26.096, steps: 512




[PPO] Epoch 4/4 Loss=0.3686 Policy=0.0054 Value=0.7978 Entropy=0.7139 KL=0.0044 Corr=0.0436
[Update 169/1000] Episode reward: 33.448, steps: 512




[PPO] Epoch 4/4 Loss=0.5456 Policy=-0.0021 Value=1.1680 Entropy=0.7255 KL=0.0021 Corr=0.0022
[Update 170/1000] Episode reward: -3.002, steps: 512




[PPO] Epoch 4/4 Loss=0.1098 Policy=-0.0098 Value=0.3116 Entropy=0.7229 KL=0.0042 Corr=-0.0284
[Update 171/1000] Episode reward: 5.101, steps: 512




[PPO] Epoch 4/4 Loss=0.1371 Policy=0.0003 Value=0.3434 Entropy=0.6984 KL=0.0106 Corr=0.0207
[Update 172/1000] Episode reward: 1.328, steps: 512




[PPO] Epoch 4/4 Loss=0.2057 Policy=-0.0013 Value=0.4797 Entropy=0.6571 KL=0.0048 Corr=0.0222
[Update 173/1000] Episode reward: 6.418, steps: 512




[PPO] Epoch 4/4 Loss=0.3743 Policy=-0.0025 Value=0.8152 Entropy=0.6159 KL=0.0066 Corr=0.0183
[Update 174/1000] Episode reward: 19.842, steps: 512




[PPO] Epoch 4/4 Loss=0.4771 Policy=0.0042 Value=1.0046 Entropy=0.5858 KL=0.0020 Corr=0.0293
[Update 175/1000] Episode reward: 28.734, steps: 512




[PPO] Epoch 4/4 Loss=0.5550 Policy=0.0008 Value=1.1675 Entropy=0.5900 KL=0.0026 Corr=0.0556
[Update 176/1000] Episode reward: -4.192, steps: 512




[PPO] Epoch 4/4 Loss=0.3472 Policy=0.0021 Value=0.7571 Entropy=0.6683 KL=0.0128 Corr=0.0044
[Update 177/1000] Episode reward: 7.394, steps: 512




[PPO] Epoch 4/4 Loss=0.1672 Policy=0.0087 Value=0.3894 Entropy=0.7238 KL=0.0046 Corr=-0.0022
[Update 178/1000] Episode reward: -3.828, steps: 512




[PPO] Epoch 4/4 Loss=0.2025 Policy=-0.0000 Value=0.4829 Entropy=0.7784 KL=0.0119 Corr=0.0177
[Update 179/1000] Episode reward: 6.954, steps: 512




[PPO] Epoch 4/4 Loss=0.0387 Policy=0.0006 Value=0.1571 Entropy=0.8098 KL=0.0111 Corr=0.0132
[Update 180/1000] Episode reward: -1.977, steps: 512




[PPO] Epoch 4/4 Loss=0.0460 Policy=-0.0034 Value=0.1801 Entropy=0.8139 KL=0.0002 Corr=-0.0346
[Update 181/1000] Episode reward: 3.333, steps: 512




[PPO] Epoch 4/4 Loss=0.1359 Policy=-0.0130 Value=0.3751 Entropy=0.7748 KL=0.0118 Corr=-0.0079
[Update 182/1000] Episode reward: 17.657, steps: 512




[PPO] Epoch 4/4 Loss=0.2362 Policy=-0.0039 Value=0.5531 Entropy=0.7289 KL=0.0082 Corr=0.0085
[Update 183/1000] Episode reward: 4.641, steps: 512




[PPO] Epoch 4/4 Loss=0.7022 Policy=0.0063 Value=1.4653 Entropy=0.7350 KL=0.0018 Corr=0.0442
[Update 184/1000] Episode reward: 15.635, steps: 512




[PPO] Epoch 4/4 Loss=0.3782 Policy=0.0044 Value=0.8210 Entropy=0.7353 KL=0.0011 Corr=0.0820
[Update 185/1000] Episode reward: 17.911, steps: 512




[PPO] Epoch 4/4 Loss=0.3878 Policy=-0.0004 Value=0.8483 Entropy=0.7189 KL=0.0027 Corr=-0.0096
[Update 186/1000] Episode reward: 13.414, steps: 512




[PPO] Epoch 4/4 Loss=0.3761 Policy=-0.0046 Value=0.8327 Entropy=0.7133 KL=-0.0003 Corr=0.0294
[Update 187/1000] Episode reward: 30.755, steps: 512




[PPO] Epoch 4/4 Loss=0.2340 Policy=-0.0047 Value=0.5515 Entropy=0.7402 KL=0.0044 Corr=-0.0031
[Update 188/1000] Episode reward: 13.000, steps: 512




[PPO] Epoch 4/4 Loss=0.1782 Policy=0.0012 Value=0.4332 Entropy=0.7927 KL=0.0216 Corr=-0.0311
[Update 189/1000] Episode reward: 0.880, steps: 512




[PPO] Epoch 4/4 Loss=0.2427 Policy=0.0007 Value=0.5664 Entropy=0.8234 KL=0.0032 Corr=0.0765
[Update 190/1000] Episode reward: -4.498, steps: 512




[PPO] Epoch 4/4 Loss=0.1599 Policy=0.0123 Value=0.3787 Entropy=0.8346 KL=0.0030 Corr=0.0076
[Update 191/1000] Episode reward: -3.016, steps: 512




[PPO] Epoch 4/4 Loss=0.2059 Policy=0.0038 Value=0.4858 Entropy=0.8166 KL=0.0031 Corr=-0.0705
[Update 192/1000] Episode reward: 3.225, steps: 512




[PPO] Epoch 4/4 Loss=0.0951 Policy=-0.0018 Value=0.2675 Entropy=0.7354 KL=0.0082 Corr=0.0311
[Update 193/1000] Episode reward: 7.278, steps: 512




[PPO] Epoch 4/4 Loss=0.3785 Policy=0.0090 Value=0.8092 Entropy=0.7019 KL=0.0046 Corr=0.0403
[Update 194/1000] Episode reward: -19.690, steps: 512




[PPO] Epoch 4/4 Loss=0.1950 Policy=-0.0023 Value=0.4626 Entropy=0.6789 KL=0.0084 Corr=0.0224
[Update 195/1000] Episode reward: 4.373, steps: 512




[PPO] Epoch 4/4 Loss=0.2520 Policy=0.0018 Value=0.5677 Entropy=0.6738 KL=0.0125 Corr=-0.0104
[Update 196/1000] Episode reward: 2.527, steps: 512




[PPO] Epoch 4/4 Loss=0.3857 Policy=0.0019 Value=0.8388 Entropy=0.7121 KL=0.0039 Corr=0.0161
[Update 197/1000] Episode reward: 17.776, steps: 512




[PPO] Epoch 4/4 Loss=0.1412 Policy=-0.0125 Value=0.3758 Entropy=0.6846 KL=0.0059 Corr=-0.0004
[Update 198/1000] Episode reward: 5.775, steps: 512




[PPO] Epoch 4/4 Loss=0.2392 Policy=0.0117 Value=0.5146 Entropy=0.5971 KL=0.0053 Corr=0.0475
[Update 199/1000] Episode reward: -7.906, steps: 512




[PPO] Epoch 4/4 Loss=0.2999 Policy=-0.0009 Value=0.6613 Entropy=0.5954 KL=0.0026 Corr=0.0009
[Update 200/1000] Episode reward: 5.578, steps: 512




[PPO] Epoch 4/4 Loss=0.3212 Policy=0.0043 Value=0.6961 Entropy=0.6231 KL=-0.0002 Corr=0.0043
[Update 201/1000] Episode reward: 18.184, steps: 512




[PPO] Epoch 4/4 Loss=0.2363 Policy=0.0001 Value=0.5363 Entropy=0.6379 KL=0.0070 Corr=0.0716
[Update 202/1000] Episode reward: -12.274, steps: 512




[PPO] Epoch 4/4 Loss=0.3715 Policy=-0.0010 Value=0.8097 Entropy=0.6472 KL=0.0032 Corr=0.0759
[Update 203/1000] Episode reward: 22.835, steps: 512




[PPO] Epoch 4/4 Loss=0.2517 Policy=0.0002 Value=0.5663 Entropy=0.6336 KL=0.0090 Corr=0.0126
[Update 204/1000] Episode reward: 17.133, steps: 512




[PPO] Epoch 4/4 Loss=0.2031 Policy=-0.0063 Value=0.4777 Entropy=0.5886 KL=0.0067 Corr=0.0030
[Update 205/1000] Episode reward: 13.512, steps: 512




[PPO] Epoch 4/4 Loss=0.1740 Policy=0.0033 Value=0.3945 Entropy=0.5323 KL=0.0122 Corr=0.0125
[Update 206/1000] Episode reward: -5.408, steps: 512




[PPO] Epoch 4/4 Loss=0.3690 Policy=0.0038 Value=0.7821 Entropy=0.5173 KL=0.0033 Corr=-0.0266
[Update 207/1000] Episode reward: 17.577, steps: 512




[PPO] Epoch 4/4 Loss=0.2587 Policy=0.0024 Value=0.5662 Entropy=0.5359 KL=-0.0010 Corr=-0.0387
[Update 208/1000] Episode reward: 0.636, steps: 512




[PPO] Epoch 4/4 Loss=0.5192 Policy=0.0062 Value=1.0814 Entropy=0.5524 KL=0.0015 Corr=-0.0282
[Update 209/1000] Episode reward: 30.258, steps: 512




[PPO] Epoch 4/4 Loss=0.3096 Policy=0.0015 Value=0.6727 Entropy=0.5639 KL=0.0083 Corr=0.0183
[Update 210/1000] Episode reward: -8.030, steps: 512




[PPO] Epoch 4/4 Loss=0.1609 Policy=0.0023 Value=0.3766 Entropy=0.5945 KL=-0.0033 Corr=-0.0383
[Update 211/1000] Episode reward: -5.388, steps: 512




[PPO] Epoch 4/4 Loss=0.1263 Policy=-0.0015 Value=0.3213 Entropy=0.6572 KL=0.0247 Corr=0.0049
[Update 212/1000] Episode reward: 6.762, steps: 512




[PPO] Epoch 4/4 Loss=0.1801 Policy=0.0059 Value=0.4172 Entropy=0.6885 KL=-0.0002 Corr=-0.0290
[Update 213/1000] Episode reward: 1.772, steps: 512




[PPO] Epoch 4/4 Loss=0.1829 Policy=-0.0090 Value=0.4485 Entropy=0.6484 KL=0.0113 Corr=0.0290
[Update 214/1000] Episode reward: 9.210, steps: 512




[PPO] Epoch 4/4 Loss=0.0657 Policy=0.0023 Value=0.1854 Entropy=0.5861 KL=-0.0018 Corr=0.0122
[Update 215/1000] Episode reward: -1.635, steps: 512




[PPO] Epoch 4/4 Loss=0.4354 Policy=0.0024 Value=0.9220 Entropy=0.5594 KL=0.0061 Corr=0.0379
[Update 216/1000] Episode reward: -25.677, steps: 512




[PPO] Epoch 4/4 Loss=0.1999 Policy=0.0001 Value=0.4565 Entropy=0.5693 KL=0.0046 Corr=0.0350
[Update 217/1000] Episode reward: 3.913, steps: 512




[PPO] Epoch 4/4 Loss=0.4153 Policy=0.0056 Value=0.8777 Entropy=0.5844 KL=0.0053 Corr=-0.0267
[Update 218/1000] Episode reward: 20.771, steps: 512




[PPO] Epoch 4/4 Loss=0.2280 Policy=0.0080 Value=0.4997 Entropy=0.5971 KL=0.0023 Corr=0.0678
[Update 219/1000] Episode reward: -13.229, steps: 512




[PPO] Epoch 4/4 Loss=0.5118 Policy=0.0037 Value=1.0779 Entropy=0.6175 KL=0.0030 Corr=-0.0061
[Update 220/1000] Episode reward: -7.145, steps: 512




[PPO] Epoch 4/4 Loss=0.0809 Policy=0.0050 Value=0.2138 Entropy=0.6188 KL=0.0036 Corr=-0.0289
[Update 221/1000] Episode reward: 8.590, steps: 512




[PPO] Epoch 4/4 Loss=0.0515 Policy=-0.0020 Value=0.1686 Entropy=0.6164 KL=0.0016 Corr=0.0125
[Update 222/1000] Episode reward: -2.073, steps: 512




[PPO] Epoch 4/4 Loss=0.3813 Policy=0.0012 Value=0.8225 Entropy=0.6230 KL=0.0024 Corr=-0.0065
[Update 223/1000] Episode reward: 17.838, steps: 512




[PPO] Epoch 4/4 Loss=0.3046 Policy=-0.0055 Value=0.6813 Entropy=0.6129 KL=0.0059 Corr=0.0169
[Update 224/1000] Episode reward: 21.658, steps: 512




[PPO] Epoch 4/4 Loss=0.4487 Policy=-0.0003 Value=0.9580 Entropy=0.5997 KL=0.0020 Corr=-0.0562
[Update 225/1000] Episode reward: 24.602, steps: 512




[PPO] Epoch 4/4 Loss=0.2254 Policy=0.0024 Value=0.5058 Entropy=0.5974 KL=0.0050 Corr=-0.0050
[Update 226/1000] Episode reward: 0.521, steps: 512




[PPO] Epoch 4/4 Loss=1.1536 Policy=0.0039 Value=2.3600 Entropy=0.6069 KL=0.0017 Corr=-0.0535
[Update 227/1000] Episode reward: 47.890, steps: 512




[PPO] Epoch 4/4 Loss=0.0594 Policy=0.0006 Value=0.1793 Entropy=0.6170 KL=0.0030 Corr=-0.0287
[Update 228/1000] Episode reward: 7.121, steps: 512




[PPO] Epoch 4/4 Loss=0.4084 Policy=0.0057 Value=0.8692 Entropy=0.6370 KL=0.0026 Corr=0.0049
[Update 229/1000] Episode reward: 19.444, steps: 512




[PPO] Epoch 4/4 Loss=0.2863 Policy=0.0013 Value=0.6344 Entropy=0.6432 KL=0.0015 Corr=0.0082
[Update 230/1000] Episode reward: 17.825, steps: 512




[PPO] Epoch 4/4 Loss=0.2281 Policy=0.0038 Value=0.5120 Entropy=0.6350 KL=0.0046 Corr=0.0205
[Update 231/1000] Episode reward: 11.977, steps: 512




[PPO] Epoch 4/4 Loss=0.3583 Policy=-0.0037 Value=0.7871 Entropy=0.6298 KL=0.0057 Corr=-0.0191
[Update 232/1000] Episode reward: 2.752, steps: 512




[PPO] Epoch 4/4 Loss=0.4404 Policy=0.0029 Value=0.9389 Entropy=0.6385 KL=0.0025 Corr=-0.0500
[Update 233/1000] Episode reward: 21.535, steps: 512




[PPO] Epoch 4/4 Loss=0.1637 Policy=0.0035 Value=0.3846 Entropy=0.6427 KL=0.0047 Corr=0.0446
[Update 234/1000] Episode reward: -2.674, steps: 512




[PPO] Epoch 4/4 Loss=0.4902 Policy=-0.0006 Value=1.0466 Entropy=0.6494 KL=0.0045 Corr=-0.0414
[Update 235/1000] Episode reward: 32.314, steps: 512




[PPO] Epoch 4/4 Loss=0.1985 Policy=0.0026 Value=0.4575 Entropy=0.6574 KL=0.0043 Corr=-0.0816
[Update 236/1000] Episode reward: 1.857, steps: 512




[PPO] Epoch 4/4 Loss=0.1525 Policy=-0.0031 Value=0.3782 Entropy=0.6697 KL=0.0066 Corr=-0.0538
[Update 237/1000] Episode reward: 5.166, steps: 512




[PPO] Epoch 4/4 Loss=0.1982 Policy=-0.0079 Value=0.4763 Entropy=0.6418 KL=-0.0005 Corr=0.0337
[Update 238/1000] Episode reward: 17.651, steps: 512




[PPO] Epoch 4/4 Loss=0.2147 Policy=0.0013 Value=0.4871 Entropy=0.6025 KL=-0.0036 Corr=0.0179
[Update 239/1000] Episode reward: -2.365, steps: 512




[PPO] Epoch 4/4 Loss=0.1586 Policy=0.0008 Value=0.3743 Entropy=0.5870 KL=0.0011 Corr=-0.0496
[Update 240/1000] Episode reward: -3.864, steps: 512




[PPO] Epoch 4/4 Loss=0.1263 Policy=0.0029 Value=0.3055 Entropy=0.5866 KL=0.0061 Corr=-0.0215
[Update 241/1000] Episode reward: 2.358, steps: 512




[PPO] Epoch 4/4 Loss=0.2510 Policy=-0.0036 Value=0.5682 Entropy=0.5895 KL=0.0045 Corr=0.0088
[Update 242/1000] Episode reward: -1.948, steps: 512




[PPO] Epoch 4/4 Loss=0.0776 Policy=0.0001 Value=0.2176 Entropy=0.6260 KL=0.0034 Corr=-0.0201
[Update 243/1000] Episode reward: -1.988, steps: 512




[PPO] Epoch 4/4 Loss=0.1424 Policy=0.0022 Value=0.3469 Entropy=0.6649 KL=0.0002 Corr=-0.0921
[Update 244/1000] Episode reward: -5.697, steps: 512




[PPO] Epoch 4/4 Loss=0.2239 Policy=0.0008 Value=0.5158 Entropy=0.6947 KL=0.0022 Corr=0.0776
[Update 245/1000] Episode reward: -9.378, steps: 512




[PPO] Epoch 4/4 Loss=0.6040 Policy=-0.0003 Value=1.2795 Entropy=0.7084 KL=0.0009 Corr=0.0076
[Update 246/1000] Episode reward: 18.826, steps: 512




[PPO] Epoch 4/4 Loss=0.0995 Policy=-0.0003 Value=0.2704 Entropy=0.7094 KL=-0.0001 Corr=-0.0285
[Update 247/1000] Episode reward: -4.098, steps: 512




[PPO] Epoch 4/4 Loss=0.4326 Policy=0.0037 Value=0.9279 Entropy=0.7015 KL=0.0018 Corr=0.0189
[Update 248/1000] Episode reward: 7.546, steps: 512




[PPO] Epoch 4/4 Loss=0.2236 Policy=0.0004 Value=0.5153 Entropy=0.6896 KL=0.0037 Corr=-0.0523
[Update 249/1000] Episode reward: -7.012, steps: 512




[PPO] Epoch 4/4 Loss=0.1078 Policy=0.0025 Value=0.2792 Entropy=0.6871 KL=0.0089 Corr=-0.0438
[Update 250/1000] Episode reward: -17.420, steps: 512




[PPO] Epoch 4/4 Loss=0.0678 Policy=-0.0005 Value=0.2075 Entropy=0.7083 KL=0.0012 Corr=0.0238
[Update 251/1000] Episode reward: -12.893, steps: 512




[PPO] Epoch 4/4 Loss=0.1507 Policy=0.0060 Value=0.3630 Entropy=0.7355 KL=0.0031 Corr=0.0170
[Update 252/1000] Episode reward: -6.929, steps: 512




[PPO] Epoch 4/4 Loss=0.2157 Policy=-0.0101 Value=0.5218 Entropy=0.7041 KL=0.0071 Corr=-0.0429
[Update 253/1000] Episode reward: -1.943, steps: 512




[PPO] Epoch 4/4 Loss=0.1510 Policy=-0.0052 Value=0.3794 Entropy=0.6698 KL=0.0071 Corr=0.0221
[Update 254/1000] Episode reward: 5.238, steps: 512




[PPO] Epoch 4/4 Loss=1.1380 Policy=0.0002 Value=2.3414 Entropy=0.6578 KL=-0.0038 Corr=-0.0240
[Update 255/1000] Episode reward: 37.677, steps: 512




[PPO] Epoch 4/4 Loss=0.0827 Policy=0.0011 Value=0.2309 Entropy=0.6772 KL=0.0042 Corr=-0.0093
[Update 256/1000] Episode reward: -4.327, steps: 512




[PPO] Epoch 4/4 Loss=0.4085 Policy=-0.0004 Value=0.8888 Entropy=0.7103 KL=0.0052 Corr=-0.0168
[Update 257/1000] Episode reward: 14.064, steps: 512




[PPO] Epoch 4/4 Loss=0.1429 Policy=0.0033 Value=0.3504 Entropy=0.7124 KL=0.0004 Corr=-0.0117
[Update 258/1000] Episode reward: 3.113, steps: 512




[PPO] Epoch 4/4 Loss=0.1974 Policy=0.0034 Value=0.4569 Entropy=0.6901 KL=0.0041 Corr=0.0047
[Update 259/1000] Episode reward: 10.354, steps: 512




[PPO] Epoch 4/4 Loss=0.1517 Policy=-0.0029 Value=0.3758 Entropy=0.6670 KL=0.0031 Corr=0.0284
[Update 260/1000] Episode reward: 7.449, steps: 512




[PPO] Epoch 4/4 Loss=0.9767 Policy=-0.0022 Value=2.0234 Entropy=0.6570 KL=0.0026 Corr=-0.0235
[Update 261/1000] Episode reward: 37.614, steps: 512




[PPO] Epoch 4/4 Loss=0.1628 Policy=0.0020 Value=0.3858 Entropy=0.6410 KL=0.0049 Corr=-0.0395
[Update 262/1000] Episode reward: 11.145, steps: 512




[PPO] Epoch 4/4 Loss=0.3308 Policy=-0.0043 Value=0.7324 Entropy=0.6215 KL=0.0005 Corr=0.0354
[Update 263/1000] Episode reward: 17.444, steps: 512




[PPO] Epoch 4/4 Loss=0.2256 Policy=-0.0015 Value=0.5159 Entropy=0.6162 KL=0.0001 Corr=-0.0204
[Update 264/1000] Episode reward: -2.118, steps: 512




[PPO] Epoch 4/4 Loss=0.6642 Policy=0.0053 Value=1.3795 Entropy=0.6169 KL=0.0048 Corr=-0.0522
[Update 265/1000] Episode reward: 28.576, steps: 512




[PPO] Epoch 4/4 Loss=0.3944 Policy=-0.0005 Value=0.8549 Entropy=0.6513 KL=0.0026 Corr=-0.0025
[Update 266/1000] Episode reward: 6.326, steps: 512




[PPO] Epoch 4/4 Loss=0.1882 Policy=0.0027 Value=0.4437 Entropy=0.7262 KL=0.0068 Corr=-0.1020
[Update 267/1000] Episode reward: -0.481, steps: 512




[PPO] Epoch 4/4 Loss=0.2014 Policy=0.0056 Value=0.4681 Entropy=0.7643 KL=0.0089 Corr=0.0414
[Update 268/1000] Episode reward: 11.840, steps: 512




[PPO] Epoch 4/4 Loss=0.1765 Policy=-0.0033 Value=0.4357 Entropy=0.7616 KL=-0.0003 Corr=0.0117
[Update 269/1000] Episode reward: 9.895, steps: 512




[PPO] Epoch 4/4 Loss=0.0808 Policy=-0.0023 Value=0.2390 Entropy=0.7269 KL=0.0022 Corr=-0.0022
[Update 270/1000] Episode reward: -6.960, steps: 512




[PPO] Epoch 4/4 Loss=0.3311 Policy=-0.0058 Value=0.7425 Entropy=0.6862 KL=0.0001 Corr=-0.0061
[Update 271/1000] Episode reward: 13.298, steps: 512




[PPO] Epoch 4/4 Loss=0.5159 Policy=-0.0055 Value=1.1066 Entropy=0.6383 KL=0.0045 Corr=-0.0101
[Update 272/1000] Episode reward: 29.475, steps: 512




[PPO] Epoch 4/4 Loss=0.1119 Policy=0.0013 Value=0.2836 Entropy=0.6239 KL=0.0037 Corr=-0.0511
[Update 273/1000] Episode reward: 7.836, steps: 512




[PPO] Epoch 4/4 Loss=0.3516 Policy=0.0010 Value=0.7661 Entropy=0.6483 KL=0.0034 Corr=-0.0542
[Update 274/1000] Episode reward: 27.692, steps: 512




[PPO] Epoch 4/4 Loss=0.1706 Policy=0.0002 Value=0.4049 Entropy=0.6420 KL=0.0040 Corr=-0.0295
[Update 275/1000] Episode reward: 15.939, steps: 512




[PPO] Epoch 4/4 Loss=0.4396 Policy=-0.0040 Value=0.9480 Entropy=0.6093 KL=0.0034 Corr=-0.0183
[Update 276/1000] Episode reward: 25.689, steps: 512




[PPO] Epoch 4/4 Loss=0.3490 Policy=0.0011 Value=0.7550 Entropy=0.5915 KL=0.0012 Corr=-0.0181
[Update 277/1000] Episode reward: -12.243, steps: 512




[PPO] Epoch 4/4 Loss=0.2983 Policy=-0.0038 Value=0.6625 Entropy=0.5837 KL=-0.0016 Corr=-0.0494
[Update 278/1000] Episode reward: 19.647, steps: 512




[PPO] Epoch 4/4 Loss=0.8902 Policy=0.0023 Value=1.8328 Entropy=0.5696 KL=0.0066 Corr=-0.0477
[Update 279/1000] Episode reward: -27.825, steps: 512




[PPO] Epoch 4/4 Loss=0.5267 Policy=0.0002 Value=1.1093 Entropy=0.5634 KL=0.0022 Corr=0.0697
[Update 280/1000] Episode reward: 34.551, steps: 512




[PPO] Epoch 4/4 Loss=0.1441 Policy=-0.0076 Value=0.3606 Entropy=0.5714 KL=-0.0012 Corr=0.0603
[Update 281/1000] Episode reward: 7.166, steps: 512




[PPO] Epoch 4/4 Loss=0.2232 Policy=-0.0062 Value=0.5189 Entropy=0.6004 KL=0.0034 Corr=-0.0415
[Update 282/1000] Episode reward: -9.976, steps: 512




[PPO] Epoch 4/4 Loss=0.4031 Policy=0.0039 Value=0.8609 Entropy=0.6250 KL=-0.0018 Corr=-0.0079
[Update 283/1000] Episode reward: 24.937, steps: 512




[PPO] Epoch 4/4 Loss=0.0929 Policy=-0.0033 Value=0.2542 Entropy=0.6191 KL=0.0033 Corr=-0.0420
[Update 284/1000] Episode reward: 2.733, steps: 512




[PPO] Epoch 4/4 Loss=0.2748 Policy=0.0037 Value=0.6028 Entropy=0.6056 KL=0.0015 Corr=-0.0932
[Update 285/1000] Episode reward: 17.249, steps: 512




[PPO] Epoch 4/4 Loss=0.2609 Policy=0.0005 Value=0.5844 Entropy=0.6365 KL=0.0056 Corr=-0.0650
[Update 286/1000] Episode reward: 10.628, steps: 512




[PPO] Epoch 4/4 Loss=0.1142 Policy=0.0030 Value=0.2931 Entropy=0.7074 KL=0.0071 Corr=-0.0122
[Update 287/1000] Episode reward: 5.516, steps: 512




[PPO] Epoch 4/4 Loss=0.1352 Policy=-0.0020 Value=0.3453 Entropy=0.7108 KL=0.0015 Corr=0.0033
[Update 288/1000] Episode reward: 12.167, steps: 512




[PPO] Epoch 4/4 Loss=0.2515 Policy=-0.0075 Value=0.5821 Entropy=0.6413 KL=0.0008 Corr=0.0501
[Update 289/1000] Episode reward: 10.423, steps: 512




[PPO] Epoch 4/4 Loss=0.2709 Policy=-0.0022 Value=0.6065 Entropy=0.6032 KL=0.0144 Corr=-0.0475
[Update 290/1000] Episode reward: 9.960, steps: 512




[PPO] Epoch 4/4 Loss=0.3589 Policy=0.0036 Value=0.7762 Entropy=0.6543 KL=0.0089 Corr=-0.0040
[Update 291/1000] Episode reward: 19.365, steps: 512




[PPO] Epoch 4/4 Loss=0.6516 Policy=0.0017 Value=1.3698 Entropy=0.6998 KL=0.0051 Corr=0.1029
[Update 292/1000] Episode reward: -4.386, steps: 512




[PPO] Epoch 4/4 Loss=0.7650 Policy=0.0015 Value=1.6012 Entropy=0.7426 KL=0.0027 Corr=-0.0599
[Update 293/1000] Episode reward: 22.121, steps: 512




[PPO] Epoch 4/4 Loss=0.5740 Policy=-0.0065 Value=1.2379 Entropy=0.7681 KL=0.0002 Corr=0.0561
[Update 294/1000] Episode reward: -12.637, steps: 512




[PPO] Epoch 4/4 Loss=0.4188 Policy=0.0035 Value=0.9105 Entropy=0.8001 KL=0.0002 Corr=0.0120
[Update 295/1000] Episode reward: 17.258, steps: 512




[PPO] Epoch 4/4 Loss=0.4751 Policy=-0.0016 Value=1.0360 Entropy=0.8262 KL=0.0063 Corr=0.0474
[Update 296/1000] Episode reward: 25.166, steps: 512




[PPO] Epoch 4/4 Loss=0.1596 Policy=-0.0023 Value=0.4076 Entropy=0.8392 KL=0.0013 Corr=-0.0157
[Update 297/1000] Episode reward: 4.986, steps: 512




[PPO] Epoch 4/4 Loss=0.1874 Policy=-0.0010 Value=0.4648 Entropy=0.8798 KL=0.0115 Corr=0.0296
[Update 298/1000] Episode reward: 8.901, steps: 512




[PPO] Epoch 4/4 Loss=0.2340 Policy=0.0104 Value=0.5401 Entropy=0.9278 KL=0.0071 Corr=-0.0628
[Update 299/1000] Episode reward: -3.666, steps: 512




[PPO] Epoch 4/4 Loss=0.1750 Policy=-0.0027 Value=0.4500 Entropy=0.9452 KL=0.0153 Corr=-0.0433
[Update 300/1000] Episode reward: -10.038, steps: 512




[PPO] Epoch 4/4 Loss=0.1873 Policy=0.0008 Value=0.4682 Entropy=0.9523 KL=0.0103 Corr=0.0165
[Update 301/1000] Episode reward: -4.610, steps: 512




[PPO] Epoch 4/4 Loss=0.1651 Policy=-0.0030 Value=0.4316 Entropy=0.9546 KL=0.0053 Corr=-0.0187
[Update 302/1000] Episode reward: -13.529, steps: 512




[PPO] Epoch 4/4 Loss=0.0399 Policy=-0.0010 Value=0.1781 Entropy=0.9618 KL=0.0084 Corr=0.0005
[Update 303/1000] Episode reward: 1.398, steps: 512




[PPO] Epoch 4/4 Loss=0.4172 Policy=0.0004 Value=0.9312 Entropy=0.9773 KL=0.0041 Corr=-0.0219
[Update 304/1000] Episode reward: -11.393, steps: 512




[PPO] Epoch 4/4 Loss=0.1553 Policy=-0.0011 Value=0.4109 Entropy=0.9816 KL=0.0026 Corr=-0.0735
[Update 305/1000] Episode reward: -0.726, steps: 512




[PPO] Epoch 4/4 Loss=0.3239 Policy=-0.0007 Value=0.7451 Entropy=0.9595 KL=0.0031 Corr=-0.0032
[Update 306/1000] Episode reward: -15.027, steps: 512




[PPO] Epoch 4/4 Loss=0.3863 Policy=0.0017 Value=0.8636 Entropy=0.9425 KL=0.0075 Corr=-0.0950
[Update 307/1000] Episode reward: 12.278, steps: 512




[PPO] Epoch 4/4 Loss=0.3138 Policy=-0.0083 Value=0.7346 Entropy=0.9042 KL=0.0039 Corr=0.0346
[Update 308/1000] Episode reward: 1.440, steps: 512




[PPO] Epoch 4/4 Loss=0.1205 Policy=-0.0033 Value=0.3386 Entropy=0.9091 KL=0.0128 Corr=0.0118
[Update 309/1000] Episode reward: -7.470, steps: 512




[PPO] Epoch 4/4 Loss=0.1310 Policy=0.0022 Value=0.3484 Entropy=0.9079 KL=0.0000 Corr=0.0441
[Update 310/1000] Episode reward: 7.516, steps: 512




[PPO] Epoch 4/4 Loss=0.1058 Policy=-0.0029 Value=0.3058 Entropy=0.8836 KL=0.0033 Corr=0.0362
[Update 311/1000] Episode reward: 4.224, steps: 512




[PPO] Epoch 4/4 Loss=0.1612 Policy=-0.0024 Value=0.4109 Entropy=0.8367 KL=0.0102 Corr=-0.0169
[Update 312/1000] Episode reward: 10.461, steps: 512




[PPO] Epoch 4/4 Loss=0.2551 Policy=0.0013 Value=0.5875 Entropy=0.7986 KL=-0.0013 Corr=-0.0026
[Update 313/1000] Episode reward: -1.322, steps: 512




[PPO] Epoch 4/4 Loss=0.2231 Policy=0.0025 Value=0.5210 Entropy=0.7968 KL=-0.0033 Corr=0.0680
[Update 314/1000] Episode reward: 2.524, steps: 512




[PPO] Epoch 4/4 Loss=0.3305 Policy=0.0050 Value=0.7320 Entropy=0.8098 KL=0.0011 Corr=-0.0205
[Update 315/1000] Episode reward: -2.359, steps: 512




[PPO] Epoch 4/4 Loss=0.3786 Policy=-0.0012 Value=0.8416 Entropy=0.8202 KL=0.0079 Corr=-0.0046
[Update 316/1000] Episode reward: 20.701, steps: 512




[PPO] Epoch 4/4 Loss=0.2088 Policy=-0.0023 Value=0.5036 Entropy=0.8127 KL=-0.0018 Corr=0.0434
[Update 317/1000] Episode reward: 11.389, steps: 512




[PPO] Epoch 4/4 Loss=0.1750 Policy=-0.0027 Value=0.4358 Entropy=0.8046 KL=0.0041 Corr=0.0097
[Update 318/1000] Episode reward: -10.472, steps: 512




[PPO] Epoch 4/4 Loss=0.1293 Policy=0.0059 Value=0.3277 Entropy=0.8100 KL=0.0040 Corr=0.0272
[Update 319/1000] Episode reward: -8.296, steps: 512




[PPO] Epoch 4/4 Loss=0.4564 Policy=0.0015 Value=0.9916 Entropy=0.8183 KL=0.0097 Corr=0.0081
[Update 320/1000] Episode reward: -21.617, steps: 512




[PPO] Epoch 4/4 Loss=0.5080 Policy=0.0017 Value=1.0952 Entropy=0.8265 KL=0.0061 Corr=-0.0445
[Update 321/1000] Episode reward: 17.204, steps: 512




[PPO] Epoch 4/4 Loss=0.3298 Policy=-0.0049 Value=0.7524 Entropy=0.8307 KL=0.0054 Corr=0.1201
[Update 322/1000] Episode reward: 15.446, steps: 512




[PPO] Epoch 4/4 Loss=0.1160 Policy=0.0045 Value=0.3061 Entropy=0.8302 KL=0.0051 Corr=0.0626
[Update 323/1000] Episode reward: 5.721, steps: 512




[PPO] Epoch 4/4 Loss=0.0849 Policy=0.0028 Value=0.2478 Entropy=0.8352 KL=0.0013 Corr=-0.0830
[Update 324/1000] Episode reward: -0.900, steps: 512




[PPO] Epoch 4/4 Loss=0.2486 Policy=0.0034 Value=0.5730 Entropy=0.8261 KL=0.0030 Corr=0.0769
[Update 325/1000] Episode reward: 7.899, steps: 512




[PPO] Epoch 4/4 Loss=0.0735 Policy=-0.0006 Value=0.2307 Entropy=0.8247 KL=-0.0003 Corr=0.0329
[Update 326/1000] Episode reward: -4.024, steps: 512




[PPO] Epoch 4/4 Loss=0.1194 Policy=-0.0090 Value=0.3345 Entropy=0.7763 KL=0.0112 Corr=-0.0030
[Update 327/1000] Episode reward: 2.263, steps: 512




[PPO] Epoch 4/4 Loss=0.3189 Policy=0.0040 Value=0.6982 Entropy=0.6840 KL=0.0054 Corr=0.0007
[Update 328/1000] Episode reward: 7.450, steps: 512




[PPO] Epoch 4/4 Loss=0.4010 Policy=0.0005 Value=0.8653 Entropy=0.6410 KL=0.0018 Corr=-0.0360
[Update 329/1000] Episode reward: 9.055, steps: 512




[PPO] Epoch 4/4 Loss=0.1300 Policy=-0.0039 Value=0.3342 Entropy=0.6656 KL=0.0013 Corr=-0.0181
[Update 330/1000] Episode reward: 2.568, steps: 512




[PPO] Epoch 4/4 Loss=0.6269 Policy=-0.0064 Value=1.3403 Entropy=0.7374 KL=0.0131 Corr=0.0519
[Update 331/1000] Episode reward: 15.758, steps: 512




[PPO] Epoch 4/4 Loss=0.2397 Policy=0.0040 Value=0.5487 Entropy=0.7737 KL=0.0040 Corr=-0.0038
[Update 332/1000] Episode reward: 7.300, steps: 512




[PPO] Epoch 4/4 Loss=0.3464 Policy=0.0009 Value=0.7681 Entropy=0.7705 KL=0.0008 Corr=0.0778
[Update 333/1000] Episode reward: 18.579, steps: 512




[PPO] Epoch 4/4 Loss=0.1970 Policy=-0.0037 Value=0.4775 Entropy=0.7595 KL=0.0007 Corr=-0.0597
[Update 334/1000] Episode reward: -0.543, steps: 512




[PPO] Epoch 4/4 Loss=0.2225 Policy=0.0014 Value=0.5165 Entropy=0.7409 KL=0.0027 Corr=0.0304
[Update 335/1000] Episode reward: 10.381, steps: 512




[PPO] Epoch 4/4 Loss=0.1198 Policy=-0.0003 Value=0.3105 Entropy=0.7022 KL=0.0036 Corr=-0.0183
[Update 336/1000] Episode reward: 10.251, steps: 512




[PPO] Epoch 4/4 Loss=0.1658 Policy=-0.0107 Value=0.4165 Entropy=0.6347 KL=0.0079 Corr=0.0585
[Update 337/1000] Episode reward: 6.571, steps: 512




[PPO] Epoch 4/4 Loss=0.2703 Policy=0.0020 Value=0.5947 Entropy=0.5791 KL=0.0035 Corr=-0.0415
[Update 338/1000] Episode reward: -6.701, steps: 512




[PPO] Epoch 4/4 Loss=0.4756 Policy=0.0011 Value=1.0059 Entropy=0.5669 KL=-0.0033 Corr=-0.0387
[Update 339/1000] Episode reward: -0.323, steps: 512




[PPO] Epoch 4/4 Loss=0.4503 Policy=-0.0051 Value=0.9735 Entropy=0.6253 KL=0.0129 Corr=0.0233
[Update 340/1000] Episode reward: 5.247, steps: 512




[PPO] Epoch 4/4 Loss=0.3078 Policy=0.0082 Value=0.6707 Entropy=0.7135 KL=0.0005 Corr=0.0264
[Update 341/1000] Episode reward: 3.925, steps: 512




[PPO] Epoch 4/4 Loss=0.8276 Policy=0.0013 Value=1.7270 Entropy=0.7453 KL=0.0033 Corr=0.0797
[Update 342/1000] Episode reward: -19.463, steps: 512




[PPO] Epoch 4/4 Loss=0.1666 Policy=0.0001 Value=0.4081 Entropy=0.7495 KL=-0.0014 Corr=0.0061
[Update 343/1000] Episode reward: 3.377, steps: 512




[PPO] Epoch 4/4 Loss=0.0813 Policy=0.0009 Value=0.2349 Entropy=0.7415 KL=0.0077 Corr=0.0372
[Update 344/1000] Episode reward: -3.396, steps: 512




[PPO] Epoch 4/4 Loss=0.6079 Policy=0.0008 Value=1.2893 Entropy=0.7500 KL=0.0059 Corr=-0.0872
[Update 345/1000] Episode reward: 29.826, steps: 512




[PPO] Epoch 4/4 Loss=0.3147 Policy=0.0055 Value=0.6943 Entropy=0.7604 KL=0.0054 Corr=-0.0006
[Update 346/1000] Episode reward: 0.380, steps: 512




[PPO] Epoch 4/4 Loss=1.0650 Policy=0.0064 Value=2.1949 Entropy=0.7777 KL=0.0051 Corr=0.0241
[Update 347/1000] Episode reward: -44.030, steps: 512




[PPO] Epoch 4/4 Loss=0.5533 Policy=0.0060 Value=1.1735 Entropy=0.7883 KL=0.0049 Corr=0.0422
[Update 348/1000] Episode reward: 21.379, steps: 512




[PPO] Epoch 4/4 Loss=0.1151 Policy=-0.0012 Value=0.3121 Entropy=0.7946 KL=0.0099 Corr=0.0664
[Update 349/1000] Episode reward: -9.656, steps: 512




[PPO] Epoch 4/4 Loss=0.0537 Policy=-0.0044 Value=0.1978 Entropy=0.8152 KL=0.0047 Corr=0.1176
[Update 350/1000] Episode reward: -1.847, steps: 512




[PPO] Epoch 4/4 Loss=0.0787 Policy=0.0003 Value=0.2408 Entropy=0.8409 KL=0.0027 Corr=0.0598
[Update 351/1000] Episode reward: -7.904, steps: 512




[PPO] Epoch 4/4 Loss=0.3275 Policy=0.0040 Value=0.7324 Entropy=0.8540 KL=0.0013 Corr=0.0864
[Update 352/1000] Episode reward: 13.536, steps: 512




[PPO] Epoch 4/4 Loss=0.2119 Policy=-0.0002 Value=0.5089 Entropy=0.8472 KL=0.0010 Corr=0.0611
[Update 353/1000] Episode reward: 9.895, steps: 512




[PPO] Epoch 4/4 Loss=0.2992 Policy=0.0026 Value=0.6769 Entropy=0.8383 KL=-0.0004 Corr=0.0297
[Update 354/1000] Episode reward: -5.480, steps: 512




[PPO] Epoch 4/4 Loss=0.0718 Policy=-0.0076 Value=0.2401 Entropy=0.8126 KL=0.0084 Corr=0.0106
[Update 355/1000] Episode reward: 1.953, steps: 512




[PPO] Epoch 4/4 Loss=0.2387 Policy=0.0027 Value=0.5484 Entropy=0.7631 KL=0.0014 Corr=-0.0443
[Update 356/1000] Episode reward: 6.026, steps: 512




[PPO] Epoch 4/4 Loss=0.1683 Policy=0.0076 Value=0.3974 Entropy=0.7611 KL=0.0069 Corr=0.0368
[Update 357/1000] Episode reward: -7.177, steps: 512




[PPO] Epoch 4/4 Loss=0.2496 Policy=-0.0042 Value=0.5857 Entropy=0.7809 KL=0.0036 Corr=0.0538
[Update 358/1000] Episode reward: -18.199, steps: 512




[PPO] Epoch 4/4 Loss=0.6704 Policy=0.0015 Value=1.4178 Entropy=0.8000 KL=-0.0000 Corr=0.0251
[Update 359/1000] Episode reward: 15.829, steps: 512




[PPO] Epoch 4/4 Loss=0.4414 Policy=0.0034 Value=0.9567 Entropy=0.8063 KL=0.0012 Corr=0.1238
[Update 360/1000] Episode reward: -12.265, steps: 512




[PPO] Epoch 4/4 Loss=0.3867 Policy=0.0005 Value=0.8538 Entropy=0.8124 KL=0.0052 Corr=0.0762
[Update 361/1000] Episode reward: 11.890, steps: 512




[PPO] Epoch 4/4 Loss=0.3542 Policy=-0.0002 Value=0.7906 Entropy=0.8189 KL=0.0040 Corr=0.1541
[Update 362/1000] Episode reward: 16.226, steps: 512




[PPO] Epoch 4/4 Loss=0.1774 Policy=0.0023 Value=0.4335 Entropy=0.8319 KL=0.0033 Corr=-0.0687
[Update 363/1000] Episode reward: -1.084, steps: 512




[PPO] Epoch 4/4 Loss=0.0395 Policy=-0.0048 Value=0.1720 Entropy=0.8341 KL=0.0021 Corr=0.0106
[Update 364/1000] Episode reward: 0.364, steps: 512




[PPO] Epoch 4/4 Loss=0.4409 Policy=-0.0008 Value=0.9650 Entropy=0.8163 KL=0.0084 Corr=0.0662
[Update 365/1000] Episode reward: 15.212, steps: 512




[PPO] Epoch 4/4 Loss=0.1293 Policy=0.0007 Value=0.3392 Entropy=0.8199 KL=0.0042 Corr=0.1657
[Update 366/1000] Episode reward: 2.953, steps: 512




[PPO] Epoch 4/4 Loss=0.2189 Policy=0.0137 Value=0.4949 Entropy=0.8452 KL=-0.0023 Corr=0.0379
[Update 367/1000] Episode reward: 11.202, steps: 512




[PPO] Epoch 4/4 Loss=0.0827 Policy=0.0006 Value=0.2490 Entropy=0.8471 KL=0.0062 Corr=0.1297
[Update 368/1000] Episode reward: -1.680, steps: 512




[PPO] Epoch 4/4 Loss=0.1286 Policy=0.0042 Value=0.3332 Entropy=0.8439 KL=0.0038 Corr=0.1460
[Update 369/1000] Episode reward: -10.568, steps: 512




[PPO] Epoch 4/4 Loss=0.2259 Policy=-0.0042 Value=0.5441 Entropy=0.8394 KL=0.0059 Corr=0.0382
[Update 370/1000] Episode reward: 12.974, steps: 512




[PPO] Epoch 4/4 Loss=0.3258 Policy=-0.0035 Value=0.7406 Entropy=0.8213 KL=0.0046 Corr=0.0810
[Update 371/1000] Episode reward: 11.854, steps: 512




[PPO] Epoch 4/4 Loss=0.2297 Policy=0.0059 Value=0.5280 Entropy=0.8032 KL=0.0065 Corr=0.1267
[Update 372/1000] Episode reward: -9.309, steps: 512




[PPO] Epoch 4/4 Loss=0.4373 Policy=-0.0012 Value=0.9560 Entropy=0.7901 KL=0.0052 Corr=0.0153
[Update 373/1000] Episode reward: 23.353, steps: 512




[PPO] Epoch 4/4 Loss=0.2255 Policy=-0.0056 Value=0.5389 Entropy=0.7683 KL=-0.0015 Corr=0.1419
[Update 374/1000] Episode reward: 5.241, steps: 512




[PPO] Epoch 4/4 Loss=0.4424 Policy=-0.0023 Value=0.9650 Entropy=0.7560 KL=0.0005 Corr=0.0519
[Update 375/1000] Episode reward: 13.590, steps: 512




[PPO] Epoch 4/4 Loss=0.2908 Policy=-0.0086 Value=0.6772 Entropy=0.7843 KL=0.0061 Corr=0.3243
[Update 376/1000] Episode reward: 3.308, steps: 512




[PPO] Epoch 4/4 Loss=0.8510 Policy=0.2404 Value=1.3222 Entropy=1.0107 KL=0.3647 Corr=0.1570
[Update 377/1000] Episode reward: -17.324, steps: 512




[PPO] Epoch 4/4 Loss=0.1222 Policy=0.0070 Value=0.3126 Entropy=0.8229 KL=0.0073 Corr=0.3290
[Update 378/1000] Episode reward: -0.551, steps: 512




[PPO] Epoch 4/4 Loss=0.0295 Policy=0.0038 Value=0.1320 Entropy=0.8068 KL=0.0113 Corr=0.5547
[Update 379/1000] Episode reward: 4.283, steps: 512




[PPO] Epoch 4/4 Loss=0.1357 Policy=0.0003 Value=0.3448 Entropy=0.7409 KL=0.0103 Corr=0.0904
[Update 380/1000] Episode reward: 15.066, steps: 512




[PPO] Epoch 4/4 Loss=0.1437 Policy=-0.0033 Value=0.3742 Entropy=0.8017 KL=0.0098 Corr=0.2219
[Update 381/1000] Episode reward: 3.598, steps: 512




[PPO] Epoch 4/4 Loss=0.2156 Policy=0.0018 Value=0.5008 Entropy=0.7314 KL=0.0393 Corr=0.2326
[Update 382/1000] Episode reward: 16.056, steps: 512




[PPO] Epoch 4/4 Loss=0.1804 Policy=-0.0023 Value=0.4440 Entropy=0.7873 KL=0.0211 Corr=0.3167
[Update 383/1000] Episode reward: -0.458, steps: 512




[PPO] Epoch 4/4 Loss=0.4710 Policy=0.0122 Value=1.0049 Entropy=0.8732 KL=0.0226 Corr=0.1360
[Update 384/1000] Episode reward: -11.951, steps: 512




[PPO] Epoch 4/4 Loss=0.0871 Policy=0.0014 Value=0.2543 Entropy=0.8288 KL=0.0019 Corr=0.0030
[Update 385/1000] Episode reward: -8.142, steps: 512




[PPO] Epoch 4/4 Loss=0.1383 Policy=-0.0010 Value=0.3656 Entropy=0.8715 KL=0.0105 Corr=-0.0957
[Update 386/1000] Episode reward: 1.915, steps: 512




[PPO] Epoch 4/4 Loss=0.1963 Policy=-0.0018 Value=0.4804 Entropy=0.8424 KL=0.0077 Corr=-0.0701
[Update 387/1000] Episode reward: 19.226, steps: 512




[PPO] Epoch 4/4 Loss=0.1974 Policy=-0.0002 Value=0.4799 Entropy=0.8465 KL=0.0048 Corr=0.0915
[Update 388/1000] Episode reward: 5.250, steps: 512




[PPO] Epoch 4/4 Loss=0.4009 Policy=-0.0060 Value=0.9024 Entropy=0.8854 KL=0.0006 Corr=-0.0444
[Update 389/1000] Episode reward: -16.003, steps: 512




[PPO] Epoch 4/4 Loss=0.2811 Policy=-0.0015 Value=0.6550 Entropy=0.8987 KL=-0.0028 Corr=0.0170
[Update 390/1000] Episode reward: 7.480, steps: 512




[PPO] Epoch 4/4 Loss=0.1609 Policy=-0.0037 Value=0.4170 Entropy=0.8797 KL=0.0038 Corr=-0.0569
[Update 391/1000] Episode reward: 17.458, steps: 512




[PPO] Epoch 4/4 Loss=0.4913 Policy=-0.0019 Value=1.0724 Entropy=0.8594 KL=0.0014 Corr=0.0161
[Update 392/1000] Episode reward: -14.954, steps: 512




[PPO] Epoch 4/4 Loss=0.3865 Policy=-0.0010 Value=0.8589 Entropy=0.8380 KL=0.0036 Corr=-0.0125
[Update 393/1000] Episode reward: 27.774, steps: 512




[PPO] Epoch 4/4 Loss=0.4651 Policy=-0.0038 Value=1.0184 Entropy=0.8058 KL=-0.0001 Corr=-0.0204
[Update 394/1000] Episode reward: 23.002, steps: 512




[PPO] Epoch 4/4 Loss=0.9829 Policy=-0.0062 Value=2.0531 Entropy=0.7495 KL=-0.0037 Corr=0.0399
[Update 395/1000] Episode reward: 24.943, steps: 512




[PPO] Epoch 4/4 Loss=0.3442 Policy=-0.0008 Value=0.7589 Entropy=0.6889 KL=0.0050 Corr=-0.0172
[Update 396/1000] Episode reward: 21.078, steps: 512




[PPO] Epoch 4/4 Loss=0.4262 Policy=0.0005 Value=0.9189 Entropy=0.6739 KL=0.0050 Corr=0.0238
[Update 397/1000] Episode reward: -3.895, steps: 512




[PPO] Epoch 4/4 Loss=0.2742 Policy=0.0008 Value=0.6181 Entropy=0.7122 KL=-0.0014 Corr=0.0535
[Update 398/1000] Episode reward: -2.250, steps: 512




[PPO] Epoch 4/4 Loss=0.5503 Policy=0.0044 Value=1.1663 Entropy=0.7439 KL=-0.0007 Corr=0.1414
[Update 399/1000] Episode reward: 27.804, steps: 512




[PPO] Epoch 4/4 Loss=0.4929 Policy=0.0015 Value=1.0582 Entropy=0.7548 KL=0.0020 Corr=-0.0930
[Update 400/1000] Episode reward: -23.822, steps: 512




[PPO] Epoch 4/4 Loss=0.2533 Policy=0.0050 Value=0.5727 Entropy=0.7593 KL=0.0026 Corr=0.0424
[Update 401/1000] Episode reward: 16.437, steps: 512




[PPO] Epoch 4/4 Loss=0.2974 Policy=-0.0013 Value=0.6718 Entropy=0.7421 KL=0.0032 Corr=-0.0366
[Update 402/1000] Episode reward: 10.177, steps: 512




[PPO] Epoch 4/4 Loss=0.1887 Policy=-0.0041 Value=0.4575 Entropy=0.7196 KL=0.0039 Corr=-0.0530
[Update 403/1000] Episode reward: 8.667, steps: 512




[PPO] Epoch 4/4 Loss=0.2804 Policy=-0.0003 Value=0.6333 Entropy=0.7192 KL=0.0074 Corr=-0.0005
[Update 404/1000] Episode reward: 12.060, steps: 512




[PPO] Epoch 4/4 Loss=0.2967 Policy=0.0018 Value=0.6655 Entropy=0.7567 KL=0.0071 Corr=-0.0447
[Update 405/1000] Episode reward: -6.029, steps: 512




[PPO] Epoch 4/4 Loss=0.2210 Policy=-0.0010 Value=0.5237 Entropy=0.7967 KL=0.0041 Corr=0.0469
[Update 406/1000] Episode reward: -7.626, steps: 512




[PPO] Epoch 4/4 Loss=0.1854 Policy=-0.0040 Value=0.4594 Entropy=0.8057 KL=0.0032 Corr=-0.0224
[Update 407/1000] Episode reward: 5.660, steps: 512




[PPO] Epoch 4/4 Loss=0.3025 Policy=0.0052 Value=0.6736 Entropy=0.7897 KL=0.0052 Corr=0.0328
[Update 408/1000] Episode reward: 12.830, steps: 512




[PPO] Epoch 4/4 Loss=0.2430 Policy=0.0008 Value=0.5629 Entropy=0.7860 KL=0.0028 Corr=0.0261
[Update 409/1000] Episode reward: 8.408, steps: 512




[PPO] Epoch 4/4 Loss=0.1840 Policy=-0.0012 Value=0.4488 Entropy=0.7849 KL=0.0001 Corr=0.1242
[Update 410/1000] Episode reward: -4.706, steps: 512




[PPO] Epoch 4/4 Loss=0.2662 Policy=0.0031 Value=0.6055 Entropy=0.7924 KL=0.0033 Corr=-0.0639
[Update 411/1000] Episode reward: -2.817, steps: 512




[PPO] Epoch 4/4 Loss=0.2722 Policy=0.0009 Value=0.6264 Entropy=0.8388 KL=0.0019 Corr=-0.0521
[Update 412/1000] Episode reward: 7.915, steps: 512




[PPO] Epoch 4/4 Loss=0.0843 Policy=-0.0004 Value=0.2590 Entropy=0.8965 KL=0.0046 Corr=-0.0075
[Update 413/1000] Episode reward: 0.989, steps: 512




[PPO] Epoch 4/4 Loss=0.2131 Policy=-0.0050 Value=0.5298 Entropy=0.9361 KL=0.0102 Corr=-0.0163
[Update 414/1000] Episode reward: -11.239, steps: 512




[PPO] Epoch 4/4 Loss=0.1277 Policy=-0.0046 Value=0.3583 Entropy=0.9372 KL=0.0056 Corr=-0.0205
[Update 415/1000] Episode reward: 5.693, steps: 512




[PPO] Epoch 4/4 Loss=0.1583 Policy=0.0028 Value=0.4018 Entropy=0.9064 KL=-0.0003 Corr=-0.0081
[Update 416/1000] Episode reward: -2.403, steps: 512




[PPO] Epoch 4/4 Loss=0.4997 Policy=0.0007 Value=1.0859 Entropy=0.8795 KL=-0.0006 Corr=0.0609
[Update 417/1000] Episode reward: 20.564, steps: 512




[PPO] Epoch 4/4 Loss=0.4861 Policy=-0.0057 Value=1.0687 Entropy=0.8506 KL=-0.0017 Corr=-0.0223
[Update 418/1000] Episode reward: 14.131, steps: 512




[PPO] Epoch 4/4 Loss=0.1435 Policy=0.0042 Value=0.3604 Entropy=0.8175 KL=0.0024 Corr=-0.0265
[Update 419/1000] Episode reward: 8.479, steps: 512




[PPO] Epoch 4/4 Loss=0.8110 Policy=-0.0009 Value=1.7034 Entropy=0.7963 KL=0.0037 Corr=-0.0834
[Update 420/1000] Episode reward: 15.412, steps: 512




[PPO] Epoch 4/4 Loss=0.4435 Policy=-0.0051 Value=0.9756 Entropy=0.7836 KL=-0.0011 Corr=-0.0552
[Update 421/1000] Episode reward: -0.095, steps: 512




[PPO] Epoch 4/4 Loss=0.5145 Policy=-0.0012 Value=1.1095 Entropy=0.7814 KL=0.0023 Corr=-0.0395
[Update 422/1000] Episode reward: 17.302, steps: 512




[PPO] Epoch 4/4 Loss=0.3840 Policy=0.0005 Value=0.8461 Entropy=0.7916 KL=0.0039 Corr=-0.0568
[Update 423/1000] Episode reward: 3.649, steps: 512




[PPO] Epoch 4/4 Loss=0.3140 Policy=0.0037 Value=0.7028 Entropy=0.8212 KL=0.0023 Corr=0.0232
[Update 424/1000] Episode reward: 16.386, steps: 512




[PPO] Epoch 4/4 Loss=0.4091 Policy=0.0019 Value=0.8988 Entropy=0.8442 KL=0.0011 Corr=-0.0475
[Update 425/1000] Episode reward: 1.791, steps: 512




[PPO] Epoch 4/4 Loss=0.3349 Policy=0.0035 Value=0.7484 Entropy=0.8574 KL=0.0044 Corr=-0.0435
[Update 426/1000] Episode reward: 27.563, steps: 512




[PPO] Epoch 4/4 Loss=0.1515 Policy=0.0027 Value=0.3841 Entropy=0.8667 KL=0.0035 Corr=-0.0528
[Update 427/1000] Episode reward: -1.326, steps: 512




[PPO] Epoch 4/4 Loss=0.2316 Policy=-0.0028 Value=0.5550 Entropy=0.8602 KL=0.0075 Corr=-0.0874
[Update 428/1000] Episode reward: 5.887, steps: 512




[PPO] Epoch 4/4 Loss=0.4234 Policy=-0.0047 Value=0.9382 Entropy=0.8198 KL=0.0030 Corr=-0.0492
[Update 429/1000] Episode reward: 23.593, steps: 512




[PPO] Epoch 4/4 Loss=0.2569 Policy=-0.0013 Value=0.5952 Entropy=0.7883 KL=-0.0014 Corr=0.0460
[Update 430/1000] Episode reward: -12.131, steps: 512




[PPO] Epoch 4/4 Loss=0.3297 Policy=-0.0024 Value=0.7408 Entropy=0.7672 KL=0.0022 Corr=0.0689
[Update 431/1000] Episode reward: 14.268, steps: 512




[PPO] Epoch 4/4 Loss=0.4416 Policy=-0.0029 Value=0.9657 Entropy=0.7655 KL=0.0035 Corr=-0.0515
[Update 432/1000] Episode reward: 5.539, steps: 512




[PPO] Epoch 4/4 Loss=0.7451 Policy=0.0047 Value=1.5610 Entropy=0.8008 KL=0.0086 Corr=0.0612
[Update 433/1000] Episode reward: 13.815, steps: 512




[PPO] Epoch 4/4 Loss=0.2627 Policy=0.0039 Value=0.6000 Entropy=0.8226 KL=-0.0010 Corr=0.0200
[Update 434/1000] Episode reward: -6.048, steps: 512




[PPO] Epoch 4/4 Loss=0.2145 Policy=0.0002 Value=0.5142 Entropy=0.8555 KL=0.0039 Corr=0.0196
[Update 435/1000] Episode reward: 5.628, steps: 512




[PPO] Epoch 4/4 Loss=0.0911 Policy=0.0121 Value=0.2461 Entropy=0.8814 KL=-0.0020 Corr=-0.0733
[Update 436/1000] Episode reward: 0.460, steps: 512




[PPO] Epoch 4/4 Loss=0.1193 Policy=-0.0077 Value=0.3425 Entropy=0.8850 KL=-0.0015 Corr=0.0325
[Update 437/1000] Episode reward: 8.361, steps: 512




[PPO] Epoch 4/4 Loss=0.2049 Policy=-0.0021 Value=0.5008 Entropy=0.8673 KL=0.0026 Corr=0.0110
[Update 438/1000] Episode reward: -12.635, steps: 512




[PPO] Epoch 4/4 Loss=0.0930 Policy=-0.0096 Value=0.2896 Entropy=0.8437 KL=-0.0000 Corr=0.0376
[Update 439/1000] Episode reward: 8.859, steps: 512




[PPO] Epoch 4/4 Loss=0.1061 Policy=-0.0047 Value=0.3041 Entropy=0.8238 KL=-0.0002 Corr=-0.0148
[Update 440/1000] Episode reward: -0.265, steps: 512




[PPO] Epoch 4/4 Loss=0.0958 Policy=0.0012 Value=0.2724 Entropy=0.8318 KL=-0.0002 Corr=0.0061
[Update 441/1000] Episode reward: 1.932, steps: 512




[PPO] Epoch 4/4 Loss=0.4357 Policy=-0.0011 Value=0.9583 Entropy=0.8467 KL=0.0048 Corr=0.0172
[Update 442/1000] Episode reward: -26.771, steps: 512




[PPO] Epoch 4/4 Loss=0.0553 Policy=0.0018 Value=0.1925 Entropy=0.8543 KL=-0.0008 Corr=-0.0104
[Update 443/1000] Episode reward: -4.450, steps: 512




[PPO] Epoch 4/4 Loss=0.1355 Policy=0.0011 Value=0.3536 Entropy=0.8478 KL=0.0004 Corr=-0.0461
[Update 444/1000] Episode reward: 7.929, steps: 512




[PPO] Epoch 4/4 Loss=0.4457 Policy=0.0042 Value=0.9663 Entropy=0.8317 KL=-0.0001 Corr=-0.0213
[Update 445/1000] Episode reward: -19.900, steps: 512




[PPO] Epoch 4/4 Loss=0.1274 Policy=0.0022 Value=0.3328 Entropy=0.8250 KL=0.0047 Corr=0.0076
[Update 446/1000] Episode reward: -11.564, steps: 512




[PPO] Epoch 4/4 Loss=0.6173 Policy=0.0017 Value=1.3132 Entropy=0.8199 KL=-0.0008 Corr=0.0072
[Update 447/1000] Episode reward: 26.999, steps: 512




[PPO] Epoch 4/4 Loss=0.1848 Policy=-0.0016 Value=0.4571 Entropy=0.8426 KL=0.0019 Corr=-0.0198
[Update 448/1000] Episode reward: 1.927, steps: 512




[PPO] Epoch 4/4 Loss=0.1426 Policy=0.0026 Value=0.3681 Entropy=0.8828 KL=0.0022 Corr=0.0499
[Update 449/1000] Episode reward: 7.744, steps: 512




[PPO] Epoch 4/4 Loss=0.1279 Policy=-0.0009 Value=0.3478 Entropy=0.9027 KL=0.0045 Corr=0.0176
[Update 450/1000] Episode reward: -0.827, steps: 512




[PPO] Epoch 4/4 Loss=0.0628 Policy=-0.0006 Value=0.2180 Entropy=0.9107 KL=0.0049 Corr=-0.0239
[Update 451/1000] Episode reward: -7.340, steps: 512




[PPO] Epoch 4/4 Loss=0.2311 Policy=-0.0071 Value=0.5673 Entropy=0.9080 KL=0.0073 Corr=0.0051
[Update 452/1000] Episode reward: -0.051, steps: 512




[PPO] Epoch 4/4 Loss=0.2340 Policy=-0.0076 Value=0.5709 Entropy=0.8762 KL=-0.0067 Corr=0.1185
[Update 453/1000] Episode reward: 11.764, steps: 512




[PPO] Epoch 4/4 Loss=0.1901 Policy=0.0018 Value=0.4617 Entropy=0.8494 KL=0.0061 Corr=0.0408
[Update 454/1000] Episode reward: 12.590, steps: 512




[PPO] Epoch 4/4 Loss=0.3331 Policy=-0.0011 Value=0.7527 Entropy=0.8443 KL=-0.0016 Corr=0.0205
[Update 455/1000] Episode reward: 1.991, steps: 512




[PPO] Epoch 4/4 Loss=0.2761 Policy=-0.0003 Value=0.6382 Entropy=0.8545 KL=-0.0005 Corr=0.0016
[Update 456/1000] Episode reward: 17.644, steps: 512




[PPO] Epoch 4/4 Loss=0.1236 Policy=-0.0044 Value=0.3413 Entropy=0.8518 KL=0.0057 Corr=0.0713
[Update 457/1000] Episode reward: 2.390, steps: 512




[PPO] Epoch 4/4 Loss=0.2568 Policy=0.0028 Value=0.5929 Entropy=0.8491 KL=0.0017 Corr=0.0796
[Update 458/1000] Episode reward: -9.663, steps: 512




[PPO] Epoch 4/4 Loss=0.4475 Policy=0.0001 Value=0.9802 Entropy=0.8530 KL=-0.0017 Corr=0.0677
[Update 459/1000] Episode reward: -21.294, steps: 512




[PPO] Epoch 4/4 Loss=0.5116 Policy=0.0021 Value=1.1038 Entropy=0.8493 KL=-0.0005 Corr=-0.0563
[Update 460/1000] Episode reward: -1.575, steps: 512




[PPO] Epoch 4/4 Loss=0.3331 Policy=0.0005 Value=0.7492 Entropy=0.8382 KL=0.0034 Corr=-0.0189
[Update 461/1000] Episode reward: -18.243, steps: 512




[PPO] Epoch 4/4 Loss=0.2901 Policy=-0.0006 Value=0.6637 Entropy=0.8227 KL=0.0018 Corr=-0.0202
[Update 462/1000] Episode reward: -5.573, steps: 512




[PPO] Epoch 4/4 Loss=0.9789 Policy=-0.0015 Value=2.0410 Entropy=0.8026 KL=0.0022 Corr=-0.0211
[Update 463/1000] Episode reward: 25.563, steps: 512




[PPO] Epoch 4/4 Loss=0.5209 Policy=-0.0019 Value=1.1243 Entropy=0.7886 KL=0.0047 Corr=0.0304
[Update 464/1000] Episode reward: -10.917, steps: 512




[PPO] Epoch 4/4 Loss=0.3860 Policy=0.0016 Value=0.8472 Entropy=0.7821 KL=0.0029 Corr=-0.0242
[Update 465/1000] Episode reward: 14.806, steps: 512




[PPO] Epoch 4/4 Loss=0.4498 Policy=0.0003 Value=0.9769 Entropy=0.7795 KL=0.0034 Corr=-0.0714
[Update 466/1000] Episode reward: 12.522, steps: 512




[PPO] Epoch 4/4 Loss=0.1206 Policy=0.0058 Value=0.3080 Entropy=0.7829 KL=-0.0006 Corr=0.0304
[Update 467/1000] Episode reward: -8.941, steps: 512




[PPO] Epoch 4/4 Loss=0.4785 Policy=0.0050 Value=1.0256 Entropy=0.7861 KL=-0.0014 Corr=0.0762
[Update 468/1000] Episode reward: 17.410, steps: 512




[PPO] Epoch 4/4 Loss=0.1138 Policy=-0.0012 Value=0.3093 Entropy=0.7922 KL=-0.0029 Corr=0.0001
[Update 469/1000] Episode reward: -11.185, steps: 512




[PPO] Epoch 4/4 Loss=0.3540 Policy=0.0007 Value=0.7871 Entropy=0.8045 KL=0.0031 Corr=0.0769
[Update 470/1000] Episode reward: 10.905, steps: 512




[PPO] Epoch 4/4 Loss=0.3380 Policy=-0.0023 Value=0.7620 Entropy=0.8133 KL=0.0023 Corr=-0.0397
[Update 471/1000] Episode reward: 1.813, steps: 512




[PPO] Epoch 4/4 Loss=0.2747 Policy=-0.0029 Value=0.6374 Entropy=0.8211 KL=0.0008 Corr=-0.0127
[Update 472/1000] Episode reward: -19.849, steps: 512




[PPO] Epoch 4/4 Loss=0.0864 Policy=0.0025 Value=0.2521 Entropy=0.8411 KL=0.0042 Corr=0.0242
[Update 473/1000] Episode reward: -3.752, steps: 512




[PPO] Epoch 4/4 Loss=0.0642 Policy=0.0005 Value=0.2136 Entropy=0.8622 KL=0.0003 Corr=-0.0296
[Update 474/1000] Episode reward: -11.969, steps: 512




[PPO] Epoch 4/4 Loss=0.2182 Policy=-0.0008 Value=0.5257 Entropy=0.8760 KL=0.0009 Corr=-0.0472
[Update 475/1000] Episode reward: -7.791, steps: 512




[PPO] Epoch 4/4 Loss=0.3059 Policy=0.0019 Value=0.6961 Entropy=0.8820 KL=0.0026 Corr=0.0369
[Update 476/1000] Episode reward: 10.800, steps: 512




[PPO] Epoch 4/4 Loss=0.0962 Policy=0.0018 Value=0.2769 Entropy=0.8807 KL=0.0048 Corr=-0.0018
[Update 477/1000] Episode reward: 3.292, steps: 512




[PPO] Epoch 4/4 Loss=0.2762 Policy=0.0013 Value=0.6380 Entropy=0.8808 KL=0.0012 Corr=0.0527
[Update 478/1000] Episode reward: -7.664, steps: 512




[PPO] Epoch 4/4 Loss=0.0434 Policy=-0.0028 Value=0.1806 Entropy=0.8826 KL=0.0065 Corr=0.0686
[Update 479/1000] Episode reward: -3.824, steps: 512




[PPO] Epoch 4/4 Loss=0.0142 Policy=0.0010 Value=0.1167 Entropy=0.9027 KL=0.0027 Corr=0.0182
[Update 480/1000] Episode reward: -8.841, steps: 512




[PPO] Epoch 4/4 Loss=0.2423 Policy=0.0059 Value=0.5650 Entropy=0.9226 KL=0.0039 Corr=0.0142
[Update 481/1000] Episode reward: 6.429, steps: 512




[PPO] Epoch 4/4 Loss=0.0914 Policy=0.0030 Value=0.2695 Entropy=0.9267 KL=0.0001 Corr=0.0306
[Update 482/1000] Episode reward: -9.597, steps: 512




[PPO] Epoch 4/4 Loss=0.1328 Policy=0.0025 Value=0.3536 Entropy=0.9298 KL=-0.0035 Corr=0.0185
[Update 483/1000] Episode reward: 7.760, steps: 512




[PPO] Epoch 4/4 Loss=0.0818 Policy=-0.0023 Value=0.2607 Entropy=0.9250 KL=0.0027 Corr=-0.0756
[Update 484/1000] Episode reward: -9.111, steps: 512




[PPO] Epoch 4/4 Loss=0.1602 Policy=-0.0012 Value=0.4139 Entropy=0.9116 KL=0.0018 Corr=0.0077
[Update 485/1000] Episode reward: -13.284, steps: 512




[PPO] Epoch 4/4 Loss=0.1947 Policy=0.0009 Value=0.4779 Entropy=0.9025 KL=0.0087 Corr=0.0046
[Update 486/1000] Episode reward: -11.659, steps: 512




[PPO] Epoch 4/4 Loss=0.0744 Policy=-0.0071 Value=0.2527 Entropy=0.8955 KL=0.0093 Corr=-0.0101
[Update 487/1000] Episode reward: -3.093, steps: 512




[PPO] Epoch 4/4 Loss=0.4748 Policy=-0.0076 Value=1.0523 Entropy=0.8750 KL=0.0059 Corr=0.0125
[Update 488/1000] Episode reward: 17.966, steps: 512




[PPO] Epoch 4/4 Loss=0.4802 Policy=0.0002 Value=1.0456 Entropy=0.8565 KL=0.0087 Corr=-0.0135
[Update 489/1000] Episode reward: 10.260, steps: 512




[PPO] Epoch 4/4 Loss=0.4968 Policy=0.0042 Value=1.0704 Entropy=0.8522 KL=0.0025 Corr=-0.0766
[Update 490/1000] Episode reward: -29.781, steps: 512




[PPO] Epoch 4/4 Loss=0.3578 Policy=-0.0066 Value=0.8168 Entropy=0.8793 KL=0.0026 Corr=0.0423
[Update 491/1000] Episode reward: -5.790, steps: 512




[PPO] Epoch 4/4 Loss=0.2923 Policy=0.0050 Value=0.6678 Entropy=0.9328 KL=0.0045 Corr=0.0304
[Update 492/1000] Episode reward: 6.246, steps: 512




[PPO] Epoch 4/4 Loss=0.4662 Policy=-0.0002 Value=1.0287 Entropy=0.9597 KL=0.0043 Corr=0.0928
[Update 493/1000] Episode reward: -27.593, steps: 512




[PPO] Epoch 4/4 Loss=0.4395 Policy=0.0032 Value=0.9703 Entropy=0.9775 KL=0.0074 Corr=-0.0554
[Update 494/1000] Episode reward: -23.792, steps: 512




[PPO] Epoch 4/4 Loss=0.3347 Policy=-0.0010 Value=0.7700 Entropy=0.9860 KL=-0.0013 Corr=0.0019
[Update 495/1000] Episode reward: -10.298, steps: 512




[PPO] Epoch 4/4 Loss=0.1142 Policy=0.0004 Value=0.3258 Entropy=0.9837 KL=0.0053 Corr=0.0414
[Update 496/1000] Episode reward: -17.022, steps: 512




[PPO] Epoch 4/4 Loss=0.1800 Policy=-0.0004 Value=0.4594 Entropy=0.9866 KL=0.0029 Corr=0.0001
[Update 497/1000] Episode reward: -25.916, steps: 512




[PPO] Epoch 4/4 Loss=0.1717 Policy=-0.0008 Value=0.4437 Entropy=0.9867 KL=0.0027 Corr=-0.0086
[Update 498/1000] Episode reward: -18.114, steps: 512




[PPO] Epoch 4/4 Loss=0.1490 Policy=-0.0010 Value=0.3992 Entropy=0.9928 KL=0.0082 Corr=0.0122
[Update 499/1000] Episode reward: -12.090, steps: 512




[PPO] Epoch 4/4 Loss=0.1005 Policy=-0.0009 Value=0.3046 Entropy=1.0194 KL=0.0123 Corr=-0.0452
[Update 500/1000] Episode reward: -4.953, steps: 512




[PPO] Epoch 4/4 Loss=0.1195 Policy=-0.0032 Value=0.3493 Entropy=1.0391 KL=0.0150 Corr=-0.0171
[Update 501/1000] Episode reward: 0.362, steps: 512




[PPO] Epoch 4/4 Loss=0.0729 Policy=0.0022 Value=0.2460 Entropy=1.0450 KL=0.0046 Corr=0.0088
[Update 502/1000] Episode reward: -10.883, steps: 512




[PPO] Epoch 4/4 Loss=0.1390 Policy=0.0000 Value=0.3827 Entropy=1.0476 KL=0.0023 Corr=0.0126
[Update 503/1000] Episode reward: -14.135, steps: 512




[PPO] Epoch 4/4 Loss=0.0825 Policy=0.0019 Value=0.2662 Entropy=1.0504 KL=0.0032 Corr=-0.0032
[Update 504/1000] Episode reward: -17.580, steps: 512




[PPO] Epoch 4/4 Loss=0.1126 Policy=0.0028 Value=0.3239 Entropy=1.0436 KL=0.0043 Corr=-0.0087
[Update 505/1000] Episode reward: -24.642, steps: 512




[PPO] Epoch 4/4 Loss=0.1957 Policy=0.0061 Value=0.4825 Entropy=1.0317 KL=0.0013 Corr=0.0285
[Update 506/1000] Episode reward: -1.409, steps: 512




[PPO] Epoch 4/4 Loss=0.1506 Policy=0.0023 Value=0.3988 Entropy=1.0219 KL=0.0024 Corr=0.0435
[Update 507/1000] Episode reward: -2.817, steps: 512




[PPO] Epoch 4/4 Loss=0.2571 Policy=-0.0006 Value=0.6167 Entropy=1.0127 KL=0.0089 Corr=-0.0038
[Update 508/1000] Episode reward: -20.931, steps: 512




[PPO] Epoch 4/4 Loss=0.1622 Policy=-0.0007 Value=0.4261 Entropy=1.0037 KL=0.0058 Corr=0.0195
[Update 509/1000] Episode reward: -7.067, steps: 512




[PPO] Epoch 4/4 Loss=0.3745 Policy=-0.0066 Value=0.8599 Entropy=0.9779 KL=0.0089 Corr=0.0217
[Update 510/1000] Episode reward: 11.324, steps: 512




[PPO] Epoch 4/4 Loss=0.1898 Policy=-0.0023 Value=0.4792 Entropy=0.9510 KL=0.0010 Corr=-0.0532
[Update 511/1000] Episode reward: 5.696, steps: 512




[PPO] Epoch 4/4 Loss=0.2569 Policy=0.0010 Value=0.6051 Entropy=0.9322 KL=0.0014 Corr=-0.0484
[Update 512/1000] Episode reward: -17.431, steps: 512




[PPO] Epoch 4/4 Loss=0.2982 Policy=-0.0003 Value=0.6888 Entropy=0.9182 KL=0.0061 Corr=0.0160
[Update 513/1000] Episode reward: 12.435, steps: 512




[PPO] Epoch 4/4 Loss=0.2137 Policy=-0.0013 Value=0.5208 Entropy=0.9073 KL=0.0021 Corr=-0.0001
[Update 514/1000] Episode reward: -18.619, steps: 512




[PPO] Epoch 4/4 Loss=0.0850 Policy=-0.0007 Value=0.2602 Entropy=0.8880 KL=-0.0013 Corr=-0.0105
[Update 515/1000] Episode reward: -8.054, steps: 512




[PPO] Epoch 4/4 Loss=0.7057 Policy=-0.0014 Value=1.5007 Entropy=0.8645 KL=0.0030 Corr=-0.0322
[Update 516/1000] Episode reward: 21.627, steps: 512




[PPO] Epoch 4/4 Loss=0.5152 Policy=-0.0015 Value=1.1177 Entropy=0.8431 KL=0.0030 Corr=-0.0175
[Update 517/1000] Episode reward: 13.829, steps: 512




[PPO] Epoch 4/4 Loss=0.2586 Policy=0.0011 Value=0.5972 Entropy=0.8237 KL=0.0032 Corr=-0.0173
[Update 518/1000] Episode reward: 5.208, steps: 512




[PPO] Epoch 4/4 Loss=0.3391 Policy=0.0000 Value=0.7592 Entropy=0.8098 KL=0.0047 Corr=0.0858
[Update 519/1000] Episode reward: 12.202, steps: 512




[PPO] Epoch 4/4 Loss=0.2302 Policy=-0.0017 Value=0.5441 Entropy=0.8039 KL=0.0055 Corr=0.0275
[Update 520/1000] Episode reward: 6.546, steps: 512




[PPO] Epoch 4/4 Loss=0.1508 Policy=-0.0018 Value=0.3850 Entropy=0.7984 KL=-0.0025 Corr=-0.0306
[Update 521/1000] Episode reward: -2.251, steps: 512




[PPO] Epoch 4/4 Loss=0.2200 Policy=-0.0021 Value=0.5230 Entropy=0.7867 KL=-0.0035 Corr=0.0857
[Update 522/1000] Episode reward: 12.067, steps: 512




[PPO] Epoch 4/4 Loss=0.1747 Policy=0.0006 Value=0.4254 Entropy=0.7717 KL=0.0020 Corr=0.0204
[Update 523/1000] Episode reward: 7.624, steps: 512




[PPO] Epoch 4/4 Loss=0.1737 Policy=-0.0014 Value=0.4263 Entropy=0.7609 KL=0.0009 Corr=0.0023
[Update 524/1000] Episode reward: -13.713, steps: 512




[PPO] Epoch 4/4 Loss=0.0787 Policy=0.0015 Value=0.2306 Entropy=0.7621 KL=0.0025 Corr=0.0450
[Update 525/1000] Episode reward: 4.272, steps: 512




[PPO] Epoch 4/4 Loss=0.0903 Policy=0.0015 Value=0.2536 Entropy=0.7597 KL=-0.0032 Corr=0.0172
[Update 526/1000] Episode reward: 4.378, steps: 512




[PPO] Epoch 4/4 Loss=0.5116 Policy=-0.0004 Value=1.0982 Entropy=0.7435 KL=0.0040 Corr=-0.0684
[Update 527/1000] Episode reward: 24.317, steps: 512




[PPO] Epoch 4/4 Loss=0.3725 Policy=0.0049 Value=0.8081 Entropy=0.7290 KL=-0.0035 Corr=-0.0356
[Update 528/1000] Episode reward: 17.448, steps: 512




[PPO] Epoch 4/4 Loss=0.0896 Policy=-0.0002 Value=0.2504 Entropy=0.7088 KL=0.0032 Corr=-0.0485
[Update 529/1000] Episode reward: 6.894, steps: 512




[PPO] Epoch 4/4 Loss=0.2431 Policy=-0.0059 Value=0.5644 Entropy=0.6644 KL=-0.0009 Corr=-0.0254
[Update 530/1000] Episode reward: 8.784, steps: 512




[PPO] Epoch 4/4 Loss=0.2299 Policy=0.0011 Value=0.5175 Entropy=0.6001 KL=0.0084 Corr=0.0100
[Update 531/1000] Episode reward: 10.161, steps: 512




[PPO] Epoch 4/4 Loss=0.4491 Policy=-0.0077 Value=0.9793 Entropy=0.6565 KL=0.0063 Corr=-0.0310
[Update 532/1000] Episode reward: 2.493, steps: 512




[PPO] Epoch 4/4 Loss=0.1915 Policy=-0.0054 Value=0.4664 Entropy=0.7265 KL=0.0128 Corr=0.0199
[Update 533/1000] Episode reward: -9.241, steps: 512




[PPO] Epoch 4/4 Loss=0.0964 Policy=-0.0054 Value=0.2781 Entropy=0.7457 KL=-0.0004 Corr=0.0292
[Update 534/1000] Episode reward: 5.108, steps: 512




[PPO] Epoch 4/4 Loss=0.1271 Policy=0.0010 Value=0.3257 Entropy=0.7339 KL=0.0013 Corr=0.0244
[Update 535/1000] Episode reward: 2.839, steps: 512




[PPO] Epoch 4/4 Loss=0.3795 Policy=0.0035 Value=0.8251 Entropy=0.7307 KL=0.0017 Corr=-0.0671
[Update 536/1000] Episode reward: 18.993, steps: 512




[PPO] Epoch 4/4 Loss=0.2417 Policy=0.0002 Value=0.5550 Entropy=0.7208 KL=-0.0008 Corr=-0.1236
[Update 537/1000] Episode reward: -4.929, steps: 512




[PPO] Epoch 4/4 Loss=0.1559 Policy=0.0023 Value=0.3776 Entropy=0.7048 KL=0.0022 Corr=-0.0255
[Update 538/1000] Episode reward: 10.419, steps: 512




[PPO] Epoch 4/4 Loss=0.3268 Policy=0.0036 Value=0.7164 Entropy=0.7009 KL=0.0039 Corr=-0.0226
[Update 539/1000] Episode reward: -15.351, steps: 512




[PPO] Epoch 4/4 Loss=0.4074 Policy=-0.0026 Value=0.8900 Entropy=0.6999 KL=-0.0015 Corr=-0.0236
[Update 540/1000] Episode reward: 16.279, steps: 512




[PPO] Epoch 4/4 Loss=0.1056 Policy=-0.0016 Value=0.2844 Entropy=0.7012 KL=0.0023 Corr=-0.0034
[Update 541/1000] Episode reward: -0.458, steps: 512




[PPO] Epoch 4/4 Loss=0.3336 Policy=-0.0005 Value=0.7392 Entropy=0.7089 KL=-0.0011 Corr=-0.0313
[Update 542/1000] Episode reward: -8.556, steps: 512




[PPO] Epoch 4/4 Loss=0.2823 Policy=-0.0031 Value=0.6452 Entropy=0.7436 KL=0.0043 Corr=0.0661
[Update 543/1000] Episode reward: -0.269, steps: 512




[PPO] Epoch 4/4 Loss=0.1660 Policy=-0.0062 Value=0.4235 Entropy=0.7922 KL=0.0040 Corr=0.0577
[Update 544/1000] Episode reward: -12.163, steps: 512




[PPO] Epoch 4/4 Loss=0.1889 Policy=0.0031 Value=0.4537 Entropy=0.8206 KL=0.0054 Corr=0.0281
[Update 545/1000] Episode reward: 8.335, steps: 512




[PPO] Epoch 4/4 Loss=0.0367 Policy=-0.0044 Value=0.1641 Entropy=0.8197 KL=0.0027 Corr=0.0064
[Update 546/1000] Episode reward: 1.310, steps: 512




[PPO] Epoch 4/4 Loss=0.1169 Policy=0.0023 Value=0.3075 Entropy=0.7827 KL=0.0088 Corr=0.0057
[Update 547/1000] Episode reward: -13.854, steps: 512




[PPO] Epoch 4/4 Loss=0.2684 Policy=0.0066 Value=0.6003 Entropy=0.7655 KL=0.0059 Corr=-0.0290
[Update 548/1000] Episode reward: -14.813, steps: 512




[PPO] Epoch 4/4 Loss=0.2124 Policy=-0.0009 Value=0.5038 Entropy=0.7713 KL=-0.0017 Corr=-0.0324
[Update 549/1000] Episode reward: 12.800, steps: 512




[PPO] Epoch 4/4 Loss=0.4426 Policy=-0.0032 Value=0.9700 Entropy=0.7826 KL=0.0029 Corr=-0.0942
[Update 550/1000] Episode reward: 15.410, steps: 512




[PPO] Epoch 4/4 Loss=0.2435 Policy=-0.0010 Value=0.5694 Entropy=0.8027 KL=0.0050 Corr=0.0774
[Update 551/1000] Episode reward: -4.557, steps: 512




[PPO] Epoch 4/4 Loss=0.4265 Policy=-0.0019 Value=0.9396 Entropy=0.8268 KL=0.0030 Corr=0.0355
[Update 552/1000] Episode reward: 14.703, steps: 512




[PPO] Epoch 4/4 Loss=0.1709 Policy=-0.0011 Value=0.4281 Entropy=0.8401 KL=-0.0046 Corr=0.0608
[Update 553/1000] Episode reward: -8.110, steps: 512




[PPO] Epoch 4/4 Loss=0.2446 Policy=0.0015 Value=0.5715 Entropy=0.8508 KL=0.0043 Corr=0.0249
[Update 554/1000] Episode reward: 10.386, steps: 512




[PPO] Epoch 4/4 Loss=0.2991 Policy=0.0005 Value=0.6829 Entropy=0.8569 KL=0.0060 Corr=0.0234
[Update 555/1000] Episode reward: 2.725, steps: 512




[PPO] Epoch 4/4 Loss=0.1597 Policy=-0.0052 Value=0.4144 Entropy=0.8463 KL=-0.0013 Corr=0.0653
[Update 556/1000] Episode reward: 13.310, steps: 512




[PPO] Epoch 4/4 Loss=0.3534 Policy=-0.0064 Value=0.8024 Entropy=0.8287 KL=-0.0007 Corr=0.0689
[Update 557/1000] Episode reward: -20.573, steps: 512




[PPO] Epoch 4/4 Loss=0.2647 Policy=-0.0013 Value=0.6147 Entropy=0.8283 KL=-0.0030 Corr=0.0393
[Update 558/1000] Episode reward: 2.445, steps: 512




[PPO] Epoch 4/4 Loss=0.3948 Policy=-0.0006 Value=0.8760 Entropy=0.8523 KL=0.0041 Corr=-0.0912
[Update 559/1000] Episode reward: 20.321, steps: 512




[PPO] Epoch 4/4 Loss=0.2475 Policy=0.0018 Value=0.5771 Entropy=0.8563 KL=0.0011 Corr=0.0543
[Update 560/1000] Episode reward: 5.469, steps: 512




[PPO] Epoch 4/4 Loss=0.1207 Policy=-0.0018 Value=0.3277 Entropy=0.8266 KL=-0.0008 Corr=-0.0423
[Update 561/1000] Episode reward: 11.909, steps: 512




[PPO] Epoch 4/4 Loss=0.2069 Policy=0.0002 Value=0.4936 Entropy=0.8017 KL=0.0040 Corr=0.0418
[Update 562/1000] Episode reward: -9.847, steps: 512




[PPO] Epoch 4/4 Loss=0.0568 Policy=-0.0026 Value=0.1986 Entropy=0.7979 KL=-0.0014 Corr=0.0678
[Update 563/1000] Episode reward: 0.718, steps: 512




[PPO] Epoch 4/4 Loss=0.1469 Policy=-0.0053 Value=0.3850 Entropy=0.8058 KL=0.0014 Corr=0.0154
[Update 564/1000] Episode reward: -1.492, steps: 512




[PPO] Epoch 4/4 Loss=0.1083 Policy=0.0021 Value=0.2939 Entropy=0.8147 KL=0.0002 Corr=0.0489
[Update 565/1000] Episode reward: -3.239, steps: 512




[PPO] Epoch 4/4 Loss=0.3717 Policy=-0.0067 Value=0.8408 Entropy=0.8409 KL=0.0046 Corr=0.0031
[Update 566/1000] Episode reward: 18.174, steps: 512




[PPO] Epoch 4/4 Loss=0.2135 Policy=-0.0015 Value=0.5154 Entropy=0.8536 KL=0.0014 Corr=0.0304
[Update 567/1000] Episode reward: 8.215, steps: 512




[PPO] Epoch 4/4 Loss=0.2637 Policy=0.0021 Value=0.6088 Entropy=0.8547 KL=0.0032 Corr=-0.0664
[Update 568/1000] Episode reward: -0.623, steps: 512




[PPO] Epoch 4/4 Loss=0.1497 Policy=0.0005 Value=0.3829 Entropy=0.8437 KL=0.0052 Corr=0.0304
[Update 569/1000] Episode reward: 14.749, steps: 512




[PPO] Epoch 4/4 Loss=0.2001 Policy=0.0055 Value=0.4726 Entropy=0.8344 KL=0.0020 Corr=0.0685
[Update 570/1000] Episode reward: -7.392, steps: 512




[PPO] Epoch 4/4 Loss=0.2200 Policy=-0.0026 Value=0.5288 Entropy=0.8369 KL=-0.0010 Corr=0.0638
[Update 571/1000] Episode reward: -5.123, steps: 512




[PPO] Epoch 4/4 Loss=0.4304 Policy=-0.0026 Value=0.9487 Entropy=0.8278 KL=0.0035 Corr=-0.0288
[Update 572/1000] Episode reward: 1.955, steps: 512




[PPO] Epoch 4/4 Loss=0.3207 Policy=0.0033 Value=0.7151 Entropy=0.8014 KL=0.0050 Corr=0.0298
[Update 573/1000] Episode reward: 11.227, steps: 512




[PPO] Epoch 4/4 Loss=0.0974 Policy=-0.0039 Value=0.2810 Entropy=0.7856 KL=-0.0020 Corr=-0.0268
[Update 574/1000] Episode reward: -3.366, steps: 512




[PPO] Epoch 4/4 Loss=1.0174 Policy=-0.0001 Value=2.1135 Entropy=0.7850 KL=0.0027 Corr=-0.0588
[Update 575/1000] Episode reward: 42.977, steps: 512




[PPO] Epoch 4/4 Loss=0.2743 Policy=0.0008 Value=0.6253 Entropy=0.7828 KL=0.0034 Corr=0.0111
[Update 576/1000] Episode reward: 13.403, steps: 512




[PPO] Epoch 4/4 Loss=0.1857 Policy=-0.0021 Value=0.4538 Entropy=0.7821 KL=0.0022 Corr=0.0598
[Update 577/1000] Episode reward: 15.661, steps: 512




[PPO] Epoch 4/4 Loss=0.2641 Policy=-0.0047 Value=0.6178 Entropy=0.8018 KL=0.0075 Corr=-0.0191
[Update 578/1000] Episode reward: 5.621, steps: 512




[PPO] Epoch 4/4 Loss=0.6612 Policy=0.0167 Value=1.3728 Entropy=0.8388 KL=0.0100 Corr=-0.0283
[Update 579/1000] Episode reward: 32.557, steps: 512




[PPO] Epoch 4/4 Loss=0.1107 Policy=0.0003 Value=0.3067 Entropy=0.8570 KL=-0.0035 Corr=-0.0242
[Update 580/1000] Episode reward: -8.274, steps: 512




[PPO] Epoch 4/4 Loss=0.1390 Policy=-0.0043 Value=0.3725 Entropy=0.8608 KL=0.0029 Corr=-0.0240
[Update 581/1000] Episode reward: 5.778, steps: 512




[PPO] Epoch 4/4 Loss=0.3047 Policy=-0.0214 Value=0.7355 Entropy=0.8327 KL=0.0053 Corr=0.0016
[Update 582/1000] Episode reward: 17.306, steps: 512




[PPO] Epoch 4/4 Loss=0.2297 Policy=0.0052 Value=0.5289 Entropy=0.7977 KL=0.0037 Corr=0.0793
[Update 583/1000] Episode reward: -15.427, steps: 512




[PPO] Epoch 4/4 Loss=0.1231 Policy=-0.0069 Value=0.3365 Entropy=0.7658 KL=0.0021 Corr=0.0352
[Update 584/1000] Episode reward: 13.341, steps: 512




[PPO] Epoch 4/4 Loss=0.2347 Policy=0.0039 Value=0.5359 Entropy=0.7428 KL=0.0007 Corr=0.0181
[Update 585/1000] Episode reward: -0.061, steps: 512




[PPO] Epoch 4/4 Loss=0.5207 Policy=-0.0030 Value=1.1226 Entropy=0.7532 KL=0.0043 Corr=0.0409
[Update 586/1000] Episode reward: 14.381, steps: 512




[PPO] Epoch 4/4 Loss=0.4693 Policy=0.0002 Value=1.0149 Entropy=0.7663 KL=-0.0009 Corr=0.0270
[Update 587/1000] Episode reward: -21.795, steps: 512




[PPO] Epoch 4/4 Loss=0.1740 Policy=-0.0006 Value=0.4276 Entropy=0.7839 KL=-0.0048 Corr=-0.0010
[Update 588/1000] Episode reward: -9.162, steps: 512




[PPO] Epoch 4/4 Loss=0.1695 Policy=-0.0047 Value=0.4288 Entropy=0.8027 KL=-0.0004 Corr=0.0128
[Update 589/1000] Episode reward: 14.895, steps: 512




[PPO] Epoch 4/4 Loss=0.3739 Policy=0.0018 Value=0.8253 Entropy=0.8112 KL=0.0067 Corr=0.0671
[Update 590/1000] Episode reward: 10.683, steps: 512




[PPO] Epoch 4/4 Loss=0.2400 Policy=-0.0014 Value=0.5645 Entropy=0.8172 KL=0.0046 Corr=0.0486
[Update 591/1000] Episode reward: -0.590, steps: 512




[PPO] Epoch 4/4 Loss=0.5000 Policy=0.0003 Value=1.0813 Entropy=0.8184 KL=0.0021 Corr=0.0677
[Update 592/1000] Episode reward: 11.354, steps: 512




[PPO] Epoch 4/4 Loss=0.0794 Policy=-0.0044 Value=0.2457 Entropy=0.7820 KL=0.0083 Corr=-0.0301
[Update 593/1000] Episode reward: 4.927, steps: 512




[PPO] Epoch 4/4 Loss=0.3357 Policy=0.0019 Value=0.7386 Entropy=0.7107 KL=0.0065 Corr=-0.0102
[Update 594/1000] Episode reward: -3.905, steps: 512




[PPO] Epoch 4/4 Loss=0.2301 Policy=0.0032 Value=0.5225 Entropy=0.6860 KL=0.0057 Corr=0.0686
[Update 595/1000] Episode reward: -10.556, steps: 512




[PPO] Epoch 4/4 Loss=0.2380 Policy=-0.0016 Value=0.5478 Entropy=0.6855 KL=-0.0001 Corr=-0.0479
[Update 596/1000] Episode reward: -1.972, steps: 512




[PPO] Epoch 4/4 Loss=0.6411 Policy=0.0009 Value=1.3495 Entropy=0.6901 KL=-0.0013 Corr=-0.0030
[Update 597/1000] Episode reward: 23.436, steps: 512




[PPO] Epoch 4/4 Loss=0.4949 Policy=-0.0028 Value=1.0649 Entropy=0.6939 KL=0.0031 Corr=-0.1055
[Update 598/1000] Episode reward: -17.839, steps: 512




[PPO] Epoch 4/4 Loss=0.3887 Policy=-0.0014 Value=0.8526 Entropy=0.7240 KL=0.0066 Corr=0.0480
[Update 599/1000] Episode reward: -0.832, steps: 512




[PPO] Epoch 4/4 Loss=0.4596 Policy=0.0052 Value=0.9859 Entropy=0.7718 KL=0.0042 Corr=-0.0332
[Update 600/1000] Episode reward: 18.035, steps: 512




[PPO] Epoch 4/4 Loss=0.3098 Policy=0.0039 Value=0.6917 Entropy=0.7996 KL=0.0007 Corr=0.0285
[Update 601/1000] Episode reward: 15.220, steps: 512




[PPO] Epoch 4/4 Loss=0.3380 Policy=-0.0029 Value=0.7635 Entropy=0.8174 KL=0.0042 Corr=-0.0350
[Update 602/1000] Episode reward: -5.883, steps: 512




[PPO] Epoch 4/4 Loss=0.1234 Policy=-0.0020 Value=0.3333 Entropy=0.8251 KL=0.0010 Corr=0.0372
[Update 603/1000] Episode reward: 6.567, steps: 512




[PPO] Epoch 4/4 Loss=0.3505 Policy=-0.0007 Value=0.7848 Entropy=0.8237 KL=0.0058 Corr=-0.0430
[Update 604/1000] Episode reward: -8.936, steps: 512




[PPO] Epoch 4/4 Loss=0.6190 Policy=0.0039 Value=1.3127 Entropy=0.8233 KL=0.0010 Corr=0.0382
[Update 605/1000] Episode reward: 21.371, steps: 512




[PPO] Epoch 4/4 Loss=0.0715 Policy=-0.0099 Value=0.2416 Entropy=0.7885 KL=0.0023 Corr=0.0194
[Update 606/1000] Episode reward: 3.137, steps: 512




[PPO] Epoch 4/4 Loss=0.2535 Policy=-0.0105 Value=0.5989 Entropy=0.7095 KL=0.0052 Corr=-0.0011
[Update 607/1000] Episode reward: 17.002, steps: 512




[PPO] Epoch 4/4 Loss=0.4592 Policy=-0.0006 Value=0.9837 Entropy=0.6419 KL=0.0061 Corr=0.0168
[Update 608/1000] Episode reward: 28.173, steps: 512




[PPO] Epoch 4/4 Loss=0.3141 Policy=-0.0027 Value=0.6934 Entropy=0.5964 KL=0.0065 Corr=-0.0490
[Update 609/1000] Episode reward: 12.429, steps: 512




[PPO] Epoch 4/4 Loss=0.4165 Policy=-0.0000 Value=0.8907 Entropy=0.5777 KL=0.0034 Corr=-0.0502
[Update 610/1000] Episode reward: 3.597, steps: 512




[PPO] Epoch 4/4 Loss=0.4153 Policy=-0.0014 Value=0.8924 Entropy=0.5899 KL=0.0023 Corr=-0.0558
[Update 611/1000] Episode reward: -5.660, steps: 512




[PPO] Epoch 4/4 Loss=0.8733 Policy=0.0023 Value=1.8029 Entropy=0.6073 KL=-0.0003 Corr=0.0099
[Update 612/1000] Episode reward: 35.021, steps: 512




[PPO] Epoch 4/4 Loss=0.6896 Policy=0.0042 Value=1.4321 Entropy=0.6132 KL=0.0008 Corr=-0.0082
[Update 613/1000] Episode reward: 29.742, steps: 512




[PPO] Epoch 4/4 Loss=0.2447 Policy=0.0018 Value=0.5487 Entropy=0.6301 KL=-0.0042 Corr=-0.0194
[Update 614/1000] Episode reward: 8.867, steps: 512




[PPO] Epoch 4/4 Loss=0.3906 Policy=-0.0030 Value=0.8529 Entropy=0.6573 KL=0.0003 Corr=0.0546
[Update 615/1000] Episode reward: 7.311, steps: 512




[PPO] Epoch 4/4 Loss=0.1723 Policy=-0.0029 Value=0.4165 Entropy=0.6602 KL=0.0031 Corr=0.0680
[Update 616/1000] Episode reward: -11.200, steps: 512




[PPO] Epoch 4/4 Loss=0.4823 Policy=0.0004 Value=1.0310 Entropy=0.6728 KL=0.0046 Corr=-0.0956
[Update 617/1000] Episode reward: -22.857, steps: 512




[PPO] Epoch 4/4 Loss=0.2566 Policy=0.0012 Value=0.5786 Entropy=0.6771 KL=0.0025 Corr=-0.0056
[Update 618/1000] Episode reward: -6.525, steps: 512




[PPO] Epoch 4/4 Loss=0.2052 Policy=-0.0011 Value=0.4798 Entropy=0.6716 KL=0.0021 Corr=-0.0288
[Update 619/1000] Episode reward: 3.684, steps: 512




[PPO] Epoch 4/4 Loss=0.4952 Policy=0.0010 Value=1.0540 Entropy=0.6567 KL=0.0049 Corr=0.0126
[Update 620/1000] Episode reward: 15.158, steps: 512




[PPO] Epoch 4/4 Loss=0.1518 Policy=-0.0006 Value=0.3696 Entropy=0.6460 KL=0.0020 Corr=0.0113
[Update 621/1000] Episode reward: 0.337, steps: 512




[PPO] Epoch 4/4 Loss=0.4167 Policy=-0.0010 Value=0.8990 Entropy=0.6359 KL=-0.0021 Corr=0.0284
[Update 622/1000] Episode reward: 15.172, steps: 512




[PPO] Epoch 4/4 Loss=0.2541 Policy=-0.0020 Value=0.5755 Entropy=0.6326 KL=0.0008 Corr=0.0146
[Update 623/1000] Episode reward: 16.359, steps: 512




[PPO] Epoch 4/4 Loss=0.2256 Policy=-0.0015 Value=0.5185 Entropy=0.6423 KL=0.0014 Corr=-0.0674
[Update 624/1000] Episode reward: -5.377, steps: 512




[PPO] Epoch 4/4 Loss=0.2009 Policy=-0.0011 Value=0.4703 Entropy=0.6626 KL=-0.0005 Corr=-0.0090
[Update 625/1000] Episode reward: -4.340, steps: 512




[PPO] Epoch 4/4 Loss=0.2783 Policy=-0.0005 Value=0.6260 Entropy=0.6846 KL=0.0024 Corr=-0.0004
[Update 626/1000] Episode reward: 10.271, steps: 512




[PPO] Epoch 4/4 Loss=0.4567 Policy=0.0046 Value=0.9740 Entropy=0.6966 KL=0.0006 Corr=-0.0247
[Update 627/1000] Episode reward: 28.411, steps: 512




[PPO] Epoch 4/4 Loss=0.2031 Policy=-0.0006 Value=0.4776 Entropy=0.7011 KL=0.0062 Corr=-0.0382
[Update 628/1000] Episode reward: 12.468, steps: 512




[PPO] Epoch 4/4 Loss=0.1405 Policy=-0.0019 Value=0.3555 Entropy=0.7055 KL=-0.0012 Corr=0.0212
[Update 629/1000] Episode reward: -3.016, steps: 512




[PPO] Epoch 4/4 Loss=0.5394 Policy=-0.0053 Value=1.1595 Entropy=0.7007 KL=0.0006 Corr=-0.0206
[Update 630/1000] Episode reward: 22.084, steps: 512




[PPO] Epoch 4/4 Loss=0.1850 Policy=-0.0026 Value=0.4438 Entropy=0.6862 KL=0.0004 Corr=0.0506
[Update 631/1000] Episode reward: 18.801, steps: 512




[PPO] Epoch 4/4 Loss=0.3272 Policy=-0.0022 Value=0.7262 Entropy=0.6733 KL=-0.0000 Corr=-0.0114
[Update 632/1000] Episode reward: -8.373, steps: 512




[PPO] Epoch 4/4 Loss=0.1248 Policy=0.0006 Value=0.3156 Entropy=0.6727 KL=0.0034 Corr=-0.0221
[Update 633/1000] Episode reward: -4.640, steps: 512




[PPO] Epoch 4/4 Loss=0.1428 Policy=0.0017 Value=0.3501 Entropy=0.6787 KL=0.0014 Corr=0.0283
[Update 634/1000] Episode reward: -2.380, steps: 512




[PPO] Epoch 4/4 Loss=0.5554 Policy=0.0013 Value=1.1761 Entropy=0.6797 KL=0.0007 Corr=-0.0119
[Update 635/1000] Episode reward: 25.912, steps: 512




[PPO] Epoch 4/4 Loss=0.3412 Policy=0.0008 Value=0.7484 Entropy=0.6763 KL=-0.0001 Corr=0.0193
[Update 636/1000] Episode reward: 19.704, steps: 512




[PPO] Epoch 4/4 Loss=0.5363 Policy=-0.0009 Value=1.1420 Entropy=0.6747 KL=0.0005 Corr=-0.0509
[Update 637/1000] Episode reward: 27.319, steps: 512




[PPO] Epoch 4/4 Loss=0.5802 Policy=-0.0034 Value=1.2348 Entropy=0.6750 KL=-0.0001 Corr=-0.0734
[Update 638/1000] Episode reward: -25.720, steps: 512




[PPO] Epoch 4/4 Loss=0.1333 Policy=-0.0019 Value=0.3382 Entropy=0.6787 KL=0.0038 Corr=0.0148
[Update 639/1000] Episode reward: -2.201, steps: 512




[PPO] Epoch 4/4 Loss=0.1462 Policy=0.0020 Value=0.3583 Entropy=0.6979 KL=0.0020 Corr=0.0796
[Update 640/1000] Episode reward: 8.048, steps: 512




[PPO] Epoch 4/4 Loss=0.2060 Policy=-0.0016 Value=0.4887 Entropy=0.7361 KL=-0.0008 Corr=0.0120
[Update 641/1000] Episode reward: -13.562, steps: 512




[PPO] Epoch 4/4 Loss=0.6327 Policy=0.0003 Value=1.3411 Entropy=0.7629 KL=0.0013 Corr=-0.0265
[Update 642/1000] Episode reward: -9.439, steps: 512




[PPO] Epoch 4/4 Loss=0.1156 Policy=0.0032 Value=0.3034 Entropy=0.7867 KL=0.0038 Corr=0.0349
[Update 643/1000] Episode reward: 1.326, steps: 512




[PPO] Epoch 4/4 Loss=0.0500 Policy=0.0043 Value=0.1714 Entropy=0.7993 KL=0.0006 Corr=-0.0134
[Update 644/1000] Episode reward: -1.085, steps: 512




[PPO] Epoch 4/4 Loss=0.1051 Policy=-0.0009 Value=0.2923 Entropy=0.8013 KL=0.0063 Corr=0.0453
[Update 645/1000] Episode reward: -11.414, steps: 512




[PPO] Epoch 4/4 Loss=0.0297 Policy=-0.0015 Value=0.1418 Entropy=0.7940 KL=0.0055 Corr=-0.0134
[Update 646/1000] Episode reward: 0.773, steps: 512




[PPO] Epoch 4/4 Loss=0.0098 Policy=0.0026 Value=0.0917 Entropy=0.7725 KL=0.0007 Corr=0.0076
[Update 647/1000] Episode reward: -1.377, steps: 512




[PPO] Epoch 4/4 Loss=0.2086 Policy=0.0010 Value=0.4927 Entropy=0.7749 KL=0.0001 Corr=0.0289
[Update 648/1000] Episode reward: 10.868, steps: 512




[PPO] Epoch 4/4 Loss=0.1491 Policy=-0.0037 Value=0.3812 Entropy=0.7568 KL=-0.0007 Corr=-0.0037
[Update 649/1000] Episode reward: 6.295, steps: 512




[PPO] Epoch 4/4 Loss=0.2082 Policy=0.0045 Value=0.4803 Entropy=0.7288 KL=0.0011 Corr=-0.0200
[Update 650/1000] Episode reward: 12.351, steps: 512




[PPO] Epoch 4/4 Loss=0.2355 Policy=-0.0033 Value=0.5480 Entropy=0.7051 KL=0.0052 Corr=-0.0514
[Update 651/1000] Episode reward: -2.144, steps: 512




[PPO] Epoch 4/4 Loss=0.2078 Policy=-0.0008 Value=0.4863 Entropy=0.6929 KL=0.0011 Corr=0.0090
[Update 652/1000] Episode reward: 0.476, steps: 512




[PPO] Epoch 4/4 Loss=0.2632 Policy=0.0023 Value=0.5934 Entropy=0.7173 KL=0.0011 Corr=-0.0322
[Update 653/1000] Episode reward: 10.897, steps: 512




[PPO] Epoch 4/4 Loss=0.5848 Policy=-0.0001 Value=1.2427 Entropy=0.7293 KL=0.0004 Corr=-0.0296
[Update 654/1000] Episode reward: 20.545, steps: 512




[PPO] Epoch 4/4 Loss=0.1974 Policy=-0.0014 Value=0.4722 Entropy=0.7464 KL=0.0031 Corr=0.0113
[Update 655/1000] Episode reward: 2.221, steps: 512




[PPO] Epoch 4/4 Loss=0.2211 Policy=0.0043 Value=0.5108 Entropy=0.7739 KL=0.0010 Corr=0.0246
[Update 656/1000] Episode reward: 4.980, steps: 512




[PPO] Epoch 4/4 Loss=0.0959 Policy=0.0055 Value=0.2602 Entropy=0.7921 KL=0.0020 Corr=0.0387
[Update 657/1000] Episode reward: -9.141, steps: 512




[PPO] Epoch 4/4 Loss=0.1992 Policy=0.0037 Value=0.4714 Entropy=0.8024 KL=-0.0001 Corr=0.0208
[Update 658/1000] Episode reward: 15.451, steps: 512




[PPO] Epoch 4/4 Loss=0.2078 Policy=-0.0070 Value=0.5086 Entropy=0.7897 KL=0.0032 Corr=-0.0236
[Update 659/1000] Episode reward: 1.918, steps: 512




[PPO] Epoch 4/4 Loss=0.1783 Policy=-0.0138 Value=0.4593 Entropy=0.7522 KL=0.0090 Corr=0.0197
[Update 660/1000] Episode reward: 12.220, steps: 512




[PPO] Epoch 4/4 Loss=0.3545 Policy=-0.0038 Value=0.7875 Entropy=0.7092 KL=0.0050 Corr=-0.0097
[Update 661/1000] Episode reward: 12.675, steps: 512




[PPO] Epoch 4/4 Loss=0.1536 Policy=0.0024 Value=0.3700 Entropy=0.6768 KL=-0.0005 Corr=-0.0628
[Update 662/1000] Episode reward: -1.966, steps: 512




[PPO] Epoch 4/4 Loss=0.1169 Policy=0.0005 Value=0.2994 Entropy=0.6656 KL=0.0003 Corr=0.0025
[Update 663/1000] Episode reward: -6.778, steps: 512




[PPO] Epoch 4/4 Loss=0.8259 Policy=-0.0002 Value=1.7196 Entropy=0.6740 KL=0.0002 Corr=0.0003
[Update 664/1000] Episode reward: 26.142, steps: 512




[PPO] Epoch 4/4 Loss=0.7401 Policy=-0.0006 Value=1.5492 Entropy=0.6787 KL=0.0053 Corr=0.0644
[Update 665/1000] Episode reward: 16.847, steps: 512




[PPO] Epoch 4/4 Loss=0.2330 Policy=-0.0006 Value=0.5357 Entropy=0.6833 KL=-0.0011 Corr=0.0683
[Update 666/1000] Episode reward: 7.037, steps: 512




[PPO] Epoch 4/4 Loss=0.4105 Policy=-0.0009 Value=0.8901 Entropy=0.6734 KL=0.0032 Corr=0.0579
[Update 667/1000] Episode reward: 20.948, steps: 512




[PPO] Epoch 4/4 Loss=0.2341 Policy=-0.0007 Value=0.5361 Entropy=0.6663 KL=-0.0023 Corr=0.0289
[Update 668/1000] Episode reward: 16.325, steps: 512




[PPO] Epoch 4/4 Loss=0.3770 Policy=0.0011 Value=0.8180 Entropy=0.6618 KL=-0.0005 Corr=0.0337
[Update 669/1000] Episode reward: 8.594, steps: 512




[PPO] Epoch 4/4 Loss=0.4497 Policy=0.0031 Value=0.9596 Entropy=0.6652 KL=0.0053 Corr=0.0909
[Update 670/1000] Episode reward: 11.371, steps: 512




[PPO] Epoch 4/4 Loss=0.3807 Policy=0.0008 Value=0.8276 Entropy=0.6781 KL=0.0017 Corr=0.0143
[Update 671/1000] Episode reward: 16.068, steps: 512




[PPO] Epoch 4/4 Loss=0.1859 Policy=-0.0001 Value=0.4413 Entropy=0.6922 KL=-0.0014 Corr=-0.0159
[Update 672/1000] Episode reward: -6.894, steps: 512




[PPO] Epoch 4/4 Loss=0.2522 Policy=0.0007 Value=0.5727 Entropy=0.6980 KL=-0.0011 Corr=0.0325
[Update 673/1000] Episode reward: 17.934, steps: 512




[PPO] Epoch 4/4 Loss=0.0807 Policy=-0.0061 Value=0.2406 Entropy=0.6685 KL=0.0032 Corr=0.0286
[Update 674/1000] Episode reward: 9.761, steps: 512




[PPO] Epoch 4/4 Loss=0.3028 Policy=0.0048 Value=0.6585 Entropy=0.6265 KL=-0.0003 Corr=-0.0354
[Update 675/1000] Episode reward: 0.134, steps: 512




[PPO] Epoch 4/4 Loss=0.3306 Policy=0.0016 Value=0.7194 Entropy=0.6143 KL=0.0021 Corr=-0.0084
[Update 676/1000] Episode reward: -8.604, steps: 512




[PPO] Epoch 4/4 Loss=0.6971 Policy=-0.0038 Value=1.4631 Entropy=0.6133 KL=-0.0002 Corr=-0.0447
[Update 677/1000] Episode reward: 27.263, steps: 512




[PPO] Epoch 4/4 Loss=0.2477 Policy=0.0001 Value=0.5565 Entropy=0.6132 KL=-0.0001 Corr=-0.0497
[Update 678/1000] Episode reward: 16.230, steps: 512




[PPO] Epoch 4/4 Loss=0.2914 Policy=0.0021 Value=0.6401 Entropy=0.6152 KL=0.0030 Corr=-0.0087
[Update 679/1000] Episode reward: 24.797, steps: 512




[PPO] Epoch 4/4 Loss=0.1734 Policy=0.0014 Value=0.4064 Entropy=0.6258 KL=-0.0011 Corr=-0.0929
[Update 680/1000] Episode reward: 9.923, steps: 512




[PPO] Epoch 4/4 Loss=0.2176 Policy=0.0010 Value=0.4978 Entropy=0.6466 KL=0.0006 Corr=-0.0667
[Update 681/1000] Episode reward: 8.242, steps: 512




[PPO] Epoch 4/4 Loss=0.2289 Policy=-0.0031 Value=0.5312 Entropy=0.6718 KL=0.0037 Corr=-0.0281
[Update 682/1000] Episode reward: -6.516, steps: 512




[PPO] Epoch 4/4 Loss=0.4881 Policy=-0.0019 Value=1.0502 Entropy=0.7029 KL=0.0023 Corr=-0.0099
[Update 683/1000] Episode reward: 22.847, steps: 512




[PPO] Epoch 4/4 Loss=0.2524 Policy=0.0006 Value=0.5769 Entropy=0.7344 KL=-0.0005 Corr=-0.0016
[Update 684/1000] Episode reward: 4.347, steps: 512




[PPO] Epoch 4/4 Loss=0.2864 Policy=0.0006 Value=0.6470 Entropy=0.7552 KL=0.0033 Corr=0.0630
[Update 685/1000] Episode reward: 14.435, steps: 512




[PPO] Epoch 4/4 Loss=0.1340 Policy=-0.0062 Value=0.3555 Entropy=0.7507 KL=-0.0037 Corr=-0.0098
[Update 686/1000] Episode reward: 10.573, steps: 512




[PPO] Epoch 4/4 Loss=0.4646 Policy=0.0000 Value=1.0022 Entropy=0.7310 KL=-0.0009 Corr=-0.0339
[Update 687/1000] Episode reward: 31.359, steps: 512




[PPO] Epoch 4/4 Loss=0.1734 Policy=0.0019 Value=0.4149 Entropy=0.7177 KL=0.0021 Corr=-0.0009
[Update 688/1000] Episode reward: 0.270, steps: 512




[PPO] Epoch 4/4 Loss=0.2711 Policy=0.0001 Value=0.6134 Entropy=0.7126 KL=0.0056 Corr=0.0256
[Update 689/1000] Episode reward: -5.445, steps: 512




[PPO] Epoch 4/4 Loss=0.1692 Policy=-0.0023 Value=0.4145 Entropy=0.7158 KL=0.0001 Corr=0.0026
[Update 690/1000] Episode reward: 2.759, steps: 512




[PPO] Epoch 4/4 Loss=0.4296 Policy=-0.0003 Value=0.9319 Entropy=0.7198 KL=0.0023 Corr=0.0035
[Update 691/1000] Episode reward: 16.329, steps: 512




[PPO] Epoch 4/4 Loss=0.3488 Policy=-0.0018 Value=0.7738 Entropy=0.7273 KL=0.0011 Corr=0.0807
[Update 692/1000] Episode reward: -17.431, steps: 512




[PPO] Epoch 4/4 Loss=0.0940 Policy=0.0005 Value=0.2618 Entropy=0.7477 KL=0.0027 Corr=-0.0872
[Update 693/1000] Episode reward: 6.921, steps: 512




[PPO] Epoch 4/4 Loss=0.2000 Policy=0.0011 Value=0.4753 Entropy=0.7753 KL=0.0021 Corr=-0.0827
[Update 694/1000] Episode reward: -4.672, steps: 512




[PPO] Epoch 4/4 Loss=0.1305 Policy=0.0034 Value=0.3338 Entropy=0.7970 KL=-0.0004 Corr=-0.0539
[Update 695/1000] Episode reward: 12.108, steps: 512




[PPO] Epoch 4/4 Loss=0.1675 Policy=-0.0020 Value=0.4206 Entropy=0.8171 KL=-0.0055 Corr=-0.0477
[Update 696/1000] Episode reward: -2.879, steps: 512




[PPO] Epoch 4/4 Loss=0.1882 Policy=-0.0054 Value=0.4690 Entropy=0.8181 KL=0.0091 Corr=-0.0527
[Update 697/1000] Episode reward: 8.103, steps: 512




[PPO] Epoch 4/4 Loss=0.0339 Policy=-0.0002 Value=0.1470 Entropy=0.7874 KL=0.0060 Corr=-0.0827
[Update 698/1000] Episode reward: 3.206, steps: 512




[PPO] Epoch 4/4 Loss=0.5742 Policy=0.0023 Value=1.2200 Entropy=0.7626 KL=0.0036 Corr=-0.0240
[Update 699/1000] Episode reward: 16.510, steps: 512




[PPO] Epoch 4/4 Loss=0.6984 Policy=0.0027 Value=1.4660 Entropy=0.7450 KL=-0.0007 Corr=-0.0567
[Update 700/1000] Episode reward: 15.399, steps: 512




[PPO] Epoch 4/4 Loss=0.6456 Policy=-0.0008 Value=1.3666 Entropy=0.7364 KL=0.0009 Corr=0.0163
[Update 701/1000] Episode reward: -23.276, steps: 512




[PPO] Epoch 4/4 Loss=0.2872 Policy=-0.0014 Value=0.6516 Entropy=0.7430 KL=-0.0008 Corr=-0.0388
[Update 702/1000] Episode reward: -5.803, steps: 512




[PPO] Epoch 4/4 Loss=0.5655 Policy=-0.0006 Value=1.2086 Entropy=0.7646 KL=0.0016 Corr=0.0532
[Update 703/1000] Episode reward: -18.500, steps: 512




[PPO] Epoch 4/4 Loss=0.3950 Policy=-0.0008 Value=0.8702 Entropy=0.7860 KL=0.0006 Corr=0.0199
[Update 704/1000] Episode reward: -13.961, steps: 512




[PPO] Epoch 4/4 Loss=0.2771 Policy=0.0031 Value=0.6302 Entropy=0.8209 KL=-0.0023 Corr=-0.0474
[Update 705/1000] Episode reward: 4.335, steps: 512




[PPO] Epoch 4/4 Loss=0.4373 Policy=-0.0072 Value=0.9749 Entropy=0.8592 KL=0.0063 Corr=-0.0064
[Update 706/1000] Episode reward: 11.287, steps: 512




[PPO] Epoch 4/4 Loss=0.2390 Policy=0.0005 Value=0.5657 Entropy=0.8869 KL=0.0033 Corr=0.0379
[Update 707/1000] Episode reward: 9.262, steps: 512




[PPO] Epoch 4/4 Loss=0.3979 Policy=0.0024 Value=0.8805 Entropy=0.8963 KL=0.0006 Corr=0.0550
[Update 708/1000] Episode reward: 26.286, steps: 512




[PPO] Epoch 4/4 Loss=0.3054 Policy=-0.0032 Value=0.7067 Entropy=0.8949 KL=-0.0005 Corr=-0.0194
[Update 709/1000] Episode reward: 18.453, steps: 512




[PPO] Epoch 4/4 Loss=0.1825 Policy=-0.0005 Value=0.4555 Entropy=0.8931 KL=0.0029 Corr=0.0670
[Update 710/1000] Episode reward: -10.567, steps: 512




[PPO] Epoch 4/4 Loss=0.3601 Policy=0.0042 Value=0.8014 Entropy=0.8952 KL=0.0050 Corr=0.0150
[Update 711/1000] Episode reward: 0.987, steps: 512




[PPO] Epoch 4/4 Loss=0.2850 Policy=-0.0017 Value=0.6621 Entropy=0.8873 KL=0.0019 Corr=-0.0118
[Update 712/1000] Episode reward: 12.155, steps: 512




[PPO] Epoch 4/4 Loss=0.1267 Policy=-0.0021 Value=0.3455 Entropy=0.8810 KL=0.0060 Corr=-0.0744
[Update 713/1000] Episode reward: 3.061, steps: 512




[PPO] Epoch 4/4 Loss=0.2946 Policy=0.0014 Value=0.6771 Entropy=0.9087 KL=-0.0002 Corr=0.0360
[Update 714/1000] Episode reward: 7.167, steps: 512




[PPO] Epoch 4/4 Loss=0.2092 Policy=0.0007 Value=0.5072 Entropy=0.9017 KL=-0.0033 Corr=-0.0321
[Update 715/1000] Episode reward: 6.977, steps: 512




[PPO] Epoch 4/4 Loss=0.3681 Policy=-0.0015 Value=0.8233 Entropy=0.8425 KL=0.0060 Corr=0.0352
[Update 716/1000] Episode reward: 21.106, steps: 512




[PPO] Epoch 4/4 Loss=0.6304 Policy=-0.0028 Value=1.3468 Entropy=0.8029 KL=0.0061 Corr=-0.0347
[Update 717/1000] Episode reward: -17.637, steps: 512




[PPO] Epoch 4/4 Loss=0.3194 Policy=-0.0009 Value=0.7189 Entropy=0.7816 KL=0.0002 Corr=-0.0256
[Update 718/1000] Episode reward: -4.770, steps: 512




[PPO] Epoch 4/4 Loss=0.1023 Policy=-0.0047 Value=0.2900 Entropy=0.7608 KL=0.0004 Corr=-0.0694
[Update 719/1000] Episode reward: 12.262, steps: 512




[PPO] Epoch 4/4 Loss=0.1044 Policy=0.0020 Value=0.2796 Entropy=0.7493 KL=0.0020 Corr=0.0198
[Update 720/1000] Episode reward: 3.080, steps: 512




[PPO] Epoch 4/4 Loss=0.1184 Policy=0.0004 Value=0.3129 Entropy=0.7699 KL=-0.0010 Corr=-0.0372
[Update 721/1000] Episode reward: 15.172, steps: 512




[PPO] Epoch 4/4 Loss=0.4334 Policy=0.0039 Value=0.9366 Entropy=0.7758 KL=0.0030 Corr=0.0431
[Update 722/1000] Episode reward: 5.543, steps: 512




[PPO] Epoch 4/4 Loss=0.1336 Policy=-0.0051 Value=0.3514 Entropy=0.7388 KL=0.0052 Corr=0.0472
[Update 723/1000] Episode reward: 8.215, steps: 512




[PPO] Epoch 4/4 Loss=0.3794 Policy=0.0041 Value=0.8191 Entropy=0.6834 KL=-0.0006 Corr=-0.0082
[Update 724/1000] Episode reward: -19.453, steps: 512




[PPO] Epoch 4/4 Loss=0.4676 Policy=-0.0009 Value=1.0028 Entropy=0.6576 KL=-0.0005 Corr=0.0680
[Update 725/1000] Episode reward: -12.598, steps: 512




[PPO] Epoch 4/4 Loss=0.7780 Policy=0.0023 Value=1.6159 Entropy=0.6457 KL=0.0017 Corr=-0.0170
[Update 726/1000] Episode reward: -30.558, steps: 512




[PPO] Epoch 4/4 Loss=0.5502 Policy=0.0004 Value=1.1638 Entropy=0.6423 KL=0.0021 Corr=0.0269
[Update 727/1000] Episode reward: 10.124, steps: 512




[PPO] Epoch 4/4 Loss=0.2194 Policy=-0.0016 Value=0.5065 Entropy=0.6463 KL=0.0020 Corr=-0.0772
[Update 728/1000] Episode reward: -7.027, steps: 512




[PPO] Epoch 4/4 Loss=0.5252 Policy=-0.0036 Value=1.1235 Entropy=0.6590 KL=0.0031 Corr=-0.0467
[Update 729/1000] Episode reward: 10.679, steps: 512




[PPO] Epoch 4/4 Loss=0.2691 Policy=0.0038 Value=0.5981 Entropy=0.6742 KL=0.0003 Corr=0.1029
[Update 730/1000] Episode reward: -12.887, steps: 512




[PPO] Epoch 4/4 Loss=0.1232 Policy=0.0013 Value=0.3136 Entropy=0.6982 KL=0.0008 Corr=0.0351
[Update 731/1000] Episode reward: 0.935, steps: 512




[PPO] Epoch 4/4 Loss=0.3206 Policy=0.0029 Value=0.7095 Entropy=0.7389 KL=0.0012 Corr=0.0511
[Update 732/1000] Episode reward: 6.898, steps: 512




[PPO] Epoch 4/4 Loss=0.2517 Policy=0.0055 Value=0.5684 Entropy=0.7588 KL=0.0063 Corr=0.0479
[Update 733/1000] Episode reward: 0.613, steps: 512




[PPO] Epoch 4/4 Loss=0.1923 Policy=0.0013 Value=0.4589 Entropy=0.7691 KL=-0.0026 Corr=0.0349
[Update 734/1000] Episode reward: -7.040, steps: 512




[PPO] Epoch 4/4 Loss=0.2304 Policy=-0.0038 Value=0.5462 Entropy=0.7764 KL=0.0045 Corr=0.0835
[Update 735/1000] Episode reward: -19.741, steps: 512




[PPO] Epoch 4/4 Loss=0.1197 Policy=0.0042 Value=0.3088 Entropy=0.7767 KL=0.0012 Corr=0.0089
[Update 736/1000] Episode reward: 3.766, steps: 512




[PPO] Epoch 4/4 Loss=0.4954 Policy=0.0043 Value=1.0589 Entropy=0.7680 KL=-0.0008 Corr=-0.0510
[Update 737/1000] Episode reward: 8.824, steps: 512




[PPO] Epoch 4/4 Loss=0.1614 Policy=0.0017 Value=0.3963 Entropy=0.7682 KL=-0.0031 Corr=0.0937
[Update 738/1000] Episode reward: -0.023, steps: 512




[PPO] Epoch 4/4 Loss=0.2807 Policy=-0.0002 Value=0.6398 Entropy=0.7791 KL=0.0020 Corr=0.0006
[Update 739/1000] Episode reward: -10.491, steps: 512




[PPO] Epoch 4/4 Loss=0.4347 Policy=0.0028 Value=0.9428 Entropy=0.7902 KL=0.0025 Corr=0.0243
[Update 740/1000] Episode reward: 16.505, steps: 512




[PPO] Epoch 4/4 Loss=0.1124 Policy=0.0008 Value=0.3028 Entropy=0.7955 KL=0.0039 Corr=-0.0564
[Update 741/1000] Episode reward: -11.174, steps: 512




[PPO] Epoch 4/4 Loss=0.3828 Policy=-0.0052 Value=0.8544 Entropy=0.7827 KL=0.0029 Corr=0.0518
[Update 742/1000] Episode reward: 4.330, steps: 512




[PPO] Epoch 4/4 Loss=0.1941 Policy=-0.0012 Value=0.4646 Entropy=0.7388 KL=0.0049 Corr=-0.1056
[Update 743/1000] Episode reward: 1.329, steps: 512




[PPO] Epoch 4/4 Loss=0.2583 Policy=0.0007 Value=0.5856 Entropy=0.7037 KL=0.0058 Corr=0.0649
[Update 744/1000] Episode reward: 4.658, steps: 512




[PPO] Epoch 4/4 Loss=0.5387 Policy=0.0007 Value=1.1469 Entropy=0.7081 KL=-0.0034 Corr=0.0250
[Update 745/1000] Episode reward: 5.616, steps: 512




[PPO] Epoch 4/4 Loss=0.2499 Policy=-0.0039 Value=0.5820 Entropy=0.7440 KL=0.0046 Corr=-0.0060
[Update 746/1000] Episode reward: 4.600, steps: 512




[PPO] Epoch 4/4 Loss=0.5241 Policy=-0.0043 Value=1.1361 Entropy=0.7935 KL=-0.0003 Corr=-0.0192
[Update 747/1000] Episode reward: 10.161, steps: 512




[PPO] Epoch 4/4 Loss=0.2585 Policy=0.0046 Value=0.5901 Entropy=0.8215 KL=0.0052 Corr=-0.0002
[Update 748/1000] Episode reward: 12.224, steps: 512




[PPO] Epoch 4/4 Loss=0.5733 Policy=-0.0009 Value=1.2316 Entropy=0.8308 KL=0.0025 Corr=0.0053
[Update 749/1000] Episode reward: 26.703, steps: 512




[PPO] Epoch 4/4 Loss=0.0504 Policy=0.0015 Value=0.1814 Entropy=0.8343 KL=0.0034 Corr=-0.0470
[Update 750/1000] Episode reward: 2.980, steps: 512




[PPO] Epoch 4/4 Loss=0.4982 Policy=0.0011 Value=1.0783 Entropy=0.8395 KL=0.0016 Corr=-0.0083
[Update 751/1000] Episode reward: 23.122, steps: 512




[PPO] Epoch 4/4 Loss=0.1047 Policy=0.0040 Value=0.2854 Entropy=0.8393 KL=0.0035 Corr=-0.0606
[Update 752/1000] Episode reward: -7.874, steps: 512




[PPO] Epoch 4/4 Loss=0.1889 Policy=-0.0025 Value=0.4662 Entropy=0.8346 KL=0.0041 Corr=-0.0843
[Update 753/1000] Episode reward: -3.236, steps: 512




[PPO] Epoch 4/4 Loss=0.1622 Policy=0.0042 Value=0.3983 Entropy=0.8235 KL=0.0008 Corr=0.0411
[Update 754/1000] Episode reward: -2.997, steps: 512




[PPO] Epoch 4/4 Loss=0.2002 Policy=-0.0064 Value=0.4953 Entropy=0.8213 KL=0.0023 Corr=0.0349
[Update 755/1000] Episode reward: 2.933, steps: 512




[PPO] Epoch 4/4 Loss=0.4301 Policy=0.0043 Value=0.9340 Entropy=0.8246 KL=0.0027 Corr=-0.0161
[Update 756/1000] Episode reward: 6.097, steps: 512




[PPO] Epoch 4/4 Loss=0.2489 Policy=-0.0027 Value=0.5858 Entropy=0.8250 KL=0.0029 Corr=-0.0551
[Update 757/1000] Episode reward: 10.933, steps: 512




[PPO] Epoch 4/4 Loss=0.0992 Policy=-0.0040 Value=0.2887 Entropy=0.8223 KL=0.0016 Corr=0.0311
[Update 758/1000] Episode reward: -9.445, steps: 512




[PPO] Epoch 4/4 Loss=0.3009 Policy=-0.0003 Value=0.6845 Entropy=0.8212 KL=0.0014 Corr=0.0925
[Update 759/1000] Episode reward: 14.677, steps: 512




[PPO] Epoch 4/4 Loss=0.1018 Policy=0.0028 Value=0.2812 Entropy=0.8332 KL=0.0002 Corr=0.0167
[Update 760/1000] Episode reward: 3.809, steps: 512




[PPO] Epoch 4/4 Loss=0.4064 Policy=0.0012 Value=0.8968 Entropy=0.8644 KL=0.0003 Corr=0.0296
[Update 761/1000] Episode reward: -8.089, steps: 512




[PPO] Epoch 4/4 Loss=0.2345 Policy=0.0023 Value=0.5532 Entropy=0.8863 KL=0.0056 Corr=-0.0299
[Update 762/1000] Episode reward: -0.158, steps: 512




[PPO] Epoch 4/4 Loss=0.4914 Policy=-0.0006 Value=1.0736 Entropy=0.8960 KL=0.0061 Corr=0.0034
[Update 763/1000] Episode reward: 21.358, steps: 512




[PPO] Epoch 4/4 Loss=0.2147 Policy=-0.0034 Value=0.5268 Entropy=0.9068 KL=0.0033 Corr=-0.0223
[Update 764/1000] Episode reward: -0.660, steps: 512




[PPO] Epoch 4/4 Loss=0.2846 Policy=-0.0023 Value=0.6661 Entropy=0.9218 KL=0.0045 Corr=0.0074
[Update 765/1000] Episode reward: -15.238, steps: 512




[PPO] Epoch 4/4 Loss=0.4803 Policy=0.0002 Value=1.0532 Entropy=0.9305 KL=0.0033 Corr=0.0463
[Update 766/1000] Episode reward: -16.772, steps: 512




[PPO] Epoch 4/4 Loss=0.3736 Policy=-0.0009 Value=0.8430 Entropy=0.9400 KL=0.0017 Corr=0.0308
[Update 767/1000] Episode reward: -21.792, steps: 512




[PPO] Epoch 4/4 Loss=0.3709 Policy=0.0029 Value=0.8310 Entropy=0.9502 KL=0.0015 Corr=0.0409
[Update 768/1000] Episode reward: -16.277, steps: 512




[PPO] Epoch 4/4 Loss=0.3298 Policy=0.0023 Value=0.7510 Entropy=0.9587 KL=0.0010 Corr=-0.0343
[Update 769/1000] Episode reward: -20.049, steps: 512




[PPO] Epoch 4/4 Loss=0.4752 Policy=-0.0008 Value=1.0478 Entropy=0.9581 KL=-0.0014 Corr=-0.0409
[Update 770/1000] Episode reward: 6.615, steps: 512




[PPO] Epoch 4/4 Loss=0.2003 Policy=0.0020 Value=0.4922 Entropy=0.9565 KL=-0.0025 Corr=-0.0065
[Update 771/1000] Episode reward: -6.319, steps: 512




[PPO] Epoch 4/4 Loss=0.7821 Policy=0.0007 Value=1.6588 Entropy=0.9590 KL=0.0026 Corr=0.0258
[Update 772/1000] Episode reward: 20.664, steps: 512




[PPO] Epoch 4/4 Loss=0.4277 Policy=0.0028 Value=0.9455 Entropy=0.9573 KL=0.0047 Corr=0.0207
[Update 773/1000] Episode reward: -20.551, steps: 512




[PPO] Epoch 4/4 Loss=0.1882 Policy=0.0016 Value=0.4695 Entropy=0.9633 KL=-0.0002 Corr=-0.0948
[Update 774/1000] Episode reward: 3.920, steps: 512




[PPO] Epoch 4/4 Loss=0.2842 Policy=0.0010 Value=0.6639 Entropy=0.9751 KL=0.0028 Corr=-0.0537
[Update 775/1000] Episode reward: 10.991, steps: 512




[PPO] Epoch 4/4 Loss=0.1972 Policy=-0.0033 Value=0.4989 Entropy=0.9787 KL=0.0045 Corr=0.0195
[Update 776/1000] Episode reward: 0.828, steps: 512




[PPO] Epoch 4/4 Loss=0.7101 Policy=0.0009 Value=1.5158 Entropy=0.9747 KL=0.0031 Corr=0.0274
[Update 777/1000] Episode reward: -10.412, steps: 512




[PPO] Epoch 4/4 Loss=0.2903 Policy=-0.0002 Value=0.6781 Entropy=0.9721 KL=0.0015 Corr=-0.0298
[Update 778/1000] Episode reward: 0.596, steps: 512




[PPO] Epoch 4/4 Loss=0.3211 Policy=0.0003 Value=0.7378 Entropy=0.9623 KL=-0.0009 Corr=-0.0863
[Update 779/1000] Episode reward: -0.770, steps: 512




[PPO] Epoch 4/4 Loss=0.3260 Policy=-0.0033 Value=0.7540 Entropy=0.9524 KL=0.0034 Corr=0.0015
[Update 780/1000] Episode reward: -12.753, steps: 512




[PPO] Epoch 4/4 Loss=0.3151 Policy=0.0006 Value=0.7242 Entropy=0.9511 KL=-0.0027 Corr=0.0334
[Update 781/1000] Episode reward: -14.378, steps: 512




[PPO] Epoch 4/4 Loss=0.3082 Policy=0.0033 Value=0.7051 Entropy=0.9537 KL=-0.0004 Corr=0.0214
[Update 782/1000] Episode reward: 7.354, steps: 512




[PPO] Epoch 4/4 Loss=0.4870 Policy=-0.0041 Value=1.0771 Entropy=0.9487 KL=0.0015 Corr=0.0286
[Update 783/1000] Episode reward: -11.454, steps: 512




[PPO] Epoch 4/4 Loss=0.3853 Policy=-0.0006 Value=0.8658 Entropy=0.9404 KL=0.0020 Corr=-0.0473
[Update 784/1000] Episode reward: -22.441, steps: 512




[PPO] Epoch 4/4 Loss=0.3294 Policy=-0.0018 Value=0.7567 Entropy=0.9430 KL=0.0037 Corr=0.0119
[Update 785/1000] Episode reward: -23.805, steps: 512




[PPO] Epoch 4/4 Loss=0.3277 Policy=0.0015 Value=0.7477 Entropy=0.9525 KL=0.0014 Corr=-0.0869
[Update 786/1000] Episode reward: -21.039, steps: 512




[PPO] Epoch 4/4 Loss=0.4634 Policy=-0.0035 Value=1.0305 Entropy=0.9680 KL=0.0034 Corr=-0.0202
[Update 787/1000] Episode reward: -35.027, steps: 512




[PPO] Epoch 4/4 Loss=0.2317 Policy=-0.0054 Value=0.5730 Entropy=0.9879 KL=0.0023 Corr=0.0396
[Update 788/1000] Episode reward: -22.632, steps: 512




[PPO] Epoch 4/4 Loss=0.4081 Policy=0.0043 Value=0.9076 Entropy=1.0004 KL=0.0033 Corr=-0.0645
[Update 789/1000] Episode reward: 3.951, steps: 512




[PPO] Epoch 4/4 Loss=0.2236 Policy=-0.0048 Value=0.5566 Entropy=0.9974 KL=0.0028 Corr=-0.0161
[Update 790/1000] Episode reward: -19.317, steps: 512




[PPO] Epoch 4/4 Loss=0.3133 Policy=0.0005 Value=0.7248 Entropy=0.9919 KL=0.0024 Corr=-0.0446
[Update 791/1000] Episode reward: -6.724, steps: 512




[PPO] Epoch 4/4 Loss=0.6031 Policy=-0.0028 Value=1.3092 Entropy=0.9750 KL=0.0093 Corr=0.1049
[Update 792/1000] Episode reward: 7.088, steps: 512




[PPO] Epoch 4/4 Loss=0.3122 Policy=-0.0093 Value=0.7411 Entropy=0.9815 KL=0.0091 Corr=0.0409
[Update 793/1000] Episode reward: -4.416, steps: 512




[PPO] Epoch 4/4 Loss=0.2369 Policy=0.0057 Value=0.5613 Entropy=0.9893 KL=0.0058 Corr=0.0335
[Update 794/1000] Episode reward: 12.470, steps: 512




[PPO] Epoch 4/4 Loss=0.0926 Policy=0.0059 Value=0.2726 Entropy=0.9915 KL=0.0077 Corr=0.0029
[Update 795/1000] Episode reward: -6.366, steps: 512




[PPO] Epoch 4/4 Loss=0.0184 Policy=-0.0114 Value=0.1590 Entropy=0.9926 KL=0.0075 Corr=0.0535
[Update 796/1000] Episode reward: -5.757, steps: 512




[PPO] Epoch 4/4 Loss=0.5449 Policy=0.0090 Value=1.1677 Entropy=0.9607 KL=0.0104 Corr=-0.0238
[Update 797/1000] Episode reward: -36.972, steps: 512




[PPO] Epoch 4/4 Loss=0.2166 Policy=0.0060 Value=0.5150 Entropy=0.9377 KL=0.0027 Corr=0.0032
[Update 798/1000] Episode reward: -12.187, steps: 512




[PPO] Epoch 4/4 Loss=0.5462 Policy=-0.0041 Value=1.1936 Entropy=0.9299 KL=0.0021 Corr=0.0416
[Update 799/1000] Episode reward: 0.507, steps: 512




[PPO] Epoch 4/4 Loss=0.2101 Policy=-0.0029 Value=0.5180 Entropy=0.9193 KL=0.0030 Corr=0.0392
[Update 800/1000] Episode reward: -9.741, steps: 512




[PPO] Epoch 4/4 Loss=0.3552 Policy=-0.0047 Value=0.8122 Entropy=0.9248 KL=0.0171 Corr=-0.0229
[Update 801/1000] Episode reward: -6.607, steps: 512




[PPO] Epoch 4/4 Loss=0.1110 Policy=0.0031 Value=0.3082 Entropy=0.9235 KL=0.0144 Corr=-0.0817
[Update 802/1000] Episode reward: -14.562, steps: 512




[PPO] Epoch 4/4 Loss=0.0089 Policy=0.0012 Value=0.1059 Entropy=0.9058 KL=0.0125 Corr=0.0455
[Update 803/1000] Episode reward: -12.166, steps: 512




[PPO] Epoch 4/4 Loss=0.4849 Policy=0.0015 Value=1.0559 Entropy=0.8913 KL=0.0082 Corr=-0.0753
[Update 804/1000] Episode reward: 6.705, steps: 512




[PPO] Epoch 4/4 Loss=0.0395 Policy=-0.0001 Value=0.1676 Entropy=0.8862 KL=-0.0001 Corr=0.0926
[Update 805/1000] Episode reward: -16.797, steps: 512




[PPO] Epoch 4/4 Loss=0.0601 Policy=0.0056 Value=0.1986 Entropy=0.8967 KL=0.0008 Corr=-0.0196
[Update 806/1000] Episode reward: -8.936, steps: 512




[PPO] Epoch 4/4 Loss=0.1702 Policy=-0.0056 Value=0.4433 Entropy=0.9178 KL=0.0123 Corr=-0.0365
[Update 807/1000] Episode reward: 5.766, steps: 512




[PPO] Epoch 4/4 Loss=0.1526 Policy=-0.0101 Value=0.4165 Entropy=0.9109 KL=0.0013 Corr=-0.0032
[Update 808/1000] Episode reward: -3.381, steps: 512




[PPO] Epoch 4/4 Loss=0.7227 Policy=-0.0072 Value=1.5471 Entropy=0.8725 KL=-0.0008 Corr=-0.0375
[Update 809/1000] Episode reward: 21.651, steps: 512




[PPO] Epoch 4/4 Loss=0.7462 Policy=0.0027 Value=1.5723 Entropy=0.8539 KL=0.0045 Corr=0.0218
[Update 810/1000] Episode reward: -12.674, steps: 512




[PPO] Epoch 4/4 Loss=0.2945 Policy=0.0019 Value=0.6715 Entropy=0.8636 KL=0.0008 Corr=-0.0100
[Update 811/1000] Episode reward: -4.631, steps: 512




[PPO] Epoch 4/4 Loss=0.3485 Policy=-0.0087 Value=0.8029 Entropy=0.8840 KL=0.0026 Corr=-0.0128
[Update 812/1000] Episode reward: -21.226, steps: 512




[PPO] Epoch 4/4 Loss=0.2547 Policy=-0.0084 Value=0.6177 Entropy=0.9145 KL=0.0121 Corr=0.0353
[Update 813/1000] Episode reward: -7.489, steps: 512




[PPO] Epoch 4/4 Loss=0.1927 Policy=0.0001 Value=0.4783 Entropy=0.9309 KL=0.0054 Corr=-0.0445
[Update 814/1000] Episode reward: 2.633, steps: 512




[PPO] Epoch 4/4 Loss=0.1487 Policy=0.0054 Value=0.3797 Entropy=0.9301 KL=0.0034 Corr=-0.0436
[Update 815/1000] Episode reward: 2.413, steps: 512




[PPO] Epoch 4/4 Loss=0.0872 Policy=0.0030 Value=0.2609 Entropy=0.9262 KL=0.0018 Corr=-0.0247
[Update 816/1000] Episode reward: -7.617, steps: 512




[PPO] Epoch 4/4 Loss=0.1622 Policy=-0.0025 Value=0.4200 Entropy=0.9077 KL=0.0038 Corr=-0.0115
[Update 817/1000] Episode reward: -7.453, steps: 512




[PPO] Epoch 4/4 Loss=0.2215 Policy=-0.0059 Value=0.5413 Entropy=0.8646 KL=-0.0000 Corr=-0.0071
[Update 818/1000] Episode reward: 8.649, steps: 512




[PPO] Epoch 4/4 Loss=0.2381 Policy=-0.0054 Value=0.5698 Entropy=0.8286 KL=0.0051 Corr=-0.0323
[Update 819/1000] Episode reward: 9.424, steps: 512




[PPO] Epoch 4/4 Loss=0.2297 Policy=-0.0035 Value=0.5467 Entropy=0.8026 KL=0.0073 Corr=-0.0398
[Update 820/1000] Episode reward: 8.896, steps: 512




[PPO] Epoch 4/4 Loss=0.2150 Policy=-0.0033 Value=0.5138 Entropy=0.7728 KL=0.0074 Corr=0.0609
[Update 821/1000] Episode reward: -0.986, steps: 512




In [None]:
from ai2thor.controller import Controller
from cons import NUM_ACTIONS, EPISODE_STEPS, DEVICE, FEAT_DIM, TRAIN_EPOCHS
from models import ActorCritic
from rl import PPOTrainer, CLIPCuriosity, ThorNavEnv, RNDCuriosity, ExtrinsicReward
import torch
import os


# Create the curiosity module.
# NOTE: despite the variable name `clip_curiosity`, this cell instantiates
# RNDCuriosity, not CLIPCuriosity. The name is kept because the training loop
# further down in this cell passes `clip_curiosity` to ThorNavEnv.
clip_curiosity = RNDCuriosity(
    device=DEVICE
)

# Example: simple extrinsic reward (optional)
def extrinsic_reward_fn(event):
    """Compute a simple shaped extrinsic reward from a simulator event.

    Two keys of ``event.metadata`` are consulted:

    * ``"lastActionSuccess"`` (treated as ``True`` when missing): a falsy
      value contributes a penalty of ``-0.1``.
    * ``"lastAction"`` (treated as ``""`` when missing): if it contains the
      substring ``"Move"``, a bonus of ``0.05`` is contributed.

    Args:
        event: Any object exposing a dict-like ``metadata`` attribute.

    Returns:
        float: The movement bonus plus the failure penalty.
    """
    meta = event.metadata
    succeeded = meta.get("lastActionSuccess", True)
    action_name = meta.get("lastAction", "")

    # Slightly punish actions the simulator reports as failed.
    penalty = 0.0 if succeeded else -0.1
    # Slightly reward any action whose name contains "Move".
    bonus = 0.05 if "Move" in action_name else 0.0

    return bonus + penalty

# Build the actor-critic model and the PPO trainer that wraps it.
# FEAT_DIM, NUM_ACTIONS and DEVICE are imported from the project-local `cons`
# module; hidden_dim=256 is hard-coded here.
ac = ActorCritic(feat_dim=FEAT_DIM, hidden_dim=256, num_actions=NUM_ACTIONS, device=DEVICE)
trainer = PPOTrainer(ac)


# Training loop: each iteration samples one random house, collects one rollout
# in it and runs one PPO update. Relies on `train_scenes` and `NUM_UPDATES`
# defined in an earlier cell of this notebook.
for upd in range(1, NUM_UPDATES + 1):
    # Sample a random scene index uniformly from the training split.
    idx = torch.randint(0, len(train_scenes), (1,)).item()
    house = train_scenes[idx]

    # A fresh simulator controller is started for every update. It is created
    # outside the `try` so that the `finally` below only runs once
    # `controller` actually exists.
    controller = Controller(
        scene=house,
        snapToGrid=False,
        rotateStepDegrees=30,
        renderDepthImage=True,
    )

    try:
        # NOTE(review): the env is given ExtrinsicReward() from `rl`; the
        # `extrinsic_reward_fn` defined earlier in this cell is never passed
        # anywhere in the visible code - confirm which one is intended.
        env = ThorNavEnv(controller, clip_curiosity, extrinsic_reward=ExtrinsicReward())

        # Collect one rollout of up to EPISODE_STEPS steps, then update the
        # policy on it for TRAIN_EPOCHS epochs.
        buf, ep_reward = trainer.collect_rollout(env, horizon=EPISODE_STEPS)
        trainer.ppo_update(buf, epochs=TRAIN_EPOCHS)

        print(f"[Update {upd}/{NUM_UPDATES}] Episode reward: {ep_reward:.3f}, steps: {len(buf)}")

        # Save the model weights every 10 updates.
        if upd % 10 == 0:
            os.makedirs("checkpoints", exist_ok=True)
            torch.save(ac.state_dict(), f"checkpoints/ac_update_{upd}.pt")

    finally:
        # Always shut the simulator down, even if the rollout or update raised.
        controller.stop()

In [5]:
# Inspection cell: sample one random house and start a standalone controller
# with the same settings the training loop uses. Depends on `torch`,
# `train_scenes` and `Controller` having been defined/imported by earlier cells.
# NOTE(review): no matching controller.stop() is visible in this part of the
# notebook - confirm the controller is stopped in a later cell.
idx = torch.randint(0, len(train_scenes), (1,)).item()
house = train_scenes[idx]

controller = Controller(
    scene=house,
    snapToGrid=False,
    rotateStepDegrees=30,
    renderDepthImage=True,
)

In [6]:
# Issue a single "MoveAhead" action and keep the returned event for inspection.
event = controller.step("MoveAhead")

In [12]:
# Display the full metadata dict of the last event. The output is very large
# (it starts with a per-object list under the 'objects' key).
event.metadata

{'objects': [{'name': 'AlarmClock|surface|4|0',
   'position': {'x': 4.625216484069824,
    'y': 0.5214699506759644,
    'z': 1.6673753261566162},
   'rotation': {'x': 0.10064005851745605,
    'y': 180.00376892089844,
    'z': 359.9861755371094},
   'visible': False,
   'isInteractable': False,
   'receptacle': False,
   'toggleable': False,
   'isToggled': False,
   'breakable': False,
   'isBroken': False,
   'canFillWithLiquid': False,
   'isFilledWithLiquid': False,
   'fillLiquid': None,
   'dirtyable': False,
   'isDirty': False,
   'canBeUsedUp': False,
   'isUsedUp': False,
   'cookable': False,
   'isCooked': False,
   'temperature': 'RoomTemp',
   'isHeatSource': False,
   'isColdSource': False,
   'sliceable': False,
   'isSliced': False,
   'openable': False,
   'isOpen': False,
   'openness': 0.0,
   'pickupable': True,
   'isPickedUp': False,
   'moveable': False,
   'mass': 0.800000011920929,
   'salientMaterials': ['Metal', 'Plastic', 'Glass'],
   'receptacleObjectIds':