In [None]:
!git clone git@github.com:facebookresearch/vggt.git

In [None]:
# Move everything inside the nested vggt/vggt directory up into the checkout root (vggt/).
# NOTE(review): presumably done so the package modules are importable from the
# notebook's working directory — confirm against how rl.py imports VGGT.
# NOTE(review): not re-runnable — after the first run the inner directory is
# empty, so the glob should match nothing and mv will report an error.
!mv vggt/vggt/* vggt/

In [2]:
# Load the "procthor-10k" dataset through `prior` and keep its "train" split;
# the training loops below sample their scenes from `train_scenes`.
import prior
dataset = prior.load_dataset("procthor-10k")
train_scenes = dataset["train"]

# Number of rollout + PPO-update iterations each training loop below performs.
NUM_UPDATES = 100


    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 11645.93it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 12278.87it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 12267.81it/s]


In [3]:
from ai2thor.controller import Controller
from cons import NUM_ACTIONS, EPISODE_STEPS, DEVICE, FEAT_DIM, TRAIN_EPOCHS
from models import ActorCritic
from rl import PPOTrainer, CLIPCuriosity, ThorNavEnv, VGGTCuriosity
import torch
import os


# Create the curiosity module for this run. It is a VGGTCuriosity instance even
# though the variable is named `clip_curiosity`; ThorNavEnv below receives it
# under that name.
# NOTE(review): the meaning of each keyword is defined in rl.VGGTCuriosity,
# which is not visible here — the inline notes are assumptions to confirm.
clip_curiosity = VGGTCuriosity(
    device=DEVICE,
    buffer_size=EPISODE_STEPS * 20,  # 20x the rollout horizon used below (horizon=EPISODE_STEPS)
    topk=5,
    ema_beta=0.99,
    reward_scale=1.0,
    every_n_steps=1,  # or 2/4 to save compute (presumably evaluates the curiosity model every n-th step — confirm)
)

# Example: simple extrinsic reward (optional)
def extrinsic_reward_fn(event):
    """Return a small penalty when the last action is reported as failed.

    Args:
        event: Object exposing a ``metadata`` mapping. Its
            ``"lastActionSuccess"`` entry is read; a missing entry is
            treated as success.

    Returns:
        ``-0.1`` when ``"lastActionSuccess"`` is present and falsy,
        otherwise ``0.0``.
    """
    action_succeeded = event.metadata.get("lastActionSuccess", True)
    return 0.0 if action_succeeded else -0.1

# Actor-critic network and the PPO trainer that optimises it.
ac = ActorCritic(
    feat_dim=FEAT_DIM,
    hidden_dim=256,
    num_actions=NUM_ACTIONS,
    device=DEVICE,
)
trainer = PPOTrainer(ac)


# Each update: pick a random training scene, create a Controller for it,
# collect one rollout, run PPO on that rollout, then stop the Controller.
for upd in range(1, NUM_UPDATES + 1):
    # Scene index is drawn with torch's RNG and converted to a plain int.
    idx = torch.randint(low=0, high=len(train_scenes), size=(1,)).item()
    house = train_scenes[idx]

    controller = Controller(scene=house, snapToGrid=False, rotateStepDegrees=30, renderDepthImage=True)

    try:
        env = ThorNavEnv(
            controller,
            clip_curiosity,
            extrinsic_reward_fn=extrinsic_reward_fn,
        )

        # One rollout with horizon EPISODE_STEPS, followed by TRAIN_EPOCHS PPO epochs on it.
        buf, ep_reward = trainer.collect_rollout(env, horizon=EPISODE_STEPS)
        trainer.ppo_update(buf, epochs=TRAIN_EPOCHS)

        print(f"[Update {upd}/{NUM_UPDATES}] Episode reward: {ep_reward:.3f}, steps: {len(buf)}")

        # Write the actor-critic weights on every 10th update.
        if not upd % 10:
            os.makedirs("checkpoints", exist_ok=True)
            torch.save(ac.state_dict(), f"checkpoints/ac_update_{upd}.pt")

    finally:
        # Executed on normal completion, on errors and on KeyboardInterrupt alike.
        controller.stop()

  from .autonotebook import tqdm as notebook_tqdm


[PPO] Epoch 4/4 Loss=0.0496 Policy=-0.0018 Value=0.1247 Entropy=1.0965 KL=0.0000
[Update 1/100] Episode reward: -5.831, steps: 256




[PPO] Epoch 4/4 Loss=0.0222 Policy=-0.0010 Value=0.0682 Entropy=1.0958 KL=0.0017
[Update 2/100] Episode reward: -0.753, steps: 256




[PPO] Epoch 4/4 Loss=0.0422 Policy=-0.0067 Value=0.1195 Entropy=1.0943 KL=0.0018
[Update 3/100] Episode reward: -4.915, steps: 256




[PPO] Epoch 4/4 Loss=0.1164 Policy=-0.0020 Value=0.2587 Entropy=1.0889 KL=0.0013
[Update 4/100] Episode reward: -2.511, steps: 256




[PPO] Epoch 4/4 Loss=-0.0058 Policy=-0.0082 Value=0.0262 Entropy=1.0748 KL=0.0074
[Update 5/100] Episode reward: -2.763, steps: 256




[PPO] Epoch 4/4 Loss=0.0193 Policy=-0.0021 Value=0.0637 Entropy=1.0447 KL=0.0012
[Update 6/100] Episode reward: -0.844, steps: 256




[PPO] Epoch 4/4 Loss=0.0380 Policy=0.0002 Value=0.0965 Entropy=1.0394 KL=0.0007
[Update 7/100] Episode reward: 2.954, steps: 256




[PPO] Epoch 4/4 Loss=0.0158 Policy=-0.0008 Value=0.0538 Entropy=1.0290 KL=-0.0000
[Update 8/100] Episode reward: -1.784, steps: 256




[PPO] Epoch 4/4 Loss=-0.0052 Policy=-0.0042 Value=0.0183 Entropy=1.0112 KL=0.0021
[Update 9/100] Episode reward: -0.308, steps: 256




[PPO] Epoch 4/4 Loss=-0.0127 Policy=-0.0117 Value=0.0175 Entropy=0.9819 KL=0.0042
[Update 10/100] Episode reward: -0.698, steps: 256




[PPO] Epoch 4/4 Loss=0.0024 Policy=-0.0045 Value=0.0316 Entropy=0.8875 KL=-0.0057
[Update 11/100] Episode reward: -0.754, steps: 256




[PPO] Epoch 4/4 Loss=0.0085 Policy=0.0002 Value=0.0339 Entropy=0.8689 KL=-0.0048
[Update 12/100] Episode reward: 0.827, steps: 256




[PPO] Epoch 4/4 Loss=-0.0059 Policy=-0.0005 Value=0.0059 Entropy=0.8357 KL=0.0002
[Update 13/100] Episode reward: 0.296, steps: 256




[PPO] Epoch 4/4 Loss=0.0224 Policy=-0.0024 Value=0.0672 Entropy=0.8870 KL=-0.0015
[Update 14/100] Episode reward: 0.379, steps: 256




[PPO] Epoch 4/4 Loss=0.0214 Policy=-0.0003 Value=0.0607 Entropy=0.8675 KL=0.0023
[Update 15/100] Episode reward: -0.579, steps: 256




[PPO] Epoch 4/4 Loss=0.0363 Policy=-0.0019 Value=0.0940 Entropy=0.8800 KL=-0.0001
[Update 16/100] Episode reward: -1.107, steps: 256




[PPO] Epoch 4/4 Loss=0.0416 Policy=-0.0021 Value=0.1056 Entropy=0.9074 KL=-0.0015
[Update 17/100] Episode reward: 0.914, steps: 256




[PPO] Epoch 4/4 Loss=0.0013 Policy=-0.0017 Value=0.0235 Entropy=0.8789 KL=0.0037
[Update 18/100] Episode reward: -0.019, steps: 256




[PPO] Epoch 4/4 Loss=0.1599 Policy=-0.0013 Value=0.3417 Entropy=0.9617 KL=0.0015
[Update 19/100] Episode reward: -2.236, steps: 256




[PPO] Epoch 4/4 Loss=0.0049 Policy=-0.0020 Value=0.0326 Entropy=0.9357 KL=-0.0007
[Update 20/100] Episode reward: 0.275, steps: 256




[PPO] Epoch 4/4 Loss=-0.0008 Policy=-0.0003 Value=0.0170 Entropy=0.9046 KL=-0.0010
[Update 21/100] Episode reward: -0.081, steps: 256




[PPO] Epoch 4/4 Loss=0.0072 Policy=-0.0006 Value=0.0341 Entropy=0.9246 KL=0.0002
[Update 22/100] Episode reward: -0.519, steps: 256




[PPO] Epoch 4/4 Loss=0.0566 Policy=-0.0000 Value=0.1319 Entropy=0.9342 KL=-0.0006
[Update 23/100] Episode reward: 0.579, steps: 256




[PPO] Epoch 4/4 Loss=0.0068 Policy=-0.0039 Value=0.0407 Entropy=0.9695 KL=-0.0009
[Update 24/100] Episode reward: 0.628, steps: 256




[PPO] Epoch 4/4 Loss=0.0058 Policy=-0.0006 Value=0.0321 Entropy=0.9648 KL=0.0052
[Update 25/100] Episode reward: -0.104, steps: 256




[PPO] Epoch 4/4 Loss=0.0594 Policy=-0.0013 Value=0.1411 Entropy=0.9815 KL=0.0002
[Update 26/100] Episode reward: -3.328, steps: 256




[PPO] Epoch 4/4 Loss=-0.0006 Policy=-0.0047 Value=0.0273 Entropy=0.9570 KL=0.0019
[Update 27/100] Episode reward: -0.924, steps: 256




KeyboardInterrupt: 

In [None]:
from ai2thor.controller import Controller
from cons import NUM_ACTIONS, EPISODE_STEPS, DEVICE, FEAT_DIM, TRAIN_EPOCHS
from models import ActorCritic
from rl import PPOTrainer, CLIPCuriosity, ThorNavEnv, RNDCuriosity
import torch
import os


# Create the curiosity module for this run. It is an RNDCuriosity instance even
# though the variable is named `clip_curiosity`; ThorNavEnv below receives it
# under that name. Running this cell rebinds the name used by the VGGT cell above.
# NOTE(review): these keyword arguments are identical to the ones passed to
# VGGTCuriosity in the earlier cell — confirm that RNDCuriosity accepts and
# uses buffer_size / topk / ema_beta (its definition in rl.py is not visible here).
clip_curiosity = RNDCuriosity(
    device=DEVICE,
    buffer_size=EPISODE_STEPS * 20,  # 20x the rollout horizon used below (horizon=EPISODE_STEPS)
    topk=5,
    ema_beta=0.99,
    reward_scale=1.0,
    every_n_steps=1,  # or 2/4 to save compute (presumably evaluates the curiosity model every n-th step — confirm)
)

# Example: simple extrinsic reward (optional)
def extrinsic_reward_fn(event):
    """Penalise a failed action slightly; give no extrinsic reward otherwise.

    Args:
        event: Object exposing a ``metadata`` mapping. Only its
            ``"lastActionSuccess"`` entry is consulted, and an absent
            entry counts as success.

    Returns:
        ``0.0`` when the last action succeeded (or the flag is absent),
        ``-0.1`` when the flag is present and falsy.
    """
    if event.metadata.get("lastActionSuccess", True):
        return 0.0
    return -0.1

# Build a fresh actor-critic and PPO trainer for the RND run. This rebinds `ac`
# and `trainer`, so the objects created by the VGGT cell are no longer
# reachable through these names.
ac = ActorCritic(feat_dim=FEAT_DIM, hidden_dim=256, num_actions=NUM_ACTIONS, device=DEVICE)
trainer = PPOTrainer(ac)

# Checkpoints of this run go to their own sub-directory. The VGGT training cell
# writes "checkpoints/ac_update_<n>.pt"; saving under the same names here would
# silently overwrite that run's weights.
RND_CHECKPOINT_DIR = os.path.join("checkpoints", "rnd")


for upd in range(1, NUM_UPDATES + 1):
    # Sample a random training scene (index drawn with torch's RNG)
    idx = torch.randint(0, len(train_scenes), (1,)).item()
    house = train_scenes[idx]

    # A new Controller is created for every update and stopped in `finally`.
    controller = Controller(
        scene=house,
        snapToGrid=False,
        rotateStepDegrees=30,
        renderDepthImage=True,
    )

    try:
        env = ThorNavEnv(controller, clip_curiosity, extrinsic_reward_fn=extrinsic_reward_fn)

        # One rollout with horizon EPISODE_STEPS, then TRAIN_EPOCHS PPO epochs on it.
        buf, ep_reward = trainer.collect_rollout(env, horizon=EPISODE_STEPS)
        trainer.ppo_update(buf, epochs=TRAIN_EPOCHS)

        print(f"[Update {upd}/{NUM_UPDATES}] Episode reward: {ep_reward:.3f}, steps: {len(buf)}")

        # Save the actor-critic weights on every 10th update.
        if upd % 10 == 0:
            os.makedirs(RND_CHECKPOINT_DIR, exist_ok=True)
            torch.save(ac.state_dict(), os.path.join(RND_CHECKPOINT_DIR, f"ac_update_{upd}.pt"))

    finally:
        # Executed on normal completion, on errors and on KeyboardInterrupt alike.
        controller.stop()