In [None]:
!git clone git@github.com:facebookresearch/vggt.git

In [None]:
# Move everything inside the nested vggt/vggt/ directory one level up into vggt/.
# NOTE(review): presumably done so the package modules sit directly under ./vggt
# and can be imported from this notebook's directory — confirm.
!mv vggt/vggt/* vggt/

In [1]:
# Dataset setup: ProcTHOR-10K houses used as training scenes.
import prior

# How many iterations the training loop below runs (one rollout + PPO update each).
NUM_UPDATES = 100

# Load every split once, then keep a handle on the training houses.
dataset = prior.load_dataset("procthor-10k")
train_scenes = dataset["train"]


Fetching reference HEAD


    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 23990.36it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 24449.31it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 24614.32it/s]


In [2]:
from ai2thor.controller import Controller
from cons import NUM_ACTIONS, EPISODE_STEPS, DEVICE, FEAT_DIM, TRAIN_EPOCHS
from models import ActorCritic
from rl import PPOTrainer, CLIPCuriosity, ThorNavEnv, VGGTCuriosity
import torch
import os


# Curiosity (intrinsic reward) module.
# Note: the variable is still called `clip_curiosity` because the training loop
# refers to it by that name, but the class instantiated here is VGGTCuriosity.
clip_curiosity = VGGTCuriosity(
    reward_scale=1.0,
    ema_beta=0.99,
    topk=5,
    every_n_steps=1,  # 2 or 4 were suggested as alternatives to save compute
    buffer_size=EPISODE_STEPS * 20,
    device=DEVICE,
)

# Optional extrinsic reward: a small penalty for actions the simulator rejected.
def extrinsic_reward_fn(event):
    """Return -0.1 when the event reports a failed action, otherwise 0.0.

    ``event.metadata`` is queried for ``"lastActionSuccess"`` with a default of
    ``True``, so an event without that key is treated as a success.
    """
    succeeded = event.metadata.get("lastActionSuccess", True)
    return 0.0 if succeeded else -0.1

# Build the actor-critic network and hand it to the PPO trainer.
ac = ActorCritic(feat_dim=FEAT_DIM, hidden_dim=256, num_actions=NUM_ACTIONS, device=DEVICE)
trainer = PPOTrainer(ac)

# A single simulator instance is shared by all updates. It is launched lazily on
# the first iteration (so it starts directly in a sampled house) and afterwards
# only the scene is swapped via `reset`, instead of starting and stopping a new
# Controller for every update.
controller = None

try:
    for upd in range(1, NUM_UPDATES + 1):
        # Sample a random training house for this update.
        idx = torch.randint(0, len(train_scenes), (1,)).item()
        house = train_scenes[idx]

        if controller is None:
            controller = Controller(
                scene=house,
                snapToGrid=False,
                rotateStepDegrees=30,
                renderDepthImage=True,
            )
        else:
            # Load the new house into the running simulator.
            # NOTE(review): relies on reset() keeping the launch-time settings
            # (snapToGrid, rotateStepDegrees, renderDepthImage) — confirm
            # against the installed ai2thor version.
            controller.reset(scene=house)

        env = ThorNavEnv(controller, clip_curiosity, extrinsic_reward_fn=extrinsic_reward_fn)

        # One rollout of up to EPISODE_STEPS steps, followed by a PPO update on it.
        buf, ep_reward = trainer.collect_rollout(env, horizon=EPISODE_STEPS)
        trainer.ppo_update(buf, epochs=TRAIN_EPOCHS)

        print(f"[Update {upd}/{NUM_UPDATES}] Episode reward: {ep_reward:.3f}, steps: {len(buf)}")

        # Save the actor-critic weights every 10 updates.
        if upd % 10 == 0:
            os.makedirs("checkpoints", exist_ok=True)
            torch.save(ac.state_dict(), f"checkpoints/ac_update_{upd}.pt")

finally:
    # Always shut the simulator down, including on errors or a manual interrupt.
    if controller is not None:
        controller.stop()

[PPO] Epoch 4/4 Loss=0.1369 Policy=-0.0069 Value=0.3095 Entropy=1.0978 KL=-0.0090
[Update 1/100] Episode reward: -0.804, steps: 256
[PPO] Epoch 4/4 Loss=0.0636 Policy=-0.0026 Value=0.1542 Entropy=1.0945 KL=0.0012
[Update 2/100] Episode reward: 2.329, steps: 256
[PPO] Epoch 4/4 Loss=0.1742 Policy=-0.0001 Value=0.3705 Entropy=1.0930 KL=0.0018
[Update 3/100] Episode reward: -1.398, steps: 256
[PPO] Epoch 4/4 Loss=0.1600 Policy=-0.0032 Value=0.3481 Entropy=1.0919 KL=0.0017
[Update 4/100] Episode reward: -0.438, steps: 256
[PPO] Epoch 4/4 Loss=0.0741 Policy=-0.0017 Value=0.1734 Entropy=1.0888 KL=-0.0009
[Update 5/100] Episode reward: 0.831, steps: 256
[PPO] Epoch 4/4 Loss=0.1000 Policy=-0.0082 Value=0.2381 Entropy=1.0794 KL=0.0005
[Update 6/100] Episode reward: -0.511, steps: 256
[PPO] Epoch 4/4 Loss=0.0987 Policy=-0.0003 Value=0.2194 Entropy=1.0644 KL=0.0017
[Update 7/100] Episode reward: 3.427, steps: 256


KeyboardInterrupt: 