In [1]:
import prior

dataset = prior.load_dataset("procthor-10k")
dataset

    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 11395.48it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 11860.88it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 12041.21it/s]


DatasetDict(
    train=Dataset(
    dataset=procthor-dataset,
    size=10000,
    split=train
),
    val=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=val
),
    test=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=test
)
)

In [2]:
from ai2thor.controller import Controller


house = dataset["train"][3]
controller = Controller(scene=house, snapToGrid=False, rotateStepDegrees=30)
event = controller.step("Pass")
spawn = event.metadata["agent"]["position"]



In [3]:
from rl import PPO, CLIPNovelty, ClipEnv, ActorCritic
from models import FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic
from cons import FEAT_DIM, NUM_ACTIONS

ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(project_to_out_dim=False)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS) # Not important
critic = SlidingWindowTransformerCritic(FEAT_DIM) # Not important
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [4]:
from rl import inference, teleport
import torch

def get_distribution(ppo, obs_seq, actions_seq, actor_critic):
    dist = torch.distributions.Categorical(probs=torch.tensor([0.5, 0.25, 0.25]))
    return dist

all_obs = []
for i in range(12):
    event = teleport(controller)
    init_pos = event.metadata["agent"]["position"]
    obs = inference(get_distribution=get_distribution, controller=controller, ppo=ppo, init_position=init_pos, env=clip_env, actor_critic=clip_actor_critic, plot=False, n=32)
    all_obs.append(torch.stack(obs, dim=0))

all_obs_tensor = torch.cat(all_obs, dim=0)

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

In [6]:
embedding = encoder(all_obs_tensor.unsqueeze(0)).squeeze(0)

In [8]:
from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=FEAT_DIM)
pca.fit(embedding.detach().cpu().numpy())
W = pca.components_.astype(np.float32)

In [12]:
import torch
import matplotlib.pyplot as plt
from rl import save_actor_critic, Env, RolloutBuffer, MINIBATCHES, EPISODE_STEPS, DEVICE, teleport
import wandb
import numpy as np


def train(controller, name: str, ppo: PPO, env: Env, actor_critic: ActorCritic, total_updates=10):
    run = wandb.init(
        reinit="finish_previous",
        entity="viriyadhika1",
        project="cv-final-project",
        name=name,
        config={},
    )
    try:
        event = controller.step("Pass")  # prime
        rewards = []
        episode_rewards = []
        for upd in range(total_updates):
            buf = RolloutBuffer()
            for mb in range(MINIBATCHES):
                # collect episodes
                episode_seq = []
                episode_reward = 0
                actions_seq = []
                for t in range(1, EPISODE_STEPS + 1):
                    with torch.no_grad():
                        obs_t = ppo.obs_from_event(event)  # (C,H,W)
                        obs_t_encoded = actor_critic.actor_critic_encoder(obs_t.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0)
                        obs_seq = torch.stack(episode_seq + [obs_t_encoded], dim=0).unsqueeze(0).to(device=DEVICE)

                    if len(actions_seq) == 0:
                        actions_seq.append(torch.randint(0, NUM_ACTIONS, (1, 1)).item())

                    actions_tensor = torch.tensor(actions_seq, dtype=torch.long, device=DEVICE).unsqueeze(0)
                    logits, value = ppo.act_and_value(obs_seq, actions_tensor, actor_critic)
                    dist = torch.distributions.Categorical(logits=logits)
                    action_idx = dist.sample()
                    logp = dist.log_prob(action_idx)

                    action_idx, logp = action_idx.item(), logp.item()
                    event, reward = env.step_env(controller, action_idx)
                    done = t == EPISODE_STEPS

                    # store one step
                    buf.add(obs_t_encoded, action_idx, logp, reward, value, done)
                    episode_seq.append(obs_t_encoded)
                    actions_seq.append(action_idx)

                    wandb.log({ "reward": reward })

                    episode_reward += reward / EPISODE_STEPS

                    # 50% chance of teleport
                    if done:
                        env.reset()
                        if np.random.rand() > 0.5:
                            event = teleport(controller)

                wandb.log({ "episode_reward": episode_reward })

            ppo.ppo_update(buf, actor_critic)
            if (upd + 1) % 10 == 0:
                save_actor_critic(actor_critic, f"data/{name}_{upd}.pt")
            save_actor_critic(actor_critic, f"data/{name}.pt")

            print(f"Update {upd+1}/{total_updates} — steps: {len(buf)}")
    finally:
        run.finish()

    return buf, rewards, episode_rewards

In [None]:
from rl import PPO, CLIPNovelty, ClipEnv, ActorCritic
from models import FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic
from cons import FEAT_DIM, NUM_ACTIONS
from models import FrozenResNetPCAEncoder
from cons import DEVICE


ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetPCAEncoder(FEAT_DIM, torch.from_numpy(W), device=DEVICE)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [None]:
train(controller, "pca_resnet", ppo, clip_env, clip_actor_critic, 100)

[PPO] Epoch 0: Loss=63.8529, Policy=0.0028, Value=127.8073
[PPO] Epoch 10: Loss=58.0585, Policy=0.0342, Value=116.1520
[PPO] Epoch 20: Loss=44.9031, Policy=0.0139, Value=89.8846
[PPO] Epoch 30: Loss=39.3054, Policy=-0.0001, Value=78.7180
Approx KL Learned: 0.014651771634817123
[✅] Actor-Critic checkpoint saved to data/pca_resnet.pt
Update 1/100 — steps: 4096
[PPO] Epoch 0: Loss=87.4955, Policy=0.0033, Value=175.0931
[PPO] Epoch 10: Loss=63.6732, Policy=0.0078, Value=127.4387
[PPO] Epoch 20: Loss=52.0177, Policy=-0.0016, Value=104.1472
[PPO] Epoch 30: Loss=46.9801, Policy=-0.0001, Value=94.0685
Approx KL Learned: 0.005950332153588533
[✅] Actor-Critic checkpoint saved to data/pca_resnet.pt
Update 2/100 — steps: 4096
[PPO] Epoch 0: Loss=77.9881, Policy=0.0018, Value=156.0811
[PPO] Epoch 10: Loss=65.1128, Policy=0.0001, Value=130.3330
[PPO] Epoch 20: Loss=58.1767, Policy=-0.0024, Value=116.4656
[PPO] Epoch 30: Loss=53.0081, Policy=-0.0052, Value=106.1342
Approx KL Learned: 0.01072727143764

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f4a45c47690>>
Traceback (most recent call last):
  File "/home/juyuanli/miniconda3/envs/nav_assistant/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


BrokenPipeError: [Errno 32] Broken pipe



# Fine Tuning

In [7]:
from rl import load_actor_critic

load_actor_critic(clip_actor_critic, "data/pca_resnet.pt")

[🔁] Actor-Critic checkpoint loaded from data/pca_resnet.pt


In [13]:
for g in clip_actor_critic.optimizer.param_groups:
    g['lr'] = g['lr'] / 5

train(controller, "pca_resnet_fine_tune", ppo, clip_env, clip_actor_critic, 25)

[PPO] Epoch 0: Loss=140.6094, Policy=0.0125, Value=281.2484
[PPO] Epoch 10: Loss=139.1055, Policy=0.0073, Value=278.2513
[PPO] Epoch 20: Loss=137.5756, Policy=0.0172, Value=275.1718
[PPO] Epoch 30: Loss=135.2380, Policy=0.0169, Value=270.4968
Approx KL Learned: 0.019346196204423904
[✅] Actor-Critic checkpoint saved to data/pca_resnet_fine_tune.pt
Update 1/25 — steps: 4096
[PPO] Epoch 0: Loss=81.7055, Policy=0.0111, Value=163.4269
[PPO] Epoch 10: Loss=81.0130, Policy=0.0116, Value=162.0410
[PPO] Epoch 20: Loss=81.0892, Policy=0.0103, Value=162.1962
[PPO] Epoch 30: Loss=80.6467, Policy=0.0118, Value=161.3081
Approx KL Learned: 0.018562152981758118
[✅] Actor-Critic checkpoint saved to data/pca_resnet_fine_tune.pt
Update 2/25 — steps: 4096
[PPO] Epoch 0: Loss=65.4218, Policy=0.0119, Value=130.8434
[PPO] Epoch 10: Loss=64.8348, Policy=0.0058, Value=129.6814
[PPO] Epoch 20: Loss=64.3869, Policy=0.0111, Value=128.7751
[PPO] Epoch 30: Loss=63.7150, Policy=0.0051, Value=127.4432
Approx KL Learn

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/pca_resnet_fine_tune.pt
Update 25/25 — steps: 4096


0,1
episode_reward,▁▁▇██▇▇████████▅█▃███▇███▇▇████▆▇▃▆▂▇███
reward,▃▄▂▁▅▆▄▄▅▇▆▆▇▄▇▅▇▄▂▇▂▅▇▁▄▂▅▇▃▇▇▃▄▅▇▄▃▇▄█

0,1
episode_reward,0.6717
reward,0.80101


(<rl.RolloutBuffer at 0x7f067c45da90>, [], [])

In [14]:
controller.stop()

In [15]:
house = dataset["train"][8084]
controller = Controller(scene=house, snapToGrid=False, rotateStepDegrees=30)
event = controller.step("Pass")
spawn = event.metadata["agent"]["position"]



In [16]:
from rl import PPO, CLIPNovelty, ClipEnv, ActorCritic
from models import FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic
from cons import FEAT_DIM, NUM_ACTIONS

ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(project_to_out_dim=False)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS) # Not important
critic = SlidingWindowTransformerCritic(FEAT_DIM) # Not important
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

from rl import inference, teleport
import torch

def get_distribution(ppo, obs_seq, actions_seq, actor_critic):
    dist = torch.distributions.Categorical(probs=torch.tensor([0.5, 0.25, 0.25]))
    return dist

all_obs = []
for i in range(12):
    event = teleport(controller)
    init_pos = event.metadata["agent"]["position"]
    obs = inference(get_distribution=get_distribution, controller=controller, ppo=ppo, init_position=init_pos, env=clip_env, actor_critic=clip_actor_critic, plot=False, n=32)
    all_obs.append(torch.stack(obs, dim=0))

all_obs_tensor = torch.cat(all_obs, dim=0)

embedding = encoder(all_obs_tensor.unsqueeze(0)).squeeze(0)

from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=FEAT_DIM)
pca.fit(embedding.detach().cpu().numpy())
W = pca.components_.astype(np.float32)

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

In [17]:
from rl import PPO, CLIPNovelty, ClipEnv, ActorCritic
from models import FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic
from cons import FEAT_DIM, NUM_ACTIONS
from models import FrozenResNetPCAEncoder
from cons import DEVICE


ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetPCAEncoder(FEAT_DIM, torch.from_numpy(W), device=DEVICE)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [18]:
train(controller, "pca_resnet_8084", ppo, clip_env, clip_actor_critic, 50)

[PPO] Epoch 0: Loss=60.9980, Policy=0.0048, Value=122.0869
[PPO] Epoch 10: Loss=52.1532, Policy=0.0070, Value=104.3999
[PPO] Epoch 20: Loss=44.2574, Policy=-0.0020, Value=88.6206
[PPO] Epoch 30: Loss=37.2804, Policy=-0.0109, Value=74.6877
Approx KL Learned: 0.022272253409028053
[✅] Actor-Critic checkpoint saved to data/pca_resnet_8084.pt
Update 1/50 — steps: 4096
[PPO] Epoch 0: Loss=67.2917, Policy=0.0030, Value=134.6823
[PPO] Epoch 10: Loss=51.2293, Policy=-0.0003, Value=102.5662
[PPO] Epoch 20: Loss=41.5690, Policy=-0.0042, Value=83.2524
[PPO] Epoch 30: Loss=35.7544, Policy=-0.0063, Value=71.6274
Approx KL Learned: 0.013274455443024635
[✅] Actor-Critic checkpoint saved to data/pca_resnet_8084.pt
Update 2/50 — steps: 4096
[PPO] Epoch 0: Loss=66.3476, Policy=0.0033, Value=132.7949
[PPO] Epoch 10: Loss=49.3431, Policy=-0.0033, Value=98.8001
[PPO] Epoch 20: Loss=40.4885, Policy=-0.0057, Value=81.0957
[PPO] Epoch 30: Loss=34.9161, Policy=-0.0102, Value=69.9599
Approx KL Learned: 0.0151218

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/pca_resnet_8084_49.pt
[✅] Actor-Critic checkpoint saved to data/pca_resnet_8084.pt
Update 50/50 — steps: 4096


0,1
episode_reward,▁▁▁▂▂▂▃▄▄▄▄▅▅▆▅▅▅▅▅▅▅▅▅▅▄▆▄▅▆▆▆▅▆▅▆▆▅▇▄█
reward,▁▃▁▄▂▃▂▄▃▃▅▃▁▃▇▃▅▁▂▇▄▂▃▅▅▇▆▇▆█▂▅▃▃▆▅▇▂▄▅

0,1
episode_reward,0.57813
reward,0.83237


(<rl.RolloutBuffer at 0x7effe9665b10>, [], [])

In [19]:
for g in clip_actor_critic.optimizer.param_groups:
    g['lr'] = g['lr'] / 5

train(controller, "pca_resnet_fine_tune_8084", ppo, clip_env, clip_actor_critic, 25)

[PPO] Epoch 0: Loss=77.0599, Policy=0.0071, Value=154.1622
[PPO] Epoch 10: Loss=55.6439, Policy=0.0103, Value=111.3223
[PPO] Epoch 20: Loss=43.0944, Policy=0.0024, Value=86.2393
[PPO] Epoch 30: Loss=35.5967, Policy=-0.0009, Value=71.2520
Approx KL Learned: 0.022871196269989014
[✅] Actor-Critic checkpoint saved to data/pca_resnet_fine_tune_8084.pt
Update 1/25 — steps: 4096
[PPO] Epoch 0: Loss=171.2326, Policy=0.0075, Value=342.5032
[PPO] Epoch 10: Loss=96.8148, Policy=0.0066, Value=193.6698
[PPO] Epoch 20: Loss=69.7157, Policy=0.0027, Value=139.4819
[PPO] Epoch 30: Loss=52.2602, Policy=0.0035, Value=104.5765
Approx KL Learned: 0.03975043445825577
[✅] Actor-Critic checkpoint saved to data/pca_resnet_fine_tune_8084.pt
Update 2/25 — steps: 4096
[PPO] Epoch 0: Loss=42.9412, Policy=0.0064, Value=85.9063
[PPO] Epoch 10: Loss=28.8299, Policy=0.0032, Value=57.6894
[PPO] Epoch 20: Loss=18.7944, Policy=0.0012, Value=37.6231
[PPO] Epoch 30: Loss=15.4073, Policy=-0.0062, Value=30.8649
Approx KL Lea

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/pca_resnet_fine_tune_8084.pt
Update 25/25 — steps: 4096


0,1
episode_reward,▆▁▇█▇▇▇▄▅▅█████████▄█▇▅███▇██████▇█▅▅▆▆█
reward,▃█▇▅▃▃▆▁▇▅▇▄▅▆▅▆▃▅▃▄▆▇▇▅▄▄▅▄▃▃▄▇▅▃▅▇▂▅▃▇

0,1
episode_reward,0.56152
reward,0.95571


(<rl.RolloutBuffer at 0x7effe9262910>, [], [])