In [1]:
import prior

dataset = prior.load_dataset("procthor-10k")
dataset

    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 11514.66it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 11999.91it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 12129.35it/s]


DatasetDict(
    train=Dataset(
    dataset=procthor-dataset,
    size=10000,
    split=train
),
    val=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=val
),
    test=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=test
)
)

In [2]:
from ai2thor.controller import Controller


house = dataset["train"][3]
controller = Controller(scene=house, snapToGrid=False, rotateStepDegrees=30)
event = controller.step("Pass")
spawn = event.metadata["agent"]["position"]



In [3]:
import numpy as np
def teleport(controller, target=None):
    event = controller.step("GetReachablePositions")
    reachable_positions = event.metadata["actionReturn"]
    # Pick a random target
    if target is None:
        target = np.random.choice(reachable_positions)

    event = controller.step(
        action="TeleportFull",
        x=target["x"],
        y=target["y"],
        z=target["z"],
        rotation={"x": 0, "y": 0, "z": 0},
        horizon=0,
        standing=True
    )

    return event


In [4]:
from rl import PPO, ActorCritic, Env, RolloutBuffer, ClipEnv, CLIPNovelty, ClipEnvNoCuriosity, ClipEnvNoPenalty
from models import LSTMActor, LSTMCritic, FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic, SmallCNNEncoder, CompleteFrozenResNetEncoder
from cons import MINIBATCHES, EPISODE_STEPS, FEAT_DIM, NUM_ACTIONS, DEVICE
import wandb

In [None]:
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/juyuanli/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mviriyadhika-putra[0m ([33mviriyadhika1[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [5]:
import torch
import matplotlib.pyplot as plt
from rl import save_actor_critic


def train(controller, name: str, ppo: PPO, env: Env, actor_critic: ActorCritic, total_updates=10):
    run = wandb.init(
        reinit="finish_previous",
        entity="viriyadhika1",
        project="cv-final-project",
        name=name,
        config={},
    )
    try:
        event = controller.step("Pass")  # prime
        rewards = []
        for upd in range(total_updates):
            buf = RolloutBuffer()
            for mb in range(MINIBATCHES):
                # collect episodes
                episode_seq = []
                episode_reward = 0
                actions_seq = []
                for t in range(1, EPISODE_STEPS + 1):
                    with torch.no_grad():
                        obs_t = ppo.obs_from_event(event)  # (C,H,W)
                        obs_encoded = actor_critic.actor_critic_encoder(obs_t.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0) # (D)
                        obs_seq = torch.stack(episode_seq + [obs_encoded], dim=0).to(device=DEVICE)

                    if len(actions_seq) == 0:
                        actions_seq.append(torch.randint(0, NUM_ACTIONS, (1, 1)).item())
                    
                    actions_tensor = torch.tensor(actions_seq, dtype=torch.long, device=DEVICE)
                    logits, value = ppo.act_and_value(obs_seq, actions_tensor, actor_critic)
                    dist = torch.distributions.Categorical(logits=logits)
                    action_idx = dist.sample()
                    logp = dist.log_prob(action_idx)
                    
                    action_idx, logp = action_idx.item(), logp.item()
                    event, reward = env.step_env(controller, action_idx)
                    done = t == EPISODE_STEPS

                    # store one step
                    buf.add(obs_t, action_idx, logp, reward, value, done)
                    episode_seq.append(obs_encoded)
                    actions_seq.append(action_idx)

                    wandb.log({ "reward": reward })
                    
                    episode_reward += reward / EPISODE_STEPS

                    # 50% chance of teleport
                    if done:
                        env.reset()
                        if np.random.rand() > 0.5:
                            event = teleport(controller)
                wandb.log({ "episode_reward": episode_reward })
                    
            ppo.ppo_update(buf, actor_critic)
            if (upd + 1) % 10 == 0:
                save_actor_critic(actor_critic, f"data/{name}_{upd}.pt")
            save_actor_critic(actor_critic, f"data/{name}.pt")
            
            print(f"Update {upd+1}/{total_updates} — steps: {len(buf)}")
    finally:
        run.finish()
    return buf, rewards

# Base Case

In [10]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [11]:
train(controller, "complete_frozen", ppo, clip_env, clip_actor_critic, 70)

[PPO] Epoch 0: Loss=59.9503, Policy=0.0163, Value=119.9654
[PPO] Epoch 10: Loss=47.9999, Policy=0.0010, Value=96.0957
[PPO] Epoch 20: Loss=38.8011, Policy=-0.0122, Value=77.7274
[PPO] Epoch 30: Loss=30.7381, Policy=-0.0163, Value=61.6093
Approx KL Learned: 0.02442093938589096
[✅] Actor-Critic checkpoint saved to data/complete_frozen.pt
Update 1/70 — steps: 4096
[PPO] Epoch 0: Loss=70.5181, Policy=0.0257, Value=141.0857
[PPO] Epoch 10: Loss=57.9266, Policy=0.0093, Value=115.9329
[PPO] Epoch 20: Loss=49.7256, Policy=0.0011, Value=99.5497
[PPO] Epoch 30: Loss=45.4388, Policy=-0.0059, Value=90.9885
Approx KL Learned: 0.01594380848109722
[✅] Actor-Critic checkpoint saved to data/complete_frozen.pt
Update 2/70 — steps: 4096
[PPO] Epoch 0: Loss=80.0377, Policy=0.0113, Value=160.1536
[PPO] Epoch 10: Loss=64.5547, Policy=-0.0002, Value=129.2093
[PPO] Epoch 20: Loss=59.1423, Policy=-0.0058, Value=118.3970
[PPO] Epoch 30: Loss=52.0829, Policy=-0.0088, Value=104.2845
Approx KL Learned: 0.018151357

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/complete_frozen.pt
Update 70/70 — steps: 4096


0,1
episode_reward,▂▄▄▄▃▅▅▃▇▅▅▆▆▆▅▇▂▁▄▆▅▄▂▆▄▃▇▆▇▇▇▇▇▇█▅▇▇▃▇
reward,▅▅▇▃▅▅▃▅▆▃▇▄▃▅▅▇▃█▃▅▃▅▅█▅▁▇▆▅▆▅▄▅▅▁▅▅▅▅▄

0,1
episode_reward,0.11117
reward,0.14491


(<rl.RolloutBuffer at 0x7f30fac9ec10>, [])

# Small CNN from scratch

In [12]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = SmallCNNEncoder(FEAT_DIM, device=DEVICE)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [13]:
train(controller, "cnn_scratch", ppo, clip_env, clip_actor_critic, 70)

[PPO] Epoch 0: Loss=73.7399, Policy=0.1012, Value=147.3774
[PPO] Epoch 10: Loss=60.1966, Policy=0.0166, Value=120.4663
[PPO] Epoch 20: Loss=49.2619, Policy=0.0184, Value=98.5938
[PPO] Epoch 30: Loss=44.2495, Policy=0.0075, Value=88.5905
Approx KL Learned: 0.016711343079805374
[✅] Actor-Critic checkpoint saved to data/cnn_scratch.pt
Update 1/70 — steps: 4096
[PPO] Epoch 0: Loss=78.9562, Policy=0.0143, Value=157.9920
[PPO] Epoch 10: Loss=75.9041, Policy=0.0097, Value=151.8974
[PPO] Epoch 20: Loss=68.6385, Policy=0.0103, Value=137.3648
[PPO] Epoch 30: Loss=67.8989, Policy=0.0061, Value=135.8940
Approx KL Learned: 0.017251770943403244
[✅] Actor-Critic checkpoint saved to data/cnn_scratch.pt
Update 2/70 — steps: 4096
[PPO] Epoch 0: Loss=93.0688, Policy=0.0099, Value=186.2264
[PPO] Epoch 10: Loss=72.2310, Policy=0.0079, Value=144.5544
[PPO] Epoch 20: Loss=69.4960, Policy=0.0037, Value=139.0918
[PPO] Epoch 30: Loss=60.6914, Policy=0.0015, Value=121.4874
Approx KL Learned: 0.011434593237936497

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/cnn_scratch.pt
Update 70/70 — steps: 4096


0,1
episode_reward,▂▂▂▂▂▂▃▃▂▂▄▃▁▄▂▄▃▄▄▂▄▂▁▇▅▆▆▆▆▅▇▂▂▃▄▇▂██▂
reward,▂▃▄▁▃▃▃▂▄▄▄▄▄█▁▁▄▅▃▄▄▇▇▂▅▇▂▁▄▆▆▄▅▆▇▂▅▄▆▅

0,1
episode_reward,0.09083
reward,-0.15833


(<rl.RolloutBuffer at 0x7f310c385d10>, [])

# Novelty no curiosity

In [14]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnvNoCuriosity(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [15]:
train(controller, "no_curiosity", ppo, clip_env, clip_actor_critic, 70)

[PPO] Epoch 0: Loss=73.4061, Policy=0.0488, Value=146.8185
[PPO] Epoch 10: Loss=62.1843, Policy=0.0139, Value=124.4409
[PPO] Epoch 20: Loss=52.0533, Policy=-0.0003, Value=104.2089
[PPO] Epoch 30: Loss=41.1023, Policy=-0.0120, Value=82.3326
Approx KL Learned: 0.021820876747369766
[✅] Actor-Critic checkpoint saved to data/no_curiosity.pt
Update 1/70 — steps: 4096
[PPO] Epoch 0: Loss=99.5681, Policy=0.0047, Value=199.2318
[PPO] Epoch 10: Loss=73.3735, Policy=0.0056, Value=146.8420
[PPO] Epoch 20: Loss=58.4018, Policy=-0.0008, Value=116.9102
[PPO] Epoch 30: Loss=43.6976, Policy=-0.0021, Value=87.5054
Approx KL Learned: 0.019348368048667908
[✅] Actor-Critic checkpoint saved to data/no_curiosity.pt
Update 2/70 — steps: 4096
[PPO] Epoch 0: Loss=92.3840, Policy=0.0067, Value=184.8607
[PPO] Epoch 10: Loss=69.4481, Policy=-0.0022, Value=139.0043
[PPO] Epoch 20: Loss=59.9321, Policy=-0.0059, Value=119.9814
[PPO] Epoch 30: Loss=53.8703, Policy=-0.0071, Value=107.8589
Approx KL Learned: 0.015879973

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/no_curiosity.pt
Update 70/70 — steps: 4096


0,1
episode_reward,▄▄▄▄▄▄▄▅▅▆▆▆▆▅▆▄▅▇▇▆▇▇▇▇▄▇▇▄█▇▂██▁█▃▆█▄▆
reward,▂▃▃▃▃▄▁▁▄▄▅▄▅▅▁▅▅▄▆▄▇▇▁▅▆▄▆▅▇▇▁▄▅█▃▁█▅▇▅

0,1
episode_reward,0.93719
reward,1.29184


(<rl.RolloutBuffer at 0x7f30fa926890>, [])

# No Penalty

In [16]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnvNoPenalty(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [17]:
train(controller, "no_penalty", ppo, clip_env, clip_actor_critic, 70)

[PPO] Epoch 0: Loss=65.4953, Policy=0.0857, Value=130.9178
[PPO] Epoch 10: Loss=51.6912, Policy=-0.0050, Value=103.4971
[PPO] Epoch 20: Loss=40.1727, Policy=-0.0121, Value=80.4742
[PPO] Epoch 30: Loss=34.3525, Policy=-0.0199, Value=68.8473
Approx KL Learned: 0.03267689049243927
[✅] Actor-Critic checkpoint saved to data/no_penalty.pt
Update 1/70 — steps: 4096
[PPO] Epoch 0: Loss=77.4911, Policy=0.0199, Value=155.0448
[PPO] Epoch 10: Loss=55.4245, Policy=0.0059, Value=110.9388
[PPO] Epoch 20: Loss=48.7491, Policy=-0.0028, Value=97.6073
[PPO] Epoch 30: Loss=44.1474, Policy=-0.0084, Value=88.4146
Approx KL Learned: 0.02021777629852295
[✅] Actor-Critic checkpoint saved to data/no_penalty.pt
Update 2/70 — steps: 4096
[PPO] Epoch 0: Loss=82.1502, Policy=0.0001, Value=164.4027
[PPO] Epoch 10: Loss=65.2993, Policy=-0.0067, Value=130.7158
[PPO] Epoch 20: Loss=56.9372, Policy=-0.0138, Value=114.0059
[PPO] Epoch 30: Loss=52.7530, Policy=-0.0180, Value=105.6455
Approx KL Learned: 0.0197936035692691

[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/no_penalty.pt
Update 70/70 — steps: 4096


0,1
episode_reward,▄▄▃▄▄▅▄▆▇▅▅█▇█▅▅▅▆▆▇▆█▅▁▁▁▂▅▅▆▆▅▅▄▅██▅▅▆
reward,▅▂▁▁▅▆▂▄▃▄▅▃▅▇▃▆▄▄▇▆▃▃▆▆▁▄▆▅▇█▁▃▄▃▇▆▇▃▃▁

0,1
episode_reward,0.29041
reward,0.04774


(<rl.RolloutBuffer at 0x7f310bd83850>, [])

# Low LR

In [8]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [None]:
train(controller, "complete_frozen_low_lr", ppo, clip_env, clip_actor_critic, 50)

[PPO] Epoch 0: Loss=56.8951, Policy=0.0692, Value=113.7568
[PPO] Epoch 10: Loss=53.8160, Policy=0.0124, Value=107.7013
[PPO] Epoch 20: Loss=50.8545, Policy=0.0058, Value=101.7988
[PPO] Epoch 30: Loss=48.0613, Policy=-0.0056, Value=96.2320
Approx KL Learned: 0.023636091500520706
[✅] Actor-Critic checkpoint saved to data/complete_frozen_low_lr.pt
Update 1/50 — steps: 4096
[PPO] Epoch 0: Loss=52.4439, Policy=0.0078, Value=104.9725
[PPO] Epoch 10: Loss=46.3417, Policy=0.0014, Value=92.7801
[PPO] Epoch 20: Loss=44.4827, Policy=-0.0026, Value=89.0699
[PPO] Epoch 30: Loss=42.6265, Policy=-0.0048, Value=85.3612
Approx KL Learned: 0.016063999384641647
[✅] Actor-Critic checkpoint saved to data/complete_frozen_low_lr.pt
Update 2/50 — steps: 4096
[PPO] Epoch 0: Loss=53.8225, Policy=0.0022, Value=107.7399
[PPO] Epoch 10: Loss=52.1844, Policy=-0.0007, Value=104.4716
[PPO] Epoch 20: Loss=50.1011, Policy=-0.0077, Value=100.3186
[PPO] Epoch 30: Loss=47.6393, Policy=-0.0116, Value=95.4034
Approx KL Lear

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f1b295f7650>>
Traceback (most recent call last):
  File "/home/juyuanli/miniconda3/envs/nav_assistant/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 781, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 
