In [1]:
import prior

dataset = prior.load_dataset("procthor-10k")
dataset

Fetching reference HEAD


    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 23257.16it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 23577.02it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 24024.70it/s]


DatasetDict(
    train=Dataset(
    dataset=procthor-dataset,
    size=10000,
    split=train
),
    val=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=val
),
    test=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=test
)
)

In [2]:
from ai2thor.controller import Controller


house = dataset["train"][3]
controller = Controller(scene=house, snapToGrid=False, rotateStepDegrees=30)
event = controller.step("Pass")
spawn = event.metadata["agent"]["position"]

In [3]:
import numpy as np
def teleport(controller, target=None):
    event = controller.step("GetReachablePositions")
    reachable_positions = event.metadata["actionReturn"]
    # Pick a random target
    if target is None:
        target = np.random.choice(reachable_positions)

    event = controller.step(
        action="TeleportFull",
        x=target["x"],
        y=target["y"],
        z=target["z"],
        rotation={"x": 0, "y": 0, "z": 0},
        horizon=0,
        standing=True
    )

    return event


In [4]:
from rl import PPO, ActorCritic, Env, RolloutBuffer, ClipEnv, CLIPNovelty, ClipEnvNoCuriosity, ClipEnvNoPenalty
from models import LSTMActor, LSTMCritic, FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic, SmallCNNEncoder, CompleteFrozenResNetEncoder
from cons import MINIBATCHES, EPISODE_STEPS, FEAT_DIM, NUM_ACTIONS, DEVICE
import wandb

In [None]:
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/viriyadhika/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mviriyadhika-putra[0m ([33mviriyadhika1[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import torch
import matplotlib.pyplot as plt
from rl import save_actor_critic


def train(controller, name: str, ppo: PPO, env: Env, actor_critic: ActorCritic, total_updates=10):
    run = wandb.init(
        reinit="finish_previous",
        entity="viriyadhika1",
        project="cv-final-project",
        name=name,
        config={},
    )
    try:
        event = controller.step("Pass")  # prime
        rewards = []
        for upd in range(total_updates):
            buf = RolloutBuffer()
            for mb in range(MINIBATCHES):
                # collect episodes
                episode_seq = []
                episode_reward = 0
                actions_seq = []
                for t in range(1, EPISODE_STEPS + 1):
                    with torch.no_grad():
                        obs_t = ppo.obs_from_event(event)  # (C,H,W)
                        obs_encoded = actor_critic.actor_critic_encoder(obs_t.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0) # (D)
                        obs_seq = torch.stack(episode_seq + [obs_encoded], dim=0).to(device=DEVICE)

                    if len(actions_seq) == 0:
                        actions_seq.append(torch.randint(0, NUM_ACTIONS, (1, 1)).item())
                    
                    actions_tensor = torch.tensor(actions_seq, dtype=torch.long, device=DEVICE)
                    logits, value = ppo.act_and_value(obs_seq, actions_tensor, actor_critic)
                    dist = torch.distributions.Categorical(logits=logits)
                    action_idx = dist.sample()
                    logp = dist.log_prob(action_idx)
                    
                    action_idx, logp = action_idx.item(), logp.item()
                    event, reward = env.step_env(controller, action_idx)
                    done = t == EPISODE_STEPS

                    # store one step
                    buf.add(obs_t, action_idx, logp, reward, value, done)
                    episode_seq.append(obs_encoded)
                    actions_seq.append(action_idx)

                    wandb.log({ "reward": reward })
                    
                    episode_reward += reward / EPISODE_STEPS

                    # 50% chance of teleport
                    if done:
                        env.reset()
                        if np.random.rand() > 0.5:
                            event = teleport(controller)
                wandb.log({ "episode_reward": episode_reward })
                    
            ppo.ppo_update(buf, actor_critic)
            if (upd + 1) % 10 == 0:
                save_actor_critic(actor_critic, f"data/{name}_{upd}.pt")
            save_actor_critic(actor_critic, f"data/{name}.pt")
            
            print(f"Update {upd+1}/{total_updates} — steps: {len(buf)}")
    finally:
        run.finish()
    return buf, rewards

# Base Case

In [6]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [7]:
train(controller, "complete_frozen", ppo, clip_env, clip_actor_critic, 1)

[34m[1mwandb[0m: Currently logged in as: [33mviriyadhika-putra[0m ([33mviriyadhika1[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[PPO] Epoch 0: Loss=26.7420, Policy=0.0028, Value=53.5863
Approx KL Learned: 0.3665815591812134


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to complete_frozen.pt
Update 1/1 — steps: 128


0,1
episode_reward,█▁
reward,▄▃▃▄▅███▆▆▃▄▅▄▄▃▄▃▃▃▃▃▄▃▄▃▃▃▃▄▄▅▅▄▆▅▅▁▁▄

0,1
episode_reward,0.2
reward,-0.00396


(<rl.RolloutBuffer at 0x1731351d0>, [])

# Small CNN from scratch

In [9]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = SmallCNNEncoder(FEAT_DIM, device=DEVICE)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [10]:
train(controller, "cnn_scratch", ppo, clip_env, clip_actor_critic, 1)

[PPO] Epoch 0: Loss=14.4576, Policy=0.0038, Value=29.0027
Approx KL Learned: 0.24089230597019196


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to cnn_scratch.pt
Update 1/1 — steps: 128


0,1
episode_reward,█▁
reward,▂▄▃▃▃▃▂▃▃█▃▂▂▃▁▂▂▂▂▂▂▁▃▂▂▂▁▃▂▂▃▃▃▅▃▂▃▃▂▃

0,1
episode_reward,0.06357
reward,0.06107


(<rl.RolloutBuffer at 0x36391c9d0>, [])

# Novelty no curiosity

In [11]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnvNoCuriosity(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [12]:
train(controller, "no_curiosity", ppo, clip_env, clip_actor_critic, 1)

[PPO] Epoch 0: Loss=21.8092, Policy=0.0018, Value=43.7206


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Approx KL Learned: 0.26378151774406433
[✅] Actor-Critic checkpoint saved to no_curiosity.pt
Update 1/1 — steps: 128


0,1
episode_reward,▁█
reward,▄▁▃▃▄▅▅▅▅▅▄▄▄▃▃█▇▆▆▅▇▇▃▃▃▄▄▅▄▄▄▃▄▄▇▆▇▆▆▆

0,1
episode_reward,0.32579
reward,0.34801


(<rl.RolloutBuffer at 0x347f6b850>, [])

# No Penalty

In [14]:
ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(FEAT_DIM)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS)
critic = SlidingWindowTransformerCritic(FEAT_DIM)
clip_novelty = CLIPNovelty()
clip_env = ClipEnvNoPenalty(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [15]:
train(controller, "no_penalty", ppo, clip_env, clip_actor_critic, 1)

[PPO] Epoch 0: Loss=62.6088, Policy=0.0292, Value=125.2623


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Approx KL Learned: 0.06019662320613861
[✅] Actor-Critic checkpoint saved to no_penalty.pt
Update 1/1 — steps: 128


0,1
episode_reward,█▁
reward,▄▄▄▅▄▅▅▆▅▅▅▅▇▅▄▃▃▅▅▅▃▂▂█▃▄▄▄▄▅▄▃▃▂▁▂▁▁▁▂

0,1
episode_reward,0.1351
reward,0.03935


(<rl.RolloutBuffer at 0x363bed990>, [])