In [1]:
import prior

dataset = prior.load_dataset("procthor-10k")
dataset

Fetching reference HEAD


    pip install --upgrade ai2thor
Alternatively, to downgrade to the old version of ProcTHOR-10K, run:
   prior.load_dataset("procthor-10k", revision="ab3cacd0fc17754d4c080a3fd50b18395fae8647")


Loading train: 100%|██████████| 10000/10000 [00:00<00:00, 22668.38it/s]
Loading val: 100%|██████████| 1000/1000 [00:00<00:00, 22769.27it/s]
Loading test: 100%|██████████| 1000/1000 [00:00<00:00, 23143.92it/s]


DatasetDict(
    train=Dataset(
    dataset=procthor-dataset,
    size=10000,
    split=train
),
    val=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=val
),
    test=Dataset(
    dataset=procthor-dataset,
    size=1000,
    split=test
)
)

In [2]:
from ai2thor.controller import Controller


house = dataset["train"][3]
controller = Controller(scene=house, snapToGrid=False, rotateStepDegrees=30)
event = controller.step("Pass")
spawn = event.metadata["agent"]["position"]

In [3]:
from rl import PPO, CLIPNovelty, ClipEnv, ActorCritic
from models import FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic
from cons import FEAT_DIM, NUM_ACTIONS

ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetEncoder(project_to_out_dim=False)
actor = SlidingWindowTransformerActor(FEAT_DIM, NUM_ACTIONS) # Not important
critic = SlidingWindowTransformerCritic(FEAT_DIM) # Not important
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

In [9]:
from rl import inference, teleport
import torch

def get_distribution(ppo, obs_seq, actions_seq, actor_critic):
    dist = torch.distributions.Categorical(probs=torch.tensor([0.5, 0.25, 0.25]))
    return dist

all_obs = []
for i in range(12):
    event = teleport(controller)
    init_pos = event.metadata["agent"]["position"]
    obs = inference(get_distribution=get_distribution, controller=controller, ppo=ppo, init_position=init_pos, env=clip_env, actor_critic=clip_actor_critic, plot=False, n=32)
    all_obs.append(torch.stack(obs, dim=0))

all_obs_tensor = torch.cat(all_obs, dim=0)

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

<Figure size 200x6400 with 0 Axes>

In [10]:
embedding = encoder(all_obs_tensor.unsqueeze(0)).squeeze(0)

In [11]:
from sklearn.decomposition import PCA
import numpy as np

pca = PCA(n_components=FEAT_DIM)
pca.fit(embedding.detach().cpu().numpy())
W = pca.components_.astype(np.float32)

In [12]:
import torch
import matplotlib.pyplot as plt
from rl import save_actor_critic, Env, RolloutBuffer, MINIBATCHES, EPISODE_STEPS, DEVICE, teleport
import wandb
import numpy as np


def train(controller, name: str, ppo: PPO, env: Env, actor_critic: ActorCritic, total_updates=10):
    run = wandb.init(
        reinit="finish_previous",
        entity="viriyadhika1",
        project="cv-final-project",
        name=name,
        config={},
    )
    try:
        event = controller.step("Pass")  # prime
        rewards = []
        episode_rewards = []
        for upd in range(total_updates):
            buf = RolloutBuffer()
            for mb in range(MINIBATCHES):
                # collect episodes
                episode_seq = []
                episode_reward = 0
                actions_seq = []
                for t in range(1, EPISODE_STEPS + 1):
                    with torch.no_grad():
                        obs_t = ppo.obs_from_event(event)  # (C,H,W)
                        obs_t_encoded = actor_critic.actor_critic_encoder(obs_t.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0)
                        obs_seq = torch.stack(episode_seq + [obs_t_encoded], dim=0).unsqueeze(0).to(device=DEVICE)

                    if len(actions_seq) == 0:
                        actions_seq.append(torch.randint(0, NUM_ACTIONS, (1, 1)).item())

                    actions_tensor = torch.tensor(actions_seq, dtype=torch.long, device=DEVICE).unsqueeze(0)
                    logits, value = ppo.act_and_value(obs_seq, actions_tensor, actor_critic)
                    dist = torch.distributions.Categorical(logits=logits)
                    action_idx = dist.sample()
                    logp = dist.log_prob(action_idx)

                    action_idx, logp = action_idx.item(), logp.item()
                    event, reward = env.step_env(controller, action_idx)
                    done = t == EPISODE_STEPS

                    # store one step
                    buf.add(obs_t_encoded, action_idx, logp, reward, value, done)
                    episode_seq.append(obs_t_encoded)
                    actions_seq.append(action_idx)

                    wandb.log({ "reward": reward })

                    episode_reward += reward / EPISODE_STEPS

                    # 50% chance of teleport
                    if done:
                        env.reset()
                        if np.random.rand() > 0.5:
                            event = teleport(controller)

                wandb.log({ "episode_reward": episode_reward })

            ppo.ppo_update(buf, actor_critic)
            if (upd + 1) % 10 == 0:
                save_actor_critic(actor_critic, f"data/{name}_{upd}.pt")
            save_actor_critic(actor_critic, f"data/{name}.pt")

            print(f"Update {upd+1}/{total_updates} — steps: {len(buf)}")
    finally:
        run.finish()

    return buf, rewards, episode_rewards

In [13]:
from rl import PPO, CLIPNovelty, ClipEnv, ActorCritic
from models import FrozenResNetEncoder, SlidingWindowTransformerActor, SlidingWindowTransformerCritic, SharedSlidingWindowTransformer, SharedSlidingWindowTransformerActor, SharedSlidingWindowTransformerCritic
from cons import FEAT_DIM, NUM_ACTIONS
from models import FrozenResNetPCAEncoder
from cons import DEVICE


ENTROPY_COEF = 0.05

ppo = PPO(ENTROPY_COEF)
encoder = FrozenResNetPCAEncoder(FEAT_DIM, torch.from_numpy(W), device=DEVICE)
shared_transformer = SharedSlidingWindowTransformer(FEAT_DIM)
actor = SharedSlidingWindowTransformerActor(shared_transformer, NUM_ACTIONS)
critic = SharedSlidingWindowTransformerCritic(shared_transformer)
clip_novelty = CLIPNovelty()
clip_env = ClipEnv(clip_novelty)
clip_actor_critic = ActorCritic(encoder, actor, critic)

  return torch._dynamo.disable(fn, recursive)(*args, **kwargs)


In [14]:
for g in clip_actor_critic.optimizer.param_groups:
    g['lr'] = g['lr'] / 5

train(controller, "pca_resnet_sliding", ppo, clip_env, clip_actor_critic, 1)

[34m[1mwandb[0m: Currently logged in as: [33mviriyadhika-putra[0m ([33mviriyadhika1[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[PPO] Epoch 0: Loss=15.3450, Policy=-0.0061, Value=30.8040
[PPO] Epoch 10: Loss=8.9254, Policy=-0.0064, Value=17.9666
[PPO] Epoch 20: Loss=3.8680, Policy=0.0267, Value=7.7843
[PPO] Epoch 30: Loss=1.3783, Policy=0.0278, Value=2.8002
Approx KL Learned: 0.047790590673685074


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


[✅] Actor-Critic checkpoint saved to data/pca_resnet_sliding.pt
Update 1/1 — steps: 64


0,1
episode_reward,▅▄█▁
reward,▃▅▄▂▃▄▆▆▆▁▅▄▄▅▆▄▄▃▃▃▄▃▄▄▄▄▆▅▇█▁▃▄▄▆▃▂▁▁▁

0,1
episode_reward,0.12566
reward,-0.05184


(<rl.RolloutBuffer at 0x3462bfbd0>, [], [])