In [1]:
import json

import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer

from sources.cl_nli.model import SimCSE
from sources.fallback_policy.model import QNetwork
from sources.scienceworld import parse_beliefs

from sources.fallback_policy.replay import ReplayMemory, Transition

# Reading goldpath trajectories

In [2]:
# Gold action sequences file used throughout this notebook.
goldpath_file = "/opt/data/scienceworld-goldpaths/goldsequences-0.json"

# Parse the JSON and keep only the entry stored under the top-level "0" key.
with open(goldpath_file) as f:
    json_data = json.load(f)['0']

json_data.keys()

dict_keys(['taskIdx', 'taskName', 'goldActionSequences'])

In [3]:
# SimCSE checkpoint that provides the sentence embeddings.
ckpt = "/opt/models/simcse_default/version_0/v0-epoch=4-step=18304-val_nli_loss=0.658-train_loss=0.551.ckpt"

# Restore the encoder and switch it to evaluation mode.
model: SimCSE = SimCSE.load_from_checkpoint(ckpt)
model.eval()

# The tokenizer is built from the backbone name recorded in the checkpoint's hparams.
hf_model_name = model.hparams['hf_model_name']
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create memory buffer

In [4]:
def encode(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    """Embed a list of sentences with the SimCSE model, padded to a fixed number of rows.

    Each entry of ``texts`` is tokenized and embedded as its own sentence. The list is
    optionally prefixed with the tokenizer's CLS token and then right-padded with the
    tokenizer's PAD token (each one embedded like any other sentence) until it has
    ``max_size`` entries.

    Args:
        texts: sentences to embed; the list itself is not modified.
        max_size: target number of rows, counting the CLS row when ``include_cls`` is set.
            Inputs that are already longer are neither padded nor truncated, so the result
            then has more than ``max_size`` rows.
        include_cls: whether to prepend the tokenizer's CLS token as an extra first entry.

    Returns:
        The output of ``model.encode`` with a leading batch axis of size 1 and no autograd
        history. In this notebook's recorded output, 13 sentences plus CLS give ``[1, 14, 768]``.
    """
    if include_cls:
        texts = [tokenizer.cls_token] + texts
    # Clamp at zero so that over-long inputs are explicitly left unpadded.
    pad_size = max(max_size - len(texts), 0)
    texts = texts + [tokenizer.pad_token] * pad_size
    tokenized_text = tokenizer(texts, padding='longest', truncation=True,
                               return_tensors='pt').to(model.device)
    # The encoder is only used as a frozen feature extractor here: skip building the
    # autograd graph instead of building it and throwing it away with detach().
    with torch.no_grad():
        embeddings = model.encode(tokenized_text)
    return embeddings.detach().unsqueeze(0)  # batch axis



In [5]:
# Fill the replay buffer with transitions taken from the first gold trajectory in the file.
gold_sequence = json_data['goldActionSequences'][0]['path']

# Only the first sentence of the task description is used as the goal text.
goal = json_data['goldActionSequences'][0]['taskDescription'].split('.')[0]
variation_idx = json_data['goldActionSequences'][0]['variationIdx']
print(f"Goal: {goal} - variation {variation_idx}")

all_lens = []

trajectories_bdi = []

memory_buffer = ReplayMemory(1000)

last_reward = 0
for i, trajectory in enumerate(gold_sequence):
    # The reward of a step is the score difference with respect to the previous step.
    reward = float(trajectory['score']) - last_reward
    last_reward = float(trajectory['score'])

    # The completion flag is compared against the string 'true'.
    is_done = trajectory['isCompleted']
    if is_done == 'true':
        print("finish")
        break
    if i + 1 >= len(gold_sequence):
        # Last recorded step and the episode never reported completion: there is no
        # successor step to build a transition from, so stop instead of indexing past the end.
        break

    belief_base = parse_beliefs(observation=trajectory['observation'],
                                look=trajectory['freelook'],
                                inventory=trajectory['inventory'])

    next_trajectory = gold_sequence[i + 1]
    next_belief_base = parse_beliefs(observation=next_trajectory['observation'],
                                     look=next_trajectory['freelook'],
                                     inventory=next_trajectory['inventory'])

    # NOTE(review): encode() also prepends a CLS row, so the encoded belief bases hold
    # len(...) + 2 non-padding rows, while the sizes stored below count len(...) + 1.
    # The later inspection cell passes a size that does include the CLS row - confirm
    # which convention the network expects.
    # NOTE(review): this pairs step i's action with step i's own beliefs, whereas the later
    # inspection cell pairs gold_sequence[turn]['action'] with the beliefs of step turn - 1.
    # Confirm which alignment is intended.
    memory_buffer.push(
            Transition(
                    belief_base=encode(belief_base + [goal]),
                    num_beliefs=len(belief_base) + 1,  # including goal
                    action=encode([trajectory['action']], max_size=1, include_cls=False),
                    next_belief_base=encode(next_belief_base + [goal]),
                    num_next_beliefs=len(next_belief_base) + 1,
                    next_action=encode([next_trajectory['action']], max_size=1, include_cls=False),
                    reward=reward,
                    # Always False here: the loop stops before a completed step is stored.
                    done=is_done == 'true'
            )
    )

    #print(f"Step {i} - reward: {reward:.3f} - is_done: {is_done}  - action: {trajectory['action']}")

Goal: Your task is to boil water - variation 0
finish


# Train Q-network using annotated trajectories

In [6]:
from sources.fallback_policy.model import SimpleQNetwork

# Q-network to be trained; the alternative architecture is kept below for reference.
#network = QNetwork(768, 768, n_blocks=3)
network = SimpleQNetwork(768, 768, 5).to('cuda')
network.train()

# Count only the parameters that receive gradients.
trainable_sizes = [p.numel() for p in network.parameters() if p.requires_grad]
num_parameters = sum(trainable_sizes)
print(f"Number of parameters: {num_parameters}")
network

Number of parameters: 4133376


SimpleQNetwork(
  (belief_base_encoder): ModuleList(
    (0-4): 5 x Linear(in_features=768, out_features=768, bias=True)
  )
  (hidden): Linear(in_features=1536, out_features=768, bias=False)
  (q_value_layer): Linear(in_features=768, out_features=1, bias=False)
)

In [7]:
# Hyper-parameters and optimizer for fitting the Q-network.
# NOTE(review): GAMMA is not referenced by the training code in this cell - presumably a
# discount factor kept for a later Q-learning update; confirm.
GAMMA = 0.99
BATCH_SIZE = 32  # number of transitions sampled from the replay buffer per optimisation step

# Adam with its default settings over all parameters of `network`.
optimizer = torch.optim.Adam(network.parameters())


def contrastive_loss(belief_base_emb, num_belief_emb, action_emb, positive_target: float = 10.0):
    """Regress the Q-values of every (belief base, action) pair in the batch onto a 0/target grid.

    Every belief base in the batch is scored against every action in the batch with the
    module-level ``network``. Pairs that come from the same transition (the diagonal of the
    resulting square matrix) are pushed towards ``positive_target``; all other pairs are
    pushed towards 0, using a mean-squared-error loss.

    Args:
        belief_base_emb: 3-D tensor whose first axis indexes the batch.
        num_belief_emb: per-sample sizes forwarded to the network as ``belief_base_sizes``.
        action_emb: action embeddings, one row per sample, in the same order as the beliefs.
        positive_target: regression target for matching pairs (defaults to the original 10).

    Returns:
        Scalar MSE loss over all ``batch_size * batch_size`` pairs.
    """
    batch_size, _, _ = belief_base_emb.size()

    belief_q_values = []
    # Iterate over the real batch size rather than a hard-coded 32.
    for belief_idx in range(batch_size):
        # Pair this single belief base with every action of the batch.
        c_belief_base = belief_base_emb[belief_idx].unsqueeze(0).repeat(batch_size, 1, 1)
        q_values = network(belief_base=c_belief_base,
                           belief_base_sizes=[num_belief_emb[belief_idx]] * batch_size,
                           action_tensors=action_emb)
        # Keep an explicit column shape so that a batch of size 1 still yields a 2-D matrix.
        belief_q_values.append(q_values.reshape(batch_size, 1))

    # Column i holds the Q-values of belief base i against all actions.
    all_q_values = torch.cat(belief_q_values, dim=-1)
    # zeros_like already matches the device and dtype of the predictions.
    labels = torch.zeros_like(all_q_values).fill_diagonal_(positive_target)
    return F.mse_loss(all_q_values, labels)
    


def _collate(transitions):
    """Stack sampled transitions into the inputs expected by contrastive_loss."""
    stacked_beliefs = torch.cat([t.belief_base for t in transitions], dim=0)
    belief_counts = [t.num_beliefs for t in transitions]
    # removing mid axis [bs, ?, a_dim] -> [bs, a_dim]
    stacked_actions = torch.cat([t.action for t in transitions]).squeeze(1)
    return stacked_beliefs, belief_counts, stacked_actions


for epoch in range(100):
    # Draw a fresh mini-batch from the replay buffer at every step.
    batch = memory_buffer.sample(BATCH_SIZE)
    belief_base_emb, num_belief_emb, actions = _collate(batch)
    batch_size, _, belief_dim = belief_base_emb.size()

    optimizer.zero_grad()
    loss = contrastive_loss(belief_base_emb, num_belief_emb, actions)
    # Report the loss every tenth step.
    if not epoch % 10:
        print(f"epoch {epoch} - loss {loss.item(): .4f}")
    loss.backward()
    optimizer.step()



epoch 0 - loss  3.1131
epoch 10 - loss  3.0232
epoch 20 - loss  3.0004
epoch 30 - loss  2.9661
epoch 40 - loss  2.9800
epoch 50 - loss  2.9573
epoch 60 - loss  3.0023
epoch 70 - loss  2.9714
epoch 80 - loss  2.9581
epoch 90 - loss  2.9841


In [10]:
# Inspect one gold step: the action at `turn` next to the beliefs parsed from the step before it.
turn = 1
print(gold_sequence[turn]['action'])

previous_step = gold_sequence[turn - 1]
observed_beliefs = parse_beliefs(observation=previous_step['observation'],
                                 look=previous_step['freelook'],
                                 inventory=previous_step['inventory'])
# The goal sentence is appended as the last belief.
annotated_belief_base = observed_beliefs + [goal]
#annotated_belief_base = ['This room is called the kitchen', 'You see a anchor', 'you see a metal pot'] + [goal]
print(len(annotated_belief_base))
annotated_belief_base

open door to kitchen
13


['This room is called the hallway.',
 'You see a picture',
 'You see a substance called air',
 'You see the agent',
 'A door to the green house (that is open)',
 'A door to the living room (that is open)',
 'A door to the art studio (that is open)',
 'A door to the kitchen (that is open)',
 'A door to the bedroom (that is open)',
 'A door to the workshop (that is open)',
 'In your inventory, you see: an orange ',
 '',
 'Your task is to boil water']

In [11]:
# Score a few hand-picked candidate actions against the annotated belief base.
num_belief_rows = len(annotated_belief_base) + 1  # +1 for the CLS row that encode() prepends
encoded_belief_base = encode(annotated_belief_base, max_size=num_belief_rows)
print(encoded_belief_base.size())

#candidate_actions = info['valid']
candidate_actions = ['focus on bedroom door', 'open door to kitchen', 'go to kitchen', 'pick up thermometer']
# Actions are embedded without CLS; the batch axis added by encode() is dropped again.
encoded_actions = encode(candidate_actions, include_cls=False,
                         max_size=len(candidate_actions)).squeeze(0)
print(encoded_actions.size())

# Repeat the single belief base once per candidate action.
num_actions, action_dim = encoded_actions.size()
repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
q_values = network(belief_base=repeat_encoded_belief_base,
                   belief_base_sizes=[num_belief_rows] * num_actions,
                   action_tensors=encoded_actions)
print(q_values.size())


# Show the (at most five) best-scoring actions, highest Q-value first.
values, idxs = torch.sort(q_values.squeeze(1), descending=True)
for value, idx in zip(values[:5], idxs[:5]):
    print(f"act {candidate_actions[idx]} - q_value: {value:.3f}")

torch.Size([1, 14, 768])
torch.Size([4, 768])
torch.Size([4, 1])
act go to kitchen - q_value: 0.715
act open door to kitchen - q_value: 0.487
act focus on bedroom door - q_value: 0.097
act pick up thermometer - q_value: -0.125


# Evaluating Q-network in the ScienceWorld environment