In [1]:
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

from sources.cl_nli.model import SimCSE
from sources.fallback_policy.replay import ReplayMemory, Transition
from sources.scienceworld import parse_beliefs

# Reading goldpath trajectories

In [2]:
goldpath_file = "/opt/data/scienceworld-goldpaths/goldsequences-0.json"

with open(goldpath_file) as f:
    json_data = json.load(f)

json_data = json_data['0']

json_data.keys()

dict_keys(['taskIdx', 'taskName', 'goldActionSequences'])

In [3]:
ckpt = "/opt/models/simcse_default/version_0/v0-epoch=4-step=18304-val_nli_loss=0.658-train_loss=0.551.ckpt"

model: SimCSE = SimCSE.load_from_checkpoint(ckpt).eval()
hf_model_name = model.hparams['hf_model_name']
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
simcse_tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-roberta-base")
simcse_model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-roberta-base").eval().to('cuda')

## Create memory buffer

In [5]:
def encode_custom(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    if include_cls:
        cls_token = tokenizer.cls_token
        texts = [cls_token] + texts
    pad_size = max_size - len(texts)
    padding = [tokenizer.pad_token for _ in range(pad_size)]
    texts = texts + padding
    tokenized_text = tokenizer(texts, padding='longest', truncation=True,
                               return_tensors='pt').to(model.device)
    embeddings = model.encode(tokenized_text).detach().unsqueeze(0)  # batch axis
    return embeddings


def encode_simcse(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    with torch.no_grad():
        if include_cls:
            cls_token = simcse_tokenizer.cls_token
            texts = [cls_token] + texts
        pad_size = max_size - len(texts)
        padding = [simcse_tokenizer.pad_token for _ in range(pad_size)]
        texts = texts + padding
        tokenized_text = simcse_tokenizer(texts, padding='longest', truncation=True,
                                          return_tensors='pt').to('cuda')
        embeddings = simcse_model(**tokenized_text).pooler_output.unsqueeze(0)
        return embeddings


def encode(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    return encode_custom(texts, max_size, include_cls)

In [34]:
gold_sequence = json_data['goldActionSequences'][0]['path']

goal = json_data['goldActionSequences'][0]['taskDescription'].split('.')[0]
variation_idx = json_data['goldActionSequences'][0]['variationIdx']
print(f"Goal: {goal} - variation {variation_idx}")

all_lens = []

trajectories_bdi = []

memory_buffer = ReplayMemory(1000)

last_reward = 0
observation = ""
for i, trajectory in enumerate(gold_sequence):
    look_around = trajectory['freelook']
    inventory = trajectory['inventory']
    belief_base = parse_beliefs(observation=observation, look=look_around, inventory=inventory)

    next_trajectory = gold_sequence[i + 1]
    next_belief_base = parse_beliefs(observation=next_trajectory['observation'],
                                     look=next_trajectory['freelook'],
                                     inventory=next_trajectory['inventory'])

    reward = float(trajectory['score']) - last_reward
    last_reward = float(trajectory['score'])
    is_done = trajectory['isCompleted']
    if is_done == 'true':
        next_state = ""
        print("finish")
        break
        # ou break de repente aqui
    if trajectory['action'] != 'look around':
        memory_buffer.push(
                Transition(
                        belief_base=encode(belief_base + [goal], include_cls=False),
                        num_beliefs=len(belief_base) + 1,  # including goal
                        action=encode([trajectory['action']], max_size=1, include_cls=False),
                        next_belief_base=encode(next_belief_base + [goal]),
                        num_next_beliefs=len(next_belief_base) + 1,
                        next_action=encode([next_trajectory['action']], max_size=1, include_cls=False),
                        reward=reward,
                        done=True if is_done == 'true' else False
                )
        )
    observation = trajectory['observation']

    #print(f"Step {i} - reward: {reward:.3f} - is_done: {is_done}  - action: {trajectory['action']}")

Goal: Your task is to boil water - variation 0
finish


# Train q-network using annotated trajectories

In [40]:
from sources.fallback_policy.model import SimpleQNetwork, QNetwork

#network = QNetwork(768, 768, n_blocks=3)
network = SimpleQNetwork(768, 768, 1)
network = network.to('cuda')
network.train()

num_parameters = sum(p.numel() for p in network.parameters() if p.requires_grad)
print(f"Number of parameters: {num_parameters}")
network

Number of parameters: 1771008


SimpleQNetwork(
  (belief_base_encoder): ModuleList(
    (0): Linear(in_features=768, out_features=768, bias=True)
  )
  (hidden): Linear(in_features=1536, out_features=768, bias=False)
  (q_value_layer): Linear(in_features=768, out_features=1, bias=False)
)

In [50]:
# q-learning 
GAMMA = 0.99
BATCH_SIZE = 16

optimizer = torch.optim.Adam(network.parameters(), lr=1e-4)


def contrastive_loss(belief_base_emb, num_belief_emb, action_emb):
    # USING CONTRASTIVE LOSS  (cross entropy) does not work well
    # maybe because we did not need to use a softmax function at the top of network to predict q-values
    batch_size, _, _ = belief_base_emb.size()

    belief_q_values = []
    for belief_idx in range(BATCH_SIZE):
        c_belief_base = belief_base_emb[belief_idx, :, :].unsqueeze(0)
        c_belief_base = c_belief_base.repeat(batch_size, 1, 1)
        q_values = network(belief_base=c_belief_base,
                           belief_base_sizes=[num_belief_emb[belief_idx] for _ in range(batch_size)],
                           action_tensors=action_emb)
        belief_q_values.append(q_values.squeeze(0))

    all_q_values = torch.cat(belief_q_values, dim=-1)
    labels = torch.zeros_like(all_q_values).to('cuda')
    labels = labels.fill_diagonal_(1)
    all_q_values = all_q_values.view(batch_size * batch_size, 1)
    labels = labels.view(batch_size * batch_size, 1)
    #return F.mse_loss(all_q_values, labels)
    return F.smooth_l1_loss(all_q_values, labels)


for epoch in range(300):
    batch = memory_buffer.sample(BATCH_SIZE)
    belief_base_emb = torch.cat([b.belief_base for b in batch], dim=0)
    batch_size, _, belief_dim = belief_base_emb.size()
    num_belief_emb = [b.num_beliefs for b in batch]
    actions = torch.cat([b.action for b in batch]).squeeze(1)  # removing mid axis [bs, ?, a_dim] -> [bs, a_dim]

    optimizer.zero_grad()
    loss = contrastive_loss(belief_base_emb, num_belief_emb, actions)
    if epoch % 100 == 0:
        print(f"epoch {epoch} - loss {loss.item(): .4f}")
    loss.backward()
    optimizer.step()
    #break
    #nn.utils.clip_grad_norm_(network.parameters(), 1.)

print(f"epoch {epoch} - loss {loss.item(): .4f}")
# simple network 0.0288

epoch 0 - loss  0.0233
epoch 100 - loss  0.0219
epoch 200 - loss  0.0218
epoch 299 - loss  0.0219


In [61]:
turn = 5
expected_action = gold_sequence[turn]['action']
print(expected_action)
annotated_belief_base = parse_beliefs(observation=gold_sequence[turn - 1]['observation'],
                                      look=gold_sequence[turn]['freelook'],
                                      inventory=gold_sequence[turn]['inventory']) + [goal]
#annotated_belief_base = ['This room is called the kitchen', 'You see a anchor', 'you see a metal pot'] + [goal]
print(len(annotated_belief_base))
annotated_belief_base

open cupboard
23


['This room is called the kitchen.',
 'You see a substance called soap',
 'You see a painting',
 'You see a counter. On the counter is: a bowl (containing a banana, a potato, a red apple, an orange), a drawer.',
 'You see a sink, which is turned off. In the sink is: nothing.',
 'You see a table. On the table is: a glass cup (containing nothing).',
 'You see a chair. On the chair is: nothing.',
 'You see a freezer. The freezer door is closed.',
 'You see a lighter',
 'You see a stopwatch, which is deactivated.',
 'You see a fridge. The fridge door is closed.',
 'You see a substance called air',
 'You see a cupboard. The cupboard door is closed.',
 'You see a oven, which is turned off. The oven door is closed.',
 'You see a glass jar (containing a substance called sodium chloride)',
 'You see the agent',
 'You see a stove, which is turned off. On the stove is: nothing.',
 'A door to the outside (that is open)',
 'A door to the bathroom (that is open)',
 'A door to the hallway (that is op

In [62]:
encoded_belief_base = encode(annotated_belief_base, max_size=len(annotated_belief_base) + 1, include_cls=False)
print(encoded_belief_base.size())
#candidate_actions = info['valid']
candidate_actions = ['focus on bedroom door', 'open door to kitchen', 'go to kitchen', 'deactivate sink'] + [
        expected_action]
encoded_actions = encode(candidate_actions, include_cls=False, max_size=len(candidate_actions))
encoded_actions = encoded_actions.squeeze(0)
print("encoded_actions: ", encoded_actions.size())
num_actions, action_dim = encoded_actions.size()
repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
q_values = network(belief_base=repeat_encoded_belief_base,
                   belief_base_sizes=[len(annotated_belief_base) + 1 for _ in range(num_actions)],
                   action_tensors=encoded_actions)
print(q_values.size())

values, idxs = torch.sort(q_values.squeeze(1), descending=True)
for i, idx in enumerate(idxs[:5]):
    print(f"act {candidate_actions[idx]} - q_value: {values[i]:.3f}")

torch.Size([1, 24, 768])
encoded_actions:  torch.Size([5, 768])
torch.Size([5, 1])
act open cupboard - q_value: 0.405
act deactivate sink - q_value: 0.055
act open door to kitchen - q_value: 0.008
act go to kitchen - q_value: -0.016
act focus on bedroom door - q_value: -0.084


In [57]:
# try to infer context by the similarity between beliefs and the action

In [58]:
action_idx = q_values.squeeze(1).argmax()
action = candidate_actions[action_idx]
action, encoded_actions[action_idx, :].size(), action_idx

('go to kitchen', torch.Size([768]), tensor(2, device='cuda:0'))

In [13]:
bb = encoded_belief_base.squeeze(0)
num_beliefs, belief_dim = bb.size()
emb_action = encoded_actions[action_idx, :].unsqueeze(0).repeat(num_beliefs, 1)
emb_action.size(), bb.size()

(torch.Size([24, 768]), torch.Size([24, 768]))

In [14]:
similarity = nn.CosineSimilarity(dim=-1)(emb_action, bb)
#similarity = similarity[similarity > 0.5]
top_similarity, idx = torch.sort(similarity, descending=True)
top_similarity, idx


(tensor([ 0.7226,  0.4007,  0.3132,  0.2898,  0.2477,  0.2329,  0.1887,  0.1622,
          0.1534,  0.1269,  0.1231,  0.0429,  0.0402,  0.0185,  0.0138,  0.0064,
         -0.0046, -0.0082, -0.0153, -0.0256, -0.0520, -0.1454, -0.1789, -0.2692],
        device='cuda:0'),
 tensor([ 4, 13, 20, 16,  9,  7, 10, 17, 11,  8,  6, 22, 12, 23, 18, 19, 21,  2,
          5, 14, 15,  1,  3,  0], device='cuda:0'))