In [1]:
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer

from sources.cl_nli.model import SimCSE
from sources.fallback_policy.model import QNetwork
from sources.scienceworld import parse_beliefs

from sources.fallback_policy.replay import ReplayMemory, Transition

# Reading goldpath trajectories

In [2]:
goldpath_file = "/opt/data/scienceworld-goldpaths/goldsequences-0.json"

# Only the entry stored under the top-level key '0' is used by the rest of the notebook.
with open(goldpath_file) as f:
    json_data = json.load(f)['0']

json_data.keys()

dict_keys(['taskIdx', 'taskName', 'goldActionSequences'])

In [3]:
ckpt = "/opt/models/simcse_default/version_0/v0-epoch=4-step=18304-val_nli_loss=0.658-train_loss=0.551.ckpt"

# Restore the SimCSE encoder from the checkpoint, then put it in eval mode.
model: SimCSE = SimCSE.load_from_checkpoint(ckpt)
model = model.eval()

# The tokenizer is looked up from the model name stored in the checkpoint's hyper-parameters.
hf_model_name = model.hparams['hf_model_name']
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Create memory buffer

In [4]:
def encode(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    """Embed a list of sentences with the SimCSE model, padded to a fixed number of rows.

    Args:
        texts: Sentences to embed; each one becomes one embedding row. The input
            sequence is not modified.
        max_size: Target number of rows. Shorter inputs are filled up with the
            tokenizer's pad token; longer inputs are NOT truncated, so the result
            can have more than ``max_size`` rows.
        include_cls: When true, the tokenizer's CLS token is prepended as an extra
            "sentence" (it counts towards ``max_size``).

    Returns:
        The output of ``model.encode`` with a leading batch axis of size 1 added,
        detached from the autograd graph.
    """
    sentences = list(texts)
    if include_cls:
        sentences.insert(0, tokenizer.cls_token)
    # Never pad by a negative amount: oversized inputs are simply left unpadded.
    pad_size = max(max_size - len(sentences), 0)
    sentences.extend([tokenizer.pad_token] * pad_size)
    tokenized_text = tokenizer(sentences, padding='longest', truncation=True,
                               return_tensors='pt').to(model.device)
    # The result is always detached, so skip building the autograd graph at all
    # instead of building it and throwing it away.
    with torch.no_grad():
        embeddings = model.encode(tokenized_text)
    return embeddings.detach().unsqueeze(0)  # batch axis



In [11]:
first_gold = json_data['goldActionSequences'][0]
gold_sequence = first_gold['path']

# The goal is the first sentence of the task description.
goal = first_gold['taskDescription'].split('.')[0]
variation_idx = first_gold['variationIdx']
print(f"Goal: {goal} - variation {variation_idx}")

all_lens = []

trajectories_bdi = []

memory_buffer = ReplayMemory(1000)

# NOTE(review): every transition pairs the beliefs parsed from step i with the action
# stored in that same step, whereas a later cell pairs gold_sequence[turn]['action'] with
# the observation of gold_sequence[turn - 1] — confirm which alignment is intended.
last_reward = 0
for i, trajectory in enumerate(gold_sequence):
    observation = trajectory['observation']
    look_around = trajectory['freelook']
    inventory = trajectory['inventory']
    belief_base = parse_beliefs(observation=observation, look=look_around, inventory=inventory)

    # 'score' is treated as cumulative: the step reward is its increase since the previous step.
    reward = float(trajectory['score']) - last_reward
    last_reward = float(trajectory['score'])
    is_done = trajectory['isCompleted']  # a string; 'true' once the task is completed

    # Stop BEFORE looking at step i + 1: the completed step (and the last step of the
    # path) has no successor, so indexing gold_sequence[i + 1] first could raise IndexError.
    if is_done == 'true':
        next_state = ""
        print("finish")
        break
    if i + 1 >= len(gold_sequence):
        break

    next_trajectory = gold_sequence[i + 1]
    next_belief_base = parse_beliefs(observation=next_trajectory['observation'],
                                     look=next_trajectory['freelook'],
                                     inventory=next_trajectory['inventory'])

    memory_buffer.push(
            Transition(
                    belief_base=encode(belief_base + [goal]),
                    num_beliefs=len(belief_base) + 1,  # including goal
                    action=encode([trajectory['action']], max_size=1, include_cls=False),
                    next_belief_base=encode(next_belief_base + [goal]),
                    num_next_beliefs=len(next_belief_base) + 1,
                    next_action=encode([next_trajectory['action']], max_size=1, include_cls=False),
                    reward=reward,
                    # Always False here: the loop stops at the completed step without storing it.
                    done=(is_done == 'true')
            )
    )

Goal: Your task is to boil water - variation 0
finish


{'action': 'use thermometer in inventory on substance in metal pot',
 'observation': 'the thermometer measures a temperature of 98 degrees celsius',
 'score': '1.0',
 'isCompleted': 'true',
 'freelook': 'This room is called the kitchen. In it, you see: \n\ta substance called soap\n\ta painting\n\ta counter. On the counter is: a bowl (containing a banana, a potato, a red apple, an orange), a drawer.\n\ta sink, which is turned off. In the sink is: nothing.\n\ta table. On the table is: a glass cup (containing nothing).\n\ta chair. On the chair is: nothing.\n\ta freezer. The freezer door is closed. \n\ta lighter\n\ta stopwatch, which is deactivated. \n\ta fridge. The fridge door is closed. \n\ta substance called air\n\ta cupboard. The cupboard door is open. In the cupboard is: a tin cup (containing nothing), a ceramic cup (containing nothing), a drawer.\n\ta oven, which is turned off. The oven door is closed. \n\ta glass jar (containing a substance called sodium chloride)\n\tthe agent\n\ta

In [6]:
import pickle

# Serialize the replay buffer to memory_buffer.pkl in the current working directory.
with open('memory_buffer.pkl', 'wb') as pickle_file:
    pickle.dump(memory_buffer, pickle_file)


In [7]:
# Read the pickled buffer back into a separate name (memory_buffer_p); the original
# in-memory `memory_buffer` is left untouched.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file — only load
# pickles that were produced by this notebook / a trusted source.
with open('memory_buffer.pkl', 'rb') as pickle_file:
    memory_buffer_p = pickle.load(pickle_file)


# Train Q-network using annotated trajectories

In [9]:
from sources.fallback_policy.model import SimpleQNetwork

#network = QNetwork(768, 768, 768, n_blocks=5)
network = SimpleQNetwork(768, 768, 5)
# Put the Q-network on the encoder's device: encode() moves its inputs to model.device,
# so the embeddings fed to the network live there. A hard-coded 'cuda' fails on
# CPU-only machines and can diverge from the encoder's device.
network = network.to(model.device)
network.train()

# Count only the parameters that will receive gradients.
num_parameters = sum(p.numel() for p in network.parameters() if p.requires_grad)
print(f"Number of parameters: {num_parameters}")
network

Number of parameters: 1180416


SimpleQNetwork(
  (hidden): Linear(in_features=1536, out_features=768, bias=False)
  (q_value_layer): Linear(in_features=768, out_features=1, bias=False)
)

In [10]:
# q-learning
GAMMA = 0.99
BATCH_SIZE = 32
NUM_UPDATES = 100  # each iteration below is one gradient step on one sampled mini-batch
LOG_EVERY = 10
# Constant added to every sampled reward.
# NOTE(review): kept from the original experiment; with GAMMA = 0.99 a constant +1 per
# step pushes Q-values towards ~1 / (1 - GAMMA) = 100 — confirm this shift is intended.
REWARD_OFFSET = 1

optimizer = torch.optim.AdamW(network.parameters(), lr=1e-4)
# Build the scalar tensors on the device the Q-network actually lives on.
train_device = next(network.parameters()).device

for epoch in range(NUM_UPDATES):
    batch = memory_buffer.sample(BATCH_SIZE)
    belief_base_emb = torch.cat([b.belief_base for b in batch], dim=0)
    num_belief_emb = [b.num_beliefs for b in batch]
    next_belief_base_emb = torch.cat([b.next_belief_base for b in batch], dim=0)
    num_next_belief_emb = [b.num_next_beliefs for b in batch]
    rewards = torch.tensor([(b.reward + REWARD_OFFSET) for b in batch],
                           dtype=torch.float32, device=train_device)
    # 0 for terminal transitions so that no value is bootstrapped past the end of an episode.
    not_done = torch.tensor([0.0 if b.done else 1.0 for b in batch],
                            dtype=torch.float32, device=train_device)
    actions = torch.cat([b.action for b in batch]).squeeze(1)  # removing mid axis [bs, ?, a_dim] -> [bs, a_dim]
    next_actions = torch.cat([b.next_action for b in batch]).squeeze(1)  # same for the next actions

    # Q(s', a'): the target is a constant for the update, so no graph is needed for it.
    with torch.no_grad():
        next_q_values = network(belief_base=next_belief_base_emb,
                                belief_base_sizes=num_next_belief_emb,
                                action_tensors=next_actions)
        best_next_q_values = next_q_values.squeeze(-1)
        targets = rewards + (GAMMA * not_done * best_next_q_values)

    optimizer.zero_grad()
    # Q(s, a)
    q_values = network(belief_base=belief_base_emb, belief_base_sizes=num_belief_emb, action_tensors=actions)
    loss = F.smooth_l1_loss(q_values.squeeze(-1), targets)
    if epoch % LOG_EVERY == 0:
        print(f"[{epoch}] loss {loss.item()}")

    loss.backward()
    nn.utils.clip_grad_norm_(network.parameters(), 1.)
    optimizer.step()


[0] loss 0.501124918460846
[10] loss 0.5170617699623108
[20] loss 0.6482571363449097
[30] loss 0.4974153935909271
[40] loss 0.5479430556297302
[50] loss 0.6194901466369629
[60] loss 0.6764197945594788
[70] loss 0.6734042167663574
[80] loss 0.9031770825386047
[90] loss 0.7735987901687622


# Evaluating Q-network in scienceworld environment

In [189]:
from scienceworld import ScienceWorldEnv

# Switch the Q-network to evaluation mode before using it in the environment cells below.
network = network.eval()

In [190]:
# Create a ScienceWorld environment and load the "boil" task with the variation index
# recorded for the first gold action sequence.
env = ScienceWorldEnv()
variation_idx = json_data['goldActionSequences'][0]['variationIdx']
env.load("boil", variation_idx)

In [191]:
def select_action(obs, look, inv, goal, candidate_actions):
    """Return the candidate action with the highest Q-value for the given state.

    Args:
        obs: Forwarded to ``parse_beliefs`` as ``observation``.
        look: Forwarded to ``parse_beliefs`` as ``look``.
        inv: Forwarded to ``parse_beliefs`` as ``inventory``.
        goal: Goal sentence; appended to the parsed beliefs before encoding.
        candidate_actions: Non-empty list of action strings to score.

    Returns:
        The element of ``candidate_actions`` whose predicted Q-value is largest.

    Raises:
        ValueError: If ``candidate_actions`` is empty.
    """
    if not candidate_actions:
        raise ValueError("candidate_actions must contain at least one action")

    belief_base = parse_beliefs(observation=obs, look=look, inventory=inv) + [goal]
    # Pure inference: no gradients are needed to rank the candidates.
    with torch.no_grad():
        encoded_belief_base = encode(belief_base, max_size=len(belief_base) + 1)  # including CLS token
        # encode() has no `is_action` keyword; actions are encoded without the CLS row,
        # the same way they are encoded when the replay buffer is built.
        encoded_actions = encode(candidate_actions, max_size=len(candidate_actions), include_cls=False)
        encoded_actions = encoded_actions.squeeze(0)
        num_actions = encoded_actions.size(0)
        # One copy of the state per candidate so that all actions are scored in one batch.
        repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
        # One size entry per batch row (the belief base is repeated num_actions times).
        # NOTE(review): the count excludes the CLS row, as in the replay-buffer cell; a later
        # debugging cell passes len + 1 instead — confirm which one the network's pooling expects.
        q_values = network(belief_base=repeat_encoded_belief_base,
                           belief_base_sizes=[len(belief_base)] * num_actions,
                           action_tensors=encoded_actions)
    selected_action = q_values.argmax()
    return candidate_actions[selected_action.item()]


In [193]:
# Greedy rollout: step the environment, score every valid action, pick the best one.
with torch.no_grad():
    max_steps = 1
    action = "look around"
    for i in range(max_steps):
        obs, reward, is_done, info = env.step(action)
        # Log AFTER stepping so that reward / is_done describe this step instead of
        # whatever values an earlier cell left in those variables.
        print(f"Step {i} - reward: {reward:.3f} - is_done: {is_done} - action: {action}")
        belief_base = parse_beliefs(observation=obs, look=info['look'], inventory=info['inv']) + [goal]
        encoded_belief_base = encode(belief_base, max_size=len(belief_base))
        encoded_actions = encode(info['valid'], max_size=len(info['valid']), include_cls=False)
        encoded_actions = encoded_actions.squeeze(0)
        num_actions, action_dim = encoded_actions.size()
        # One copy of the state per valid action so that all of them are scored in one batch.
        repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
        # One size entry per batch row, not a single entry for the whole batch.
        q_values = network(belief_base=repeat_encoded_belief_base,
                           belief_base_sizes=[len(belief_base)] * num_actions,
                           action_tensors=encoded_actions)
        selected_action = q_values.argmax()
        action = info['valid'][selected_action.item()]
        if is_done:
            # Do not keep stepping an episode that has already finished.
            break


Step 0 - reward: 0.000 - is_done: False - action: look around


In [194]:
# Report the Q-value given to one specific valid action.
values, idxs = q_values.squeeze(1).sort(descending=True)
for q_value, idx in zip(values, idxs):
    action_name = info['valid'][idx]
    if action_name == 'open door to kitchen':
        print(f"act {action_name} - q_value: {q_value:.3f}")

act open door to kitchen - q_value: -7.182


In [195]:
# Show the ten valid actions with the highest predicted Q-values.
values, idxs = q_values.squeeze(1).sort(descending=True)
for q_value, idx in zip(values[:10], idxs[:10]):
    print(f"act {info['valid'][idx]} - q_value: {q_value:.3f}")

act connect agent to air - q_value: 105.128
act open door to kitchen - q_value: -7.182
act open door to workshop - q_value: -7.202
act connect door to kitchen to art studio door - q_value: -7.203
act connect door to living room to door to workshop - q_value: -7.206
act connect air to door to greenhouse - q_value: -7.208
act open art studio door - q_value: -7.209
act connect kitchen to door to greenhouse - q_value: -7.212
act connect art studio to door to kitchen - q_value: -7.213
act look at door to kitchen - q_value: -7.215


In [197]:
turn = 1
print(gold_sequence[turn]['action'])

# The beliefs are parsed from the step that precedes `turn`; the goal is appended last.
previous_step = gold_sequence[turn - 1]
annotated_belief_base = parse_beliefs(observation=previous_step['observation'],
                                      look=previous_step['freelook'],
                                      inventory=previous_step['inventory']) + [goal]
print(len(annotated_belief_base))
annotated_belief_base

open door to kitchen
13


['This room is called the hallway.',
 'You see a picture',
 'You see a substance called air',
 'You see the agent',
 'A door to the green house (that is open)',
 'A door to the living room (that is open)',
 'A door to the art studio (that is open)',
 'A door to the kitchen (that is open)',
 'A door to the bedroom (that is open)',
 'A door to the workshop (that is open)',
 'In your inventory, you see: an orange ',
 '',
 'Your task is to boil water']

In [202]:
# Score two hand-picked candidate actions against the annotated gold-path state.
#candidate_actions = info['valid']
candidate_actions = ['focus on bedroom door', 'open door to kitchen']

# +1 leaves room for the CLS row that encode() prepends.
encoded_belief_base = encode(annotated_belief_base, max_size=len(annotated_belief_base) + 1)
print(encoded_belief_base.size())

encoded_actions = encode(candidate_actions, include_cls=False, max_size=len(candidate_actions)).squeeze(0)
print(encoded_actions.size())

num_actions, action_dim = encoded_actions.size()
repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
q_values = network(belief_base=repeat_encoded_belief_base,
                   belief_base_sizes=[len(annotated_belief_base) + 1] * num_actions,
                   action_tensors=encoded_actions)
print(q_values.size())

torch.Size([1, 14, 768])
torch.Size([2, 768])
torch.Size([2, 1])


In [203]:
# Rank the candidate actions by predicted Q-value, highest first (at most five shown).
values, idxs = q_values.squeeze(1).sort(descending=True)
for q_value, idx in zip(values[:5], idxs[:5]):
    print(f"act {candidate_actions[idx]} - q_value: {q_value:.3f}")

act open door to kitchen - q_value: 105.626
act focus on bedroom door - q_value: 105.429


In [204]:
# Sort the Q-values in descending order and print the (at most) five best candidate actions.
# NOTE(review): this cell is identical to the one directly before it — candidate for removal.
values, idxs = torch.sort(q_values.squeeze(1), descending=True)
for i, idx in enumerate(idxs[:5]):
    print(f"act {candidate_actions[idx]} - q_value: {values[i]:.3f}")

act open door to kitchen - q_value: 105.626
act focus on bedroom door - q_value: 105.429


In [211]:
# Two beliefs that differ only in "open" vs "closed"; with the CLS entry the encoding has 3 rows.
embeddings_a = encode(['you see a door, which is open', 'you see a door, which is closed'], max_size=3)
# Row 0 is the CLS entry, rows 1 and 2 are the two sentences.
emb_a, emb_b = embeddings_a[0, 1, :], embeddings_a[0, 2, :]

enc_a = network.belief_base_encoder(embeddings_a, [3])

F.cosine_similarity(emb_a, emb_b, dim=-1)

tensor(-0.1126, device='cuda:0')

In [212]:
# Same comparison, this time "open" vs "not closed".
belief_test = ['you see a door, which is open', 'you see a door, which is not closed']
print(len(belief_test))

embeddings_b = encode(belief_test, max_size=3)
# Row 0 is the CLS entry, rows 1 and 2 are the two sentences.
emb_a, emb_b = embeddings_b[0, 1, :], embeddings_b[0, 2, :]

enc_b = network.belief_base_encoder(embeddings_b, [3])

F.cosine_similarity(emb_a, emb_b, dim=-1)

2


tensor(0.9416, device='cuda:0')

In [213]:
# Problem: the pooled belief representation comes out the same for different belief
# combinations (enc_a and enc_b are compared below), so the pooling method needs to be
# checked. This is the problem with training the RL policy.
# The sizes of BOTH inputs are reported (the second entry previously repeated embeddings_a).
nn.CosineSimilarity(dim=-1)(enc_a, enc_b), embeddings_a.size(), embeddings_b.size()

(tensor([1.0000], device='cuda:0', grad_fn=<SumBackward1>),
 torch.Size([1, 3, 768]),
 torch.Size([1, 3, 768]))

In [143]:
# Mean over axis 1 of embeddings_b, keeping only the elements that differ from the entry at index 0.
cls_emb = embeddings_b[:, 0, :]

# NOTE(review): this is an element-wise comparison, so any individual value that happens to
# equal the index-0 value is masked out as well — confirm this is the intended way to skip CLS.
mask = embeddings_b != cls_emb

masked_sum = (embeddings_b * mask).sum(dim=1)
mean_embedding_b = masked_sum / mask.sum(dim=1)
mean_embedding_b.size()

torch.Size([1, 768])

In [215]:
# Encode the same two beliefs again, this time padded up to 6 rows, and show the
# resulting size next to the number of input sentences.
embeddings_c = encode(belief_test, max_size=6)
embeddings_c.size(), len(belief_test)

(torch.Size([1, 6, 768]), 2)

In [217]:
# 0/1 mask with the same shape as embeddings_c: the first len(belief_test) rows are kept.
# NOTE(review): encode() prepends a CLS row by default, so the first len(belief_test) rows
# presumably cover CLS plus all but the last sentence — confirm the intended offset.
mask = torch.zeros_like(embeddings_c)
mask[:, :len(belief_test)] = 1

mask

tensor([[[1., 1., 1.,  ..., 1., 1., 1.],
         [1., 1., 1.,  ..., 1., 1., 1.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')

In [220]:
# Masked mean over axis 1: sum the rows selected by `mask` and divide by the number of
# selected rows (mask.sum counts them per feature).
y_mean = (embeddings_c * mask).sum(dim=1) / mask.sum(dim=1)
y_mean.size()

torch.Size([1, 768])

In [23]:
# Cosine similarity between the embeddings of two single action sentences.
# NOTE: this rebinds embeddings_a / embeddings_b, which earlier cells use for belief encodings.
embeddings_a, embeddings_b = (
    model.encode(
        tokenizer(sentence, padding='longest', truncation=True, return_tensors='pt').to(model.device)
    )
    for sentence in ("take metal pot inside the table", "look at metal pot inside the water")
)

F.cosine_similarity(embeddings_a, embeddings_b)

tensor([0.3574], device='cuda:0', grad_fn=<SumBackward1>)