In [1]:
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer

from sources.cl_nli.model import SimCSE
from sources.fallback_policy.model import QNetwork
from sources.scienceworld import parse_beliefs

In [2]:


goldpath_file = "/opt/data/scienceworld-goldpaths/goldsequences-0.json"

with open(goldpath_file) as f:
    json_data = json.load(f)

json_data = json_data['0']

json_data.keys()

dict_keys(['taskIdx', 'taskName', 'goldActionSequences'])

In [3]:
trajectories = json_data['goldActionSequences'][0]['path']
goal = json_data['goldActionSequences'][0]['taskDescription'].split('.')[0]

all_lens = []

trajectories_bdi = []

last_reward = 0
for i, trajectory in enumerate(trajectories):
    observation = trajectory['observation']
    look_around = trajectory['freelook']
    inventory = trajectory['inventory']

    belief_base = parse_beliefs(observation=observation, look=look_around, inventory=inventory)

    next_trajectory = trajectories[i + 1]
    next_belief_base = parse_beliefs(observation=next_trajectory['observation'],
                                     look=next_trajectory['freelook'],
                                     inventory=next_trajectory['inventory'])

    all_lens.append(len(belief_base))
    action = trajectory['action']
    reward = float(trajectory['score']) - last_reward
    last_reward = float(trajectory['score'])
    is_done = trajectory['isCompleted']
    if is_done == 'true':
        next_state = ""
        print("finish")
        break
        # ou break de repente aqui

    trajectories_bdi.append({
            'belief_base': belief_base + [goal],
            'action': action,
            'next_belief_base': next_belief_base + [goal],
            'next_action': next_trajectory['action'],
            'reward': reward
    })

    print(f"Step {i} - reward: {reward:.3f} - is_done: {is_done} - score {float(trajectory['score']):.3f}")

Step 0 - reward: 0.000 - is_done: false - score 0.000
Step 1 - reward: 0.000 - is_done: false - score 0.000
Step 2 - reward: 0.000 - is_done: false - score 0.000
Step 3 - reward: 0.000 - is_done: false - score 0.000
Step 4 - reward: 0.000 - is_done: false - score 0.000
Step 5 - reward: 0.000 - is_done: false - score 0.000
Step 6 - reward: 0.000 - is_done: false - score 0.000
Step 7 - reward: 0.000 - is_done: false - score 0.000
Step 8 - reward: 0.000 - is_done: false - score 0.000
Step 9 - reward: 0.033 - is_done: false - score 0.033
Step 10 - reward: 0.000 - is_done: false - score 0.033
Step 11 - reward: 0.000 - is_done: false - score 0.033
Step 12 - reward: 0.667 - is_done: false - score 0.700
Step 13 - reward: 0.000 - is_done: false - score 0.700
Step 14 - reward: 0.000 - is_done: false - score 0.700
Step 15 - reward: 0.017 - is_done: false - score 0.717
Step 16 - reward: 0.017 - is_done: false - score 0.733
Step 17 - reward: 0.000 - is_done: false - score 0.733
Step 18 - reward: 0.

# Sampling a single trajectory

In [4]:
print(f"{len(trajectories_bdi[12]['belief_base'])} - {len(trajectories_bdi[12]['next_belief_base'])}")
trajectories_bdi[12]

23 - 23


{'belief_base': ['This room is called the kitchen.',
  'You see a substance called soap',
  'You see a painting',
  'You see a counter. On the counter is: a bowl (containing a banana, a potato, a red apple, an orange), a drawer.',
  'You see a sink, which is turned off. In the sink is: nothing.',
  'You see a table. On the table is: a glass cup (containing nothing).',
  'You see a chair. On the chair is: nothing.',
  'You see a freezer. The freezer door is closed.',
  'You see a lighter',
  'You see a stopwatch, which is deactivated.',
  'You see a fridge. The fridge door is closed.',
  'You see a substance called air',
  'You see a cupboard. The cupboard door is open. In the cupboard is: a tin cup (containing nothing), a ceramic cup (containing nothing), a drawer.',
  'You see a oven, which is turned off. The oven door is closed.',
  'You see a glass jar (containing a substance called sodium chloride)',
  'You see the agent',
  'You see a stove, which is turned off. On the stove is: n

In [5]:
ckpt = "/opt/models/simcse_default/version_0/v0-epoch=4-step=18304-val_nli_loss=0.658-train_loss=0.551.ckpt"

model: SimCSE = SimCSE.load_from_checkpoint(ckpt).eval()
hf_model_name = model.hparams['hf_model_name']
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Prepare tensors
- Embeds all belief trajectory and actions as well

In [6]:
example_trajectory = trajectories_bdi[12]

belief_base = example_trajectory['belief_base']
next_belief_base = example_trajectory['next_belief_base']

belief_intersection = [b for b in belief_base if b not in next_belief_base]
belief_intersection

['You focus on the water.']

In [7]:
def encode(texts: list[str]):
    tokenized_text = tokenizer(texts, padding='longest', truncation=True,
                               return_tensors='pt').to(model.device)
    return model.encode(tokenized_text).detach()


belief_base_emb = encode(example_trajectory['belief_base']).unsqueeze(0)
next_belief_base_emb = encode(example_trajectory['next_belief_base']).unsqueeze(0)
action_emb = encode(example_trajectory['action'])
next_action_emb = encode(example_trajectory['next_action'])

fake_action_emb = encode(["go to kitchen"])
fake_next_belief_base_emb = torch.randn(next_belief_base_emb.size()).to('cuda')

transition_a = (belief_base_emb, action_emb, next_belief_base_emb, next_action_emb, example_trajectory['reward'])
transition_b = (belief_base_emb, fake_action_emb, fake_next_belief_base_emb, fake_action_emb, 0)

batch_belief_base = torch.cat([transition_a[0], transition_b[0]]).to('cuda')
batch_action = torch.cat([transition_a[1], transition_b[1]]).to('cuda')
batch_next_belief_base = torch.cat([transition_a[2], transition_b[2]]).to('cuda')
batch_next_actions = torch.cat([transition_a[3], transition_b[3]]).to('cuda')
batch_reward = torch.tensor([transition_a[4], transition_b[4]], dtype=torch.float).to('cuda')
batch_reward

tensor([0.6667, 0.0000], device='cuda:0')

In [8]:
belief_base_emb.size()

torch.Size([1, 23, 768])

In [9]:
batch_size, num_beliefs, embedding_dim = belief_base_emb.size()

network = QNetwork(embedding_dim, embedding_dim, embedding_dim, n_blocks=2)
network = network.to('cuda')
network

QNetwork(
  (belief_base_encoder): BeliefBaseEncoder(
    (blocks): ModuleList(
      (0-1): 2 x BeliefTransformerBlock(
        (attention_dropout): Dropout(p=0.0, inplace=False)
        (output_dropout): Dropout(p=0.0, inplace=False)
        (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (qkv_proj_layer): Linear(in_features=768, out_features=2304, bias=False)
        (mlp): PositionWiseFF(
          (c_fc): Linear(in_features=768, out_features=768, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (hidden): Linear(in_features=1536, out_features=768, bias=False)
  (q_value_layer): Linear(in_features=768, out_features=1, bias=False)
)

In [55]:
# q-learning 
GAMMA = 0.99

num_parameters = sum(p.numel() for p in network.parameters() if p.requires_grad)
print(f"Number of parameters: {num_parameters}")

optimizer = torch.optim.Adam(network.parameters(), lr=1e-4)

for epoch in range(100):
    optimizer.zero_grad()
    # Q(s', a')
    next_q_values = network(belief_base=batch_next_belief_base,
                            belief_base_sizes=[num_beliefs],
                            action_tensors=batch_next_actions)
    best_next_q_values = next_q_values.squeeze(-1)
    targets = batch_reward + (GAMMA * best_next_q_values)

    # Q(s, a)
    q_values = network(belief_base=batch_belief_base, belief_base_sizes=[num_beliefs], action_tensors=batch_action)
    if epoch % 10 == 0:
        print(f"[{epoch}] q-values {q_values.squeeze(-1)}, targets {targets.squeeze(-1)}")
    loss = F.smooth_l1_loss(q_values.squeeze(-1), targets.detach())
    loss.backward()
    nn.utils.clip_grad_norm_(network.parameters(), 5.)
    optimizer.step()
    #break

Number of parameters: 7081728
[0] q-values tensor([ 2.8172, -0.9102], device='cuda:0', grad_fn=<SqueezeBackward1>), targets tensor([ 2.8176, -0.9106], device='cuda:0', grad_fn=<SqueezeBackward1>)
[10] q-values tensor([ 2.8851, -0.8148], device='cuda:0', grad_fn=<SqueezeBackward1>), targets tensor([ 2.8848, -0.8501], device='cuda:0', grad_fn=<SqueezeBackward1>)
[20] q-values tensor([ 3.0545, -0.8553], device='cuda:0', grad_fn=<SqueezeBackward1>), targets tensor([ 3.0526, -0.8607], device='cuda:0', grad_fn=<SqueezeBackward1>)
[30] q-values tensor([ 3.1043, -0.8943], device='cuda:0', grad_fn=<SqueezeBackward1>), targets tensor([ 3.0981, -0.8743], device='cuda:0', grad_fn=<SqueezeBackward1>)
[40] q-values tensor([ 3.0852, -0.9227], device='cuda:0', grad_fn=<SqueezeBackward1>), targets tensor([ 3.0814, -0.8810], device='cuda:0', grad_fn=<SqueezeBackward1>)
[50] q-values tensor([ 3.0745, -0.9214], device='cuda:0', grad_fn=<SqueezeBackward1>), targets tensor([ 3.0773, -0.8736], device='cuda:0

In [48]:
print(next_q_values)#.max(dim=-1),
next_q_values.size(), batch_reward.unsqueeze(1).size(), targets.size(), q_values.size(), best_next_q_values.size()

tensor([[1.4348],
        [0.2647]], device='cuda:0', grad_fn=<MmBackward0>)


(torch.Size([2, 1]),
 torch.Size([2, 1]),
 torch.Size([2]),
 torch.Size([2, 1]),
 torch.Size([2]))

In [52]:
next_q_values.squeeze(-1)

tensor([1.4348, 0.2647], device='cuda:0', grad_fn=<SqueezeBackward1>)

In [26]:
torch.nn.CosineSimilarity(dim=-1)(belief_base_emb.squeeze(0), next_belief_base_emb.squeeze(0)).mean()

tensor(0.9630, device='cuda:0')

In [13]:
example_trajectory['next_action'], example_trajectory['action']

('pour metal pot into metal pot', 'focus on substance in metal pot')