In [1]:
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

from sources.cl_nli.model import SimCSE
from sources.fallback_policy.replay import ReplayMemory, Transition
from sources.scienceworld import parse_beliefs

# Initializing q_network

In [2]:
from sources.fallback_policy.model import SimpleQNetwork, QNetwork

use_transformer = True
if use_transformer:
    network = QNetwork(768, 768, n_blocks=4)
    use_cls = True
else:
    network = SimpleQNetwork(768, 768, 1)
    use_cls = False
network = network.to('cuda')
network.train()

num_parameters = sum(p.numel() for p in network.parameters() if p.requires_grad)
print(f"Number of parameters: {num_parameters}")
network

Number of parameters: 12983040


QNetwork(
  (belief_base_encoder): BeliefBaseEncoder(
    (blocks): ModuleList(
      (0-3): 4 x BeliefTransformerBlock(
        (attention_dropout): Dropout(p=0.0, inplace=False)
        (output_dropout): Dropout(p=0.0, inplace=False)
        (layer_norm_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (qkv_proj_layer): Linear(in_features=768, out_features=2304, bias=False)
        (mlp): PositionWiseFF(
          (c_fc): Linear(in_features=768, out_features=768, bias=False)
          (gelu): GELU(approximate='none')
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (layer_norm_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (hidden): Linear(in_features=1536, out_features=768, bias=False)
  (q_value_layer): Linear(in_features=768, out_features=1, bias=False)
)

# Reading goldpath trajectories

In [3]:
goldpath_file = "/opt/data/scienceworld-goldpaths/goldsequences-0.json"

with open(goldpath_file) as f:
    json_data = json.load(f)

json_data = json_data['0']

json_data.keys()

dict_keys(['taskIdx', 'taskName', 'goldActionSequences'])

In [4]:
ckpt = "/opt/models/simcse_default/version_0/v0-epoch=4-step=18304-val_nli_loss=0.658-train_loss=0.551.ckpt"

model: SimCSE = SimCSE.load_from_checkpoint(ckpt).eval()
hf_model_name = model.hparams['hf_model_name']
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
simcse_tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-roberta-base")
simcse_model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-roberta-base").eval().to('cuda')

## Create memory buffer

In [6]:
def encode_custom(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    if include_cls:
        cls_token = tokenizer.cls_token
        texts = [cls_token] + texts
    pad_size = max_size - len(texts)
    padding = [tokenizer.pad_token for _ in range(pad_size)]
    texts = texts + padding
    tokenized_text = tokenizer(texts, padding='longest', truncation=True,
                               return_tensors='pt').to(model.device)
    embeddings = model.encode(tokenized_text).detach().unsqueeze(0)  # batch axis
    return embeddings


def encode_simcse(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    with torch.no_grad():
        if include_cls:
            cls_token = simcse_tokenizer.cls_token
            texts = [cls_token] + texts
        pad_size = max_size - len(texts)
        padding = [simcse_tokenizer.pad_token for _ in range(pad_size)]
        texts = texts + padding
        tokenized_text = simcse_tokenizer(texts, padding='longest', truncation=True,
                                          return_tensors='pt').to('cuda')
        embeddings = simcse_model(**tokenized_text).pooler_output.unsqueeze(0)
        return embeddings


def encode(texts: list[str], max_size: int = 25, include_cls: bool = True) -> torch.Tensor:
    return encode_simcse(texts, max_size, include_cls)
    #return encode_custom(texts, max_size, include_cls)

In [7]:
gold_sequence = json_data['goldActionSequences'][0]['path']

goal = json_data['goldActionSequences'][0]['taskDescription'].split('.')[0]
variation_idx = json_data['goldActionSequences'][0]['variationIdx']
print(f"Goal: {goal} - variation {variation_idx}")

all_lens = []

trajectories_bdi = []

memory_buffer = ReplayMemory(1000)

last_reward = 0
observation = ""
for i, trajectory in enumerate(gold_sequence):
    look_around = trajectory['freelook']
    inventory = trajectory['inventory']
    belief_base = parse_beliefs(observation=observation, look=look_around, inventory=inventory)

    next_trajectory = gold_sequence[i + 1]
    next_belief_base = parse_beliefs(observation=next_trajectory['observation'],
                                     look=next_trajectory['freelook'],
                                     inventory=next_trajectory['inventory'])

    reward = float(trajectory['score']) - last_reward
    last_reward = float(trajectory['score'])
    is_done = trajectory['isCompleted']
    if is_done == 'true':
        next_state = ""
        print("finish")
        break
        # ou break de repente aqui
        
    extra_beliefs = 2 if use_cls else 1
    if trajectory['action'] != 'look around':
        memory_buffer.push(
                Transition(
                        belief_base=encode(belief_base + [goal], include_cls=use_cls),
                        num_beliefs=len(belief_base) + extra_beliefs,  # including goal
                        action=encode([trajectory['action']], max_size=1, include_cls=False),
                        next_belief_base=encode(next_belief_base + [goal]),
                        num_next_beliefs=len(next_belief_base) + extra_beliefs,
                        next_action=encode([next_trajectory['action']], max_size=1, include_cls=False),
                        reward=reward,
                        done=True if is_done == 'true' else False
                )
        )
    observation = trajectory['observation']

    #print(f"Step {i} - reward: {reward:.3f} - is_done: {is_done}  - action: {trajectory['action']}")

Goal: Your task is to boil water - variation 0
finish


# Train q-network using annotated trajectories

### Initializing Network

In [8]:
# q-learning 
GAMMA = 0.99
BATCH_SIZE = 8

optimizer = torch.optim.Adam(network.parameters(), lr=1e-4)


def contrastive_loss(belief_base_emb, num_belief_emb, action_emb):
    # USING CONTRASTIVE LOSS  (cross entropy) does not work well
    # maybe because we did not need to use a softmax function at the top of network to predict q-values
    batch_size, _, _ = belief_base_emb.size()

    belief_q_values = []
    for belief_idx in range(BATCH_SIZE):
        c_belief_base = belief_base_emb[belief_idx, :, :].unsqueeze(0)
        c_belief_base = c_belief_base.repeat(batch_size, 1, 1)
        q_values = network(belief_base=c_belief_base,
                           belief_base_sizes=[num_belief_emb[belief_idx] for _ in range(batch_size)],
                           action_tensors=action_emb)
        belief_q_values.append(q_values.squeeze(0))

    all_q_values = torch.cat(belief_q_values, dim=-1)
    labels = torch.zeros_like(all_q_values).to('cuda')
    labels = labels.fill_diagonal_(1)
    all_q_values = all_q_values.view(batch_size * batch_size, 1)
    labels = labels.view(batch_size * batch_size, 1)
    #return F.mse_loss(all_q_values, labels)
    return F.smooth_l1_loss(all_q_values, labels)


for epoch in range(1000):
    batch = memory_buffer.sample(BATCH_SIZE)
    belief_base_emb = torch.cat([b.belief_base for b in batch], dim=0)
    batch_size, _, belief_dim = belief_base_emb.size()
    num_belief_emb = [b.num_beliefs for b in batch]
    actions = torch.cat([b.action for b in batch]).squeeze(1)  # removing mid axis [bs, ?, a_dim] -> [bs, a_dim]

    optimizer.zero_grad()
    loss = contrastive_loss(belief_base_emb, num_belief_emb, actions)
    if epoch % 100 == 0:
        print(f"epoch {epoch} - loss {loss.item(): .4f}")
    loss.backward()
    optimizer.step()
    #break
    #nn.utils.clip_grad_norm_(network.parameters(), 1.)

print(f"epoch {epoch} - loss {loss.item(): .4f}")
# simple network epoch 299 - loss  0.0219

epoch 0 - loss  0.0738
epoch 100 - loss  0.0544
epoch 200 - loss  0.0529
epoch 300 - loss  0.0488
epoch 400 - loss  0.0421
epoch 500 - loss  0.0497
epoch 600 - loss  0.0444
epoch 700 - loss  0.0420
epoch 800 - loss  0.0430
epoch 900 - loss  0.0360
epoch 999 - loss  0.0265


# Evaluating trajectory samples

In [9]:
network = network.eval()
turn = 2
expected_action = gold_sequence[turn]['action']
#print(expected_action)
annotated_belief_base = parse_beliefs(observation=gold_sequence[turn - 1]['observation'],
                                      look=gold_sequence[turn]['freelook'],
                                      inventory=gold_sequence[turn]['inventory']) + [goal]#['your task is to melt gallium']

annotated_belief_base

['This room is called the hallway.',
 'You see a picture',
 'You see a substance called air',
 'You see the agent',
 'A door to the green house (that is open)',
 'A door to the living room (that is open)',
 'A door to the art studio (that is open)',
 'A door to the kitchen (that is open)',
 'A door to the bedroom (that is open)',
 'A door to the workshop (that is open)',
 'In your inventory, you see: an orange ',
 'The door is already open.',
 'Your task is to boil water']

In [10]:
extra_beliefs = 2 if use_cls else 1
num_beliefs = len(annotated_belief_base) + extra_beliefs

encoded_belief_base = encode(annotated_belief_base, max_size=len(annotated_belief_base), include_cls=use_cls)

#candidate_actions = info['valid']
candidate_actions = ['focus on bedroom door', 'open door to kitchen', 'go to kitchen', 'look at the kitchen', 'go to door to the kitchen'] + [
        expected_action]
encoded_actions = encode(candidate_actions, include_cls=False, max_size=len(candidate_actions))
encoded_actions = encoded_actions.squeeze(0)
num_actions, action_dim = encoded_actions.size()
repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
q_values = network(belief_base=repeat_encoded_belief_base,
                   belief_base_sizes=[len(annotated_belief_base) + 1 for _ in range(num_actions)],
                   action_tensors=encoded_actions)

print(expected_action)
values, idxs = torch.sort(q_values.squeeze(1), descending=True)
print("Action space - Top 5:")
for i, idx in enumerate(idxs[:5]):
    print(f"\tCandidate Action: {candidate_actions[idx]} - q_value: {values[i]:.3f}")
    
selected_idx = idxs[0]
print(f"Predicted action: {candidate_actions[selected_idx]}")

go to kitchen
Action space - Top 5:
	Candidate Action: go to door to the kitchen - q_value: 0.735
	Candidate Action: open door to kitchen - q_value: 0.714
	Candidate Action: go to kitchen - q_value: 0.675
	Candidate Action: go to kitchen - q_value: 0.675
	Candidate Action: look at the kitchen - q_value: 0.639
Predicted action: go to door to the kitchen


In [11]:
from scienceworld import ScienceWorldEnv

network = network.eval()

In [12]:
env = ScienceWorldEnv()
variation_idx = json_data['goldActionSequences'][0]['variationIdx']


In [13]:
env.load("boil", variation_idx, "openDoors")
with torch.no_grad():
    max_steps = 1
    action = "look around"
    
    blacklist_action = []
    for i in range(max_steps):
        obs, reward, is_done, info = env.step(action)
        
        # if obs is non action that matches the input then
        #    remove action from info['valid']
        # else
        #    do nothing
        
        print(f"Step {i} - reward: {reward:.3f} - is_done: {is_done} - action: {action}")
        belief_base = parse_beliefs(observation=obs, look=info['look'], inventory=info['inv']) + [goal]
        belief_base = [b.replace("greenhouse", "green house") for b in belief_base]
        num_beliefs = len(belief_base) + extra_beliefs
        encoded_belief_base = encode(belief_base, max_size=len(belief_base))
        encoded_actions = encode(info['valid'], max_size=len(info['valid']), include_cls=False)
        encoded_actions = encoded_actions.squeeze(0)
        num_actions, action_dim = encoded_actions.size()
        repeat_encoded_belief_base = encoded_belief_base.repeat(num_actions, 1, 1)
        q_values = network(belief_base=repeat_encoded_belief_base,
                           belief_base_sizes=[num_beliefs],
                           action_tensors=encoded_actions)
        selected_action = q_values.argmax()
        action = info['valid'][selected_action.item()]


Step 0 - reward: 0.000 - is_done: False - action: look around


In [14]:
values, idxs = torch.sort(q_values.squeeze(1), descending=True)
top_k = 10
print(f"Action space - Top {top_k}:")
for i, idx in enumerate(idxs[:top_k]):
    print(f"\tCandidate Action: {info['valid'][idx]} - q_value: {values[i]:.3f}")


Action space - Top 10:
	Candidate Action: go to door to kitchen - q_value: 0.718
	Candidate Action: go to kitchen - q_value: 0.674
	Candidate Action: look at door to kitchen - q_value: 0.671
	Candidate Action: connect hallway to kitchen - q_value: 0.642
	Candidate Action: mix door to kitchen - q_value: 0.621
	Candidate Action: connect bedroom door to kitchen - q_value: 0.619
	Candidate Action: focus on door to kitchen - q_value: 0.598
	Candidate Action: connect kitchen to hallway - q_value: 0.598
	Candidate Action: connect hallway to door to kitchen - q_value: 0.587
	Candidate Action: look at kitchen - q_value: 0.584


In [15]:
info['valid'].index("go to kitchen")

358

In [16]:
print(len(belief_base) + extra_beliefs)
belief_base

14


['This room is called the hallway.',
 'You see the agent',
 'You see a substance called air',
 'You see a picture',
 'A door to the art studio (that is open)',
 'A door to the bedroom (that is open)',
 'A door to the green house (that is open)',
 'A door to the kitchen (that is open)',
 'A door to the living room (that is open)',
 'A door to the workshop (that is open)',
 'In your inventory, you see: an orange ',
 'Your task is to boil water']

In [17]:
[b for b in belief_base if b not in annotated_belief_base]

[]

In [18]:
[b for b in annotated_belief_base if b not in belief_base]

['The door is already open.']

# Generating Plans

In [19]:
import spacy

nlp = spacy.load("en_core_web_sm")

def get_nouns(text:str) -> list[str]:
    doc = nlp(text)
    return [token.lemma_ for token in doc if token.pos_ in ("NOUN", "PROPN")]

action_nouns = get_nouns(expected_action)

plan_context = []
for belief in annotated_belief_base:
    nouns = get_nouns(belief)
    if set(action_nouns) <= set(nouns):
        print(f"\nGoal: {goal}\nContext: {belief}. \nAction: {expected_action}")
        plan_context.append(belief)
    
plan = f"""
IF {goal}
CONSIDERING {'AND'.join(plan_context)}
THEN
{expected_action}
"""

print(plan)
    


Goal: Your task is to boil water
Context: A door to the kitchen (that is open). 
Action: go to kitchen

IF Your task is to boil water
CONSIDERING A door to the kitchen (that is open)
THEN
go to kitchen



In [20]:
candidate_context = [
        "you see a cupboard",
        "you don't see a cupboard",
        "you are not in the kitchen",
        "you are in the kitchen"
]

action = "you see a cupboard"

THRESHOLD = 0.7

for i, candidate in enumerate(candidate_context):
    embeddings = encode_simcse([candidate_context[i], action], max_size=2, include_cls=False)
    embeddings_a = embeddings[:, 0, :]
    embeddings_b = embeddings[:, 1, :]
    
    sim = nn.functional.cosine_similarity(embeddings_a, embeddings_b)
    print(f"Candidate context: {candidate} - Similarity: {sim.item():.4f} - [{sim.item() >= THRESHOLD}]")
    

Candidate context: you see a cupboard - Similarity: 1.0000 - [True]
Candidate context: you don't see a cupboard - Similarity: 0.5476 - [False]
Candidate context: you are not in the kitchen - Similarity: 0.3883 - [False]
Candidate context: you are in the kitchen - Similarity: 0.5414 - [False]


In [23]:
candidate_context = [
        "look at the kitchen",
        "open door to the kitchen",
        "go to kitchen",
        "connect hallway to kitchen"
]

action = "go to kitchen"

THRESHOLD = 0.7

for i, candidate in enumerate(candidate_context):
    embeddings = encode_simcse([candidate_context[i], action], max_size=2, include_cls=False)
    embeddings_a = embeddings[:, 0, :]
    embeddings_b = embeddings[:, 1, :]
    
    sim = nn.functional.cosine_similarity(embeddings_a, embeddings_b)
    print(f"Candidate context: {candidate} - Similarity: {sim.item():.4f} - [{sim.item() >= THRESHOLD}]")
    

Candidate context: look at the kitchen - Similarity: 0.7573 - [True]
Candidate context: open door to the kitchen - Similarity: 0.7459 - [True]
Candidate context: go to kitchen - Similarity: 1.0000 - [True]
Candidate context: connect hallway to kitchen - Similarity: 0.7748 - [True]


In [22]:
Candidate context: look at the kitchen - Similarity: 0.9610 - [True]
Candidate context: open door to the kitchen - Similarity: 0.5093 - [False]
Candidate context: go to the kitchen - Similarity: 1.0000 - [True]
Candidate context: connect hallway to kitchen - Similarity: 0.9258 - [True]

SyntaxError: invalid syntax (4119736769.py, line 1)