In [1]:
import json

import lightning as L
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, Features, Sequence, Value
from lightning import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger
from scienceworld import ScienceWorldEnv
from torch.utils.data import DataLoader

from sources.fallback_policy.encoder import HFEncoderModel, EncoderModel
from sources.fallback_policy.model import BeliefBaseEncoder, ContrastiveQNetwork
from sources.scienceworld import parse_beliefs

In [2]:
# Sentence encoder shared by the policy network; loaded once and placed on the GPU.
encoder_model = HFEncoderModel(
    "princeton-nlp/sup-simcse-roberta-base",
    device='cuda',
)



In [3]:
# Load the gold-path steps of the "boil" task, keep a single variation and
# order the remaining rows by turn.
goldpath_df = (
    pd.read_csv("/opt/data/scienceworld-goldpaths/trajectories_csv/tabular_task-1-boil.csv")
    # TODO: remove this filter (it currently restricts the data to variation 0)
    .loc[lambda frame: frame['variation_idx'] == 0]
    .sort_values("turn")
)
goldpath_df

Unnamed: 0,turn,look_around,observation,inventory,action,score,done,goal,task_description,fold,variation_idx,task_name,task_idx
0,1,"This room is called the hallway. In it, you se...",The door is already open.,"In your inventory, you see:\n\tan orange\n",open door to kitchen,0.0,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
1,2,"This room is called the hallway. In it, you se...",You move to the kitchen.,"In your inventory, you see:\n\tan orange\n",go to kitchen,0.0,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
2,3,"This room is called the kitchen. In it, you se...",You move the thermometer to the inventory.,"In your inventory, you see:\n\tan orange\n",pick up thermometer,0.0,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
3,4,"This room is called the kitchen. In it, you se...",The cupboard is now open.,"In your inventory, you see:\n\ta thermometer, ...",open cupboard,0.0,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
4,5,"This room is called the kitchen. In it, you se...",You move the metal pot to the inventory.,"In your inventory, you see:\n\ta thermometer, ...",pick up metal pot,0.0,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
5,6,"This room is called the kitchen. In it, you se...",You move the metal pot to the sink.,"In your inventory, you see:\n\ta thermometer, ...",move metal pot to sink,0.0,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
6,7,"This room is called the kitchen. In it, you se...",The sink is now activated.,"In your inventory, you see:\n\ta thermometer, ...",activate sink,0.033333,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
7,8,"This room is called the kitchen. In it, you se...",The sink is now deactivated.,"In your inventory, you see:\n\ta thermometer, ...",deactivate sink,0.033333,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
8,9,"This room is called the kitchen. In it, you se...",You move the metal pot to the inventory.,"In your inventory, you see:\n\ta thermometer, ...",pick up metal pot,0.033333,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0
9,10,"This room is called the kitchen. In it, you se...",You focus on the water.,"In your inventory, you see:\n\ta thermometer, ...",focus on substance in metal pot,0.7,False,Your task is to boil water,Your task is to boil water. For compounds with...,train,0,task-1-boil,0


# Loading Trajectories

In [4]:
# Turn every gold-path step into one training example: the belief base built
# for that step paired with the gold action recorded for it.
all_trajectories = []

# Process one variation (episode) at a time so that the action history and the
# carried-over observation are reset per episode. With a single variation in
# goldpath_df this yields exactly the same examples as one flat pass; once the
# variation filter is removed it prevents state leaking between episodes.
# groupby keeps the rows of each group in their existing (turn-sorted) order.
for _, episode_df in goldpath_df.groupby('variation_idx', sort=False):
    previous_actions = []
    observation = ""  # nothing has been observed before the first step of an episode
    # NOTE: `row` is intentionally left as a module-level loop variable; the
    # evaluation cell further down reads the last row from it.
    for _, row in episode_df.iterrows():
        belief_base = parse_beliefs(observation=observation, look=row['look_around'], inventory=row['inventory'])
        # Drop empty beliefs and append the goal text as one more belief.
        belief_base = [b for b in belief_base if len(b) > 0] + [row['goal']]
        # Add the (up to) five most recent actions as extra beliefs.
        for a in previous_actions[-5:]:
            belief_base.append(f"You executed the action {a['action']} at turn {a['turn']}")

        # NOTE(review): the +1 presumably reserves a slot for a CLS token — confirm against the model.
        belief_base_sizes = len(belief_base) + 1
        action = row['action']
        all_trajectories.append({
            'belief_base': belief_base,
            'action': action,
            'belief_base_sizes': belief_base_sizes,
        })

        previous_actions.append({
            'turn': row['turn'],
            'action': action,
        })

        # This row's observation feeds the belief base of the next step.
        observation = row['observation']


trajectories_pd = pd.DataFrame(all_trajectories)
dataset = Dataset.from_pandas(trajectories_pd, features=Features({
    "belief_base": Sequence(Value(dtype="string")),
    "action": Value(dtype="string"),
    "belief_base_sizes": Value(dtype="int32"),
}))

In [5]:
def collate_fn(data):
    """Gather a list of dataset samples into one batch of plain Python lists.

    Args:
        data: Samples, each a mapping with the keys ``'action'``,
            ``'belief_base_sizes'`` and ``'belief_base'``.

    Returns:
        A dict with the keys ``'actions'``, ``'belief_base_sizes'`` and
        ``'belief_base'``. Each value is a list holding one entry per sample,
        in the order the samples were given.
    """
    # Note from the original author (translated): the encoding has to be done
    # here, so that batches of ready-made vectors are delivered.
    batch = {'actions': [], 'belief_base_sizes': [], 'belief_base': []}
    for sample in data:
        batch['actions'].append(sample['action'])
        batch['belief_base_sizes'].append(sample['belief_base_sizes'])
        batch['belief_base'].append(sample['belief_base'])
    return batch
# Shuffled mini-batches of 8 samples, assembled by collate_fn.
dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn,
)

In [6]:
# Train the contrastive Q-network on the gold-path batches, logging to
# TensorBoard and keeping the two checkpoints with the best epoch train loss.
EPOCHS = 40
# NOTE(review): 768 is presumably the embedding width of the encoder — confirm.
model = ContrastiveQNetwork(768, encoder_model=encoder_model)

base_dir = "cl_step"
tb_logger = TensorBoardLogger(f"logs/{base_dir}")
tb_logger.log_hyperparams(model.hparams)
version = tb_logger.version
# The doubled braces keep {epoch}, {step} and {train_loss_epoch:.3f} as literal
# placeholders in the resulting template string.
filename = f"{base_dir}/version_{version}/v{version}-{{epoch}}-{{step}}-{{train_loss_epoch:.3f}}"
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename=filename,
    monitor='train_loss_epoch',
    save_top_k=2,
)

trainer = Trainer(
    max_epochs=EPOCHS,
    accelerator='gpu',
    logger=tb_logger,
    callbacks=[checkpoint_callback],
)
trainer.fit(model, dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                | Type              | Params
----------------------------------------------------------
0 | belief_base_encoder | BeliefBaseEncoder | 5.9 M 
1 | similarity_function | CosineSimilarity  | 0     
2 | linear_act          | Linear            | 590 K 
3 | linear_belief       | Linear            | 590 K 
----------------------------------------------------------
7.1 M     Trainable params
0         Non-trainable params


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=40` reached.


In [7]:
# Roll the trained policy out greedily in one ScienceWorld episode, printing
# one line per environment step and collecting the chosen actions in `plan`.
model = model.to('cuda')
model = model.eval()

env = ScienceWorldEnv()
# Read the episode metadata from the gold-path frame itself instead of from the
# `row` loop variable left behind by the trajectory-building cell, so this cell
# does not depend on leaked loop state. The last row is the one `row` held.
reference_row = goldpath_df.iloc[-1]
goal = reference_row['goal']
# Cast to a built-in int so the value handed to env.load does not depend on
# the scalar type pandas returns.
variation_idx = int(reference_row['variation_idx'])

env.load("boil", variation_idx, "openDoors")
with torch.no_grad():
    max_steps = 30
    action = "look around"

    plan = []
    previous_action = []
    for step in range(max_steps):
        obs, reward, is_done, info = env.step(action)

        print(f" => Step {step} - reward: {reward:.3f} - is_done: {is_done} - action: {action}")
        if is_done:
            # The episode has ended; do not keep acting in a finished environment.
            break

        # Build the belief base like the training examples: drop empty beliefs
        # and append the goal text as one more belief.
        beliefs = parse_beliefs(observation=obs, look=info['look'], inventory=info['inv'])
        belief_base = [b for b in beliefs if len(b) > 0] + [goal]
        # NOTE(review): the training cell does not apply this replacement —
        # confirm whether training and inference should agree here.
        belief_base = [b.replace("greenhouse", "green house") for b in belief_base]

        # Add the (up to) five most recent actions as extra beliefs.
        # NOTE(review): training records the 1-based `turn` column while this
        # loop records the 0-based step index — confirm the mismatch is intended.
        for a in previous_action[-5:]:
            belief_base.append(f"You executed the action {a['action']} at turn {a['turn']}")

        # Score every currently valid action and greedily take the best one.
        candidate_actions = info['valid']
        q_values = model.act(belief_base, candidate_actions=candidate_actions)
        selected_action = int(q_values.argmax(dim=-1)[0])
        action = candidate_actions[selected_action]

        plan.append(action)

        previous_action.append({
            'turn': step,
            'action': action,
        })

 => Step 0 - reward: 0.000 - is_done: False - action: look around
 => Step 1 - reward: 0.000 - is_done: False - action: go to kitchen
 => Step 2 - reward: 0.000 - is_done: False - action: pick up thermometer
 => Step 3 - reward: 0.000 - is_done: False - action: open cupboard
 => Step 4 - reward: 0.000 - is_done: False - action: pick up metal pot
 => Step 5 - reward: 0.000 - is_done: False - action: move metal pot to sink
 => Step 6 - reward: 3.000 - is_done: False - action: activate sink
 => Step 7 - reward: 0.000 - is_done: False - action: deactivate sink
 => Step 8 - reward: 0.000 - is_done: False - action: pick up metal pot
 => Step 9 - reward: 67.000 - is_done: False - action: focus on substance in metal pot
 => Step 10 - reward: 0.000 - is_done: False - action: pour metal pot into metal pot
 => Step 11 - reward: 0.000 - is_done: False - action: focus on metal pot
 => Step 12 - reward: 0.000 - is_done: False - action: move metal pot to stove
 => Step 13 - reward: 3.000 - is_done: F