In [23]:
import json
import pandas as pd
import torch
from tqdm import tqdm

from transformers import AutoModel, AutoTokenizer
from transformers import default_data_collator
from torch.utils.data import DataLoader

from sources.scienceworld import parse_beliefs, parse_goal

In [2]:
# Path to the gold action sequences; the numbers in the file name match the
# task ids that appear as top-level keys of the loaded JSON.
goldpath_file = "../data/goldsequences-0-1-2-3-4-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29.json"
# Alternative single-task file (presumably for quick experiments):
#goldpath_file = "../data/goldsequences-0.json"

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# locale default encoding.
with open(goldpath_file, encoding="utf-8") as fh:
    data = json.load(fh)

data.keys()

dict_keys(['0', '1', '2', '3', '4', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29'])

In [14]:
# Collect every distinct goal / belief sentence found in the gold trajectories,
# in first-seen order.
#
# A dict is used as an insertion-ordered set: the `x not in list` test used
# before rescans the whole list for every candidate, which is quadratic in the
# number of sentences. The resulting `sentence_list` is unchanged.
# Assumes parse_goal / parse_beliefs yield hashable values (they are passed to
# the tokenizer as text later on) — TODO confirm.
unique_sentences = {}

all_tasks = list(data.keys())
for task_id in tqdm(all_tasks):
    for trajectory in data[task_id]['goldActionSequences']:
        task = trajectory['taskDescription']

        goal = parse_goal(task)
        unique_sentences.setdefault(goal, None)
        for step in trajectory['path']:
            inventory = step['inventory']
            observation = step['observation']
            look = step['freelook']
            if step['action'] == 'look around':  # avoid including the freelook
                observation = ""
            beliefs = parse_beliefs(observation=observation, look=look, inventory=inventory)
            for b in beliefs:
                unique_sentences.setdefault(b, None)

# Dict keys iterate in insertion order, i.e. order of first appearance.
sentence_list = list(unique_sentences)

print("ok")
len(sentence_list) # 57179

100%|██████████| 29/29 [04:57<00:00, 10.27s/it]

ok





In [21]:
# Persist the unique sentences as a one-column CSV (header "sentence", no index
# column) so they can be reloaded without re-parsing the trajectories.
df = pd.DataFrame(sentence_list, columns=['sentence'])
df.to_csv(path_or_buf="all_beliefs.csv", index=False)

In [24]:
# Optional: reload the sentences saved earlier instead of recomputing them.
#df = pd.read_csv("all_beliefs.csv")

# Load tokenizer and encoder from one checkpoint name so the two cannot drift
# apart when the model is swapped.
simcse_checkpoint = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(simcse_checkpoint)
model = AutoModel.from_pretrained(simcse_checkpoint)

# Use the GPU when one is visible; otherwise stay on the CPU instead of
# raising on an unconditional .to('cuda').
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')


In [61]:
# Tokenize all sentences in a single call. Each row is padded or truncated to
# exactly 256 tokens and the result is returned as PyTorch tensors.
# NOTE(review): padding to the longest sentence instead of a fixed 256 would
# likely speed up the embedding pass — confirm before changing.
tokenized_sentence = tokenizer(
    sentence_list,
    truncation=True,
    max_length=256,
    padding="max_length",
    return_tensors="pt",
)

In [72]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of index-aligned tokenizer fields.

    ``sentences`` must provide ``.items()`` and an ``'input_ids'`` entry whose
    ``.size()`` unpacks into exactly two dimensions; the first dimension is
    reported as the dataset length.
    """

    def __init__(self, sentences):
        # Only a reference is stored; nothing is copied or validated here.
        self.sentences = sentences

    def __len__(self):
        # Two-value unpacking: 'input_ids' has to be 2-D.
        row_count, _width = self.sentences['input_ids'].size()
        return row_count

    def __getitem__(self, idx):
        # Pick the idx-th entry out of every field.
        sample = {}
        for field, values in self.sentences.items():
            sample[field] = values[idx]
        return sample

# Wrap the tokenizer output so a DataLoader can index it row by row; the
# trailing expression displays the number of rows.
train_dataset = CustomDataset(sentences=tokenized_sentence)
len(train_dataset)

57179

In [79]:
# Run every tokenized sentence through the encoder in batches of 256 and stack
# the pooled outputs into one tensor (one row per sentence, in dataset order).
dataloader = DataLoader(train_dataset, batch_size=256)

# Send each batch to wherever the model's weights live rather than repeating a
# hard-coded "cuda" string that could disagree with the model's actual device.
device = next(model.parameters()).device

all_embeddings = []
model.eval()  # evaluation mode for inference
with torch.no_grad():  # gradients are not needed here
    for batch in tqdm(dataloader):
        x = {k: b.to(device) for k, b in batch.items()}
        embeddings = model(**x)
        all_embeddings.append(embeddings.pooler_output)
sentences_embeddings = torch.cat(all_embeddings)

100%|██████████| 224/224 [08:10<00:00,  2.19s/it]


In [85]:
# Serialize the stacked sentence embeddings next to the notebook.
# NOTE(review): the tensor is saved on whatever device it currently lives on;
# loading it on a machine without that device may need map_location — confirm.
torch.save(obj=sentences_embeddings, f="all_beliefs_embeddings.pt")