In [1]:
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2LMHeadModel
import torch

In [2]:
#checkpoint = "/home/drdo/Caricatures/models/scan_dummy_tokens_gpt2/checkpoint-40000"
checkpoint = "/home/drdo/Caricatures/models/scan_distilgpt2/checkpoint-40000"

In [3]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = GPT2LMHeadModel.from_pretrained(checkpoint)

model.generation_config.max_length = 256

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
max_len = 9
dummy_token = "<empty>"

# command type maps
actions = {
    "walk": "I_WALK",
    "run": "I_RUN",
    "jump": "I_JUMP",
    "look": "I_LOOK",
    "turn": dummy_token,
    dummy_token: dummy_token,
    }

turns = {
    "around": "yyyy",
    "opposite": "yy",
    dummy_token: dummy_token
}

directions = {
    "right": "I_TURN_RIGHT",
    "left": "I_TURN_LEFT",
    dummy_token: dummy_token
}

nums = {
    "twice": "xx",
    "thrice": "xxx",
    dummy_token: dummy_token
}

conjs = ["and", "after", dummy_token]

# command structure
command_structure = {
    0: actions,
    1: turns,
    2: directions,
    3: nums,
    4: conjs,
    5: actions,
    6: turns,
    7: directions,
    8: nums,
}

In [5]:
dataset = load_dataset("scan", "simple", trust_remote_code=True)
column_names = dataset["train"].column_names
input_column = column_names[0]
output_column = column_names[1]

def add_empty_token(x):
    command_str = x[input_column]
    command = command_str.split()
    padded_command = []
    index = 0
    c = 0
    while index < max_len:
        expected_cs = command_structure[index]
        if c < len(command) and command[c] in expected_cs:
            padded_command.append(command[c])
            c += 1
        else:
            padded_command.append(dummy_token)
        index += 1
    
    x[input_column] = ' '.join(padded_command)
    return x

test_dataset = dataset["test"].map(
    add_empty_token,
    batched=False,
    desc="Running tokenizer on dataset",
)


In [6]:
test_dataset[1]

{'commands': 'run <empty> right twice after walk <empty> right twice',
 'actions': 'I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN'}

In [7]:
# generation_mode = GenerationMode.GREEDY_SEARCH

context = 'run <empty> right twice after walk <empty> right twice'
inputs = tokenizer(context+tokenizer.sep_token, return_tensors="pt")

output = model.generate(**inputs)[0]
output = tokenizer.decode(output, skip_special_tokens=False).replace(context+tokenizer.sep_token, '')
output = output.replace(tokenizer.eos_token, '')
print(output)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN


In [13]:
testset = test_dataset  #.select(range(100))

In [14]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model.to(device)

count = 0
bar = tqdm(range(len(testset)))
for example in testset:
    command = example['commands']
    label = example['actions']
    inputs = tokenizer(command+tokenizer.sep_token, return_tensors="pt").to(device)
    output = model.generate(**inputs)[0].to("cpu")
    output = tokenizer.decode(output, skip_special_tokens=False).replace(command+tokenizer.sep_token, '')
    output = output.replace(tokenizer.eos_token, '')
    if output == label:
        count += 1
    bar.update(1)

print(count/len(testset))

  0%|          | 0/4182 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

0.993065518890483


In [15]:
print(count/len(testset))

0.993065518890483


In [None]:
#35k -> 0.989
# 40k -> 0.992