In [1]:
from tqdm.auto import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("/home/drdo/Caricatures/models/scan_dummy_tokens_gpt2/checkpoint-40000")

model = GPT2LMHeadModel.from_pretrained("/home/drdo/Caricatures/models/scan_dummy_tokens_gpt2/checkpoint-40000")

model.generation_config.max_length = 256

In [3]:
max_len = 9
dummy_token = "<empty>"

# command type maps
actions = {
    "walk": "I_WALK",
    "run": "I_RUN",
    "jump": "I_JUMP",
    "look": "I_LOOK",
    "turn": dummy_token,
    dummy_token: dummy_token,
    }

turns = {
    "around": "yyyy",
    "opposite": "yy",
    dummy_token: dummy_token
}

directions = {
    "right": "I_TURN_RIGHT",
    "left": "I_TURN_LEFT",
    dummy_token: dummy_token
}

nums = {
    "twice": "xx",
    "thrice": "xxx",
    dummy_token: dummy_token
}

conjs = ["and", "after", dummy_token]

# command structure
command_structure = {
    0: actions,
    1: turns,
    2: directions,
    3: nums,
    4: conjs,
    5: actions,
    6: turns,
    7: directions,
    8: nums,
}

In [4]:
dataset = load_dataset("scan", "simple", trust_remote_code=True)
column_names = dataset["train"].column_names
input_column = column_names[0]
output_column = column_names[1]

def add_empty_token(x):
    command_str = x[input_column]
    command = command_str.split()
    padded_command = []
    index = 0
    c = 0
    while index < max_len:
        expected_cs = command_structure[index]
        if c < len(command) and command[c] in expected_cs:
            padded_command.append(command[c])
            c += 1
        else:
            padded_command.append(dummy_token)
        index += 1
    
    x[input_column] = ' '.join(padded_command)
    return x

test_dataset = dataset["test"].map(
    add_empty_token,
    batched=False,
    desc="Running tokenizer on dataset",
)


In [5]:
test_dataset[1]

{'commands': 'run <empty> right twice after walk <empty> right twice',
 'actions': 'I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN'}

In [6]:
# generation_mode = GenerationMode.GREEDY_SEARCH

context = 'run <empty> right twice after walk <empty> right twice'
inputs = tokenizer(context+tokenizer.sep_token, return_tensors="pt")

output = model.generate(**inputs)[0]
#print(output)
#print(tokenizer.decode(output, skip_special_tokens=False))

#tokenizer.decode(output[0], skip_special_tokens=False).replace(context+tokenizer.sep_token, '')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


model_inputs from input_ids : torch.Size([1, 12])
None
output logits : torch.Size([1, 12, 50260])
torch.Size([1, 50260])
new input_ids : torch.Size([1, 13])

model_inputs from input_ids : torch.Size([1, 1])
past_key_values : 12
output logits : torch.Size([1, 1, 50260])
torch.Size([1, 50260])
new input_ids : torch.Size([1, 14])

model_inputs from input_ids : torch.Size([1, 1])
past_key_values : 12
output logits : torch.Size([1, 1, 50260])
torch.Size([1, 50260])
new input_ids : torch.Size([1, 15])

model_inputs from input_ids : torch.Size([1, 1])
past_key_values : 12
output logits : torch.Size([1, 1, 50260])
torch.Size([1, 50260])
new input_ids : torch.Size([1, 16])

model_inputs from input_ids : torch.Size([1, 1])
past_key_values : 12
output logits : torch.Size([1, 1, 50260])
torch.Size([1, 50260])
new input_ids : torch.Size([1, 17])

model_inputs from input_ids : torch.Size([1, 1])
past_key_values : 12
output logits : torch.Size([1, 1, 50260])
torch.Size([1, 50260])
new input_ids : tor

In [10]:
import torch
a = torch.tensor([ 5143, 220, 50259, 826, 5403, 706, 2513, 220, 50259, 826, 5403])
print(tokenizer.decode(a, skip_special_tokens=False))

run <empty> right twice after walk <empty> right twice


In [7]:
context = 'run <empty> right twice after walk <empty> right twice'
inputs = tokenizer(context, return_tensors="pt")
tokenizer.decode(model.generate(**inputs)[0], skip_special_tokens=False)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'run <empty> right twice after walk <empty> right twice<sep>I_TURN_RIGHT I_WALK I_TURN_RIGHT I_WALK I_TURN_RIGHT I_RUN I_TURN_RIGHT I_RUN<|endoftext|>'

In [18]:
testset = test_dataset.select(range(100))

In [19]:
count = 0
model.to("cuda")
bar = tqdm(range(len(testset)))
for example in testset:
    command = example['commands']
    label = example['actions']
    inputs = tokenizer(command+tokenizer.sep_token, return_tensors="pt").to("cuda")
    output = model.generate(**inputs)[0].to("cpu")
    output = tokenizer.decode(output, skip_special_tokens=False).replace(command+tokenizer.sep_token, '')
    output = output.replace(tokenizer.eos_token, '')
    if output == label:
        count += 1
    bar.update(1)

print(count/len(testset))

  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

1.0


In [None]:
# distilgpt2 : acc = 0.6406025824964132
# distilgpt2 : acc = 0.67 -> beam search (beam size = 3)
# gpt2 : acc = 0.810856049736968, 0.88, 