In [2]:
import sys
import re

import torch
from torch.utils.data import DataLoader

from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from dataloader import read_mathqapython, MathQAPython 

# MathQA dataset class

In [3]:
class MathQAPython(torch.utils.data.Dataset): 
    def __init__(self, instance_list, tokenizer, text_len, code_len): 
        self.data = instance_list 
        self.tokenizer = tokenizer 
        self.text_len = text_len
        self.code_len = code_len
    

    def __getitem__(self, idx): 
        idx = idx + 2
        instance = self.data[idx]
        text = instance['text']
        code = instance['code']
        answer = instance['answer']

        text_encode = self.tokenizer(text, max_length=self.text_len, 
                                     padding='max_length', return_tensors='pt')
        code_encode = self.tokenizer(code, max_length=self.code_len, 
                                     padding='max_length', return_tensors='pt')
        text_ids = text_encode['input_ids'].squeeze()
        code_ids = code_encode['input_ids'].squeeze()

        return text_ids.to(dtype=torch.long), code_ids.to(dtype=torch.long), answer


    def __len__(self): 
        return len(self.data) 

# Few-shot evaluation

In [3]:
# Take a look at some data
data = read_mathqapython('data/mathqapython_dev.json')

In [4]:
data[35]

{'text': '# mr . kramer , the losing candidate in a two - candidate election , received 942,568 votes , which was exactly 25 percent of all votes cast . approximately what percent of the remaining votes would he need to have received in order to have won at least 50 percent of all the votes cast ? n0 = 942568.0 n1 = 25.0 n2 = 50.0',
 'code': 'n0 = 942568.0\nn1 = 25.0\nn2 = 50.0\nt0 = n2 / 100.0\nt1 = n1 / 100.0\nt2 = t0 - t1\nt3 = 1.0 - t1\nt4 = t2 / t3\nanswer = t4 * 100.0',
 'dsl_code': 'divide(n2,const_100)|divide(n1,const_100)|subtract(#0,#1)|subtract(const_1,#1)|divide(#2,#3)|multiply(#4,const_100)|',
 'reasoning': 'lets assume that candidate got 25 % votes and total votes is 100 . candidate won = 25 remaining = 75 to get 50 % , candidate requires 25 votes from 100 which is 25 % and 25 votes from 75 . 25 / 75 = 33.33 % which is approx 33 % . hence the answer is e',
 'answer': 33.33333333333333,
 'task_id': 35}

In [11]:
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer.pad_token = tokenizer.eos_token

In [12]:
few_shot_prompt = "\n\n".join([example['text'] + '\n' + example['code'] for example in data[0:12] ]) + '\n\n'
print(len(tokenizer(few_shot_prompt)['input_ids']))
print(few_shot_prompt)

1707
# a multiple choice test consists of 4 questions , and each question has 5 answer choices . in how many r ways can the test be completed if every question is unanswered ? n0 = 4.0 n1 = 5.0
n0 = 4.0
n1 = 5.0

answer = n1**min(n0, 5)

# the hcf and lcm of two numbers m and n are respectively 6 and 210 . if m + n = 72 , then 1 / m + 1 / n is equal to n0 = 6.0 n1 = 210.0 n2 = 72.0 n3 = 1.0 n4 = 1.0
n0 = 6.0
n1 = 210.0
n2 = 72.0
n3 = 1.0
n4 = 1.0
t0 = n0 * n1
answer = n2 / t0

# in a kilometer race , a beats b by 48 meters or 12 seconds . what time does a take to complete the race ? n0 = 48.0 n1 = 12.0
n0 = 48.0
n1 = 12.0
t0 = n0 / n1
t1 = 1.0 * 1000.0
t2 = t1 / t0
answer = t2 - n1

# in a school of 650 boys , 44 % of muslims , 28 % hindus , 10 % sikhs and the remaining of other communities . how many belonged to the other communities ? n0 = 650.0 n1 = 44.0 n2 = 28.0 n3 = 10.0
n0 = 650.0
n1 = 44.0
n2 = 28.0
n3 = 10.0
t0 = n1 + n2
t1 = n3 + t0
t2 = 100.0 - t1
t3 = n0 * t2
answer = t3 / 

In [13]:
test_set = MathQAPython(data, tokenizer, 256, 256)

loader = DataLoader(test_set, batch_size=1, shuffle=True)

In [14]:
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")

In [None]:
for batch in loader: 
    ids, gt, gt_answer = batch
    encoded_few_shot_prompt = tokenizer(few_shot_prompt, return_tensors="pt")['input_ids']
    few_shot_ids = torch.cat([encoded_few_shot_prompt, ids], axis=1)
    generated_ids = model.generate(
        input_ids=few_shot_ids, 
        do_sample=True,
        temperature=0.4, 
        max_length=2048
        )
    print("completion" + "#"*20)
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
    print("prompt" + "#"*20)
    print(tokenizer.decode(ids.squeeze(), skip_special_tokens=True))
    print('#'*20 + 'ground truth code')
    print(tokenizer.decode(gt.squeeze(), skip_special_tokens=True))

    break
    

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
start_idx = encoded_few_shot_prompt.size()[1]
completion = tokenizer.decode(generated_ids[0, start_idx:])
program = completion[:re.search('answer.*?\n', completion).span()[1]]
print(program)
loc={}
a = exec(code+last_line, globals(), loc)
print(loc['answer'])
print('ground truth answer: ', gt_answer)

AttributeError: 'NoneType' object has no attribute 'span'

# Training loop

In [12]:
from transformers import TrainingArguments, Trainer

In [13]:
data = read_mathqapython('data/mathqapython_train.json')
val_data = read_mathqapython('data/mathqapython_dev.json')
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
tokenizer.pad_token = tokenizer.eos_token
train_set = MathQAPython(data[0:10], tokenizer, 1024, 1024)
dev_set = MathQAPython(data[0:2], tokenizer, 1024, 1024)
# loader = DataLoader(train_set, batch_size=2, shuffle=True)

loading file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/vocab.json from cache at /home/zhangir/.cache/huggingface/transformers/08c00c4159e921d4c941ac75732643373aba509d9b352a82bbbb043a94058d98.a552555fdda56a1c7c9a285bccfd44ac8e4b9e26c8c9b307831b3ea3ac782b45
loading file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/merges.txt from cache at /home/zhangir/.cache/huggingface/transformers/12305762709d884a770efe7b0c68a7f4bc918da44e956058d43da0d12f7bea20.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/special_tokens_map.json from cache at /home/zhangir/.cache/huggingface/transformers/6c3239a63aaf46ec7625b38abfe41fc2ce0b25f90800aefe6526256340d4ab6d.2b8bf81243d08385c806171bc7ced6d2a0dcc7f896ca637f4e777418f7f0cc3c
loading file https://huggingface.co/EleutherAI/gpt-neo-

In [14]:
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")

loading configuration file https://huggingface.co/EleutherAI/gpt-neo-125M/resolve/main/config.json from cache at /home/zhangir/.cache/huggingface/transformers/29380fef22a43cbfb3d3a6c8e2f4fd951459584d87c34e4621b30580a54aca84.f0f7ebddfc6e15a23ac33e7fa95cd8cca05edf87cc74f9e3be7905f538a59762
Model config GPTNeoConfig {
  "activation_function": "gelu_new",
  "architectures": [
    "GPTNeoForCausalLM"
  ],
  "attention_dropout": 0,
  "attention_layers": [
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local",
    "global",
    "local"
  ],
  "attention_types": [
    [
      [
        "global",
        "local"
      ],
      6
    ]
  ],
  "bos_token_id": 50256,
  "embed_dropout": 0,
  "eos_token_id": 50256,
  "gradient_checkpointing": false,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "layer_norm_epsilon": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neo",
 

In [25]:
training_args = TrainingArguments(output_dir='./results', 
                                  num_train_epochs=2, 
                                  logging_steps=2, 
                                  save_steps=2, 
                                  per_device_train_batch_size=2, 
                                  per_device_eval_batch_size=2, 
                                  weight_decay=0.01, 
                                  logging_dir='./logs')

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
def data_collator(data):
    return {'input_ids': torch.stack([f[0] for f in data]),
            'labels': torch.stack([f[1] for f in data]) 
           }

In [27]:
Trainer(model=model, args=training_args, train_dataset=train_set, 
        eval_dataset=dev_set, data_collator=data_collator).train()

***** Running training *****
  Num examples = 8
  Num Epochs = 2
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 8


Step,Training Loss
2,7.968
4,2.1305
6,0.6566
8,0.4391


Saving model checkpoint to ./results/checkpoint-2
Configuration saved in ./results/checkpoint-2/config.json
Model weights saved in ./results/checkpoint-2/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-4
Configuration saved in ./results/checkpoint-4/config.json
Model weights saved in ./results/checkpoint-4/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-6
Configuration saved in ./results/checkpoint-6/config.json
Model weights saved in ./results/checkpoint-6/pytorch_model.bin
Saving model checkpoint to ./results/checkpoint-8
Configuration saved in ./results/checkpoint-8/config.json
Model weights saved in ./results/checkpoint-8/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8, training_loss=2.798557788133621, metrics={'train_runtime': 103.6475, 'train_samples_per_second': 0.154, 'train_steps_per_second': 0.077, 'total_flos': 8358627115008.0, 'train_loss': 2.798557788133621, 'epoch': 2.0})