In [53]:
import sys

import torch
from torch.utils.data import DataLoader

from transformers import RobertaTokenizer, T5ForConditionalGeneration
from dataloader import read_mathqapython, MathQAPython 

In [54]:
# Take a look at some data
data = read_mathqapython('data/mathqapython_dev.json')

In [55]:
data[35]

{'text': '# mr . kramer , the losing candidate in a two - candidate election , received 942,568 votes , which was exactly 25 percent of all votes cast . approximately what percent of the remaining votes would he need to have received in order to have won at least 50 percent of all the votes cast ? n0 = 942568.0 n1 = 25.0 n2 = 50.0',
 'code': 'n0 = 942568.0\nn1 = 25.0\nn2 = 50.0\nt0 = n2 / 100.0\nt1 = n1 / 100.0\nt2 = t0 - t1\nt3 = 1.0 - t1\nt4 = t2 / t3\nanswer = t4 * 100.0',
 'dsl_code': 'divide(n2,const_100)|divide(n1,const_100)|subtract(#0,#1)|subtract(const_1,#1)|divide(#2,#3)|multiply(#4,const_100)|',
 'reasoning': 'lets assume that candidate got 25 % votes and total votes is 100 . candidate won = 25 remaining = 75 to get 50 % , candidate requires 25 votes from 100 which is 25 % and 25 votes from 75 . 25 / 75 = 33.33 % which is approx 33 % . hence the answer is e',
 'answer': 33.33333333333333,
 'task_id': 35}

In [56]:
class MathQAPython(torch.utils.data.Dataset): 
    def __init__(self, instance_list, tokenizer, text_len, code_len): 
        self.data = instance_list 
        self.tokenizer = tokenizer 
        self.text_len = text_len
        self.code_len = code_len
    

    def __getitem__(self, idx): 
        instance = self.data[idx]
        text = instance['text']
        code = instance['code']
        answer = instance['answer']

        text_encode = self.tokenizer(text, max_length = self.text_len, pad_to_max_length=False, return_tensors='pt')
        code_encode = self.tokenizer(code, max_length = self.code_len, pad_to_max_length=False, return_tensors='pt')
        text_ids = text_encode['input_ids'].squeeze()
        text_mask = text_encode['attention_mask'].squeeze()
        code_ids = code_encode['input_ids'].squeeze()
        code_mask = code_encode['attention_mask'].squeeze()

        return {
                'text_ids': text_ids.to(dtype=torch.long), 
                'text_mask': text_mask.to(dtype=torch.long), 
                'code_ids': code_ids.to(dtype=torch.long), 
                'code_mask': code_mask.to(dtype=torch.long), 
                'answer': answer
                }


    def __len__(self): 
        return len(self.data)

In [57]:
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

test_set = MathQAPython(data, tokenizer, 256, 256)

loader = DataLoader(test_set, batch_size=1, shuffle=True)

In [58]:
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

In [68]:
for batch in loader: 
    ids = batch['text_ids']
    mask = batch['text_mask']
    generated_ids = model.generate(
        input_ids=ids, 
        attention_mask=mask, 
        max_length=512
        )
    print(tokenizer.decode(ids.squeeze(), skip_special_toeksn=True))
    print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

    break
    

<s># the income tax in country x is 8 % of any income up to $ 5000. for incomes over $ 5000, an 8 % tax is imposed on the first $ 5000, and a 10 % tax is imposed on the portion of the total income in excess of $ 5000. if perry paid $ 950 in income tax last month, then what was perry's income? n0 = 8.0 n1 = 5000.0 n2 = 5000.0 n3 = 8.0 n4 = 5000.0 n5 = 10.0 n6 = 5000.0 n7 = 950.0</s>
theincome taxthetheincomethethethethethethethethethethethethethethethethethethethethethe
