In [29]:
%load_ext autoreload
%autoreload 2

from project.data.data import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Preprocess raw data:

In [4]:
# Instantiate the project's raw-data processor and run it once.
# NOTE(review): "RACE" and "LON" look like the input and output locations
# (hparams.data_path is also "LON") -- confirm against RaceDataProcessor.
processor = RaceDataProcessor()
processor.process_data("RACE", "LON")

## Prepare datasets:

##### All of the following steps should be run from the command line. They are reproduced here only as a quick-and-dirty workaround for working in Jupyter.

In [30]:
from collections import namedtuple

# Immutable bundle of the settings handed to the data module.
# NOTE(review): presumably mirrors the command-line arguments of the real
# training script -- confirm.
Hparams = namedtuple(
    "Hparams",
    "data_path batch_size num_workers special_tokens pretrained_model",
)

hparams = Hparams(
    data_path="LON",
    batch_size=16,
    num_workers=6,
    special_tokens=["[CON]", "[QUE]", "[ANS]", "[DIS]"],
    pretrained_model="bert-base-cased",
)

In [11]:
def customed_collate_fn(batch, tokenizer):
    """"""
    import torch
    articles = []
    questions = []
    answers = []
    distractors = []

    for item in batch:
        articles.append(" ".join(["<answer>", item["answer"], "<context>", item["article"]]))
        questions.append(item["question"])
    articles = tokenizer(articles, padding=True, 
                                       truncation=True, 
                                       return_tensors="pt", 
                                       pad_to_max_length=True, 
                                       max_length=512)
    questions = tokenizer(questions, padding=True, 
                                       truncation=True, 
                                       return_tensors="pt", 
                                       pad_to_max_length=True, 
                                       max_length=512)
    articles['input_ids'] = torch.squeeze(articles['input_ids'])
    articles['attention_mask'] = torch.squeeze(articles['attention_mask'])
    questions['input_ids'] = torch.squeeze(questions['input_ids'])
    questions['attention_mask'] = torch.squeeze(questions['attention_mask'])
    
    return (articles, questions)

In [24]:
# Create the data module from the notebook-level hparams and the custom
# collate function, then run its prepare_data() and setup() steps.
# NOTE(review): customed_collate_fn also takes a `tokenizer` argument;
# presumably RaceDataModule binds it internally -- confirm.
data_module = RaceDataModule(hparams, customed_collate_fn)
data_module.prepare_data()
data_module.setup()

In [25]:
# Get one loader per split from the data module.
trainloader = data_module.train_dataloader()
valloader = data_module.val_dataloader()
testloader = data_module.test_dataloader()

In [26]:
# Pull a single batch from the training loader as a smoke test.
# TODO: give this variable a descriptive name (e.g. first_batch).
cac = next(iter(trainloader))

Con cac dit meCon cac dit meCon cac dit meCon cac dit me
Con cac dit me



Con cac dit me
Con cac dit me
Con cac dit me
Con cac dit me
Con cac dit me
Con cac dit me
Con cac dit me
Con cac dit me


In [27]:
# Display the batch fetched from the training loader.
cac

[{'input_ids': tensor([[ 101,  133, 2590,  ...,    0,    0,    0],
          [ 101,  133, 2590,  ...,    0,    0,    0],
          [ 101,  133, 2590,  ...,    0,    0,    0],
          ...,
          [ 101,  133, 2590,  ...,    0,    0,    0],
          [ 101,  133, 2590,  ...,    0,    0,    0],
          [ 101,  133, 2590,  ...,    0,    0,    0]]),
  'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]),
  'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          ...,
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0],
          [1, 1, 1,  ..., 0, 0, 0]])},
 {'input_ids': tensor([[  101,  1327,  1169,  1195,  3858,  1121,  1142,  5885,   136,   102,
               0,     0,     0,     0,     0,    

In [47]:
from transformers import AutoTokenizer, AutoModel

In [101]:
# Scratch experiment: load the t5-small tokenizer and model, register two
# extra special tokens, and resize the model's token embeddings to the new
# tokenizer length.
# NOTE(review): this uses "t5-small" while hparams.pretrained_model is
# "bert-base-cased" -- confirm which checkpoint the pipeline should use.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
tokenizer.add_special_tokens({'additional_special_tokens': ["cac", "[CAC]"]})
model = AutoModel.from_pretrained("t5-small")
model.resize_token_embeddings(len(tokenizer))

Embedding(32102, 512)

In [109]:
# Show the tokenizer repr to inspect its registered special tokens.
tokenizer

PreTrainedTokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['cac', '[CAC]']})

In [88]:
# Encode a sample string and decode it again to see how bracketed markers
# survive the round trip.
tokenizer.decode(tokenizer("[Cac1] acw fwef efwefwef [Cac2] sgs ssvsd sfsf ")["input_ids"])

'[Cac1] acw fwef efwefwef[Cac2] sgs ssvsd sfsf</s>'

In [93]:
from functools import partial
def concac(a, b):
    """Return ``a + b``; a toy function for trying out ``functools.partial``."""
    result = a + b
    return result


# Pre-bind ``b=2`` so the resulting callable only needs ``a``.
cailon = partial(concac, b=2)

In [94]:
# Call the partial with a=1; b is already bound to 2.
cailon(1)

3