In [1]:
# Install pinned torch / torch_xla builds into the *kernel's* environment.
# `%pip` targets the interpreter running this notebook (a bare `!pip` uses whatever
# `pip` is first on the shell PATH), and the requirement specifiers are quoted so the
# shell cannot glob-expand the `[tpu]` extra.
%pip install "torch~=2.1.0" "torch_xla[tpu]~=2.1.0" -f https://storage.googleapis.com/libtpu-releases/index.html

Looking in links: https://storage.googleapis.com/libtpu-releases/index.html
[0m

In [2]:
import torch

from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, T5TokenizerFast
# from transformers.modeling_outputs import BaseModelOutput, CausalLMOutputWithPast

import datasets
from datasets import load_dataset
# from transformers import LlamaModel, LlamaConfig

import tqdm as notebook_tqdm
import os
from dotenv import load_dotenv


In [3]:
# Load environment configuration via python-dotenv, then look up the
# Hugging Face token (None when the variable is not set).
load_dotenv()
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")

# Use CUDA when torch reports it as available, otherwise run on the CPU.
# (To force CPU execution, assign torch.device("cpu") unconditionally.)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [4]:
# Load the fast T5 tokenizer from the 'google/flan-t5-base' checkpoint; it is used
# below to turn the question/context and answer strings into token ids.
tokenizer = T5TokenizerFast.from_pretrained('google/flan-t5-base')

In [10]:
# build the input and target text for each example
def add_eos_to_examples(example):
    """Add T5-style ``input_text`` and ``target_text`` fields to one SQuAD example.

    The input is formatted as ``question: <question>  context: <context>`` and the
    target is the first reference answer.

    No literal ``</s>`` is written into either string: the T5 tokenizer appends the
    EOS token itself when encoding, so adding it by hand here produced a duplicated
    EOS id at the end of every non-truncated sequence. The function name is kept so
    existing ``dataset.map(add_eos_to_examples)`` calls continue to work.

    Args:
        example: mapping with ``'question'``, ``'context'`` and
            ``'answers'`` (whose ``'text'`` entry is a sequence of answer strings).

    Returns:
        The same ``example`` object, updated in place with the two new fields.

    Raises:
        IndexError: if ``example['answers']['text']`` is empty.
    """
    question = example['question']
    context = example['context']
    first_answer = example['answers']['text'][0]

    # Two spaces before "context:" are kept so the prompt format itself is unchanged.
    example['input_text'] = f'question: {question}  context: {context}'
    example['target_text'] = str(first_answer)
    return example

# tokenization limits (in tokens) for the source and target sequences
max_source_length = 512
max_target_length = 16
def convert_to_features(example_batch):
    """Tokenize a batch of examples into encoder inputs and decoder targets.

    Args:
        example_batch: mapping holding the ``'input_text'`` and ``'target_text'``
            entries to encode with the module-level ``tokenizer``.

    Returns:
        A dict with ``'input_ids'`` / ``'attention_mask'`` for the source text and
        ``'target_ids'`` / ``'target_attention_mask'`` for the target text.
    """
    # NOTE(review): the tokenizer docs state that pad_to_multiple_of only takes
    # effect when padding is enabled; no `padding=` argument is passed here, so
    # confirm whether padded (fixed-shape) sequences were intended.
    source = tokenizer(
        example_batch['input_text'],
        pad_to_multiple_of=16,
        max_length=max_source_length,
        truncation=True,
    )
    target = tokenizer(
        example_batch['target_text'],
        pad_to_multiple_of=16,
        max_length=max_target_length,
        truncation=True,
    )

    return dict(
        input_ids=source['input_ids'],
        attention_mask=source['attention_mask'],
        target_ids=target['input_ids'],
        target_attention_mask=target['attention_mask'],
    )

In [11]:
# Training split: load SQuAD, build the text fields example by example,
# then tokenize in batches.
train_dataset = (
    load_dataset('squad', split='train')
    .map(add_eos_to_examples)
    .map(convert_to_features, batched=True)
)

# Validation split: same two steps, with load_from_cache_file=False passed
# to both map calls.
valid_dataset = (
    load_dataset('squad', split='validation')
    .map(add_eos_to_examples, load_from_cache_file=False)
    .map(convert_to_features, batched=True, load_from_cache_file=False)
)

# Return only the tokenized columns, as torch tensors.
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [17]:
# Cache the processed datasets on disk so they can be loaded directly for training.
# NOTE(review): torch.save serializes the objects via pickle; presumably the training
# code reads these files back with torch.load — confirm against the consumer.
torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

# Disabled sanity checks: dataset schema and split sizes.
# print(load_dataset('squad', split='train').features)
# print(len(train_dataset), len(valid_dataset))

{'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}
87599 10570
