# Test Transformer & Trainer

In this notebook, I will test my implementation of Transformer and Trainer

In [1]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

import torch
import torch.nn as nn
import torch.nn.functional as F

from src.lion.lion import Lion
from src.transformer.transformer import Transformer
from src.training.trainer import Trainer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
import zipfile
import urllib.request
import os

dataset_url = "https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip"
data_path = "data/simplebooks.zip"
extracted_path = "data/simplebooks/"

if not os.path.exists(extracted_path):
    urllib.request.urlretrieve(dataset_url, data_path)

    with zipfile.ZipFile(data_path, 'r') as zip_ref:
        zip_ref.extractall("data/")
    
    os.remove(data_path)
else:
    print("Dataset already downloaded and extracted.")

Dataset already downloaded and extracted.


In [3]:
from datasets import load_dataset

train_path = os.path.join(extracted_path, 'simplebooks-2/train.txt')
valid_path = os.path.join(extracted_path, 'simplebooks-2/valid.txt')
test_path = os.path.join(extracted_path, 'simplebooks-2/test.txt')

datasets = load_dataset('text', data_files={
    'train': train_path,
    'validation': valid_path,
    'test': test_path
})

datasets = datasets.filter(lambda example: example["text"])

Filter:   0%|          | 0/114695 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13383 [00:00<?, ? examples/s]

Filter:   0%|          | 0/14829 [00:00<?, ? examples/s]

In [4]:
print(datasets['train'], '\n', datasets['train']['text'][14])

Dataset({
    features: ['text'],
    num_rows: 55366
}) 
 " Find the man who stole the pearls , " said the king . Away went the guards looking high and low for the thief .


In [12]:
from transformers import T5Tokenizer
from torch.utils.data import DataLoader, Dataset

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )

tokenized_datasets = datasets.map(preprocess_function, batched=True)

test_dataloader = DataLoader(
    tokenized_datasets["test"].with_format("torch"),
    batch_size=8,
)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/55366 [00:00<?, ? examples/s]

Map:   0%|          | 0/6548 [00:00<?, ? examples/s]

Map:   0%|          | 0/7251 [00:00<?, ? examples/s]

In [7]:
len(tokenized_datasets["test"].with_format("torch")[13]['text'])

69

In [None]:
class TextDataset(Dataset):
    def __init__(self):
        super().__init__()

    def __getitem__(self, index):
        return super().__getitem__(index)

    def __len__(self):
        pass


In [8]:
for batch in test_dataloader:
    print(batch)
    break

{'text': ["Bunny Rabbit 's <unk>", 'By', 'Mary <unk> <unk>', "Bunny Rabbit 's <unk>", 'Mr. and Mrs. Rabbit and the three little rabbits lived in the woods .', 'Each little rabbit had a name .', 'There was Bunny Rabbit , <unk> Rabbit , and Billy Rabbit .', 'Bunny was full of fun , and liked to play tricks on his brothers .'], 'input_ids': tensor([[ 6100,    29,    63,  ...,     0,     0,     0],
        [  938,     1,     0,  ...,     0,     0,     0],
        [ 3790,     2,     2,  ...,     0,     0,     0],
        ...,
        [ 1698,   385, 18383,  ...,     0,     0,     0],
        [  290,    47,  6100,  ...,     0,     0,     0],
        [ 6100,    29,    63,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [9]:
tokenizer.vocab_size

32000

In [10]:
from src.training.metrics import compute_metrics

tiny_model = Transformer(
    num_layers=8,
    d_model=128,
    num_heads=8,
    d_ff=128,
    input_dim=512,
    output_dim=tokenizer.vocab_size,
    max_len=512
)

trainer = Trainer(
    model=tiny_model,
    num_epochs=1000,
    batch_size=8,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    loss_fn=F.cross_entropy,
    metrics_fn=compute_metrics,
    optimizer=Lion,
    optimizer_kwargs=dict(lr=1e-4),
    checkpoint_path='./'
)

In [11]:
trainer.train()

Epoch 1/1000:   0%|          | 0/6921 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'to'