In [None]:
import os
import jsonlines

filename = "wikianswers_3length-100000samples.jsonl"
filepath = f"../../data/{filename}"

# Read the JSON Lines file; the "set" field of every record becomes one tuple of sentences.
with jsonlines.open(filepath) as reader:
    pairs = list(map(lambda record: tuple(record["set"]), reader))

# Preview the first five pairs.
pairs[:5]

[('What word describes feeling sorrow regret or remorse for something?',
  'The word meaning sorrow remorse or regret?'),
 ('The word meaning sorrow remorse or regret?', 'What means full of feeling?'),
 ('What word describes feeling sorrow regret or remorse for something?',
  'What means full of feeling?'),
 ('What kinds of animals does live in namib desert inafrica?',
  'What kindes of animals live in the desert?'),
 ('What kindes of animals live in the desert?',
  'What kind animals live deserts?')]

In [16]:
# Total number of sentence pairs loaded from the file.
len(pairs)

296941

In [None]:
# TODO: select a subset of 1000 pairs to begin with (no subsetting is done yet; all pairs are used)

In [17]:
# Inspect stats of the data, such as the max sentence length in whitespace-separated words.
MAX_WORDS = 30  # pairs containing a sentence with more words than this are dropped

# `default=0` keeps the cell from raising when `pairs` is empty.
max_sentence_length = max((len(s.split()) for p in pairs for s in p), default=0)
# Printed explicitly: only a cell's last expression is displayed automatically,
# and this cell ends with an assignment.
print(f"max sentence length before filtering: {max_sentence_length} words")

# Remove all pairs with any sentence longer than MAX_WORDS words.
pairs = [p for p in pairs if all(len(s.split()) <= MAX_WORDS for s in p)]

In [18]:
# Now inspect the max sequence length (i.e. number of characters per sentence).
MAX_CHARS = 200  # pairs containing a sentence with more characters than this are dropped

# `default=0` keeps the cell from raising when `pairs` is empty.
max_sequence_length = max((len(s) for p in pairs for s in p), default=0)
# Printed explicitly: only a cell's last expression is displayed automatically,
# and the last expression of this cell is the remaining pair count.
print(f"max sequence length before filtering: {max_sequence_length} characters")

# Discard all examples with sentences longer than MAX_CHARS characters.
pairs = [p for p in pairs if all(len(s) <= MAX_CHARS for s in p)]
len(pairs)

293524

In [19]:
# Required Libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch
# One checkpoint name shared by the tokenizer and the model so the two cannot drift apart.
MODEL_NAME = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)


For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [20]:
class CompressionDataset(Dataset):
    """Dataset that tokenizes (source, target) sentence pairs when an item is accessed.

    Args:
        sentence_pairs: Indexable collection whose items hold the source
            sentence at position 0 and the target sentence at position 1.
        tokenizer: Callable tokenizer used to encode each sentence. When None
            (the default) the notebook-level `tokenizer` variable is looked up
            at item-access time, which is what the class did before this
            parameter existed.
        max_length: Length every sequence is padded or truncated to. Defaults
            to 200, the value that used to be hard-coded.
    """

    def __init__(self, sentence_pairs, tokenizer=None, max_length=200):
        self.sentence_pairs = sentence_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.sentence_pairs)

    def _encode(self, sentence):
        """Tokenize one sentence, padded/truncated to `max_length`, as 'pt' tensors."""
        # Fall back to the notebook-level tokenizer when none was injected.
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        return tok(
            sentence,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, idx):
        """Return the tokenized source ids, source attention mask and target ids for pair `idx`.

        NOTE: `target_ids` is padded to `max_length` like the source; callers that
        use it as loss labels may want to mask the padding positions themselves.
        """
        pair = self.sentence_pairs[idx]
        source_tokenized = self._encode(pair[0])
        target_tokenized = self._encode(pair[1])

        # squeeze(0) drops only the leading batch axis; a dimensionless squeeze()
        # would also collapse the sequence axis whenever its length is 1.
        return {
            'source_ids': source_tokenized['input_ids'].squeeze(0),
            'source_mask': source_tokenized['attention_mask'].squeeze(0),
            'target_ids': target_tokenized['input_ids'].squeeze(0),
        }

# Wrap the sentence pairs (the `pairs` list) and serve them in shuffled mini-batches of 16.
dataset = CompressionDataset(sentence_pairs=pairs)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=16)

In [22]:
from rich.progress import track

# Prefer the MPS backend (the original target), then CUDA, then CPU, so the
# cell also runs on machines where MPS is not available.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = model.to(device)
model.train()

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
epochs = 10

for epoch in track(range(epochs)):
    epoch_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()

        source_ids = batch['source_ids'].to(device)
        source_mask = batch['source_mask'].to(device)
        target_ids = batch['target_ids'].to(device)

        # Targets are padded to a fixed length. Label positions set to -100 are
        # ignored by the model's loss, so padding no longer dominates it.
        labels = target_ids.masked_fill(target_ids == tokenizer.pad_token_id, -100)

        outputs = model(input_ids=source_ids, attention_mask=source_mask, labels=labels)

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        epoch_loss += loss.item()

    # max(..., 1) avoids a ZeroDivisionError when the dataloader yields no batches.
    print(f'Epoch: {epoch}, Loss: {epoch_loss/max(len(dataloader), 1)}')


Output()