In [1]:
import os
import jsonlines

filename = "wikianswers_3length-100000samples.jsonl"
filepath = "data/" + filename

# load the jsonline as "pairs":
with jsonlines.open(filepath) as reader:
    # read the pairs as tuples of sentences 
    pairs = [tuple(p["set"]) for p in reader]
pairs[:5]

[('What word describes feeling sorrow regret or remorse for something?',
  'The word meaning sorrow remorse or regret?'),
 ('The word meaning sorrow remorse or regret?', 'What means full of feeling?'),
 ('What word describes feeling sorrow regret or remorse for something?',
  'What means full of feeling?'),
 ('What kinds of animals does live in namib desert inafrica?',
  'What kindes of animals live in the desert?'),
 ('What kindes of animals live in the desert?',
  'What kind animals live deserts?')]

In [2]:
len(pairs)

296941

In [3]:
# just select a subset of 1000 pairs to begin with

In [4]:
# inspect stats of the data, such as max sentence length:
max_sentence_length = max([len(s.split()) for p in pairs for s in p])
max_sentence_length
# remove all pairs with any sentence longer than 30 words
pairs = [p for p in pairs if max([len(s.split()) for s in p]) <= 30]

In [26]:
# now inspect max sequence length (i.e. characters)
max_sequence_length = max([len(s) for p in pairs for s in p])
max_sequence_length

# discard all examples with sentences longer than 200 characters
pairs = [p for p in pairs if max([len(s) for s in p]) <= 200]
len(pairs)

test_pairs = pairs[-1000:]

In [20]:
test_pairs

[('How many quarts of oil does a 4-cylinder honda civic del sol take?',
  'How many quarts of oil in honda del sol?'),
 ('How many quarts of oil in honda del sol?', 'Oil del sol 95?'),
 ('Does chugging beer make you more drunk then shots?',
  'How do you chug alcohol fast?'),
 ('Does chugging beer make you more drunk then shots?',
  'How do you chug beer?'),
 ('How do you chug alcohol fast?', 'How do you chug beer?'),
 ('What happens to the nipples in puberty for boys?',
  'Do boys nipples hurt during puberty?'),
 ('Does your nipple enlarge or becomes painful during pubert boys?',
  'Do boys nipples hurt during puberty?'),
 ('Does your nipple enlarge or becomes painful during pubert boys?',
  'What happens to the nipples in puberty for boys?'),
 ('What time do lawyers get up for work?', 'What time does a lawyer get up?'),
 ('What time does a lawyer get up?', 'What time do lawyers wake up?')]

In [6]:
# select a subset:
pairs = pairs[:1000]

In [7]:
# Required Libraries
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
import torch

tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)ve/main/spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 1.76MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 1.21k/1.21k [00:00<00:00, 3.57MB/s]
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Downloading pytorch_model.bin: 100%|██████████| 892M/892M [00:09<00:00, 93.9MB/s] 
Downloading (…)neration_config.json: 100%|██████████| 147/147 [00:00<00:00, 885kB/s]


In [8]:
class CompressionDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.sentence_pairs = sentence_pairs

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        pair = self.sentence_pairs[idx]
        source_sentence = pair[0]
        target_sentence = pair[1]

        source_tokenized = tokenizer.encode_plus(source_sentence, max_length=200, padding='max_length', truncation=True, return_tensors='pt')
        target_tokenized = tokenizer.encode_plus(target_sentence, max_length=200, padding='max_length', truncation=True, return_tensors='pt')

        source_ids = source_tokenized['input_ids'].squeeze()
        source_mask = source_tokenized['attention_mask'].squeeze()
        target_ids = target_tokenized['input_ids'].squeeze()

        return {
            'source_ids': source_ids,
            'source_mask': source_mask,
            'target_ids': target_ids
        }

# Assume `pairs` is a list of your sentence pairs
dataset = CompressionDataset(pairs)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [11]:
from tqdm import tqdm

device = torch.device("cuda:2")

model = model.to(device)
model.train()

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
epochs = 10

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        
        source_ids = batch['source_ids'].to(device)
        source_mask = batch['source_mask'].to(device)
        target_ids = batch['target_ids'].to(device)

        outputs = model(input_ids=source_ids, attention_mask=source_mask, labels=target_ids)

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        epoch_loss += loss.item()

    print(f'Epoch: {epoch}, Loss: {epoch_loss/len(dataloader)}')


Epoch: 0


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 0, Loss: 0.1688132027075404
Epoch: 1


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 1, Loss: 0.10802048314658422
Epoch: 2


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 2, Loss: 0.09349556255435186
Epoch: 3


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 3, Loss: 0.08177530191957004
Epoch: 4


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 4, Loss: 0.07145963946268671
Epoch: 5


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 5, Loss: 0.06317383943805618
Epoch: 6


100%|██████████| 63/63 [00:31<00:00,  2.03it/s]


Epoch: 6, Loss: 0.05618708687169211
Epoch: 7


100%|██████████| 63/63 [00:30<00:00,  2.04it/s]


Epoch: 7, Loss: 0.048770262726715634
Epoch: 8


100%|██████████| 63/63 [00:30<00:00,  2.03it/s]


Epoch: 8, Loss: 0.0433187090924808
Epoch: 9


100%|██████████| 63/63 [00:30<00:00,  2.03it/s]

Epoch: 9, Loss: 0.03848822326177642





In [12]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [22]:
test_pairs

[('How many quarts of oil does a 4-cylinder honda civic del sol take?',
  'How many quarts of oil in honda del sol?'),
 ('How many quarts of oil in honda del sol?', 'Oil del sol 95?'),
 ('Does chugging beer make you more drunk then shots?',
  'How do you chug alcohol fast?'),
 ('Does chugging beer make you more drunk then shots?',
  'How do you chug beer?'),
 ('How do you chug alcohol fast?', 'How do you chug beer?'),
 ('What happens to the nipples in puberty for boys?',
  'Do boys nipples hurt during puberty?'),
 ('Does your nipple enlarge or becomes painful during pubert boys?',
  'Do boys nipples hurt during puberty?'),
 ('Does your nipple enlarge or becomes painful during pubert boys?',
  'What happens to the nipples in puberty for boys?'),
 ('What time do lawyers get up for work?', 'What time does a lawyer get up?'),
 ('What time does a lawyer get up?', 'What time do lawyers wake up?')]

In [28]:
def compress_sentence(sentence):
    model.eval()

    inputs = tokenizer.encode_plus(sentence, return_tensors="pt", max_length=512, padding='max_length', truncation=True)

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)

    compressed_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print("Compressed Sentence: ", compressed_sentence)
    return compressed_sentence


# select 30 random samples from test_pairs
import random
random.seed(42)
test_pairs = random.sample(test_pairs, 30)


for gold, shorter in test_pairs:
    print("Gold: ", gold)
    print("Shorter: ", shorter)
    compress_sentence(gold)
    print()

Gold:  Is a chimpanzee endangered and why is it endangered?
Shorter:  Have chimpanzees been endangered?
Compressed Sentence:  Is a chimpanzee

Gold:  What is the locations possible economic impact on Cuba's population?
Shorter:  What is the population number of Cuba?
Compressed Sentence:  What is the population density of Cuba?

Gold:  Which state is started commonwealth games?
Shorter:  Who started the commonwealth games?
Compressed Sentence:  Who started commonwealth game?

Gold:  Some thing can hurt mor than a belly flop an you prformns acrobatics high above sea world water?
Shorter:  Is a high water table a good thing or a bad thing?
Compressed Sentence:  How do you get high above sea world?

Gold:  Is there such thing as a proffesional pimple popper?
Shorter:  Best thing for pimples when your in year 6?
Compressed Sentence:  Is there such thing as a pi

Gold:  Which of these factors influences one is sense of sense of self?
Shorter:  Which of these factors influences one is sense 