In [29]:
from datasets import load_dataset

dataset = load_dataset("embedding-data/sentence-compression")

Found cached dataset json (/Users/tollef/.cache/huggingface/datasets/embedding-data___json/embedding-data--sentence-compression-d643585deb6e0073/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
for i in range(10):
    long, short = dataset["train"][i]["set"]
    print(f"full sentence: {long}")
    print(f"compressed: {short}")
    print()


full sentence: The USHL completed an expansion draft on Monday as 10 players who were on the rosters of USHL teams during the 2009-10 season were selected by the League's two newest entries, the Muskegon Lumberjacks and Dubuque Fighting Saints.
compressed: USHL completes expansion draft

full sentence: Major League Baseball Commissioner Bud Selig will be speaking at St. Norbert College next month.
compressed: Bud Selig to speak at St. Norbert College

full sentence: It's fresh cherry time in Michigan and the best time to enjoy this delicious and nutritious fruit.
compressed: It's cherry time

full sentence: An Evesham man is facing charges in Pennsylvania after he allegedly dragged his girlfriend from the side of his pickup truck on the campus of Kutztown University in the early morning hours of Dec. 5, police said.
compressed: Evesham man faces charges for Pa.

full sentence: NRT LLC, one of the nation's largest residential real estate brokerage companies, announced several executive 

In [31]:
# rework the dataset into a list of "pairs"
import numpy as np

pairs = np.array([(x["set"][0], x["set"][1]) for x in dataset["train"]])

In [32]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

class CompressionDataset(Dataset):
    def __init__(self, sentence_pairs):
        self.sentence_pairs = sentence_pairs

    def __len__(self):
        return len(self.sentence_pairs)

    def __getitem__(self, idx):
        pair = self.sentence_pairs[idx]
        source_sentence = pair[0]
        target_sentence = pair[1]

        source_tokenized = tokenizer.encode_plus(source_sentence, max_length=200, padding='max_length', truncation=True, return_tensors='pt')
        target_tokenized = tokenizer.encode_plus(target_sentence, max_length=200, padding='max_length', truncation=True, return_tensors='pt')

        source_ids = source_tokenized['input_ids'].squeeze()
        source_mask = source_tokenized['attention_mask'].squeeze()
        target_ids = target_tokenized['input_ids'].squeeze()

        return {
            'source_ids': source_ids,
            'source_mask': source_mask,
            'target_ids': target_ids
        }



For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [34]:
# max: 180000
# select a subset
pairs = pairs[:50000]

In [35]:
from tqdm import tqdm
import torch

# Assume `pairs` is a list of sentence pairs (target, source)
dataset = CompressionDataset(pairs)
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

device = torch.device("mps")
model = model.to(device)
model.train()

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)
epochs = 10

for epoch in range(epochs):
    print(f'Epoch: {epoch}')
    epoch_loss = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        
        source_ids = batch['source_ids'].to(device)
        source_mask = batch['source_mask'].to(device)
        target_ids = batch['target_ids'].to(device)

        outputs = model(input_ids=source_ids, attention_mask=source_mask, labels=target_ids)

        loss = outputs.loss
        loss.backward()

        optimizer.step()
        epoch_loss += loss.item()

    print(f'Epoch: {epoch}, Loss: {epoch_loss/len(dataloader)}')


Epoch: 0


100%|██████████| 2500/2500 [18:27<00:00,  2.26it/s]


Epoch: 0, Loss: 0.1283133092608303
Epoch: 1


100%|██████████| 2500/2500 [18:30<00:00,  2.25it/s]


Epoch: 1, Loss: 0.04230836736988276
Epoch: 2


100%|██████████| 2500/2500 [18:43<00:00,  2.22it/s]


Epoch: 2, Loss: 0.03343313263487071
Epoch: 3


100%|██████████| 2500/2500 [18:49<00:00,  2.21it/s]


Epoch: 3, Loss: 0.025706315400451422
Epoch: 4


100%|██████████| 2500/2500 [19:00<00:00,  2.19it/s]


Epoch: 4, Loss: 0.01972276006778702
Epoch: 5


 42%|████▏     | 1059/2500 [08:00<10:53,  2.21it/s]


KeyboardInterrupt: 

In [36]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [1]:
def compress_sentence(sentence):
    inputs = tokenizer.encode_plus(sentence, return_tensors="pt", max_length=512, padding='max_length', truncation=True)

    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=10)

    compressed_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # print("Compressed Sentence: ", compressed_sentence)
    return compressed_sentence


for gold, shorter in pairs[-50:]:
    print("Gold: ", gold)
    print("Shorter: ", shorter)
    compress_sentence(gold)
    print()

NameError: name 'pairs' is not defined

In [23]:
compress_sentence("This is a longer sequence of text that contains words and tokens that may be compressed")

Compressed Sentence:  This is a longer sequence of text


'This is a longer sequence of text'

In [27]:
s = "These are just two of the 25,000 dead fighters who have been identified by the BBC, independent Russian media organisation Mediazona, and a team of volunteers, using information from official reports, newspapers, social media, and new memorials and graves."
compress_sentence(s)

Compressed Sentence:  25,000 dead fighters identified by BBC


'25,000 dead fighters identified by BBC'

# VAE

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

class SentenceVAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(SentenceVAE, self).__init__()

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2 * latent_dim)
        )

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim)
        )

    def encode(self, x):
        mu, logvar = torch.chunk(self.encoder(x), 2, dim=-1)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        return self.decoder(z)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        x_hat = self.decode(z)
        return x_hat, mu, logvar

# Step 1: Load the Google Sentence Compression dataset
dataset = load_dataset('embedding_data/sentence-compression')

# Step 2: Preprocess the dataset
tokenizer = AutoTokenizer.from_pretrained('t5-base')
input_dim = tokenizer.model_max_length
model_inputs = tokenizer([example['set'][0] for example in dataset['train']], padding=True, truncation=True, return_tensors='pt')
model_outputs = tokenizer([example['set'][1] for example in dataset['train']], padding=True, truncation=True, return_tensors='pt')

# Step 3: Train the VAE
hidden_dim = 256
latent_dim = 32
vae = SentenceVAE(input_dim, hidden_dim, latent_dim)
optimizer = torch.optim.Adam(vae.parameters(), lr=1e-3)

for epoch in range(10):
    for i in range(len(model_inputs)):
        input_ids = model_inputs['input_ids'][i].unsqueeze(0)
        attention_mask = model_inputs['attention_mask'][i].unsqueeze(0)
        decoder_input_ids = model_outputs['input_ids'][i].unsqueeze(0)
        decoder_attention_mask = model_outputs['attention_mask'][i].unsqueeze(0)

        x = F.one_hot(input_ids, num_classes=input_dim).float().squeeze(0)
        x_hat, mu, logvar = vae(x)
        kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        reconstruction_loss = F.binary_cross_entropy_with_logits(x_hat, x)
        loss = reconstruction_loss + kl_div
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

# Step 4: Use the VAE to compress new sentences
vae.eval()
input_sentence = "This is a long sentence that needs to be compressed."
input_ids = tokenizer(input_sentence, padding=True, truncation=True, return_tensors='pt')['input_ids']
x = F.one_hot(input_ids, num_classes=input_dim).float().squeeze(0)
mu, logvar = vae.encode(x)
z = vae.reparameterize(mu, logvar)

# Step 5: Use local search to reconstruct the compressed sentence
z.requires_grad = True
optimizer = torch.optim.LBFGS([z])

def closure():
    optimizer.zero_grad()
    x_hat = vae.decode(z)
    loss = F.binary_cross_entropy_with_logits(x_hat, x)
    loss.backward()
    return loss

for i in range(10):
    optimizer.step(closure)

x_hat = vae.decode(z)
output_ids = torch.argmax(x_hat, dim=-1).unsqueeze(0)
output_sentence = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Step 6: Evaluate the performance of the model
vae.eval()
eval_inputs = tokenizer([example['set'][0] for example in dataset['validation']], padding=True, truncation=True, return_tensors='pt')
eval_outputs = tokenizer([example['set'][1] for example in dataset['validation']], padding=True, truncation=True, return_tensors='pt')
eval_loss = 0

with torch.no_grad():
    for i in range(len(eval_inputs)):
        input_ids = eval_inputs['input_ids'][i].unsqueeze(0)
        attention_mask = eval_inputs['attention_mask'][i].unsqueeze(0)
        decoder_input_ids = eval_outputs['input_ids'][i].unsqueeze(0)
        decoder_attention_mask = eval_outputs['attention_mask'][i].unsqueeze(0)

        x = F.one_hot(input_ids, num_classes=input_dim).float().squeeze(0)
        x_hat, mu, logvar = vae(x)
        kl_div = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        reconstruction_loss = F.binary_cross_entropy_with_logits(x_hat, x)
        loss = reconstruction_loss + kl_div
        eval_loss += loss.item()

eval_loss /= len(eval_inputs)

# Step 7: Deploy the model in a production environment
# ...