In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q
!pip install evaluate

In [None]:
# Import necessary libraries
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import matplotlib.pyplot as plt
import pandas as pd
from evaluate import load
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import nltk
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
import torch

# Fetch the "punkt" resource through NLTK's downloader (the file imports
# sent_tokenize from nltk.tokenize above).
nltk.download("punkt")

# Pick the PyTorch device string: CUDA when torch reports it, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


In [None]:
# Step 1: Setup environment and import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer
from transformers import get_linear_schedule_with_warmup
from datasets import load_dataset
import numpy as np



# Step 2: Load a dataset for summarization.
# Only the first 1% of the XSum training split is requested so the demo stays
# quick. Each record is later read through its 'document' and 'summary' fields
# (see preprocess below).
xsum_dataset = load_dataset('xsum', split='train[:1%]')  # using 1% for quick demonstration
print('Step 2: Loaded XSum dataset sample.')
print(xsum_dataset[0])



# Step 3: Preprocessing - tokenize input and summary.
# The same BertTokenizer is used for both the article (encoder side) and the
# summary (decoder side) for simplicity; a separate target-side tokenizer would
# normally be used for the decoder.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Fixed lengths (in tokens) that preprocess() truncates/pads to:
# articles to max_input_length, summaries to max_summary_length.
max_input_length = 256
max_summary_length = 64

def preprocess(sample):
    """Tokenize one dataset record into fixed-length model tensors.

    Reads ``sample['document']`` and ``sample['summary']`` and encodes both
    with the module-level ``tokenizer``, truncating/padding the article to
    ``max_input_length`` and the summary to ``max_summary_length``.

    Returns a dict with ``'input_ids'`` and ``'attention_mask'`` for the
    article and ``'labels'`` holding the summary token ids. The leading
    batch axis is removed from every tensor; callers add it back when
    batching.
    """
    # Options common to both tokenizer calls.
    shared_options = dict(truncation=True, padding='max_length', return_tensors='pt')

    encoded_document = tokenizer(sample['document'],
                                 max_length=max_input_length,
                                 **shared_options)
    encoded_summary = tokenizer(sample['summary'],
                                max_length=max_summary_length,
                                **shared_options)

    # Drop the batch dimension only from the tensors that are returned.
    return {
        'input_ids': encoded_document['input_ids'].squeeze(0),
        'attention_mask': encoded_document['attention_mask'].squeeze(0),
        'labels': encoded_summary['input_ids'].squeeze(0),
    }

# Tokenize every record of the loaded XSum slice up front; the result is a
# plain list of per-example tensor dicts.
processed_dataset = list(map(preprocess, xsum_dataset))
print('Step 3: Preprocessing done on small sample.')





# Step 4: Model Architecture
# We build a summarization model using a Bert encoder and a Transformer decoder

class Bert2TransformerSummarizer(nn.Module):
    """Summarizer made of a pretrained BERT encoder and a Transformer decoder.

    The encoder output is used as the decoder's memory. Decoder inputs are
    embedded with a learned token embedding plus a learned positional
    embedding, decoded under a causal mask, and projected to vocabulary
    logits.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        d_model: Decoder width; must equal the encoder's hidden size because
            the encoder output is fed to the decoder without a projection.
        nhead: Number of attention heads per decoder layer.
        num_decoder_layers: Number of stacked decoder layers.
        dim_feedforward: Width of each decoder layer's feed-forward block.
        dropout: Dropout probability inside the decoder layers.

    Raises:
        ValueError: If ``d_model`` differs from the encoder's hidden size.
    """

    def __init__(self, bert_model_name='bert-base-uncased', d_model=768, nhead=8,
                 num_decoder_layers=6, dim_feedforward=2048, dropout=0.1):
        super().__init__()

        # Load BERT as encoder. To freeze it, call
        # `self.encoder.requires_grad_(False)` after construction.
        self.encoder = BertModel.from_pretrained(bert_model_name)

        encoder_config = self.encoder.config
        if encoder_config.hidden_size != d_model:
            raise ValueError(
                f"d_model ({d_model}) must match the encoder hidden size "
                f"({encoder_config.hidden_size}) of '{bert_model_name}'"
            )
        # Take the vocabulary size from the encoder itself rather than from a
        # module-level tokenizer, so the model does not silently change shape
        # if that global is rebound to a different tokenizer.
        vocab_size = encoder_config.vocab_size

        # Decoder stack (sequence-first layout, the nn.TransformerDecoderLayer default).
        self.decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead,
                                                        dim_feedforward=dim_feedforward,
                                                        dropout=dropout)
        self.decoder = nn.TransformerDecoder(self.decoder_layer,
                                             num_layers=num_decoder_layers)

        # Token embedding for decoder inputs (same vocabulary as the encoder).
        self.decoder_embedding = nn.Embedding(vocab_size, d_model)

        # Final linear layer projecting decoder states to vocabulary logits.
        self.fc_out = nn.Linear(d_model, vocab_size)

        # Learned positional embedding; supports target lengths up to 512.
        self.positional_embedding = nn.Embedding(512, d_model)

    def forward(self, src_input_ids, src_attention_mask, tgt_input_ids):
        """Compute vocabulary logits for every target position.

        Args:
            src_input_ids: Source token ids, ``[batch, src_seq]``.
            src_attention_mask: ``[batch, src_seq]`` mask with non-zero entries
                for real tokens and 0 for padding, or ``None`` for no padding.
            tgt_input_ids: Decoder input token ids, ``[batch, tgt_seq]``.

        Returns:
            Logits of shape ``[batch, tgt_seq, vocab_size]``.

        Raises:
            ValueError: If ``tgt_seq`` exceeds the positional embedding table.
        """
        tgt_seq = tgt_input_ids.size(1)
        max_positions = self.positional_embedding.num_embeddings
        if tgt_seq > max_positions:
            raise ValueError(
                f"target length {tgt_seq} exceeds the maximum supported "
                f"length {max_positions}"
            )
        target_device = tgt_input_ids.device

        # Encode using BERT encoder.
        encoder_outputs = self.encoder(input_ids=src_input_ids,
                                       attention_mask=src_attention_mask)
        memory = encoder_outputs.last_hidden_state.transpose(0, 1)  # [src_seq, batch, d_model]

        # True marks padded source positions that cross-attention must skip;
        # without this the decoder attends to padding in the encoder output.
        memory_key_padding_mask = None
        if src_attention_mask is not None:
            memory_key_padding_mask = src_attention_mask == 0  # [batch, src_seq]

        # Embed decoder inputs and add positional embeddings; the [1, tgt_seq]
        # position row broadcasts over the batch.
        positions = torch.arange(tgt_seq, device=target_device).unsqueeze(0)
        tgt_embeddings = self.decoder_embedding(tgt_input_ids) + self.positional_embedding(positions)
        tgt_embeddings = tgt_embeddings.transpose(0, 1)  # [tgt_seq, batch, d_model]

        # Causal mask: -inf strictly above the diagonal so a position cannot
        # attend to later positions. Built directly on the target device.
        tgt_mask = torch.triu(
            torch.full((tgt_seq, tgt_seq), float('-inf'), device=target_device),
            diagonal=1,
        )

        # Decode
        outputs = self.decoder(tgt=tgt_embeddings, memory=memory, tgt_mask=tgt_mask,
                               memory_key_padding_mask=memory_key_padding_mask)
        outputs = outputs.transpose(0, 1)  # [batch, tgt_seq, d_model]
        logits = self.fc_out(outputs)  # [batch, tgt_seq, vocab_size]
        return logits

# Instantiate the model and place it on the GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Bert2TransformerSummarizer().to(device)
print('Step 4: Model built using Bert encoder and Transformer decoder.')

# Step 5: Build one demonstration batch and run a forward pass

# Use one sample from the processed data and add a batch dimension
batch = processed_dataset[0]
src_input_ids = batch['input_ids'].unsqueeze(0).to(device)  # shape [1, seq_len]
src_attention_mask = batch['attention_mask'].unsqueeze(0).to(device)

# Teacher forcing: the decoder input is the label sequence shifted right by one
# position, so the logit at position i is scored against labels[i].
# NOTE(review): preprocess() tokenizes summaries with the tokenizer's default
# special tokens, so `labels` presumably already starts with [CLS]; the extra
# [CLS] prepended here then makes the first target trivially [CLS] - confirm.
labels = batch['labels'].to(device)
cls_token_id = tokenizer.cls_token_id if tokenizer.cls_token_id is not None else tokenizer.convert_tokens_to_ids('[CLS]')

# new_tensor keeps the start token on the same device/dtype as the labels
tgt_input_ids = torch.cat([labels.new_tensor([cls_token_id]), labels[:-1]])
# Add batch dimension
tgt_input_ids = tgt_input_ids.unsqueeze(0)

# Step 6 setup: loss function and optimizer (padding positions do not contribute to the loss)
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=5e-5)

# Switch to training mode and clear gradients BEFORE the forward pass whose
# loss is back-propagated, so dropout is active in every sub-module for this
# step (Hugging Face `from_pretrained` returns the encoder in eval mode).
model.train()
optimizer.zero_grad()

# Forward pass
logits = model(src_input_ids, src_attention_mask, tgt_input_ids)

print('Step 5: Completed a forward pass through the model.')
print('Logits shape:', logits.shape)

# Step 6: One optimisation step

# Flatten to [batch*tgt_seq, vocab_size] logits against [batch*tgt_seq] targets
logits_flat = logits.reshape(-1, logits.size(-1))
loss = criterion(logits_flat, labels.reshape(-1))
loss.backward()
optimizer.step()

print('Step 6: Completed one training step with loss:', loss.item())

print('BERT Summarizer Demonstration Completed.')

print('done')

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset

# Load the T5 tokenizer and model from the 't5-small' checkpoint.
# NOTE: this rebinds the module-level names `tokenizer` and `model` (and the
# two length constants below) that the BERT cell above also defines.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Token limits: inputs are truncated/padded to max_input_length, and
# max_summary_length bounds both tokenized targets and generated summaries.
max_input_length = 512
max_summary_length = 150

# Load the first 1% of the XSum training split for demonstration
dataset = load_dataset('xsum', split='train[:1%]')
print("Dataset loaded.")

# Preprocessing function
def preprocess(sample, task_prefix, ignore_index=-100):
    """Tokenize one dataset record for T5.

    Args:
        sample: Record with ``'document'`` and ``'summary'`` text fields.
        task_prefix: Text prepended to the document (e.g. ``"summarize: "``).
        ignore_index: Value written into padded label positions so they are
            excluded from the loss. The default ``-100`` is the index that
            the built-in loss of ``T5ForConditionalGeneration`` ignores.
            Pass ``None`` to keep the raw pad token ids, e.g. when the labels
            will be decoded back to text.

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` padded/truncated to
        ``max_input_length`` and ``'labels'`` padded/truncated to
        ``max_summary_length``; all tensors have no batch dimension.
    """
    input_text = task_prefix + sample['document']
    inputs = tokenizer(input_text, truncation=True, padding='max_length',
                       max_length=max_input_length, return_tensors='pt')
    targets = tokenizer(sample['summary'], truncation=True, padding='max_length',
                        max_length=max_summary_length, return_tensors='pt')

    labels = targets['input_ids'].squeeze(0)
    if ignore_index is not None:
        # Without this, the loss would also be computed on padding tokens.
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, ignore_index)

    return {'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels}

# Process dataset
processed_dataset = [preprocess(x, "summarize: ") for x in dataset]

# Test sample input
sample_input = "T5 is a transformer model developed by Google. It is widely used for text summarization and classification."
tokenized_input = tokenizer("summarize: " + sample_input, return_tensors='pt', max_length=max_input_length, truncation=True)

# Generate Summary (beam search). The attention mask is passed explicitly so
# generation stays correct if the input is ever padded or batched.
model.eval()
with torch.no_grad():
    summary_ids = model.generate(input_ids=tokenized_input['input_ids'],
                                 attention_mask=tokenized_input['attention_mask'],
                                 max_length=max_summary_length, num_beams=5, early_stopping=True)
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Generated Summary:", summary_text)

# Classification using T5.
# T5 selects the task from a text prefix it saw during training. "classify: "
# is not one of those prefixes; the sentiment task (SST-2) uses
# "sst2 sentence: " and is answered with "positive" / "negative".
classification_input = "sst2 sentence: This movie was fantastic, I really loved it!"
tokenized_classification = tokenizer(classification_input, return_tensors='pt', max_length=max_input_length, truncation=True)

with torch.no_grad():
    classification_ids = model.generate(input_ids=tokenized_classification['input_ids'],
                                        attention_mask=tokenized_classification['attention_mask'],
                                        max_length=10)
classification_text = tokenizer.decode(classification_ids[0], skip_special_tokens=True)
print("Classification Output:", classification_text)

print("T5 Summarization and Classification Completed.")
