<a href="https://colab.research.google.com/github/swastik-raj-vansh-singh/Text-summerization/blob/main/Text%20summerization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from datasets import load_dataset

# Load the CNN/Daily Mail dataset (config '3.0.0'); its 'train' and
# 'validation' splits are the ones used further down.
dataset = load_dataset('cnn_dailymail', '3.0.0')

# Initialize the tokenizer from 't5-small', the same checkpoint name the model
# is loaded from below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Define the preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of articles and highlights for T5 fine-tuning.

    Args:
        examples: Batch mapping with 'article' and 'highlights' lists of
            strings, as handed over by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the articles (padded/truncated to 512
        tokens) with an added 'labels' entry: the highlight token ids
        (padded/truncated to 150 tokens) in which every padding position is
        replaced by -100.
    """
    inputs = tokenizer(examples['article'], max_length=512, truncation=True, padding='max_length')
    targets = tokenizer(examples['highlights'], max_length=150, truncation=True, padding='max_length')

    # -100 is the label value the model's cross-entropy loss ignores. Keeping
    # the pad id in the labels would spend most of the loss on predicting
    # padding instead of summary tokens.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        [token if token != pad_id else -100 for token in label_ids]
        for label_ids in targets['input_ids']
    ]
    return inputs

# Preprocess the dataset: batched=True hands preprocess_function lists of
# examples rather than one row at a time.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Define the collate function
def collate_fn(batch):
    """Stack per-example token lists into batched tensors.

    Each element of ``batch`` must provide 'input_ids', 'attention_mask' and
    'labels' sequences; sequences stored under the same key must share one
    length so that they can be stacked.

    Returns:
        Dict with the same three keys, each holding one stacked tensor.
    """
    collated = {}
    for field in ('input_ids', 'attention_mask', 'labels'):
        rows = [torch.tensor(example[field]) for example in batch]
        collated[field] = torch.stack(rows)
    return collated

# Prepare the DataLoader
train_dataset = tokenized_datasets['train']
val_dataset = tokenized_datasets['validation']

# Only the training loader shuffles; both rely on collate_fn to turn the
# stored token lists into tensors.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=4, collate_fn=collate_fn)

# Define the model
model = T5ForConditionalGeneration.from_pretrained('t5-small')
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Define the optimizer
# NOTE(review): this AdamW is the one imported from transformers at the top of
# the cell; it is reported as deprecated in favour of torch.optim.AdamW -
# confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Define the training function
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over ``dataloader``.

    The model is called with ``input_ids``, ``attention_mask`` and ``labels``
    keyword arguments and must return an object exposing the loss as
    ``.loss``. One optimizer step is taken per batch.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Sized iterable of dicts holding the three tensors.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        tensors = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        step_loss = model(**tensors).loss
        step_loss.backward()
        optimizer.step()

        batch_losses.append(step_loss.item())

    return sum(batch_losses) / len(dataloader)

# Define the evaluation function
def evaluate(model, dataloader, device):
    """Compute the mean loss over ``dataloader`` without updating the model.

    The model is put in evaluation mode and called under ``torch.no_grad()``
    with ``input_ids``, ``attention_mask`` and ``labels``; the returned object
    must expose the loss as ``.loss``.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    running = 0

    with torch.no_grad():
        for batch in dataloader:
            ids, mask, targets = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            )
            result = model(input_ids=ids, attention_mask=mask, labels=targets)
            running += result.loss.item()

    return running / len(dataloader)

# Define the summary generation function
def generate_summary(model, tokenizer, text, device, max_input_len=512, max_output_len=150):
    """Summarise ``text`` with beam search and return the decoded string.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        tokenizer: Tokenizer exposing ``encode_plus`` and ``decode``.
        text: Input document.
        device: Device the encoded tensors are moved to.
        max_input_len: Length the input is padded/truncated to.
        max_output_len: ``max_length`` passed to ``generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    model.eval()
    encoded = tokenizer.encode_plus(
        text,
        max_length=max_input_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    generation_kwargs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
        'max_length': max_output_len,
        'num_beams': 4,
        'length_penalty': 2.0,
        'early_stopping': True,
    }

    with torch.no_grad():
        generated = model.generate(**generation_kwargs)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Training loop
for epoch in range(3):  # Number of epochs
    # Progress marker printed before each (long) training pass starts.
    print("Training")
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    val_loss = evaluate(model, val_dataloader, device)
    print(f'Epoch {epoch + 1} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

# Generate summary for a short sample text with the fine-tuned model.
test_text = "This is a long text that needs summarizing. It covers multiple aspects and requires a concise summary."
summary = generate_summary(model, tokenizer, test_text, device)
print(f"Summary: {summary}")


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Traingin


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
!pip install datasets
from transformers import BertTokenizer, BertModel, AdamW
from datasets import load_dataset
from sklearn.model_selection import train_test_split

# Load the CNN/Daily Mail dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')

# Initialize the tokenizer from 'bert-base-uncased', the same checkpoint name
# the classifier below is built on.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define the custom dataset
class SummarizationDataset(Dataset):
    """Map-style dataset pairing each text with an integer label.

    Items are tokenized on access: every text is padded/truncated to
    ``max_len`` tokens and returned with its label as a long tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text, label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer.encode_plus(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch axis of size one; flatten()
        # drops it so the DataLoader can stack the items itself.
        sample = {
            name: encoded[name].flatten()
            for name in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(label, dtype=torch.long)
        return sample

# Define the model
class BertSummarizer(nn.Module):
    """BERT encoder followed by a linear layer over its pooled output."""

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits computed from the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(encoded.pooler_output)

def preprocess_data(dataset, max_len):
    """Build sentence-level extractive labels and split them for training.

    Every article is split on '. '. A sentence is labelled 1 when its text
    occurs verbatim inside the article's highlights and 0 otherwise. Blank
    fragments are dropped.

    Args:
        dataset: Iterable of examples with 'article' and 'highlights' strings.
        max_len: Not used by this function; kept so existing calls keep
            working.

    Returns:
        The list produced by ``train_test_split(texts, labels, ...)`` with 10%
        held out and a fixed seed, i.e. in the order
        ``[train_texts, val_texts, train_labels, val_labels]``.
    """
    texts = []
    labels = []
    for example in dataset:
        summary = example['highlights']
        for sentence in example['article'].split('. '):
            # '' is a substring of every string, so blank fragments would all
            # be labelled positive although they carry no content.
            if not sentence.strip():
                continue
            texts.append(sentence)
            labels.append(1 if sentence in summary else 0)

    return train_test_split(texts, labels, test_size=0.1, random_state=42)

max_len = 128
# train_test_split returns its outputs grouped per input array:
# (train_texts, val_texts, train_labels, val_labels). Unpacking them in any
# other order would hand the validation texts to the dataset as labels.
texts, val_texts, labels, val_labels = preprocess_data(dataset['train'], max_len)

train_dataset = SummarizationDataset(texts, labels, tokenizer, max_len)
val_dataset = SummarizationDataset(val_texts, val_labels, tokenizer, max_len)

# Only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Two output classes: sentence kept (1) or dropped (0).
model = BertSummarizer('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

def train_epoch(model, dataloader, optimizer, device):
    """Train the classifier for one pass over ``dataloader``.

    The model is called positionally as ``model(input_ids, attention_mask)``
    and its output is scored against ``batch['labels']`` with cross-entropy.
    One optimizer step is taken per batch.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    loss_sum = 0

    for batch in dataloader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        optimizer.zero_grad()
        step_loss = loss_fn(model(ids, mask), targets)
        step_loss.backward()
        optimizer.step()

        loss_sum += step_loss.item()

    return loss_sum / len(dataloader)

def evaluate(model, dataloader, device):
    """Return the mean cross-entropy of the classifier over ``dataloader``.

    Runs in evaluation mode under ``torch.no_grad()``; the model is called
    positionally as ``model(input_ids, attention_mask)``.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.eval()
    loss_fn = nn.CrossEntropyLoss()
    loss_sum = 0

    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            loss_sum += loss_fn(model(ids, mask), targets).item()

    return loss_sum / len(dataloader)

# Run three epochs; after each one, print the mean training loss and the mean
# validation loss returned by train_epoch / evaluate.
for epoch in range(3):
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    val_loss = evaluate(model, val_dataloader, device)
    print(f'Epoch {epoch + 1} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

def extract_sentences(model, tokenizer, text, device, max_len=128):
    """Return the sentences of ``text`` that the classifier labels as class 1.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` and
            returning one row of class logits per sentence.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors for a list of sentences.
        text: Document to summarise; it is split into sentences on '. '.
        device: Device the encoded tensors are moved to.
        max_len: Maximum number of tokens kept per sentence.

    Returns:
        The selected sentences joined by single spaces, in document order
        (an empty string when nothing is selected).
    """
    model.eval()
    sentences = text.split('. ')
    # `padding` must be given exactly once (a repeated keyword argument is a
    # SyntaxError). Padding to the longest sentence of the batch is enough
    # here because the attention mask hides the padded positions.
    inputs = tokenizer(sentences, truncation=True, padding=True, max_length=max_len, return_tensors='pt')
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    # Softmax is monotonic, so argmax over the logits picks the same class as
    # argmax over the probabilities.
    predicted_labels = torch.argmax(logits, dim=1).tolist()

    return ' '.join(
        sentence
        for sentence, label in zip(sentences, predicted_labels)
        if label == 1
    )

# Smoke test: run extract_sentences on a three-sentence example and print
# whatever it selects.
test_text = "This is an important sentence. This sentence is not important. Another important sentence."
summary = extract_sentences(model, tokenizer, test_text, device)
print(f"Summary: {summary}")


In [None]:
!pip install transformers torch

from datasets import load_dataset

dataset = load_dataset("cnn_dailymail", "3.0.0")

# Print the first training example as a quick look at the record layout.
print(dataset['train'][0])
from transformers import PegasusTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded further down.
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of articles and highlights for seq2seq training.

    Args:
        examples: Batch mapping with 'article' and 'highlights' lists of
            strings, as handed over by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the articles with the highlight token ids
        stored under 'labels'. Nothing is padded here; padding is left to
        batch collation.
    """
    # Never ask for more tokens than the tokenizer declares its model accepts.
    # NOTE(review): 'google/pegasus-xsum' presumably supports 512 positions,
    # which would make 1024-token inputs fail inside the model - confirm.
    max_source_len = min(1024, tokenizer.model_max_length)
    model_inputs = tokenizer(examples['article'], max_length=max_source_len, truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['highlights'], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)
from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments
# `predict_with_generate` and label-aware padding are provided by the seq2seq
# variants of the trainer classes, not by the plain Trainer/TrainingArguments.
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

model = PegasusForConditionalGeneration.from_pretrained(model_name)

# Seq2SeqTrainingArguments accepts everything TrainingArguments does plus
# `predict_with_generate`, which plain TrainingArguments rejects.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

# tokenize_function does not pad, so every batch has to be padded at collation
# time. This collator also pads the labels with -100 so that padding is
# ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
)

trainer.train()
# Save the fine-tuned weights and the tokenizer files side by side so the
# directory can be loaded again with from_pretrained.
model.save_pretrained("path_to_save_model")
tokenizer.save_pretrained("path_to_save_model")



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PointerGeneratorNetwork(nn.Module):
    """LSTM encoder-decoder with dot-product attention and a generation gate.

    ``forward`` returns the vocabulary distribution together with the gate
    ``p_gen``. It does not build a copy distribution over the source tokens;
    combining the two is left to the caller.

    Args:
        vocab_size: Number of entries in the shared embedding table and in the
            output distribution.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Width of the encoder and decoder LSTM states.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.decoder_lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Projects [decoder state; context] (2 * hidden_dim wide) back to
        # hidden_dim so that it matches the input width of vocab_projection.
        self.attention = nn.Linear(hidden_dim * 2, hidden_dim)
        self.vocab_projection = nn.Linear(hidden_dim, vocab_size)
        # The gate reads the same [decoder state; context] vector, so its
        # input width is 2 * hidden_dim as well.
        self.p_gen_linear = nn.Linear(hidden_dim * 2, 1)

    def forward(self, src, tgt):
        """Decode ``tgt`` conditioned on ``src``.

        Args:
            src: Source token ids, batch first: (batch, src_len).
            tgt: Decoder input token ids, batch first: (batch, tgt_len).

        Returns:
            Tuple ``(vocab_dist, p_gen)``: probabilities over the vocabulary
            of shape (batch, tgt_len, vocab_size) and the sigmoid gate of
            shape (batch, tgt_len, 1).
        """
        src_embedded = self.embedding(src)
        tgt_embedded = self.embedding(tgt)

        # The decoder starts from the encoder's final hidden and cell state.
        encoder_outputs, (h, c) = self.encoder_lstm(src_embedded)
        decoder_outputs, _ = self.decoder_lstm(tgt_embedded, (h, c))

        # Dot-product attention of every decoder step over all encoder steps.
        attn_scores = torch.bmm(decoder_outputs, encoder_outputs.transpose(1, 2))
        attn_weights = F.softmax(attn_scores, dim=-1)
        context_vector = torch.bmm(attn_weights, encoder_outputs)

        concat_output = torch.cat((decoder_outputs, context_vector), dim=-1)

        p_gen = torch.sigmoid(self.p_gen_linear(concat_output))

        # Reduce the concatenation to hidden_dim before projecting onto the
        # vocabulary; feeding it to vocab_projection directly would not match
        # that layer's input width.
        attn_hidden = torch.tanh(self.attention(concat_output))
        vocab_dist = F.softmax(self.vocab_projection(attn_hidden), dim=-1)

        return vocab_dist, p_gen
def train_model(model, train_loader, criterion, optimizer, num_epochs):
    """Train ``model`` with teacher forcing and report the mean loss per epoch.

    Args:
        model: Called as ``model(src, decoder_input)``; must return a pair
            whose first element holds probabilities over the vocabulary in
            its last dimension, shaped (batch, steps, vocab).
        train_loader: Iterable of ``(src, tgt)`` batches of token ids, batch
            first. Every ``tgt`` needs at least two positions because it is
            split into decoder inputs and shifted targets.
        criterion: Loss taking ``(N, vocab)`` scores and ``(N,)`` class ids,
            e.g. ``nn.CrossEntropyLoss`` or ``nn.NLLLoss``.
        optimizer: Optimizer bound to the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        List with the mean batch loss of every epoch (``nan`` for an epoch in
        which the loader produced no batch).
    """
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        num_batches = 0
        for src, tgt in train_loader:
            # Teacher forcing: the decoder reads tokens [0, T-1) and is scored
            # on tokens [1, T), so it never sees the token it has to predict.
            decoder_input = tgt[:, :-1]
            target = tgt[:, 1:]

            optimizer.zero_grad()
            vocab_dist, _ = model(src, decoder_input)

            # The model returns probabilities. Log-softmax leaves the log of a
            # normalised distribution unchanged, so these log-probabilities
            # are valid input for CrossEntropyLoss and NLLLoss alike.
            log_probs = torch.log(vocab_dist.clamp_min(1e-12))
            loss = criterion(
                log_probs.reshape(-1, log_probs.size(-1)),
                target.reshape(-1),
            )
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Report the epoch average instead of whatever the last batch scored;
        # this also avoids reading an unbound `loss` on an empty loader.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch + 1}, Loss: {mean_loss}")

    return epoch_losses

# Model sizes passed to the PointerGeneratorNetwork constructor.
vocab_size = 50000
embedding_dim = 256
hidden_dim = 512
model = PointerGeneratorNetwork(vocab_size, embedding_dim, hidden_dim)
# Loss and optimizer objects of the kind train_model takes as arguments;
# train_model itself is not called in this cell.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
!pip install transformers datasets rouge-score torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, load_metric

# PEGASUS: tokenizer and model come from the same checkpoint name.
pegasus_model_name = "google/pegasus-xsum"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(pegasus_model_name)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_model_name)

# T5: tokenizer and model come from the same checkpoint name.
t5_model_name = "t5-small"
t5_tokenizer = T5Tokenizer.from_pretrained(t5_model_name)
t5_model = T5ForConditionalGeneration.from_pretrained(t5_model_name)

# NOTE(review): bert_tokenizer / bert_model are not referenced again in the
# visible part of this cell - confirm whether they are still needed.
bert_model_name = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = BertForSequenceClassification.from_pretrained(bert_model_name)
dataset = load_dataset("cnn_dailymail", "3.0.0")
# NOTE(review): load_metric is reported as deprecated/removed in recent
# `datasets` releases - confirm against the installed version.
metric = load_metric("rouge")

def preprocess_function(examples, tokenizer, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of articles and highlights with the given tokenizer.

    Args:
        examples: Batch mapping with 'article' and 'highlights' lists of
            strings.
        tokenizer: Callable tokenizer supporting the ``text_target`` keyword.
        max_input_length: Upper bound on article tokens. It is additionally
            capped at ``tokenizer.model_max_length`` when the tokenizer
            declares one, so inputs never exceed the declared model limit.
        max_target_length: Upper bound on highlight tokens.

    Returns:
        The tokenizer output for the articles with the highlight token ids
        stored under 'labels'. Nothing is padded here.
    """
    input_limit = min(
        max_input_length,
        getattr(tokenizer, "model_max_length", max_input_length),
    )
    model_inputs = tokenizer(examples['article'], max_length=input_limit, truncation=True)

    # `text_target` tokenizes the summaries as targets; it replaces the
    # deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples['highlights'], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset once per model, each time with that model's own tokenizer.
tokenized_datasets_pegasus = dataset.map(lambda x: preprocess_function(x, pegasus_tokenizer), batched=True)
tokenized_datasets_t5 = dataset.map(lambda x: preprocess_function(x, t5_tokenizer), batched=True)
def compute_metrics(eval_preds, tokenizer=None):
    """Compute ROUGE F-measures (in percent) for a set of predictions.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair as handed over by the
            trainer. Predictions may be generated token ids or raw logits
            (optionally wrapped in a tuple); logits are reduced with argmax.
        tokenizer: Tokenizer used to decode both predictions and labels.
            Defaults to ``pegasus_tokenizer`` so one-argument calls keep
            working.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure times 100.
    """
    import numpy as np

    if tokenizer is None:
        tokenizer = pegasus_tokenizer

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    if preds.ndim == 3:
        # (batch, steps, vocab) logits -> most likely token id per step.
        preds = preds.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 marks ignored positions and is not a valid token id; swap it for
    # the pad id so batch_decode can handle the arrays.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {key: value.mid.fmeasure * 100 for key, value in result.items()}

from functools import partial
# Generation-based evaluation and label-aware padding are provided by the
# seq2seq variants of the trainer classes.
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Settings shared by both runs. predict_with_generate makes evaluation hand
# generated token ids (not logits) to compute_metrics.
common_training_kwargs = dict(
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
)

# One output directory per model so the runs do not rotate or overwrite each
# other's checkpoints.
training_args = Seq2SeqTrainingArguments(output_dir="./results", **common_training_kwargs)
t5_training_args = Seq2SeqTrainingArguments(output_dir="./results_t5", **common_training_kwargs)

pegasus_trainer = Seq2SeqTrainer(
    model=pegasus_model,
    args=training_args,
    train_dataset=tokenized_datasets_pegasus["train"],
    eval_dataset=tokenized_datasets_pegasus["validation"],
    tokenizer=pegasus_tokenizer,
    # The features are not padded by preprocess_function; this collator pads
    # inputs and pads labels with -100 so padding is ignored by the loss.
    data_collator=DataCollatorForSeq2Seq(pegasus_tokenizer, model=pegasus_model),
    compute_metrics=partial(compute_metrics, tokenizer=pegasus_tokenizer),
)

t5_trainer = Seq2SeqTrainer(
    model=t5_model,
    args=t5_training_args,
    train_dataset=tokenized_datasets_t5["train"],
    eval_dataset=tokenized_datasets_t5["validation"],
    tokenizer=t5_tokenizer,
    data_collator=DataCollatorForSeq2Seq(t5_tokenizer, model=t5_model),
    # T5 ids must be decoded with the T5 vocabulary, not the Pegasus one.
    compute_metrics=partial(compute_metrics, tokenizer=t5_tokenizer),
)
pegasus_trainer.train()

pegasus_results = pegasus_trainer.evaluate()

t5_trainer.train()

t5_results = t5_trainer.evaluate()
print("PEGASUS Results:", pegasus_results)
print("T5 Results:", t5_results)
def evaluate_pointer_generator(model, dataset, tokenizer, metric):
    """Generate a summary for every item of ``dataset`` and score it with ``metric``.

    Args:
        model: Object exposing ``eval()`` and ``generate(input_ids)``.
        dataset: Iterable of items with 'article' and 'highlights' entries.
        tokenizer: Callable tokenizer whose result exposes ``.input_ids`` and
            which provides ``batch_decode``.
        metric: Object whose ``compute`` returns a mapping of scores exposing
            ``.mid.fmeasure``.

    Returns:
        Dict mapping each metric key to its mid F-measure times 100.
    """
    model.eval()
    predictions, references = [], []
    for batch in dataset:
        encoded_inputs = tokenizer(batch['article'], return_tensors='pt', max_length=1024, truncation=True, padding=True)
        encoded_targets = tokenizer(batch['highlights'], return_tensors='pt', max_length=128, truncation=True, padding=True)

        with torch.no_grad():
            generated = model.generate(encoded_inputs.input_ids)

        # References go through the same encode/decode round trip as the
        # predictions before they are compared.
        predictions += tokenizer.batch_decode(generated, skip_special_tokens=True)
        references += tokenizer.batch_decode(encoded_targets.input_ids, skip_special_tokens=True)

    scores = metric.compute(predictions=predictions, references=references, use_stemmer=True)
    percentages = {}
    for name, score in scores.items():
        percentages[name] = score.mid.fmeasure * 100
    return percentages

# NOTE(review): `pointer_generator_model` is not defined anywhere in the
# visible source, and `tokenized_datasets` / `tokenizer` are only assigned in
# other cells - confirm these names exist before running this cell.
pointer_generator_results = evaluate_pointer_generator(pointer_generator_model, tokenized_datasets["validation"], tokenizer, metric)

print("Pointer-Generator Results:", pointer_generator_results)
