In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from rouge import Rouge
import os
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")
nltk.download("punkt_tab")

def get_device():
    if torch.cuda.is_available():
        print("Using GPU")
        return torch.device('cuda')
    else:
        print("Using CPU")
        return torch.device('cpu')

device = get_device()

print(f'Device: {device}')

Using CPU
Device: cpu


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vaibh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
class BiLSTMSummarizer(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(BiLSTMSummarizer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.encoder = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.decoder = nn.LSTM(embedding_dim, hidden_dim * 2, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.fc.out_features

        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src.device)

        embedded = self.embedding(src)
        enc_output, (hidden, cell) = self.encoder(embedded)

        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1).unsqueeze(0)
        cell = torch.cat((cell[-2, :, :], cell[-1, :, :]), dim=1).unsqueeze(0)

        input = trg[:, 0]

        for t in range(1, trg_len):
            input_embedded = self.embedding(input).unsqueeze(1)
            output, (hidden, cell) = self.decoder(input_embedded, (hidden, cell))
            prediction = self.fc(output.squeeze(1))
            outputs[:, t] = prediction

            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = prediction.argmax(1)
            input = trg[:, t] if teacher_force else top1

        return outputs

In [9]:
# Custom dataset class
class SummarizationDataset(Dataset):
    def __init__(self, articles, summaries, vocab, max_length=100):
        self.articles = articles
        self.summaries = summaries
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        summary = self.summaries[idx]

        article_indices = [self.vocab['<sos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in article][:self.max_length-2] + [self.vocab['<eos>']]
        summary_indices = [self.vocab['<sos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in summary][:self.max_length-2] + [self.vocab['<eos>']]

        article_indices = article_indices + [self.vocab['<pad>']] * (self.max_length - len(article_indices))
        summary_indices = summary_indices + [self.vocab['<pad>']] * (self.max_length - len(summary_indices))

        return torch.tensor(article_indices), torch.tensor(summary_indices)


In [10]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    articles = df['Content'].tolist()   # Use the 'Content' as the source article
    summaries = df['Headline'].tolist() # Use the 'Headline' as the target summary
    return articles, summaries

In [11]:
# Tokenize text
def tokenize(text):
    return word_tokenize(text.lower())

In [12]:
# Build vocabulary
def build_vocab(texts, min_freq=2):
    word_freq = Counter()
    for text in texts:
        word_freq.update(text)

    vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
    for word, freq in word_freq.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)

    return vocab, {v: k for k, v in vocab.items()}

In [14]:
# Load data
articles, summaries = load_data(r'hindi_news_dataset.csv')

In [15]:
# Tokenize data
tokenized_articles = [tokenize(article) for article in articles]
tokenized_summaries = [tokenize(summary) for summary in summaries]

In [16]:
# Build vocabulary
vocab, inv_vocab = build_vocab(tokenized_articles + tokenized_summaries)

In [17]:
# Split data
train_articles, test_articles, train_summaries, test_summaries = train_test_split(tokenized_articles, tokenized_summaries, test_size=0.2, random_state=42)
train_articles, val_articles, train_summaries, val_summaries = train_test_split(train_articles, train_summaries, test_size=0.1, random_state=42)

In [18]:
train_dataset = SummarizationDataset(train_articles, train_summaries, vocab)
val_dataset = SummarizationDataset(val_articles, val_summaries, vocab)
test_dataset = SummarizationDataset(test_articles, test_summaries, vocab)

In [19]:
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [13]:
model = BiLSTMSummarizer(len(vocab), embedding_dim=128, hidden_dim=256, output_dim=len(vocab)).to(device)

In [14]:
def train(model, iterator, optimizer, criterion, device, clip=1, teacher_forcing_ratio=0.5):
    model.train()
    epoch_loss = 0
    for batch in tqdm(iterator, desc="Training"):
        src, trg = batch
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg, teacher_forcing_ratio)

        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        trg = trg[:, 1:].reshape(-1)

        loss = criterion(output, trg)
        loss.backward()

        if isinstance(device, torch.device) and device.type == 'xla':
            xm.optimizer_step(optimizer, barrier=True)  # TPU-specific optimizer step
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
            optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [7]:
# Evaluation function
def evaluate(model, iterator, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for batch in tqdm(iterator, desc="Evaluating"):
            src, trg = batch
            src, trg = src.to(device), trg.to(device)

            output = model(src, trg, 0)  # turn off teacher forcing

            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            trg = trg[:, 1:].reshape(-1)

            loss = criterion(output, trg)
            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

In [None]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

In [17]:
num_epochs = 6
best_val_loss = float('inf')
for epoch in range(num_epochs):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss = evaluate(model, val_loader, criterion, device)
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {val_loss:.3f}')
    # Save model if validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save({'model_state_dict': model.state_dict(), 'vocab': vocab}, 'best_model.pth')
        print(f"Model saved to 'best_model.pth'")

Training: 100%|██████████| 4174/4174 [1:17:12<00:00,  1.11s/it]
Evaluating: 100%|██████████| 464/464 [04:24<00:00,  1.76it/s]


Epoch: 01
	Train Loss: 4.964
	 Val. Loss: 4.480
Model saved to 'best_model.pth'


Training: 100%|██████████| 4174/4174 [1:17:03<00:00,  1.11s/it]
Evaluating: 100%|██████████| 464/464 [04:22<00:00,  1.77it/s]


Epoch: 02
	Train Loss: 2.686
	 Val. Loss: 3.318
Model saved to 'best_model.pth'


Training: 100%|██████████| 4174/4174 [1:17:08<00:00,  1.11s/it]
Evaluating: 100%|██████████| 464/464 [04:22<00:00,  1.77it/s]


Epoch: 03
	Train Loss: 1.846
	 Val. Loss: 2.781
Model saved to 'best_model.pth'


Training: 100%|██████████| 4174/4174 [1:17:04<00:00,  1.11s/it]
Evaluating: 100%|██████████| 464/464 [04:22<00:00,  1.77it/s]


Epoch: 04
	Train Loss: 1.408
	 Val. Loss: 2.447
Model saved to 'best_model.pth'


Training: 100%|██████████| 4174/4174 [1:17:03<00:00,  1.11s/it]
Evaluating: 100%|██████████| 464/464 [04:22<00:00,  1.77it/s]


Epoch: 05
	Train Loss: 1.139
	 Val. Loss: 2.214
Model saved to 'best_model.pth'


Training: 100%|██████████| 4174/4174 [1:16:56<00:00,  1.11s/it]
Evaluating: 100%|██████████| 464/464 [04:21<00:00,  1.78it/s]


Epoch: 06
	Train Loss: 0.957
	 Val. Loss: 2.059
Model saved to 'best_model.pth'


In [20]:
# Load model function
def load_model(filepath, device):
    checkpoint = torch.load(filepath, map_location=device)
    vocab = checkpoint['vocab']
    model = BiLSTMSummarizer(len(vocab), embedding_dim=128, hidden_dim=256, output_dim=len(vocab)).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    return model, checkpoint

In [25]:
# Load the best model for testing
best_model, _ = load_model('best_model.pth', device)
test_loss = evaluate(best_model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.3f}')

  checkpoint = torch.load(filepath, map_location=device)
Evaluating:   0%|          | 3/1160 [00:18<1:56:41,  6.05s/it]


KeyboardInterrupt: 

In [24]:
optimizer = optim.Adam(best_model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])

In [28]:
def beam_search(
    model,
    src,
    vocab,
    inv_vocab,
    beam_width=3,
    max_length=100,
    min_length=10,
    device="cpu",
):
    model.eval()
    with torch.no_grad():
        # Embedding the input sequence
        embedded = model.embedding(src)  # shape: (batch_size, seq_len, embedding_dim)
        enc_output, (hidden, cell) = model.encoder(embedded)  # LSTM encoder output

        # In case of bi-directional LSTM, combine the hidden states
        if model.encoder.bidirectional:
            hidden = torch.cat(
                (hidden[-2, :, :], hidden[-1, :, :]), dim=1
            )  # shape: (batch_size, hidden_dim)
            cell = torch.cat(
                (cell[-2, :, :], cell[-1, :, :]), dim=1
            )  # shape: (batch_size, hidden_dim)
        else:
            hidden = hidden[-1, :, :]  # Take the last layer if not bi-directional
            cell = cell[-1, :, :]  # Take the last layer if not bi-directional

        # Now we process one sequence at a time, so set batch size to 1
        hidden = hidden.unsqueeze(0)  # shape: (1, batch_size, hidden_dim)
        cell = cell.unsqueeze(0)  # shape: (1, batch_size, hidden_dim)

        # Initialize the beam with the start-of-sequence token
        beam = [
            ([vocab["<sos>"]], 0, hidden[:, 0:1, :], cell[:, 0:1, :])
        ]  # Start with one sequence
        complete_hypotheses = []

        # Perform beam search
        for t in range(max_length):
            new_beam = []
            for seq, score, hidden, cell in beam:
                # If end-of-sequence token is reached and length is >= min_length, add to complete hypotheses
                if seq[-1] == vocab["<eos>"] and len(seq) >= min_length:
                    complete_hypotheses.append((seq, score))
                    continue

                # Prepare the input for the decoder (last predicted token)
                input = (
                    torch.LongTensor([seq[-1]]).unsqueeze(0).to(device)
                )  # shape: (1, 1)
                input_embedded = model.embedding(input)  # shape: (1, 1, embedding_dim)

                # Pass through the decoder
                output, (hidden, cell) = model.decoder(
                    input_embedded, (hidden, cell)
                )  # Decode step
                predictions = model.fc(
                    output.squeeze(1)
                )  # Linear layer to get vocab distribution

                # Get top beam_width predictions
                topk_scores, topk_indices = torch.topk(predictions, beam_width, dim=1)

                for i in range(beam_width):
                    next_seq = seq + [topk_indices[0, i].item()]
                    next_score = score + topk_scores[0, i].item()
                    new_beam.append((next_seq, next_score, hidden, cell))

            # Sort the beam by score and select the top candidates
            beam = sorted(new_beam, key=lambda x: x[1], reverse=True)[:beam_width]

        # If no complete hypotheses were found, return the highest scoring incomplete hypothesis
        if len(complete_hypotheses) == 0:
            complete_hypotheses = beam

        # Return the sequence with the highest score
        best_hypothesis = max(complete_hypotheses, key=lambda x: x[1])[0]
        return [
            inv_vocab[idx]
            for idx in best_hypothesis
            if idx not in [vocab["<sos>"], vocab["<eos>"], vocab["<pad>"]]
        ]

In [29]:
# Evaluate using ROUGE score
rouge = Rouge()
best_model.eval()
predictions = []
references = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Generating summaries"):
        src, trg = batch
        src = src.to(device)
        pred = beam_search(best_model, src, vocab, inv_vocab, min_length=10, device=device)
        predictions.extend([' '.join(pred)])
        references.extend([' '.join([inv_vocab[idx.item()] for idx in trg[0] if idx.item() not in [vocab['<sos>'], vocab['<eos>'], vocab['<pad>']]])])


Generating summaries:   0%|          | 1/1160 [00:01<21:30,  1.11s/it]


KeyError: 60441

In [None]:
min_length = 10
predictions = [' '.join(pred[:min_length]) for pred in predictions]
scores = rouge.get_scores(predictions, references, avg=True)
print("ROUGE scores:", scores)