In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import time
from collections import Counter
import spacy
from tqdm import tqdm
import os
import gc

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
def optimize_memory():
    """Apply memory optimization techniques for Kaggle"""
    # Set PyTorch to release memory when no longer needed
    torch.cuda.empty_cache()

    # Set environment variables for better memory management
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

    # Check available GPU memory
    if torch.cuda.is_available():
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
        print(f"Available GPU memory: {torch.cuda.memory_reserved(0) / 1e9:.2f} GB")

optimize_memory()


Total GPU memory: 15.83 GB
Available GPU memory: 0.00 GB


In [4]:
try:
    from rouge import Rouge
except ImportError:
    !pip install rouge
    from rouge import Rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [5]:
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED) if torch.cuda.is_available() else None
np.random.seed(SEED)
random.seed(SEED)

In [6]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [7]:


# Set seeds for reproducibility
SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED) if torch.cuda.is_available() else None
np.random.seed(SEED)
random.seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Load spaCy for sentence tokenization
try:
    nlp = spacy.load("en_core_web_sm")
except:
    import subprocess
    subprocess.call("python -m spacy download en_core_web_sm", shell=True)
    nlp = spacy.load("en_core_web_sm")

# Initialize NLP tools
stop_words = set(stopwords.words('english'))
minimal_stopwords = set(['the', 'and', 'a', 'of', 'to', 'in', 'that', 'it', 'with', 'for', 'on', 'at'])
lemmatizer = WordNetLemmatizer()

def improved_preprocess(text, lower_case=True, lemmatize=True, stopword_removal=True):
    """Improved preprocessing function that preserves more contextual information"""
    if lower_case:
        text = text.lower()

    # Remove non-alphanumeric characters except for basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s\.\,\?\!]', '', text)

    # Tokenize
    tokens = nltk.word_tokenize(text)

    # Remove stopwords if specified
    if stopword_removal:
        tokens = [word for word in tokens if word not in minimal_stopwords]

    # Lemmatize if specified
    if lemmatize:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into text
    processed_text = " ".join(tokens)
    return processed_text

def tokenize(text):
    """Tokenize text to words"""
    return nltk.word_tokenize(text.lower())

def build_vocab(sentences, min_freq_ratio=0.01):
    """Build vocabulary from sentences with minimum frequency threshold"""
    all_tokens = [token for sent in sentences for token in tokenize(sent)]
    total = len(all_tokens)
    counter = Counter(all_tokens)

    # Calculate minimum count threshold
    min_count = max(1, int(total * min_freq_ratio))

    # Initialize vocabulary with special tokens
    vocab = {"<pad>": 0, "<unk>": 1, "<bos>": 2, "<eos>": 3}
    idx = 4

    # Add words that meet the frequency threshold
    for word, count in counter.items():
        if count >= min_count:
            vocab[word] = idx
            idx += 1

    print(f"Vocabulary size: {len(vocab)}")
    print(f"Min count threshold: {min_count}")
    return vocab

class WikiTitleDataset(Dataset):
    def __init__(self, df, vocab, max_length_text=512, max_length_title=30):
        self.df = df
        self.vocab = vocab
        self.max_length_text = max_length_text
        self.max_length_title = max_length_title

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['text']
        title = self.df.iloc[idx]['title']

        # Convert text to indices
        text_tokens = tokenize(text)[:self.max_length_text]
        text_indices = [self.vocab.get(token, self.vocab['<unk>']) for token in text_tokens]

        # Convert title to indices
        title_tokens = tokenize(title)[:self.max_length_title-2]  # -2 for <bos> and <eos>
        title_indices = [self.vocab['<bos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in title_tokens] + [self.vocab['<eos>']]

        return {
            'text': torch.tensor(text_indices, dtype=torch.long),
            'title': torch.tensor(title_indices, dtype=torch.long),
            'raw_text': text,
            'raw_title': title
        }

def collate_fn(batch):
    """Custom collate function for DataLoader"""
    # Sort batch by text length in descending order for packed sequences
    batch = sorted(batch, key=lambda x: len(x['text']), reverse=True)

    text_lengths = [len(item['text']) for item in batch]
    title_lengths = [len(item['title']) for item in batch]

    # Pad sequences
    padded_texts = torch.nn.utils.rnn.pad_sequence([item['text'] for item in batch], padding_value=0)
    padded_titles = torch.nn.utils.rnn.pad_sequence([item['title'] for item in batch], padding_value=0)

    # Keep raw texts and titles
    raw_texts = [item['raw_text'] for item in batch]
    raw_titles = [item['raw_title'] for item in batch]

    return {
        'text': padded_texts,
        'title': padded_titles,
        'text_lengths': torch.tensor(text_lengths),
        'title_lengths': torch.tensor(title_lengths),
        'raw_text': raw_texts,
        'raw_title': raw_titles
    }

class EncoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.3):
        super(EncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, bidirectional=True, batch_first=False)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, x, lengths=None):
        """
        Args:
            x: Input sequence tensor [seq_len, batch_size]
            lengths: Length of each sequence in the batch
        Returns:
            outputs: GRU outputs [seq_len, batch_size, hidden_dim * 2]
            hidden: Final hidden state [1, batch_size, hidden_dim]
        """
        embedded = self.dropout(self.embedding(x))

        if lengths is not None:
            packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu())
            outputs, hidden = self.gru(packed)
            outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        else:
            outputs, hidden = self.gru(embedded)

        # Concatenate bidirectional hidden states
        hidden = torch.cat((hidden[0], hidden[1]), dim=1)
        hidden = torch.tanh(self.fc(hidden))
        hidden = hidden.unsqueeze(0)

        return outputs, hidden

    def load_embeddings(self, pretrained_embeddings):
        """Load pretrained word embeddings"""
        self.embedding.weight.data.copy_(pretrained_embeddings)
        print("Loaded pretrained embeddings successfully!")

class HierEncoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.3):
        super(HierEncoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.word_gru = nn.GRU(embedding_dim, hidden_dim, bidirectional=True, batch_first=False)
        self.sent_gru = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, x, lengths=None, sentence_boundaries=None):
        """
        Args:
            x: Input sequence tensor [seq_len, batch_size]
            lengths: Length of each sequence in the batch
            sentence_boundaries: List of indices where sentences end for each batch item
                Format: List of lists, where each inner list contains sentence end indices
        Returns:
            outputs: GRU outputs [seq_len, batch_size, hidden_dim * 2]
            hidden: Final hidden state [1, batch_size, hidden_dim]
        """
        batch_size = x.shape[1]
        embedded = self.dropout(self.embedding(x))

        if lengths is None:
            seq_len = x.size(0)
            lengths = torch.full((batch_size,), seq_len, device=x.device)

        # Process at word level
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), enforce_sorted=False)
        word_outputs, word_hidden = self.word_gru(packed)
        word_outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(word_outputs)

        # If sentence boundaries not provided, create artificial ones
        if sentence_boundaries is None:
            sentence_boundaries = [list(range(20, min(length.item(), 500), 20)) for length in lengths]
            for i, length in enumerate(lengths):
                if length.item() not in sentence_boundaries[i]:
                    sentence_boundaries[i].append(length.item())

        # Process at sentence level
        sent_level_outputs = []
        for batch_idx in range(batch_size):
            sent_ends = sentence_boundaries[batch_idx]
            if not sent_ends:
                sent_avg = torch.mean(word_outputs[:lengths[batch_idx], batch_idx, :], dim=0)
                sent_level_outputs.append(sent_avg.unsqueeze(0))
            else:
                sent_reprs = []
                prev_end = 0
                for end in sent_ends:
                    if end > prev_end:  # Ensure we don't process empty sentences
                        sent_repr = torch.mean(word_outputs[prev_end:end, batch_idx, :], dim=0)
                        sent_reprs.append(sent_repr)
                    prev_end = end

                if sent_reprs:
                    batch_sent_reprs = torch.stack(sent_reprs)
                    sent_level_outputs.append(batch_sent_reprs)
                else:
                    sent_avg = torch.mean(word_outputs[:lengths[batch_idx], batch_idx, :], dim=0)
                    sent_level_outputs.append(sent_avg.unsqueeze(0))

        # Pad sentence representations to same length
        max_sent_count = max(output.size(0) for output in sent_level_outputs)
        sent_padded = []
        for output in sent_level_outputs:
            if output.size(0) < max_sent_count:
                padding = torch.zeros(max_sent_count - output.size(0), output.size(1), device=x.device)
                sent_padded.append(torch.cat([output, padding], dim=0))
            else:
                sent_padded.append(output)

        sent_batch = torch.stack(sent_padded, dim=1)
        sent_lengths = torch.tensor([output.size(0) for output in sent_level_outputs], device=x.device)

        # Process through sentence-level GRU
        sent_outputs, sent_hidden = self.sent_gru(sent_batch.transpose(0, 1))

        # Combine bidirectional hidden states
        hidden = torch.cat((sent_hidden[0], sent_hidden[1]), dim=1)
        hidden = torch.tanh(self.fc(hidden))
        hidden = hidden.unsqueeze(0)

        return word_outputs, hidden

    def load_embeddings(self, pretrained_embeddings):
        """Load pretrained word embeddings"""
        self.embedding.weight.data.copy_(pretrained_embeddings)
        print("Loaded pretrained embeddings successfully!")

class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.3):
        super(DecoderRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=False)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        """
        Args:
            x: Input token tensor [1, batch_size]
            hidden: Previous hidden state [1, batch_size, hidden_dim]
        Returns:
            outputs: Token probabilities [batch_size, vocab_size]
            hidden: Updated hidden state [1, batch_size, hidden_dim]
        """
        embedded = self.dropout(self.embedding(x))
        output, hidden = self.gru(embedded, hidden)
        output = self.fc(output.squeeze(0))  # [batch_size, vocab_size]
        output = F.log_softmax(output, dim=1)  # [batch_size, vocab_size]
        return output, hidden

class Decoder2RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.3):
        super(Decoder2RNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru1 = nn.GRU(embedding_dim, hidden_dim, batch_first=False)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim, batch_first=False)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        """
        Args:
            x: Input token tensor [1, batch_size]
            hidden: Previous hidden state [1, batch_size, hidden_dim]
        Returns:
            output: Token probabilities [batch_size, vocab_size]
            hidden: Updated hidden state from second GRU [1, batch_size, hidden_dim]
        """
        embedded = self.dropout(self.embedding(x))
        output1, hidden1 = self.gru1(embedded, hidden)
        output1 = self.dropout(output1)
        output2, hidden2 = self.gru2(output1, hidden)
        output = self.fc(output2.squeeze(0))
        output = F.log_softmax(output, dim=1)
        return output, hidden2

class Seq2seqRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, vocab=None,
                 encoder_type='basic', decoder_type='basic', use_pretrained=False,
                 embeddings_path=None):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab = vocab
        self.max_length = 30

        # Initialize encoder based on type
        if encoder_type == 'hierarchical':
            self.encoder = HierEncoderRNN(vocab_size, embedding_dim, hidden_dim)
        else:
            self.encoder = EncoderRNN(vocab_size, embedding_dim, hidden_dim)

        # Initialize decoder based on type
        if decoder_type == 'dual':
            self.decoder = Decoder2RNN(vocab_size, embedding_dim, hidden_dim)
        else:  # Default to basic decoder
            self.decoder = DecoderRNN(vocab_size, embedding_dim, hidden_dim)

        # Load pretrained embeddings if specified
        if use_pretrained and embeddings_path:
            self._load_pretrained_embeddings(embeddings_path)

    def _load_pretrained_embeddings(self, embeddings_path):
        """Load GloVe embeddings from file"""
        print(f"Loading pretrained embeddings from {embeddings_path}...")
        weights_matrix = torch.FloatTensor(self.vocab_size, self.embedding_dim).normal_(0, 0.1)

        word2vec = {}
        with open(embeddings_path, 'r', encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = torch.FloatTensor([float(val) for val in values[1:]])
                word2vec[word] = vector

        words_found = 0
        for word, idx in self.vocab.items():
            if word in word2vec:
                weights_matrix[idx] = word2vec[word]
                words_found += 1

        print(f"Found embeddings for {words_found}/{self.vocab_size} words")
        self.encoder.load_embeddings(weights_matrix)

    def forward(self, src, target=None, teacher_forcing_ratio=0.5, search_method='greedy', beam_size=3, lengths=None):
        """
        Args:
            src: Source sequence [seq_len, batch_size]
            target: Target sequence [seq_len, batch_size] (for training)
            teacher_forcing_ratio: Probability of using teacher forcing
            search_method: 'greedy' or 'beam' for decoding strategy
            beam_size: Number of beams to use in beam search
            lengths: Sequence lengths for packed sequences
        """
        batch_size = src.shape[1]

        # Encode input sequence
        encoder_outputs, hidden = self.encoder(src, lengths=lengths)

        # If target is provided (training mode)
        if target is not None:
            target_len = target.shape[0]
            outputs = torch.zeros(target_len, batch_size, self.vocab_size).to(src.device)

            # First input to the decoder is the <bos> token
            decoder_input = target[0, :].unsqueeze(0)  # [1, batch_size]

            # Teacher forcing: Feed the target as the next input
            for t in range(1, target_len):
                decoder_output, hidden = self.decoder(decoder_input, hidden)
                outputs[t] = decoder_output

                # Decide if we use teacher forcing or not
                use_teacher_force = random.random() < teacher_forcing_ratio

                # Get the highest predicted token
                top1 = decoder_output.argmax(1)

                # Use teacher forcing: use actual target token as next input
                # Otherwise: use predicted token
                decoder_input = target[t].unsqueeze(0) if use_teacher_force else top1.unsqueeze(0)

            return outputs

        # If no target is provided (inference mode)
        else:
            if search_method == 'beam':
                return self._beam_search_decode(hidden, batch_size, beam_size)
            else:
                return self._greedy_decode(hidden, batch_size)

    def _greedy_decode(self, hidden, batch_size):
        """Greedy decoding for inference"""
        # First input to the decoder is the <bos> token
        decoder_input = torch.tensor([[self.vocab['<bos>']] * batch_size], device=hidden.device)

        outputs = []
        finished = [False] * batch_size

        for t in range(self.max_length):
            decoder_output, hidden = self.decoder(decoder_input, hidden)

            # Get the highest predicted token
            top1 = decoder_output.argmax(1)
            outputs.append(top1)

            # Next input is the predicted token
            decoder_input = top1.unsqueeze(0)

            # Check if any sequences have reached <eos>
            for i in range(batch_size):
                if top1[i] == self.vocab['<eos>']:
                    finished[i] = True

            # If all sequences have reached <eos>, stop decoding
            if all(finished):
                break

        # If we have no outputs (very unlikely), add at least one token
        if not outputs:
            # Add a token (e.g., <unk>) to prevent empty output
            dummy_output = torch.tensor([self.vocab['<unk>']] * batch_size, device=hidden.device)
            outputs.append(dummy_output)

        return torch.stack(outputs)
    
    def _beam_search_decode(self, hidden, batch_size, beam_size=3):
        """
        Beam search decoding for inference
        Args:
            hidden: Initial hidden state from encoder [1, batch_size, hidden_dim]
            batch_size: Batch size
            beam_size: Number of beams to track
        Returns:
            outputs: Tensor of shape [seq_len, batch_size] with best sequences
        """
        # Process each batch item separately
        all_best_sequences = []
        device = hidden.device
    
        for b in range(batch_size):
            # Get hidden state for this batch item
            batch_hidden = hidden[:, b:b+1, :].clone()
            
            # First token is always <bos>
            start_token = torch.tensor([[self.vocab['<bos>']]], device=device)
            
            # Initialize beam with just the start token
            beams = [
                {
                    'sequence': [self.vocab['<bos>']],  # Use list for sequences
                    'score': 0.0,
                    'hidden': batch_hidden.clone(),
                    'finished': False
                }
                for _ in range(beam_size)
            ]
            
            # For the first timestep, all beams have the same start token and hidden state
            decoder_output, new_hidden = self.decoder(start_token, batch_hidden)
            
            # Get top-k tokens for the first step
            topk_probs, topk_indices = decoder_output.squeeze(0).topk(beam_size)
            
            # Initialize beams with top-k tokens from first step
            for i in range(beam_size):
                beams[i]['sequence'] = [self.vocab['<bos>'], topk_indices[i].item()]
                beams[i]['score'] = topk_probs[i].item()
                beams[i]['hidden'] = new_hidden.clone()
                beams[i]['finished'] = topk_indices[i].item() == self.vocab['<eos>']
                
            # For each additional timestep
            for t in range(1, self.max_length - 1):  # -1 because we already did one step
                # Check if all beams are finished
                if all(beam['finished'] for beam in beams):
                    break
                    
                # Collect candidates from all beams
                candidates = []
                
                # Process active beams
                for beam_idx, beam in enumerate(beams):
                    if beam['finished']:
                        # Keep finished beams in the candidates
                        candidates.append({
                            'sequence': beam['sequence'],
                            'score': beam['score'],
                            'hidden': beam['hidden'],
                            'finished': True,
                            'parent_beam': beam_idx
                        })
                    else:
                        # Continue this active beam
                        last_token = torch.tensor([[beam['sequence'][-1]]], device=device)
                        decoder_output, new_hidden = self.decoder(last_token, beam['hidden'])
                        
                        # Get top-k tokens for this beam
                        topk_probs, topk_indices = decoder_output.squeeze(0).topk(beam_size)
                        
                        # Calculate scores
                        for i in range(beam_size):
                            token = topk_indices[i].item()
                            new_score = beam['score'] + topk_probs[i].item()
                            
                            candidates.append({
                                'sequence': beam['sequence'] + [token],
                                'score': new_score,
                                'hidden': new_hidden.clone(),
                                'finished': token == self.vocab['<eos>'],
                                'parent_beam': beam_idx
                            })
                
                # Select top beams based on score
                candidates.sort(key=lambda x: x['score'], reverse=True)
                beams = candidates[:beam_size]
            
            # Select the best beam
            best_beam = max(beams, key=lambda x: x['score'])
            best_sequence = best_beam['sequence']
            
            # Remove start token and end token if present
            if best_sequence[0] == self.vocab['<bos>']:
                best_sequence = best_sequence[1:]
            if best_sequence and best_sequence[-1] == self.vocab['<eos>']:
                best_sequence = best_sequence[:-1]
                
            # Convert to tensor
            best_tensor = torch.tensor(best_sequence, device=device)
            all_best_sequences.append(best_tensor)
        
        # Pad sequences to same length
        max_len = max(seq.size(0) for seq in all_best_sequences) if all_best_sequences else 1
        padded_sequences = []
        
        for seq in all_best_sequences:
            if seq.size(0) < max_len:
                padding = torch.zeros(max_len - seq.size(0), device=device).long()
                padded_seq = torch.cat([seq, padding], dim=0)
            else:
                padded_seq = seq
            padded_sequences.append(padded_seq)
        
        # Stack and transpose to get [seq_len, batch_size]
        result = torch.stack(padded_sequences).transpose(0, 1)
        return result

Using device: cuda


In [8]:
import gc
import torch
from torch.cuda.amp import autocast, GradScaler

# Initialize gradient scaler for mixed precision training
scaler = GradScaler()

def train_epoch(model, dataloader, optimizer, criterion, clip=1.0, teacher_forcing_ratio=0.5):
    """Train the model for one epoch using mixed precision"""
    model.train()
    epoch_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        # Move data to device
        src = batch['text'].to(device)
        trg = batch['title'].to(device)
        text_lengths = batch['text_lengths']

        # Zero gradients
        optimizer.zero_grad()

        # Use mixed precision for memory efficiency
        with autocast():
            output = model(src, trg, teacher_forcing_ratio, lengths=text_lengths)

            # Make sure shapes are correct
            output = output[1:].reshape(-1, output.shape[-1])  # skip <bos> for output
            trg = trg[1:].reshape(-1)  # skip <bos> for target

            loss = criterion(output, trg)

        # Scaled backward pass
        scaler.scale(loss).backward()

        # Clip gradients
        scaler.unscale_(optimizer)  # Unscale before clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Step with scaler
        scaler.step(optimizer)
        scaler.update()

        epoch_loss += loss.item()

        # Free up unused memory
        torch.cuda.empty_cache()
        gc.collect()

    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, criterion):
    """Evaluate with proper handling of padding"""
    model.eval()
    epoch_loss = 0
    batch_count = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            src = batch['text'].to(device)
            trg = batch['title'].to(device)
            text_lengths = batch['text_lengths']

            # No teacher forcing
            output = model(src, trg, teacher_forcing_ratio=0.0, lengths=text_lengths)

            # Skip <bos> token in both output and trg
            output = output[1:].reshape(-1, output.shape[-1])
            trg = trg[1:].reshape(-1)

            # Calculate loss
            loss = criterion(output, trg)
            epoch_loss += loss.item()
            batch_count += 1

            # Free memory
            torch.cuda.empty_cache()
            gc.collect()

    torch.cuda.empty_cache()
    return epoch_loss / batch_count

def generate_titles(model, dataloader, vocab, search_method='greedy', beam_size=3):
    """Generate titles for the test set"""
    model.eval()
    idx_to_word = {idx: word for word, idx in vocab.items()}
    generated_titles = []
    reference_titles = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating titles"):
            src = batch['text'].to(device)
            text_lengths = batch['text_lengths']

            output = model(src, target=None, teacher_forcing_ratio=0,
                          search_method=search_method, beam_size=beam_size if search_method == 'beam' else None,
                          lengths=text_lengths)

            for i in range(output.shape[1]):
                pred_title = []
                for j in range(output.shape[0]):
                    idx = output[j, i].item()
                    if idx == 0 or idx == vocab['<eos>']:  # Stop at padding or EOS token
                        break
                    if idx in idx_to_word and idx != vocab['<bos>']:  # Skip BOS token
                        pred_title.append(idx_to_word[idx])

                # Format the title properly
                if pred_title:
                    # Capitalize first word
                    if pred_title[0]:
                        pred_title[0] = pred_title[0].capitalize()

                    # Capitalize proper nouns and other important words
                    for j in range(1, len(pred_title)):
                        if pred_title[j] not in minimal_stopwords and len(pred_title[j]) > 2:
                            pred_title[j] = pred_title[j].capitalize()
                else:
                    pred_title = ["Untitled"]  # Default title if nothing generated

                generated_titles.append(' '.join(pred_title))
                reference_titles.append(batch['raw_title'][i])

            # Free memory
            torch.cuda.empty_cache()
            gc.collect()

    return generated_titles, reference_titles


  scaler = GradScaler()


In [9]:
def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler=None,
               n_epochs=15, clip=1.0, teacher_forcing_ratio=0.5, patience=5, model_name='basic'):
    """Train the model with early stopping and learning rate scheduler"""
    best_val_loss = float('inf')
    patience_counter = 0
    train_losses = []
    val_losses = []

    for epoch in range(n_epochs):
        start_time = time.time()

        # Gradually decrease teacher forcing ratio
        current_tf_ratio = max(0.1, teacher_forcing_ratio * (1.0 - epoch/n_epochs))

        train_loss = train_epoch(model, train_loader, optimizer, criterion, clip, current_tf_ratio)
        val_loss = evaluate(model, val_loader, criterion)

        if scheduler is not None:
            scheduler.step(val_loss)

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        end_time = time.time()
        epoch_mins, epoch_secs = divmod(end_time - start_time, 60)

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs:.0f}s')
        print(f'\tTrain Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
        print(f'\tTeacher forcing ratio: {current_tf_ratio:.2f}')
        print(f'\tCurrent LR: {optimizer.param_groups[0]["lr"]:.6f}')

        # Save model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), f'best_model_{model_name}.pt')
            patience_counter = 0
            print("\tSaved new best model!")
        else:
            patience_counter += 1
            print(f"\tNo improvement: patience {patience_counter}/{patience}")

        # Early stopping
        if patience_counter >= patience:
            print("Early stopping triggered!")
            break

        # Free up memory
        torch.cuda.empty_cache()
        gc.collect()

    return train_losses, val_losses

def calculate_rouge(generated_titles, reference_titles):
    """Calculate ROUGE scores between generated and reference titles"""
    rouge = Rouge()
    valid_pairs = []

    for gen, ref in zip(generated_titles, reference_titles):
        # ROUGE requires non-empty strings
        if len(gen.strip()) == 0:
            gen = "untitled"
        if len(ref.strip()) == 0:
            ref = "untitled"
        valid_pairs.append((gen, ref))

    hyps, refs = zip(*valid_pairs)

    try:
        scores = rouge.get_scores(hyps, refs, avg=True)
        return scores
    except Exception as e:
        print(f"Error calculating ROUGE scores: {e}")
        # Handle specific ROUGE errors by fixing problematic pairs
        fixed_pairs = []
        for gen, ref in valid_pairs:
            # Ensure minimum length for ROUGE calculation
            if len(gen.split()) < 1:
                gen = "untitled"
            if len(ref.split()) < 1:
                ref = "untitled"
            fixed_pairs.append((gen, ref))

        if fixed_pairs:
            hyps, refs = zip(*fixed_pairs)
            try:
                scores = rouge.get_scores(hyps, refs, avg=True)
                return scores
            except:
                pass

        # Return default scores in case of error
        return {
            'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
            'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
            'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}
        }

def load_glove_embeddings(embeddings_path, vocab, embedding_dim=300):
    """
    Load GloVe embeddings for the vocabulary with memory optimization
    Args:
        embeddings_path: Path to the GloVe embeddings file
        vocab: Dictionary mapping words to indices
        embedding_dim: Dimension of embeddings
    Returns:
        weights_matrix: Tensor of shape [vocab_size, embedding_dim]
    """
    print(f"Loading GloVe embeddings from {embeddings_path}...")
    weights_matrix = torch.FloatTensor(len(vocab), embedding_dim).normal_(0, 0.1)

    # Process the GloVe file in chunks to save memory
    word2vec = {}
    words_in_vocab = set(vocab.keys())

    with open(embeddings_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading GloVe file"):
            values = line.split()
            word = values[0]

            # Only store embeddings for words in our vocabulary
            if word in words_in_vocab:
                vector = torch.FloatTensor([float(val) for val in values[1:]])
                word2vec[word] = vector

            # Periodically clear memory
            if len(word2vec) % 50000 == 0:
                gc.collect()

    words_found = 0
    for word, idx in tqdm(vocab.items(), desc="Mapping to vocabulary"):
        if word in word2vec:
            weights_matrix[idx] = word2vec[word]
            words_found += 1

    print(f"Found embeddings for {words_found}/{len(vocab)} words")

    # Clear memory
    del word2vec
    gc.collect()

    return weights_matrix



In [11]:
print("Loading data...")
train_df = pd.read_csv('/kaggle/input/wiki-dataset/train.csv')
val_df = train_df.sample(n=500, random_state=42)
train_df = train_df.drop(val_df.index)
test_df = pd.read_csv('/kaggle/input/wiki-dataset/test.csv')

Loading data...


In [13]:
print("Preprocessing data...")
train_df['text'] = train_df['text'].apply(lambda x: improved_preprocess(x, lower_case=True, stopword_removal=True))
val_df['text'] = val_df['text'].apply(lambda x: improved_preprocess(x, lower_case=True, stopword_removal=True))
test_df['text'] = test_df['text'].apply(lambda x: improved_preprocess(x, lower_case=True, stopword_removal=True))

Preprocessing data...


In [14]:
train_df['original_title'] = train_df['title']
val_df['original_title'] = val_df['title']
test_df['original_title'] = test_df['title']
train_df['title'] = train_df['title'].apply(lambda x: improved_preprocess(x, lower_case=True, stopword_removal=False))
val_df['title'] = val_df['title'].apply(lambda x: improved_preprocess(x, lower_case=True, stopword_removal=False))

In [15]:
print("Building vocabulary...")
all_texts = list(train_df['text']) + list(train_df['title'])
vocab = build_vocab(all_texts, min_freq_ratio=0.0000007)  # Using 1% threshold as specified

Building vocabulary...
Vocabulary size: 46040
Min count threshold: 19


In [16]:
# Create datasets
train_dataset = WikiTitleDataset(train_df, vocab, max_length_text=512, max_length_title=30)
val_dataset = WikiTitleDataset(val_df, vocab, max_length_text=512, max_length_title=30)
test_dataset = WikiTitleDataset(test_df, vocab, max_length_text=512, max_length_title=30)

batch_size = 16  # Adjust based on your GPU memory

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)  # Smaller batch for testing


In [17]:
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 300

# Path to GloVe embeddings
glove_path = '/kaggle/input/wiki-dataset/glove.6B.300d.txt'

In [22]:
# Set batch size for basic model
torch.cuda.empty_cache()
batch_size = 32

# Create datasets
train_dataset = WikiTitleDataset(train_df, vocab, max_length_text=512, max_length_title=10)
val_dataset = WikiTitleDataset(val_df, vocab, max_length_text=512, max_length_title=10)
test_dataset = WikiTitleDataset(test_df, vocab, max_length_text=512, max_length_title=10)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)

# Train BASIC model
print(f"\n{'='*50}\nTraining basic model\n{'='*50}")

model_basic = Seq2seqRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab=vocab,
    encoder_type='basic',
    decoder_type='basic',
    use_pretrained=False,
    embeddings_path=None
).to(device)

torch.cuda.empty_cache()
gc.collect()

optimizer = torch.optim.Adam(model_basic.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
criterion = nn.CrossEntropyLoss(ignore_index=0)

train_losses, val_losses = train_model(
    model_basic, train_loader, val_loader, optimizer, criterion,
    scheduler=scheduler, n_epochs=10, clip=1.0,
    teacher_forcing_ratio=0.7, patience=3,
    model_name='basic'
)

model_basic.load_state_dict(torch.load('best_model_basic.pt'))

generated_titles, reference_titles = generate_titles(
    model_basic, test_loader, vocab, search_method='greedy'
)

print("\nCalculating ROUGE scores...")
rouge_scores = calculate_rouge(generated_titles, reference_titles)

print("\nROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: {rouge_scores['rouge-l']['f']:.4f}")

print("\nExample predictions:")
for i in range(min(5, len(generated_titles))):
    print(f"Reference: {reference_titles[i]}")
    print(f"Generated: {generated_titles[i]}")
    print("---")

del model_basic
torch.cuda.empty_cache()
gc.collect()



Training basic model


  with autocast():
Training: 100%|██████████| 419/419 [04:41<00:00,  1.49it/s]
Evaluating: 100%|██████████| 16/16 [00:09<00:00,  1.60it/s]


Epoch: 01 | Time: 4.0m 52s
	Train Loss: 5.5365 | Val Loss: 4.3067
	Teacher forcing ratio: 0.70
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 419/419 [04:43<00:00,  1.48it/s]
Evaluating: 100%|██████████| 16/16 [00:09<00:00,  1.61it/s]


Epoch: 02 | Time: 4.0m 54s
	Train Loss: 3.9983 | Val Loss: 4.0656
	Teacher forcing ratio: 0.63
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 419/419 [04:44<00:00,  1.47it/s]
Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.53it/s]


Epoch: 03 | Time: 4.0m 55s
	Train Loss: 3.2199 | Val Loss: 3.9263
	Teacher forcing ratio: 0.56
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 419/419 [04:57<00:00,  1.41it/s]
Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.58it/s]


Epoch: 04 | Time: 5.0m 8s
	Train Loss: 2.5238 | Val Loss: 3.8256
	Teacher forcing ratio: 0.49
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 419/419 [04:46<00:00,  1.46it/s]
Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.57it/s]


Epoch: 05 | Time: 4.0m 57s
	Train Loss: 1.8814 | Val Loss: 3.8367
	Teacher forcing ratio: 0.42
	Current LR: 0.001000
	No improvement: patience 1/3


Training: 100%|██████████| 419/419 [04:43<00:00,  1.48it/s]
Evaluating: 100%|██████████| 16/16 [00:09<00:00,  1.61it/s]


Epoch: 06 | Time: 4.0m 54s
	Train Loss: 1.3110 | Val Loss: 3.8251
	Teacher forcing ratio: 0.35
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 419/419 [04:41<00:00,  1.49it/s]
Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.60it/s]


Epoch: 07 | Time: 4.0m 52s
	Train Loss: 0.8534 | Val Loss: 3.9088
	Teacher forcing ratio: 0.28
	Current LR: 0.001000
	No improvement: patience 1/3


Training: 100%|██████████| 419/419 [04:47<00:00,  1.46it/s]
Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.57it/s]


Epoch: 08 | Time: 4.0m 58s
	Train Loss: 0.5301 | Val Loss: 3.9039
	Teacher forcing ratio: 0.21
	Current LR: 0.001000
	No improvement: patience 2/3


Training: 100%|██████████| 419/419 [04:51<00:00,  1.44it/s]
Evaluating: 100%|██████████| 16/16 [00:10<00:00,  1.56it/s]
  model_basic.load_state_dict(torch.load('best_model_basic.pt'))


Epoch: 09 | Time: 5.0m 1s
	Train Loss: 0.3545 | Val Loss: 3.9319
	Teacher forcing ratio: 0.14
	Current LR: 0.000500
	No improvement: patience 3/3
Early stopping triggered!


Generating titles: 100%|██████████| 13/13 [00:05<00:00,  2.54it/s]



Calculating ROUGE scores...

ROUGE Scores:
ROUGE-1: 0.2393
ROUGE-2: 0.0535
ROUGE-L: 0.2393

Example predictions:
Reference: Weyburn
Generated: <unk>
---
Reference: Catholic High School, Singapore
Generated: Cyfair High High School
---
Reference: Minnesota Golden Gophers
Generated: La Fighting Fighting
---
Reference: List of people from Louisiana
Generated: List of People From Georgia
---
Reference: FC Shakhtar Donetsk
Generated: Fc Dynamo Moscow
---


768

In [21]:
batch_size = 16

# Re-create data loaders with smaller batch size
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)

# Train ALL IMPROVEMENTS model
print(f"\n{'='*50}\nTraining all_improvements model\n{'='*50}")

model_all = Seq2seqRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab=vocab,
    encoder_type='hierarchical',
    decoder_type='dual',
    use_pretrained=True,
    embeddings_path=glove_path
).to(device)

torch.cuda.empty_cache()
gc.collect()

# Update batch size to handle complex model


optimizer = torch.optim.Adam(model_all.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2, verbose=True)
criterion = nn.CrossEntropyLoss(ignore_index=0)

train_losses, val_losses = train_model(
    model_all, train_loader, val_loader, optimizer, criterion,
    scheduler=scheduler, n_epochs=10, clip=1.0,
    teacher_forcing_ratio=0.7, patience=3,
    model_name='all_improvements'
)


model_all.load_state_dict(torch.load('best_model_all_improvements.pt'))

generated_titles, reference_titles = generate_titles(
    model_all, test_loader, vocab, search_method='beam', beam_size=3
)

print("\nCalculating ROUGE scores...")
rouge_scores = calculate_rouge(generated_titles, reference_titles)

print("\nROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: {rouge_scores['rouge-l']['f']:.4f}")

print("\nExample predictions:")
for i in range(min(5, len(generated_titles))):
    print(f"Reference: {reference_titles[i]}")
    print(f"Generated: {generated_titles[i]}")
    print("---")

del model_all
torch.cuda.empty_cache()
gc.collect()


Training all_improvements model
Loading pretrained embeddings from /kaggle/input/wiki-dataset/glove.6B.300d.txt...
Found embeddings for 41488/46040 words
Loaded pretrained embeddings successfully!


  with autocast():
Training: 100%|██████████| 837/837 [08:04<00:00,  1.73it/s]
Evaluating: 100%|██████████| 32/32 [00:15<00:00,  2.12it/s]


Epoch: 01 | Time: 8.0m 19s
	Train Loss: 6.0074 | Val Loss: 5.3107
	Teacher forcing ratio: 0.70
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 837/837 [08:04<00:00,  1.73it/s]
Evaluating: 100%|██████████| 32/32 [00:15<00:00,  2.12it/s]


Epoch: 02 | Time: 8.0m 19s
	Train Loss: 4.7503 | Val Loss: 4.8399
	Teacher forcing ratio: 0.63
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 837/837 [08:05<00:00,  1.72it/s]
Evaluating: 100%|██████████| 32/32 [00:15<00:00,  2.05it/s]


Epoch: 03 | Time: 8.0m 21s
	Train Loss: 4.1926 | Val Loss: 4.5152
	Teacher forcing ratio: 0.56
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 837/837 [08:07<00:00,  1.72it/s]
Evaluating: 100%|██████████| 32/32 [00:16<00:00,  2.00it/s]


Epoch: 04 | Time: 8.0m 24s
	Train Loss: 3.7012 | Val Loss: 4.4633
	Teacher forcing ratio: 0.49
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 837/837 [08:23<00:00,  1.66it/s]
Evaluating: 100%|██████████| 32/32 [00:15<00:00,  2.02it/s]


Epoch: 05 | Time: 8.0m 39s
	Train Loss: 3.2546 | Val Loss: 4.3864
	Teacher forcing ratio: 0.42
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 837/837 [08:22<00:00,  1.67it/s]
Evaluating: 100%|██████████| 32/32 [00:15<00:00,  2.12it/s]


Epoch: 06 | Time: 8.0m 37s
	Train Loss: 2.8633 | Val Loss: 4.3496
	Teacher forcing ratio: 0.35
	Current LR: 0.001000
	Saved new best model!


Training: 100%|██████████| 837/837 [08:00<00:00,  1.74it/s]
Evaluating: 100%|██████████| 32/32 [00:15<00:00,  2.13it/s]


Epoch: 09 | Time: 8.0m 16s
	Train Loss: 1.9086 | Val Loss: 4.3983
	Teacher forcing ratio: 0.14
	Current LR: 0.001000
	No improvement: patience 1/3


Training: 100%|██████████| 837/837 [08:01<00:00,  1.74it/s]
Evaluating: 100%|██████████| 32/32 [00:14<00:00,  2.15it/s]
  model_all.load_state_dict(torch.load('best_model_all_improvements.pt'))


Epoch: 10 | Time: 8.0m 17s
	Train Loss: 1.6394 | Val Loss: 4.4103
	Teacher forcing ratio: 0.10
	Current LR: 0.001000
	No improvement: patience 2/3


Generating titles: 100%|██████████| 13/13 [00:05<00:00,  2.26it/s]



Calculating ROUGE scores...

ROUGE Scores:
ROUGE-1: 0.1291
ROUGE-2: 0.0202
ROUGE-L: 0.1291

Example predictions:
Reference: Weyburn
Generated: <unk>
---
Reference: Catholic High School, Singapore
Generated: St. High School
---
Reference: Minnesota Golden Gophers
Generated: Washington Stadium
---
Reference: List of people from Louisiana
Generated: List of People From Utah
---
Reference: FC Shakhtar Donetsk
Generated: Fc Basel
---


709

In [26]:
model_all = Seq2seqRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab=vocab,
    encoder_type='hierarchical',
    decoder_type='dual',
    use_pretrained=False,
    embeddings_path=glove_path
).to(device)
model_all.load_state_dict(torch.load('best_model_all_improvements.pt'))

generated_titles, reference_titles = generate_titles(
    model_all, test_loader, vocab, search_method='greedy', beam_size=3
)

print("\nCalculating ROUGE scores greedy + decoder2 + hierarchical encoder...")
rouge_scores = calculate_rouge(generated_titles, reference_titles)

print("\nROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: {rouge_scores['rouge-l']['f']:.4f}")

print("\nExample predictions:")
for i in range(min(5, len(generated_titles))):
    print(f"Reference: {reference_titles[i]}")
    print(f"Generated: {generated_titles[i]}")
    print("---")

del model_all
torch.cuda.empty_cache()
gc.collect()

  model_all.load_state_dict(torch.load('best_model_all_improvements.pt'))
Generating titles: 100%|██████████| 13/13 [00:05<00:00,  2.40it/s]



Calculating ROUGE scores greedy + decoder2 + hierarchical encoder...

ROUGE Scores:
ROUGE-1: 0.1330
ROUGE-2: 0.0238
ROUGE-L: 0.1330

Example predictions:
Reference: Weyburn
Generated: <unk>
---
Reference: Catholic High School, Singapore
Generated: St. High School
---
Reference: Minnesota Golden Gophers
Generated: Washington Stadium
---
Reference: List of people from Louisiana
Generated: List of People From Utah
---
Reference: FC Shakhtar Donetsk
Generated: Fc Basel
---


705

In [24]:
model_basic = Seq2seqRNN(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    vocab=vocab,
    encoder_type='basic',
    decoder_type='basic',
    use_pretrained=False,
    embeddings_path=None
).to(device)

model_basic.load_state_dict(torch.load('best_model_basic.pt'))

generated_titles, reference_titles = generate_titles(
    model_basic, test_loader, vocab, search_method='beam'
)

print("\nCalculating ROUGE scores...beam + basic rnn")
rouge_scores = calculate_rouge(generated_titles, reference_titles)

print("\nROUGE Scores:")
print(f"ROUGE-1: {rouge_scores['rouge-1']['f']:.4f}")
print(f"ROUGE-2: {rouge_scores['rouge-2']['f']:.4f}")
print(f"ROUGE-L: {rouge_scores['rouge-l']['f']:.4f}")

print("\nExample predictions:")
for i in range(min(5, len(generated_titles))):
    print(f"Reference: {reference_titles[i]}")
    print(f"Generated: {generated_titles[i]}")
    print("---")

del model_basic
torch.cuda.empty_cache()
gc.collect()


  model_basic.load_state_dict(torch.load('best_model_basic.pt'))
Generating titles: 100%|██████████| 13/13 [00:05<00:00,  2.30it/s]



Calculating ROUGE scores...beam + basic rnn

ROUGE Scores:
ROUGE-1: 0.2395
ROUGE-2: 0.0477
ROUGE-L: 0.2395

Example predictions:
Reference: Weyburn
Generated: <unk>
---
Reference: Catholic High School, Singapore
Generated: Cyfair High High School
---
Reference: Minnesota Golden Gophers
Generated: La Fighting
---
Reference: List of people from Louisiana
Generated: List of People People
---
Reference: FC Shakhtar Donetsk
Generated: Fc Dynamo Moscow
---


695