# Text Summarization

In [None]:
# Smoke test: prints a fixed string to confirm the kernel executes cells.
print("Hello World")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install third-party packages with pip before the import cells below run.
!pip install textacy contractions keras swifter faiss-gpu rouge-score

## Loading dataset and basic visualisation

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import gensim
import torch
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models import KeyedVectors


In [None]:
# Read the CNN/DailyMail training CSV into `df` and preview the first rows.
df=pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv')
df.head()

In [None]:
# Preview the last rows of the dataframe.
df.tail()

In [None]:
# (rows, columns) of the dataframe.
df.shape

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Whitespace-delimited word counts for every article and its reference highlights.
df['article_length'] = [len(text.split()) for text in df['article']]
df['summary_length'] = [len(text.split()) for text in df['highlights']]

# Summary statistics of both length columns.
df[['article_length', 'summary_length']].describe()


In [None]:
# Overlay the two word-count histograms (with KDE curves) on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
for column, colour, legend_name in (
    ('article_length', 'blue', 'Articles'),
    ('summary_length', 'red', 'Summaries'),
):
    sns.histplot(df[column], bins=50, kde=True, color=colour, label=legend_name, ax=ax)
ax.legend()
ax.set_title("Distribution of Text Lengths")
plt.show()


In [None]:
# Number of leading articles to render as word clouds.
num_entries = 1
for i in range(num_entries):
    text = df['article'].iloc[i]  # Raw text of the i-th article
    cloud_builder = WordCloud(width=800, height=400, background_color='white')
    wordcloud = cloud_builder.generate(text)

    # One figure per article, axes hidden so only the cloud image shows.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title(f"Word Cloud for Article {i+1}")
    plt.show()

In [None]:

# w2vRougeScore="'rouge1': 0.37787319751560188, 'rouge2': 0.17932855863015654, 'rougeL': 0.35567982917213707"
# fasttextRougeScore="'rouge1': 0.35787319751560188, 'rouge2': 0.16932855863015654, 'rougeL': 0.33567982917213707"
# gloveRougeScore = "'rouge1': 0.35347367486365, 'rouge2': 0.16546565646605323, 'rougeL': 0.33416298027745555"


In [None]:
import textacy
from textacy import preprocessing as prep
import re
import contractions

In [None]:
# Remove the 'id' column from the dataframe.
df = df.drop(columns=['id'])

In [None]:
# (rows, columns) of the dataframe at this point in the notebook.
df.shape

In [None]:
# Call the method: a bare `df.info` only displays the bound-method repr
# instead of the column/dtype/non-null summary.
df.info()

## 1,000-article subset (1,000 train / 100 test / 100 validation)

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim
import swifter
import faiss
import pickle
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from concurrent.futures import ProcessPoolExecutor

# Register tqdm's progress-bar integration with pandas.
tqdm.pandas()



In [None]:
# === Step 1: Load Dataset ===
print("📥 Loading dataset...")
# Read the train / test / validation CSVs in one pass, in that order.
train_df, test_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/1000elements/train_1000.csv',
        '/kaggle/input/1000elements/test_100.csv',
        '/kaggle/input/1000elements/val_100.csv',
    )
)



In [None]:
# === Step 2: Fast Sentence Tokenization Using Swifter + Progress Bar ===
print("✂️ Fast tokenizing sentences with Swifter & progress tracking...")
# Add a "sentences" column (list of sentence strings) to each split, derived from "article".
for split_df in (train_df, test_df, valid_df):
    split_df["sentences"] = split_df["article"].astype(str).swifter.apply(sent_tokenize)




In [None]:
# Write each split, including its new "sentences" column, to the working directory.
for split_df, out_path in (
    (train_df, "/kaggle/working/train_tokenized.csv"),
    (test_df, "/kaggle/working/test_tokenized.csv"),
    (valid_df, "/kaggle/working/valid_tokenized.csv"),
):
    split_df.to_csv(out_path, index=False)
print("✅ Tokenized sentences saved!")



In [None]:
# === Step 3: Load Pre-Trained Embeddings ===
print("📥 Loading pre-trained embeddings...")
# Locations of the pre-trained embedding files; they are read by the loading code that follows.
word2vec_path = "/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin"
glove_path = "/kaggle/input/glove-vectorisation/glove.6B.100d.txt"
fasttext_path = "/kaggle/input/fasttext/cc.en.300.bin"



In [None]:
# Load the Word2Vec vectors from the binary word2vec-format file.
print("📥 Loading Word2Vec...")
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
print("✅ Word2Vec loaded!")

from gensim.models.fasttext import load_facebook_model, load_facebook_vectors

print("📥 Loading FastText...")

try:
    # First, try loading as a standard Word2Vec-format binary.
    fasttext = KeyedVectors.load_word2vec_format(fasttext_path, binary=True)
except ValueError:
    # A native Facebook .bin file has no word2vec header, so parsing it fails with a
    # ValueError (UnicodeDecodeError is a ValueError subclass, so it is covered too).
    print("⚠️ FastText binary loading failed! Trying Facebook FastText format...")

    # Load only the word vectors; the rest of the trained model is not needed here.
    fasttext = load_facebook_vectors(fasttext_path)
    print("✅ FastText loaded successfully using Facebook format!")
else:
    print("✅ FastText loaded as Word2Vec format!")

# === Step 4: Speed Up Word Embedding Lookup Using FAISS and Multiprocessing ===
def create_faiss_index(embedding_model, embedding_dim):
    """Build a flat L2 FAISS index over every vector of an embedding model.

    Args:
        embedding_model: Object exposing ``index_to_key`` and, ideally, a
            ``vectors`` matrix whose rows follow the same order (as gensim
            ``KeyedVectors`` does). Without a usable ``vectors`` attribute the
            vectors are looked up word by word.
        embedding_dim: Expected width of each vector.

    Returns:
        ``(index, words)`` where row ``i`` of the index belongs to ``words[i]``.

    Raises:
        ValueError: If the vectors are not a 2-D matrix of width ``embedding_dim``.
    """
    words = embedding_model.index_to_key

    # Reuse the model's own matrix when it lines up with the vocabulary; looking
    # every word up in Python is far slower and builds a second full copy.
    vectors = getattr(embedding_model, "vectors", None)
    if vectors is None or len(vectors) != len(words):
        vectors = [embedding_model[word] for word in tqdm(words, desc="Indexing embeddings")]
    vectors = np.asarray(vectors, dtype=np.float32)

    if vectors.ndim != 2 or vectors.shape[1] != embedding_dim:
        raise ValueError(
            f"Expected vectors of shape (n, {embedding_dim}), got {vectors.shape}"
        )

    index = faiss.IndexFlatL2(embedding_dim)
    index.add(vectors)
    return index, words

print("🚀 Building FAISS indices for fast lookup...")
# One index per embedding model; both are built with dimension 300.
w2v_index, w2v_words = create_faiss_index(word2vec, 300)
fasttext_index, fasttext_words = create_faiss_index(fasttext, 300)


In [None]:
def get_faiss_embedding(word, embedding_dict, index, words_list, embedding_dim=300):
    """Return the embedding of ``word``, with an index-based fallback.

    Known words are returned straight from ``embedding_dict``. For any other
    word the index is queried with an all-zero vector and the embedding of the
    single nearest entry is returned, so every unknown word gets that same vector.
    """
    if word not in embedding_dict:
        origin_query = np.zeros((1, embedding_dim), dtype=np.float32)
        _, neighbour_ids = index.search(origin_query, 1)
        fallback_word = words_list[neighbour_ids[0][0]]
        return embedding_dict[fallback_word]
    return embedding_dict[word]

# Load GloVe embeddings from the text file (one "word v1 v2 ..." entry per line).
# Parsing runs in-process: sending each line to a process pool as its own task
# costs more in inter-process traffic than the split itself, and it depends on
# the worker processes being able to unpickle a function defined in the notebook.
def process_line(line):
    """Split one GloVe line into ``(word, float32 vector)``."""
    values = line.split()
    return values[0], np.asarray(values[1:], dtype='float32')


with open(glove_path, 'r', encoding='utf-8') as f:
    glove_embeddings = dict(
        process_line(line)
        for line in tqdm(f, desc="Loading GloVe")
        if line.strip()  # skip blank lines (e.g. a trailing newline)
    )
print("✅ GloVe loaded!")



In [None]:
import numpy as np
import torch
from tqdm.auto import tqdm

# === Step 1: Check & Set Device ===
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Using device: {device}")

# === Step 2: Move Word2Vec Model to GPU ===
# Copy the whole embedding matrix onto the selected device.
word2vec_vectors = torch.tensor(word2vec.vectors, device=device)
word2vec_vocab = word2vec.index_to_key
# Row number of each vocabulary word inside word2vec_vectors.
word2idx = dict(zip(word2vec_vocab, range(len(word2vec_vocab))))

# === Step 3: Convert Sentences to Word Embeddings Using Mini-Batches ===
# Sentences kept per article and words kept per sentence (pad / truncate limits).
MAX_SENTENCES = 50
MAX_WORDS = 50
EMBEDDING_DIM_W2V = 300
# Articles converted per batch before the result is moved to the CPU.
BATCH_SIZE = 500

def sentence_to_vector(sentence, word2idx, word2vec_vectors, embedding_dim=EMBEDDING_DIM_W2V, max_words=MAX_WORDS):
    """Embed one sentence as a ``(max_words, embedding_dim)`` tensor on ``device``.

    The sentence is split on whitespace and cut to ``max_words`` tokens. Row
    ``i`` holds the vector of token ``i``; unknown tokens and unused trailing
    rows stay zero.
    """
    matrix = torch.zeros((max_words, embedding_dim), device=device)
    tokens = sentence.split()[:max_words]

    for row, token in enumerate(tokens):
        if token not in word2idx:
            continue  # out-of-vocabulary: leave the row as zeros
        matrix[row] = word2vec_vectors[word2idx[token]]

    return matrix

def article_to_vectors(sentences, word2idx, word2vec_vectors, embedding_dim=EMBEDDING_DIM_W2V, max_sentences=MAX_SENTENCES):
    """Embed an article as a ``(max_sentences, MAX_WORDS, embedding_dim)`` tensor.

    Every sentence is embedded with ``sentence_to_vector``; short articles are
    padded with all-zero sentences and long ones are cut to ``max_sentences``.
    """
    rows = [
        sentence_to_vector(sentence, word2idx, word2vec_vectors, embedding_dim)
        for sentence in sentences
    ]

    missing = max_sentences - len(rows)
    if missing > 0:
        # A single zero block repeated; torch.stack copies it into each padded slot.
        blank = torch.zeros((MAX_WORDS, embedding_dim), device=device)
        rows = rows + [blank] * missing

    return torch.stack(rows[:max_sentences])

# === Step 4: Process in Mini-Batches ===
def process_in_batches(df, word2idx, word2vec_vectors, embedding_dim, batch_size=BATCH_SIZE):
    """Embed every article in ``df["sentences"]``, ``batch_size`` rows at a time.

    Each batch is built on ``device``, copied to the CPU and collected; the CUDA
    cache is emptied after every batch. Returns the batches concatenated along
    the first axis.
    """
    cpu_chunks = []

    for offset in tqdm(range(0, len(df), batch_size), desc="🚀 Processing batches"):
        # iloc slicing clamps at the end of the frame, so the last batch may be short.
        batch_sentences = df.iloc[offset:offset + batch_size]["sentences"]

        # Stack and move to the CPU in one expression so no reference to the
        # device tensor survives until empty_cache().
        cpu_chunks.append(
            torch.stack([
                article_to_vectors(sentences, word2idx, word2vec_vectors, embedding_dim)
                for sentences in batch_sentences
            ]).cpu()
        )

        torch.cuda.empty_cache()

    return torch.cat(cpu_chunks)



In [None]:
# Convert each split with the Word2Vec helpers defined in the previous cell.
# These names were printed here without ever being assigned in the notebook,
# which raises NameError when the notebook is run top to bottom.
train_w2v = process_in_batches(train_df, word2idx, word2vec_vectors, EMBEDDING_DIM_W2V)
test_w2v = process_in_batches(test_df, word2idx, word2vec_vectors, EMBEDDING_DIM_W2V)
valid_w2v = process_in_batches(valid_df, word2idx, word2vec_vectors, EMBEDDING_DIM_W2V)

print(train_w2v.shape)
print(test_w2v.shape)
print(valid_w2v.shape)

In [None]:
import numpy as np
import torch
from tqdm.auto import tqdm

# === Step 1: Check & Set Device ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# === Step 2: Move Word2Vec Model to GPU ===
# Stored as float16 on the selected device.
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float16, device=device)
word2vec_vocab = word2vec.index_to_key
word2idx = {word: idx for idx, word in enumerate(word2vec_vocab)}

# === Step 3: Move GloVe & FastText to GPU ===
glove_vocab = list(glove_embeddings.keys())
# Build one ndarray first: torch.tensor() over a Python list of ndarrays
# converts element by element and is extremely slow for a large vocabulary.
glove_vectors = torch.tensor(
    np.array([glove_embeddings[word] for word in glove_vocab]),
    dtype=torch.float16,
    device=device,
)
glove_word2idx = {word: idx for idx, word in enumerate(glove_vocab)}

fasttext_vocab = fasttext.index_to_key
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16, device=device)
fasttext_word2idx = {word: idx for idx, word in enumerate(fasttext_vocab)}

# === Step 4: Optimized Word Embedding Lookup on GPU ===
def get_embedding(word, word2idx, embedding_vectors):
    """Look up the embedding row for ``word``.

    Returns ``embedding_vectors[word2idx[word]]`` for a known word, otherwise a
    zero vector with the same shape, dtype and device as one embedding row.
    """
    row = word2idx.get(word)
    if row is None:
        return torch.zeros_like(embedding_vectors[0])
    return embedding_vectors[row]

# === Step 5: Convert Sentences to Word Embeddings Using Mini-Batches ===
# Sentences kept per article and words kept per sentence (pad / truncate limits).
MAX_SENTENCES = 50
MAX_WORDS = 50
# Vector width of each embedding family.
EMBEDDING_DIM_W2V = 300
EMBEDDING_DIM_GLOVE = 100
EMBEDDING_DIM_FASTTEXT = 300
BATCH_SIZE = 500  # 🚀 Process in batches to avoid OOM

def sentence_to_vector(sentence, word2idx, embedding_vectors, embedding_dim, max_words=MAX_WORDS):
    """Embed one sentence as a float16 ``(max_words, embedding_dim)`` tensor on ``device``.

    The sentence is split on whitespace and cut to ``max_words`` tokens; each
    row is filled via ``get_embedding``, and rows past the last token stay zero.
    """
    matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16, device=device)

    for row, token in enumerate(sentence.split()[:max_words]):
        matrix[row] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim, max_sentences=MAX_SENTENCES):
    """Embed an article as a float16 ``(max_sentences, MAX_WORDS, embedding_dim)`` tensor.

    Every sentence is embedded with ``sentence_to_vector``; short articles are
    padded with all-zero sentences and long ones are cut to ``max_sentences``.
    """
    rows = [
        sentence_to_vector(sentence, word2idx, embedding_vectors, embedding_dim)
        for sentence in sentences
    ]

    missing = max_sentences - len(rows)
    if missing > 0:
        # A single zero block repeated; torch.stack copies it into each padded slot.
        blank = torch.zeros((MAX_WORDS, embedding_dim), dtype=torch.float16, device=device)
        rows = rows + [blank] * missing

    return torch.stack(rows[:max_sentences])

# === Step 6: Process in Mini-Batches ===
def process_in_batches(df, word2idx, embedding_vectors, embedding_dim, batch_size=BATCH_SIZE):
    """Embed every article in ``df["sentences"]``, ``batch_size`` rows at a time.

    Each batch is built on ``device``, copied to the CPU and collected; the CUDA
    cache is emptied after every batch. Returns the batches concatenated along
    the first axis.
    """
    cpu_chunks = []

    for offset in tqdm(range(0, len(df), batch_size), desc="🚀 Processing batches on GPU"):
        # iloc slicing clamps at the end of the frame, so the last batch may be short.
        batch_sentences = df.iloc[offset:offset + batch_size]["sentences"]

        # Stack and move to the CPU in one expression so no reference to the
        # device tensor survives until empty_cache().
        cpu_chunks.append(
            torch.stack([
                article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim)
                for sentences in batch_sentences
            ]).cpu()
        )

        torch.cuda.empty_cache()

    return torch.cat(cpu_chunks)



In [None]:
# === Step 7: Convert GloVe Embeddings & Save Them to Disk ===
print("🔢 Converting GloVe embeddings on GPU...")
train_glove = process_in_batches(train_df, glove_word2idx, glove_vectors, EMBEDDING_DIM_GLOVE)
test_glove = process_in_batches(test_df, glove_word2idx, glove_vectors, EMBEDDING_DIM_GLOVE)
valid_glove = process_in_batches(valid_df, glove_word2idx, glove_vectors, EMBEDDING_DIM_GLOVE)

# process_in_batches returns CPU tensors; these calls write them to files.
torch.save(train_glove, "/kaggle/working/train_glove.pt")
torch.save(test_glove, "/kaggle/working/test_glove.pt")
torch.save(valid_glove, "/kaggle/working/valid_glove.pt")
print("✅ GloVe embeddings saved to /kaggle/working/!")



In [None]:
# === Step 8: Convert FastText Embeddings & Save Them to Disk ===
print("🔢 Converting FastText embeddings on GPU...")
train_fasttext = process_in_batches(train_df, fasttext_word2idx, fasttext_vectors, EMBEDDING_DIM_FASTTEXT)
test_fasttext = process_in_batches(test_df, fasttext_word2idx, fasttext_vectors, EMBEDDING_DIM_FASTTEXT)
valid_fasttext = process_in_batches(valid_df, fasttext_word2idx, fasttext_vectors, EMBEDDING_DIM_FASTTEXT)

# process_in_batches returns CPU tensors; these calls write them to files.
torch.save(train_fasttext, "/kaggle/working/train_fasttext.pt")
torch.save(test_fasttext, "/kaggle/working/test_fasttext.pt")
torch.save(valid_fasttext, "/kaggle/working/valid_fasttext.pt")
print("✅ FastText embeddings saved to /kaggle/working/!")

In [None]:
# Status message only: no save is performed in this cell.



print("✅ Optimized embeddings saved with full Swifter + progress tracking!")

In [None]:
# === Step 1: Load Dataset ===
print("📥 Loading dataset...")
# Re-read the train / test / validation CSVs, replacing the frames loaded earlier.
train_df, test_df, valid_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/1000elements/train_1000.csv',
        '/kaggle/input/1000elements/test_100.csv',
        '/kaggle/input/1000elements/val_100.csv',
    )
)

In [None]:
# === Step 2: Fast Sentence Tokenization Using Swifter + Progress Bar ===
print("✂️ Fast tokenizing sentences with Swifter & progress tracking...")
# Here "sentences" is derived from the reference "highlights", not from the article text.
for split_df in (train_df, test_df, valid_df):
    split_df["sentences"] = split_df["highlights"].astype(str).swifter.apply(sent_tokenize)




In [None]:
# Write each split with its highlight-sentence column under an "hlts_" prefixed file name.
for split_df, out_path in (
    (train_df, "/kaggle/working/hlts_train_tokenized.csv"),
    (test_df, "/kaggle/working/hlts_test_tokenized.csv"),
    (valid_df, "/kaggle/working/hlts_valid_tokenized.csv"),
):
    split_df.to_csv(out_path, index=False)
print("✅ Tokenized sentences saved!")



In [None]:
# === Step 3: Load Pre-Trained Embeddings ===
print("📥 Loading pre-trained embeddings...")
# Same three paths as assigned earlier in the notebook, set again for this section.
word2vec_path = "/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin"
glove_path = "/kaggle/input/glove-vectorisation/glove.6B.100d.txt"
fasttext_path = "/kaggle/input/fasttext/cc.en.300.bin"



In [None]:
# Load the Word2Vec vectors from the binary word2vec-format file.
print("📥 Loading Word2Vec...")
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
print("✅ Word2Vec loaded!")

from gensim.models.fasttext import load_facebook_model, load_facebook_vectors

print("📥 Loading FastText...")

try:
    # First, try loading as a standard Word2Vec-format binary.
    fasttext = KeyedVectors.load_word2vec_format(fasttext_path, binary=True)
except ValueError:
    # A native Facebook .bin file has no word2vec header, so parsing it fails with a
    # ValueError (UnicodeDecodeError is a ValueError subclass, so it is covered too).
    print("⚠️ FastText binary loading failed! Trying Facebook FastText format...")

    # Load only the word vectors; the rest of the trained model is not needed here.
    fasttext = load_facebook_vectors(fasttext_path)
    print("✅ FastText loaded successfully using Facebook format!")
else:
    print("✅ FastText loaded as Word2Vec format!")

# === Step 4: Speed Up Word Embedding Lookup Using FAISS and Multiprocessing ===
def create_faiss_index(embedding_model, embedding_dim):
    """Build a flat L2 FAISS index over every vector of an embedding model.

    Args:
        embedding_model: Object exposing ``index_to_key`` and, ideally, a
            ``vectors`` matrix whose rows follow the same order (as gensim
            ``KeyedVectors`` does). Without a usable ``vectors`` attribute the
            vectors are looked up word by word.
        embedding_dim: Expected width of each vector.

    Returns:
        ``(index, words)`` where row ``i`` of the index belongs to ``words[i]``.

    Raises:
        ValueError: If the vectors are not a 2-D matrix of width ``embedding_dim``.
    """
    words = embedding_model.index_to_key

    # Reuse the model's own matrix when it lines up with the vocabulary; looking
    # every word up in Python is far slower and builds a second full copy.
    vectors = getattr(embedding_model, "vectors", None)
    if vectors is None or len(vectors) != len(words):
        vectors = [embedding_model[word] for word in tqdm(words, desc="Indexing embeddings")]
    vectors = np.asarray(vectors, dtype=np.float32)

    if vectors.ndim != 2 or vectors.shape[1] != embedding_dim:
        raise ValueError(
            f"Expected vectors of shape (n, {embedding_dim}), got {vectors.shape}"
        )

    index = faiss.IndexFlatL2(embedding_dim)
    index.add(vectors)
    return index, words

print("🚀 Building FAISS indices for fast lookup...")
# One index per embedding model; both are built with dimension 300.
w2v_index, w2v_words = create_faiss_index(word2vec, 300)
fasttext_index, fasttext_words = create_faiss_index(fasttext, 300)



In [None]:
import torch

# === Step 1: Define File Paths ===
# Saved embedding tensors, keyed by split name, for each embedding family.
word2vec_paths = {
    "train": "/kaggle/input/word2vec-embeddings/train_word2vec.pt",
    "test": "/kaggle/input/word2vec-embeddings/test_word2vec.pt",
    "valid": "/kaggle/input/word2vec-embeddings/valid_word2vec.pt",
}

glove_paths = {
    "train": "/kaggle/input/gloveembeddings/train_glove.pt",
    "test": "/kaggle/input/gloveembeddings/test_glove.pt",
    "valid": "/kaggle/input/gloveembeddings/valid_glove.pt",
}

fasttext_paths = {
    "train": "/kaggle/input/fasttext-embeddings/train_fasttext.pt",
    "test": "/kaggle/input/fasttext-embeddings/test_fasttext.pt",
    "valid": "/kaggle/input/fasttext-embeddings/valid_fasttext.pt",
}

In [None]:
# def load_embeddings(paths):
#     """Loads PyTorch tensors from stored .pt files"""
#     return {split: torch.load(path) for split, path in paths.items()}

# word2vec_embeddings = load_embeddings(word2vec_paths)
# glove_embeddings = load_embeddings(glove_paths)
# fasttext_embeddings = load_embeddings(fasttext_paths)

# # === Step 3: Inspect Dimensions ===
# print("📏 Word2Vec Embeddings Shape:")
# for split, tensor in word2vec_embeddings.items():
#     print(f"{split}: {tensor.shape}")

# print("\n📏 GloVe Embeddings Shape:")
# for split, tensor in glove_embeddings.items():
#     print(f"{split}: {tensor.shape}")

# print("\n📏 FastText Embeddings Shape:")
# for split, tensor in fasttext_embeddings.items():
#     print(f"{split}: {tensor.shape}")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from rouge_score import rouge_scorer
from tqdm import tqdm
# Load the saved training embeddings onto the CPU. A hard-coded .to("cuda")
# fails on a machine without a GPU, and TextSummaryDataset moves its input to
# the CPU anyway; batches are sent to the device inside the training loop.
word2vec_train = torch.load("/kaggle/input/word2vec-embeddings/train_word2vec.pt", map_location="cpu")
glove_train = torch.load("/kaggle/input/gloveembeddings/train_glove.pt", map_location="cpu")
fasttext_train = torch.load("/kaggle/input/fasttext-embeddings/train_fasttext.pt", map_location="cpu")

### MODEL

In [None]:
class TextSummaryDataset(Dataset):
    """Dataset yielding one mean-pooled float32 tensor per article.

    ``embeddings`` is indexed along its first axis, one item per article. Each
    item is averaged over its axis 1; for an item laid out as
    (sentences, words, dim) that produces one vector per sentence.
    """

    def __init__(self, embeddings):
        # Keep the whole tensor on the CPU; callers move batches to the device.
        self.embeddings = embeddings.cpu()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        article = self.embeddings[idx]
        # Cast before averaging so half-precision inputs are pooled in float32.
        # This also avoids torch.tensor(existing_tensor), which warns about
        # copy-constructing from a tensor.
        return article.to(torch.float32).mean(dim=1)


In [None]:
class WL_AttenSumm(nn.Module):
    """CNN + Bi-GRU scorer producing one sigmoid score per sequence position.

    Args:
        embedding_dim: Width of each input vector.
        hidden_dim: Hidden size of each GRU direction.
        num_filters: Output channels of every convolution.
        filter_sizes: Kernel sizes, one convolution per entry.

    ``forward`` takes ``(batch, seq_len, embedding_dim)`` and returns
    ``(batch, seq_len)`` with values in [0, 1].
    """

    def __init__(self, embedding_dim, hidden_dim=256, num_filters=100, filter_sizes=(1, 2, 3, 4, 5, 6, 7)):
        super().__init__()
        # Immutable default; any iterable of ints is still accepted.
        filter_sizes = tuple(filter_sizes)

        # Convolutional layers, one per kernel size.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs, padding=fs // 2)
            for fs in filter_sizes
        ])

        # Bi-GRU over the concatenated CNN features.
        self.gru = nn.GRU(input_size=num_filters * len(filter_sizes), hidden_size=hidden_dim,
                          bidirectional=True, batch_first=True)

        # NOTE(review): this layer is kept so parameter names and initialisation
        # order stay the same, but the attention context it produced never
        # reached the output, so forward() no longer computes it.
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Per-position scoring layer.
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        # Take the length from the input instead of assuming 50 positions.
        seq_len = x.shape[1]

        # Conv1d expects (batch, channels, length).
        x = x.permute(0, 2, 1)

        conv_outputs = [F.relu(conv(x)) for conv in self.convs]

        # Max over the whole length of each feature map -> (batch, num_filters).
        pooled_outputs = [F.max_pool1d(co, kernel_size=co.shape[2]).squeeze(2) for co in conv_outputs]

        # (batch, num_filters * len(filter_sizes))
        cnn_output = torch.cat(pooled_outputs, dim=1)

        # Repeat the pooled feature vector at every position for the Bi-GRU.
        x = cnn_output.unsqueeze(1).expand(-1, seq_len, -1)

        x, _ = self.gru(x)  # (batch, seq_len, hidden_dim * 2)

        # One score in [0, 1] per position -> (batch, seq_len).
        return torch.sigmoid(self.fc(x)).squeeze(-1)


In [None]:
import torch

# Report CUDA availability and device count; only query the device name when
# a GPU is present, because get_device_name(0) raises without one.
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.device_count())  # Should print number of GPUs
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Prints GPU name


### 1k Word2vec

#### Training

In [None]:
# Sanity check: wrap the Word2Vec training tensor and look at the first pooled item.
dataset = TextSummaryDataset(word2vec_train)
sample = dataset[0]
print(sample.shape)  # Should print: torch.Size([50, 300])


In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Reload the training embeddings on the CPU; batches are moved to the device one at a time.
word2vec_train = torch.load("/kaggle/input/word2vec-embeddings/train_word2vec.pt", map_location="cpu")

# num_workers=0 keeps data loading in the main process.
train_dataset = TextSummaryDataset(word2vec_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The last tensor axis is the embedding width fed to the convolutions.
embedding_dim = word2vec_train.shape[-1]
model = WL_AttenSumm(embedding_dim=embedding_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.99, 0.999))

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for step, inputs in enumerate(progress, start=1):
        inputs = inputs.to(device)

        optimizer.zero_grad()

        # The model already returns (batch_size, seq_len); no squeeze is needed.
        outputs = model(inputs)

        # NOTE(review): this loss is the variance of the scores within each
        # article, which is minimised when every position gets the same score.
        # No reference labels are involved — confirm this objective is intended.
        loss = torch.mean((outputs - outputs.mean(dim=1, keepdim=True)) ** 2)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Average over the batches seen so far, not the total batch count, so
        # the progress bar does not under-report the loss early in the epoch.
        progress.set_postfix(loss=running_loss / step)

    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")


#### Testing

In [None]:
# Load test embeddings (same format as training) onto the CPU first.
test_embeddings = torch.load("/kaggle/input/word2vec-embeddings/test_word2vec.pt", map_location="cpu")

# Move to the selected device (GPU if available).
# NOTE(review): TextSummaryDataset calls .cpu() on its input, so this transfer looks redundant — confirm.
test_embeddings = test_embeddings.to(device)


In [None]:
class TextSummaryDataset(Dataset):
    """Dataset yielding one mean-pooled float32 tensor per article.

    ``embeddings`` is indexed along its first axis, one item per article. Each
    item is averaged over its axis 1; for an item laid out as
    (sentences, words, dim) that produces one vector per sentence.
    """

    def __init__(self, embeddings):
        # Keep the whole tensor on the CPU; callers move batches to the device.
        self.embeddings = embeddings.cpu()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        article = self.embeddings[idx]
        # Cast before averaging so half-precision inputs are pooled in float32.
        # This also avoids torch.tensor(existing_tensor), which warns about
        # copy-constructing from a tensor.
        return article.to(torch.float32).mean(dim=1)

# Wrap the test embeddings; shuffle=False keeps the scores in dataset order.
test_dataset = TextSummaryDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

model.eval()  # Switch the model to evaluation mode

# Score each batch without tracking gradients and collect the results as NumPy arrays.
with torch.no_grad():
    all_scores = [model(batch.to(device)).cpu().numpy() for batch in test_loader]

# One row of scores per test article.
scores = np.concatenate(all_scores, axis=0)


In [None]:
import numpy as np

def extract_top_sentences(scores, articles, top_k=3, word_limit=100):
    """Build one extractive summary per article from per-position scores.

    Args:
        scores: Indexable by article; ``scores[i]`` holds one score per
            sentence position of article ``i`` (padding positions included).
        articles: Iterable of article strings.
        top_k: Maximum number of sentences to select per article.
        word_limit: Maximum number of words kept in each summary.

    Returns:
        A list of summary strings, one per article, in input order. An article
        with no sentences yields an empty string.

    NOTE(review): sentences are found by splitting on '.', while the scores
    were computed for ``sent_tokenize`` output elsewhere in this notebook, so
    position ``j`` here may not be the sentence that was scored — confirm.
    """
    summaries = []

    for i, article in enumerate(articles):
        sentences = [part.strip() for part in article.split('.') if part.strip()]

        if not sentences:
            summaries.append("")
            continue

        # Rank only the positions that map to a real sentence. Ranking first and
        # filtering afterwards lets high-scoring padding positions push real
        # sentences out of the top-k.
        num_scored = min(len(sentences), len(scores[i]))
        top_indices = np.argsort(-np.asarray(scores[i][:num_scored]))[:top_k]
        selected_sentences = [sentences[idx] for idx in top_indices]

        # Nothing selected (e.g. top_k == 0): fall back to the first sentence.
        if not selected_sentences:
            selected_sentences = [sentences[0]]

        # Truncate by words; slicing the joined string would cut characters.
        words = " ".join(selected_sentences).split()
        summaries.append(" ".join(words[:word_limit]))

    return summaries


In [None]:
# Re-read the test split; its "article" column supplies the raw text to summarise.
df_test = pd.read_csv("/kaggle/input/1000elements/test_100.csv")  

# Check if 'article' column exists
print(df_test.columns)

# Ensure 'article' column is correctly formatted
print(df_test["article"].head())  
# Row i of `scores` is paired with the i-th article.
predicted_summaries = extract_top_sentences(scores, df_test["article"])


In [None]:
from rouge_score import rouge_scorer

def compute_rouge(predicted_summaries, gold_summaries):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over summary pairs.

    Predictions and references are paired positionally; each pair is scored
    with stemming enabled, and the mean F-measure per metric is returned as a
    dict keyed by metric name.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True)

    # The scorer takes the reference first and the prediction second.
    pair_results = [
        scorer.score(gold, pred)
        for pred, gold in zip(predicted_summaries, gold_summaries)
    ]

    return {
        metric: np.mean([result[metric].fmeasure for result in pair_results])
        for metric in metrics
    }


In [None]:
# Score the predicted summaries against the reference "highlights" column.
w2VRougeScore = compute_rouge(predicted_summaries, df_test["highlights"])
#print(f"ROUGE Scores: {w2vRougeScore}")
print(f"ROUGE Scores: {w2VRougeScore}")

### 1k Fasttext

#### Training

In [None]:
# Sanity check: wrap the FastText training tensor and look at the first pooled item.
dataset = TextSummaryDataset(fasttext_train)
sample = dataset[0]
print(sample.shape)  # Should print: torch.Size([50, 300])


In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Reload the training embeddings on the CPU; batches are moved to the device one at a time.
fasttext_train = torch.load("/kaggle/input/fasttext-embeddings/train_fasttext.pt", map_location="cpu")

# num_workers=0 keeps data loading in the main process.
train_dataset = TextSummaryDataset(fasttext_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The last tensor axis is the embedding width fed to the convolutions.
embedding_dim = fasttext_train.shape[-1]
model = WL_AttenSumm(embedding_dim=embedding_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.99, 0.999))

num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for step, inputs in enumerate(progress, start=1):
        inputs = inputs.to(device)

        optimizer.zero_grad()

        # The model already returns (batch_size, seq_len); no squeeze is needed.
        outputs = model(inputs)

        # NOTE(review): this loss is the variance of the scores within each
        # article, which is minimised when every position gets the same score.
        # No reference labels are involved — confirm this objective is intended.
        loss = torch.mean((outputs - outputs.mean(dim=1, keepdim=True)) ** 2)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Average over the batches seen so far, not the total batch count, so
        # the progress bar does not under-report the loss early in the epoch.
        progress.set_postfix(loss=running_loss / step)

    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")


#### Testing

In [None]:
# Load test embeddings (same format as training) onto the CPU first.
test_embeddings = torch.load("/kaggle/input/fasttext-embeddings/test_fasttext.pt", map_location="cpu")

# Move to the selected device (GPU if available).
# NOTE(review): TextSummaryDataset calls .cpu() on its input, so this transfer looks redundant — confirm.
test_embeddings = test_embeddings.to(device)

In [None]:
class TextSummaryDataset(Dataset):
    """Dataset yielding one mean-pooled float32 tensor per article.

    ``embeddings`` is indexed along its first axis, one item per article. Each
    item is averaged over its axis 1; for an item laid out as
    (sentences, words, dim) that produces one vector per sentence.
    """

    def __init__(self, embeddings):
        # Keep the whole tensor on the CPU; callers move batches to the device.
        self.embeddings = embeddings.cpu()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        article = self.embeddings[idx]
        # Cast before averaging so half-precision inputs are pooled in float32.
        # This also avoids torch.tensor(existing_tensor), which warns about
        # copy-constructing from a tensor.
        return article.to(torch.float32).mean(dim=1)

# Wrap the test embeddings; shuffle=False keeps the scores in dataset order.
test_dataset = TextSummaryDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

model.eval()  # Switch the model to evaluation mode

# Score each batch without tracking gradients and collect the results as NumPy arrays.
with torch.no_grad():
    all_scores = [model(batch.to(device)).cpu().numpy() for batch in test_loader]

# One row of scores per test article.
scores = np.concatenate(all_scores, axis=0)


In [None]:
import numpy as np

def extract_top_sentences(scores, articles, top_k=3, word_limit=100):
    """Build an extractive summary for each article from per-sentence scores.

    Args:
        scores: Indexable of score rows; ``scores[i]`` holds the scores for
            the i-th article. Each row is flattened to 1-D before ranking.
        articles: Iterable of article texts. Non-string entries (for example
            NaN from an empty CSV cell) are treated as empty articles.
        top_k: Maximum number of sentences selected per article.
        word_limit: Maximum number of whitespace-separated words kept in each
            summary.

    Returns:
        A list with one summary string per article, in input order. Selected
        sentences appear in descending score order.
    """
    summaries = []

    for i, article in enumerate(articles):
        text = article if isinstance(article, str) else ""

        # NOTE(review): sentences are obtained with a plain '.' split; confirm
        # this matches the segmentation used when the scores were produced.
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            summaries.append("")  # Empty article -> empty summary
            continue

        # Rank only the positions that correspond to a real sentence. Ranking
        # the full row first and filtering afterwards let padded positions
        # displace real sentences from the top-k.
        row = np.asarray(scores[i]).reshape(-1)
        valid_scores = row[:len(sentences)]
        top_indices = np.argsort(-valid_scores)[:top_k]

        selected_sentences = [sentences[idx] for idx in top_indices]
        if not selected_sentences:
            # Nothing could be ranked (e.g. top_k == 0 or an empty score row).
            selected_sentences = [sentences[0]]

        # Truncate by words; slicing the joined string would cut characters.
        words = " ".join(selected_sentences).split()
        summaries.append(" ".join(words[:word_limit]))

    return summaries


In [None]:
# Load the CSV holding the test articles (and, as used by a later cell, the
# reference "highlights").
df_test = pd.read_csv("/kaggle/input/1000elements/test_100.csv")  

# Check if 'article' column exists
print(df_test.columns)

# Ensure 'article' column is correctly formatted
print(df_test["article"].head())  
# Row i of `scores` is paired with article i purely by position.
# NOTE(review): confirm the embeddings behind `scores` were built from this
# same CSV in the same row order.
predicted_summaries = extract_top_sentences(scores, df_test["article"])


In [None]:
from rouge_score import rouge_scorer

def compute_rouge(predicted_summaries, gold_summaries):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Predictions and references are paired positionally with ``zip``, so any
    surplus entries in the longer input are ignored.

    Args:
        predicted_summaries: Iterable of generated summary strings.
        gold_summaries: Iterable of reference summary strings.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to the average F-measure.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # One result object per (prediction, reference) pair; the scorer takes
    # the reference first and the prediction second.
    pair_results = [
        scorer.score(gold, pred)
        for pred, gold in zip(predicted_summaries, gold_summaries)
    ]

    avg_scores = {}
    for name in metric_names:
        f_measures = [result[name].fmeasure for result in pair_results]
        avg_scores[name] = np.mean(f_measures)
    return avg_scores


In [None]:
# Score the FastText-based summaries against the reference highlights.
# The result is stored under the spelling `fastextRougeScore`.
fastextRougeScore = compute_rouge(predicted_summaries, df_test["highlights"])
#print(f"ROUGE Scores: {fasttextRougeScore}")
print(f"ROUGE Scores: {fastextRougeScore}")

### 1k Glove

#### Training

In [None]:
# Sanity check: build the dataset and print the shape of one pooled item.
# NOTE(review): `glove_train` is assigned in the following cell, so this cell
# only works if that cell has already been run — confirm the intended order.
dataset = TextSummaryDataset(glove_train)
sample = dataset[0]
print(sample.shape)  

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Load the GloVe training embeddings onto the CPU first; batches are moved to
# the training device one at a time inside the loop.
glove_train = torch.load("/kaggle/input/gloveembeddings/train_glove.pt", map_location="cpu")

# num_workers=0 keeps data loading in the main process.
train_dataset = TextSummaryDataset(glove_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=0)

# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The embedding width is read from the data instead of being hard-coded.
embedding_dim = glove_train.shape[-1]
model = WL_AttenSumm(embedding_dim=embedding_dim).to(device)

optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.99, 0.999))

num_epochs = 20

# Training loop. There is no exception handling here: any CUDA error
# propagates and stops the cell.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for step, inputs in enumerate(progress, start=1):
        inputs = inputs.to(device)  # Move batch to the model's device

        optimizer.zero_grad()

        outputs = model(inputs)
        outputs = outputs.squeeze(-1)  # Drop a trailing singleton axis, if any

        # The loss is the variance of each row of `outputs` around its own
        # mean, averaged over the batch.
        # NOTE(review): no reference summaries enter this objective, and a
        # constant output minimises it — confirm this is the intended target.
        loss = torch.mean((outputs - outputs.mean(dim=1, keepdim=True)) ** 2)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # Average over the batches seen so far. Dividing by the total batch
        # count made the displayed loss ramp up from ~0 during every epoch.
        progress.set_postfix(loss=running_loss / step)

    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")


#### Testing

In [None]:
# Load the pre-computed GloVe test embeddings. map_location="cpu" makes the
# load independent of the device the tensor was saved from.
test_embeddings = torch.load("/kaggle/input/gloveembeddings/test_glove.pt", map_location="cpu")

# Move to `device` (assigned by the training cell above).
# NOTE(review): the TextSummaryDataset used in the next cell calls .cpu() on
# its input, so this transfer looks redundant — confirm before relying on it.
test_embeddings = test_embeddings.to(device)

In [None]:
class TextSummaryDataset(Dataset):
    """Dataset that yields one mean-pooled embedding matrix per item.

    Each item of ``embeddings`` is averaged over its dim 1 (the word axis in
    this notebook), so an item of shape (sentences, words, dim) is returned as
    a float32 tensor of shape (sentences, dim).

    Args:
        embeddings: Tensor whose first dimension indexes the items. It is
            copied/moved to the CPU; callers move batches to their device.
    """

    def __init__(self, embeddings):
        self.embeddings = embeddings.cpu()  # Ensure embeddings are on CPU

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        # Upcast before pooling so half-precision inputs are averaged in
        # float32. The previous torch.tensor(x, ...) copy-construct emitted a
        # UserWarning on every item; mean() already returns a fresh tensor.
        x = self.embeddings[idx].detach().to(torch.float32)
        return x.mean(dim=1)

# Wrap the test embeddings in the mean-pooling dataset. shuffle=False keeps
# the score rows in the same order as the rows of the test tensor.
test_dataset = TextSummaryDataset(test_embeddings)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

model.eval()  # Set model to evaluation mode
all_scores = []

with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs.to(device)  # Move batch to the model's device
        # Apply the same squeeze(-1) the training loop applies to the model
        # output, so a trailing singleton axis never reaches the ranking code.
        # It is a no-op when the last axis is not of size 1.
        scores = model(inputs).squeeze(-1)
        all_scores.append(scores.cpu().numpy())  # Move to CPU and store

# Concatenate the per-batch results into one array, one row per test item.
scores = np.concatenate(all_scores, axis=0)


In [None]:
import numpy as np

def extract_top_sentences(scores, articles, top_k=3, word_limit=100):
    """Build an extractive summary for each article from per-sentence scores.

    Args:
        scores: Indexable of score rows; ``scores[i]`` holds the scores for
            the i-th article. Each row is flattened to 1-D before ranking.
        articles: Iterable of article texts. Non-string entries (for example
            NaN from an empty CSV cell) are treated as empty articles.
        top_k: Maximum number of sentences selected per article.
        word_limit: Maximum number of whitespace-separated words kept in each
            summary.

    Returns:
        A list with one summary string per article, in input order. Selected
        sentences appear in descending score order.
    """
    summaries = []

    for i, article in enumerate(articles):
        text = article if isinstance(article, str) else ""

        # NOTE(review): sentences are obtained with a plain '.' split; confirm
        # this matches the segmentation used when the scores were produced.
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        if not sentences:
            summaries.append("")  # Empty article -> empty summary
            continue

        # Rank only the positions that correspond to a real sentence. Ranking
        # the full row first and filtering afterwards let padded positions
        # displace real sentences from the top-k.
        row = np.asarray(scores[i]).reshape(-1)
        valid_scores = row[:len(sentences)]
        top_indices = np.argsort(-valid_scores)[:top_k]

        selected_sentences = [sentences[idx] for idx in top_indices]
        if not selected_sentences:
            # Nothing could be ranked (e.g. top_k == 0 or an empty score row).
            selected_sentences = [sentences[0]]

        # Truncate by words; slicing the joined string would cut characters.
        words = " ".join(selected_sentences).split()
        summaries.append(" ".join(words[:word_limit]))

    return summaries


In [None]:
# Load the CSV holding the test articles (and, as used by a later cell, the
# reference "highlights").
df_test = pd.read_csv("/kaggle/input/1000elements/test_100.csv")  

# Check if 'article' column exists
print(df_test.columns)

# Ensure 'article' column is correctly formatted
print(df_test["article"].head())  
# Row i of `scores` is paired with article i purely by position.
# NOTE(review): confirm the embeddings behind `scores` were built from this
# same CSV in the same row order.
predicted_summaries = extract_top_sentences(scores, df_test["article"])


In [None]:
from rouge_score import rouge_scorer

def compute_rouge(predicted_summaries, gold_summaries):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures.

    Predictions and references are paired positionally with ``zip``, so any
    surplus entries in the longer input are ignored.

    Args:
        predicted_summaries: Iterable of generated summary strings.
        gold_summaries: Iterable of reference summary strings.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to the average F-measure.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # One result object per (prediction, reference) pair; the scorer takes
    # the reference first and the prediction second.
    pair_results = [
        scorer.score(gold, pred)
        for pred, gold in zip(predicted_summaries, gold_summaries)
    ]

    avg_scores = {}
    for name in metric_names:
        f_measures = [result[name].fmeasure for result in pair_results]
        avg_scores[name] = np.mean(f_measures)
    return avg_scores


In [None]:
# Score the GloVe-based summaries against the reference highlights.
# The result is stored under the spelling `gloveeRougeScore`.

gloveeRougeScore = compute_rouge(predicted_summaries, df_test["highlights"])
#print(f"ROUGE Scores: {gloveRougeScore}")
print(f"ROUGE Scores: {gloveeRougeScore}")

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt

# # Model names
# models = ["Paper_W2V", "Paper_FT", "Paper_GL", "Implemented_W2V", "Implemented_FastText", "Implemented_GloVe"]

# # ROUGE scores

# rouge_1 = [42.9, 42.3, 41.8, 37.7, 35.7, 35.3]
# rouge_2 = [19.7, 19.2, 18.9, 17.9, 16.9, 16.5]
# rouge_l = [39.3, 38.9, 38.5, 35.6, 33.5, 33.4]

# # Bar width
# bar_width = 0.2
# x = np.arange(len(models))

# # Create the plot
# plt.figure(figsize=(10, 6))
# plt.bar(x - bar_width, rouge_1, width=bar_width, label="ROUGE-1", color='#1f77b4')
# plt.bar(x, rouge_2, width=bar_width, label="ROUGE-2", color='#ff7f0e')
# plt.bar(x + bar_width, rouge_l, width=bar_width, label="ROUGE-L", color='#2ca02c')

# # Labels and title
# plt.xlabel("Embedded datasets")
# plt.ylabel("ROUGE Score (%)")
# plt.title("Comparison of ROUGE Scores")
# plt.xticks(ticks=x, labels=models, rotation=20)
# plt.legend()
# plt.grid(axis='y', linestyle='--', alpha=0.7)

# # Show the plot
# plt.tight_layout()
# plt.show()


## 20,000 dataset

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim
import swifter
import faiss
import pickle
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from concurrent.futures import ProcessPoolExecutor

# Register tqdm's pandas integration (adds the progress_apply family of
# methods to pandas objects).
tqdm.pandas()

In [None]:
# === Step 1: Load Dataset ===
# Read the sampled train / test / validation splits from the Kaggle input
# directory into separate DataFrames.
print("📥 Loading dataset...")
train_df = pd.read_csv('/kaggle/input/sampled-20k/train_20000.csv')
test_df = pd.read_csv('/kaggle/input/sampled-20k/test_2000.csv')
valid_df = pd.read_csv('/kaggle/input/sampled-20k/val_2000.csv')



In [None]:
# === Step 2: Fast Sentence Tokenization Using Swifter + Progress Bar ===
# Adds a "sentences" column holding the list returned by NLTK's sent_tokenize
# for each article. astype(str) guarantees the tokenizer receives a string
# (missing values become the text "nan").
print("✂️ Fast tokenizing sentences with Swifter & progress tracking...")
train_df["sentences"] = train_df["article"].astype(str).swifter.apply(sent_tokenize)
test_df["sentences"] = test_df["article"].astype(str).swifter.apply(sent_tokenize)
valid_df["sentences"] = valid_df["article"].astype(str).swifter.apply(sent_tokenize)

In [None]:
# Save tokenized sentences.
# CSV has no list type: each "sentences" list is written as its string
# representation and must be parsed back into a list after reading.
train_df.to_csv("/kaggle/working/train_tokenized_20k.csv", index=False)
test_df.to_csv("/kaggle/working/test_tokenized_20k.csv", index=False)
valid_df.to_csv("/kaggle/working/valid_tokenized_20k.csv", index=False)
print("✅ Tokenized sentences saved!")

In [None]:
# === Step 3: Load Pre-Trained Embeddings ===
# Only the file locations are set here; the files themselves are read by the
# cells that follow.
print("📥 Loading pre-trained embeddings...")
word2vec_path = "/kaggle/input/word2vec/GoogleNews-vectors-negative300.bin"
glove_path = "/kaggle/input/glove-vectorisation/glove.6B.100d.txt"
fasttext_path = "/kaggle/input/fasttext/cc.en.300.bin"

In [None]:


# Load the Word2Vec vectors from the binary word2vec-format file.
print("📥 Loading Word2Vec...")
word2vec = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
print("✅ Word2Vec loaded!")

from gensim.models.fasttext import load_facebook_model

print("📥 Loading FastText...")

# Two-step FastText load: try the word2vec binary reader first and fall back
# to gensim's Facebook-format loader.
# NOTE(review): the fallback only triggers on UnicodeDecodeError; any other
# exception raised by the first attempt propagates — confirm that is intended.
try:
    # First, try loading as a standard Word2Vec format
    fasttext = KeyedVectors.load_word2vec_format(fasttext_path, binary=True)
    print("✅ FastText loaded as Word2Vec format!")

except UnicodeDecodeError:
    print("⚠️ FastText binary loading failed! Trying Facebook FastText format...")
    
    # Load the full Facebook model and keep only its word vectors (.wv)
    fasttext = load_facebook_model(fasttext_path).wv  # Get word vectors
    print("✅ FastText loaded successfully using Facebook format!")

# === Step 4: Speed Up Word Embedding Lookup Using FAISS and Multiprocessing ===
def create_faiss_index(embedding_model, embedding_dim):
    """Create a flat L2 FAISS index over every vector in an embedding model.

    Args:
        embedding_model: Object exposing ``index_to_key`` and supporting
            ``embedding_model[word]`` lookups.
        embedding_dim: Dimensionality of the vectors being indexed.

    Returns:
        Tuple ``(index, words)``; row i of the index is the vector of
        ``words[i]``.
    """
    flat_index = faiss.IndexFlatL2(embedding_dim)
    vocabulary = embedding_model.index_to_key

    # Collect the vectors in vocabulary order, showing a progress bar.
    rows = []
    for token in tqdm(vocabulary, desc="Indexing embeddings"):
        rows.append(embedding_model[token])
    matrix = np.array(rows, dtype=np.float32)

    flat_index.add(matrix)
    return flat_index, vocabulary

# Build one FAISS index each for Word2Vec and FastText; both are indexed with
# 300-dimensional vectors.
print("🚀 Building FAISS indices for fast lookup...")
w2v_index, w2v_words = create_faiss_index(word2vec, 300)
fasttext_index, fasttext_words = create_faiss_index(fasttext, 300)

In [None]:
def get_faiss_embedding(word, embedding_dict, index, words_list, embedding_dim=300):
    """Return the embedding of ``word``, with an index-based fallback.

    Known words are returned straight from ``embedding_dict``. For an unknown
    word the index is queried with an all-zero vector and the embedding of
    the single nearest entry is returned, so the fallback does not depend on
    the unknown word itself.

    Args:
        word: Token to look up.
        embedding_dict: Mapping from word to embedding.
        index: Object with a ``search(query, k)`` method returning
            ``(distances, indices)``.
        words_list: Words aligned with the rows of ``index``.
        embedding_dim: Width of the zero query vector.
    """
    if word not in embedding_dict:
        zero_query = np.zeros((1, embedding_dim), dtype=np.float32)
        _, neighbours = index.search(zero_query, 1)
        word = words_list[neighbours[0][0]]
    return embedding_dict[word]

# Load GloVe embeddings using multiprocessing.
# Each line of the text file is "<word> <v1> <v2> ..."; the result is a dict
# mapping word -> float32 vector.
glove_embeddings = {}
with open(glove_path, 'r', encoding='utf-8') as f:
    def process_line(line):
        """Split one GloVe line into (word, float32 vector)."""
        values = line.split()
        return values[0], np.asarray(values[1:], dtype='float32')

    with ProcessPoolExecutor() as executor:
        # chunksize batches many lines into each task; with the default of 1
        # every single line becomes its own inter-process round trip.
        results = list(tqdm(executor.map(process_line, f, chunksize=4096), desc="Loading GloVe in parallel"))
    
    glove_embeddings = dict(results)
print("✅ GloVe loaded!")

In [None]:
def split_and_save(df, prefix, chunk_size=1000):
    """Write ``df`` to consecutive CSV files of at most ``chunk_size`` rows.

    Files are named ``{prefix}_chunk_{k}.csv`` with k starting at 1, and are
    written without the index column. One line is printed per saved file.
    """
    n_rows = len(df)
    # Ceiling division: a trailing partial chunk still gets its own file.
    n_chunks = -(-n_rows // chunk_size)

    for number in range(1, n_chunks + 1):
        first = (number - 1) * chunk_size
        last = min(number * chunk_size, n_rows)
        df.iloc[first:last].to_csv(f"{prefix}_chunk_{number}.csv", index=False)
        print(f"✅ Saved {prefix}_chunk_{number}.csv")

# Split & save training, validation, and test sets.
# The prefixes are relative, so the chunk files are written to the current
# working directory with the default chunk size of 1000 rows.
split_and_save(train_df, "train")
split_and_save(test_df, "test")
split_and_save(valid_df, "val")

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm

# === Step 1: Check & Set Device ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# === Step 2: Move Word2Vec Model to GPU ===
# The table is stored as float16 on `device`; word2idx maps each word to its
# row in that table.
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float16, device=device)
word2vec_vocab = word2vec.index_to_key
word2idx = {word: idx for idx, word in enumerate(word2vec_vocab)}

# === Step 3: Move GloVe & FastText to GPU ===
glove_vocab = list(glove_embeddings.keys())
# Stack the per-word arrays into one ndarray first: building a tensor
# directly from a Python list of ndarrays is the documented slow path.
glove_matrix = np.array([glove_embeddings[word] for word in glove_vocab])
glove_vectors = torch.tensor(glove_matrix, dtype=torch.float16, device=device)
glove_word2idx = {word: idx for idx, word in enumerate(glove_vocab)}

fasttext_vocab = fasttext.index_to_key
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16, device=device)
fasttext_word2idx = {word: idx for idx, word in enumerate(fasttext_vocab)}

# === Step 4: Optimized Word Embedding Lookup on GPU ===
def get_embedding(word, word2idx, embedding_vectors):
    """Look up the embedding row for ``word``.

    Returns ``embedding_vectors[word2idx[word]]`` when the word is known, and
    otherwise a zero tensor shaped like one row of ``embedding_vectors``.
    """
    row = word2idx.get(word)
    if row is None:
        # Unknown word: fall back to an all-zero vector.
        return torch.zeros_like(embedding_vectors[0])
    return embedding_vectors[row]

# === Step 5: Convert Sentences to Word Embeddings Using Mini-Batches ===
MAX_SENTENCES = 50  # Sentences kept per article (shorter articles are padded)
MAX_WORDS = 50  # Words kept per sentence (shorter sentences are padded)
EMBEDDING_DIM_W2V = 300  # Vector width passed for Word2Vec
EMBEDDING_DIM_GLOVE = 100  # Vector width passed for GloVe
EMBEDDING_DIM_FASTTEXT = 300  # Vector width passed for FastText
BATCH_SIZE = 500  # 🚀 Process in batches to avoid OOM

def sentence_to_vector(sentence, word2idx, embedding_vectors, embedding_dim, max_words=MAX_WORDS):
    """Encode one sentence as a (max_words, embedding_dim) float16 matrix.

    The sentence is split on whitespace and cut to ``max_words`` tokens. Row i
    receives the embedding of token i; rows past the last token stay zero.
    The matrix is allocated on the module-level ``device``.
    """
    tokens = sentence.split()[:max_words]
    matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16, device=device)

    for position, token in enumerate(tokens):
        matrix[position] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim, max_sentences=MAX_SENTENCES):
    """Encode an article as a stack of exactly ``max_sentences`` matrices.

    Every element of ``sentences`` is encoded with ``sentence_to_vector``.
    Articles with fewer sentences are padded with zero matrices of shape
    (MAX_WORDS, embedding_dim); longer ones are truncated.
    """
    encoded = []
    for sent in sentences:
        encoded.append(sentence_to_vector(sent, word2idx, embedding_vectors, embedding_dim))

    shortfall = max_sentences - len(encoded)
    if shortfall <= 0:
        # Enough sentences already: keep only the first max_sentences.
        return torch.stack(encoded[:max_sentences])

    blank = torch.zeros((MAX_WORDS, embedding_dim), dtype=torch.float16, device=device)
    return torch.stack(encoded + [blank] * shortfall)

# === Step 6: Process in Mini-Batches ===
def process_in_batches(df, word2idx, embedding_vectors, embedding_dim, batch_size=BATCH_SIZE):
    """Encode the "sentences" column of ``df`` batch by batch.

    Each batch of rows is encoded with ``article_to_vectors``, stacked, and
    moved to the CPU before the next batch starts; the CUDA cache is emptied
    after every batch. All batches are concatenated into the returned tensor.
    """
    n_rows = len(df)
    cpu_chunks = []

    for lo in tqdm(range(0, n_rows, batch_size), desc="🚀 Processing batches on GPU"):
        hi = min(lo + batch_size, n_rows)

        encoded = []
        for sentences in df.iloc[lo:hi]["sentences"]:
            encoded.append(article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim))

        # Keep finished batches on the CPU so only one batch occupies `device`.
        cpu_chunks.append(torch.stack(encoded).cpu())
        torch.cuda.empty_cache()

    return torch.cat(cpu_chunks)


import ast

# === Step 7: Process All Files in Directory & Save Output ===
input_dir = "/kaggle/input/train-chunk"
output_dir = "/kaggle/working"

# Chunks (file names without extension) that must not be converted again.
skip_files = {
    "train_chunk_20", "train_chunk_3", "train_chunk_1", "train_chunk_4", "train_chunk_15", "train_chunk_19",
    "train_chunk_17", "train_chunk_6", "train_chunk_9", "train_chunk_12", "train_chunk_5",
    "train_chunk_7", "train_chunk_2"
}

# Base names of the .pt files that already exist in the output directory.
processed_files = {f.replace('.pt', '') for f in os.listdir(output_dir) if f.endswith(".pt")}

for file_name in os.listdir(input_dir):
    if file_name.endswith(".csv"):
        base_name = file_name.replace('.csv', '')

        # Skip if already processed OR explicitly listed in skip_files
        if base_name in processed_files or base_name in skip_files:
            print(f"⏩ Skipping: {file_name}")
            continue

        file_path = os.path.join(input_dir, file_name)
        df = pd.read_csv(file_path)
        # read_csv returns the "sentences" column as text (the string form of
        # a list). Parse it back into a real list; otherwise the encoder would
        # iterate over single characters. literal_eval only accepts Python
        # literals, so no code from the file is executed.
        df["sentences"] = df["sentences"].apply(ast.literal_eval)

        print(f"📂 Processing {file_name} ...")
        processed_data = process_in_batches(df, word2idx, word2vec_vectors, EMBEDDING_DIM_W2V)

        # Save output
        output_file = os.path.join(output_dir, f"{base_name}.pt")
        torch.save(processed_data, output_file)
        print(f"✅ Saved {output_file}")

print(f"🎉 Remaining files processed and saved in {output_dir}!")

In [None]:
# === Step 0: Remove Existing .pt Files from Output Directory ===
# `output_dir` is not assigned in this cell; whatever value an earlier cell
# left in it decides which directory is cleaned. Only files ending in ".pt"
# directly inside that directory are deleted.
for f in os.listdir(output_dir):
    if f.endswith(".pt"):
        os.remove(os.path.join(output_dir, f))
print("🧹 Cleared previous .pt files from working directory.")


In [None]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm

# === Step 1: Check & Set Device ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# === Step 2: Move Word2Vec Model to GPU ===
# The table is stored as float16 on `device`; word2idx maps each word to its
# row in that table.
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float16, device=device)
word2vec_vocab = word2vec.index_to_key
word2idx = {word: idx for idx, word in enumerate(word2vec_vocab)}

# === Step 3: Move GloVe & FastText to GPU ===
glove_vocab = list(glove_embeddings.keys())
# Stack the per-word arrays into one ndarray first: building a tensor
# directly from a Python list of ndarrays is the documented slow path.
glove_matrix = np.array([glove_embeddings[word] for word in glove_vocab])
glove_vectors = torch.tensor(glove_matrix, dtype=torch.float16, device=device)
glove_word2idx = {word: idx for idx, word in enumerate(glove_vocab)}

fasttext_vocab = fasttext.index_to_key
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16, device=device)
fasttext_word2idx = {word: idx for idx, word in enumerate(fasttext_vocab)}

# === Step 4: Optimized Word Embedding Lookup on GPU ===
def get_embedding(word, word2idx, embedding_vectors):
    """Look up the embedding row for ``word``.

    Returns ``embedding_vectors[word2idx[word]]`` when the word is known, and
    otherwise a zero tensor shaped like one row of ``embedding_vectors``.
    """
    row = word2idx.get(word)
    if row is None:
        # Unknown word: fall back to an all-zero vector.
        return torch.zeros_like(embedding_vectors[0])
    return embedding_vectors[row]

# === Step 5: Convert Sentences to Word Embeddings Using Mini-Batches ===
MAX_SENTENCES = 50  # Sentences kept per article (shorter articles are padded)
MAX_WORDS = 50  # Words kept per sentence (shorter sentences are padded)
EMBEDDING_DIM_W2V = 300  # Vector width passed for Word2Vec
EMBEDDING_DIM_GLOVE = 100  # Vector width passed for GloVe
EMBEDDING_DIM_FASTTEXT = 300  # Vector width passed for FastText
BATCH_SIZE = 500  # 🚀 Process in batches to avoid OOM

def sentence_to_vector(sentence, word2idx, embedding_vectors, embedding_dim, max_words=MAX_WORDS):
    """Encode one sentence as a (max_words, embedding_dim) float16 matrix.

    The sentence is split on whitespace and cut to ``max_words`` tokens. Row i
    receives the embedding of token i; rows past the last token stay zero.
    The matrix is allocated on the module-level ``device``.
    """
    tokens = sentence.split()[:max_words]
    matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16, device=device)

    for position, token in enumerate(tokens):
        matrix[position] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim, max_sentences=MAX_SENTENCES):
    """Encode an article as a stack of exactly ``max_sentences`` matrices.

    Every element of ``sentences`` is encoded with ``sentence_to_vector``.
    Articles with fewer sentences are padded with zero matrices of shape
    (MAX_WORDS, embedding_dim); longer ones are truncated.
    """
    encoded = []
    for sent in sentences:
        encoded.append(sentence_to_vector(sent, word2idx, embedding_vectors, embedding_dim))

    shortfall = max_sentences - len(encoded)
    if shortfall <= 0:
        # Enough sentences already: keep only the first max_sentences.
        return torch.stack(encoded[:max_sentences])

    blank = torch.zeros((MAX_WORDS, embedding_dim), dtype=torch.float16, device=device)
    return torch.stack(encoded + [blank] * shortfall)

# === Step 6: Process in Mini-Batches ===
def process_in_batches(df, word2idx, embedding_vectors, embedding_dim, batch_size=BATCH_SIZE):
    """Encode the "sentences" column of ``df`` batch by batch.

    Each batch of rows is encoded with ``article_to_vectors``, stacked, and
    moved to the CPU before the next batch starts; the CUDA cache is emptied
    after every batch. All batches are concatenated into the returned tensor.
    """
    n_rows = len(df)
    cpu_chunks = []

    for lo in tqdm(range(0, n_rows, batch_size), desc="🚀 Processing batches on GPU"):
        hi = min(lo + batch_size, n_rows)

        encoded = []
        for sentences in df.iloc[lo:hi]["sentences"]:
            encoded.append(article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim))

        # Keep finished batches on the CPU so only one batch occupies `device`.
        cpu_chunks.append(torch.stack(encoded).cpu())
        torch.cuda.empty_cache()

    return torch.cat(cpu_chunks)


import ast

# === Step 7: Loop over All CSVs and Save to Output Folder ===
input_dir = "/kaggle/input/train-chunk/"
output_dir = "/kaggle/working/train_chunk_pt/"
os.makedirs(output_dir, exist_ok=True)

print("🔄 Starting GloVe embedding conversion for all chunks...")
for file_name in sorted(os.listdir(input_dir)):
    if file_name.endswith(".csv"):
        base_name = file_name.replace(".csv", "")
        input_path = os.path.join(input_dir, file_name)
        output_path = os.path.join(output_dir, f"{base_name}_glove.pt")

        print(f"📂 Processing {file_name} ...")
        df = pd.read_csv(input_path)
        # read_csv returns the "sentences" column as text (the string form of
        # a list). Parse it back into a real list; otherwise the encoder would
        # iterate over single characters. literal_eval only accepts Python
        # literals, so no code from the file is executed.
        df["sentences"] = df["sentences"].apply(ast.literal_eval)

        # Process using GloVe.
        # NOTE(review): tokens are looked up exactly as written; if the GloVe
        # vocabulary is lowercase-only, capitalised words map to zero vectors
        # — confirm whether lowercasing is wanted here.
        processed_tensor = process_in_batches(df, glove_word2idx, glove_vectors, EMBEDDING_DIM_GLOVE)

        # Save the tensor
        torch.save(processed_tensor, output_path)
        print(f"✅ Saved {output_path}")

print("🎉 All chunk files processed and saved in:", output_dir)


In [None]:
import numpy as np
import torch
import os
import pandas as pd
from tqdm.auto import tqdm

# === Step 1: Check & Set Device ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# === Step 2: Move Word2Vec Model to GPU ===
# The table is stored as float16 on `device`; word2idx maps each word to its
# row in that table.
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float16, device=device)
word2vec_vocab = word2vec.index_to_key
word2idx = {word: idx for idx, word in enumerate(word2vec_vocab)}

# === Step 3: Move GloVe & FastText to GPU ===
glove_vocab = list(glove_embeddings.keys())
# Stack the per-word arrays into one ndarray first: building a tensor
# directly from a Python list of ndarrays is the documented slow path.
glove_matrix = np.array([glove_embeddings[word] for word in glove_vocab])
glove_vectors = torch.tensor(glove_matrix, dtype=torch.float16, device=device)
glove_word2idx = {word: idx for idx, word in enumerate(glove_vocab)}

fasttext_vocab = fasttext.index_to_key
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16, device=device)
fasttext_word2idx = {word: idx for idx, word in enumerate(fasttext_vocab)}

# === Step 4: Optimized Word Embedding Lookup on GPU ===
def get_embedding(word, word2idx, embedding_vectors):
    """Look up the embedding row for ``word``.

    Returns ``embedding_vectors[word2idx[word]]`` when the word is known, and
    otherwise a zero tensor shaped like one row of ``embedding_vectors``.
    """
    row = word2idx.get(word)
    if row is None:
        # Unknown word: fall back to an all-zero vector.
        return torch.zeros_like(embedding_vectors[0])
    return embedding_vectors[row]

# === Step 5: Convert Sentences to Word Embeddings Using Mini-Batches ===
MAX_SENTENCES = 50  # Sentences kept per article (shorter articles are padded)
MAX_WORDS = 50  # Words kept per sentence (shorter sentences are padded)
EMBEDDING_DIM_W2V = 300  # Vector width passed for Word2Vec
EMBEDDING_DIM_GLOVE = 100  # Vector width passed for GloVe
EMBEDDING_DIM_FASTTEXT = 300  # Vector width passed for FastText
BATCH_SIZE = 500  # Rows encoded per batch in process_in_batches

def sentence_to_vector(sentence, word2idx, embedding_vectors, embedding_dim, max_words=MAX_WORDS):
    """Encode one sentence as a (max_words, embedding_dim) float16 matrix.

    The sentence is split on whitespace and cut to ``max_words`` tokens. Row i
    receives the embedding of token i; rows past the last token stay zero.
    The matrix is allocated on the module-level ``device``.
    """
    tokens = sentence.split()[:max_words]
    matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16, device=device)

    for position, token in enumerate(tokens):
        matrix[position] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim, max_sentences=MAX_SENTENCES):
    """Encode an article as a stack of exactly ``max_sentences`` matrices.

    Every element of ``sentences`` is encoded with ``sentence_to_vector``.
    Articles with fewer sentences are padded with zero matrices of shape
    (MAX_WORDS, embedding_dim); longer ones are truncated.
    """
    encoded = []
    for sent in sentences:
        encoded.append(sentence_to_vector(sent, word2idx, embedding_vectors, embedding_dim))

    shortfall = max_sentences - len(encoded)
    if shortfall <= 0:
        # Enough sentences already: keep only the first max_sentences.
        return torch.stack(encoded[:max_sentences])

    blank = torch.zeros((MAX_WORDS, embedding_dim), dtype=torch.float16, device=device)
    return torch.stack(encoded + [blank] * shortfall)

def process_in_batches(df, word2idx, embedding_vectors, embedding_dim, batch_size=BATCH_SIZE):
    """Encode the "sentences" column of ``df`` batch by batch.

    Each batch of rows is encoded with ``article_to_vectors``, stacked, and
    moved to the CPU before the next batch starts; the CUDA cache is emptied
    after every batch. All batches are concatenated into the returned tensor.
    """
    n_rows = len(df)
    cpu_chunks = []

    for lo in tqdm(range(0, n_rows, batch_size), desc="🚀 Processing batches on GPU"):
        hi = min(lo + batch_size, n_rows)

        encoded = []
        for sentences in df.iloc[lo:hi]["sentences"]:
            encoded.append(article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim))

        # Keep finished batches on the CPU so only one batch occupies `device`.
        cpu_chunks.append(torch.stack(encoded).cpu())
        torch.cuda.empty_cache()

    return torch.cat(cpu_chunks)

import ast

# === NEW: Process all files in the folder except specified ===
print("🔢 Converting FastText embeddings for selected chunks...")
input_folder = "/kaggle/input/train-chunk"
output_folder = "/kaggle/working/train_fasttext_chunks"
os.makedirs(output_folder, exist_ok=True)

# Skip these files (full file names, including the .csv extension)
skip_files = {
    "train_chunk_1.csv", "train_chunk_10.csv", "train_chunk_11.csv",
    "train_chunk_12.csv", "train_chunk_13.csv", "train_chunk_14.csv",
    "train_chunk_15.csv", "train_chunk_16.csv", "train_chunk_17.csv",
    "train_chunk_18.csv", "train_chunk_19.csv", "train_chunk_2.csv"
}

for file in sorted(os.listdir(input_folder)):
    if file.endswith(".csv") and file not in skip_files:
        chunk_path = os.path.join(input_folder, file)
        print(f"📄 Processing: {file}")
        df = pd.read_csv(chunk_path)
        # The "sentences" column is stored as the string form of a list.
        # NOTE: this used to be parsed with eval(), which executes whatever
        # expression the CSV contains. ast.literal_eval parses the same list
        # literal but accepts literals only.
        df["sentences"] = df["sentences"].apply(ast.literal_eval)
        
        chunk_vectors = process_in_batches(df, fasttext_word2idx, fasttext_vectors, EMBEDDING_DIM_FASTTEXT)
        
        save_name = os.path.splitext(file)[0] + "_fasttext.pt"
        torch.save(chunk_vectors, os.path.join(output_folder, save_name))
        print(f"✅ Saved: {save_name}")

print("🎉 Selected FastText chunks processed and saved!")


In [None]:
# === Step 0: Remove Existing .pt Files from Output Directory ===
# Deletes every file ending in ".pt" directly inside the FastText chunk
# output folder; other files and sub-directories are left untouched.
output_dir = "/kaggle/working/train_fasttext_chunks/"
for f in os.listdir(output_dir):
    if f.endswith(".pt"):
        os.remove(os.path.join(output_dir, f))
print("🧹 Cleared previous .pt files from working directory.")

In [None]:
import os
import torch

# List of directories to scan
directories = [
     "/kaggle/input/word2vec-train-10-chunks",
     "/kaggle/input/word2vec-train-chunks-10-pt2",
     "/kaggle/input/glove-train-chunk-pt1",
     "/kaggle/input/glove-train-chunk-pt2",
     "/kaggle/input/fasttext-train-chunk-pt1",
     "/kaggle/input/fasttext-train-chunk-pt2"
]

# Print the shape of every .pt file in each directory. Each file is loaded in
# full onto the CPU just to read its shape.
for dir_path in directories:
    print(f"\n📁 Directory: {dir_path}")
    for file in sorted(os.listdir(dir_path)):
        if file.endswith(".pt"):
            file_path = os.path.join(dir_path, file)
            try:
                tensor = torch.load(file_path, map_location="cpu")
                print(f"  📄 {file}: shape = {tensor.shape}")
            except Exception as e:
                # Best-effort report: a file that fails to load (or has no
                # .shape) is logged and the scan moves on to the next file.
                print(f"  ❌ Could not load {file}: {e}")


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from rouge_score import rouge_scorer
from tqdm import tqdm
import os

# class LazyEmbeddingDataset(Dataset):
#     def __init__(self, directories):
#         self.file_paths = []
#         self.chunk_sizes = []
        

#         # List all .pt files and store their sample counts
#         for directory in directories:
#             for file in sorted(os.listdir(directory)):
#                 if file.endswith(".pt"):
#                     full_path = os.path.join(directory, file)
#                     data = torch.load(full_path, map_location="cpu")
#                     self.file_paths.append(full_path)
#                     self.chunk_sizes.append(data.shape[0])  # Assume first dim is sample count

#         # Build index mapping: global_index -> (file_idx, local_index)
#         self.index_map = []
#         for file_idx, size in enumerate(self.chunk_sizes):
#             for local_idx in range(size):
#                 self.index_map.append((file_idx, local_idx))

#     def __len__(self):
#         return len(self.index_map)

#     def __getitem__(self, idx):
#         file_idx, local_idx = self.index_map[idx]
#         file_path = self.file_paths[file_idx]
#         data = torch.load(file_path, map_location="cpu")
#         return data[local_idx]

# # Define your folder lists
# word2vec_dirs = [
#     "/kaggle/input/word2vec-train-10-chunks",
#     "/kaggle/input/word2vec-train-chunks-10-pt2"
# ]

# glove_dirs = [
#     "/kaggle/input/glove-train-chunk-pt1",
#     "/kaggle/input/glove-train-chunk-pt2"
# ]

# # fasttext_dirs = [
# #     "/kaggle/input/fasttext-train-chunk-pt1"
# # ]

# # Instantiate datasets (lazy)
# word2vec_dataset = LazyEmbeddingDataset(word2vec_dirs)
# glove_dataset = LazyEmbeddingDataset(glove_dirs)
# # fasttext_dataset = LazyEmbeddingDataset(fasttext_dirs)

# # Example DataLoader (batch size = 16, shuffle = True)
# word2vec_loader = DataLoader(word2vec_dataset, batch_size=16, shuffle=True)
# glove_loader = DataLoader(glove_dataset, batch_size=16, shuffle=True)
# # fasttext_loader = DataLoader(fasttext_dataset, batch_size=16, shuffle=True)

# # Example: iterate
# # for batch in word2vec_loader:
# #     batch = batch.to("cuda")
# #     # your training logic


### Model

In [None]:
class TextSummaryDataset(Dataset):
    """Dataset that yields one mean-pooled embedding matrix per item.

    Each item of ``embeddings`` is averaged over its dim 1 (the word axis in
    this notebook), so an item of shape (sentences, words, dim) is returned as
    a float32 tensor of shape (sentences, dim).

    Args:
        embeddings: Tensor whose first dimension indexes the items. It is
            copied/moved to the CPU; callers move batches to their device.
    """

    def __init__(self, embeddings):
        self.embeddings = embeddings.cpu()  # Ensure embeddings are on CPU

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        # Upcast before pooling so half-precision inputs are averaged in
        # float32. The previous torch.tensor(x, ...) copy-construct emitted a
        # UserWarning on every item; mean() already returns a fresh tensor.
        x = self.embeddings[idx].detach().to(torch.float32)
        return x.mean(dim=1)

In [None]:
class WL_AttenSumm(nn.Module):
    """CNN + Bi-GRU scorer that emits one score in (0, 1) per sequence position.

    Args:
        embedding_dim: Size of the last dimension of the input.
        hidden_dim: Hidden size of each GRU direction.
        num_filters: Output channels of every convolution.
        filter_sizes: Kernel widths, one convolution per entry.

    Input to ``forward`` is (batch, seq_len, embedding_dim); the output is
    (batch, seq_len).
    """

    def __init__(self, embedding_dim, hidden_dim=256, num_filters=100, filter_sizes=(1, 2, 3, 4, 5, 6, 7)):
        super().__init__()

        # An immutable default avoids one list being shared by every instance;
        # tuple() still accepts a list passed by existing callers.
        filter_sizes = tuple(filter_sizes)

        # Convolutional layers with multiple filter sizes
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=fs, padding=fs // 2)
            for fs in filter_sizes
        ])

        # Bi-GRU layer
        self.gru = nn.GRU(input_size=num_filters * len(filter_sizes), hidden_size=hidden_dim,
                          bidirectional=True, batch_first=True)

        # NOTE(review): this attention layer is kept so the parameter layout of
        # saved models is unchanged, but its context vector never fed the
        # returned scores. Confirm whether it was meant to.
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Fully connected layer producing the per-position score
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        """Score every position of ``x``; returns (batch, seq_len)."""
        # Read the sequence length from the input instead of assuming 50.
        seq_len = x.shape[1]

        # Conv1d expects channels first: (batch, embedding_dim, seq_len).
        x = x.permute(0, 2, 1)

        # Apply every convolution, then max-pool each result over time.
        conv_outputs = [F.relu(conv(x)) for conv in self.convs]
        pooled_outputs = [F.max_pool1d(co, kernel_size=co.shape[2]).squeeze(2) for co in conv_outputs]

        # (batch, num_filters * len(filter_sizes))
        cnn_output = torch.cat(pooled_outputs, dim=1)

        # Repeat the pooled feature vector at every position for the Bi-GRU.
        x = cnn_output.unsqueeze(1).expand(-1, seq_len, -1)

        # (batch, seq_len, hidden_dim * 2)
        x, _ = self.gru(x)

        # The attention-weighted sum that was computed here was discarded, so
        # the dead computation is dropped; the output is unchanged.
        yi = torch.sigmoid(self.fc(x)).squeeze(-1)

        return yi

In [None]:
import torch
# GPU diagnostics. Device-specific queries are only attempted when CUDA is
# present, so this cell also runs on a CPU-only session.
print(torch.cuda.is_available())  # Should print True
print(torch.cuda.device_count())  # Should print number of GPUs
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Prints GPU name
    # Printed explicitly: as bare expressions only the last value was shown.
    print(f"Current device index: {torch.cuda.current_device()}")
    print(f"Memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
    print(f"Memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

### 20k Word2Vec

#### Training

In [None]:
# Function to train model on a single file
def train_on_file(model, file_path, device, epochs=20, batch_size=32, lr=0.001):
    """Train ``model`` in place on the embeddings stored in one ``.pt`` file.

    Args:
        model: Module called on batches from ``TextSummaryDataset``.
        file_path: Path of a tensor file readable by ``torch.load``.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over the file.
        batch_size: Articles per batch.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object after training.
    """
    print(f"\nTraining on file: {os.path.basename(file_path)}")

    # Load to CPU first; only individual batches go to the device.
    embeddings = torch.load(file_path, map_location="cpu")

    dataset = TextSummaryDataset(embeddings)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    # Guard the epoch average against an empty file.
    num_batches = max(len(dataloader), 1)

    # A new optimizer per file, so Adam's moment estimates restart each file.
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.99, 0.999))

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for step, inputs in enumerate(progress, start=1):
            inputs = inputs.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # NOTE(review): the objective is the per-article variance of the
            # scores and uses no reference labels, so minimizing it pushes all
            # scores toward one constant. Confirm this is intended.
            loss = torch.mean((outputs - outputs.mean(dim=1, keepdim=True)) ** 2)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Average over the batches seen so far, not the total batch count,
            # so the bar is meaningful before the epoch ends.
            progress.set_postfix(loss=running_loss / step)

        print(f"File: {os.path.basename(file_path)} - Epoch {epoch+1}, Loss: {running_loss/num_batches:.4f}")

    # Release references before the next file is loaded to limit peak memory.
    del embeddings, dataset, dataloader
    torch.cuda.empty_cache()

    return model

In [None]:

def main():
    """Train WL_AttenSumm on the Word2Vec chunk files, one file at a time.

    A checkpoint (model + index of the last finished file) is written after
    every file, so an interrupted run resumes with the next file. The final
    model is saved to ``final_model.pt``.

    Raises:
        FileNotFoundError: If a fresh run finds no training files.
    """
    # Local import: this cell does not import glob itself.
    import glob

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Get all the training files
    files_part1 = sorted(glob.glob("/kaggle/input/word2vec-train-10-chunks/*.pt"))
    files_part2 = sorted(glob.glob("/kaggle/input/word2vec-train-chunks-10-pt2/*.pt"))
    all_files = files_part1 + files_part2

    print(f"Found {len(all_files)} training files")

    checkpoint_path = "model_checkpoint.pt"
    start_file_idx = 0

    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}")
        # The checkpoint holds a pickled nn.Module, which the weights_only=True
        # default of PyTorch >= 2.6 refuses to load. Unpickling runs arbitrary
        # code, so only load checkpoints this notebook wrote itself.
        try:
            checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        except TypeError:
            # Older PyTorch releases do not accept the weights_only keyword.
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint['model']
        start_file_idx = checkpoint['file_idx'] + 1
        print(f"Continuing from file index {start_file_idx}")
    else:
        if not all_files:
            raise FileNotFoundError(
                "No .pt training files found under the Word2Vec input directories"
            )
        # Load the first file only to read the embedding width.
        sample_data = torch.load(all_files[0], map_location="cpu")
        embedding_dim = sample_data.shape[-1]  # Should be 300 for Word2Vec
        del sample_data  # Free memory

        model = WL_AttenSumm(embedding_dim=embedding_dim)
        print(f"Initialized new model with embedding_dim={embedding_dim}")

    model = model.to(device)

    # Train on each remaining file sequentially
    for i, file_path in enumerate(all_files[start_file_idx:], start=start_file_idx):
        print(f"\nProcessing file {i+1}/{len(all_files)}: {os.path.basename(file_path)}")

        model = train_on_file(model, file_path, device, epochs=20)

        # model.cpu() moves the module in place so the checkpoint holds no
        # CUDA tensors.
        checkpoint = {
            'model': model.cpu(),
            'file_idx': i
        }
        torch.save(checkpoint, checkpoint_path)

        # Move the model back to the device for the next file.
        model = model.to(device)

        print(f"Saved checkpoint after file {i+1}/{len(all_files)}")

    # Save final model
    torch.save(model.cpu(), "final_model.pt")
    print("Training completed on all files. Final model saved.")

if __name__ == "__main__":
    main()

#### Testing


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import re
from rouge_score import rouge_scorer

# ===== Step 1: Load Word Embeddings =====
print("Loading word embeddings...")

# `word2vec` must already exist from an earlier cell (presumably a gensim
# KeyedVectors object; verify). This cell only derives lookup structures.
word2vec_vocab = word2vec.index_to_key
# Map each vocabulary word to its row in the vector table.
word2idx = dict(zip(word2vec_vocab, range(len(word2vec_vocab))))
# Half-precision copy of the vector table, created on the CPU.
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float16)

# ===== Step 2: Text Processing Functions =====
def preprocess_text(text):
    """Normalize ``text`` to lowercase letters separated by single spaces.

    Non-string input gives an empty string.
    """
    if isinstance(text, str):
        # Every character that is neither a letter nor whitespace becomes a space.
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Collapse each whitespace run to one space.
        collapsed = re.sub(r'\s+', ' ', letters_only)
        return collapsed.lower().strip()
    return ""

def split_into_sentences(text):
    """Return the non-empty sentences of ``text``; non-string input gives []."""
    if not isinstance(text, str):
        return []

    # Split at whitespace that follows '.', '!' or '?'. The lookbehind keeps
    # the punctuation attached to its sentence.
    pieces = (chunk.strip() for chunk in re.split(r'(?<=[.!?])\s+', text))
    return list(filter(None, pieces))

# ===== Step 3: Embedding Generation Functions =====
# Fixed tensor geometry used by the embedding helpers:
# sentences kept per article, words kept per sentence.
MAX_SENTENCES, MAX_WORDS = 50, 50
EMBEDDING_DIM = 300  # Word2Vec dimension

def get_embedding(word, word2idx, embedding_vectors):
    """Return the row of ``embedding_vectors`` for ``word``.

    A word missing from ``word2idx`` gives a zero vector with the shape,
    dtype and device of a table row.
    """
    position = word2idx.get(word)
    if position is None:
        return torch.zeros_like(embedding_vectors[0])
    return embedding_vectors[position]

def sentence_to_vector(sentence, word2idx, embedding_vectors, max_words=MAX_WORDS):
    """Convert a sentence into a (max_words, dim) float16 embedding matrix.

    The sentence is cleaned with ``preprocess_text`` and truncated to
    ``max_words`` words. Rows past the last word stay zero.

    Args:
        sentence: Raw sentence text.
        word2idx: Mapping from word to row index in ``embedding_vectors``.
        embedding_vectors: 2-D tensor of word vectors.
        max_words: Number of rows in the returned matrix.
    """
    words = preprocess_text(sentence).split()[:max_words]

    # Take the width from the table passed in, not the module-level
    # EMBEDDING_DIM, so a table of a different width still works.
    embedding_dim = embedding_vectors.shape[1]
    embedding_matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16)

    for i, word in enumerate(words):
        embedding_matrix[i] = get_embedding(word, word2idx, embedding_vectors)

    return embedding_matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, max_sentences=MAX_SENTENCES):
    """Stack the sentence matrices of one article into a tensor with exactly
    ``max_sentences`` entries along dimension 0.

    Extra sentences are dropped; missing ones are filled with all-zero
    (MAX_WORDS, EMBEDDING_DIM) float16 matrices.
    """
    encoded = []
    for sent in sentences[:max_sentences]:
        encoded.append(sentence_to_vector(sent, word2idx, embedding_vectors))

    missing = max_sentences - len(encoded)
    if missing > 0:
        # One zero matrix is reused for every padding slot; torch.stack copies.
        blank = torch.zeros((MAX_WORDS, EMBEDDING_DIM), dtype=torch.float16)
        encoded.extend(blank for _ in range(missing))

    return torch.stack(encoded)

# ===== Step 4: Process Test Dataset =====
def process_test_dataset(df_path, word2idx, embedding_vectors, batch_size=100):
    """Read the test CSV and embed every article.

    Args:
        df_path: CSV path; the file must have "article" and "highlights" columns.
        word2idx: Mapping from word to row index in ``embedding_vectors``.
        embedding_vectors: Word-vector table passed on to ``article_to_vectors``.
        batch_size: Number of rows embedded and stacked at a time.

    Returns:
        ``(test_embeddings, original_articles, highlights)``: the concatenated
        article tensors, then the two CSV columns as lists in row order.
    """
    print(f"Loading test dataset from {df_path}...")
    df = pd.read_csv(df_path)
    total_rows = len(df)

    print(f"Processing {total_rows} test articles...")
    stacked_batches, original_articles, highlights = [], [], []

    # Walk the frame in slices of batch_size rows.
    for lo in tqdm(range(0, total_rows, batch_size), desc="Processing test batches"):
        chunk = df.iloc[lo:min(lo + batch_size, total_rows)]

        encoded = [
            article_to_vectors(split_into_sentences(text), word2idx, embedding_vectors)
            for text in chunk["article"]
        ]

        # Keep the raw text so summaries can be rebuilt and scored later.
        original_articles += chunk["article"].tolist()
        highlights += chunk["highlights"].tolist()

        if encoded:
            stacked_batches.append(torch.stack(encoded))

    test_embeddings = torch.cat(stacked_batches)
    print(f"Generated embeddings shape: {test_embeddings.shape}")

    return test_embeddings, original_articles, highlights

# ===== Step 5: TextSummaryDataset Class =====
class TextSummaryDataset(Dataset):
    """Serve one mean-pooled article tensor per index.

    Args:
        embeddings: Tensor indexed by article along dimension 0. Each item is
            expected to be (sentences, words, dim), e.g. (50, 50, 300).
    """

    def __init__(self, embeddings):
        # Keep the whole tensor on the CPU; the caller moves batches to the
        # compute device.
        self.embeddings = embeddings.cpu()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return article ``idx`` averaged over its word axis, as float32."""
        x = self.embeddings[idx]
        # Mean pooling over dimension 1 (the word axis): one vector per sentence.
        x = x.mean(dim=1)
        # Tensor.to() replaces torch.tensor(x, ...): copy-constructing from an
        # existing tensor emits a UserWarning on every item.
        return x.to(torch.float32)

# ===== Step 6: Summary Extraction and Evaluation =====
def extract_top_sentences(scores, articles, top_k=3):
    """Build one extractive summary per article from per-sentence scores.

    Args:
        scores: 2-D NumPy array; row ``i`` holds the scores for article ``i``.
        articles: Raw article texts, in the same order as the rows of ``scores``.
        top_k: Maximum number of sentences per summary.

    Returns:
        List of summaries. An article with no sentences gives "".
    """
    summaries = []

    for row, article in enumerate(articles):
        sentences = split_into_sentences(article)

        if sentences:
            # Only positions backed by a real sentence are ranked. Columns
            # past the article's sentence count are ignored.
            usable = min(len(sentences), scores.shape[1])
            ranked = scores[row][:usable]

            if len(ranked) > top_k:
                chosen = np.argsort(-ranked)[:top_k]
            else:
                chosen = np.arange(len(ranked))

            # Emit the picked sentences in document order. Every index is
            # below `usable`, so it is always a valid sentence position.
            summary = " ".join(sentences[pos] for pos in sorted(chosen))
        else:
            summary = ""

        summaries.append(summary)

    return summaries

def compute_rouge(predicted_summaries, gold_summaries):
    """Return the mean ROUGE-1/2/L F-measure over paired summaries.

    A pair whose prediction or reference is missing (non-string, empty or
    whitespace-only) counts as 0.0 for every metric. Substituting placeholder
    text instead would score a perfect 1.0 when both sides are empty and
    inflate the averages.

    Args:
        predicted_summaries: Iterable of predicted summary strings.
        gold_summaries: Iterable of reference summary strings, same order.

    Returns:
        Dict mapping 'rouge1', 'rouge2', 'rougeL' to the average F-measure
        (0.0 for each when there are no pairs).
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    scores = {name: [] for name in metric_names}

    for pred, gold in zip(predicted_summaries, gold_summaries):
        pred_usable = isinstance(pred, str) and bool(pred.strip())
        gold_usable = isinstance(gold, str) and bool(gold.strip())

        if not (pred_usable and gold_usable):
            for name in metric_names:
                scores[name].append(0.0)
            continue

        score = scorer.score(gold, pred)
        for name in metric_names:
            scores[name].append(score[name].fmeasure)

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead.
    avg_scores = {key: (np.mean(val) if val else 0.0) for key, val in scores.items()}

    return avg_scores

# ===== Step 7: Main Evaluation Function =====
def evaluate_model(model_path, test_data_path, word2idx, embedding_vectors):
    """Run the full evaluation: load model, embed test set, score, report ROUGE.

    Side effects: writes ``test_embeddings.pt`` and ``test_predictions.csv``
    to the working directory and prints progress and the ROUGE averages.

    Args:
        model_path: Path to a model saved whole with ``torch.save(model, ...)``.
        test_data_path: CSV with "article" and "highlights" columns.
        word2idx: Mapping from word to row index in ``embedding_vectors``.
        embedding_vectors: Word-vector table.

    Returns:
        Dict of average ROUGE F-measures from ``compute_rouge``.
    """
    print(f"Starting evaluation pipeline...")

    # Step 1: Load model
    print(f"Loading model from {model_path}...")
    # The file holds a pickled nn.Module, which the weights_only=True default
    # of PyTorch >= 2.6 refuses to load. Unpickling runs arbitrary code, so
    # only load model files from a trusted source. map_location="cpu" lets a
    # GPU-saved model load on a CPU-only session.
    try:
        model = torch.load(model_path, map_location="cpu", weights_only=False)
    except TypeError:
        # Older PyTorch releases do not accept the weights_only keyword.
        model = torch.load(model_path, map_location="cpu")

    # Step 2: Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)

    # Step 3: Generate test embeddings
    print("Generating test embeddings...")
    test_embeddings, original_articles, highlights = process_test_dataset(
        test_data_path, word2idx, embedding_vectors
    )

    # Step 4: Save embeddings to avoid regenerating (optional)
    torch.save(test_embeddings, "test_embeddings.pt")
    print("Saved test embeddings to test_embeddings.pt")

    # Step 5: Create dataset and dataloader. shuffle=False keeps the score
    # rows aligned with original_articles.
    test_dataset = TextSummaryDataset(test_embeddings)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

    # Step 6: Extract sentence scores
    model.eval()
    all_scores = []

    print("Generating sentence scores...")
    with torch.no_grad():
        for inputs in tqdm(test_loader, desc="Scoring sentences"):
            inputs = inputs.to(device)
            scores = model(inputs)
            all_scores.append(scores.cpu().numpy())

    # Combine all batch results
    sentence_scores = np.concatenate(all_scores, axis=0)

    # Step 7: Generate summaries
    print("Extracting top sentences for summaries...")
    predicted_summaries = extract_top_sentences(
        sentence_scores,
        original_articles,
        top_k=3
    )

    # Step 8: Compute ROUGE scores
    print("Computing ROUGE scores...")
    rouge_scores = compute_rouge(predicted_summaries, highlights)

    # Step 9: Print results
    print("\nROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"{key}: {value:.4f}")

    # Step 10: Save predictions
    results_df = pd.DataFrame({
        'article': original_articles,
        'highlights': highlights,
        'predicted_summary': predicted_summaries
    })
    results_df.to_csv("test_predictions.csv", index=False)
    print("Saved predictions to test_predictions.csv")

    return rouge_scores

# ===== Step 8: Run Evaluation =====
if __name__ == "__main__":
    # Kept as a module-level name in case other cells read it;
    # evaluate_model selects its own device for the model.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The word2vec table is left on the CPU on purpose. sentence_to_vector
    # fills CPU matrices one word at a time, so a GPU-resident table would
    # cost one device-to-host copy per word and hold GPU memory for nothing.

    # Run evaluation
    evaluate_model(
        model_path="/kaggle/input/word2vec-model/final_model.pt",
        test_data_path="/kaggle/input/sampled-20k/test_2000.csv",
        word2idx=word2idx,
        embedding_vectors=word2vec_vectors
    )

### 20k Fasttext

#### Training

In [None]:
# Function to train model on a single file
def train_on_file(model, file_path, device, epochs=20, batch_size=32, lr=0.001):
    """Train ``model`` in place on the embeddings stored in one ``.pt`` file.

    Args:
        model: Module called on batches from ``TextSummaryDataset``.
        file_path: Path of a tensor file readable by ``torch.load``.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over the file.
        batch_size: Articles per batch.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object after training.
    """
    print(f"\nTraining on file: {os.path.basename(file_path)}")

    # Load to CPU first; only individual batches go to the device.
    embeddings = torch.load(file_path, map_location="cpu")

    dataset = TextSummaryDataset(embeddings)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    # Guard the epoch average against an empty file.
    num_batches = max(len(dataloader), 1)

    # A new optimizer per file, so Adam's moment estimates restart each file.
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.99, 0.999))

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for step, inputs in enumerate(progress, start=1):
            inputs = inputs.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # NOTE(review): the objective is the per-article variance of the
            # scores and uses no reference labels, so minimizing it pushes all
            # scores toward one constant. Confirm this is intended.
            loss = torch.mean((outputs - outputs.mean(dim=1, keepdim=True)) ** 2)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Average over the batches seen so far, not the total batch count,
            # so the bar is meaningful before the epoch ends.
            progress.set_postfix(loss=running_loss / step)

        print(f"File: {os.path.basename(file_path)} - Epoch {epoch+1}, Loss: {running_loss/num_batches:.4f}")

    # Release references before the next file is loaded to limit peak memory.
    del embeddings, dataset, dataloader
    torch.cuda.empty_cache()

    return model

In [None]:
# import glob
def main():
    """Train WL_AttenSumm on the FastText chunk files, one file at a time.

    A checkpoint (model + index of the last finished file) is written after
    every file, so an interrupted run resumes with the next file. The final
    model is saved to ``final_model.pt``.

    Raises:
        FileNotFoundError: If a fresh run finds no training files.
    """
    # Local import: this cell does not import glob itself.
    import glob

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Get all the training files
    files_part1 = sorted(glob.glob("/kaggle/input/fasttext-train-chunk-pt1/*.pt"))
    files_part2 = sorted(glob.glob("/kaggle/input/fasttext-train-chunk-pt2/*.pt"))
    all_files = files_part1 + files_part2

    print(f"Found {len(all_files)} training files")

    # FastText-specific checkpoint name. The shared "model_checkpoint.pt" is
    # also written by the Word2Vec run; a leftover file from that run would be
    # picked up here and make this run skip training.
    # NOTE(review): "final_model.pt" below is still shared with the Word2Vec
    # run and overwrites its output. Rename if both models must coexist.
    checkpoint_path = "fasttext_model_checkpoint.pt"
    start_file_idx = 0

    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}")
        # The checkpoint holds a pickled nn.Module, which the weights_only=True
        # default of PyTorch >= 2.6 refuses to load. Unpickling runs arbitrary
        # code, so only load checkpoints this notebook wrote itself.
        try:
            checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
        except TypeError:
            # Older PyTorch releases do not accept the weights_only keyword.
            checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint['model']
        start_file_idx = checkpoint['file_idx'] + 1
        print(f"Continuing from file index {start_file_idx}")
    else:
        if not all_files:
            raise FileNotFoundError(
                "No .pt training files found under the FastText input directories"
            )
        # Load the first file only to read the embedding width.
        sample_data = torch.load(all_files[0], map_location="cpu")
        embedding_dim = sample_data.shape[-1]  # Width of the stored FastText vectors
        del sample_data  # Free memory

        model = WL_AttenSumm(embedding_dim=embedding_dim)
        print(f"Initialized new model with embedding_dim={embedding_dim}")

    model = model.to(device)

    # Train on each remaining file sequentially
    for i, file_path in enumerate(all_files[start_file_idx:], start=start_file_idx):
        print(f"\nProcessing file {i+1}/{len(all_files)}: {os.path.basename(file_path)}")

        model = train_on_file(model, file_path, device, epochs=20)

        # model.cpu() moves the module in place so the checkpoint holds no
        # CUDA tensors.
        checkpoint = {
            'model': model.cpu(),
            'file_idx': i
        }
        torch.save(checkpoint, checkpoint_path)

        # Move the model back to the device for the next file.
        model = model.to(device)

        print(f"Saved checkpoint after file {i+1}/{len(all_files)}")

    # Save final model
    torch.save(model.cpu(), "final_model.pt")
    print("Training completed on all files. Final model saved.")

if __name__ == "__main__":
    main()

#### Testing

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import re
from rouge_score import rouge_scorer

# ===== Step 1: Load Word Embeddings =====
print("Loading fasttext embeddings...")

# `fasttext` must already exist from an earlier cell (presumably a gensim
# KeyedVectors object; verify). This cell only derives lookup structures.
fasttext_vocab = fasttext.index_to_key
# Map each vocabulary word to its row in the vector table.
word2idx = dict(zip(fasttext_vocab, range(len(fasttext_vocab))))
# Half-precision copy of the vector table, created on the CPU.
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16)

# ===== Step 2: Text Processing Functions =====
def preprocess_text(text):
    """Normalize ``text`` to lowercase letters separated by single spaces.

    Non-string input gives an empty string.
    """
    if isinstance(text, str):
        # Every character that is neither a letter nor whitespace becomes a space.
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)
        # Collapse each whitespace run to one space.
        collapsed = re.sub(r'\s+', ' ', letters_only)
        return collapsed.lower().strip()
    return ""

def split_into_sentences(text):
    """Return the non-empty sentences of ``text``; non-string input gives []."""
    if not isinstance(text, str):
        return []

    # Split at whitespace that follows '.', '!' or '?'. The lookbehind keeps
    # the punctuation attached to its sentence.
    pieces = (chunk.strip() for chunk in re.split(r'(?<=[.!?])\s+', text))
    return list(filter(None, pieces))

# ===== Step 3: Embedding Generation Functions =====
# Fixed tensor geometry used by the embedding helpers:
# sentences kept per article, words kept per sentence.
MAX_SENTENCES, MAX_WORDS = 50, 50
# Width of one FastText vector, read from the loaded table.
EMBEDDING_DIM = fasttext_vectors.size(1)

def get_embedding(word, word2idx, embedding_vectors):
    """Return a vector for ``word``, trying several sources in order.

    1. The row of ``embedding_vectors`` that ``word2idx`` points to.
    2. For a word outside the vocabulary, ``fasttext.get_vector(word)`` if the
       module-level ``fasttext`` object offers that method (best effort).
    3. A float16 zero vector as wide as a row of ``embedding_vectors``.
    """
    idx = word2idx.get(word, None)
    if idx is not None:
        return embedding_vectors[idx]

    # Out-of-vocabulary: let the loaded FastText object try to produce a vector.
    if hasattr(fasttext, 'get_vector'):
        try:
            return torch.tensor(fasttext.get_vector(word), dtype=torch.float16)
        except Exception:
            # Still best effort, but no longer a bare `except:`, which also
            # swallowed KeyboardInterrupt and SystemExit.
            pass

    # Width comes from the table passed in, not the global EMBEDDING_DIM.
    return torch.zeros(embedding_vectors.shape[1], dtype=torch.float16)

def sentence_to_vector(sentence, word2idx, embedding_vectors, max_words=MAX_WORDS):
    """Convert a sentence into a (max_words, dim) float16 embedding matrix.

    The sentence is cleaned with ``preprocess_text`` and truncated to
    ``max_words`` words. Rows past the last word stay zero.

    Args:
        sentence: Raw sentence text.
        word2idx: Mapping from word to row index in ``embedding_vectors``.
        embedding_vectors: 2-D tensor of word vectors.
        max_words: Number of rows in the returned matrix.
    """
    words = preprocess_text(sentence).split()[:max_words]

    # Take the width from the table passed in, not the module-level
    # EMBEDDING_DIM, so the matrix always matches the vectors it receives.
    embedding_dim = embedding_vectors.shape[1]
    embedding_matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16)

    for i, word in enumerate(words):
        embedding_matrix[i] = get_embedding(word, word2idx, embedding_vectors)

    return embedding_matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, max_sentences=MAX_SENTENCES):
    """Stack the sentence matrices of one article into a tensor with exactly
    ``max_sentences`` entries along dimension 0.

    Extra sentences are dropped; missing ones are filled with all-zero
    (MAX_WORDS, EMBEDDING_DIM) float16 matrices.
    """
    encoded = []
    for sent in sentences[:max_sentences]:
        encoded.append(sentence_to_vector(sent, word2idx, embedding_vectors))

    missing = max_sentences - len(encoded)
    if missing > 0:
        # One zero matrix is reused for every padding slot; torch.stack copies.
        blank = torch.zeros((MAX_WORDS, EMBEDDING_DIM), dtype=torch.float16)
        encoded.extend(blank for _ in range(missing))

    return torch.stack(encoded)

# ===== Step 4: Process Test Dataset =====
def process_test_dataset(df_path, word2idx, embedding_vectors, batch_size=100):
    """Read the test CSV and embed every article.

    Args:
        df_path: CSV path; the file must have "article" and "highlights" columns.
        word2idx: Mapping from word to row index in ``embedding_vectors``.
        embedding_vectors: Word-vector table passed on to ``article_to_vectors``.
        batch_size: Number of rows embedded and stacked at a time.

    Returns:
        ``(test_embeddings, original_articles, highlights)``: the concatenated
        article tensors, then the two CSV columns as lists in row order.
    """
    print(f"Loading test dataset from {df_path}...")
    df = pd.read_csv(df_path)
    total_rows = len(df)

    print(f"Processing {total_rows} test articles...")
    stacked_batches, original_articles, highlights = [], [], []

    # Walk the frame in slices of batch_size rows.
    for lo in tqdm(range(0, total_rows, batch_size), desc="Processing test batches"):
        chunk = df.iloc[lo:min(lo + batch_size, total_rows)]

        encoded = [
            article_to_vectors(split_into_sentences(text), word2idx, embedding_vectors)
            for text in chunk["article"]
        ]

        # Keep the raw text so summaries can be rebuilt and scored later.
        original_articles += chunk["article"].tolist()
        highlights += chunk["highlights"].tolist()

        if encoded:
            stacked_batches.append(torch.stack(encoded))

    test_embeddings = torch.cat(stacked_batches)
    print(f"Generated embeddings shape: {test_embeddings.shape}")

    return test_embeddings, original_articles, highlights

# ===== Step 5: TextSummaryDataset Class =====
class TextSummaryDataset(Dataset):
    """Serve one mean-pooled article tensor per index.

    Args:
        embeddings: Tensor indexed by article along dimension 0. Each item is
            expected to be (sentences, words, dim).
    """

    def __init__(self, embeddings):
        # Keep the whole tensor on the CPU; the caller moves batches to the
        # compute device.
        self.embeddings = embeddings.cpu()

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return article ``idx`` averaged over its word axis, as float32."""
        x = self.embeddings[idx]
        # Mean pooling over dimension 1 (the word axis): one vector per sentence.
        x = x.mean(dim=1)
        # Tensor.to() replaces torch.tensor(x, ...): copy-constructing from an
        # existing tensor emits a UserWarning on every item.
        return x.to(torch.float32)

# ===== Step 6: Summary Extraction and Evaluation =====
def extract_top_sentences(scores, articles, top_k=3):
    """Build one extractive summary per article from per-sentence scores.

    Args:
        scores: 2-D NumPy array; row ``i`` holds the scores for article ``i``.
        articles: Raw article texts, in the same order as the rows of ``scores``.
        top_k: Maximum number of sentences per summary.

    Returns:
        List of summaries. An article with no sentences gives "".
    """
    summaries = []

    for row, article in enumerate(articles):
        sentences = split_into_sentences(article)

        if sentences:
            # Only positions backed by a real sentence are ranked. Columns
            # past the article's sentence count are ignored.
            usable = min(len(sentences), scores.shape[1])
            ranked = scores[row][:usable]

            if len(ranked) > top_k:
                chosen = np.argsort(-ranked)[:top_k]
            else:
                chosen = np.arange(len(ranked))

            # Emit the picked sentences in document order. Every index is
            # below `usable`, so it is always a valid sentence position.
            summary = " ".join(sentences[pos] for pos in sorted(chosen))
        else:
            summary = ""

        summaries.append(summary)

    return summaries

def compute_rouge(predicted_summaries, gold_summaries):
    """Return the mean ROUGE-1/2/L F-measure over paired summaries.

    A pair whose prediction or reference is missing (non-string, empty or
    whitespace-only) counts as 0.0 for every metric. Substituting placeholder
    text instead would score a perfect 1.0 when both sides are empty and
    inflate the averages.

    Args:
        predicted_summaries: Iterable of predicted summary strings.
        gold_summaries: Iterable of reference summary strings, same order.

    Returns:
        Dict mapping 'rouge1', 'rouge2', 'rougeL' to the average F-measure
        (0.0 for each when there are no pairs).
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    scores = {name: [] for name in metric_names}

    for pred, gold in zip(predicted_summaries, gold_summaries):
        pred_usable = isinstance(pred, str) and bool(pred.strip())
        gold_usable = isinstance(gold, str) and bool(gold.strip())

        if not (pred_usable and gold_usable):
            for name in metric_names:
                scores[name].append(0.0)
            continue

        score = scorer.score(gold, pred)
        for name in metric_names:
            scores[name].append(score[name].fmeasure)

    # np.mean of an empty list is NaN (with a warning); report 0.0 instead.
    avg_scores = {key: (np.mean(val) if val else 0.0) for key, val in scores.items()}

    return avg_scores

# ===== Step 7: Main Evaluation Function =====
def evaluate_model(model_path, test_data_path, word2idx, embedding_vectors):
    """Run the full evaluation: load model, embed test set, score, report ROUGE.

    Side effects: writes ``fasttext_test_embeddings.pt`` and
    ``fasttext_test_predictions.csv`` to the working directory and prints
    progress and the ROUGE averages.

    Args:
        model_path: Path to a model saved whole with ``torch.save(model, ...)``.
        test_data_path: CSV with "article" and "highlights" columns.
        word2idx: Mapping from word to row index in ``embedding_vectors``.
        embedding_vectors: Word-vector table.

    Returns:
        Dict of average ROUGE F-measures from ``compute_rouge``.
    """
    print(f"Starting evaluation pipeline...")

    # Step 1: Load model
    print(f"Loading model from {model_path}...")
    # The file holds a pickled nn.Module, which the weights_only=True default
    # of PyTorch >= 2.6 refuses to load. Unpickling runs arbitrary code, so
    # only load model files from a trusted source. map_location="cpu" lets a
    # GPU-saved model load on a CPU-only session.
    try:
        model = torch.load(model_path, map_location="cpu", weights_only=False)
    except TypeError:
        # Older PyTorch releases do not accept the weights_only keyword.
        model = torch.load(model_path, map_location="cpu")

    # Step 2: Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)

    # Step 3: Generate test embeddings
    print("Generating test embeddings...")
    test_embeddings, original_articles, highlights = process_test_dataset(
        test_data_path, word2idx, embedding_vectors
    )

    # Step 4: Save embeddings to avoid regenerating (optional)
    torch.save(test_embeddings, "fasttext_test_embeddings.pt")
    print("Saved test embeddings to fasttext_test_embeddings.pt")

    # Step 5: Create dataset and dataloader. shuffle=False keeps the score
    # rows aligned with original_articles.
    test_dataset = TextSummaryDataset(test_embeddings)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=0)

    # Step 6: Extract sentence scores
    model.eval()
    all_scores = []

    print("Generating sentence scores...")
    with torch.no_grad():
        for inputs in tqdm(test_loader, desc="Scoring sentences"):
            inputs = inputs.to(device)
            scores = model(inputs)
            all_scores.append(scores.cpu().numpy())

    # Combine all batch results
    sentence_scores = np.concatenate(all_scores, axis=0)

    # Step 7: Generate summaries
    print("Extracting top sentences for summaries...")
    predicted_summaries = extract_top_sentences(
        sentence_scores,
        original_articles,
        top_k=3
    )

    # Step 8: Compute ROUGE scores
    print("Computing ROUGE scores...")
    rouge_scores = compute_rouge(predicted_summaries, highlights)

    # Step 9: Print results
    print("\nROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"{key}: {value:.4f}")

    # Step 10: Save predictions
    results_df = pd.DataFrame({
        'article': original_articles,
        'highlights': highlights,
        'predicted_summary': predicted_summaries
    })
    results_df.to_csv("fasttext_test_predictions.csv", index=False)
    print("Saved predictions to fasttext_test_predictions.csv")

    return rouge_scores

# ===== Step 8: Run Evaluation =====
if __name__ == "__main__":
    # Kept as a module-level name in case other cells read it;
    # evaluate_model selects its own device for the model.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The fasttext table is left on the CPU on purpose. sentence_to_vector
    # fills CPU matrices one word at a time, so a GPU-resident table would
    # cost one device-to-host copy per word and hold GPU memory for nothing.

    # Run evaluation using fasttext
    evaluate_model(
        model_path="/kaggle/input/fasttext-model/final_model(2).pt",  # You might need to use a different model trained with fasttext
        test_data_path="/kaggle/input/sampled-20k/test_2000.csv",
        word2idx=word2idx,
        embedding_vectors=fasttext_vectors
    )

### 20k Glove

#### Training

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import glob


def train_on_file(model, file_path, device, epochs=20, batch_size=32, lr=0.001):
    """Train ``model`` in place on the embeddings stored in one ``.pt`` file.

    Args:
        model: Module called on batches from ``TextSummaryDataset``.
        file_path: Path of a tensor file readable by ``torch.load``.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over the file.
        batch_size: Articles per batch.
        lr: Adam learning rate.

    Returns:
        The same ``model`` object after training.
    """
    print(f"\nTraining on file: {os.path.basename(file_path)}")

    # Load to CPU first; only individual batches go to the device.
    embeddings = torch.load(file_path, map_location="cpu")

    dataset = TextSummaryDataset(embeddings)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    # Guard the epoch average against an empty file.
    num_batches = max(len(dataloader), 1)

    # A new optimizer per file, so Adam's moment estimates restart each file.
    optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.99, 0.999))

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for step, inputs in enumerate(progress, start=1):
            inputs = inputs.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)

            # NOTE(review): the objective is the per-article variance of the
            # scores and uses no reference labels, so minimizing it pushes all
            # scores toward one constant. Confirm this is intended.
            loss = torch.mean((outputs - outputs.mean(dim=1, keepdim=True)) ** 2)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Average over the batches seen so far, not the total batch count,
            # so the bar is meaningful before the epoch ends.
            progress.set_postfix(loss=running_loss / step)

        print(f"File: {os.path.basename(file_path)} - Epoch {epoch+1}, Loss: {running_loss/num_batches:.4f}")

    # Release references before the next file is loaded to limit peak memory.
    del embeddings, dataset, dataloader
    torch.cuda.empty_cache()

    return model



In [None]:
def main():
    """Train a ``WL_AttenSumm`` model over all GloVe embedding chunks, resumably.

    The chunk files are visited in sorted order. After every file a checkpoint
    (``glove_model_checkpoint.pt``) holding the model and the index of the file
    just finished is written; if that checkpoint exists at start-up, training
    resumes with the following file. The final model is written to
    ``glove_final_model.pt``.

    Raises:
        FileNotFoundError: If a new model must be initialised but no training
            files were found.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Get all the training files - modify these paths to point to your GloVe embeddings
    files_part1 = sorted(glob.glob("/kaggle/input/glove-train-chunk-pt1/*.pt"))
    files_part2 = sorted(glob.glob("/kaggle/input/glove-train-chunk-pt2/*.pt"))
    glove_files = files_part1 + files_part2

    print(f"Found {len(glove_files)} training files")

    checkpoint_path = "glove_model_checkpoint.pt"
    start_file_idx = 0

    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}")
        # map_location keeps the load independent of the device the checkpoint
        # was written from; the model is moved to `device` below.
        # NOTE(review): the checkpoint is a fully pickled model object - only
        # load trusted files. Recent PyTorch releases may need
        # weights_only=False for this kind of checkpoint - confirm for the
        # installed version.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint['model']
        start_file_idx = checkpoint['file_idx'] + 1
        print(f"Continuing from file index {start_file_idx}")
    else:
        if not glove_files:
            # Fail with a clear message instead of an IndexError below.
            raise FileNotFoundError(
                "No GloVe training files (*.pt) found; cannot infer embedding_dim"
            )

        # Load first file just to get embedding_dim
        sample_data = torch.load(glove_files[0], map_location="cpu")
        embedding_dim = sample_data.shape[-1]  # Should be 100 for GloVe
        del sample_data  # Free memory

        model = WL_AttenSumm(embedding_dim=embedding_dim)
        print(f"Initialized new model with embedding_dim={embedding_dim}")

    model = model.to(device)

    # Train on each remaining file sequentially; `i` is the absolute file index.
    for i, file_path in enumerate(glove_files[start_file_idx:], start=start_file_idx):
        print(f"\nProcessing file {i+1}/{len(glove_files)}: {os.path.basename(file_path)}")

        model = train_on_file(model, file_path, device, epochs=20)

        # Save checkpoint after each file
        checkpoint = {
            'model': model.cpu(),  # Save model to CPU to avoid CUDA memory issues
            'file_idx': i
        }
        torch.save(checkpoint, checkpoint_path)

        # model.cpu() moved the module itself, so move it back for the next file.
        model = model.to(device)

        print(f"Saved checkpoint after file {i+1}/{len(glove_files)}")

    torch.save(model.cpu(), "glove_final_model.pt")
    print("Training completed on all files. Final model saved.")

if __name__ == "__main__":
    main()

#### Testing

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import re
from rouge_score import rouge_scorer

# ===== Step 1: Load GloVe Embeddings =====
print("Loading GloVe embeddings...")

# `glove_embeddings` (word -> vector mapping) must already exist in the
# notebook namespace; it is not created in this cell.
glove_vocab = list(glove_embeddings.keys())
# Stack the per-word vectors into a single ndarray first: handing torch.tensor
# a Python list of vectors converts element by element and is very slow.
glove_vectors = torch.tensor(
    np.asarray([glove_embeddings[word] for word in glove_vocab]),
    dtype=torch.float16,
)
# Row i of glove_vectors is the embedding of glove_vocab[i].
word2idx = {word: idx for idx, word in enumerate(glove_vocab)}

# ===== Step 2: Text Processing Functions =====
def preprocess_text(text):
    """Normalise ``text`` to lowercase ASCII letters separated by single spaces.

    Every character that is neither an ASCII letter nor whitespace is replaced
    by a space, whitespace runs are collapsed, and the result is lowercased
    and stripped. Non-string input yields an empty string.
    """
    if isinstance(text, str):
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)
        collapsed = re.sub(r'\s+', ' ', letters_only)
        return collapsed.lower().strip()
    return ""

def split_into_sentences(text):
    """Split ``text`` after ``.``, ``!`` or ``?`` when followed by whitespace.

    Returns the stripped, non-empty pieces in order; non-string input yields
    an empty list.
    """
    if not isinstance(text, str):
        return []

    stripped_pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in stripped_pieces if piece]

# ===== Step 3: Embedding Generation Functions =====
# Fixed sizes of the padded embedding tensors built by the helpers below.
MAX_SENTENCES = 50  # default sentence cap in article_to_vectors
MAX_WORDS = 50  # default word cap in sentence_to_vector
EMBEDDING_DIM = glove_vectors.shape[1]  # Use GloVe dimension (should be 100)

def get_embedding(word, word2idx, embedding_vectors):
    """Return the embedding row for ``word``, or a zero vector if it is unknown.

    Args:
        word: Token to look up.
        word2idx: Mapping from token to row index in ``embedding_vectors``.
        embedding_vectors: 2-D tensor with one embedding per row.

    Returns:
        ``embedding_vectors[idx]`` for a known word; otherwise a zero tensor
        with the same dtype, device and row shape as ``embedding_vectors``.
    """
    idx = word2idx.get(word)
    if idx is not None:
        return embedding_vectors[idx]

    # NOTE: an earlier fallback searched a FAISS index with an all-zero query
    # vector. That query does not depend on `word`, so every out-of-vocabulary
    # token received the same arbitrary GloVe vector (and any failure was
    # silently swallowed by a bare `except`). A plain zero vector is used
    # instead, sized from the table itself rather than a module-level constant.
    return embedding_vectors.new_zeros(embedding_vectors.shape[1:])

def sentence_to_vector(sentence, word2idx, embedding_vectors, max_words=MAX_WORDS):
    """Embed one sentence as a zero-padded ``(max_words, EMBEDDING_DIM)`` float16 matrix.

    The sentence is cleaned with ``preprocess_text`` and split on whitespace;
    only the first ``max_words`` tokens are embedded, and the remaining rows
    stay zero.
    """
    tokens = preprocess_text(sentence).split()
    matrix = torch.zeros((max_words, EMBEDDING_DIM), dtype=torch.float16)

    for row, token in enumerate(tokens[:max_words]):
        matrix[row] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, max_sentences=MAX_SENTENCES):
    """Embed the first ``max_sentences`` sentences and zero-pad to that count.

    Returns the stacked per-sentence matrices produced by
    ``sentence_to_vector``; missing sentences are filled with all-zero
    ``(MAX_WORDS, EMBEDDING_DIM)`` float16 matrices.
    """
    rows = []
    for sent in sentences[:max_sentences]:
        rows.append(sentence_to_vector(sent, word2idx, embedding_vectors))

    missing = max_sentences - len(rows)
    if missing > 0:
        blank = torch.zeros((MAX_WORDS, EMBEDDING_DIM), dtype=torch.float16)
        return torch.stack(rows + [blank] * missing)

    return torch.stack(rows[:max_sentences])

# ===== Step 4: Process Test Dataset =====
def process_test_dataset(df_path, word2idx, embedding_vectors, batch_size=100):
    """Read the test CSV and embed every article.

    Args:
        df_path: CSV path; the ``article`` and ``highlights`` columns are read.
        word2idx: Token -> row index mapping, forwarded to ``article_to_vectors``.
        embedding_vectors: Embedding table, forwarded to ``article_to_vectors``.
        batch_size: Number of rows embedded and stacked per step.

    Returns:
        ``(test_embeddings, original_articles, highlights)``: the concatenated
        embedding tensor plus the raw article and highlight values in row order.
    """
    print(f"Loading test dataset from {df_path}...")
    df = pd.read_csv(df_path)

    total_rows = len(df)
    print(f"Processing {total_rows} test articles...")
    embedded_batches = []
    original_articles = []
    highlights = []

    # Work through the frame in slices so only one slice is stacked at a time.
    for lo in tqdm(range(0, total_rows, batch_size), desc="Processing test batches"):
        chunk = df.iloc[lo:min(lo + batch_size, total_rows)]

        chunk_vectors = [
            article_to_vectors(split_into_sentences(text), word2idx, embedding_vectors)
            for text in chunk["article"]
        ]

        # Keep the raw text alongside the embeddings for later scoring.
        original_articles += chunk["article"].tolist()
        highlights += chunk["highlights"].tolist()

        if chunk_vectors:
            embedded_batches.append(torch.stack(chunk_vectors))

    test_embeddings = torch.cat(embedded_batches)
    print(f"Generated embeddings shape: {test_embeddings.shape}")

    return test_embeddings, original_articles, highlights

# ===== Step 5: TextSummaryDataset Class =====
class TextSummaryDataset(Dataset):
    """Dataset yielding one pooled float32 tensor per stored item.

    Each stored item is averaged over its second axis (``dim=1`` of the item).
    With the padded article tensors built in this notebook that axis is the
    word axis, giving one vector per sentence.
    """

    def __init__(self, embeddings):
        self.embeddings = embeddings.cpu()  # Ensure embeddings are on CPU

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        x = self.embeddings[idx]
        # Up-cast before pooling: averaging in float16 rounds the result to
        # half precision, and re-wrapping a tensor with torch.tensor(...) is a
        # discouraged copy-construct. `.to(float32)` + mean avoids both.
        return x.to(torch.float32).mean(dim=1)

# ===== Step 6: Summary Extraction and Evaluation =====
def extract_top_sentences(scores, articles, top_k=3):
    """Build an extractive summary for each article from per-sentence scores.

    Args:
        scores: 2-D array; row ``i`` holds the sentence scores of ``articles[i]``.
        articles: Raw article strings, split with ``split_into_sentences``.
        top_k: Maximum number of sentences kept per summary.

    Returns:
        One summary string per article: the ``top_k`` highest-scoring
        sentences joined by spaces in their original order, or ``""`` when the
        article has no sentences.
    """
    summaries = []

    for row, article in enumerate(articles):
        sentences = split_into_sentences(article)
        if not sentences:
            summaries.append("")
            continue

        # Only as many scores as there are real sentences are considered.
        usable = min(len(sentences), scores.shape[1])
        row_scores = scores[row][:usable]

        if len(row_scores) > top_k:
            chosen = np.argsort(-row_scores)[:top_k]
        else:
            chosen = np.arange(len(row_scores))

        # Sorting the indices restores document order.
        picked = [sentences[j] for j in sorted(chosen) if j < len(sentences)]
        summaries.append(" ".join(picked))

    return summaries

def compute_rouge(predicted_summaries, gold_summaries):
    """Average ROUGE-1/2/L F-measures over paired predictions and references.

    Non-string or blank entries on either side are replaced by the
    placeholder text ``"empty summary"`` before scoring.

    Returns:
        Dict mapping ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` to the mean
        F-measure.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    collected = {name: [] for name in metric_names}

    for pred, gold in zip(predicted_summaries, gold_summaries):
        if not (isinstance(pred, str) and pred.strip()):
            pred = "empty summary"
        if not (isinstance(gold, str) and gold.strip()):
            gold = "empty summary"

        # The scorer is called as score(reference, prediction).
        result = scorer.score(gold, pred)
        for name, values in collected.items():
            values.append(result[name].fmeasure)

    return {name: np.mean(values) for name, values in collected.items()}

# ===== Step 7: Main Evaluation Function =====
def evaluate_model(model_path, test_data_path, word2idx, embedding_vectors):
    """Score a saved model on the test CSV and report ROUGE.

    Steps: load the pickled model, embed the test articles, score every
    sentence, keep the top three sentences per article, and compute ROUGE
    against the reference highlights.

    Side effects: writes ``glove_test_embeddings.pt`` and
    ``glove_test_predictions.csv`` to the current directory.

    Returns:
        Dict of averaged ROUGE scores as produced by ``compute_rouge``.
    """
    print("Starting evaluation pipeline...")

    print(f"Loading model from {model_path}...")
    model = torch.load(model_path)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    model = model.to(device)

    print("Generating test embeddings...")
    test_embeddings, original_articles, highlights = process_test_dataset(
        test_data_path, word2idx, embedding_vectors
    )

    # Cache the embeddings so they need not be regenerated.
    torch.save(test_embeddings, "glove_test_embeddings.pt")
    print("Saved test embeddings to glove_test_embeddings.pt")

    loader = DataLoader(
        TextSummaryDataset(test_embeddings),
        batch_size=32,
        shuffle=False,  # keep row order aligned with original_articles
        num_workers=0,
    )

    model.eval()

    print("Generating sentence scores...")
    with torch.no_grad():
        batch_scores = [
            model(batch.to(device)).cpu().numpy()
            for batch in tqdm(loader, desc="Scoring sentences")
        ]

    sentence_scores = np.concatenate(batch_scores, axis=0)

    print("Extracting top sentences for summaries...")
    predicted_summaries = extract_top_sentences(
        sentence_scores, original_articles, top_k=3
    )

    print("Computing ROUGE scores...")
    rouge_scores = compute_rouge(predicted_summaries, highlights)

    print("\nROUGE Scores:")
    for metric, value in rouge_scores.items():
        print(f"{metric}: {value:.4f}")

    predictions = {
        'article': original_articles,
        'highlights': highlights,
        'predicted_summary': predicted_summaries,
    }
    results_df = pd.DataFrame(predictions)
    results_df.to_csv("glove_test_predictions.csv", index=False)
    print("Saved predictions to glove_test_predictions.csv")

    return rouge_scores

# ===== Step 8: Run Evaluation =====
if __name__ == "__main__":
    # Build a flat L2 FAISS index over all GloVe vectors, but only when the
    # `faiss` module is already present in this namespace and no `glove_index`
    # has been created yet. This defines the globals `glove_words`,
    # `glove_vecs` and `glove_index`.
    if 'glove_index' not in globals() and 'faiss' in globals():
        print("Creating FAISS index for GloVe embeddings...")
        # Convert glove embeddings to numpy array
        glove_words = list(glove_embeddings.keys())
        glove_vecs = np.array([glove_embeddings[word] for word in tqdm(glove_words, desc="Indexing GloVe")], dtype=np.float32)

        # Create index
        glove_index = faiss.IndexFlatL2(EMBEDDING_DIM)
        glove_index.add(glove_vecs)
        print("FAISS index created for GloVe embeddings")

    # Move the GloVe table to the GPU when one is available (stays on CPU otherwise).
    # This rebinds the global `glove_vectors`.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    glove_vectors = glove_vectors.to(device)

    # Run evaluation using GloVe
    evaluate_model(
        model_path="/kaggle/input/glove-model-final/glove_final_model.pt",  # Use a model trained with GloVe
        test_data_path="/kaggle/input/sampled-20k/test_2000.csv",
        word2idx=word2idx,
        embedding_vectors=glove_vectors
    )

## Visualisation

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Data: ROUGE scores per (dataset size, embedding) pair.
rouge_cols = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
data = {
    "Dataset": ["1k", "1k", "1k", "20k", "20k", "20k", "200k", "200k", "200k"],
    "Embedding": ["GloVe", "FastText", "Word2Vec"] * 3,
    "ROUGE-1": [0.1911, 0.2151, 0.2307, 0.3724, 0.3208, 0.3567, 42.9, 42.3, 41.8],
    "ROUGE-2": [0.0662, 0.0730, 0.0828, 0.1541, 0.1148, 0.1388, 19.7, 19.2, 18.9],
    "ROUGE-L": [0.1478, 0.1576, 0.1692, 0.2335, 0.1994, 0.2212, 39.3, 38.9, 38.5]
}

df = pd.DataFrame(data)

# The 200k rows are on a 0-100 scale; optionally rescale them to 0-1 so all
# rows share one axis.
normalize = True
if normalize:
    is_200k = df['Dataset'] == '200k'
    df.loc[is_200k, rouge_cols] = df.loc[is_200k, rouge_cols] / 100

# ---- 1. Bar Plots: one figure per metric ----
for rouge_type in rouge_cols:
    plt.figure(figsize=(8, 5))
    sns.barplot(data=df, x='Dataset', y=rouge_type, hue='Embedding')
    plt.title(f'{rouge_type} by Embedding Type and Dataset Size')
    plt.ylabel(rouge_type)
    plt.xlabel("Dataset Size")
    plt.ylim(0, 0.5)
    plt.legend(title='Embedding')
    plt.tight_layout()
    plt.show()

# ---- 2. Line Plot: Score vs Dataset size (log x-axis) ----
dataset_map = {'1k': 1_000, '20k': 20_000, '200k': 200_000}
df['Dataset Size'] = df['Dataset'].map(dataset_map)

for rouge_type in rouge_cols:
    plt.figure(figsize=(8, 5))
    # sort=False keeps the embeddings in order of first appearance.
    for embedding, sub_df in df.groupby('Embedding', sort=False):
        plt.plot(sub_df['Dataset Size'], sub_df[rouge_type], marker='o', label=embedding)
    plt.title(f'{rouge_type} vs Dataset Size')
    plt.xlabel('Dataset Size')
    plt.ylabel(rouge_type)
    plt.xscale('log')
    plt.ylim(0, 0.5)
    plt.legend()
    plt.grid(True, which="both", ls="--")
    plt.tight_layout()
    plt.show()

# ---- 3. Heatmaps: Embedding x Dataset, one per metric ----
for rouge_type in rouge_cols:
    heatmap_df = df.pivot(index="Embedding", columns="Dataset", values=rouge_type)
    plt.figure(figsize=(6, 4))
    sns.heatmap(heatmap_df, annot=True, cmap='YlGnBu')
    plt.title(f"Heatmap: {rouge_type}")
    plt.show()

# ---- 4. Radar Plot ----
# def radar_plot(df, dataset_label):
#     labels = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L']
#     angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False).tolist()
#     angles += angles[:1]

#     fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
    
#     for embedding in df['Embedding'].unique():
#         values = df[(df['Dataset'] == dataset_label) & (df['Embedding'] == embedding)][labels].values.flatten().tolist()
#         values += values[:1]
#         ax.plot(angles, values, label=embedding)
#         ax.fill(angles, values, alpha=0.1)
    
#     ax.set_title(f'Radar Plot for {dataset_label} Dataset')
#     ax.set_xticks(angles[:-1])
#     ax.set_xticklabels(labels)
#     ax.set_yticklabels([])
#     ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
#     plt.tight_layout()
#     plt.show()

# for dataset in df['Dataset'].unique():
#     radar_plot(df, dataset)


In [None]:

output_dir = "/kaggle/working/"
for f in os.listdir(output_dir):
    path = os.path.join(output_dir, f)
    # os.remove() raises on directories, so only regular files and symlinks
    # are deleted; sub-directories are left in place.
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)
print("🧹 Cleared files from working directory.")

## 30k dataset

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import gensim
import swifter
import faiss
import pickle
from tqdm.auto import tqdm
from nltk.tokenize import sent_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from concurrent.futures import ProcessPoolExecutor

# Enable tqdm for Pandas (called once, right after the imports).
# NOTE(review): the tokenisation step in this section goes through `swifter`,
# not a tqdm-specific pandas method - confirm this call is still needed.
tqdm.pandas()

In [None]:
import pandas as pd

# Skip file lines 1..99,999 (line 0, the header, is kept), i.e. drop the first
# 99,999 data rows and load everything after them. How many rows remain
# depends on the size of train.csv - this is not a fixed "last 50,000" rows.
df = pd.read_csv('/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/train.csv', skiprows=range(1, 100000))  # adjust based on dataset size

# Sample 10,000 random rows from the loaded subset (seeded for reproducibility)
sampled_df = df.sample(n=10000, random_state=42)

# Save to a new CSV
sampled_df.to_csv('/kaggle/working/train_sampled_10k.csv', index=False)


In [None]:
# === Step 1: Load Dataset ===
print("📥 Loading dataset...")
# Reads the sampled CSV from the read-only input dataset path, not the copy
# written to /kaggle/working/ by the sampling cell.
train_df = pd.read_csv('/kaggle/input/sampled-10k/train_sampled_10k.csv')


In [None]:
# === Step 2: Fast Sentence Tokenization Using Swifter + Progress Bar ===
print("✂️ Fast tokenizing sentences with Swifter & progress tracking...")
# Each article is cast to str and passed to sent_tokenize; the result is
# stored in a new "sentences" column.
train_df["sentences"] = train_df["article"].astype(str).swifter.apply(sent_tokenize)


In [None]:


# Save tokenized sentences
# NOTE(review): the "sentences" column holds Python lists, and CSV stores them
# as text - readers of this file presumably get strings back and must parse
# them before iterating over sentences. Confirm downstream.
train_df.to_csv("/kaggle/working/train_tokenized_10k.csv", index=False)
print("✅ Tokenized sentences saved!")



In [None]:
def split_and_save(df, prefix, chunk_size=1000):
    """Write ``df`` to consecutive CSV files of at most ``chunk_size`` rows.

    Files are named ``{prefix}_chunk_{k}.csv`` with ``k`` starting at 1 and
    are written without the index column; a line is printed for every file.
    """
    n_rows = len(df)
    full_chunks, remainder = divmod(n_rows, chunk_size)
    n_chunks = full_chunks + (1 if remainder else 0)

    for number in range(1, n_chunks + 1):
        lo = (number - 1) * chunk_size
        hi = min(number * chunk_size, n_rows)
        out_path = f"{prefix}_chunk_{number}.csv"
        df.iloc[lo:hi].to_csv(out_path, index=False)
        print(f"✅ Saved {out_path}")

# Split & save the training set (only train_df is split in this cell);
# the files are written as train_chunk_<k>.csv in the current directory.
split_and_save(train_df, "train")

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm

# === Step 1: Check & Set Device ===
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Using device: {device}")

# `word2vec`, `glove_embeddings` and `fasttext` must already exist in the
# notebook namespace; they are not created in this cell.

# === Step 2: Move Word2Vec Model to the selected device ===
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float16, device=device)
word2vec_vocab = word2vec.index_to_key
word2idx = {word: idx for idx, word in enumerate(word2vec_vocab)}

# === Step 3: Move GloVe & FastText to the selected device ===
glove_vocab = list(glove_embeddings.keys())
# Stack into one ndarray first: building a tensor straight from a Python list
# of per-word vectors converts element by element and is very slow.
glove_vectors = torch.tensor(
    np.asarray([glove_embeddings[word] for word in glove_vocab]),
    dtype=torch.float16,
    device=device,
)
glove_word2idx = {word: idx for idx, word in enumerate(glove_vocab)}

fasttext_vocab = fasttext.index_to_key
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16, device=device)
fasttext_word2idx = {word: idx for idx, word in enumerate(fasttext_vocab)}

# === Step 4: Optimized Word Embedding Lookup on GPU ===
def get_embedding(word, word2idx, embedding_vectors):
    """Look up ``word`` in the embedding table.

    Returns the matching row of ``embedding_vectors``, or a zero tensor shaped
    like its first row when ``word`` is not in ``word2idx``.
    """
    row = word2idx.get(word)
    if row is None:
        return torch.zeros_like(embedding_vectors[0])
    return embedding_vectors[row]

# === Step 5: Convert Sentences to Word Embeddings Using Mini-Batches ===
# Fixed sizes of the padded tensors built below, plus the vector width
# expected for each embedding family.
MAX_SENTENCES = 50  # default sentence cap per article
MAX_WORDS = 50  # default word cap per sentence
EMBEDDING_DIM_W2V = 300
EMBEDDING_DIM_GLOVE = 100
EMBEDDING_DIM_FASTTEXT = 300
BATCH_SIZE = 500  # 🚀 Process in batches to avoid OOM

def sentence_to_vector(sentence, word2idx, embedding_vectors, embedding_dim, max_words=MAX_WORDS):
    """Embed one sentence as a zero-padded ``(max_words, embedding_dim)`` float16 matrix.

    The sentence is split on whitespace as-is; only the first ``max_words``
    tokens are embedded. The matrix is allocated on the module-level ``device``.
    """
    tokens = sentence.split()
    matrix = torch.zeros((max_words, embedding_dim), dtype=torch.float16, device=device)

    for row, token in enumerate(tokens[:max_words]):
        matrix[row] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, embedding_dim, max_sentences=MAX_SENTENCES):
    """Embed at most ``max_sentences`` sentences and zero-pad to that count.

    Args:
        sentences: Iterable of sentence strings for one article.
        word2idx: Token -> row index mapping, forwarded to ``sentence_to_vector``.
        embedding_vectors: Embedding table, forwarded to ``sentence_to_vector``.
        embedding_dim: Width of one embedding vector.
        max_sentences: Number of sentence slots in the result.

    Returns:
        The stacked per-sentence matrices; missing sentences are filled with
        all-zero ``(MAX_WORDS, embedding_dim)`` float16 matrices on ``device``.
    """
    # Stop as soon as the cap is reached: sentences beyond `max_sentences`
    # were previously embedded and then thrown away.
    sentence_vectors = []
    for sent in sentences:
        if len(sentence_vectors) >= max_sentences:
            break
        sentence_vectors.append(
            sentence_to_vector(sent, word2idx, embedding_vectors, embedding_dim)
        )

    missing = max_sentences - len(sentence_vectors)
    if missing > 0:
        # The same zero matrix is repeated; torch.stack copies it per slot.
        blank = torch.zeros((MAX_WORDS, embedding_dim), dtype=torch.float16, device=device)
        sentence_vectors = sentence_vectors + [blank] * missing

    return torch.stack(sentence_vectors)

# === Step 6: Process in Mini-Batches ===
def _parse_sentence_cell(cell):
    """Return the sentence list held in one ``sentences`` cell.

    A cell read back from CSV is the text form of a list
    (``"['First.', 'Second.']"``); iterating that string directly would yield
    single characters instead of sentences, so it is parsed here. Non-string
    cells are returned unchanged.
    """
    import ast  # local import: only needed for CSV-loaded frames

    if not isinstance(cell, str):
        return cell
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError) as err:
        raise ValueError(
            "'sentences' cell is a string that is not a list literal"
        ) from err
    if not isinstance(parsed, (list, tuple)):
        raise ValueError("'sentences' cell did not parse to a list of sentences")
    return list(parsed)


def process_in_batches(df, word2idx, embedding_vectors, embedding_dim, batch_size=BATCH_SIZE):
    """Embed every article of ``df`` in slices of ``batch_size`` rows.

    Args:
        df: Frame with a ``sentences`` column holding, per row, either a list
            of sentence strings or the text form of such a list (as produced
            by a CSV round trip).
        word2idx: Token -> row index mapping for ``embedding_vectors``.
        embedding_vectors: Embedding table.
        embedding_dim: Width of one embedding vector.
        batch_size: Number of rows embedded before results are moved to CPU.

    Returns:
        One CPU tensor concatenating the embeddings of all rows.

    Raises:
        ValueError: If a string cell cannot be parsed into a list of sentences.
    """
    total_samples = len(df)
    all_vectors = []

    for start in tqdm(range(0, total_samples, batch_size), desc="🚀 Processing batches on GPU"):
        end = min(start + batch_size, total_samples)
        batch_df = df.iloc[start:end]

        batch_vectors = [
            article_to_vectors(
                _parse_sentence_cell(sentences), word2idx, embedding_vectors, embedding_dim
            )
            for sentences in batch_df["sentences"]
        ]

        # Move each finished batch to CPU so GPU memory is released between batches.
        batch_vectors = torch.stack(batch_vectors).cpu()
        all_vectors.append(batch_vectors)

        torch.cuda.empty_cache()

    return torch.cat(all_vectors)


# === Step 7: Embed every CSV chunk in the input directory and save it as .pt ===
input_dir = "/kaggle/input/train-chunk-pt2"
output_dir = "/kaggle/working"

# Base names (without extension) to skip explicitly; empty by default.
skip_files = {}

# Base names that already have a .pt file in the output directory.
processed_files = set()
for existing in os.listdir(output_dir):
    if existing.endswith(".pt"):
        processed_files.add(existing.replace('.pt', ''))

for file_name in os.listdir(input_dir):
    if not file_name.endswith(".csv"):
        continue

    base_name = file_name.replace('.csv', '')

    # Skip if already processed OR explicitly listed in skip_files
    already_done = base_name in processed_files
    if already_done or base_name in skip_files:
        print(f"⏩ Skipping: {file_name}")
        continue

    df = pd.read_csv(os.path.join(input_dir, file_name))

    print(f"📂 Processing {file_name} ...")
    processed_data = process_in_batches(df, word2idx, word2vec_vectors, EMBEDDING_DIM_W2V)

    # Save output next to the other chunk tensors
    output_file = os.path.join(output_dir, f"{base_name}.pt")
    torch.save(processed_data, output_file)
    print(f"✅ Saved {output_file}")

print(f"🎉 Remaining files processed and saved in {output_dir}!")


In [None]:


# === Step 0: Remove Existing .pt Files from Output Directory ===
# Relies on `output_dir` set by an earlier cell.
for f in os.listdir(output_dir):
    if not f.endswith(".pt"):
        continue
    os.remove(os.path.join(output_dir, f))
print("🧹 Cleared previous .pt files from working directory.")



## Hierarchical Attention Mechanism

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class WordEncoder(nn.Module):
    """Encode the words of each sentence into one sentence vector.

    Pipeline: parallel 1-D convolutions over the word axis, global max-pooling
    of each convolution, a bidirectional GRU fed with the pooled feature vector
    repeated at every word position, then attention-weighted pooling of the
    GRU outputs.

    Args:
        embedding_dim: Width of one word embedding.
        hidden_dim: GRU hidden size per direction.
        num_filters: Output channels of each convolution.
        filter_sizes: Kernel widths, one convolution per entry.
    """

    # The default is a tuple so no mutable object is shared between instances.
    def __init__(self, embedding_dim, hidden_dim, num_filters=100, filter_sizes=(1, 2, 3, 4, 5, 6, 7)):
        super().__init__()
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, fs, padding=fs//2)
            for fs in filter_sizes
        ])
        self.gru = nn.GRU(num_filters * len(filter_sizes), hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, word_embeddings):
        """Map ``(batch, sentences, words, embedding_dim)`` to ``(batch, sentences, 2*hidden_dim)``."""
        B, S, W, E = word_embeddings.size()
        # reshape (not view) so non-contiguous inputs, e.g. transposed tensors,
        # are accepted as well.
        x = word_embeddings.reshape(B * S, W, E).permute(0, 2, 1)  # (B*S, E, W)
        convs = [F.relu(conv(x)) for conv in self.convs]
        # Global max over the word axis: one value per filter.
        pools = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in convs]  # (B*S, num_filters)
        # The pooled feature vector is repeated at every word position.
        x_cnn = torch.cat(pools, dim=1).unsqueeze(1).expand(-1, W, -1)  # (B*S, W, CNN_DIM)
        x_gru, _ = self.gru(x_cnn)
        attn_weights = torch.softmax(self.attention(x_gru).squeeze(-1), dim=1)
        sent_vec = torch.sum(attn_weights.unsqueeze(-1) * x_gru, dim=1)  # (B*S, hidden_dim*2)
        return sent_vec.view(B, S, -1)  # (B, S, hidden_dim*2)

class SentenceEncoder(nn.Module):
    """Score every sentence of a document with a value in (0, 1).

    Args:
        input_dim: Width of one incoming sentence vector.
        hidden_dim: GRU hidden size per direction.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        # `attention` does not contribute to the returned scores; the layer is
        # kept so the module's parameters / state_dict keys stay unchanged for
        # already-saved models.
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, sentence_vecs):
        """Map ``(batch, sentences, input_dim)`` to per-sentence scores ``(batch, sentences)``."""
        x, _ = self.gru(sentence_vecs)  # (B, S, 2*H)
        # The attention-pooled document vector computed here previously was
        # never used, so that dead computation has been removed.
        scores = torch.sigmoid(self.fc(x)).squeeze(-1)  # (B, S)
        return scores

class HierarchicalAttentionSummarizer(nn.Module):
    """Two-level sentence scorer: a word-level encoder feeding a sentence-level encoder.

    Args:
        embedding_dim: Width of one word embedding.
        word_hidden_dim: Hidden size passed to ``WordEncoder``.
        sent_hidden_dim: Hidden size passed to ``SentenceEncoder``.
    """

    def __init__(self, embedding_dim, word_hidden_dim=256, sent_hidden_dim=256):
        super().__init__()
        self.word_encoder = WordEncoder(embedding_dim, word_hidden_dim)
        # The sentence encoder is sized for inputs twice the word hidden size.
        sentence_vector_dim = word_hidden_dim * 2
        self.sent_encoder = SentenceEncoder(sentence_vector_dim, sent_hidden_dim)

    def forward(self, x):
        """Return the sentence encoder's output for the word-encoded input ``x``."""
        # x: (batch_size, max_sentences, max_words, embedding_dim)
        return self.sent_encoder(self.word_encoder(x))

### Word2vec

#### Training

In [None]:
import os
import glob
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

class TextSummaryDataset(Dataset):
    """Thin ``Dataset`` wrapper over a tensor of document embeddings."""

    def __init__(self, embeddings, dtype=torch.float32):
        """Store ``embeddings`` after casting the whole tensor to ``dtype``.

        Args:
            embeddings: Tensor whose first axis indexes documents.
            dtype: Target dtype of the stored tensor.
        """
        self.embeddings = embeddings.to(dtype=dtype)

    def __len__(self):
        """Number of documents (length of the first axis)."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return the embeddings of document ``idx``."""
        document = self.embeddings[idx]
        return document

def train_on_file(model, file_path, device, dtype=torch.float32, epochs=20):
    """Train ``model`` in place on one file of saved embeddings.

    Args:
        model: Module invoked as ``model(inputs)`` on every batch; its
            parameters are optimised by a fresh Adam optimizer created here.
        file_path: Path of a ``torch.save``-d embeddings tensor.
        device: Device each batch is moved to before the forward pass.
        dtype: Dtype the embeddings are cast to before training.
        epochs: Number of passes over the file.

    Returns:
        The same ``model`` object that was passed in.
    """
    print(f"\nTraining on file: {os.path.basename(file_path)}")

    # Load to CPU first; individual batches are moved to `device` in the loop.
    embeddings = torch.load(file_path, map_location="cpu")

    print(f"Original embeddings dtype: {embeddings.dtype}")

    dataset = TextSummaryDataset(embeddings, dtype=dtype)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)
    # Guard the per-epoch average against an empty file (zero batches).
    num_batches = max(len(dataloader), 1)

    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.99, 0.999))

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        # Count batches explicitly: the bar's own counter is only refreshed
        # at display intervals, so it is not a reliable divisor.
        for batch_count, inputs in enumerate(progress, start=1):
            inputs = inputs.to(dtype).to(device)

            optimizer.zero_grad()

            sentence_scores = model(inputs)

            # Mean squared deviation of each sentence score from its
            # document's mean score (dim=1), i.e. the per-document variance.
            # NOTE(review): minimising this pushes the scores of a document
            # toward the same value rather than "encouraging diversity" -
            # confirm the intended objective / sign.
            loss = torch.mean((sentence_scores - sentence_scores.mean(dim=1, keepdim=True)) ** 2)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            progress.set_postfix(loss=running_loss / batch_count)

        print(f"File: {os.path.basename(file_path)} - Epoch {epoch+1}, Loss: {running_loss/num_batches:.4f}")

    # Drop the large per-file objects before the next file is loaded.
    del embeddings, dataset, dataloader
    torch.cuda.empty_cache()

    return model

def main():
    """Train a ``HierarchicalAttentionSummarizer`` over all Word2Vec chunks, resumably.

    The chunk files are visited in sorted order. After every file a checkpoint
    (``model_checkpoint.pt``) holding the model and the index of the file just
    finished is written; if that checkpoint exists at start-up, training
    resumes with the following file. The final model is written to
    ``final_model.pt``.

    Raises:
        FileNotFoundError: If a new model must be initialised but no training
            files were found.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # One dtype for data and model throughout the pipeline.
    dtype = torch.float32  # Use float32 instead of float16

    files_part1 = sorted(glob.glob("/kaggle/input/word2vec-train-10-chunks/*.pt"))
    files_part2 = sorted(glob.glob("/kaggle/input/word2vec-train-chunks-10-pt2/*.pt"))
    all_files = files_part1 + files_part2

    print(f"Found {len(all_files)} training files")

    checkpoint_path = "model_checkpoint.pt"
    start_file_idx = 0

    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}")
        # map_location keeps the load independent of the device the checkpoint
        # was written from; the model is moved to `device` below.
        # NOTE(review): the checkpoint is a fully pickled model object - only
        # load trusted files. Recent PyTorch releases may need
        # weights_only=False for this kind of checkpoint - confirm for the
        # installed version.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint['model']
        start_file_idx = checkpoint['file_idx'] + 1
        print(f"Continuing from file index {start_file_idx}")

        # Cast the whole module, as the fresh-model branch does.
        model = model.to(dtype)
    else:
        if not all_files:
            # Fail with a clear message instead of an IndexError below.
            raise FileNotFoundError(
                "No Word2Vec training files (*.pt) found; cannot infer embedding_dim"
            )

        # Load first file just to get embedding_dim
        sample_data = torch.load(all_files[0], map_location="cpu")
        embedding_dim = sample_data.shape[-1]  # Should be 300 for Word2Vec
        print(f"Sample data dtype: {sample_data.dtype}")
        del sample_data  # Free memory

        model = HierarchicalAttentionSummarizer(
            embedding_dim=embedding_dim,
            word_hidden_dim=256,
            sent_hidden_dim=256
        )

        model = model.to(dtype)
        print(f"Initialized new hierarchical attention model with embedding_dim={embedding_dim}")

        # Debug: print model parameter dtypes
        for name, param in model.named_parameters():
            print(f"Parameter {name} dtype: {param.dtype}")

    model = model.to(device)

    # Train on each remaining file sequentially; `i` is the absolute file index.
    for i, file_path in enumerate(all_files[start_file_idx:], start=start_file_idx):
        print(f"\nProcessing file {i+1}/{len(all_files)}: {os.path.basename(file_path)}")

        model = train_on_file(model, file_path, device, dtype=dtype, epochs=20)

        # Save checkpoint after each file
        checkpoint = {
            'model': model.cpu(),  # Save model to CPU to avoid CUDA memory issues
            'file_idx': i
        }
        torch.save(checkpoint, checkpoint_path)

        # model.cpu() moved the module itself, so move it back for the next file.
        model = model.to(device)

        print(f"Saved checkpoint after file {i+1}/{len(all_files)}")

    torch.save(model.cpu(), "final_model.pt")
    print("Training completed on all files. Final model saved.")

# Additional utility function to use the trained model for inference
def summarize_document(model, document_embeddings, device, dtype=torch.float32, top_k=3):
    """
    Rank the sentences of one document and return the best-scoring ones.

    The model is put into eval mode and run once without gradient tracking.
    Its output is expected to be a single tensor of sentence scores whose
    leading batch dimension (size 1) is squeezed away before ranking.

    Args:
        model: Module that maps the embeddings to one score per sentence
            (presumably a trained HierarchicalAttentionSummarizer).
        document_embeddings: Tensor for a single document, documented by the
            caller contract as (1, max_sentences, max_words, embedding_dim)
        device: Device the embeddings are moved to before the forward pass
        dtype: Data type the embeddings are cast to before the forward pass
        top_k: Upper bound on the number of sentences to select

    Returns:
        NumPy array of sentence indices, ordered from highest to lowest score.
        It holds ``top_k`` entries, or fewer when there are fewer scores.
    """
    model.eval()
    with torch.no_grad():
        # Cast first, then transfer, exactly one forward pass
        prepared = document_embeddings.to(dtype).to(device)
        scores = model(prepared).squeeze(0)

        # Never ask topk for more entries than there are scores
        how_many = min(top_k, len(scores))
        best = torch.topk(scores, how_many)
        return best.indices.cpu().numpy()

# Start training as soon as this notebook cell is executed.
# NOTE(review): the call below is unconditional, so running the cell always trains;
# comment it out if you only want to load the definitions above.
main()

#### Testing

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import re
from rouge_score import rouge_scorer
import gc  # For garbage collection

# ===== Step 1: Load Word Embeddings =====
# `word2vec` is expected to exist already in the notebook session; it is not created in this cell.
print("Loading word embeddings...")

# Float32 tensor copy of the embedding matrix (no device is specified here)
word2vec_vectors = torch.tensor(word2vec.vectors, dtype=torch.float32)

# Vocabulary in row order, plus the reverse lookup from token to row number
word2vec_vocab = word2vec.index_to_key
word2idx = dict(zip(word2vec_vocab, range(len(word2vec_vocab))))

# ===== Step 2: Text Processing Functions =====
def preprocess_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Anything that is not a string yields an empty string.
    """
    if isinstance(text, str):
        # Every character that is neither an ASCII letter nor whitespace becomes a space
        letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text)

        # Any run of whitespace shrinks to one space
        collapsed = re.sub(r'\s+', ' ', letters_only)

        return collapsed.strip().lower()

    return ""

def split_into_sentences(text):
    """Break text into sentences at whitespace that follows '.', '!' or '?'.

    Returns a list of stripped, non-empty sentences; non-string input gives [].
    """
    if not isinstance(text, str):
        return []

    found = []
    # The lookbehind keeps the terminating punctuation attached to its sentence
    for piece in re.split(r'(?<=[.!?])\s+', text):
        trimmed = piece.strip()
        if trimmed:
            found.append(trimmed)
    return found

# ===== Step 3: Embedding Generation Functions =====
MAX_SENTENCES = 50  # Sentences kept per article; shorter articles are zero-padded up to this count
MAX_WORDS = 50  # Words kept per sentence; shorter sentences are zero-padded up to this count
EMBEDDING_DIM = 300  # Word2Vec dimension

def get_embedding(word, word2idx, embedding_vectors):
    """Look up the embedding of a word.

    Args:
        word: Token to look up
        word2idx: Mapping from token to row number in ``embedding_vectors``
        embedding_vectors: 2D tensor holding one embedding per row

    Returns:
        An independent copy of the word's row, or a zero vector of the same
        width and dtype when the word is not in ``word2idx``.
    """
    row = word2idx.get(word)
    if row is None:
        # Unknown word: all-zero vector matching the matrix width and dtype
        return torch.zeros(embedding_vectors.size(1), dtype=embedding_vectors.dtype)
    # clone() so the caller never holds a view into the shared matrix
    return embedding_vectors[row].clone()

def sentence_to_vector(sentence, word2idx, embedding_vectors, max_words=MAX_WORDS):
    """Turn one sentence into a fixed-size matrix of word embeddings.

    The sentence is cleaned with ``preprocess_text`` and split on whitespace.
    Only the first ``max_words`` tokens are embedded; remaining rows stay zero.

    Returns:
        Float32 tensor of shape (max_words, EMBEDDING_DIM).
    """
    tokens = preprocess_text(sentence).split()
    matrix = torch.zeros((max_words, EMBEDDING_DIM), dtype=torch.float32)

    # zip() stops at whichever runs out first: the rows or the tokens
    for row, token in zip(range(max_words), tokens):
        matrix[row] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, max_sentences=MAX_SENTENCES):
    """Embed the sentences of an article into one padded 3D tensor.

    At most ``max_sentences`` sentences are embedded with ``sentence_to_vector``.
    Shorter articles are filled up with all-zero sentence matrices.

    Returns:
        Tensor produced by stacking ``max_sentences`` sentence matrices.
    """
    matrices = []
    for sent in sentences[:max_sentences]:
        matrices.append(sentence_to_vector(sent, word2idx, embedding_vectors))

    shortfall = max_sentences - len(matrices)
    if shortfall > 0:
        # One shared zero matrix is enough; torch.stack copies it into each padding slot
        blank = torch.zeros((MAX_WORDS, EMBEDDING_DIM), dtype=torch.float32)
        matrices = matrices + [blank] * shortfall
    else:
        matrices = matrices[:max_sentences]

    return torch.stack(matrices)

# ===== Step 4: Process Test Dataset with Sequential Selection =====
def process_test_dataset(df_path, word2idx, embedding_vectors, batch_size=1, num_samples=None):
    """Load test articles from a CSV and convert them to padded embedding tensors.

    The CSV must provide an "article" and a "highlights" column. Articles are
    handled in groups of ``batch_size`` rows. A group that raises any error is
    reported and skipped as a whole, so the four returned collections always
    describe exactly the same articles in the same order.

    Args:
        df_path: Path of the CSV file to read
        word2idx: Mapping from word to row number in ``embedding_vectors``
        embedding_vectors: Embedding matrix passed on to ``article_to_vectors``
        batch_size: Number of CSV rows converted and stacked together
        num_samples: If truthy, only the first ``num_samples`` rows are read

    Returns:
        Tuple ``(all_vectors, original_articles, highlights, sentence_counts)``:
        a list with one stacked tensor per successful group, the article texts,
        the reference highlights, and for each article the number of sentences
        that were embedded (capped at MAX_SENTENCES).
    """
    print(f"Loading test dataset from {df_path}...")

    # Load only the first num_samples entries if specified
    if num_samples:
        print(f"Will process the first {num_samples} articles from the dataset")
        all_df = pd.read_csv(df_path, nrows=num_samples)
        print(f"Loaded {len(all_df)} articles")
    else:
        all_df = pd.read_csv(df_path)

    all_vectors = []
    original_articles = []
    highlights = []
    sentence_counts = []

    for start_idx in tqdm(range(0, len(all_df), batch_size), desc="Processing articles"):
        end_idx = min(start_idx + batch_size, len(all_df))
        batch_df = all_df.iloc[start_idx:end_idx]

        # Build everything for this group first; nothing is added to the
        # results until the whole group has been converted successfully.
        try:
            batch_articles = batch_df["article"].tolist()
            batch_highlights = batch_df["highlights"].tolist()

            batch_vectors = []
            batch_sentence_counts = []
            for article in batch_articles:
                sentences = split_into_sentences(article)
                batch_vectors.append(article_to_vectors(sentences, word2idx, embedding_vectors))
                batch_sentence_counts.append(min(len(sentences), MAX_SENTENCES))

            batch_tensor = torch.stack(batch_vectors) if batch_vectors else None
        except Exception as e:
            # Deliberate best effort: report the failure and move on to the next group
            print(f"Error processing batch: {e}")
            continue

        # Commit the group in one go so the four outputs can never drift apart
        original_articles.extend(batch_articles)
        highlights.extend(batch_highlights)
        sentence_counts.extend(batch_sentence_counts)
        if batch_tensor is not None:
            all_vectors.append(batch_tensor)

        # Release the per-group temporaries before the next group is built
        del batch_vectors
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    print(f"Total articles processed: {len(original_articles)}")

    return all_vectors, original_articles, highlights, sentence_counts

# ===== Step 5: TextSummaryDataset Class =====
class TextSummaryDataset(Dataset):
    """Map-style dataset that exposes a list of tensor batches as one flat sequence.

    Item ``i`` is row ``i`` of the batches as if they were concatenated along
    their first dimension, without actually concatenating them.
    """

    def __init__(self, embeddings_list):
        """Store the batches and precompute their offsets.

        Args:
            embeddings_list: List of tensor batches; the first dimension of
                each tensor counts the items it holds
        """
        self.embeddings_list = embeddings_list

        # cumulative_sizes[k] is the number of items stored before batch k;
        # the last entry is therefore the total number of items.
        self.cumulative_sizes = [0]
        for batch in embeddings_list:
            self.cumulative_sizes.append(self.cumulative_sizes[-1] + batch.size(0))
        self.total_len = self.cumulative_sizes[-1]

    def __len__(self):
        """Return the total number of items across all batches."""
        return self.total_len

    def __getitem__(self, idx):
        """Return the item at flat position ``idx``.

        Negative indices count from the end, as for Python sequences.

        Raises:
            IndexError: If ``idx`` lies outside the dataset
        """
        import bisect  # local import keeps this cell free of extra top-level dependencies

        if idx < 0:
            idx += self.total_len
        if not 0 <= idx < self.total_len:
            raise IndexError("dataset index out of range")

        # Binary search for the last offset <= idx (empty batches are skipped naturally)
        batch_idx = bisect.bisect_right(self.cumulative_sizes, idx) - 1
        rel_idx = idx - self.cumulative_sizes[batch_idx]
        return self.embeddings_list[batch_idx][rel_idx]

# ===== Step 6: Summary Extraction and Evaluation =====
def extract_top_sentences(scores, articles, sentence_counts, top_k=3):
    """Build an extractive summary for every article from its sentence scores.

    Args:
        scores: Per-article score rows, indexable as ``scores[i]``
        articles: Article texts, in the same order as ``scores``
        sentence_counts: For each article, how many leading scores are valid
        top_k: Maximum number of sentences per summary

    Returns:
        One summary string per article. The chosen sentences keep their
        original order. An article without sentences, or one that raises an
        error, contributes an empty string.
    """
    summaries = []

    for i, article in enumerate(articles):
        try:
            parts = split_into_sentences(article)
            if len(parts) == 0:
                summaries.append("")
                continue

            # Ignore scores that belong to padding sentences
            valid = sentence_counts[i]
            usable_scores = scores[i][:valid]

            if len(usable_scores) > top_k:
                # Highest scores first, then keep the leading top_k positions
                chosen = np.argsort(-usable_scores)[:top_k]
            else:
                chosen = np.arange(len(usable_scores))

            # Restore document order and drop positions beyond the sentence list
            picked = [parts[pos] for pos in sorted(chosen) if pos < len(parts)]
            summaries.append(" ".join(picked))
        except Exception as e:
            print(f"Error extracting summary for article {i}: {e}")
            summaries.append("")

    return summaries

def compute_rouge(predicted_summaries, gold_summaries):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over summary pairs.

    A prediction or reference that is not a string, or is blank, is replaced
    by the placeholder text "empty summary" before scoring.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to the mean F-measure.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    collected = {'rouge1': [], 'rouge2': [], 'rougeL': []}

    for pred, gold in zip(predicted_summaries, gold_summaries):
        candidate = pred if isinstance(pred, str) and pred.strip() else "empty summary"
        reference = gold if isinstance(gold, str) and gold.strip() else "empty summary"

        # RougeScorer.score takes the reference first, then the prediction
        outcome = scorer.score(reference, candidate)
        for metric, values in collected.items():
            values.append(outcome[metric].fmeasure)

    return {metric: np.mean(values) for metric, values in collected.items()}

# ===== Step 7: Main Evaluation Function =====
def evaluate_model(model_path, test_data_path, word2idx, embedding_vectors, num_samples=None):
    """Run the whole evaluation: load model, embed articles, score, summarize, compute ROUGE.

    Args:
        model_path: Path given to ``torch.load``. The loaded object may be a
            dict with a 'model' entry or the model object itself.
        test_data_path: CSV path forwarded to ``process_test_dataset``
        word2idx: Word-to-row mapping forwarded to ``process_test_dataset``
        embedding_vectors: Embedding matrix forwarded to ``process_test_dataset``
        num_samples: Forwarded to ``process_test_dataset`` to limit how many
            leading CSV rows are used

    Returns:
        The dict returned by ``compute_rouge`` (mean F-measure per ROUGE variant).

    Side effects:
        Prints progress, and writes "hierarchical_test_predictions_sequential.csv"
        and "sample_predictions_sequential.csv" to the current working directory.
    """
    print(f"Starting evaluation pipeline with first {num_samples} samples...")
    
    # Step 1: Load model
    print(f"Loading model from {model_path}...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Load on CPU first, then move to GPU if needed
    # NOTE(review): torch.load unpickles the file, so only load model files you trust.
    checkpoint = torch.load(model_path, map_location='cpu')
    
    # Handle both checkpoint format and direct model format
    if isinstance(checkpoint, dict) and 'model' in checkpoint:
        model = checkpoint['model']
    else:
        model = checkpoint
    
    # Move model to device
    model = model.to(device)
    print(f"Using device: {device}")
    
    # Step 2: Generate test embeddings with sequential sampling (first num_samples entries)
    print("Generating test embeddings...")
    # batch_size=1 here means every article becomes its own single-item tensor batch
    embedding_batches, original_articles, highlights, sentence_counts = process_test_dataset(
        test_data_path, word2idx, embedding_vectors, batch_size=1, num_samples=num_samples
    )
    
    # Step 3: Create dataset
    # The DataLoader regroups the single-article batches into groups of 4, in order
    test_dataset = TextSummaryDataset(embedding_batches)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, num_workers=0)
    
    # Step 4: Extract sentence scores
    model.eval()
    all_scores = []
    
    print("Generating sentence scores...")
    with torch.no_grad():
        for batch_idx, inputs in enumerate(tqdm(test_loader, desc="Scoring sentences")):
            try:
                # Move batch to device
                inputs = inputs.to(device)
                
                # Forward pass
                # NOTE(review): assumes the model returns one tensor of scores; a model
                # returning a tuple would fail on .cpu() below and fall into the
                # zero-score fallback — confirm against the loaded model.
                scores = model(inputs)
                all_scores.append(scores.cpu().numpy())
                
                # Clear GPU memory
                del inputs
                torch.cuda.empty_cache() if torch.cuda.is_available() else None
                
            except Exception as e:
                print(f"Error processing batch {batch_idx}: {e}")
                # Add empty scores to maintain alignment
                # NOTE(review): `inputs` is only missing from locals() if the failure
                # happened after `del inputs` above, i.e. after this batch's scores
                # were already appended — in that case this adds a second entry.
                batch_size = inputs.size(0) if 'inputs' in locals() else 4
                empty_scores = np.zeros((batch_size, MAX_SENTENCES))
                all_scores.append(empty_scores)
                continue
    
    # Combine all batch results
    # NOTE(review): np.concatenate raises ValueError when all_scores is empty (no test rows)
    sentence_scores = np.concatenate(all_scores, axis=0)
    
    # Step 5: Generate summaries
    print("Extracting top sentences for summaries...")
    predicted_summaries = extract_top_sentences(
        sentence_scores, 
        original_articles,
        sentence_counts,
        top_k=3
    )
    
    # Step 6: Compute ROUGE scores
    print("Computing ROUGE scores...")
    rouge_scores = compute_rouge(predicted_summaries, highlights)
    
    # Step 7: Print results
    print("\nROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"{key}: {value:.4f}")
    
    # Step 8: Save predictions
    results_df = pd.DataFrame({
        'article': original_articles,
        'highlights': highlights,
        'predicted_summary': predicted_summaries
    })
    
    # Save in chunks to avoid memory issues
    # The first chunk creates the file with a header; later chunks are appended without one
    chunk_size = 500
    output_filename = "hierarchical_test_predictions_sequential.csv"
    for i in range(0, len(results_df), chunk_size):
        chunk = results_df.iloc[i:i+chunk_size]
        if i == 0:
            chunk.to_csv(output_filename, index=False)
        else:
            chunk.to_csv(output_filename, mode='a', header=False, index=False)
    
    print(f"Saved predictions to {output_filename}")
    
    # Step 9: Save a sample of predictions for manual inspection
    sample_df = results_df.head(10)
    sample_filename = "sample_predictions_sequential.csv"
    sample_df.to_csv(sample_filename, index=False)
    print(f"Saved sample predictions to {sample_filename}")
    
    return rouge_scores

# ===== Step 8: Run Evaluation =====
if __name__ == "__main__":
    import traceback

    try:
        # The embedding matrix is passed along as-is; nothing is moved to the GPU here
        print(f"Word2Vec embeddings shape: {word2vec_vectors.shape}")

        # Evaluate on the first 1500 rows of the test CSV
        num_samples = 1500

        evaluate_model(
            model_path="/kaggle/input/word2vec-hierarchial-model/final_model.pt",
            test_data_path="/kaggle/input/sampled-20k/test_2000.csv",
            word2idx=word2idx,
            embedding_vectors=word2vec_vectors,
            num_samples=num_samples,
        )
    except Exception as e:
        # Report the failure with a full traceback instead of aborting the notebook cell
        print(f"Evaluation failed: {e}")
        traceback.print_exc()

### FastText

#### Training

In [None]:
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np

# Hierarchical attention model definition (repeated in this cell)
class WordEncoder(nn.Module):
    """Encode the words of each sentence into one sentence vector.

    Pipeline: parallel 1D convolutions over the word axis, global max pooling
    of every convolution, a bidirectional GRU and an attention-weighted sum.
    """

    def __init__(self, embedding_dim, hidden_dim, num_filters=100, filter_sizes=(1, 2, 3, 4, 5, 6, 7)):
        """Create the layers.

        Args:
            embedding_dim: Size of each word embedding (convolution input channels)
            hidden_dim: GRU hidden size per direction
            num_filters: Output channels of every convolution
            filter_sizes: Kernel widths, one convolution per entry
        """
        super().__init__()
        # Materialise once so any iterable works and the default is never a shared mutable list
        filter_sizes = tuple(filter_sizes)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, fs, padding=fs // 2)
            for fs in filter_sizes
        ])
        self.gru = nn.GRU(num_filters * len(filter_sizes), hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, word_embeddings):
        """Encode a batch of documents.

        Args:
            word_embeddings: Tensor of shape (batch, max_sentences, max_words, embedding_dim)

        Returns:
            Tuple of sentence vectors with shape (batch, max_sentences, hidden_dim * 2)
            and word attention weights with shape (batch, max_sentences, max_words).
        """
        B, S, W, E = word_embeddings.size()
        # reshape (unlike view) also accepts non-contiguous input, e.g. after a transpose
        x = word_embeddings.reshape(B * S, W, E).permute(0, 2, 1)  # (B*S, E, W)
        convs = [F.relu(conv(x)) for conv in self.convs]
        # Pooling spans each convolution's full output length, whatever the kernel width
        pools = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in convs]  # (B*S, num_filters)
        # The pooled feature vector is repeated for every word position before the GRU
        x_cnn = torch.cat(pools, dim=1).unsqueeze(1).expand(-1, W, -1)  # (B*S, W, CNN_DIM)
        x_gru, _ = self.gru(x_cnn)
        # Softmax over the word positions of each sentence
        attn_weights = torch.softmax(self.attention(x_gru).squeeze(-1), dim=1)
        sent_vec = torch.sum(attn_weights.unsqueeze(-1) * x_gru, dim=1)  # (B*S, hidden_dim*2)
        return sent_vec.view(B, S, -1), attn_weights.view(B, S, W)  # (B, S, hidden_dim*2), (B, S, W)

class SentenceEncoder(nn.Module):
    """Score every sentence of a document from its sentence vectors.

    A bidirectional GRU contextualises the sentence vectors. A linear layer
    followed by a sigmoid gives one score per sentence, and a separate linear
    layer followed by a softmax gives sentence attention weights.
    """

    def __init__(self, input_dim, hidden_dim):
        """Create the layers.

        Args:
            input_dim: Size of each incoming sentence vector
            hidden_dim: GRU hidden size per direction
        """
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, sentence_vecs):
        """Score the sentences.

        Args:
            sentence_vecs: Tensor of shape (batch, max_sentences, input_dim)

        Returns:
            Tuple ``(scores, attn_weights)``, both of shape (batch, max_sentences).
            Scores lie between 0 and 1; attention weights sum to 1 per document.
        """
        x, _ = self.gru(sentence_vecs)  # (B, S, 2*H)
        # The attention weights are returned to the caller but do not feed into the scores
        attn_weights = torch.softmax(self.attention(x).squeeze(-1), dim=1)
        scores = torch.sigmoid(self.fc(x)).squeeze(-1)  # (B, S)
        return scores, attn_weights

class HierarchicalAttentionSummarizer(nn.Module):
    """Two-level sentence scorer: a WordEncoder feeding a SentenceEncoder."""

    def __init__(self, embedding_dim, word_hidden_dim=256, sent_hidden_dim=256):
        super().__init__()
        # The word encoder's GRU is bidirectional, so each sentence vector has
        # twice word_hidden_dim features.
        sentence_feature_dim = word_hidden_dim * 2
        self.word_encoder = WordEncoder(embedding_dim, word_hidden_dim)
        self.sent_encoder = SentenceEncoder(sentence_feature_dim, sent_hidden_dim)

    def forward(self, x):
        """Score the sentences of each document.

        Args:
            x: Tensor of shape (batch_size, max_sentences, max_words, embedding_dim)

        Returns:
            Tuple ``(word_attn, sent_attn, sentence_scores)`` with shapes
            (B, S, W), (B, S) and (B, S).
        """
        sentence_features, word_weights = self.word_encoder(x)  # (B, S, 2*H), (B, S, W)
        scores, sentence_weights = self.sent_encoder(sentence_features)  # (B, S), (B, S)
        return word_weights, sentence_weights, scores

# Define dataset class for hierarchical input
class HierarchicalSummaryDataset(Dataset):
    """Dataset over a tensor of document embeddings, one document per item."""

    def __init__(self, embeddings):
        """Keep a CPU copy of the embeddings.

        Args:
            embeddings: Tensor whose first dimension indexes the documents
        """
        self.embeddings = embeddings.cpu()  # Ensure embeddings are on CPU

    def __len__(self):
        """Return the number of documents."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return document ``idx`` as an independent float32 tensor.

        The item is the full 3D tensor (max_sentences, max_words, embedding_dim).
        """
        # detach().clone() copies the slice without the UserWarning that
        # torch.tensor() raises when it is handed an existing tensor.
        return self.embeddings[idx].detach().clone().to(torch.float32)

# Function to train model on a single file
# Function to train model on a single file
def train_on_file(model, file_path, device, epochs=20):
    """Train the model for several epochs on the embeddings stored in one file.

    Args:
        model: Model whose forward pass returns ``(word_attn, sent_attn, sentence_scores)``
        file_path: Path of a file readable by ``torch.load`` that holds the embeddings
        device: Device each input batch is moved to
        epochs: Number of passes over the file

    Returns:
        The same model object after training.
    """
    print(f"\nTraining on file: {os.path.basename(file_path)}")
    
    # Load embeddings from file to CPU first
    embeddings = torch.load(file_path, map_location="cpu")
    
    # Create dataset and dataloader with the hierarchical dataset
    dataset = HierarchicalSummaryDataset(embeddings)
    dataloader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=0)
    
    # Set up optimizer
    # A fresh optimizer is created on every call, so its state is not carried between files.
    # NOTE(review): beta1=0.99 is higher than torch.optim.Adam's default of 0.9 — confirm intent.
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.99, 0.999))
    criterion = nn.BCELoss()  # Binary cross-entropy for sentence scoring
    
    # Training loop for this file
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        
        for inputs in progress:
            inputs = inputs.to(device)
            
            optimizer.zero_grad()
            
            # Forward pass with hierarchical attention model
            word_attn, sent_attn, sentence_scores = model(inputs)
            
            # Create pseudo-labels (assuming top 3 sentences are summary-worthy)
            # This is a simplistic approach - in practice, you might have actual labels
            # NOTE(review): the labels are derived from the model's own current scores,
            # not from reference summaries.
            batch_size, num_sentences = sentence_scores.shape
            pseudo_labels = torch.zeros_like(sentence_scores)
            
            # Set top 3 sentences in each document as positive (1)
            for i in range(batch_size):
                # Get scores for this document and find top 3 indices
                doc_scores = sentence_scores[i]
                # NOTE(review): the scores come out of a sigmoid in SentenceEncoder, so this
                # check presumably always passes — confirm whether it is still needed.
                if torch.sum(doc_scores > 0) > 0:  # Check if there are valid scores
                    _, top_indices = torch.topk(doc_scores, min(3, num_sentences))
                    pseudo_labels[i, top_indices] = 1.0
            
            # Compute loss
            loss = criterion(sentence_scores, pseudo_labels)
            
            # Add regularization for attention distribution if needed
            # This encourages more diverse attention
            # NOTE(review): both attention tensors are softmax outputs, so their mean
            # absolute value looks constant — verify this term has any effect.
            attn_reg = 0.01 * (torch.mean(torch.abs(word_attn)) + torch.mean(torch.abs(sent_attn)))
            total_loss = loss + attn_reg
            
            # Backward pass
            total_loss.backward()
            
            # Clip gradients to prevent explosion
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            
            running_loss += total_loss.item()
            # Running average shown in the progress bar (progress.n is presumably the
            # count of iterations tqdm has completed so far)
            progress.set_postfix(loss=running_loss / (progress.n + 1))
        
        print(f"File: {os.path.basename(file_path)} - Epoch {epoch+1}, Loss: {running_loss/len(dataloader):.4f}")
    
    # Clear memory to avoid OOM errors
    del embeddings, dataset, dataloader
    torch.cuda.empty_cache()
    
    return model

def main():
    """Train the hierarchical attention model on all FastText chunk files, one after another.

    A checkpoint holding the model and the index of the last finished file is
    written after every file. If that checkpoint exists at start-up, training
    resumes with the next file. The final model is saved to
    "final_hierarchical_model.pt" in the current working directory.

    Raises:
        FileNotFoundError: If no checkpoint exists and no training file was found
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Get all the training files (each directory sorted on its own, part 1 before part 2)
    files_part1 = sorted(glob.glob("/kaggle/input/fasttext-train-chunk-pt1/*.pt"))
    files_part2 = sorted(glob.glob("/kaggle/input/fasttext-train-chunk-pt2/*.pt"))
    all_files = files_part1 + files_part2

    print(f"Found {len(all_files)} training files")

    # Check if there's a checkpoint to continue from
    checkpoint_path = "hierarchical_model_checkpoint.pt"
    start_file_idx = 0

    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}")
        # Load onto the CPU first so resuming also works on a machine without the
        # device the checkpoint was written on; the model is moved to `device` below.
        # NOTE(review): torch.load unpickles the file, so only load checkpoints you trust.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint['model']
        start_file_idx = checkpoint['file_idx'] + 1
        print(f"Continuing from file index {start_file_idx}")
    else:
        if not all_files:
            # Fail with a clear message instead of an IndexError on all_files[0]
            raise FileNotFoundError(
                "No training files (*.pt) found in /kaggle/input/fasttext-train-chunk-pt1 "
                "or /kaggle/input/fasttext-train-chunk-pt2"
            )

        # Load first file just to get embedding dimensions
        sample_data = torch.load(all_files[0], map_location="cpu")
        embedding_dim = sample_data.shape[-1]  # Should be 300 for Word2Vec/FastText
        del sample_data  # Free memory

        # Initialize the hierarchical attention model
        word_hidden_dim = 256
        sent_hidden_dim = 256

        model = HierarchicalAttentionSummarizer(
            embedding_dim=embedding_dim,
            word_hidden_dim=word_hidden_dim,
            sent_hidden_dim=sent_hidden_dim
        )

        print(f"Initialized new hierarchical attention model with embedding_dim={embedding_dim}")

    # Move model to device
    model = model.to(device)

    # Train on each file sequentially, skipping files already covered by the checkpoint
    for i, file_path in enumerate(all_files[start_file_idx:], start=start_file_idx):
        print(f"\nProcessing file {i+1}/{len(all_files)}: {os.path.basename(file_path)}")

        # Train model on this file
        model = train_on_file(model, file_path, device, epochs=20)

        # Save checkpoint after each file
        checkpoint = {
            'model': model.cpu(),  # Save model to CPU to avoid CUDA memory issues
            'file_idx': i
        }
        torch.save(checkpoint, checkpoint_path)

        # Move model back to device for next training
        model = model.to(device)

        print(f"Saved checkpoint after file {i+1}/{len(all_files)}")

    # Save final model
    torch.save(model.cpu(), "final_hierarchical_model.pt")
    print("Training completed on all files. Final hierarchical attention model saved.")

# Start training only when this code runs as the main program, not when it is imported.
if __name__ == "__main__":
    main()

#### Testing

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import re
from rouge_score import rouge_scorer

# Hierarchical Attention Model definition
class WordEncoder(nn.Module):
    """Encode the words of each sentence into one sentence vector.

    Pipeline: parallel 1D convolutions over the word axis, global max pooling
    of every convolution, a bidirectional GRU and an attention-weighted sum.
    """

    def __init__(self, embedding_dim, hidden_dim, num_filters=100, filter_sizes=(1, 2, 3, 4, 5, 6, 7)):
        """Create the layers.

        Args:
            embedding_dim: Size of each word embedding (convolution input channels)
            hidden_dim: GRU hidden size per direction
            num_filters: Output channels of every convolution
            filter_sizes: Kernel widths, one convolution per entry
        """
        super().__init__()
        # Materialise once so any iterable works and the default is never a shared mutable list
        filter_sizes = tuple(filter_sizes)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, fs, padding=fs // 2)
            for fs in filter_sizes
        ])
        self.gru = nn.GRU(num_filters * len(filter_sizes), hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, word_embeddings):
        """Encode a batch of documents.

        Args:
            word_embeddings: Tensor of shape (batch, max_sentences, max_words, embedding_dim)

        Returns:
            Tuple of sentence vectors with shape (batch, max_sentences, hidden_dim * 2)
            and word attention weights with shape (batch, max_sentences, max_words).
        """
        B, S, W, E = word_embeddings.size()
        # reshape (unlike view) also accepts non-contiguous input, e.g. after a transpose
        x = word_embeddings.reshape(B * S, W, E).permute(0, 2, 1)  # (B*S, E, W)
        convs = [F.relu(conv(x)) for conv in self.convs]
        # Pooling spans each convolution's full output length, whatever the kernel width
        pools = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in convs]  # (B*S, num_filters)
        # The pooled feature vector is repeated for every word position before the GRU
        x_cnn = torch.cat(pools, dim=1).unsqueeze(1).expand(-1, W, -1)  # (B*S, W, CNN_DIM)
        x_gru, _ = self.gru(x_cnn)
        # Softmax over the word positions of each sentence
        attn_weights = torch.softmax(self.attention(x_gru).squeeze(-1), dim=1)
        sent_vec = torch.sum(attn_weights.unsqueeze(-1) * x_gru, dim=1)  # (B*S, hidden_dim*2)
        return sent_vec.view(B, S, -1), attn_weights.view(B, S, W)  # (B, S, hidden_dim*2), (B, S, W)

class SentenceEncoder(nn.Module):
    """Score every sentence of a document from its sentence vectors.

    A bidirectional GRU contextualises the sentence vectors. A linear layer
    followed by a sigmoid gives one score per sentence, and a separate linear
    layer followed by a softmax gives sentence attention weights.
    """

    def __init__(self, input_dim, hidden_dim):
        """Create the layers.

        Args:
            input_dim: Size of each incoming sentence vector
            hidden_dim: GRU hidden size per direction
        """
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, sentence_vecs):
        """Score the sentences.

        Args:
            sentence_vecs: Tensor of shape (batch, max_sentences, input_dim)

        Returns:
            Tuple ``(scores, attn_weights)``, both of shape (batch, max_sentences).
            Scores lie between 0 and 1; attention weights sum to 1 per document.
        """
        x, _ = self.gru(sentence_vecs)  # (B, S, 2*H)
        # The attention weights are returned to the caller but do not feed into the scores
        attn_weights = torch.softmax(self.attention(x).squeeze(-1), dim=1)
        scores = torch.sigmoid(self.fc(x)).squeeze(-1)  # (B, S)
        return scores, attn_weights

class HierarchicalAttentionSummarizer(nn.Module):
    """Sentence scorer built from a word-level and a sentence-level encoder."""

    def __init__(self, embedding_dim, word_hidden_dim=256, sent_hidden_dim=256):
        super().__init__()
        self.word_encoder = WordEncoder(embedding_dim, word_hidden_dim)
        # Sentence vectors come from a bidirectional GRU: two directions of word_hidden_dim each
        sentence_vector_size = 2 * word_hidden_dim
        self.sent_encoder = SentenceEncoder(sentence_vector_size, sent_hidden_dim)

    def forward(self, x):
        """Run both encoders on a batch of documents.

        Args:
            x: Tensor of shape (batch_size, max_sentences, max_words, embedding_dim)

        Returns:
            Tuple ``(word_attn, sent_attn, sentence_scores)`` with shapes
            (B, S, W), (B, S) and (B, S).
        """
        encoded, word_level_attention = self.word_encoder(x)  # (B, S, 2*H), (B, S, W)
        per_sentence_scores, sentence_level_attention = self.sent_encoder(encoded)  # (B, S), (B, S)
        return word_level_attention, sentence_level_attention, per_sentence_scores

# ===== Using already loaded FastText embeddings =====
# `fasttext` is expected to exist already in the notebook session; it is not loaded in this cell.
print("Using already loaded FastText embeddings...")

# Half-precision tensor copy of the embedding matrix
fasttext_vectors = torch.tensor(fasttext.vectors, dtype=torch.float16)

# Vocabulary in row order, plus the reverse lookup from token to row number
fasttext_vocab = fasttext.index_to_key
word2idx = {token: row for row, token in enumerate(fasttext_vocab)}

# ===== Text Processing Functions =====
def preprocess_text(text):
    """Reduce text to lowercase ASCII words separated by single spaces.

    Returns an empty string for input that is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Step 1 blanks out everything except ASCII letters and whitespace;
    # step 2 squeezes each whitespace run into a single space.
    cleaned = text
    for pattern in (r'[^a-zA-Z\s]', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)

    return cleaned.lower().strip()

def split_into_sentences(text):
    """Split text into sentences wherever whitespace follows '.', '!' or '?'.

    Returns the stripped, non-empty pieces as a list; non-string input gives [].
    """
    if not isinstance(text, str):
        return []

    # The lookbehind leaves the end-of-sentence punctuation on the sentence itself
    stripped_pieces = map(str.strip, re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in stripped_pieces if piece]

# ===== Embedding Generation Functions =====
MAX_SENTENCES = 50  # Sentences kept per article; shorter articles are zero-padded up to this count
MAX_WORDS = 50  # Words kept per sentence; shorter sentences are zero-padded up to this count
EMBEDDING_DIM = fasttext_vectors.shape[1]  # Use fasttext dimension (width of the loaded embedding matrix)

def get_embedding(word, word2idx, embedding_vectors):
    """Retrieve the embedding of a word, with fallbacks for unknown words.

    Lookup order:
      1. the row of ``embedding_vectors`` registered for the word in ``word2idx``;
      2. the module-level ``fasttext`` object's ``get_vector`` method, if it has one;
      3. an all-zero vector.

    Args:
        word: Token to look up
        word2idx: Mapping from token to row number in ``embedding_vectors``
        embedding_vectors: 2D tensor holding one embedding per row

    Returns:
        1D tensor with the embedding. The fallbacks use the dtype of
        ``embedding_vectors``, and the zero vector also uses its width.
    """
    idx = word2idx.get(word)
    if idx is not None:
        return embedding_vectors[idx]

    # For FastText, we can use its ability to handle OOV words if word not in vocabulary
    if hasattr(fasttext, 'get_vector'):
        try:
            return torch.tensor(fasttext.get_vector(word), dtype=embedding_vectors.dtype)
        except Exception:
            # Deliberate best effort: any lookup failure falls through to the zero
            # vector, but KeyboardInterrupt/SystemExit are no longer swallowed.
            pass

    # Zero vector for unknown words, sized and typed like the supplied matrix
    return torch.zeros(embedding_vectors.size(1), dtype=embedding_vectors.dtype)

def sentence_to_vector(sentence, word2idx, embedding_vectors, max_words=MAX_WORDS):
    """Turn one sentence into a fixed-size matrix of word embeddings.

    The sentence is cleaned with ``preprocess_text`` and split on whitespace.
    Only the first ``max_words`` tokens are embedded; remaining rows stay zero.

    Returns:
        Float16 tensor of shape (max_words, EMBEDDING_DIM).
    """
    tokens = preprocess_text(sentence).split()
    matrix = torch.zeros((max_words, EMBEDDING_DIM), dtype=torch.float16)

    # zip() stops at whichever runs out first: the rows or the tokens
    for row, token in zip(range(max_words), tokens):
        matrix[row] = get_embedding(token, word2idx, embedding_vectors)

    return matrix

def article_to_vectors(sentences, word2idx, embedding_vectors, max_sentences=MAX_SENTENCES):
    """Embed the sentences of an article into one padded 3D tensor.

    At most ``max_sentences`` sentences are embedded with ``sentence_to_vector``.
    Shorter articles are filled up with all-zero float16 sentence matrices.

    Returns:
        Tensor produced by stacking ``max_sentences`` sentence matrices.
    """
    matrices = []
    for sent in sentences[:max_sentences]:
        matrices.append(sentence_to_vector(sent, word2idx, embedding_vectors))

    shortfall = max_sentences - len(matrices)
    if shortfall > 0:
        # One shared zero matrix is enough; torch.stack copies it into each padding slot
        blank = torch.zeros((MAX_WORDS, EMBEDDING_DIM), dtype=torch.float16)
        matrices = matrices + [blank] * shortfall
    else:
        matrices = matrices[:max_sentences]

    return torch.stack(matrices)

def process_test_dataset(df_path, word2idx, embedding_vectors, batch_size=100, max_articles=1500):
    """Read the test CSV and embed its leading articles.

    The CSV must provide an "article" and a "highlights" column.

    Args:
        df_path: Path of the CSV file to read
        word2idx: Mapping from word to row number in ``embedding_vectors``
        embedding_vectors: Embedding matrix passed on to ``article_to_vectors``
        batch_size: Number of articles stacked together before concatenation
        max_articles: Only the first ``max_articles`` rows are kept

    Returns:
        Tuple ``(test_embeddings, original_articles, highlights)``: a single
        tensor with one entry per article, the article texts, and the
        reference highlights, all in CSV order.
    """
    print(f"Loading test dataset from {df_path}...")

    # The whole file is read first and then trimmed to the leading rows
    frame = pd.read_csv(df_path).head(max_articles)
    total = len(frame)
    print(f"Processing {total} test articles...")

    stacked_batches, original_articles, highlights = [], [], []

    # Embed in slices so only one slice of per-article tensors is held unstacked at a time
    for lower in tqdm(range(0, total, batch_size), desc="Processing test batches"):
        chunk = frame.iloc[lower:min(lower + batch_size, total)]

        chunk_vectors = [
            article_to_vectors(split_into_sentences(text), word2idx, embedding_vectors)
            for text in chunk["article"]
        ]

        # Keep the raw texts alongside so summaries can be built and scored later
        original_articles.extend(chunk["article"].tolist())
        highlights.extend(chunk["highlights"].tolist())

        if chunk_vectors:
            stacked_batches.append(torch.stack(chunk_vectors))

    # Join the slices along the article dimension
    test_embeddings = torch.cat(stacked_batches)
    print(f"Generated embeddings shape: {test_embeddings.shape}")

    return test_embeddings, original_articles, highlights

# ===== HierarchicalSummaryDataset Class =====
class HierarchicalSummaryDataset(Dataset):
    """Dataset over a tensor of document embeddings, one document per item."""

    def __init__(self, embeddings):
        """Keep a CPU copy of the embeddings.

        Args:
            embeddings: Tensor whose first dimension indexes the documents
        """
        self.embeddings = embeddings.cpu()  # Ensure embeddings are on CPU

    def __len__(self):
        """Return the number of documents."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return document ``idx`` as an independent float32 tensor.

        The item is the full 3D tensor (max_sentences, max_words, embedding_dim).
        """
        # detach().clone() copies the slice without the UserWarning that
        # torch.tensor() raises when it is handed an existing tensor.
        return self.embeddings[idx].detach().clone().to(torch.float32)

# ===== Summary Extraction and Evaluation =====
def extract_top_sentences(scores, articles, top_k=3):
    """Build an extractive summary for every article from its sentence scores.

    Args:
        scores: 2D array of scores; row ``i`` belongs to ``articles[i]``
        articles: Article texts
        top_k: Maximum number of sentences per summary

    Returns:
        One summary string per article. The chosen sentences keep their
        original order. An article without sentences yields an empty string.
    """
    summaries = []

    for row, text in enumerate(articles):
        parts = split_into_sentences(text)
        if not parts:
            summaries.append("")
            continue

        # Only as many scores as there are real sentences (and score columns) are considered
        usable = min(len(parts), scores.shape[1])
        row_scores = scores[row][:usable]

        if len(row_scores) > top_k:
            # Highest scores first, then keep the leading top_k positions
            chosen = np.argsort(-row_scores)[:top_k]
        else:
            chosen = np.arange(len(row_scores))

        # Restore document order before joining
        summaries.append(" ".join(parts[pos] for pos in sorted(chosen) if pos < len(parts)))

    return summaries

def compute_rouge(predicted_summaries, gold_summaries):
    """Average ROUGE-1 / ROUGE-2 / ROUGE-L F-measures over summary pairs.

    Predictions and references are paired positionally with ``zip``. Any
    entry that is not a string, or is blank after stripping, is replaced by
    the placeholder text "empty summary" before scoring.

    Returns:
        Dict mapping 'rouge1', 'rouge2' and 'rougeL' to the ``np.mean`` of
        the per-pair F-measures.
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    placeholder = "empty summary"

    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)
    per_metric = {name: [] for name in metric_names}

    for pred, gold in zip(predicted_summaries, gold_summaries):
        # Substitute the placeholder for missing / blank text on either side
        candidate = pred if isinstance(pred, str) and pred.strip() else placeholder
        reference = gold if isinstance(gold, str) and gold.strip() else placeholder

        # RougeScorer.score takes the reference first, then the prediction
        result = scorer.score(reference, candidate)

        for name, values in per_metric.items():
            values.append(result[name].fmeasure)

    # Collapse the per-pair F-measures into one mean per metric
    return {name: np.mean(values) for name, values in per_metric.items()}

# ===== Main Evaluation Function =====
def evaluate_model(model_path, test_data_path, word2idx, embedding_vectors):
    """Run the end-to-end evaluation pipeline for a saved hierarchical model.

    Loads the pickled model, builds test embeddings via
    ``process_test_dataset`` (limited to 1500 articles), scores every
    sentence, keeps the top 3 sentences per article as the predicted
    summary, and reports average ROUGE F-measures.

    Side effects: writes ``fasttext_hierarchical_test_embeddings_1500.pt`` and
    ``hierarchical_test_predictions_1500.csv`` to the working directory and
    prints progress to stdout.

    Args:
        model_path: Path to a model object saved with ``torch.save``. The
            model's forward pass must return a 3-tuple whose last element
            holds the sentence scores.
        test_data_path: Path forwarded to ``process_test_dataset``.
        word2idx: Vocabulary mapping forwarded to ``process_test_dataset``.
        embedding_vectors: Embedding matrix forwarded to
            ``process_test_dataset``.

    Returns:
        Dict of average ROUGE scores as produced by ``compute_rouge``.
    """
    print("Starting evaluation pipeline...")

    # Resolve the device first so the checkpoint can be mapped onto it directly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Step 1: Load model
    print(f"Loading model from {model_path}...")
    # map_location lets a model saved from a CUDA session load on a CPU-only host;
    # without it torch.load tries to restore tensors onto the original CUDA device.
    # NOTE(review): this unpickles a full model object -- only load trusted files.
    # Recent PyTorch releases default torch.load to weights_only=True, which
    # rejects pickled modules; confirm the installed version before relying on this.
    model = torch.load(model_path, map_location=device)

    # Step 2: Set device
    print(f"Using device: {device}")
    model = model.to(device)

    # Step 3: Generate test embeddings
    print("Generating test embeddings...")
    test_embeddings, original_articles, highlights = process_test_dataset(
        test_data_path, word2idx, embedding_vectors, max_articles=1500  # Limit to 1500 articles
    )

    # Step 4: Save embeddings to avoid regenerating (optional)
    torch.save(test_embeddings, "fasttext_hierarchical_test_embeddings_1500.pt")
    print("Saved test embeddings to fasttext_hierarchical_test_embeddings_1500.pt")

    # Step 5: Create dataset and dataloader for hierarchical model
    test_dataset = HierarchicalSummaryDataset(test_embeddings)
    # shuffle=False keeps score rows aligned with original_articles / highlights
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=0)

    # Step 6: Extract sentence scores
    model.eval()
    all_scores = []

    print("Generating sentence scores...")
    with torch.no_grad():
        for inputs in tqdm(test_loader, desc="Scoring sentences"):
            inputs = inputs.to(device)
            # The hierarchical model returns word_attn, sent_attn, scores
            _, _, scores = model(inputs)
            all_scores.append(scores.cpu().numpy())

    # Combine all batch results into one (num_articles, ...) array
    sentence_scores = np.concatenate(all_scores, axis=0)

    # Step 7: Generate summaries
    print("Extracting top sentences for summaries...")
    predicted_summaries = extract_top_sentences(
        sentence_scores,
        original_articles,
        top_k=3
    )

    # Step 8: Compute ROUGE scores
    print("Computing ROUGE scores...")
    rouge_scores = compute_rouge(predicted_summaries, highlights)

    # Step 9: Print results
    print("\nROUGE Scores:")
    for key, value in rouge_scores.items():
        print(f"{key}: {value:.4f}")

    # Step 10: Save predictions
    results_df = pd.DataFrame({
        'article': original_articles,
        'highlights': highlights,
        'predicted_summary': predicted_summaries
    })
    results_df.to_csv("hierarchical_test_predictions_1500.csv", index=False)
    print("Saved predictions to hierarchical_test_predictions_1500.csv")

    return rouge_scores

# ===== Run Evaluation =====
if __name__ == "__main__":
    # Pick the GPU when one is available, otherwise stay on the CPU
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Get fasttext embeddings onto the selected device
    fasttext_vectors = fasttext_vectors.to(device)

    # Run evaluation of the trained hierarchical model using fasttext
    evaluate_model(
        embedding_vectors=fasttext_vectors,
        word2idx=word2idx,
        test_data_path="/kaggle/input/sampled-20k/test_2000.csv",
        model_path="final_hierarchical_model.pt",  # Path to your trained hierarchical model
    )

### GloVe

#### Training

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from tqdm import tqdm
import glob

# Hierarchical Attention Mechanism model definitions
class WordEncoder(nn.Module):
    """Encode the words of every sentence into one vector per sentence.

    Each sentence is passed through parallel 1-D convolutions (one per filter
    size), ReLU and a global max-pool over word positions. The concatenated
    pooled features are repeated for every word position, run through a
    bidirectional GRU, and reduced with a learned attention-weighted sum.

    Args:
        embedding_dim: Size of each input word embedding.
        hidden_dim: GRU hidden size per direction; output vectors have
            ``2 * hidden_dim`` features.
        num_filters: Output channels of each convolution.
        filter_sizes: Kernel widths, one convolution per entry.
    """

    def __init__(self, embedding_dim, hidden_dim, num_filters=100, filter_sizes=(1, 2, 3, 4, 5, 6, 7)):
        super().__init__()
        # Immutable default plus a local snapshot: avoids the shared mutable
        # default-argument pitfall and lets callers pass any finite iterable.
        filter_sizes = tuple(filter_sizes)
        self.convs = nn.ModuleList([
            nn.Conv1d(embedding_dim, num_filters, fs, padding=fs // 2)
            for fs in filter_sizes
        ])
        self.gru = nn.GRU(num_filters * len(filter_sizes), hidden_dim, batch_first=True, bidirectional=True)
        self.attention = nn.Linear(hidden_dim * 2, 1)

    def forward(self, word_embeddings):
        """Return sentence vectors of shape (batch, max_sentences, 2 * hidden_dim).

        Args:
            word_embeddings: Tensor of shape
                (batch, max_sentences, max_words, embedding_dim).
        """
        B, S, W, E = word_embeddings.size()
        # reshape (not view) so non-contiguous inputs, e.g. transposed or sliced
        # tensors, are accepted; contiguous inputs still get a zero-copy view.
        x = word_embeddings.reshape(B * S, W, E).permute(0, 2, 1)  # (B*S, E, W)

        convs = [F.relu(conv(x)) for conv in self.convs]
        # Global max over the word axis of each feature map
        pools = [F.max_pool1d(c, c.shape[2]).squeeze(2) for c in convs]  # (B*S, num_filters)
        # NOTE(review): the pooled vector is repeated W times, so the GRU sees the
        # same input at every timestep -- confirm this is the intended design.
        x_cnn = torch.cat(pools, dim=1).unsqueeze(1).expand(-1, W, -1)  # (B*S, W, CNN_DIM)

        x_gru, _ = self.gru(x_cnn)
        # Attention weights over word positions, normalised per sentence
        attn_weights = torch.softmax(self.attention(x_gru).squeeze(-1), dim=1)
        sent_vec = torch.sum(attn_weights.unsqueeze(-1) * x_gru, dim=1)  # (B*S, hidden_dim*2)

        return sent_vec.view(B, S, -1)  # (B, S, hidden_dim*2)

class SentenceEncoder(nn.Module):
    """Score every sentence of a document from its sentence vectors.

    A bidirectional GRU contextualises the sentence vectors and a linear
    layer followed by a sigmoid maps each contextualised vector to a score
    in (0, 1).

    Args:
        input_dim: Feature size of each incoming sentence vector.
        hidden_dim: GRU hidden size per direction.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        # forward() does not use this layer. It is kept so the module's parameter
        # set (and therefore previously saved checkpoints) stays loadable.
        self.attention = nn.Linear(hidden_dim * 2, 1)
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, sentence_vecs):
        """Return per-sentence scores of shape (B, S) for input (B, S, input_dim)."""
        x, _ = self.gru(sentence_vecs)  # (B, S, 2*H)
        # Only the per-sentence scores are returned, so no attention-pooled
        # document vector is computed here: it would never reach the output.
        scores = torch.sigmoid(self.fc(x)).squeeze(-1)  # (B, S)

        return scores

class HierarchicalAttentionSummarizer(nn.Module):
    """Two-level summarizer: a word-level encoder feeding a sentence-level scorer.

    Args:
        embedding_dim: Size of each input word embedding.
        word_hidden_dim: Hidden size handed to ``WordEncoder``.
        sent_hidden_dim: Hidden size handed to ``SentenceEncoder``.
    """

    def __init__(self, embedding_dim, word_hidden_dim=256, sent_hidden_dim=256):
        super().__init__()
        # The word encoder emits 2 * word_hidden_dim features per sentence,
        # which is exactly what the sentence encoder takes as input.
        sentence_vec_dim = word_hidden_dim * 2
        self.word_encoder = WordEncoder(embedding_dim, word_hidden_dim)
        self.sent_encoder = SentenceEncoder(sentence_vec_dim, sent_hidden_dim)

    def forward(self, x):
        """Map (batch_size, max_sentences, max_words, embedding_dim) to (B, S) scores."""
        # Words -> sentence vectors (B, S, 2*H) -> sentence scores (B, S)
        return self.sent_encoder(self.word_encoder(x))

# Define the dataset class
class TextSummaryDataset(Dataset):
    """Thin map-style Dataset wrapper around a pre-computed embedding container."""

    def __init__(self, embeddings):
        # Stored as-is: no copy, device move or dtype conversion happens here
        self.embeddings = embeddings

    def __getitem__(self, idx):
        """Return the stored entry at position ``idx``."""
        sample = self.embeddings[idx]
        return sample

    def __len__(self):
        """Return how many entries the wrapped container holds."""
        n_samples = len(self.embeddings)
        return n_samples

def train_on_file(model, file_path, device, epochs=20):
    """Train ``model`` on the embeddings stored in one ``.pt`` file.

    The file is loaded onto the CPU, cast to float32 and served in shuffled
    batches of 32. A fresh Adam optimizer is created on every call, so
    optimizer state is not carried over between files.

    Args:
        model: Module that maps a batch of embeddings to sentence scores.
        file_path: Path to a tensor saved with ``torch.save``.
        device: Device each batch is moved to before the forward pass.
        epochs: Number of passes over the file.

    Returns:
        The same ``model`` instance, updated in place.
    """
    print(f"\nTraining on file: {os.path.basename(file_path)}")

    # Load embeddings from file to CPU first
    embeddings = torch.load(file_path, map_location="cpu")

    # Convert embeddings to float32 to match model parameters
    embeddings = embeddings.to(torch.float32)

    # Create dataset and dataloader
    dataset = TextSummaryDataset(embeddings)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)
    # Guard the epoch average against a zero-length loader
    num_batches = max(len(dataloader), 1)

    # Set up optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.99, 0.999))

    # Training loop for this file
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        progress = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        for step, inputs in enumerate(progress, start=1):
            # Single transfer that also guarantees float32 inputs
            inputs = inputs.to(device=device, dtype=torch.float32)

            optimizer.zero_grad()

            # Forward pass - get sentence scores
            sentence_scores = model(inputs)

            # Compute loss - using the same approximation as before
            # This could be replaced with a more appropriate loss for summarization
            # NOTE(review): this is the per-article variance of the scores, so
            # minimising it pushes all sentence scores toward the same value.
            loss = torch.mean((sentence_scores - sentence_scores.mean(dim=1, keepdim=True)) ** 2)

            # Backward pass
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # Average over the batches seen so far; dividing by the total batch
            # count would understate the loss until the final batch.
            progress.set_postfix(loss=running_loss / step)

        print(f"File: {os.path.basename(file_path)} - Epoch {epoch+1}, Loss: {running_loss/num_batches:.4f}")

    # Clear memory to avoid OOM errors
    del embeddings, dataset, dataloader
    torch.cuda.empty_cache()

    return model

def _save_checkpoint(model, file_idx, checkpoint_path):
    """Atomically write a ``{'model', 'file_idx'}`` checkpoint to ``checkpoint_path``."""
    checkpoint = {
        'model': model.cpu(),  # Save model to CPU to avoid CUDA memory issues
        'file_idx': file_idx,
    }
    # Write to a sibling temp file and swap it in, so an interrupted save
    # cannot leave a truncated checkpoint behind.
    tmp_path = f"{checkpoint_path}.tmp"
    torch.save(checkpoint, tmp_path)
    os.replace(tmp_path, checkpoint_path)


def main():
    """Train the hierarchical attention model on every GloVe chunk file in turn.

    Resumes from ``hierarchical_attention_checkpoint.pt`` when it exists,
    otherwise builds a new model whose embedding size is read from the first
    chunk. A checkpoint is written after each file, and the final model is
    saved to ``hierarchical_attention_final_model.pt``.

    Raises:
        FileNotFoundError: If no checkpoint exists and no training files are found.
    """
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Get all the training files - modify these paths to point to your GloVe embeddings
    files_part1 = sorted(glob.glob("/kaggle/input/glove-train-chunk-pt1/*.pt"))
    files_part2 = sorted(glob.glob("/kaggle/input/glove-train-chunk-pt2/*.pt"))
    glove_files = files_part1 + files_part2

    print(f"Found {len(glove_files)} training files")

    # Check if there's a checkpoint to continue from
    checkpoint_path = "hierarchical_attention_checkpoint.pt"
    start_file_idx = 0

    if os.path.exists(checkpoint_path):
        print(f"Loading checkpoint from {checkpoint_path}")
        # Checkpoints are written from the CPU; load there and move to the device below.
        # NOTE(review): this unpickles a full model object -- only load trusted files.
        # Recent PyTorch releases default torch.load to weights_only=True, which
        # rejects pickled modules; confirm the installed version.
        checkpoint = torch.load(checkpoint_path, map_location="cpu")
        model = checkpoint['model']
        start_file_idx = checkpoint['file_idx'] + 1
        print(f"Continuing from file index {start_file_idx}")
    else:
        if not glove_files:
            # Fail with an actionable message instead of an IndexError on glove_files[0]
            raise FileNotFoundError(
                "No GloVe training files (*.pt) found under "
                "/kaggle/input/glove-train-chunk-pt1 or /kaggle/input/glove-train-chunk-pt2"
            )

        # Initialize model
        # Load first file just to get embedding_dim and structure
        sample_data = torch.load(glove_files[0], map_location="cpu")
        embedding_dim = sample_data.shape[-1]  # Should be 100 for GloVe

        # Initialize the hierarchical attention model
        model = HierarchicalAttentionSummarizer(embedding_dim=embedding_dim)
        print(f"Initialized new hierarchical attention model with embedding_dim={embedding_dim}")

        del sample_data  # Free memory

    # Move model to device and ensure it's in float32
    model = model.to(device).to(torch.float32)

    # Train on each file sequentially
    for i, file_path in enumerate(glove_files[start_file_idx:], start=start_file_idx):
        print(f"\nProcessing file {i+1}/{len(glove_files)}: {os.path.basename(file_path)}")

        try:
            # Train model on this file
            model = train_on_file(model, file_path, device, epochs=20)

            # Save checkpoint after each file
            _save_checkpoint(model, i, checkpoint_path)

            # Move model back to device for next training
            model = model.to(device)

            print(f"Saved checkpoint after file {i+1}/{len(glove_files)}")

        except Exception as e:
            print(f"Error processing file {file_path}: {e}")
            # Save current checkpoint before exiting, recording the last
            # successful file so a resume retries the failed one.
            _save_checkpoint(model, i - 1, checkpoint_path)
            # Bare raise re-raises the active exception with its original traceback
            raise

    # Save final model
    torch.save(model.cpu(), "hierarchical_attention_final_model.pt")
    print("Training completed on all files. Final model saved.")

# Script entry point: start GloVe training only when this cell/module is run directly.
if __name__ == "__main__":
    main()