In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import random
import time
from collections import Counter
import spacy
from tqdm import tqdm
import os
import gc

# Download the NLTK resources this notebook needs (tokenizer models,
# the stopword lists and the WordNet data used for lemmatization).
# A list comprehension (not a generator) is used so that every download is
# attempted; all(...) is True only when each one succeeded, so the cell
# output reflects every resource rather than just the last call.
all([nltk.download(resource)
     for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab')])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Seed every random number generator the notebook relies on.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    # Only seed the CUDA generator when a GPU is actually present.
    torch.cuda.manual_seed(SEED)

In [3]:

# Load spaCy for sentence tokenization
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the model package cannot be found. Catching
    # only that keeps unrelated failures (and KeyboardInterrupt) visible.
    import subprocess
    import sys

    # Run the download with the kernel's own interpreter so the model is
    # installed into the environment this notebook imports from, and pass
    # an argument list so no shell is involved.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,  # fail here with the real cause if the download fails
    )
    nlp = spacy.load("en_core_web_sm")

# Initialize NLP tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Short hand-picked stopword list; this is the set improved_preprocess filters with.
minimal_stopwords = {'the', 'and', 'a', 'of', 'to', 'in', 'that', 'it', 'with', 'for', 'on', 'at'}

def improved_preprocess(text, lower_case=True, lemmatize=True, stopword_removal=True):
    """Clean a raw string and return it as space-joined tokens.

    Steps, in order: optional lower-casing; removal of every character that
    is not an ASCII letter, a digit, whitespace or one of ``. , ? !``;
    NLTK word tokenization; optional filtering against ``minimal_stopwords``;
    optional lemmatization with the module-level ``lemmatizer``.

    Args:
        text: Input string.
        lower_case: Lower-case the text before anything else.
        lemmatize: Pass each remaining token through ``lemmatizer.lemmatize``.
        stopword_removal: Drop tokens that appear in ``minimal_stopwords``.

    Returns:
        The processed tokens joined by single spaces.
    """
    cleaned = text.lower() if lower_case else text

    # Keep only letters, digits, whitespace and the punctuation . , ? !
    cleaned = re.sub(r'[^a-zA-Z0-9\s\.\,\?\!]', '', cleaned)

    words = nltk.word_tokenize(cleaned)

    if stopword_removal:
        words = list(filter(lambda w: w not in minimal_stopwords, words))

    if lemmatize:
        words = list(map(lambda w: lemmatizer.lemmatize(w), words))

    return " ".join(words)

def tokenize(text):
    """Lower-case ``text`` and split it into word tokens with NLTK."""
    lowered = text.lower()
    return nltk.word_tokenize(lowered)

def build_vocab(sentences, min_freq_ratio=0.01):
    """Create a token -> index mapping from an iterable of sentences.

    Every sentence is split with ``tokenize``. A token is kept when its count
    is at least ``max(1, int(total_tokens * min_freq_ratio))``. Indices 0-3
    are reserved for ``<pad>``, ``<unk>``, ``<bos>`` and ``<eos>``; kept
    tokens are numbered from 4 in order of first appearance.

    Args:
        sentences: Iterable of strings.
        min_freq_ratio: Minimum share of all tokens a word must account for.

    Returns:
        dict mapping token to integer index.
    """
    # Count sentence by sentence instead of materialising one flat token list.
    counter = Counter()
    for sent in sentences:
        counter.update(tokenize(sent))
    total = sum(counter.values())

    # Absolute count a word must reach to be kept (never below 1).
    min_count = max(1, int(total * min_freq_ratio))

    # Special tokens take the first four indices.
    vocab = {special: i for i, special in enumerate(("<pad>", "<unk>", "<bos>", "<eos>"))}

    frequent_words = (word for word, count in counter.items() if count >= min_count)
    for idx, word in enumerate(frequent_words, start=4):
        vocab[word] = idx

    print(f"Vocabulary size: {len(vocab)}")
    print(f"Min count threshold: {min_count}")
    return vocab

class WikiTitleDataset(Dataset):
    """Dataset that yields index-encoded (text, title) pairs from a DataFrame.

    Rows are read by position. The ``'text'`` and ``'title'`` values are split
    with ``tokenize`` and mapped through ``vocab``; tokens missing from the
    vocabulary map to ``<unk>``.
    """

    def __init__(self, df, vocab, max_length_text=512, max_length_title=30):
        """Store the data source and the length limits.

        Args:
            df: DataFrame with ``'text'`` and ``'title'`` columns.
            vocab: Token -> index mapping containing ``<unk>``, ``<bos>``
                and ``<eos>``.
            max_length_text: Maximum number of text tokens kept.
            max_length_title: Maximum title length, counting the added
                ``<bos>`` and ``<eos>`` markers.
        """
        self.df = df
        self.vocab = vocab
        self.max_length_text = max_length_text
        self.max_length_title = max_length_title

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def _encode(self, tokens):
        """Map tokens to vocabulary indices, using <unk> for unknown tokens."""
        return [self.vocab.get(tok, self.vocab['<unk>']) for tok in tokens]

    def __getitem__(self, idx):
        """Return the encoded tensors and raw strings for row ``idx``."""
        row = self.df.iloc[idx]
        text, title = row['text'], row['title']

        # Text: truncate to the token budget, then encode.
        text_indices = self._encode(tokenize(text)[:self.max_length_text])

        # Title: leave two slots free for the <bos>/<eos> markers.
        title_body = self._encode(tokenize(title)[:self.max_length_title - 2])
        title_indices = [self.vocab['<bos>'], *title_body, self.vocab['<eos>']]

        return {
            'text': torch.tensor(text_indices, dtype=torch.long),
            'title': torch.tensor(title_indices, dtype=torch.long),
            'raw_text': text,
            'raw_title': title,
        }

def collate_fn(batch):
    """Merge dataset items into a single padded batch for the DataLoader.

    Items are ordered by descending text length (for packed sequences), then
    texts and titles are padded with 0 up to the longest sequence in the
    batch. ``pad_sequence`` is called with its default layout, so the
    sequence dimension comes first and the batch dimension second.

    Args:
        batch: List of dicts with 'text', 'title', 'raw_text', 'raw_title'.

    Returns:
        Dict with the padded 'text' and 'title' tensors, the unpadded
        lengths as 'text_lengths' / 'title_lengths' tensors, and the
        'raw_text' / 'raw_title' lists, all in the sorted order.
    """
    pad_sequence = torch.nn.utils.rnn.pad_sequence

    # Longest text first; ties keep their incoming order.
    ordered = sorted(batch, key=lambda item: len(item['text']), reverse=True)

    texts, titles, raw_texts, raw_titles = [], [], [], []
    for item in ordered:
        texts.append(item['text'])
        titles.append(item['title'])
        raw_texts.append(item['raw_text'])
        raw_titles.append(item['raw_title'])

    text_lengths = [len(seq) for seq in texts]
    title_lengths = [len(seq) for seq in titles]

    return {
        'text': pad_sequence(texts, padding_value=0),
        'title': pad_sequence(titles, padding_value=0),
        'text_lengths': torch.tensor(text_lengths),
        'title_lengths': torch.tensor(title_lengths),
        'raw_text': raw_texts,
        'raw_title': raw_titles,
    }

 

In [4]:
print("Loading data...")
test_df = pd.read_csv('/kaggle/input/wiki-data/test.csv')
train_df = pd.read_csv('/kaggle/input/wiki-data/train.csv')
# Hold out 500 randomly sampled training rows for validation, then remove
# exactly those rows (by index label) from the training frame.
val_df = train_df.sample(n=500, random_state=42)
train_df = train_df.drop(index=val_df.index)

Loading data...


In [5]:
print("Preprocessing data...")
# Same cleaning for every split: lower-case and drop the minimal stopword
# list (lemmatize keeps its default of True).
for frame in (train_df, val_df, test_df):
    frame['text'] = frame['text'].apply(
        lambda x: improved_preprocess(x, lower_case=True, stopword_removal=True)
    )

Preprocessing data...


In [6]:
# Keep the untouched titles in 'original_title'. The guard makes this cell
# safe to re-run: without it, a second run would overwrite 'original_title'
# with titles that have already been preprocessed.
for frame in (train_df, val_df, test_df):
    if 'original_title' not in frame.columns:
        frame['original_title'] = frame['title']

# Titles keep their stopwords (stopword_removal=False). They are always
# rebuilt from 'original_title', so re-running never preprocesses twice.
# Only the train and validation titles are preprocessed; test titles are
# left as loaded.
for frame in (train_df, val_df):
    frame['title'] = frame['original_title'].apply(
        lambda x: improved_preprocess(x, lower_case=True, stopword_removal=False)
    )

In [7]:
print("Building vocabulary...")
# The vocabulary is drawn from the training texts and training titles only.
all_texts = [*train_df['text'], *train_df['title']]
# Very low frequency ratio so that most words are kept rather than omitted.
vocab = build_vocab(all_texts, min_freq_ratio=0.0000007)

Building vocabulary...
Vocabulary size: 46040
Min count threshold: 19


In [8]:
# Create datasets; all three splits share the same length limits
length_limits = {'max_length_text': 512, 'max_length_title': 30}
train_dataset = WikiTitleDataset(train_df, vocab, **length_limits)
val_dataset = WikiTitleDataset(val_df, vocab, **length_limits)
test_dataset = WikiTitleDataset(test_df, vocab, **length_limits)

batch_size = 16  # Adjust based on your GPU memory

# Create data loaders; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)  # Smaller batch for testing


In [9]:
vocab_size = len(vocab)  # number of vocabulary entries, special tokens included
embedding_dim = 300  # NOTE(review): presumably matches the 300d GloVe file below — confirm
hidden_dim = 300

# Path to GloVe embeddings
# NOTE(review): this path is under 'wiki-datas', while the CSVs are read from
# 'wiki-data' — confirm the input dataset name is intentional.
glove_path = '/kaggle/input/wiki-datas/glove.6B.300d.txt'