In [1]:
import torch 
from data.data_extract import load_pubmed_extracted
from collections import Counter
import re

data = load_pubmed_extracted('D:/data_phd_code_from_scatch/datasets/pubmed_abstracts.json')


In [2]:
def preprocess_data(texts):
    tokenized_corpus = []
    for text in texts:
        # Split on whitespace and clean tokens
        tokens = [clean_token(token) for token in text.split()]
        # Remove None values
        tokens = [token for token in tokens if token]
        tokenized_corpus.extend(tokens)
    return tokenized_corpus

def clean_token(token):
        # Remove non-alphanumeric characters, convert to lowercase
        cleaned = re.sub(r'[^a-z0-9]', '', token.lower())
        return cleaned 
    

In [3]:
def prepare_data(texts, min_freq = 50, max_vocab_size=10000):
    """
    Prepares data for Word2Vec by tokenizing, replacing rare words with 'UNK', and creating vocab mappings.

    Args:
        texts (list of str): List of text strings.
        min_freq (int): Minimum frequency for tokens to be included in the vocabulary.

    Returns:
        dict: word2idx mapping.
        dict: idx2word mapping.
        list: Updated corpus with rare words replaced by 'UNK'.
    """
    corpus = preprocess_data(texts)
    token_counts = Counter(corpus)

    vocab = ['<UNK>']  # UNK token is always first
    filtered_vocab = [token for token,count in token_counts.most_common(max_vocab_size) if count >= min_freq]
    vocab.extend(filtered_vocab)
    word2idx = {word:idx for idx,word in enumerate(vocab)}
    idx2word = {idx:word for idx,word in word2idx.items()}

    # Replace out-of-vocab words with UNK index
    updated_corpus = [word2idx.get(token, word2idx['<UNK>']) for token in corpus]    

    return word2idx, idx2word, updated_corpus


In [20]:
def generate_training_pairs(corpus, window_size=5):
    """
    Generate training pairs for Skip-Gram.

    Args:
        corpus (list): List of tokens (processed, including 'UNK').
        window_size (int): Size of the context window.

    Returns:
        list of tuples: (center_word, context_word) pairs.
    """
    end_corpus = len(corpus)
    for i,center in enumerate(corpus):
        start = max(0,i-window_size)
        stop = min(end_corpus, i + window_size + 1)
        for j in range(start,stop):
            if j != i:
                yield (center, corpus[j])


In [30]:
https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset

class Word2VecDataset(torch.utils.data.IterableDataset):
    def __init__(self, corpus, word2idx, window_size=5):
        super(Word2VecDataset).__init__()
        """
        Memory-efficient Dataset for Word2Vec using a generator.

        Args:
            corpus (list): List of tokens (including 'UNK').
            word2idx (dict): Word-to-index mapping.
            window_size (int): Size of the context window.
        """
        self.corpus = corpus
        self.word2idx = word2idx
        self.window_size = window_size

    def __iter__(self):
        """
        Generate training pairs as tensors
        
        Yields:
            tuple: (center_word_tensor, context_word_tensor)
        """
        for center, context in generate_training_pairs(self.corpus, self.window_size):
            center_idx = self.word2idx.get(center, self.word2idx.get('<UNK>', 0))
            context_idx = self.word2idx.get(context, self.word2idx.get('<UNK>', 0))
            yield torch.tensor(center_idx, dtype=torch.long), torch.tensor(context_idx, dtype=torch.long)


Center: tensor([0, 0, 0, 0]), Context: tensor([0, 0, 0, 0])
Center words: ['<UNK>', '<UNK>', '<UNK>', '<UNK>']
Context words: ['<UNK>', '<UNK>', '<UNK>', '<UNK>']


In [None]:

# When creating the DataLoader, use a worker-based approach
texts = [item['text'] for item in data]
word2idx, idx2word, updated_corpus = prepare_data(texts)
dataset = Word2VecDataset(updated_corpus, word2idx, window_size=5)

# Use num_workers for proper iteration
dataloader = torch.utils.data.DataLoader(
    dataset, 
    batch_size=4, 
    num_workers=0  # Use 0 for debugging, can increase for performance
)


In [33]:

# Iterate through dataloader
for center, context in dataloader:
    print(f"Center: {center}, Context: {context}")
    # Convert back to words if needed
    print("Center words:", [idx2word.get(idx.item(), '<UNK>') for idx in center])
    print("Context words:", [idx2word.get(idx.item(), '<UNK>') for idx in context])
    break


KeyboardInterrupt: 

In [None]:
texts = [item['text'] for item in data]
word2idx, idx2word, updated_corpus =  prepare_data(texts)

TypeError: 'NoneType' object cannot be interpreted as an integer

In [29]:
dataset = Word2VecDataset(updated_corpus, word2idx, window_size=5)
# Use num_workers for proper iteration
dataloader = torch.utils.data.DataLoader(
    dataset, 
    batch_size=4, 
    num_workers=0  # Use 0 for debugging, can increase for performance
)
# Iterate through dataloader
for center, context in dataloader:
    print(f"Center: {center}, Context: {context}")
    print(f"Center: {idx2word.get(center)}, Context: {idx2word.get(context)}")
    break

TypeError: 'NoneType' object cannot be interpreted as an integer

In [25]:
dataset[0]

NotImplementedError: Subclasses of Dataset should implement __getitem__.