In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from nltk.corpus import brown
from  gensim.utils import simple_preprocess
from tqdm import tqdm
import random



In [2]:
brown_words = brown.words()
brown_text = " ".join(brown.words()).lower()

In [3]:
num_sents = len(brown.sents())
print("number of sentences:", num_sents)

number of sentences: 57340


In [4]:
len(np.unique(simple_preprocess(brown_text.lower())))

41239

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter

class Word2VecSentenceDataset_bak(Dataset):
    def __init__(self, sentences, window_size=2, negative_samples=5):
        """
        sentences: List of sentences, where each sentence is a list of words.
        window_size: The size of the context window.
        negative_samples: Number of negative samples to generate for each positive pair.
        """
        self.tokens = [word for sentence in sentences for word in sentence]  # Flatten the list of sentences
        self.vocab = {word: i for i, word in enumerate(set(self.tokens))}
        self.index_to_word = {i: word for word, i in self.vocab.items()}
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.word_frequencies = np.array([freq for word, freq in Counter(self.tokens).items()])**0.75
        self.word_frequencies /= self.word_frequencies.sum()
        self.data = self.generate_training_data(sentences)

    def generate_training_data(self, sentences):
        positive_pairs = []
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                target_index = self.vocab[target_word]
                context_indices = range(max(0, i - self.window_size), min(len(sentence), i + self.window_size + 1))
                for j in context_indices:
                    if i != j:  # Exclude the target word itself
                        context_word = sentence[j]
                        context_index = self.vocab[context_word]
                        positive_pairs.append((target_index, context_index))
        return positive_pairs

    def get_negative_samples(self, target, num_samples):
        negatives = []
        while len(negatives) < num_samples:
            neg_sample = np.random.choice(len(self.vocab), p=self.word_frequencies)
            if neg_sample != target:
                negatives.append(neg_sample)
        return negatives

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target, context = self.data[idx]
        negatives = self.get_negative_samples(target, self.negative_samples)
        return torch.tensor(target, dtype=torch.long), torch.tensor(context, dtype=torch.long), torch.tensor(negatives, dtype=torch.long)


In [51]:
a = {"x":4, "y":3}
list(a.keys())

['x', 'y']

In [102]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter

class Word2VecSentenceDataset(Dataset):
    def __init__(self, sentences, window_size=2, negative_samples=5, total_negative_samples=10000):
        self.tokens = [word for sentence in sentences for word in sentence]
        self.vocab = {word: i for i, word in enumerate(set(self.tokens))}
        self.index_to_word = {i: word for word, i in self.vocab.items()}
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.word_frequencies = np.array([freq for word, freq in Counter(self.tokens).items()])**0.75
        self.word_frequencies /= self.word_frequencies.sum()
        self.negative_sample_pool = np.random.choice(len(self.word_frequencies), size=total_negative_samples, replace=True, p=self.word_frequencies).tolist()

        self.data = self.generate_training_data(sentences)

    def generate_training_data(self, sentences):
        training_data = []
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                target_index = self.vocab[target_word]
                context_indices = range(max(0, i - self.window_size), min(len(sentence), i + self.window_size + 1))
                for j in context_indices:
                    if i != j:  # Exclude the target word itself
                        context_word = sentence[j]
                        context_index = self.vocab[context_word]

                        negative_samples = random.sample(self.negative_sample_pool, k=self.negative_samples)
                        #negative_samples = self.get_negative_samples(target_index, self.negative_samples)
                        training_data.append((target_index, context_index, negative_samples))
   
        return training_data

    def get_negative_samples(self, target, num_samples):
        negatives = []
        while len(negatives) < num_samples:
            neg_sample = np.random.choice(len(self.vocab), p=self.word_frequencies)
            if neg_sample != target:
                negatives.append(neg_sample)
        return negatives

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target, context, negatives = self.data[idx]
        return torch.tensor(target, dtype=torch.long), torch.tensor(context, dtype=torch.long), torch.tensor(negatives, dtype=torch.long)


In [79]:
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecModel, self).__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)

        # Initialize weights
        self.target_embeddings.weight.data.uniform_(-1, 1)
        self.context_embeddings.weight.data.uniform_(-1, 1)

    def forward(self, target_words, context_words, negative_words):
        target_embeds = self.target_embeddings(target_words)
        context_embeds = self.context_embeddings(context_words)
        negative_embeds = self.context_embeddings(negative_words)

        positive_score = torch.sum(target_embeds * context_embeds, dim=1)
        negative_score = torch.bmm(negative_embeds, target_embeds.unsqueeze(2)).squeeze()

        return -torch.mean(torch.log(torch.sigmoid(positive_score)) + torch.sum(torch.log(torch.sigmoid(-negative_score)), dim=1))


In [8]:
def preprocess_brown_corpus():
    processed_sentences = []
    for sentence in brown.sents():
        processed_sentence = simple_preprocess(' '.join(sentence), deacc=True)  
        processed_sentences.append(processed_sentence)
    return processed_sentences

In [99]:
def prepare_dataset():
    # Example corpus
    corpus = preprocess_brown_corpus()
    print("preprocessed")
    # Hyperparameters
    window_size = 2
    batch_size = 512
    total_negative_samples = 10000

    # Prepare dataset and dataloader
    dataset = Word2VecSentenceDataset(corpus, window_size)
    print("1111111111")
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print("model loaded")
    return dataset, dataloader


In [85]:

def train(dataset, dataloader):

     embedding_dim = 20
     learning_rate = 0.01
     epochs = 10

     vocab_size = len(dataset.vocab)
     model = Word2VecModel(vocab_size, embedding_dim)
     
     optimizer = optim.SparseAdam(model.parameters(), lr=learning_rate)



     # Example training loop
     for epoch in range(epochs):
          total_loss = 0
          for target, context, negatives in tqdm(dataloader):  
               model.zero_grad()
               loss = model(target, context, negatives)
               loss.backward()
               optimizer.step()
               total_loss += loss.item()
               
          
     print(f"Epoch {epoch}, Loss: {total_loss}")

     return model


In [103]:
dataset, dataloader = prepare_dataset()
model = train(dataset, dataloader)

preprocessed
1111111111
model loaded


 59%|█████▊    | 4102/7003 [00:57<00:40, 71.91it/s]


KeyboardInterrupt: 

In [96]:
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True) as prof:
    with record_function("model_inference"):
        model = train(dataset, dataloader)
print("xxxxxxxxxx")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


STAGE:2024-03-25 17:23:08 92629:1664419 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
100%|██████████| 14/14 [00:00<00:00, 26.90it/s]
100%|██████████| 14/14 [00:00<00:00, 55.40it/s]
100%|██████████| 14/14 [00:00<00:00, 27.98it/s]
100%|██████████| 14/14 [00:00<00:00, 31.78it/s]
100%|██████████| 14/14 [00:00<00:00, 43.92it/s]
100%|██████████| 14/14 [00:00<00:00, 55.88it/s]
100%|██████████| 14/14 [00:00<00:00, 55.77it/s]
100%|██████████| 14/14 [00:00<00:00, 57.32it/s]
100%|██████████| 14/14 [00:00<00:00, 52.41it/s]
100%|██████████| 14/14 [00:00<00:00, 32.12it/s]
[W CPUAllocator.cpp:235] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


Epoch 9, Loss: 26.99661350250244


STAGE:2024-03-25 17:23:14 92629:1664419 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-03-25 17:23:14 92629:1664419 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


xxxxxxxxxx
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         5.94%     212.166ms       100.00%        3.570s        3.570s     381.03 Kb     -55.21 Mb             1  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        55.56%        1.984s        70.86%        2.530s      16.866ms       3.79 Mb      -3.72 Mb           150  
                         Optimizer.step#SparseAdam.step         1.74%      62.265ms        15.51%     553.593ms       3.954ms     237.50 Kb

In [23]:
import cProfile
import pstats

# Create a Profile object
profiler = cProfile.Profile()
profiler.enable()

# The code you want to profile
prepare_dataset()

profiler.disable()

# Create Stats object
stats = pstats.Stats(profiler).sort_stats('time')

# Print the statistics
stats.print_stats()




preprocessed
1111111111
model loaded
         29628201 function calls (29628198 primitive calls) in 35.664 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   349583   19.370    0.000   21.320    0.000 {method 'choice' of 'numpy.random.mtrand.RandomState' objects}
  6127074    2.411    0.000    3.237    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/gensim/utils.py:200(<genexpr>)
  1080784    1.307    0.000    1.523    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/gensim/utils.py:270(simple_tokenize)
  1161192    1.265    0.000    1.789    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/nltk/tag/util.py:10(str2tuple)
   129847    1.063    0.000    4.300    0.000 {method 'join' of 'str' objects}
    57340    0.828    0.000    2.654    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/gensim/utils.py

<pstats.Stats at 0x105341090>

In [14]:
prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)

''

In [87]:
# Inspect embeddings
word_embeddings = model.target_embeddings.weight.data
print(word_embeddings[dataset.vocab["atlanta"]])

tensor([-0.9071,  0.0152,  0.6542, -0.3143,  0.8903,  1.1885,  1.0614,  0.0296,
         1.2233,  0.1220, -1.0490,  0.2780, -0.2735,  0.0874,  1.3916,  0.3399,
         0.8787, -0.8516,  0.5996, -0.4491])


In [88]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(word, word_to_int, int_to_word, embeddings, top_n=5):
    # Get the embedding for the given word
    word_idx = word_to_int[word]
    word_embedding = embeddings[word_idx].reshape(1, -1)
    
    # Calculate cosine similarity between this word and all other words in the vocabulary
    similarities = []
    for i in range(len(embeddings)):
        other_word_embedding = embeddings[i].reshape(1, -1)
        similarity = cosine_similarity(word_embedding, other_word_embedding)[0][0]
        similarities.append((i, similarity))
    
    # Sort by similarity
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Convert indices back to words and filter out the input word
    similar_words = [(int_to_word[sim[0]], sim[1]) for sim in similarities if sim[0] != word_idx]
    
    # Return the top N most similar words, excluding the word itself
    return similar_words[:top_n]




In [91]:
# Example usage
word = 'book'  # The word you want to find similar words for
similar_words = find_most_similar(word, dataset.vocab, dataset.index_to_word, word_embeddings, top_n=5)
print(f"Most similar words to '{word}':", similar_words)

Most similar words to 'book': [('balance', 0.8909827), ('gaggle', 0.88423204), ('announcement', 0.8836107), ('drummer', 0.88299674), ('chopped', 0.87900347)]
