In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from nltk.corpus import brown
from  gensim.utils import simple_preprocess
from tqdm import tqdm
import random



In [2]:
# Keep the corpus token view, plus one lower-cased string of the whole corpus
# (tokens joined by single spaces) that the vocabulary count below works on.
brown_words = brown.words()
brown_text = " ".join(brown_words).lower()

In [3]:
# Count the sentences NLTK exposes for the Brown corpus and report the total.
num_sents = len(brown.sents())
print("number of sentences:", num_sents)

number of sentences: 57340


In [4]:
# Number of distinct tokens gensim's simple_preprocess produces for the whole
# lower-cased corpus text.
len(set(simple_preprocess(brown_text.lower())))

41239

In [5]:
# Scratch check: listing a dict yields its keys in insertion order.
a = {"x": 4, "y": 3}
list(a)

['x', 'y']

In [6]:
class Word2VecModel(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Two embedding tables are kept: one is looked up for target words, the
    other for both context words and negative samples. Both tables are created
    with ``sparse=True``, so their gradients are sparse and the model must be
    paired with an optimizer that accepts sparse gradients (for example
    ``optim.SparseAdam``).
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create both embedding tables and initialise them uniformly in [-1, 1).

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: length of every embedding vector.
        """
        super().__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)

        # Initialize weights
        self.target_embeddings.weight.data.uniform_(-1, 1)
        self.context_embeddings.weight.data.uniform_(-1, 1)

    def forward(self, target_words, context_words, negative_words):
        """Return the mean negative-sampling loss of a batch.

        Args:
            target_words: index tensor of shape ``(batch,)``.
            context_words: index tensor of shape ``(batch,)``.
            negative_words: index tensor of shape ``(batch, num_negatives)``.

        Returns:
            Scalar tensor: minus the batch mean of
            ``log sigmoid(t . c) + sum_k log sigmoid(-t . n_k)``.
        """
        target_embeds = self.target_embeddings(target_words)        # (batch, dim)
        context_embeds = self.context_embeddings(context_words)     # (batch, dim)
        negative_embeds = self.context_embeddings(negative_words)   # (batch, k, dim)

        positive_score = torch.sum(target_embeds * context_embeds, dim=1)  # (batch,)

        # Squeeze ONLY the trailing singleton dimension produced by bmm. A bare
        # squeeze() would also drop the batch dimension when batch == 1 (or the
        # negatives dimension when k == 1) and break the sum over dim=1 below.
        negative_score = torch.bmm(
            negative_embeds, target_embeds.unsqueeze(2)
        ).squeeze(2)  # (batch, k)

        # logsigmoid is the numerically stable form of log(sigmoid(x)); the
        # naive composition underflows to log(0) = -inf for large |scores|.
        log_sigmoid = nn.functional.logsigmoid
        positive_term = log_sigmoid(positive_score)
        negative_term = torch.sum(log_sigmoid(-negative_score), dim=1)

        return -torch.mean(positive_term + negative_term)


In [7]:
def preprocess_brown_corpus():
    """Tokenise every Brown-corpus sentence with gensim's ``simple_preprocess``.

    The tokens of each sentence are joined with single spaces and the resulting
    string is passed to ``simple_preprocess`` with ``deacc=True``.

    Returns:
        list: one ``simple_preprocess`` result per sentence, in corpus order.
    """
    return [
        simple_preprocess(' '.join(tokens), deacc=True)
        for tokens in brown.sents()
    ]

In [13]:
def prepare_dataset(window_size=2, batch_size=5000):
    """Preprocess the Brown corpus and wrap it in a dataset and a DataLoader.

    NOTE: ``Word2VecSentenceDataset`` comes from the local ``dataset`` module,
    which this notebook imports in a later cell; that import must have run
    before this function is called.

    Args:
        window_size: second argument passed to ``Word2VecSentenceDataset``.
            Defaults to 2, the value that used to be hard-coded here.
        batch_size: number of samples per DataLoader batch. Defaults to 5000,
            the value that used to be hard-coded here.

    Returns:
        tuple: ``(dataset, dataloader)`` where ``dataloader`` iterates over
        ``dataset`` in shuffled batches.
    """
    corpus = preprocess_brown_corpus()
    print("preprocessed")

    # Prepare dataset and dataloader
    dataset = Word2VecSentenceDataset(corpus, window_size)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print("model loaded")
    return dataset, dataloader


In [9]:

def train(dataset, dataloader, embedding_dim=20, learning_rate=0.01, epochs=10):
    """Train a fresh ``Word2VecModel`` and return it.

    Args:
        dataset: object exposing a ``vocab`` attribute; ``len(dataset.vocab)``
            is used as the vocabulary size of the model.
        dataloader: iterable yielding ``(target, context, negatives)`` batches
            that are passed straight to the model's ``forward``.
        embedding_dim: length of the embedding vectors (default 20).
        learning_rate: learning rate given to the optimizer (default 0.01).
        epochs: number of full passes over ``dataloader`` (default 10).

    Returns:
        The trained ``Word2VecModel``.
    """
    vocab_size = len(dataset.vocab)
    model = Word2VecModel(vocab_size, embedding_dim)

    # The model's embeddings are created with sparse=True, so an optimizer that
    # handles sparse gradients is required.
    optimizer = optim.SparseAdam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # Sum (not mean) of the per-batch losses, so the printed value scales
        # with the number of batches in the dataloader.
        total_loss = 0
        for target, context, negatives in tqdm(dataloader):
            model.zero_grad()
            loss = model(target, context, negatives)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch}, Loss: {total_loss}")

    return model


In [14]:
from dataset import Word2VecSentenceDataset

# Build the dataset and dataloader, then train a fresh model on them.
dataset, dataloader = prepare_dataset()
model = train(dataset, dataloader)

preprocessed
model loaded


100%|██████████| 718/718 [01:21<00:00,  8.79it/s]


Epoch 0, Loss: 849.9680168926716


100%|██████████| 718/718 [01:25<00:00,  8.41it/s]


Epoch 1, Loss: 245.8645165860653


100%|██████████| 718/718 [01:24<00:00,  8.47it/s]


Epoch 2, Loss: 205.25545005500317


100%|██████████| 718/718 [01:24<00:00,  8.52it/s]


Epoch 3, Loss: 192.76511053740978


100%|██████████| 718/718 [01:38<00:00,  7.28it/s]


Epoch 4, Loss: 185.80170972645283


100%|██████████| 718/718 [01:38<00:00,  7.29it/s]


Epoch 5, Loss: 180.20911352336407


100%|██████████| 718/718 [01:18<00:00,  9.10it/s]


Epoch 6, Loss: 175.49111561477184


100%|██████████| 718/718 [01:13<00:00,  9.78it/s]


Epoch 7, Loss: 171.24921622872353


100%|██████████| 718/718 [01:13<00:00,  9.74it/s]


Epoch 8, Loss: 167.71086248755455


100%|██████████| 718/718 [01:17<00:00,  9.32it/s]

Epoch 9, Loss: 164.59217229485512





In [12]:
from dataset import Word2VecSentenceDataset

# NOTE(review): same two calls as the previous cell, left over from an earlier
# run (execution count 12); its recorded output ends in a KeyboardInterrupt
# after "Epoch 3". Running it rebinds dataset, dataloader and model.
dataset, dataloader = prepare_dataset()
model = train(dataset, dataloader)

preprocessed
model loaded


100%|██████████| 7003/7003 [01:21<00:00, 85.52it/s]


Epoch 0, Loss: 5024.479638680816


100%|██████████| 7003/7003 [01:15<00:00, 92.91it/s]


Epoch 1, Loss: 2165.785257384181


100%|██████████| 7003/7003 [01:19<00:00, 88.27it/s]


Epoch 2, Loss: 1943.2130392044783


100%|██████████| 7003/7003 [01:21<00:00, 85.84it/s] 


Epoch 3, Loss: 1847.0670467466116


  7%|▋         | 467/7003 [00:06<01:24, 77.00it/s]


KeyboardInterrupt: 

In [96]:
from torch.profiler import profile, record_function, ProfilerActivity

# Profile a complete train() run, requesting CPU and CUDA activity plus memory
# statistics.
# NOTE(review): this rebinds `model` to the model returned by the profiled run.
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True) as prof:
    # Label for the whole training call; it shows up as a row in the table below.
    with record_function("model_inference"):
        model = train(dataset, dataloader)
print("xxxxxxxxxx")  # visual separator between training output and the table
# Show the ten entries with the largest total CPU time.
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


STAGE:2024-03-25 17:23:08 92629:1664419 ActivityProfilerController.cpp:312] Completed Stage: Warm Up
100%|██████████| 14/14 [00:00<00:00, 26.90it/s]
100%|██████████| 14/14 [00:00<00:00, 55.40it/s]
100%|██████████| 14/14 [00:00<00:00, 27.98it/s]
100%|██████████| 14/14 [00:00<00:00, 31.78it/s]
100%|██████████| 14/14 [00:00<00:00, 43.92it/s]
100%|██████████| 14/14 [00:00<00:00, 55.88it/s]
100%|██████████| 14/14 [00:00<00:00, 55.77it/s]
100%|██████████| 14/14 [00:00<00:00, 57.32it/s]
100%|██████████| 14/14 [00:00<00:00, 52.41it/s]
100%|██████████| 14/14 [00:00<00:00, 32.12it/s]
[W CPUAllocator.cpp:235] Memory block of unknown size was allocated before the profiling started, profiler results will not include the deallocation event


Epoch 9, Loss: 26.99661350250244


STAGE:2024-03-25 17:23:14 92629:1664419 ActivityProfilerController.cpp:318] Completed Stage: Collection
STAGE:2024-03-25 17:23:14 92629:1664419 ActivityProfilerController.cpp:322] Completed Stage: Post Processing


xxxxxxxxxx
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg       CPU Mem  Self CPU Mem    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                        model_inference         5.94%     212.166ms       100.00%        3.570s        3.570s     381.03 Kb     -55.21 Mb             1  
enumerate(DataLoader)#_SingleProcessDataLoaderIter._...        55.56%        1.984s        70.86%        2.530s      16.866ms       3.79 Mb      -3.72 Mb           150  
                         Optimizer.step#SparseAdam.step         1.74%      62.265ms        15.51%     553.593ms       3.954ms     237.50 Kb

In [23]:
import cProfile
import pstats

# Profile one prepare_dataset() call. Used as a context manager, the profiler
# is enabled on entry and always disabled on exit, so it is not left running
# if prepare_dataset() raises.
with cProfile.Profile() as profiler:
    prepare_dataset()

# Order the collected statistics by internal time ('time').
stats = pstats.Stats(profiler).sort_stats('time')

# Print the statistics (print_stats() returns the Stats object, so as the last
# expression of the cell its repr is displayed as well).
stats.print_stats()




preprocessed
1111111111
model loaded
         29628201 function calls (29628198 primitive calls) in 35.664 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   349583   19.370    0.000   21.320    0.000 {method 'choice' of 'numpy.random.mtrand.RandomState' objects}
  6127074    2.411    0.000    3.237    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/gensim/utils.py:200(<genexpr>)
  1080784    1.307    0.000    1.523    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/gensim/utils.py:270(simple_tokenize)
  1161192    1.265    0.000    1.789    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/nltk/tag/util.py:10(str2tuple)
   129847    1.063    0.000    4.300    0.000 {method 'join' of 'str' objects}
    57340    0.828    0.000    2.654    0.000 /Users/tevfikaytekin/miniforge3/envs/pytorch/lib/python3.11/site-packages/gensim/utils.py

<pstats.Stats at 0x105341090>

In [14]:
# Render the profiler summary table as the cell's value (no print()).
# NOTE(review): depends on `prof` created by the torch.profiler cell; the
# output recorded for this cell is an empty string.
prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)

''

In [15]:
# Inspect embeddings: take the raw weight tensor of the model's target-embedding
# table and print the row stored for the word "atlanta".
word_embeddings = model.target_embeddings.weight.data
print(word_embeddings[dataset.vocab["atlanta"]])

tensor([ 2.8405e-01,  9.8905e-02, -3.3643e-01, -1.5944e+00,  8.0877e-01,
        -2.1713e+00,  7.2223e-01, -6.2445e-01, -1.0976e+00,  2.6209e-01,
         9.4365e-04,  2.1053e+00,  8.8264e-04,  2.4814e-01, -7.1435e-01,
         8.6160e-01,  3.6904e-01, -1.6685e+00, -9.4704e-02, -1.3241e+00])


In [16]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(word, word_to_int, int_to_word, embeddings, top_n=5):
    """Return the ``top_n`` words whose embeddings are closest to ``word``.

    Closeness is cosine similarity, computed for the whole vocabulary in one
    vectorised numpy operation instead of one library call per row.

    Args:
        word: query word; must be a key of ``word_to_int``.
        word_to_int: mapping from word to its row index in ``embeddings``.
        int_to_word: mapping (or sequence) from row index back to the word.
        embeddings: 2-D array-like with one embedding per row. Objects exposing
            ``detach()`` (e.g. torch tensors) are converted to numpy first.
        top_n: maximum number of neighbours to return.

    Returns:
        list of ``(word, similarity)`` tuples sorted by decreasing similarity,
        never containing the query word itself. Ties keep row-index order.

    Raises:
        KeyError: if ``word`` is not in ``word_to_int``.
    """
    word_idx = word_to_int[word]

    if hasattr(embeddings, "detach"):
        # Tensor-like input: move to CPU and view it as a numpy array.
        matrix = embeddings.detach().cpu().numpy()
    else:
        matrix = np.asarray(embeddings)

    # Row norms; a zero row gets norm 1 so its similarity is 0 instead of NaN.
    norms = np.linalg.norm(matrix, axis=1)
    norms[norms == 0] = 1.0

    # Cosine similarity of every row against the query row in one product.
    similarities = (matrix @ matrix[word_idx]) / (norms * norms[word_idx])

    # A stable sort on the negated scores gives descending order while keeping
    # index order among equal scores.
    ranked = np.argsort(-similarities, kind="stable")

    # Drop the query word itself, then keep the top N.
    ranked = ranked[ranked != word_idx][:top_n]

    return [(int_to_word[int(i)], similarities[i]) for i in ranked]




In [19]:
# Example usage: the five nearest neighbours of one query word, ranked by
# cosine similarity between rows of the target-embedding table.
word = 'angry'  # The word you want to find similar words for
similar_words = find_most_similar(word, dataset.vocab, dataset.index_to_word, word_embeddings, top_n=5)
print(f"Most similar words to '{word}':", similar_words)

Most similar words to 'angry': [('goggle', 0.88218176), ('sup', 0.87773263), ('psychosomatic', 0.87381506), ('maximilian', 0.8635538), ('repent', 0.8627277)]
