In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from nltk.corpus import brown
from  gensim.utils import simple_preprocess
from tqdm import tqdm
import random
from dataset import Word2VecSentenceDataset



In [3]:
brown_words = brown.words()
brown_text = " ".join(brown.words()).lower()

In [3]:
num_sents = len(brown.sents())
print("number of sentences:", num_sents)

number of sentences: 57340


In [4]:
len(np.unique(simple_preprocess(brown_text.lower())))

41239

In [5]:
a = {"x":4, "y":3}
list(a.keys())

['x', 'y']

In [5]:
class Word2VecModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2VecModel, self).__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)

        # Initialize weights
        self.target_embeddings.weight.data.uniform_(-1, 1)
        self.context_embeddings.weight.data.uniform_(-1, 1)

    def forward(self, target_words, context_words, negative_words):
        target_embeds = self.target_embeddings(target_words)
        context_embeds = self.context_embeddings(context_words)
        negative_embeds = self.context_embeddings(negative_words)
 

        positive_score = torch.sum(target_embeds * context_embeds, dim=1)
        negative_score = torch.bmm(negative_embeds, target_embeds.unsqueeze(2)).squeeze()
   
        return -torch.mean(torch.log(torch.sigmoid(positive_score)) + torch.sum(torch.log(torch.sigmoid(-negative_score)), dim=1))


In [6]:
def preprocess_brown_corpus():
    processed_sentences = []
    for sentence in brown.sents():
        processed_sentence = simple_preprocess(' '.join(sentence), deacc=True)  
        processed_sentences.append(processed_sentence)
    return processed_sentences

In [7]:
corpus = preprocess_brown_corpus()

In [14]:

dataset = Word2VecSentenceDataset(corpus, window_size=2, negative_samples=5, total_negative_samples=10000)

In [10]:
print(len(dataset.tokens))
dataset.tokens[:10]

981039


['the',
 'fulton',
 'county',
 'grand',
 'jury',
 'said',
 'friday',
 'an',
 'investigation',
 'of']

In [11]:
print(len(dataset.data))
dataset.data[:10]

3585262


[(26233, 27914, [31431, 233, 808, 24462, 84]),
 (26233, 36134, [541, 1670, 2709, 272, 133]),
 (27914, 26233, [21181, 189, 13242, 17735, 248]),
 (27914, 36134, [7817, 2524, 17259, 6651, 295]),
 (27914, 1268, [6283, 18, 791, 9, 2658]),
 (36134, 26233, [15651, 163, 26752, 382, 6714]),
 (36134, 27914, [77, 24434, 6909, 39571, 4480]),
 (36134, 1268, [3856, 28778, 9328, 1517, 6790]),
 (36134, 23855, [1855, 35741, 2812, 0, 47]),
 (1268, 27914, [23, 6248, 38490, 4090, 357])]

In [16]:
def prepare_dataset(batch_size=512, window_size=5, negative_samples=5, total_negative_samples=10000):
    # Example corpus
    corpus = preprocess_brown_corpus()
    print("preprocessed")
    # Hyperparameters
    

    # Prepare dataset and dataloader
    dataset = Word2VecSentenceDataset(corpus, window_size, negative_samples, total_negative_samples)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    print("model loaded")
    return dataset, dataloader


In [17]:

def train(dataset, dataloader, epochs = 10, embedding_dim = 20):

     
     learning_rate = 0.01

     vocab_size = len(dataset.vocab)
     model = Word2VecModel(vocab_size, embedding_dim)
     
     optimizer = optim.SparseAdam(model.parameters(), lr=learning_rate)



     # Example training loop
     for epoch in range(epochs):
          total_loss = 0
          for target, context, negatives in tqdm(dataloader):  
               model.zero_grad()
               loss = model(target, context, negatives)
               loss.backward()
               optimizer.step()
               total_loss += loss.item()
               print(loss.item())
          
          print(f"Epoch {epoch}, Loss: {total_loss}")

     return model


In [18]:

dataset, dataloader = prepare_dataset(batch_size=512, window_size=5, negative_samples=5, total_negative_samples=10000)


preprocessed
model loaded


In [19]:
model = train(dataset, dataloader, epochs=5, embedding_dim = 30)

  0%|          | 2/15925 [00:01<2:03:04,  2.16it/s]

6.118812084197998
6.028980731964111
6.191402912139893
6.031180381774902
5.911285400390625
6.005388259887695
6.007167816162109


  0%|          | 13/15925 [00:01<14:53, 17.82it/s] 

6.042276382446289
5.818857669830322
5.85860013961792
5.959985256195068
5.985980033874512
5.8998847007751465
5.8394880294799805
5.743626117706299
6.018033981323242
5.513816833496094
5.858908653259277


  0%|          | 25/15925 [00:01<08:14, 32.16it/s]

5.482419013977051
5.76649284362793
5.814390659332275
5.590760231018066
5.635128021240234
5.743924140930176
5.92970085144043
5.704176425933838
5.752326488494873
5.488264083862305
5.671124458312988
5.5727105140686035


  0%|          | 38/15925 [00:01<06:08, 43.12it/s]

5.724306106567383
5.686717987060547
5.469682693481445
5.637192249298096
5.618267059326172
5.520267486572266
5.357206344604492
5.532052040100098
5.579288482666016
5.504079341888428
5.59511137008667
5.421290397644043


  0%|          | 50/15925 [00:01<05:19, 49.70it/s]

5.529731750488281
5.488776683807373
5.491903305053711
5.340784072875977
5.432787895202637
5.507948398590088
5.32711935043335
5.42296838760376
5.379973888397217
5.3936686515808105
5.4688591957092285
5.363813877105713


  0%|          | 62/15925 [00:02<05:38, 46.80it/s]

5.376306533813477
5.443447113037109
5.362707614898682
5.394664764404297
5.330791473388672
5.315066337585449
5.209598541259766
5.289705276489258


  0%|          | 68/15925 [00:02<06:43, 39.34it/s]

5.258356094360352
5.4281792640686035
5.232609748840332
5.090364456176758
5.167248249053955
5.074770927429199
5.135268211364746
5.148684024810791
5.145553112030029


  0%|          | 77/15925 [00:03<10:32, 25.06it/s]

5.234455108642578
5.158916473388672
5.1948442459106445
5.187009334564209
5.044800281524658
5.196109771728516


  1%|          | 81/15925 [00:03<10:08, 26.04it/s]

4.965250015258789
4.943820953369141
4.967985153198242
5.148873329162598
5.027953147888184
4.978152275085449
5.0746541023254395


  1%|          | 89/15925 [00:03<08:44, 30.19it/s]

5.002896785736084
4.7722978591918945
5.003775596618652
4.933221340179443
4.972075939178467
4.864471435546875
5.097987174987793
4.9611029624938965
4.849127769470215


  1%|          | 99/15925 [00:03<07:27, 35.38it/s]

4.861113548278809
4.815537929534912
4.9008097648620605
4.871829986572266
4.904585838317871
4.88133430480957
4.9160237312316895
4.7247233390808105


  1%|          | 107/15925 [00:03<07:18, 36.08it/s]

4.949332237243652
4.73650598526001
4.816738128662109
4.790104866027832
4.803312301635742
4.774954795837402
4.824333190917969
4.786520957946777


  1%|          | 116/15925 [00:04<07:03, 37.35it/s]

4.71120023727417
4.615214824676514
4.711833477020264
4.6290788650512695
4.885652542114258
4.599153995513916
4.615567684173584
4.605502128601074


  1%|          | 125/15925 [00:04<06:30, 40.42it/s]

4.560319900512695
4.520665168762207
4.528570175170898
4.670169830322266
4.570208549499512
4.493706226348877
4.600646018981934
4.6966986656188965
4.469510078430176
4.525454998016357


  1%|          | 134/15925 [00:04<06:45, 38.90it/s]

4.390516757965088
4.60860538482666
4.424403190612793
4.383378028869629
4.167551040649414
4.471012115478516
4.322669982910156
4.363311767578125
4.3353729248046875


  1%|          | 143/15925 [00:05<11:33, 22.76it/s]

4.325016975402832
4.445796012878418
4.201120853424072
4.329587459564209
4.443479061126709
4.2860565185546875
4.2542829513549805


  1%|          | 152/15925 [00:05<08:53, 29.58it/s]

4.248330116271973
4.387728691101074
4.068804740905762
4.1061530113220215
4.211218357086182
4.220152378082275
4.283351898193359
4.194844722747803
4.190384387969971


  1%|          | 156/15925 [00:05<08:56, 29.39it/s]

4.234819412231445
4.171573162078857
4.113008499145508
4.211209297180176
3.9524598121643066
4.060318946838379


  1%|          | 164/15925 [00:05<08:22, 31.35it/s]

4.140848636627197
4.013000011444092
3.8698179721832275
3.8773651123046875
4.129438400268555
3.835348606109619
4.016602516174316
3.9354169368743896
3.776674270629883


  1%|          | 172/15925 [00:05<09:01, 29.09it/s]

4.013361930847168
3.954721450805664
3.7807536125183105
3.781461715698242
3.593843460083008





KeyboardInterrupt: 

In [None]:
from torch.profiler import profile, record_function, ProfilerActivity

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], profile_memory=True) as prof:
    with record_function("model_inference"):
        model = train(dataset, dataloader)
print("xxxxxxxxxx")
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


In [None]:
import cProfile
import pstats

# Create a Profile object
profiler = cProfile.Profile()
profiler.enable()

# The code you want to profile
prepare_dataset()

profiler.disable()

# Create Stats object
stats = pstats.Stats(profiler).sort_stats('time')

# Print the statistics
stats.print_stats()




In [14]:
prof.key_averages().table(sort_by="cpu_time_total", row_limit=10)

''

In [12]:
# Inspect embeddings
word_embeddings = model.target_embeddings.weight.data
print(word_embeddings[dataset.vocab["atlanta"]])

tensor([ 0.8798, -0.0981, -0.0641, -0.6053, -0.0116,  0.1742,  0.4074, -0.1351,
        -0.3801,  0.5048, -1.6327,  0.0111, -0.0943, -0.1630,  1.1513,  0.1729,
        -0.1996, -0.7478, -1.0341,  0.5263,  0.4483,  0.1727,  1.0539, -0.1429,
        -0.2299,  0.7460,  0.1045,  0.5899, -0.3256, -0.1430])


In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(word, word_to_int, int_to_word, embeddings, top_n=5):
    # Get the embedding for the given word
    word_idx = word_to_int[word]
    word_embedding = embeddings[word_idx].reshape(1, -1)
    
    # Calculate cosine similarity between this word and all other words in the vocabulary
    similarities = []
    for i in range(len(embeddings)):
        other_word_embedding = embeddings[i].reshape(1, -1)
        similarity = cosine_similarity(word_embedding, other_word_embedding)[0][0]
        similarities.append((i, similarity))
    
    # Sort by similarity
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Convert indices back to words and filter out the input word
    similar_words = [(int_to_word[sim[0]], sim[1]) for sim in similarities if sim[0] != word_idx]
    
    # Return the top N most similar words, excluding the word itself
    return similar_words[:top_n]




In [14]:
# Example usage
word = 'book'  # The word you want to find similar words for
similar_words = find_most_similar(word, dataset.vocab, dataset.index_to_word, word_embeddings, top_n=5)
print(f"Most similar words to '{word}':", similar_words)

Most similar words to 'book': [('comedie', 0.8072341), ('trempler', 0.8050022), ('often', 0.80381423), ('expe', 0.8015127), ('hors', 0.80094683)]


In [15]:
# Example usage
word = 'twenty'  # The word you want to find similar words for
similar_words = find_most_similar(word, dataset.vocab, dataset.index_to_word, word_embeddings, top_n=5)
print(f"Most similar words to '{word}':", similar_words)

Most similar words to 'twenty': [('thirty', 0.8679474), ('ten', 0.8436272), ('six', 0.8391596), ('laid', 0.8352573), ('five', 0.82961893)]
