In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from nltk.corpus import brown


In [22]:
# Fetch the Brown corpus word sequence once and reuse it for the joined text,
# rather than asking the corpus reader for the words a second time.
brown_words = brown.words()
# Space-joined, lower-cased corpus text; every token derived from it is lowercase.
brown_text = " ".join(brown_words).lower()

In [48]:
# Number of sentences the Brown corpus reader exposes via brown.sents().
num_sents = len(brown.sents())
print("number of sentences:", num_sents)

number of sentences: 57340


In [24]:
# Preview the first 100 characters of the lower-cased corpus text.
brown_text[:100]

"the fulton county grand jury said friday an investigation of atlanta's recent primary election produ"

In [2]:
# Define the Dataset
class Word2VecDataset(Dataset):
    """Skip-gram (center, context) index pairs built from whitespace-split text.

    The vocabulary is ordered by descending token frequency; tokens with equal
    counts keep their first-seen order because the sort is stable.  Each token
    is paired with every neighbour at most ``window_size`` positions away.

    Args:
        text: Raw text; tokens are obtained with ``str.split()``.
        window_size: Maximum distance between a center token and a context
            token on either side.
    """

    def __init__(self, text, window_size=2):
        self.text = text.split()
        self.word_counts = Counter(self.text)
        # Most frequent token first, so index 0 is the commonest word.
        self.word_list = sorted(self.word_counts, key=self.word_counts.get, reverse=True)
        self.word_to_int = {word: idx for idx, word in enumerate(self.word_list)}
        self.int_to_word = dict(enumerate(self.word_list))

        # Encode the token stream once so the pair loop only handles integers.
        encoded = [self.word_to_int[token] for token in self.text]
        n_tokens = len(encoded)

        pairs = []
        for i, center in enumerate(encoded):
            # Clamp the window to the text bounds instead of testing every j.
            lo = max(0, i - window_size)
            hi = min(n_tokens, i + window_size + 1)
            pairs.extend((center, encoded[j]) for j in range(lo, hi) if j != i)

        if pairs:
            self.data = torch.tensor(pairs, dtype=torch.long)
        else:
            # An empty or single-token text has no pairs.  Keep the (N, 2)
            # int64 layout so column indexing such as data[:, 0] still works.
            self.data = torch.empty((0, 2), dtype=torch.long)

    def __len__(self):
        """Return the number of (center, context) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as a length-2 tensor ``[center, context]``."""
        return self.data[idx]

# Define the Skip-Gram Model

In [3]:
class SkipGramModel(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear scoring layer.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            output scores per input word.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Layers are created in the same order (embeddings, then out) so that
        # parameter initialisation draws from the RNG in the same sequence.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_words):
        """Map word indices to unnormalized scores over the vocabulary."""
        return self.out(self.embeddings(input_words))

In [45]:
# Example corpus: the first `corpus_chars` characters of the lower-cased text.
corpus_chars = 10000
corpus = brown_text[:corpus_chars]
# A character slice can stop in the middle of a word.  If the next character
# is not whitespace, drop the dangling fragment so it does not become a bogus
# vocabulary entry.
if len(brown_text) > corpus_chars and not brown_text[corpus_chars].isspace():
    corpus = corpus.rsplit(" ", 1)[0]

# Hyperparameters
embedding_dim = 10
learning_rate = 0.001
epochs = 100
window_size = 2
batch_size = 4
seed = 0

# Seed the global torch RNG before anything stochastic is created, so that
# parameter initialisation and DataLoader shuffling repeat on a fresh run.
torch.manual_seed(seed)

# Prepare dataset and dataloader
dataset = Word2VecDataset(corpus, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
# Printed once the dataset and dataloader exist; the model is built below.
print("model loaded")

# Model, Loss, and Optimizer
vocab_size = len(dataset.word_to_int)
model = SkipGramModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0.0
    for batch in dataloader:
        # Column 0 holds center-word indices, column 1 the context-word indices.
        # The dataset tensor is built from Python ints and is already int64,
        # so no dtype cast is needed before the embedding lookup or the loss.
        targets, contexts = batch[:, 0], batch[:, 1]
        optimizer.zero_grad()
        output = model(targets)
        loss = criterion(output, contexts)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean per-batch loss every tenth epoch.
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')





model loaded
Epoch 0, Loss: 6.383079659562927
Epoch 10, Loss: 5.099413008034766
Epoch 20, Loss: 4.7253947921164405
Epoch 30, Loss: 4.472369408285296
Epoch 40, Loss: 4.304513565994598
Epoch 50, Loss: 4.1911586373373195
Epoch 60, Loss: 4.113703016628016
Epoch 70, Loss: 4.058059169566846
Epoch 80, Loss: 4.015699776711765
Epoch 90, Loss: 3.9841774989355794


In [31]:
# Inspect embeddings
# detach() returns the trained embedding matrix without gradient tracking.
word_embeddings = model.embeddings.weight.detach()
# The corpus text is lower-cased before the vocabulary is built, so the key
# must be lowercase too; "Atlanta's" is not in the vocabulary and raises KeyError.
print(word_embeddings[dataset.word_to_int["atlanta's"]])

tensor([-3.7393, -0.6095,  0.3763, -0.2495,  3.0213,  1.3814, -0.5919, -0.4371,
         0.9403,  0.7485])


In [39]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(word, word_to_int, int_to_word, embeddings, top_n=5):
    """Rank vocabulary words by cosine similarity to ``word``.

    Args:
        word: Query word; must be a key of ``word_to_int``.
        word_to_int: Mapping from word to its row index in ``embeddings``.
        int_to_word: Mapping from row index back to the word.
        embeddings: 2-D array-like with one embedding per row.  Anything
            ``numpy.asarray`` accepts works, including a CPU tensor that does
            not require grad.
        top_n: Number of neighbours to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted by descending
        similarity, with the query word itself excluded.  Words with equal
        similarity keep their index order.

    Raises:
        KeyError: If ``word`` is not in ``word_to_int``.
    """
    if word not in word_to_int:
        raise KeyError(f"{word!r} is not in the vocabulary")
    word_idx = word_to_int[word]

    matrix = np.asarray(embeddings)

    # Compute every similarity in one vectorised pass rather than one library
    # call per vocabulary row.
    norms = np.linalg.norm(matrix, axis=1)
    # An all-zero row has no direction.  Dividing by 1 gives it similarity 0
    # instead of NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)
    similarities = (matrix @ matrix[word_idx]) / (safe_norms * safe_norms[word_idx])

    # A stable sort on the negated scores gives descending order and keeps
    # index order among ties.
    ranking = np.argsort(-similarities, kind="stable")

    # Convert indices back to words and filter out the input word.
    similar_words = [
        (int_to_word[int(i)], similarities[i]) for i in ranking if i != word_idx
    ]

    # Return the top N most similar words, excluding the word itself.
    return similar_words[:top_n]




In [47]:
# Example usage: list the nearest neighbours of one query word.
word = 'friday'
similar_words = find_most_similar(
    word,
    dataset.word_to_int,
    dataset.int_to_word,
    word_embeddings,
    top_n=5,
)
print(f"Most similar words to '{word}':", similar_words)

Most similar words to 'friday': [('has', 0.7451347), ('had', 0.62919605), ('roads', 0.60290426), ('an', 0.5426837), ('there', 0.4983489)]


In [43]:
# NOTE(review): repeats the 100-character corpus preview shown earlier in the
# notebook; looks like a leftover cell — confirm whether it is still needed.
brown_text[:100]

"the fulton county grand jury said friday an investigation of atlanta's recent primary election produ"