In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
from nltk.corpus import brown
from  gensim.utils import simple_preprocess
from tqdm import tqdm



In [8]:
brown_words = brown.words()
brown_text = " ".join(brown.words()).lower()

In [34]:
num_sents = len(brown.sents())
print("number of sentences:", num_sents)

number of sentences: 57340


In [74]:
# Define the Dataset
class Word2VecDataset_bak(Dataset):
    def __init__(self, text, window_size=2):
        self.text = simple_preprocess(text)
        self.word_counts = Counter(self.text)
        self.word_list = sorted(self.word_counts, key=self.word_counts.get, reverse=True)
        self.word_to_int = {word: idx for idx, word in enumerate(self.word_list)}
        self.int_to_word = {idx: word for word, idx in self.word_to_int.items()}
        self.data = []
        for i, word in enumerate(self.text):
            for j in range(i - window_size, i + window_size + 1):
                if j != i and j >= 0 and j < len(self.text):
                    self.data.append((self.word_to_int[word], self.word_to_int[self.text[j]]))
        self.data = torch.tensor(self.data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]



In [75]:

class Word2VecDataset(Dataset):
    def __init__(self, text, window_size=2, negative_samples=5):
        self.tokens = simple_preprocess(text.lower())
        self.freq_dist = Counter(self.tokens)
        self.vocab = {word: i for i, word in enumerate(self.freq_dist.keys())}
        self.index_to_word = {i: word for word, i in self.vocab.items()}
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.data = self.generate_training_data()
        self.word_frequencies = np.array(list(self.freq_dist.values()))**0.75
        self.word_frequencies /= self.word_frequencies.sum()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target, context = self.data[idx]
        negatives = self.get_negative_samples(target)
        return torch.tensor(target, dtype=torch.long), torch.tensor(context, dtype=torch.long), torch.tensor(negatives, dtype=torch.long)

    def generate_training_data(self):
        positive_pairs = []
        for i, word in enumerate(self.tokens):
            target = self.vocab[word]
            context_indices = list(range(max(0, i - self.window_size), min(len(self.tokens), i + self.window_size + 1)))
            context_indices.remove(i)
            for context_index in context_indices:
                context = self.vocab[self.tokens[context_index]]
                positive_pairs.append((target, context))
        return positive_pairs

    def get_negative_samples(self, target):
        negatives = []
        while len(negatives) < self.negative_samples:
            neg_sample = np.random.choice(len(self.vocab), p=self.word_frequencies)
            if neg_sample != target:
                negatives.append(neg_sample)
        return negatives


In [117]:
len(np.unique(simple_preprocess(brown_text.lower())))

41239

In [113]:
len(np.unique([word for sentence in corpus for word in sentence]))

41239

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter

class Word2VecSentenceDataset(Dataset):
    def __init__(self, sentences, window_size=2, negative_samples=5):
        """
        sentences: List of sentences, where each sentence is a list of words.
        window_size: The size of the context window.
        negative_samples: Number of negative samples to generate for each positive pair.
        """
        self.tokens = [word for sentence in sentences for word in sentence]  # Flatten the list of sentences
        self.vocab = {word: i for i, word in enumerate(set(self.tokens))}
        self.index_to_word = {i: word for word, i in self.vocab.items()}
        self.window_size = window_size
        self.negative_samples = negative_samples
        self.word_frequencies = np.array([freq for word, freq in Counter(self.tokens).items()])**0.75
        self.word_frequencies /= self.word_frequencies.sum()
        self.data = self.generate_training_data(sentences)

    def generate_training_data(self, sentences):
        positive_pairs = []
        for sentence in sentences:
            for i, target_word in enumerate(sentence):
                target_index = self.vocab[target_word]
                context_indices = range(max(0, i - self.window_size), min(len(sentence), i + self.window_size + 1))
                for j in context_indices:
                    if i != j:  # Exclude the target word itself
                        context_word = sentence[j]
                        context_index = self.vocab[context_word]
                        positive_pairs.append((target_index, context_index))
        return positive_pairs

    def get_negative_samples(self, target, num_samples):
        negatives = []
        while len(negatives) < num_samples:
            neg_sample = np.random.choice(len(self.vocab), p=self.word_frequencies)
            if neg_sample != target:
                negatives.append(neg_sample)
        return negatives

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        target, context = self.data[idx]
        negatives = self.get_negative_samples(target, self.negative_samples)
        return torch.tensor(target, dtype=torch.long), torch.tensor(context, dtype=torch.long), torch.tensor(negatives, dtype=torch.long)


In [37]:
# Define the Skip-Gram Model
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_words):
        embedded = self.embeddings(input_words)
        scores = self.out(embedded)
        return scores

In [10]:
class Word2Vec(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.context_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)

        # Initialize weights
        self.target_embeddings.weight.data.uniform_(-1, 1)
        self.context_embeddings.weight.data.uniform_(-1, 1)

    def forward(self, target_words, context_words, negative_words):
        target_embeds = self.target_embeddings(target_words)
        context_embeds = self.context_embeddings(context_words)
        negative_embeds = self.context_embeddings(negative_words)

        positive_score = torch.sum(target_embeds * context_embeds, dim=1)
        negative_score = torch.bmm(negative_embeds, target_embeds.unsqueeze(2)).squeeze()

        return -torch.mean(torch.log(torch.sigmoid(positive_score)) + torch.sum(torch.log(torch.sigmoid(-negative_score)), dim=1))


In [78]:
corpus =brown_text[:10000]
window_size = 2

dataset = Word2VecDataset(corpus, window_size=2, negative_samples=5)
dataset.data[:10]

[(0, 1),
 (0, 2),
 (1, 0),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 3),
 (2, 4),
 (3, 1)]

In [39]:
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for data in dataloader:
    print(data)
    break
    

tensor([[  0,  75],
        [  3,   0],
        [198,  84],
        [ 64, 518]])


In [58]:
vocab_size

632

In [57]:
embedding_dim = 10
vocab_size = len(dataset.word_to_int)
model = SkipGramModel(vocab_size, embedding_dim)
output = model(data[:,0])
print(output.shape)
output

torch.Size([4, 632])


tensor([[ 0.0999, -0.4757,  0.5983,  ..., -0.6174,  0.2395, -0.3409],
        [ 0.9609, -0.8673,  0.5626,  ..., -0.6408, -0.4222,  0.2016],
        [ 0.3448, -0.0696,  0.1492,  ..., -0.9549,  0.2932, -0.6592],
        [ 0.4472,  0.0537,  0.9192,  ..., -0.5476,  0.4590, -0.1362]],
       grad_fn=<AddmmBackward0>)

In [42]:
nn.CrossEntropyLoss(torch.Tensor([0.1, 0.2, 0.4, 0.3]),[4])

CrossEntropyLoss()

In [67]:
len(brown_text)

6127073

In [11]:
def preprocess_brown_corpus():
    processed_sentences = []
    for sentence in brown.sents():
        processed_sentence = simple_preprocess(' '.join(sentence), deacc=True)  
        processed_sentences.append(processed_sentence)
    return processed_sentences

In [120]:
len(dataset.vocab)

41239

In [123]:
len(dataset.data)

3585262

In [5]:
corpus = preprocess_brown_corpus()

window_size = 2
# Prepare dataset and dataloader
dataset = Word2VecSentenceDataset(corpus, window_size)

In [6]:
dataset.__getitem__(0)

(tensor(3233), tensor(2826), tensor([30594,  6577,   413,   413,  7099]))

In [14]:
# Example corpus
corpus = preprocess_brown_corpus()

# Hyperparameters
embedding_dim = 20
learning_rate = 0.01
epochs = 10
window_size = 2
batch_size = 512

# Prepare dataset and dataloader
dataset = Word2VecSentenceDataset(corpus, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
print("model loaded")

vocab_size = len(dataset.vocab)
# Initialize model, optimizer, and loss function
model = Word2Vec(vocab_size, embedding_dim)
optimizer = optim.SparseAdam(model.parameters(), lr=learning_rate)



# Example training loop
for epoch in range(epochs):
    total_loss = 0
    for target, context, negatives in tqdm(dataloader):  
         model.zero_grad()
         loss = model(target, context, negatives)
         loss.backward()
         optimizer.step()
         total_loss += loss.item()
      
    print(f"Epoch {epoch}, Loss: {total_loss}")


model loaded


  0%|          | 14/7003 [00:15<2:09:43,  1.11s/it]


KeyboardInterrupt: 

In [110]:
negatives

tensor([[ 6767,   280,   501, 21428,    37],
        [   30,  8497,  9096, 16877,  7626],
        [ 9802,  1121, 11797, 39716, 16900],
        [    0, 21062,     9,     0,  1607]])

In [103]:
# Example corpus
corpus =brown_text[:100000]

# Hyperparameters
embedding_dim = 30
learning_rate = 0.01
epochs = 10
window_size = 2
batch_size = 4

# Initialize model, optimizer, and loss function
model = Word2Vec(vocab_size, embedding_dim)
optimizer = optim.SparseAdam(model.parameters(), lr=learning_rate)

# Prepare dataset and dataloader
dataset = Word2VecDataset(corpus, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
print("model loaded")

# Example training loop
for epoch in range(epochs):
    total_loss = 0
    for target, context, negatives in dataloader:  
        model.zero_grad()
        loss = model(target, context, negatives)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch}, Loss: {total_loss}")


model loaded


KeyboardInterrupt: 

In [106]:
negatives

tensor([[ 819,  996,  441,   47,  131],
        [  12,  555, 2352, 3036,  348],
        [ 269, 1704, 3078, 1600, 1262],
        [1962, 2490,  955,   39, 2875]])

In [69]:
# Example corpus
corpus = preprocess_brown_corpus()

# Hyperparameters
embedding_dim = 10
learning_rate = 0.001
epochs = 100
window_size = 2
batch_size = 4

# Prepare dataset and dataloader
dataset = Word2VecDataset(corpus, window_size)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
print("model loaded")

# Model, Loss, and Optimizer
vocab_size = len(dataset.word_to_int)
model = SkipGramModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for data in dataloader:
        targets, contexts = data[:, 0], data[:, 1]  # Correctly unpacking data
        targets, contexts = targets.to(torch.long), contexts.to(torch.long)  # Ensure correct type
        optimizer.zero_grad()
        output = model(targets)
        loss = criterion(output, contexts)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 1 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(dataloader)}')





model loaded
Epoch 0, Loss: 7.064614816512613
Epoch 1, Loss: 6.569237545064235
Epoch 2, Loss: 6.510200088207728
Epoch 3, Loss: 6.481425749521076
Epoch 4, Loss: 6.457907504252952
Epoch 5, Loss: 6.438017297239643
Epoch 6, Loss: 6.415325927111622
Epoch 7, Loss: 6.392766628606354
Epoch 8, Loss: 6.367758156926937
Epoch 9, Loss: 6.348035086840612
Epoch 10, Loss: 6.330607953604633


KeyboardInterrupt: 

In [90]:
# Inspect embeddings
word_embeddings = model.target_embeddings.weight.data
print(word_embeddings[dataset.vocab["atlanta"]])

tensor([ 0.1504,  0.1870, -0.1878, -1.4997,  0.7387,  0.1778, -0.0665,  0.3611,
         1.6426,  0.1221, -0.2383,  0.2758, -0.7139, -0.0731,  0.2115, -1.4121,
        -0.9097,  1.0919,  0.5057,  0.3861])


In [91]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(word, word_to_int, int_to_word, embeddings, top_n=5):
    # Get the embedding for the given word
    word_idx = word_to_int[word]
    word_embedding = embeddings[word_idx].reshape(1, -1)
    
    # Calculate cosine similarity between this word and all other words in the vocabulary
    similarities = []
    for i in range(len(embeddings)):
        other_word_embedding = embeddings[i].reshape(1, -1)
        similarity = cosine_similarity(word_embedding, other_word_embedding)[0][0]
        similarities.append((i, similarity))
    
    # Sort by similarity
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Convert indices back to words and filter out the input word
    similar_words = [(int_to_word[sim[0]], sim[1]) for sim in similarities if sim[0] != word_idx]
    
    # Return the top N most similar words, excluding the word itself
    return similar_words[:top_n]




In [93]:
# Example usage
word = 'friday'  # The word you want to find similar words for
similar_words = find_most_similar(word, dataset.vocab, dataset.index_to_word, word_embeddings, top_n=5)
print(f"Most similar words to '{word}':", similar_words)

Most similar words to 'friday': [('laws', 0.80120075), ('situation', 0.77872765), ('hotel', 0.75322247), ('presentments', 0.7503358), ('beaumont', 0.7498878)]
