<a href="https://colab.research.google.com/github/yongminkim0501/DeepLearning/blob/main/NLP_HW1_SkeletonCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
##Word2vec using pytorch
import re

import numpy as np
text = '''Machine learning is the study of computer algorithms that improve automatically through experience. It is seen as a subset of artificial intelligence. Machine learning algorithms build a mathematical model based on sample data, known as  training data, in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or infeasible to develop conventional algorithms to perform the needed tasks.'''

# Tokenization
# Lower-case the corpus, then keep only runs of word characters. A plain
# whitespace split would leave punctuation glued to tokens, so "data," and
# "intelligence." would become vocabulary entries distinct from "data" and
# "intelligence".
text = text.lower()
words = re.findall(r"\w+", text)



In [None]:
from collections import Counter
from collections import defaultdict

## Build a dictionary that maps words to integers
# Index 0 goes to the most frequent word; words with equal counts keep the
# order in which they were first seen in the corpus.
counts = Counter(words)
vocab = [word for word, _ in counts.most_common()]
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))
vocab_size = len(word2idx)

In [None]:
import numpy as np
import torch
# Negative Sampling parameters
num_negatives = 5

# Raw occurrence count of every token in the corpus.
word_counts = defaultdict(int)
for token in words:
    word_counts[token] = word_counts[token] + 1

# Create a unigram distribution for negative sampling:
# relative frequencies raised to the 0.75 power, then renormalised to sum to 1.
# Entry i of the array belongs to the word with vocabulary index i.
word_counts_array = np.array(
    [word_counts[idx2word[index]] for index in range(vocab_size)]
)
relative_freq = word_counts_array / word_counts_array.sum()
smoothed_freq = np.power(relative_freq, 0.75)
unigram_dist = smoothed_freq / smoothed_freq.sum()

def get_negative_samples_exclude_context(context_idx, num_negatives, unigram_dist, vocab_size):
    """Draw negative word indices by rejection sampling.

    Indices are drawn one at a time from ``range(vocab_size)`` with
    probabilities ``unigram_dist``; a draw is kept only when it compares
    unequal to ``context_idx``. Sampling is with replacement, so the same
    index may appear more than once.

    Args:
        context_idx: value that accepted samples must differ from.
        num_negatives: number of samples to collect.
        unigram_dist: probability vector of length ``vocab_size``.
        vocab_size: number of candidate indices.

    Returns:
        A tensor built from the accepted samples, in draw order.
    """
    accepted = []
    still_needed = num_negatives
    while still_needed > 0:
        candidate = np.random.choice(vocab_size, p=unigram_dist)
        if candidate != context_idx:
            accepted.append(candidate)
            still_needed -= 1
    return torch.tensor(accepted)


# Context window size: number of neighbouring tokens considered on each side
# of a target word.
window_size = 2

def generate_pairs(tokenized_corpus, word2idx, window_size):
    """Build skip-gram training triples with negative samples.

    Every in-vocabulary token is treated as a target. Each in-vocabulary
    neighbour at most ``window_size`` positions to its left or right forms
    a positive (target, context) pair, and a fresh set of negative indices
    is drawn for that pair with the pair's own context index excluded.

    Reads the module-level ``num_negatives`` and ``unigram_dist`` and calls
    ``get_negative_samples_exclude_context`` for the sampling.

    Args:
        tokenized_corpus: sequence of word strings.
        word2idx: mapping from word to integer index; words missing from
            it are skipped both as targets and as contexts.
        window_size: number of positions on each side of the target that
            count as context.

    Returns:
        A list of ``(target, context, negatives)`` tuples. ``target`` and
        ``context`` are one-element tensors holding word indices;
        ``negatives`` is the tensor returned by the sampler.
    """
    res = []
    corpus_len = len(tokenized_corpus)
    vocab_len = len(word2idx)
    for idx, target_word in enumerate(tokenized_corpus):
        if target_word not in word2idx:
            continue
        target_idx = word2idx[target_word]
        # Define context window. `end` is an exclusive bound, so it must be
        # capped at len(corpus) (not len - 1) or the final token could never
        # be used as a context word.
        start = max(idx - window_size, 0)
        end = min(idx + window_size + 1, corpus_len)
        for position in range(start, end):
            if position == idx:
                continue
            context_word = tokenized_corpus[position]
            if context_word not in word2idx:
                continue
            context_idx = word2idx[context_word]
            # Sample per pair, passing the integer index of this pair's
            # context so the true context is actually excluded.
            negative_words = get_negative_samples_exclude_context(
                context_idx, num_negatives, unigram_dist, vocab_len
            )
            # Wrap the plain ints exactly once; the sampler already returns
            # a tensor, so it is stored as-is.
            res.append(
                (torch.tensor([target_idx]), torch.tensor([context_idx]), negative_words)
            )
    return res

# Use the configured window size rather than a hard-coded literal so the
# setting above actually controls the generated pairs.
pairs = generate_pairs(words, word2idx, window_size)
# Show only the first few triples; printing the full list floods the output.
print(pairs[:5])

[(tensor([2]), tensor([3]), tensor([48, 32, 26, 35, 52])), (tensor([2]), tensor([4]), tensor([48, 32, 26, 35, 52])), (tensor([3]), tensor([2]), tensor([ 0, 18, 33, 17, 37])), (tensor([3]), tensor([4]), tensor([ 0, 18, 33, 17, 37])), (tensor([3]), tensor([8]), tensor([ 0, 18, 33, 17, 37])), (tensor([4]), tensor([2]), tensor([10, 31, 25, 10, 35])), (tensor([4]), tensor([3]), tensor([10, 31, 25, 10, 35])), (tensor([4]), tensor([8]), tensor([10, 31, 25, 10, 35])), (tensor([4]), tensor([14]), tensor([10, 31, 25, 10, 35])), (tensor([8]), tensor([3]), tensor([36,  6, 43, 44, 57])), (tensor([8]), tensor([4]), tensor([36,  6, 43, 44, 57])), (tensor([8]), tensor([14]), tensor([36,  6, 43, 44, 57])), (tensor([8]), tensor([5]), tensor([36,  6, 43, 44, 57])), (tensor([14]), tensor([4]), tensor([26, 49, 13,  6, 52])), (tensor([14]), tensor([8]), tensor([26, 49, 13,  6, 52])), (tensor([14]), tensor([5]), tensor([26, 49, 13,  6, 52])), (tensor([14]), tensor([9]), tensor([26, 49, 13,  6, 52])), (tensor

  negative_words = torch.tensor(negative_words)


In [None]:

import torch
import torch.nn as nn
from torch import optim
class Word2Vec(nn.Module):
    """Skip-gram word2vec model scored for negative sampling.

    Holds two embedding tables of shape (vocab_size, embedding_dim):
    ``input_embeddings`` is looked up for the target word and
    ``output_embeddings`` for context and negative words.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create both embedding tables and initialise their weights.

        Args:
            vocab_size: number of rows in each embedding table.
            embedding_dim: length of each embedding vector.
        """
        super(Word2Vec, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.input_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.output_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Initialize embeddings: small uniform noise for the input table and
        # all zeros for the output table.
        initrange = 0.5 / embedding_dim
        nn.init.uniform_(self.input_embeddings.weight, -initrange, initrange)
        nn.init.zeros_(self.output_embeddings.weight)

    def forward(self, input_word, context_word, negative_words=None):
        """Score a target word against its context and optional negatives.

        Scores are raw dot products (logits); no sigmoid is applied, so they
        can be fed directly to a with-logits loss.

        Args:
            input_word: integer tensor of target indices, shape (B,).
            context_word: integer tensor of context indices, shape (B,).
            negative_words: optional integer tensor of negative indices,
                shape (K,) or (B, K).

        Returns:
            A ``(positive_score, negative_score)`` tuple. ``positive_score``
            has shape (B,). ``negative_score`` has shape (B, K), or is
            ``None`` when ``negative_words`` is not given.
        """
        center = self.input_embeddings(input_word)        # (B, D)
        positive = self.output_embeddings(context_word)   # (B, D)
        positive_score = (center * positive).sum(dim=-1)  # (B,)

        if negative_words is None:
            return positive_score, None

        negatives = self.output_embeddings(negative_words)  # (K, D) or (B, K, D)
        # Insert a K axis on the target vectors so they broadcast against
        # every negative vector.
        negative_score = (negatives * center.unsqueeze(-2)).sum(dim=-1)  # (B, K)
        return positive_score, negative_score



In [None]:
import random
# Initialize model, loss, optimizer
model = Word2Vec(len(word2idx), 10)
learning_rate = 0.001
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Number of full passes over the training pairs.
num_epochs = 100

for epoch in range(num_epochs):
    total_loss = 0
    for target, context, negative_words in pairs:
        # Zero gradients
        optimizer.zero_grad()

        # Forward pass: the model is expected to return raw (pre-sigmoid)
        # scores for the observed pair and for each negative sample.
        positive_logits, negative_logits = model(target, context, negative_words)

        # Labels for positive/negative: 1 for the observed (target, context)
        # pair, 0 for every sampled negative.
        positive_labels = torch.ones_like(positive_logits)
        negative_labels = torch.zeros_like(negative_logits)

        # Compute loss for positive/negative and do summation
        positive_loss = loss_function(positive_logits, positive_labels)
        negative_loss = loss_function(negative_logits, negative_labels)
        loss = positive_loss + negative_loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")


In [None]:
from torch.nn.functional import cosine_similarity
# Get the word embeddings (the input-side table of the model)
word_embeddings = model.input_embeddings.weight.data

# Example: look up and display the embedding of a single word
word = 'machine'
if word not in word2idx:
    print(f"Word '{word}' not in vocabulary.")
else:
    word_idx = word2idx[word]
    word_vector = word_embeddings[word_idx]
    print(f"Vector for '{word}':\n{word_vector}")

def find_similar(word, word_embeddings, word2idx, idx2word, top_k=5):
    """Print the ``top_k`` words whose embeddings are closest to ``word``.

    Closeness is cosine similarity between the query word's row of
    ``word_embeddings`` and every other row. The query word itself is
    excluded before the list is truncated, so ``top_k`` neighbours are
    reported whenever the vocabulary has that many other words.

    Args:
        word: query word.
        word_embeddings: 2-D tensor with one embedding row per word index.
        word2idx: mapping from word to row index.
        idx2word: mapping from row index back to word.
        top_k: maximum number of neighbours to report.

    Returns:
        None. The result, or a not-in-vocabulary message, is printed.
    """
    if word not in word2idx:
        print(f"Word '{word}' not in vocabulary.")
        return
    word_idx = word2idx[word]
    word_vec = word_embeddings[word_idx].unsqueeze(0)
    similarities = cosine_similarity(word_vec, word_embeddings, dim=-1)
    # Rank the whole vocabulary, drop the query by index, then truncate.
    # Truncating first and filtering afterwards would yield top_k - 1 words.
    ranked_indices = similarities.argsort(descending=True).tolist()
    similar_words = [idx2word[idx] for idx in ranked_indices if idx != word_idx][:top_k]
    print(f"Top {top_k} words similar to '{word}': {similar_words}")

# Example: report the three nearest neighbours of a couple of query words
for query_word in ('learning', 'machine'):
    find_similar(query_word, word_embeddings, word2idx, idx2word, top_k=3)


Vector for 'machine':
tensor([-1.5444, -2.1195,  1.5672,  0.0539, -0.3602,  1.4793,  1.1963,  1.4893,
        -0.5970, -1.9585])
Top 3 words similar to 'learning': ['used', 'filtering']
Top 3 words similar to 'machine': ['are', 'intelligence.']
