<a href="https://colab.research.google.com/github/tcb7351/tcb7351/blob/20231206_2/%5BNLP_2023%5D_Word_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Corpus Preparation

In [None]:
import nltk
from nltk.corpus import brown
nltk.download('brown')

from collections import Counter

# Lower-case every token of every Brown sentence; each sentence stays its own list.
corpus = [[token.lower() for token in sentence] for sentence in brown.sents()]
# Show the first sentence as a sanity check.
print(corpus[0])

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


['the', 'fulton', 'county', 'grand', 'jury', 'said', 'friday', 'an', 'investigation', 'of', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']


In [None]:
# Frequency of every (lower-cased) token across the whole corpus.
vocab = Counter(token for sentence in corpus for token in sentence)

# Ids 0 and 1 are reserved for the unknown-word symbol and the "<s>" symbol.
word_to_id = {"<unk>": 0, "<s>": 1}
# most_common() is ordered by descending count, so keeping counts >= 20
# selects exactly the frequent prefix of that ordering.
for token, count in vocab.most_common():
    if count >= 20:
        word_to_id[token] = len(word_to_id)

print(len(word_to_id))

5013


# Using PyTorch

The code in this section is adapted from the PyTorch word-embeddings tutorial: https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
# Seed PyTorch's default random number generator.
torch.manual_seed(1)

True


<torch._C.Generator at 0x7f3e0f19a950>

In [None]:
# Length of each embedding vector.
EMBEDDING_DIM = 20
# One embedding row per entry of word_to_id (including "<unk>" and "<s>").
VOCAB_SIZE = len(word_to_id)

# Embedding table of shape (VOCAB_SIZE, EMBEDDING_DIM); nothing has been trained yet.
embeds = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM)
# The table is indexed with integer ids, so wrap the id of "country" in a long tensor.
lookup_tensor = torch.tensor([word_to_id["country"]], dtype=torch.long)
# Prints a (1, EMBEDDING_DIM) tensor: the table row for "country".
print(embeds(lookup_tensor))

tensor([[ 0.7584,  1.3751, -0.0024, -0.6730, -0.1112, -1.9805, -0.9982,  0.6384,
         -2.0778,  1.3597, -0.8795,  0.4062,  0.6346, -0.1749, -1.8811,  1.0915,
          0.2510, -0.1448,  0.4096, -0.9203]], grad_fn=<EmbeddingBackward0>)


# CBOW Language Model

In [None]:
class CBOW(nn.Module):
    """Context-window word predictor.

    The embeddings of the ``2 * context_size`` context words are concatenated,
    passed through one hidden ReLU layer and projected onto the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and of output classes.
        embedding_dim: length of each embedding vector.
        context_size: number of context words on EACH side of the target, so
            the model expects ``2 * context_size`` ids per example.
        hidden_dim: width of the hidden layer (defaults to 50).
    """

    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim=50):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * 2 * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary.

        Args:
            inputs: long tensor of word ids, either a single example of shape
                ``(2 * context_size,)`` or a batch of shape
                ``(batch, 2 * context_size)``.

        Returns:
            Tensor of shape ``(1, vocab_size)`` for a single example, or
            ``(batch, vocab_size)`` for a batch.
        """
        embeds = self.embeddings(inputs)
        if inputs.dim() > 1:
            # Batched input: flatten each example's context embeddings separately
            # instead of merging the whole batch into a single row.
            embeds = embeds.view(inputs.size(0), -1)
        else:
            # Single example: behave as a batch of one.
            embeds = embeds.view(1, -1)
        hidden = F.relu(self.linear1(embeds))
        logits = self.linear2(hidden)
        return F.log_softmax(logits, dim=1)

In [None]:
import tqdm

# Number of full passes over the training examples.
NUM_EPOCHS = 3
# Context words taken on EACH side of the target (2 * CONTEXT_SIZE inputs per example).
CONTEXT_SIZE = 3
# Length of each word embedding.
EMBEDDING_DIM = 20
# Number of embedding rows / output classes.
VOCAB_SIZE = len(word_to_id)

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def train():
    """Build the context/target examples from ``corpus`` and fit a CBOW model.

    For every token position a window of ``CONTEXT_SIZE`` words on each side is
    collected; positions outside the sentence are filled with "<s>" and words
    missing from ``word_to_id`` are mapped to "<unk>". The model is then
    trained one example at a time with SGD for ``NUM_EPOCHS`` epochs, printing
    the summed loss after each epoch.

    Returns:
        The trained ``CBOW`` model (on ``device``).
    """
    unk_id = word_to_id["<unk>"]

    def to_id(token):
        # Out-of-vocabulary words share the "<unk>" id.
        return word_to_id.get(token, unk_id)

    contexts, targets = [], []
    for sentence in corpus:
        length = len(sentence)
        for center, word in enumerate(sentence):
            # Ids of the neighbours of `word`, left to right, skipping the
            # centre position itself and padding past the edges with "<s>".
            window = [
                to_id(sentence[pos] if 0 <= pos < length else "<s>")
                for pos in range(center - CONTEXT_SIZE, center + CONTEXT_SIZE + 1)
                if pos != center
            ]
            contexts.append(window)
            targets.append([to_id(word)])
    print(len(contexts), len(targets))
    print(contexts[0], targets[0])

    context_tensor = torch.tensor(contexts, dtype=torch.long).to(device)
    target_tensor = torch.tensor(targets, dtype=torch.long).to(device)

    # Negative log-likelihood loss; pairs with the log_softmax output of CBOW.
    criterion = nn.NLLLoss()
    net = CBOW(VOCAB_SIZE, EMBEDDING_DIM, CONTEXT_SIZE).to(device)
    sgd = optim.SGD(net.parameters(), lr=0.001)

    for _ in tqdm.trange(NUM_EPOCHS):
        epoch_loss = 0
        # One optimizer step per (context window, target) pair.
        for window_ids, target_id in zip(context_tensor, target_tensor):
            net.zero_grad()
            loss = criterion(net(window_ids), target_id)
            loss.backward()
            sgd.step()
            epoch_loss += loss.item()
        print(epoch_loss)
    return net

# Build the examples and train; in the recorded run each epoch took roughly 16 minutes.
model = train()

1161192 1161192
[1, 1, 1, 0, 652, 2297] [2]


 33%|███▎      | 1/3 [15:50<31:40, 950.40s/it]

5829712.189111054


 67%|██████▋   | 2/3 [31:38<15:49, 949.19s/it]

5404481.153046466


100%|██████████| 3/3 [47:25<00:00, 948.47s/it]

5220841.668642633





In [None]:
# Show the learned embedding vector (one row of the embedding matrix) for "nice".
print(model.embeddings.weight[word_to_id["nice"]])

tensor([ 4.4142e-04,  3.4940e-01,  3.6838e-01,  1.6032e+00, -6.4265e-01,
         1.8571e+00,  6.1649e-01,  1.8254e+00, -5.8889e-01, -1.7467e+00,
         8.2054e-01, -1.0854e+00,  3.4049e-01, -8.1458e-01, -1.1703e-01,
        -2.8776e-01,  1.5662e+00,  1.2633e+00, -4.8334e-01, -6.0228e-02],
       device='cuda:0', grad_fn=<SelectBackward0>)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def similarity(w1, w2):
    """Cosine similarity between the learned embeddings of two vocabulary words.

    Args:
        w1: first word; must be a key of ``word_to_id``.
        w2: second word; must be a key of ``word_to_id``.

    Returns:
        The 1x1 array produced by ``cosine_similarity`` for the two vectors.

    Raises:
        KeyError: if either word is not in ``word_to_id``.
    """
    weights = model.embeddings.weight
    # Move each row to the CPU and drop it from the autograd graph before
    # handing it to scikit-learn as a NumPy array.
    vectors = [weights[word_to_id[word]].cpu().detach().numpy() for word in (w1, w2)]
    return cosine_similarity([vectors[0]], [vectors[1]])

In [None]:
# Compare "good" against three other vocabulary words, one result per line.
for other in ("bad", "excellent", "tree"):
    print(similarity("good", other))

[[-0.21657313]]
[[0.32453555]]
[[-0.05597425]]
