# FastText
FastText is a word embedding technique that breaks down words into n grams and creates the word embeddings for those smaller units\
This helps FastText to also find out ebmedding for previously unseen words by using the smaller parts of the word

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, window_size, embedding_size):
        super(SkipGram, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_size)
        self.linear = nn.Linear(embedding_size, vocab_size)

    def forward(self, target):
        target_embedding = self.embeddings(target)
        res = self.linear(target_embedding)
        return res

In [3]:
window_size = 3
doc = [
    "<i am henry>",
    "<i like college>",
    "<do henry like college>",
    "<i am do i like college>",
    "<i do like henry>",
    "<do i like henry>",
]
raw_text = " ".join(doc)
tokens = raw_text.split(" ")

In [4]:
def get_new_tokens(tok):
    char_tokens = []
    for token in tok:
        if len(token) < window_size:
            char_tokens.append(token)
        for i in range(0, len(token) - window_size + 1):
            char_tokens.append(token[i : i + window_size])
    return char_tokens

In [5]:
char_tokens = get_new_tokens(tokens)
print(char_tokens[:10])
vocab = set(char_tokens)
vocab_size = len(vocab)

['<i', 'am', 'hen', 'enr', 'nry', 'ry>', '<i', 'lik', 'ike', 'col']


In [6]:
data = []
word_index = {word: i for i, word in enumerate(vocab)}

for i in range(window_size, len(char_tokens) - window_size):
    context = word_index[char_tokens[i]]
    for j in range(-window_size, window_size + 1):
        if j == 0:
            continue
        data.append((context, word_index[char_tokens[i + j]]))
print(data[:10])

[(3, 16), (3, 0), (3, 1), (3, 13), (3, 14), (3, 16), (13, 0), (13, 1), (13, 3), (13, 14)]


In [7]:
embed_size = 10
learning_rate = 0.01
epochs = 1000

model = SkipGram(vocab_size, window_size, embed_size)
lossfn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

In [8]:
for epoch in range(epochs):
    total_loss = 0
    for context, target in data:
        optimizer.zero_grad()
        output = model(torch.tensor([context]))
        loss = lossfn(output, torch.tensor([target]))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 50 == 0:
        print(epoch, total_loss / len(data))

0 2.929987609791918
50 2.3488601352892764
100 2.2823653622549407
150 2.261180039165782
200 2.253301527629904
250 2.249035820263584
300 2.2461052427486496
350 2.2436619603714973
400 2.241289444521171
450 2.2389877912949543
500 2.2370689385602263
550 2.235602776614987
600 2.234455091207206
650 2.2335173297090596
700 2.2327251633008323
750 2.2320393364445694
800 2.2314340431673996
850 2.230892019206975
900 2.230401324982546
950 2.2299533826964244


In [9]:
word_to_lookup = "henbenry"
lookup = get_new_tokens([word_to_lookup])
res = []
for lu in lookup:
    if lu in word_index.keys():
        wi = word_index[lu]
        embedding = model.embeddings(torch.tensor([wi]))
        res.append(embedding.detach().numpy()[0])
        print(f"Embedding for '{lu}': {embedding.detach().numpy()}")

print(f"Embedding for {word_to_lookup}: {sum(res) / len(res)}")

Embedding for 'hen': [[-0.8204228   0.10082058 -0.7994618  -0.63440734 -0.32108185  0.3642752
   0.25619638  0.5068886   2.0844624  -0.01407014]]
Embedding for 'enr': [[-0.424241   -0.6773601  -0.7903334   0.26817885  0.25904417  1.2208512
  -0.5175885   1.9262754  -0.9821287  -0.88805467]]
Embedding for 'nry': [[ 0.29178938 -2.0117695  -0.89259404 -1.5418677  -0.04163222 -0.38947254
  -0.5865991  -0.7319726  -1.0225619  -0.28157222]]
Embedding for henbenry: [-0.3176248  -0.86276966 -0.8274631  -0.63603204 -0.03455663  0.39855132
 -0.28266373  0.56706387  0.02659062 -0.39456567]
