## Bigram 
This file demonstrates a bigram model.

### Embedding layers

pytorch embedding layers act as a array of trainable parameters for a given vocabulary size. For example, if we have n numbers where we are using the the data in context and target as in the table below, the vocabulary size is n and we can have some embedding paramters depending on the size of the data we are trying to feed.


### context size = 1, target size = 1
The following example embeds the sequence of numbers up to 10 as the context and target.

|context|target|
|-|-|
|1|2|
|2|3|
|3|4|
|..|..|
|n|n+1|

In [10]:
import torch
import torch.nn as nn

import torch.nn.functional as F

vocab_size = 10
context_size = 1

input_tensor = []
for i in range(1,vocab_size-context_size):
    input_tensor.append([[i], [i+1]])

class BiGram(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim):
        super(BiGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size*embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1,-1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


model = BiGram(vocab_size,context_size, vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()
losses = []

for epoch in range(100):
    # print('epoch: {}'.format(epoch + 1))
    running_loss = 0
    for data in input_tensor:
        word, label = data
        word = torch.LongTensor(word)
        label = torch.LongTensor(label)
        # forward
        out = model(word)
        loss = F.cross_entropy(out, label)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
print('Loss: {:.6f}'.format(running_loss / vocab_size))

model.embeddings.weight[[1]]

Loss: 0.881331


tensor([[-0.2687, -0.7669, -1.5136,  1.0216, -1.2418, -0.4616, -0.3318,  0.0050,
          1.2334,  0.6881]], grad_fn=<IndexBackward0>)

In [11]:
pred = randrange(1, 10)
target = pred + 1
out = model(torch.tensor([pred], dtype=torch.long))
_, predict_label = torch.max(out, 1)
predict_word = predict_label.data[0].item()
print('real word is {}, predict word is {}'.format(target, predict_word))

real word is 10, predict word is 5


### context_size = 2; target_size = 1

|context|target|
|-|-|
|1,2|3|
|2,3|4|
|4,5|6|
|..|..|
|n,n+1|n+2|


In [12]:
import torch.nn.functional as F

context_size = 2
# The end of vocab size might overflow at the boundary like 9+10 = 11. Hence add context size for safely rail.
vocab_size = 10+context_size

input_tensor = []
for i in range(1,vocab_size-context_size):
    input_tensor.append([[i, i+1], [i+2]])

class BiGram(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim):
        super(BiGram, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size*embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view(1,-1)
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

model = BiGram(vocab_size,context_size, vocab_size)
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
loss_function = nn.NLLLoss()
losses = []

for epoch in range(100):
    # print('epoch: {}'.format(epoch + 1))
    running_loss = 0
    for data in input_tensor:
        word, label = data
        word = torch.LongTensor(word)
        label = torch.LongTensor(label)
        # forward
        out = model(word)
        loss = F.cross_entropy(out, label)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
print('Loss: {:.6f}'.format(running_loss / vocab_size))

model.embeddings.weight[[1]]

Loss: 0.823904


tensor([[-0.8044, -2.0680, -0.3244,  0.8827, -0.5343,  1.4524, -0.1523, -1.1447,
          1.0708,  0.2457, -1.1162,  1.9035]], grad_fn=<IndexBackward0>)

In [13]:
pred = randrange(1, 10)
target = pred + 2
print ((pred, pred+1), pred+2)
out = model(torch.tensor([pred, pred+1], dtype=torch.long))
_, predict_label = torch.max(out, 1)
predict_word = predict_label.data[0].item()
print('real word is {}, predict word is {}'.format(target, predict_word))

(9, 10) 11
real word is 11, predict word is 11
