In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# One-Hot Embeddings: Bag-of-Words Classifier

In [2]:
# Toy corpus: (tokenised sentence, language label) pairs.
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
        ("Yo creo que si".split(), "SPANISH"),
        ("it is lost on me".split(), "ENGLISH")]

# Number every distinct word in order of first appearance.
word2idx = {}
for sent, _ in data:
    for word in sent:
        if word not in word2idx:
            word2idx[word] = len(word2idx)
print(word2idx)

label2idx = {"SPANISH": 0, "ENGLISH": 1}

VOC_SIZE = len(word2idx)
N_LABEL = len(label2idx)

# Bag-of-words counts, one row per sentence.
# The whole matrix is allocated once as float32 (np.zeros defaults to float64)
# so it can be converted to a tensor in a single step; calling torch.tensor on a
# Python list of ndarrays is slow and triggers a UserWarning on recent PyTorch.
bow_counts = np.zeros((len(data), VOC_SIZE), dtype=np.float32)
labels = []
for row, (sent, label) in enumerate(data):
    for word in sent:
        bow_counts[row, word2idx[word]] += 1
    labels.append(label2idx[label])

# X -> torch.float32
# y -> torch.int64 (long tensor), the dtype NLLLoss expects for class-index targets
X = torch.tensor(bow_counts, dtype=torch.float)
y = torch.tensor(labels, dtype=torch.long)
print(X)
print(y)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}
tensor([[1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1.,
         1., 1., 1., 1., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 1., 0.],
        [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 1., 0., 0., 0., 1.]])
tensor([0, 1, 0, 1, 0, 1])

In [3]:
class BoWClassifier(nn.Module):
    """Bag-of-words classifier: a single affine layer followed by log-softmax."""

    def __init__(self, n_label, voc_size):
        # Only sub-modules that own learnable parameters have to be created here.
        # Parameter-free non-linearities (tanh, ReLU, softmax, ...) need not be
        # registered; they can simply be applied inside forward().
        super().__init__()
        self.linear = nn.Linear(in_features=voc_size, out_features=n_label)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels, normalised along the last dim."""
        logits = self.linear(bow_vec)
        return logits.log_softmax(dim=-1)

In [4]:
# Build an untrained classifier and show its randomly initialised parameters
# (the weight matrix and the bias vector of the linear layer).
model = BoWClassifier(N_LABEL, VOC_SIZE)
print(*model.parameters(), sep="\n")

# Run the first sentence through the model: one log-probability per label.
bow_vec = X[0]
log_probs = model(bow_vec)
print(log_probs, log_probs.size(), sep="\n")

Parameter containing:
tensor([[ 0.0319, -0.0179, -0.0375,  0.1074,  0.0662,  0.1006, -0.0383, -0.1187,
          0.0301, -0.1874, -0.1899,  0.0124,  0.1544, -0.0285,  0.0959,  0.0439,
          0.1780,  0.0186,  0.0330,  0.0482, -0.1069,  0.0919, -0.1699,  0.0348,
          0.1165,  0.1579],
        [ 0.1032, -0.0515,  0.0229,  0.1484,  0.1949, -0.1490, -0.0033,  0.0697,
          0.1015, -0.0685, -0.1354,  0.1873,  0.1753,  0.1079, -0.1126,  0.1947,
         -0.0516, -0.0145,  0.1332,  0.1120,  0.0430, -0.1163,  0.0631,  0.1459,
          0.1297,  0.1406]], requires_grad=True)
Parameter containing:
tensor([ 0.1577, -0.1602], requires_grad=True)
tensor([-0.5544, -0.8543], grad_fn=<LogSoftmaxBackward>)
torch.Size([2])


In [5]:
# First four sentences are used for training, the last two are held out.
train_X, train_y = X[:4], y[:4]
test_X, test_y = X[4:], y[4:]

# Predictions of the still-untrained model on the held-out sentences.
for row in range(len(test_X)):
    bow_vec, label = test_X[row], test_y[row]
    log_probs = model(bow_vec)
    print(log_probs, label)

# Column of the weight matrix that belongs to one specific word.
# Multiplying a one-hot vector by the weight matrix selects exactly this column,
# so (bias aside) it plays the role of a dense embedding vector for that word.
creo_idx = word2idx['creo']
print(model.linear.weight[:, creo_idx])

tensor([-0.7112, -0.6755], grad_fn=<LogSoftmaxBackward>) tensor(0)
tensor([-0.4691, -0.9824], grad_fn=<LogSoftmaxBackward>) tensor(1)
tensor([-0.1899, -0.1354], grad_fn=<SelectBackward>)


In [6]:
# Train with per-sample SGD on the negative log-likelihood.
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(100):
    running_loss = 0.0
    for bow_vec, label in zip(train_X, train_y):
        # Clear gradients left over from the previous sample.
        model.zero_grad()

        # The loss is fed a (1, n_label) matrix of log-probabilities and a
        # length-1 target vector, so add a leading batch axis to both.
        log_probs = model(bow_vec.unsqueeze(0))
        loss = loss_func(log_probs, label.unsqueeze(0))

        # Backward propagation and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Report the summed loss of every 10th epoch.
    epoch_no = epoch + 1
    if epoch_no % 10 == 0:
        print(epoch_no, running_loss)

10 0.38178349286317825
20 0.18935478664934635
30 0.12546012550592422
40 0.09370490722358227
50 0.07474238239228725
60 0.06214729091152549
70 0.05317725474014878
80 0.046465481631457806
90 0.04125553090125322
100 0.03709437372162938


In [7]:
# Predictions of the trained model on the held-out sentences.
for row in range(len(test_X)):
    bow_vec, label = test_X[row], test_y[row]
    log_probs = model(bow_vec)
    print(log_probs, label)

# Column of the weight matrix that belongs to one specific word.
# Multiplying a one-hot vector by the weight matrix selects exactly this column,
# so (bias aside) it plays the role of a dense embedding vector for that word.
creo_idx = word2idx['creo']
print(model.linear.weight[:, creo_idx])

tensor([-0.1414, -2.0263], grad_fn=<LogSoftmaxBackward>) tensor(0)
tensor([-2.1827, -0.1196], grad_fn=<LogSoftmaxBackward>) tensor(1)
tensor([ 0.2472, -0.5726], grad_fn=<SelectBackward>)


# Dense Embeddings
## `nn.Embedding`
Encode semantics in words. 

In [8]:
# Toy vocabulary for the embedding demos.
word2idx = {'hello': 0,
            'world': 1,
            'i': 2,
            'am': 3,
            'syuoni': 4}
# One row per vocabulary entry (5 here), 4 dimensions per embedding.
# The row count is derived from the vocabulary so the table cannot silently
# fall out of sync with word2idx if words are added or removed.
emb = nn.Embedding(len(word2idx), 4)
emb.weight

Parameter containing:
tensor([[ 0.2702, -1.3375,  0.3695,  0.0582],
        [ 0.3289, -1.0839,  1.0494, -0.4954],
        [-1.2038, -0.4306, -0.9760, -0.6266],
        [ 0.7669,  0.7873,  0.3577, -0.7754],
        [ 0.1783, -2.0435,  0.9216, -0.0520]], requires_grad=True)

In [9]:
# The lookup index is passed as an integer tensor (torch.int64, a.k.a. torch.long).
hello_idx = word2idx['hello']
t = torch.tensor(hello_idx, dtype=torch.int64)
t_emb = emb(t)
t_emb

tensor([ 0.2702, -1.3375,  0.3695,  0.0582], grad_fn=<EmbeddingBackward>)

In [10]:
# A 1-D tensor of word indices yields one embedding row per word.
tokens = "hello world".split()
idx_seq = torch.tensor([word2idx[tok] for tok in tokens], dtype=torch.long)
embedded = emb(idx_seq)
embedded

tensor([[ 0.2702, -1.3375,  0.3695,  0.0582],
        [ 0.3289, -1.0839,  1.0494, -0.4954]], grad_fn=<EmbeddingBackward>)

In [11]:
# A 2-D index tensor (3 sequences of 2 words each) yields a 3-D result:
# the embedding dimension is appended as the last axis.
batch_rows = [[0, 1], [2, 3], [3, 4]]
idx_seq_batch = torch.tensor(batch_rows, dtype=torch.long)
embedded = emb(idx_seq_batch)
embedded

tensor([[[ 0.2702, -1.3375,  0.3695,  0.0582],
         [ 0.3289, -1.0839,  1.0494, -0.4954]],

        [[-1.2038, -0.4306, -0.9760, -0.6266],
         [ 0.7669,  0.7873,  0.3577, -0.7754]],

        [[ 0.7669,  0.7873,  0.3577, -0.7754],
         [ 0.1783, -2.0435,  0.9216, -0.0520]]], grad_fn=<EmbeddingBackward>)

## `nn.EmbeddingBag`
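For a 2D input of shape `(batch, seq_len)`, equivalent to `torch.nn.Embedding` followed by a reduction over the words of each sequence, i.e. `torch.sum(dim=1)` / `torch.mean(dim=1)` / `torch.max(dim=1)`, but without materializing the intermediate embeddings.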

In [12]:
# Embedding table with 5 rows (vocabulary size) of 4 dimensions each;
# mode='sum' makes every bag return the sum of its members' embeddings.
emb_sum = nn.EmbeddingBag(num_embeddings=5, embedding_dim=4, mode='sum')
emb_sum.weight

Parameter containing:
tensor([[-1.0847e+00, -5.3809e-01, -7.9392e-01,  6.9129e-01],
        [ 5.7604e-02,  8.9597e-01,  1.9440e+00, -3.6877e-01],
        [ 6.4747e-01,  7.2291e-01,  2.6080e-01, -9.6123e-01],
        [ 2.7295e-02, -1.5850e+00,  1.2183e+00, -6.5733e-01],
        [ 4.8205e-04, -2.5908e-01, -2.0872e+00, -1.1491e+00]],
       requires_grad=True)

In [13]:
# With a 1-D input, the index tensor is treated as several individual sequences
# ("bags") packed / concatenated end to end.
# `offsets` holds the starting position of each bag: here bag 0 starts at
# position 0 and bag 1 starts at position 2.
tokens = "hello world i am syuoni".split()
idx_seq = torch.tensor([word2idx[tok] for tok in tokens], dtype=torch.long)
offsets = torch.tensor([0, 2], dtype=torch.long)
embedded = emb_sum(idx_seq, offsets)
print(embedded)

# Cross-check against manual sums over the same rows of the 5-row weight table:
# rows 0-1 for the first bag, rows 2-4 for the second.
print(emb_sum.weight[:2].sum(dim=0))
print(emb_sum.weight[2:].sum(dim=0))

tensor([[-1.0271,  0.3579,  1.1501,  0.3225],
        [ 0.6752, -1.1212, -0.6080, -2.7676]], grad_fn=<EmbeddingBagBackward>)
tensor([-1.0271,  0.3579,  1.1501,  0.3225], grad_fn=<SumBackward1>)
tensor([ 0.6752, -1.1212, -0.6080, -2.7676], grad_fn=<SumBackward1>)


In [14]:
# With a 2-D index tensor no offsets are passed: each of the 3 rows is one
# fixed-length sequence, and the result has one summed embedding per row.
batch_rows = [[0, 1], [2, 3], [3, 4]]
idx_seq_batch = torch.tensor(batch_rows, dtype=torch.long)
embedded = emb_sum(idx_seq_batch)
embedded

tensor([[-1.0271,  0.3579,  1.1501,  0.3225],
        [ 0.6748, -0.8621,  1.4791, -1.6186],
        [ 0.0278, -1.8441, -0.8688, -1.8064]], grad_fn=<EmbeddingBagBackward>)

# Training Embeddings: N-Gram Model 
In an n-gram language model, given a sequence of words $w$, we want to compute  
$$
P \left( w_{i} \left| w_{i-1}, w_{i-2}, ..., w_{i-n+1} \right. \right)  
$$  
where $w_{i}$ is the i-th word of the sequence.  

In [15]:
# Number of preceding words used to predict the next word (2 -> trigrams)
CONTEXT_SIZE = 2
EMB_DIM = 10

# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# (context words, target word) pairs. The window width follows CONTEXT_SIZE
# instead of a hard-coded 2, so the data stays consistent with the model
# if the constant is changed.
trigrams = [(test_sentence[i:i + CONTEXT_SIZE], test_sentence[i + CONTEXT_SIZE])
            for i in range(len(test_sentence) - CONTEXT_SIZE)]
print(trigrams[:3])

voc = set(test_sentence)
# Number the words in sorted order: the iteration order of a set of strings
# changes between interpreter runs (string hash randomisation), which would
# make the word -> index mapping differ on every kernel restart.
word2idx = {word: i for i, word in enumerate(sorted(voc))}
VOC_SIZE = len(voc)

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [16]:
class NGramModel(nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context words are concatenated, passed through one
    hidden ReLU layer and projected to log-probabilities over the vocabulary.

    Args:
        voc_size: number of words in the vocabulary.
        emb_dim: size of each word embedding.
        context_size: number of context words fed to the model per sample.
        hidden_dim: width of the hidden layer (defaults to 128).
    """

    def __init__(self, voc_size, emb_dim, context_size, hidden_dim=128):
        super(NGramModel, self).__init__()
        self.emb = nn.Embedding(voc_size, emb_dim)
        self.fc1 = nn.Linear(context_size*emb_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, voc_size)
        
    def forward(self, ins):
        """Return log-probabilities of the next word.

        Args:
            ins: word indices, either a single context of shape
                (context_size,) or a batch of shape (batch, context_size).

        Returns:
            Tensor of shape (batch, voc_size); batch is 1 for a single context.
        """
        # Treat a single context as a batch of one so both cases share one path.
        if ins.dim() == 1:
            ins = ins.unsqueeze(0)
        # self.emb(ins) -> (batch, context_size, emb_dim)
        # flattening from dim 1 -> (batch, context_size*emb_dim), i.e. the
        # context embeddings of each sample are concatenated separately.
        emb_ins = self.emb(ins).flatten(start_dim=1)
        outs = F.relu(self.fc1(emb_ins))
        outs = self.fc2(outs)
        log_probs = F.log_softmax(outs, dim=-1)
        return log_probs

In [17]:
loss_func = nn.NLLLoss()
model = NGramModel(VOC_SIZE, EMB_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# The index tensors are the same in every epoch, so build them once up front
# instead of re-creating them for every sample of every epoch.
# Each entry is (context indices of shape (context_size,), target index of shape (1,)).
trigram_tensors = [
    (torch.tensor([word2idx[w] for w in context], dtype=torch.long),
     torch.tensor([word2idx[target]], dtype=torch.long))
    for context, target in trigrams
]

for epoch in range(10):
    total_loss = 0.0
    for context_var, target_var in trigram_tensors:
        model.zero_grad()
        log_probs = model(context_var)
        
        # log_probs -> matrix of shape (1, VOC_SIZE)
        # target_var -> vector of shape (1,)
        loss = loss_func(log_probs, target_var)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Summed (not averaged) loss over all trigrams of this epoch
    print(epoch, total_loss)

0 523.3301439285278
1 521.2640182971954
2 519.2101652622223
3 517.1685211658478
4 515.1380250453949
5 513.1179633140564
6 511.1087770462036
7 509.108384847641
8 507.11737871170044
9 505.135014295578


# Computing Word Embeddings: Continuous Bag-of-Words
In a continuous bag-of-words (CBOW) model, given a sequence of words $w$, we want to compute  
$$
P \left( w_{i} \left| w_{i+n-1}, ..., w_{i+1}, w_{i-1}, ..., w_{i-n+1} \right. \right)  
$$  
where $w_{i}$ is the i-th word of the sequence.  

In [18]:
# Number of context words taken on EACH side of the target word
CONTEXT_SIZE = 2
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

voc = set(raw_text)
VOC_SIZE = len(voc)
# Number the words in sorted order: the iteration order of a set of strings
# changes between interpreter runs (string hash randomisation), which would
# make the word -> index mapping differ on every kernel restart.
word2idx = {word: i for i, word in enumerate(sorted(voc))}

# (context words, target word) pairs: CONTEXT_SIZE words before the target
# followed by CONTEXT_SIZE words after it. Positions too close to either end
# to have a full window are skipped.
data = [(raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1], raw_text[i])
        for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE)]
print(data[:3])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study')]


In [19]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed and projected linearly to
    log-probabilities over the vocabulary.

    Args:
        voc_size: number of words in the vocabulary.
        emb_dim: size of each word embedding.
    """

    def __init__(self, voc_size, emb_dim):
        super(CBOW, self).__init__()
        self.emb = nn.Embedding(voc_size, emb_dim)
        self.fc = nn.Linear(emb_dim, voc_size)
        
    def forward(self, ins):
        """Return log-probabilities of the centre word.

        Args:
            ins: context word indices, either a single context of shape
                (n_context,) or a batch of shape (batch, n_context).

        Returns:
            Tensor of shape (batch, voc_size); batch is 1 for a single context.
        """
        # Treat a single context as a batch of one so both cases share one path.
        if ins.dim() == 1:
            ins = ins.unsqueeze(0)
        # self.emb(ins) -> (batch, n_context, emb_dim)
        # Sum over the context axis (dim=1), never over the batch axis, so the
        # samples of a batch are not mixed together.
        emb_sum = self.emb(ins).sum(dim=1)
        outs = self.fc(emb_sum)
        log_probs = F.log_softmax(outs, dim=-1)
        return log_probs

In [20]:
loss_func = nn.NLLLoss()
model = CBOW(VOC_SIZE, EMB_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# The index tensors are the same in every epoch, so build them once up front
# instead of re-creating them for every sample of every epoch.
# Each entry is (context indices as a 1-D tensor, target index of shape (1,)).
cbow_tensors = [
    (torch.tensor([word2idx[w] for w in context], dtype=torch.long),
     torch.tensor([word2idx[target]], dtype=torch.long))
    for context, target in data
]

for epoch in range(10):
    total_loss = 0.0
    for context_var, target_var in cbow_tensors:
        model.zero_grad()
        log_probs = model(context_var)
        # log_probs -> matrix of shape (1, VOC_SIZE)
        # target_var -> vector of shape (1,)
        loss = loss_func(log_probs, target_var)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    # Summed (not averaged) loss over all samples of this epoch
    print(epoch, total_loss)

0 241.13269114494324
1 238.55687272548676
2 236.0288382768631
3 233.5469708442688
4 231.10988879203796
5 228.71640646457672
6 226.36551249027252
7 224.05631721019745
8 221.78803765773773
9 219.55995047092438
