In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# One-Hot Embeddings: Bag-of-Words Classifier

In [2]:
# Toy corpus: (tokenised sentence, language label) pairs.
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH"),
        ("Yo creo que si".split(), "SPANISH"),
        ("it is lost on me".split(), "ENGLISH")]

# Give every distinct token an index, in order of first appearance.
word2idx = {}
for sent, _ in data:
    for word in sent:
        word2idx.setdefault(word, len(word2idx))
print(word2idx)

label2idx = {"SPANISH": 0, "ENGLISH": 1}

VOC_SIZE = len(word2idx)
N_LABEL = len(label2idx)

# Fill one pre-allocated float32 count matrix (one row per sentence, one
# column per vocabulary word). np.zeros defaults to float64, and converting a
# Python list of separate arrays with torch.tensor is slow, so the matrix is
# built directly in the dtype and layout the model needs.
bow_counts = np.zeros((len(data), VOC_SIZE), dtype=np.float32)
for row, (sent, _) in enumerate(data):
    for word in sent:
        bow_counts[row, word2idx[word]] += 1

# X -> torch.float32
# y MUST BE torch.int64 (long tensor)
X = torch.tensor(bow_counts)
y = torch.tensor([label2idx[label] for _, label in data], dtype=torch.long)
print(X)
print(y)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}
tensor([[1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 1.,
         1., 1., 1., 1., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 1., 0.],
        [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 1.

In [3]:
class BoWClassifier(nn.Module):
    """Single linear layer followed by log-softmax over bag-of-words vectors."""

    def __init__(self, n_label, voc_size):
        # Only layers that own learnable parameters have to be created here.
        # Parameter-free non-linearities (tanh, ReLU, softmax, ...) need not be
        # registered; they can simply be applied inside forward().
        super().__init__()
        self.linear = nn.Linear(voc_size, n_label)

    def forward(self, bow_vec):
        """Return log-probabilities over the labels for ``bow_vec``."""
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=-1)

In [4]:
model = BoWClassifier(N_LABEL, VOC_SIZE)

# Show the freshly initialised parameters of the linear layer.
for p in model.parameters():
    print(p)

# Push the first sentence's count vector through the untrained model.
bow_vec = X[0]
log_probs = model(bow_vec)
print(log_probs)
print(log_probs.size())

Parameter containing:
tensor([[-0.0373, -0.1773, -0.0899,  0.1846,  0.1721, -0.1628,  0.0644,  0.0019,
          0.1746, -0.1871,  0.0118, -0.0190, -0.1606, -0.1247,  0.0234,  0.0245,
          0.0912, -0.0126,  0.1601, -0.1820,  0.1288,  0.1498,  0.1129,  0.0806,
         -0.0051, -0.1724],
        [ 0.1793, -0.1507, -0.1670,  0.1632,  0.1346, -0.1195, -0.0451, -0.0050,
          0.1651, -0.0721, -0.0248,  0.0428,  0.0417, -0.0157, -0.1199, -0.1857,
          0.0539,  0.1455, -0.1853,  0.1798,  0.1642,  0.1695,  0.1888,  0.0805,
          0.0194,  0.1493]], requires_grad=True)
Parameter containing:
tensor([ 0.0058, -0.0719], requires_grad=True)
tensor([-0.7303, -0.6574], grad_fn=<LogSoftmaxBackward>)
torch.Size([2])


In [5]:
# First four sentences are used for training, the remaining two for testing.
train_X, train_y = X[:4], y[:4]
test_X, test_y = X[4:], y[4:]

# Log-probabilities on the test sentences before any training.
for bow_vec, label in zip(test_X, test_y):
    log_probs = model(bow_vec)
    print(log_probs, label)

# Column of the weight matrix that belongs to one word.
# Apart from the bias, the linear layer maps a one-hot word vector to this
# column, so the column behaves like that word's embedding vector.
creo_idx = word2idx['creo']
print(model.linear.weight[:, creo_idx])

tensor([-0.6791, -0.7073], grad_fn=<LogSoftmaxBackward>) tensor(0)
tensor([-0.9348, -0.4987], grad_fn=<LogSoftmaxBackward>) tensor(1)
tensor([ 0.0118, -0.0248], grad_fn=<SelectBackward>)


In [6]:
# Training with the negative log-likelihood loss; the model already emits
# log-probabilities.
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(100):
    running_loss = 0.0
    for bow_vec, label in zip(train_X, train_y):
        # The loss is fed a matrix of log-probs and a vector of labels, so a
        # leading dimension of size one is added to both.
        log_probs = model(bow_vec.unsqueeze(0))
        loss = loss_func(log_probs, label.unsqueeze(0))
        running_loss += loss.item()

        # Clear old gradients, back-propagate, take one SGD step.
        model.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the summed loss of every tenth epoch.
    if (epoch + 1) % 10 == 0:
        print(epoch + 1, running_loss)

10 0.38944409787654877
20 0.1914397720247507
30 0.12640963774174452
40 0.0942453807219863
50 0.07509064953774214
60 0.062390413135290146
70 0.05335633549839258
80 0.046602878253906965
90 0.0413644309155643
100 0.03718271292746067


In [7]:
# Log-probabilities on the test sentences after training.
for bow_vec, label in zip(test_X, test_y):
    log_probs = model(bow_vec)
    print(log_probs, label)

# Column of the weight matrix that belongs to one word.
# Apart from the bias, the linear layer maps a one-hot word vector to this
# column, so the column behaves like that word's embedding vector.
creo_idx = word2idx['creo']
print(model.linear.weight[:, creo_idx])

tensor([-0.1359, -2.0629], grad_fn=<LogSoftmaxBackward>) tensor(0)
tensor([-3.0247, -0.0498], grad_fn=<LogSoftmaxBackward>) tensor(1)
tensor([ 0.4389, -0.4519], grad_fn=<SelectBackward>)


# Dense Embeddings
Dense embeddings encode semantic similarity between words. 

In [8]:
# Two-word vocabulary and an embedding table with one 5-dimensional row per word.
word2idx = {'hello': 0, 'world': 1}
emb = nn.Embedding(2, 5)
print(emb.weight)

# The index passed to the embedding MUST BE torch.long / torch.int64.
hello_idx = word2idx['hello']
t = torch.tensor(hello_idx, dtype=torch.long)
t_emb = emb(t)
print(t_emb)

Parameter containing:
tensor([[ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055],
        [ 0.5628,  0.2336, -0.5755,  0.6286, -0.5012]], requires_grad=True)
tensor([ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055],
       grad_fn=<EmbeddingBackward>)


In [9]:
# Input as a sequence
idx_seq = torch.tensor([0, 1, 0], dtype=torch.long)
embedded = emb(idx_seq)
embedded

tensor([[ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055],
        [ 0.5628,  0.2336, -0.5755,  0.6286, -0.5012],
        [ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055]],
       grad_fn=<EmbeddingBackward>)

In [10]:
# Look up a minibatch of index sequences; the output gains a trailing
# embedding dimension on top of the input's shape.
idx_seq_batch = torch.tensor([[0, 1],
                              [1, 1],
                              [0, 0]], dtype=torch.long)
embedded = emb(idx_seq_batch)
embedded

tensor([[[ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055],
         [ 0.5628,  0.2336, -0.5755,  0.6286, -0.5012]],

        [[ 0.5628,  0.2336, -0.5755,  0.6286, -0.5012],
         [ 0.5628,  0.2336, -0.5755,  0.6286, -0.5012]],

        [[ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055],
         [ 0.6340, -0.9037, -0.1137, -1.6222, -0.9055]]],
       grad_fn=<EmbeddingBackward>)

# Training Embeddings: N-Gram Model 
In an n-gram language model, given a sequence of words $w$, we want to compute  
$
P \left( w_{i} \left| w_{i-1}, w_{i-2}, ..., w_{i-n+1} \right. \right)  
$  
where $w_{i}$ is the i-th word of the sequence.  

In [11]:
# Number of preceding words used to predict the next word.
CONTEXT_SIZE = 2
EMB_DIM = 10

# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

# (context, target) pairs: CONTEXT_SIZE consecutive words and the word that
# follows them. The window is driven by CONTEXT_SIZE instead of a literal 2 so
# the data stays in step with the model if the constant is changed.
trigrams = [(test_sentence[i:i + CONTEXT_SIZE], test_sentence[i + CONTEXT_SIZE])
            for i in range(len(test_sentence) - CONTEXT_SIZE)]
print(trigrams[:3])

voc = set(test_sentence)
# Sort before enumerating: the iteration order of a set of strings can differ
# between interpreter runs, which would make the word -> index mapping (and
# therefore every trained embedding row) irreproducible.
word2idx = {word: i for i, word in enumerate(sorted(voc))}
VOC_SIZE = len(voc)

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]


In [12]:
class NGramModel(nn.Module):
    """Feed-forward n-gram language model.

    The context words are embedded, their embeddings are concatenated, and a
    two-layer network produces log-probabilities over the vocabulary.

    Args:
        voc_size: number of words; size of the embedding table and output layer.
        emb_dim: dimensionality of each word embedding.
        context_size: number of context words per prediction.
        hidden_dim: width of the hidden layer (default 128).
    """

    def __init__(self, voc_size, emb_dim, context_size, hidden_dim=128):
        super(NGramModel, self).__init__()
        self.emb = nn.Embedding(voc_size, emb_dim)
        self.fc1 = nn.Linear(context_size * emb_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, voc_size)

    def forward(self, ins):
        """Compute log-probabilities of the next word.

        Args:
            ins: long tensor of word indices, either a single context of shape
                (context_size,) or a batch of shape (batch, context_size).

        Returns:
            Tensor of shape (1, voc_size) for a single context, or
            (batch, voc_size) for a batch.
        """
        emb_ins = self.emb(ins)
        if ins.dim() == 1:
            # Single context: (context_size, emb_dim) -> (1, context_size*emb_dim)
            emb_ins = emb_ins.view(1, -1)
        else:
            # Batch: (batch, context_size, emb_dim) -> (batch, context_size*emb_dim).
            # Flattening everything into one row here would mix the samples.
            emb_ins = emb_ins.view(ins.size(0), -1)
        outs = F.relu(self.fc1(emb_ins))
        outs = self.fc2(outs)
        log_probs = F.log_softmax(outs, dim=-1)
        return log_probs

In [13]:
loss_func = nn.NLLLoss()
model = NGramModel(VOC_SIZE, EMB_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in trigrams:
        # Encode the context words and the target word as index tensors.
        context_var = torch.tensor([word2idx[w] for w in context], dtype=torch.long)
        target_var = torch.tensor([word2idx[target]], dtype=torch.long)

        # Reset gradients, then forward pass.
        model.zero_grad()
        log_probs = model(context_var)

        # log_probs is a matrix, target_var a vector.
        loss = loss_func(log_probs, target_var)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Summed loss over all trigrams for this epoch.
    print(epoch, total_loss)

0 519.6142597198486
1 517.2811331748962
2 514.9622578620911
3 512.6555509567261
4 510.36166524887085
5 508.07909512519836
6 505.8089349269867
7 503.54898047447205
8 501.29919600486755
9 499.0564966201782


# Computing Word Embeddings: Continuous Bag-of-Words
In a continuous BOW model, given a sequence of words $w$, we want to compute  
$
P \left( w_{i} \left| w_{i+n-1}, ..., w_{i+1}, w_{i-1}, ..., w_{i-n+1} \right. \right)  
$  
where $w_{i}$ is the i-th word of the sequence.  

In [14]:
# Number of words taken on EACH side of the target word.
CONTEXT_SIZE = 2
raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

voc = set(raw_text)
VOC_SIZE = len(voc)
# Sort before enumerating: the iteration order of a set of strings can differ
# between interpreter runs, which would make the word -> index mapping
# irreproducible.
word2idx = {word: i for i, word in enumerate(sorted(voc))}

# (context, target) pairs: the CONTEXT_SIZE words before and the CONTEXT_SIZE
# words after position i form the context, the word at i is the target.
# Positions too close to either end to have a full window are skipped.
data = [(raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1], raw_text[i])
        for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE)]
print(data[:3])

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study')]


In [15]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are summed and a linear layer maps the
    sum to log-probabilities over the vocabulary.

    Args:
        voc_size: number of words; size of the embedding table and output layer.
        emb_dim: dimensionality of each word embedding.
    """

    def __init__(self, voc_size, emb_dim):
        super(CBOW, self).__init__()
        self.emb = nn.Embedding(voc_size, emb_dim)
        self.fc = nn.Linear(emb_dim, voc_size)

    def forward(self, ins):
        """Compute log-probabilities of the centre word.

        Args:
            ins: long tensor of context word indices, either a single context
                of shape (n_context,) or a batch of shape (batch, n_context).

        Returns:
            Tensor of shape (1, voc_size) for a single context, or
            (batch, voc_size) for a batch.
        """
        emb_ins = self.emb(ins)
        # Sum over the context axis (second to last), not axis 0: with batched
        # input axis 0 is the batch, and summing it would merge the samples.
        summed = emb_ins.sum(dim=-2)
        if summed.dim() == 1:
            # Single context: restore the leading batch dimension of size one.
            summed = summed.unsqueeze(0)
        outs = self.fc(summed)
        log_probs = F.log_softmax(outs, dim=-1)
        return log_probs

In [16]:
loss_func = nn.NLLLoss()
model = CBOW(VOC_SIZE, EMB_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    total_loss = 0
    for context, target in data:
        # Encode the surrounding words and the centre word as index tensors.
        context_var = torch.tensor([word2idx[w] for w in context], dtype=torch.long)
        target_var = torch.tensor([word2idx[target]], dtype=torch.long)

        # Reset gradients, forward, back-propagate, update.
        model.zero_grad()
        log_probs = model(context_var)
        loss = loss_func(log_probs, target_var)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Summed loss over all training pairs for this epoch.
    print(epoch, total_loss)

0 263.90684628486633
1 261.3531012535095
2 258.83710980415344
3 256.3578133583069
4 253.91421222686768
5 251.50538110733032
6 249.130450963974
7 246.7886085510254
8 244.47909116744995
9 242.20118117332458
