Deep Learning with PyTorch: https://github.com/ytiam/tutorials/blob/example_notebooks/beginner_source/nlp/deep_learning_tutorial.py
**************************

Deep Learning Building Blocks: Affine maps, non-linearities and objectives
==========================================================================

Deep learning consists of composing linearities with non-linearities in
clever ways. The introduction of non-linearities allows for powerful
models. In this section, we will play with these core components, make
up an objective function, and see how the model is trained.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Seed PyTorch's random number generator so the random tensors and the
# randomly initialised layer weights created below are reproducible.
torch.manual_seed(1)

<torch._C.Generator at 0x7ff188924f70>

In [3]:
lin = nn.Linear(5, 3)  # maps from R^5 to R^3, parameters A, b

In [4]:
lin

Linear(in_features=5, out_features=3, bias=True)

In [5]:
# data is 2x5.  A maps from 5 to 3... can we map "data" under A?
data = torch.randn(2, 5)

In [6]:
data

tensor([[-1.1948,  0.0250, -0.7627,  1.3969, -0.3245],
        [ 0.2879,  1.0579,  0.9621,  0.3935,  1.1322]])

In [7]:
print(lin(data))

tensor([[ 0.1755, -0.3268, -0.5069],
        [-0.6602,  0.2260,  0.1089]], grad_fn=<AddmmBackward0>)


In PyTorch, most non-linearities are in torch.nn.functional (we have it imported as F).
Note that non-linearities typically don't have parameters like affine maps do.
That is, they don't have weights that are updated during training.

In [8]:
print(F.relu(lin(data)))

tensor([[0.1755, 0.0000, 0.0000],
        [0.0000, 0.2260, 0.1089]], grad_fn=<ReluBackward0>)


In [9]:
# Softmax is also in torch.nn.functional
data = torch.randn(5)

In [10]:
data

tensor([-0.5404, -2.2102,  2.1130, -0.0040,  1.3800])

In [11]:
# Softmax along dim 0 turns the raw scores in `data` into a probability
# distribution; log_softmax returns the logarithm of those probabilities.
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # There's also log_softmax


tensor([0.0418, 0.0079, 0.5936, 0.0715, 0.2852])
tensor(1.)
tensor([-3.1749, -4.8447, -0.5215, -2.6385, -1.2545])


In [12]:
# torch.log_(torch.tensor(0.2377))

In [13]:
######################################################################
# Example: Logistic Regression Bag-of-Words classifier
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
# Our model will map a sparse BoW representation to log probabilities over
# labels. We assign each word in the vocab an index. For example, say our
# entire vocab is two words "hello" and "world", with indices 0 and 1
# respectively. The BoW vector for the sentence "hello hello hello hello"
# is
#
# .. math::  \left[ 4, 0 \right]
#
# For "hello world world hello", it is
#
# .. math::  \left[ 2, 2 \right]
#
# etc. In general, it is
#
# .. math::  \left[ \text{Count}(\text{hello}), \text{Count}(\text{world}) \right]
#
# Denote this BOW vector as :math:`x`. The output of our network is:
#
# .. math::  \log \text{Softmax}(Ax + b)
#
# That is, we pass the input through an affine map and then do log
# softmax.
#

In [14]:
# Training set: each example is a (list of word tokens, language label) pair.
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

In [15]:
# Examples in the same (tokens, label) format that are never used in the
# training loop; they are only run through the model before and after
# training to compare its predictions.
test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

In [16]:
# word_to_ix assigns every distinct word, in order of first appearance across
# the training and test sentences, the next free integer. That integer is the
# word's position in the bag-of-words vector.
word_to_ix = {}
for sentence, _ in data + test_data:
    for token in sentence:
        # setdefault only inserts when the word is new, so a word keeps the
        # index it was given the first time it was seen.
        word_to_ix.setdefault(token, len(word_to_ix))
print(word_to_ix)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}


In [17]:
# One input feature per known word: this is the length of every BoW vector.
VOCAB_SIZE = len(word_to_ix)
# Two output classes: SPANISH and ENGLISH.
NUM_LABELS = 2

In [18]:
# class BoWClassifier(nn.Module):  # inheriting from nn.Module!

#     def __init__(self, num_labels, vocab_size):
#         # calls the init function of nn.Module.  Don't get confused by syntax,
#         # just always do it in an nn.Module
#         super(BoWClassifier, self).__init__()

#         # Define the parameters that you will need.  In this case, we need A and b,
#         # the parameters of the affine mapping.
#         # Torch defines nn.Linear(), which provides the affine map.
#         # Make sure you understand why the input dimension is vocab_size
#         # and the output is num_labels!
#         self.linear = nn.Linear(vocab_size, num_labels)

#         # NOTE! The non-linearity log softmax does not have parameters! So we don't need
#         # to worry about that here

#     def forward(self, bow_vec):
#         # Pass the input through the linear layer,
#         # then pass that through log_softmax.
#         # Many non-linearities and other functions are in torch.nn.functional
#         return F.log_softmax(self.linear(bow_vec), dim=1)

In [19]:
class BoWClassifier_(nn.Module):
    """Logistic-regression bag-of-words classifier.

    Applies a single affine map to a BoW count vector and returns log
    probabilities over the labels, i.e. ``log_softmax(Ax + b)``.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the affine layer.

        Args:
            num_labels: number of output classes (rows of A).
            vocab_size: length of the BoW input vector (columns of A).
        """
        super().__init__()
        # nn.Linear holds the only trainable parameters (A and b);
        # log_softmax itself has none.
        self.linear = nn.Linear(vocab_size, num_labels)

    def forward(self, bow_vec):
        """Return log probabilities over the labels for ``bow_vec``.

        The softmax is taken along ``dim=1``, so ``bow_vec`` must have a
        leading batch dimension, e.g. shape ``(1, vocab_size)``.
        """
        scores = self.linear(bow_vec)
        return F.log_softmax(scores, dim=1)

In [20]:
def make_bow_vector(sentence, word_to_ix):
    """Build the bag-of-words count vector for a tokenized sentence.

    Args:
        sentence: iterable of word tokens; every token must be a key of
            ``word_to_ix`` (an unknown word raises ``KeyError``).
        word_to_ix: mapping from word to its column index in the vector.

    Returns:
        A tensor of shape ``(1, len(word_to_ix))`` whose entry at a word's
        index is the number of times that word occurs in ``sentence``.
    """
    # Allocate the row vector directly in its final (1, vocab) shape.
    counts = torch.zeros(1, len(word_to_ix))
    for token in sentence:
        counts[0, word_to_ix[token]] += 1
    return counts

In [21]:
model = BoWClassifier_(NUM_LABELS, VOCAB_SIZE)

In [22]:
model(make_bow_vector("Yo creo que si".split(),word_to_ix))

tensor([[-0.6866, -0.6998]], grad_fn=<LogSoftmaxBackward0>)

The model knows its parameters. The first output below is A, the second is b. Whenever you assign a component to an instance attribute in the __init__ function of a module, which was done with the line
# self.linear = nn.Linear(...)
then, through some Python magic from the PyTorch devs, your module (in this case, BoWClassifier_) will store knowledge of the nn.Linear's parameters.

In [23]:
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.1885, -0.0935,  0.1064, -0.0477,  0.1953,  0.1572, -0.0092, -0.1309,
          0.1194,  0.0609, -0.1268,  0.1274,  0.1191,  0.1739, -0.1099, -0.0323,
         -0.0038,  0.0286, -0.1488, -0.1392,  0.1067, -0.0460,  0.0958,  0.0112,
          0.0644,  0.0431],
        [ 0.0713,  0.0972, -0.1816,  0.0987, -0.1379, -0.1480,  0.0119, -0.0334,
          0.1152, -0.1136, -0.1743,  0.1427, -0.0291,  0.1103,  0.0630, -0.1471,
          0.0394,  0.0471, -0.1313, -0.0931,  0.0669,  0.0351, -0.0834, -0.0594,
          0.1796, -0.0363]], requires_grad=True)
Parameter containing:
tensor([0.1106, 0.0849], requires_grad=True)


In [24]:
# To run the model, pass in a BoW vector
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    sample = data[0]
    bow_vector = make_bow_vector(sample[0], word_to_ix)
    log_probs = model(bow_vector)
    print(log_probs)

tensor([[-0.5313, -0.8864]])


In [25]:
def make_target(label, label_to_ix):
    """Return the class index of ``label`` as a one-element int64 tensor.

    Args:
        label: label string; must be a key of ``label_to_ix``.
        label_to_ix: mapping from label string to integer class index.
    """
    class_index = label_to_ix[label]
    return torch.tensor([class_index], dtype=torch.long)

In [26]:
# Which of the above values corresponds to the log probability of ENGLISH,
# and which to SPANISH? We never defined it, but we need to if we want to
# train the thing.

label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

In [27]:
# Run on test data before we train, just to see a before-and-after
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

tensor([[-0.6866, -0.6998]])
tensor([[-0.8991, -0.5225]])


In [28]:
# Print the matrix column corresponding to "creo"
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([-0.1268, -0.1743], grad_fn=<SelectBackward0>)


In [29]:
# The model's forward() returns log probabilities (log_softmax), so the
# matching objective is the negative log likelihood loss.
loss_function = nn.NLLLoss()
# Plain stochastic gradient descent over all of the model's parameters,
# with a learning rate of 0.1.
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [30]:
# Make 200 passes over the training data, taking one SGD step per example.
for epoch in range(200):
    for instance, label in data:
        # PyTorch accumulates gradients across backward() calls, so clear
        # them before handling each example.
        model.zero_grad()

        # Encode the label as a class-index tensor and the sentence as a
        # bag-of-words count vector.
        target = make_target(label, label_to_ix)
        bow_vec = make_bow_vector(instance, word_to_ix)

        # Forward pass: log probabilities, then negative log likelihood.
        log_probs = model(bow_vec)
        loss = loss_function(log_probs, target)
        print('loss...', loss)

        # Backward pass, then update the parameters.
        loss.backward()
        optimizer.step()

loss... tensor(0.5313, grad_fn=<NllLossBackward0>)
loss... tensor(0.6055, grad_fn=<NllLossBackward0>)
loss... tensor(0.5221, grad_fn=<NllLossBackward0>)
loss... tensor(0.9851, grad_fn=<NllLossBackward0>)
loss... tensor(0.4010, grad_fn=<NllLossBackward0>)
loss... tensor(0.3710, grad_fn=<NllLossBackward0>)
loss... tensor(0.4625, grad_fn=<NllLossBackward0>)
loss... tensor(0.3311, grad_fn=<NllLossBackward0>)
loss... tensor(0.2968, grad_fn=<NllLossBackward0>)
loss... tensor(0.2867, grad_fn=<NllLossBackward0>)
loss... tensor(0.3413, grad_fn=<NllLossBackward0>)
loss... tensor(0.1951, grad_fn=<NllLossBackward0>)
loss... tensor(0.2316, grad_fn=<NllLossBackward0>)
loss... tensor(0.2372, grad_fn=<NllLossBackward0>)
loss... tensor(0.2588, grad_fn=<NllLossBackward0>)
loss... tensor(0.1395, grad_fn=<NllLossBackward0>)
loss... tensor(0.1891, grad_fn=<NllLossBackward0>)
loss... tensor(0.2028, grad_fn=<NllLossBackward0>)
loss... tensor(0.2055, grad_fn=<NllLossBackward0>)
loss... tensor(0.1087, grad_fn=<NllLossBackward0>)
[... remaining output truncated ...]

In [31]:
# Run the test sentences through the trained model to compare against the
# log probabilities printed before training. Per label_to_ix, column 0 is
# SPANISH and column 1 is ENGLISH. no_grad() is used because nothing is
# being trained here.
with torch.no_grad():
    for instance, label in test_data:
        bow_vec = make_bow_vector(instance, word_to_ix)
        log_probs = model(bow_vec)
        print(log_probs)

tensor([[-0.1164, -2.2087]])
tensor([[-3.3730, -0.0349]])


In [45]:
# Print the weight-matrix column for "creo" again, now that training has
# run, to compare with the values printed before training.
print(next(model.parameters())[:, word_to_ix["creo"]])

tensor([ 0.0245, -0.0899], grad_fn=<SelectBackward0>)
