# PyTorch for NLP
#### Torch's tensor library

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10d1ee390>

In [2]:
# Nested Python lists of increasing depth become tensors of increasing rank.
V_data = [1., 2., 3.]                    # 1D: a vector with 3 entries
M_data = [[1., 2., 3.], [4., 5., 6]]     # 2D: a 2x3 matrix
T_data = [[[1., 2.], [3., 4.]],
          [[5., 6.], [7., 8.]]]          # 3D: a tensor of size 2x2x2

V = torch.Tensor(V_data)
M = torch.Tensor(M_data)
T = torch.Tensor(T_data)

# Show them in order: vector, matrix, 3D tensor.
for tensor in (V, M, T):
    print(tensor)


 1
 2
 3
[torch.FloatTensor of size 3]


 1  2  3
 4  5  6
[torch.FloatTensor of size 2x3]


(0 ,.,.) = 
  1  2
  3  4

(1 ,.,.) = 
  5  6
  7  8
[torch.FloatTensor of size 2x2x2]



In [3]:
# Indexing with [0] peels off the first axis:
#   V (1D) -> a scalar, M (2D) -> a vector, T (3D) -> a matrix
for tensor in (V, M, T):
    print(tensor[0])

1.0

 1
 2
 3
[torch.FloatTensor of size 3]


 1  2
 3  4
[torch.FloatTensor of size 2x2]



In [4]:
# A 3x4x5 tensor filled with random values
x = torch.randn(3, 4, 5)
print(x)


(0 ,.,.) = 
 -2.9718  1.7070 -0.4305 -2.2820  0.5237
  0.0004 -1.2039  3.5283  0.4434  0.5848
  0.8407  0.5510  0.3863  0.9124 -0.8410
  1.2282 -1.8661  1.4146 -1.8781 -0.4674

(1 ,.,.) = 
 -0.7576  0.4215 -0.4827 -1.1198  0.3056
  1.0386  0.5206 -0.5006  1.2182  0.2117
 -1.0613 -1.9441 -0.9596  0.5489 -0.9901
 -0.3826  1.5037  1.8267  0.5561  1.6445

(2 ,.,.) = 
  0.4973 -1.5067  1.7661 -0.3569 -0.1713
  0.4068 -0.4284 -1.1299  1.4274 -1.4027
  1.4825 -1.1559  1.6190  0.9581  0.7747
  0.1940  0.1687  0.3061  1.0743 -1.0327
[torch.FloatTensor of size 3x4x5]



In [5]:
# Arithmetic on tensors works element by element
x = torch.Tensor([1., 2., 3.])
y = torch.Tensor([4., 5., 6.])
z = torch.add(x, y)
print(z)


 5
 7
 9
[torch.FloatTensor of size 3]



In [7]:
# With no axis argument, torch.cat joins along the first axis (stacks rows):
# a 2x5 on top of a 3x5 gives a 5x5.
x_1 = torch.randn(2, 5)
y_1 = torch.randn(3, 5)
z_1 = torch.cat((x_1, y_1))
print(z_1)

# To join columns instead, pass the axis as the second argument:
# a 2x3 next to a 2x5 gives a 2x8.
x_2 = torch.randn(2, 3)
y_2 = torch.randn(2, 5)
z_2 = torch.cat((x_2, y_2), 1)
print(z_2)

# Shapes must agree on every axis except the one being joined, otherwise
# torch raises.  Uncomment to see the error
# torch.cat([x_1, x_2])



 0.0507 -0.9644 -2.0111  0.5245  2.1332
-0.0822  0.8388 -1.3233  0.0701  1.2200
 0.4251 -1.2328 -0.6195  1.5133  1.9954
-0.6585 -0.4139 -0.2250 -0.6890  0.9882
 0.7404 -2.0990  1.2582 -0.3990 -1.0952
[torch.FloatTensor of size 5x5]


-1.0703  0.6404  1.6199 -0.2831 -0.4705 -1.7655 -0.1656  0.2312
 0.5258 -0.2969 -0.0681 -0.0839 -1.7731 -1.0721  1.0248 -0.7116
[torch.FloatTensor of size 2x8]



#### Reshaping Tensors: many neural networks expect their input to have a certain shape
-  `.view()`


In [10]:
x = torch.randn(2, 3, 4)
print(x)
# Both shapes below reshape x to 2 rows, 12 columns.  In the second, the -1
# asks torch to infer that dimension's size from the number of elements.
for shape in ((2, 12), (2, -1)):
    print("-"*20)
    print(x.view(*shape))



(0 ,.,.) = 
 -0.9882  1.3801 -0.1173  0.9317
  1.3267 -1.0173 -1.8575  0.9015
  0.1495 -0.0336 -0.6076 -1.0048

(1 ,.,.) = 
 -0.2826 -0.2711  1.3210  1.1608
  0.3457 -0.1136 -0.8910  0.2900
 -2.1017 -1.1279 -0.8191  0.5334
[torch.FloatTensor of size 2x3x4]

--------------------


Columns 0 to 9 
-0.9882  1.3801 -0.1173  0.9317  1.3267 -1.0173 -1.8575  0.9015  0.1495 -0.0336
-0.2826 -0.2711  1.3210  1.1608  0.3457 -0.1136 -0.8910  0.2900 -2.1017 -1.1279

Columns 10 to 11 
-0.6076 -1.0048
-0.8191  0.5334
[torch.FloatTensor of size 2x12]

--------------------


Columns 0 to 9 
-0.9882  1.3801 -0.1173  0.9317  1.3267 -1.0173 -1.8575  0.9015  0.1495 -0.0336
-0.2826 -0.2711  1.3210  1.1608  0.3457 -0.1136 -0.8910  0.2900 -2.1017 -1.1279

Columns 10 to 11 
-0.6076 -1.0048
-0.8191  0.5334
[torch.FloatTensor of size 2x12]



##### Computation Graph and Automatic Differentiation
- Computation graph: a specification of how the data is combined to give the output
- with it, we don't need to write the backpropagation gradients ourselves
- fundamental class of PyTorch: `autograd.Variable`
- the Variable class keeps track of how tensors are being created
- a `Variable` knows how it was created

In [13]:
# A Variable wraps a tensor object
x = autograd.Variable(torch.Tensor([1., 2., 3]), requires_grad=True)
y = autograd.Variable(torch.Tensor([4., 5., 6]), requires_grad=True)

# The wrapped tensor is reachable through the .data attribute
print(x.data)

# Every operation available on tensors is available on Variables too.
z = x + y
print(z.data)

# The difference: z also remembers the operation that produced it.
print("z knows how it was created")
print(z.grad_fn)


 1
 2
 3
[torch.FloatTensor of size 3]


 5
 7
 9
[torch.FloatTensor of size 3]

z knows how it was created
<torch.autograd.function.AddBackward object at 0x10d18fe58>


In [14]:
# Collapse z to a single value by adding up all of its entries
s = torch.sum(z)
print(s)
print(s.grad_fn)

Variable containing:
 21
[torch.FloatTensor of size 1]

<torch.autograd.function.SumBackward object at 0x10d18fd68>


- `s` has enough information about how it was created
- autograd can walk back over that history to compute the gradient


In [15]:
# Calling .backward() on any variable runs backprop, starting from it.
# NOTE: PyTorch accumulates gradients into .grad, so this cell is not
# idempotent -- running it again adds to x.grad instead of replacing it.
s.backward()
print(x.grad)  # gradient of s with respect to x

Variable containing:
 1
 1
 1
[torch.FloatTensor of size 3]



- if we run the above block multiple times, the gradient will be incremented each time
- because `PyTorch` accumulates the gradient into the `.grad` property

In [17]:
x = torch.randn((2, 2))
y = torch.randn((2, 2))
z = x + y  # These are Tensor types, and backprop would not be possible

# requires_grad=True asks autograd to record the operations applied to these
# Variables.  Without it, current PyTorch releases record no history at all,
# var_z.grad_fn would be None, and the contrast shown below would disappear.
var_x = autograd.Variable(x, requires_grad=True)
var_y = autograd.Variable(y, requires_grad=True)
# var_z contains enough information to compute gradients, as we saw above
var_z = var_x + var_y
print("What info does z has")
print(var_z.grad_fn)
print("--"*20)

var_z_data = var_z.data  # Get the wrapped Tensor object out of var_z...
# Re-wrap the tensor in a new variable
new_var_z = autograd.Variable(var_z_data)

# ... does new_var_z have information to backprop to x and y?
# NO!
print("Does new_var_z have information to backprop to x and y?")
print(new_var_z.grad_fn)
# And how could it?  We yanked the tensor out of var_z (that is
# what var_z.data is).  This tensor doesn't know anything about
# how it was computed.  We pass it into new_var_z, and this is all the
# information new_var_z gets.  If var_z_data doesn't know how it was
# computed, there's no way new_var_z will.
# In essence, we have broken the variable away from its past history

What info does z has
<torch.autograd.function.AddBackward object at 0x10eb2a138>
----------------------------------------
Does new_var_z have information to backprop to x and y?
None


-  If you want the error from your loss function to backpropagate to a component of your network, you MUST NOT break the Variable chain from that component to your loss Variable. If you do, the loss will have no idea your component exists, and its parameters can’t be updated.

## Deep Learning with PyTorch
- Building blocks: affine maps, non-linearities and objectives
###### Let's make an objective function and see how the model is trained

#### Affine Maps

- for a matrix A, vector x and bias b, the affine map is `f(x) = Ax + b`
- PyTorch maps the rows of the input: the i-th row of the output is the i-th row of the input mapped under A, plus the bias
- this is different from traditional linear algebra, which maps the columns

In [18]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10d1ee390>

In [23]:
lin = nn.Linear(5, 3)  # affine map from R^5 to R^3; holds the parameters A, b
# data is 2x5: two rows, each a point in R^5.  Can "data" be mapped under A?
data = autograd.Variable(torch.randn(2, 5))
print("Let's have a look at our random data:")
print(data)
print("-"*20)
print("Data after linear mapping")
mapped = lin(data)  # yes: each row is mapped, giving a 2x3 result
print(mapped)


Let's have a look at our random data:
Variable containing:
-0.9644 -2.0111  0.5245  2.1332 -0.0822
 0.8388 -1.3233  0.0701  1.2200  0.4251
[torch.FloatTensor of size 2x5]

--------------------
Data after linear mapping
Variable containing:
-0.0569  0.8267 -1.7184
-0.5147 -0.3979 -1.5739
[torch.FloatTensor of size 2x3]



#### Non-Linearities

In [24]:
# Most non-linearities live in torch.nn.functional (imported here as F).
# Unlike affine maps, they typically carry no parameters, i.e. no weights
# that get updated during training.
data = autograd.Variable(torch.randn(2, 2))
print(data)
rectified = F.relu(data)
print(rectified)

Variable containing:
-1.2328 -0.6195
 1.5133  1.9954
[torch.FloatTensor of size 2x2]

Variable containing:
 0.0000  0.0000
 1.5133  1.9954
[torch.FloatTensor of size 2x2]



#### Softmax and Probabilities
- it is usually the last operation in a network
- it takes in a vector of real numbers and returns a probability distribution
- each element is non-negative and the elements sum to 1
- it applies an element-wise exponentiation to the input to make everything non-negative, then divides by the normalization constant

In [25]:
# Softmax is also in torch.nn.functional
data = autograd.Variable(torch.randn(5))
print(data)
# `data` is 1-D, so we normalise over its only axis.  `dim` is passed
# explicitly because relying on the implicit dimension choice is deprecated.
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # there's also log_softmax


Variable containing:
-0.6585
-0.4139
-0.2250
-0.6890
 0.9882
[torch.FloatTensor of size 5]

Variable containing:
 0.1002
 0.1280
 0.1546
 0.0972
 0.5201
[torch.FloatTensor of size 5]

Variable containing:
 1
[torch.FloatTensor of size 1]

Variable containing:
-2.3005
-2.0559
-1.8671
-2.3311
-0.6538
[torch.FloatTensor of size 5]



##### Objective Functions (loss function / cost function)
- we train our network to minimize this function
- so that our network generalizes well
- i.e. has a small loss on unseen examples in the dev set, test set or production
- `negative log likelihood loss`: a very common objective function for multi-class classification
    - train the network to minimize the negative log probability of the correct output
    - equivalently, maximize the log probability of the correct output
       

#### Creating Network Components in PyTorch
- a network using only affine maps and non-linearities
- compute the loss function using negative log likelihood
- update the parameters by backpropagation
##### Logistic Regression Bag-of-Words Classifier
- takes in a sparse bag-of-words representation
- output: a probability distribution over the labels (here `SPANISH`: 0, `ENGLISH`: 1)
- example of the input: for a vocabulary of two words, 'hello': 0 and 'world': 1, the bag-of-words vector of a sentence is [Count(hello), Count(world)]



In [27]:
# Each training / test instance is a (list of words, language label) pair.
data = [("me gusta comer en la cafeteria".split(), "SPANISH"),
        ("Give it to me".split(), "ENGLISH"),
        ("No creo que sea una buena idea".split(), "SPANISH"),
        ("No it is not a good idea to get lost at sea".split(), "ENGLISH")]

test_data = [("Yo creo que si".split(), "SPANISH"),
             ("it is lost on me".split(), "ENGLISH")]

# word_to_ix assigns every distinct word (from the training and the test
# sentences) the next free integer, in order of first appearance.  That
# integer is the word's position in the Bag of words vector.
word_to_ix = {}
for sent, _ in data + test_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

NUM_LABELS = 2
VOCAB_SIZE = len(word_to_ix)


class BoWClassifier(nn.Module):  # inheriting from nn.Module!
    """Logistic-regression classifier over bag-of-words vectors.

    A single affine map takes a vocabulary-sized count vector to one score
    per label; log softmax turns the scores into log probabilities.
    """

    def __init__(self, num_labels, vocab_size):
        """Create the affine layer.

        Args:
            num_labels: number of output classes.
            vocab_size: length of the bag-of-words input vector.
        """
        # calls the init function of nn.Module.  Dont get confused by syntax,
        # just always do it in an nn.Module
        super(BoWClassifier, self).__init__()

        # The only parameters needed are A and b of the affine mapping, which
        # nn.Linear provides.  The input dimension is vocab_size because each
        # input is a count vector over the vocabulary; the output dimension is
        # num_labels because we want one score per label.
        self.linear = nn.Linear(vocab_size, num_labels)

        # NOTE! The non-linearity log softmax does not have parameters! So we don't need
        # to worry about that here

    def forward(self, bow_vec):
        """Return log probabilities over the labels.

        Args:
            bow_vec: bag-of-words input with one row per instance (the
                notebook passes a 1 x vocab_size row built by make_bow_vector).

        Returns:
            A tensor with one row per instance and num_labels columns.
        """
        # Normalise across the label axis (dim=1).  `dim` is given explicitly
        # because the implicit dimension choice is deprecated.
        return F.log_softmax(self.linear(bow_vec), dim=1)


def make_bow_vector(sentence, word_to_ix):
    """Count the words of `sentence` into a 1 x len(word_to_ix) tensor.

    Args:
        sentence: iterable of words; every word must be a key of word_to_ix.
        word_to_ix: mapping from word to its column in the count vector.

    Returns:
        A float tensor with a single row holding per-word counts.

    Raises:
        KeyError: if a word is not present in word_to_ix.
    """
    counts = torch.zeros(len(word_to_ix))
    for token in sentence:
        column = word_to_ix[token]
        counts[column] += 1
    # Add a leading axis so the vector reads as one row.
    return counts.view(1, -1)

# Integer index of each target label
label_to_ix = {"SPANISH": 0, "ENGLISH": 1}

def make_target(label, label_to_ix):
    """Return the index of `label` as a one-element LongTensor.

    Raises:
        KeyError: if `label` is not a key of label_to_ix.
    """
    index = label_to_ix[label]
    return torch.LongTensor([index])


model = BoWClassifier(num_labels=NUM_LABELS, vocab_size=VOCAB_SIZE)

# The model knows its own parameters: of the two printed below, the first is
# A and the second is b.  Assigning a component to an attribute in __init__
# (here: self.linear = nn.Linear(...)) is enough for the module
# (BoWClassifier) to keep track of that nn.Linear's parameters.
for param in model.parameters():
    print(param)

# To run the model, build a BoW vector, wrap it in an autograd.Variable,
# and call the model on it.
sample = data[0]
bow_vector = make_bow_vector(sample[0], word_to_ix)
wrapped_vector = autograd.Variable(bow_vector)
log_probs = model(wrapped_vector)
print(log_probs)

{'me': 0, 'gusta': 1, 'comer': 2, 'en': 3, 'la': 4, 'cafeteria': 5, 'Give': 6, 'it': 7, 'to': 8, 'No': 9, 'creo': 10, 'que': 11, 'sea': 12, 'una': 13, 'buena': 14, 'idea': 15, 'is': 16, 'not': 17, 'a': 18, 'good': 19, 'get': 20, 'lost': 21, 'at': 22, 'Yo': 23, 'si': 24, 'on': 25}
Parameter containing:

Columns 0 to 9 
 0.0641 -0.0007  0.0477 -0.1672 -0.1511  0.1126  0.1763 -0.1710 -0.0196 -0.0568
 0.0106 -0.1926  0.1514  0.0820 -0.0560 -0.0115  0.1602  0.1038  0.0484 -0.0128

Columns 10 to 19 
 0.0307  0.1733 -0.0360 -0.0471 -0.1031  0.1031  0.1582  0.1065  0.0289 -0.0779
-0.1899 -0.0906  0.1684  0.1301  0.0749  0.0201  0.1951 -0.1686 -0.1285 -0.0108

Columns 20 to 25 
-0.1950  0.1070  0.0459 -0.1361 -0.0680  0.0308
-0.1423  0.0952  0.1697 -0.1208  0.0772 -0.0140
[torch.FloatTensor of size 2x26]

Parameter containing:
-0.1702
-0.1058
[torch.FloatTensor of size 2]

Variable containing:
-0.7672 -0.6242
[torch.FloatTensor of size 1x2]



##### Let's train The BoW model
- pass instances through to get log probabilities
- compute a loss function
- compute the gradient of loss function
- update the parameter with gradient step
- loss function in `nn` package; nn.NLLLoss(): negative log likelihood loss
- input to NLLLoss: a vector of log probabilities and a target label
- `NLLLoss` doesn't compute the log probabilities itself
- therefore the last layer of our network is a log softmax

In [29]:
# Run on test data before we train, just to see a before-and-after
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Print the matrix column corresponding to "creo"
# (next(model.parameters()) is the first parameter of the model, the weight
# matrix A; its column word_to_ix["creo"] holds one weight per label)
print(next(model.parameters())[:, word_to_ix["creo"]])

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Usually you want to pass over the training data several times.
# The loop below makes 5 passes.  Usually, somewhere between 5 and 30 epochs
# is reasonable; toy data like this has far fewer instances than a real
# data set.
for epoch in range(5):
    for instance, label in data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Make our BOW vector and also we must wrap the target in a
        # Variable as an integer. For example, if the target is SPANISH, then
        # we wrap the integer 0. The loss function then knows that the 0th
        # element of the log probabilities is the log probability
        # corresponding to SPANISH
        bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
        target = autograd.Variable(make_target(label, label_to_ix))

        # Step 3. Run our forward pass.
        log_probs = model(bow_vec)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

# Same test instances again, now with the trained parameters
for instance, label in test_data:
    bow_vec = autograd.Variable(make_bow_vector(instance, word_to_ix))
    log_probs = model(bow_vec)
    print(log_probs)

# Index corresponding to Spanish goes up, English goes down!
print("matrix column corresponding to `creo` after training")
print(next(model.parameters())[:, word_to_ix["creo"]])

Variable containing:
-0.3004 -1.3491
[torch.FloatTensor of size 1x2]

Variable containing:
-1.3685 -0.2937
[torch.FloatTensor of size 1x2]

Variable containing:
 0.1953
-0.3544
[torch.FloatTensor of size 2]

Variable containing:
-0.2353 -1.5624
[torch.FloatTensor of size 1x2]

Variable containing:
-1.6156 -0.2216
[torch.FloatTensor of size 1x2]

matrix column corresponding to `creo` after training
Variable containing:
 0.2557
-0.4148
[torch.FloatTensor of size 2]



## Word Embeddings:
### Encoding Lexical Semantics in PyTorch
- word embeddings are a representation of the semantics of a word, efficiently encoding `semantic` information that might be relevant to the task at hand
- module `torch.nn.Embedding` allows us to use embeddings in torch.
    - it takes two arguments: vocabulary_size, embedding_dimensionality

In [30]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10d1ee390>

In [31]:
word_to_ix = {"hello": 0, "world": 1}
# One 5-dimensional embedding per vocabulary word (2 words here)
embeds = nn.Embedding(len(word_to_ix), 5)
# Embeddings are looked up by integer index, wrapped in a LongTensor
hello_index = word_to_ix["hello"]
lookup_tensor = torch.LongTensor([hello_index])
hello_embed = embeds(autograd.Variable(lookup_tensor))
print(hello_embed)

Variable containing:
-2.9718  1.7070 -0.4305 -2.2820  0.5237
[torch.FloatTensor of size 1x5]



### N-Gram Language Modeling
- let's compute the loss on some training examples
- update the parameters with backpropagation

In [32]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
# We will use Shakespeare Sonnet 2
test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()
# we should tokenize the input, but we will ignore that for now
# build a list of tuples.  Each tuple is ([ word_i-2, word_i-1 ], target word)
trigrams = [([test_sentence[i], test_sentence[i + 1]], test_sentence[i + 2]) for i in range(len(test_sentence) - 2)]
# print the first 3, just so you can see what they look like
print(trigrams[:3])

vocab = set(test_sentence)
# Number the words in sorted order.  Iteration order of a set of strings can
# change from one interpreter run to the next, which would give every word a
# different index (and so different results) on each run despite the fixed seed.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}


class NGramLanguageModeler(nn.Module):
    """Feed-forward n-gram language model.

    The embeddings of the context words are concatenated, passed through a
    128-unit ReLU layer and mapped to log probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size: number of distinct words (rows of the embedding table
                and size of the output layer).
            embedding_dim: length of each word embedding.
            context_size: number of context words fed in per prediction.
        """
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The first layer sees all context embeddings laid side by side.
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return a 1 x vocab_size row of log probabilities for the next word.

        Args:
            inputs: LongTensor holding the indices of the context words.
        """
        # Flatten the looked-up embeddings into a single row.
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # Normalise across the vocabulary axis; `dim` is explicit because the
        # implicit dimension choice is deprecated.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs


# Train the n-gram model with plain SGD, recording the summed loss of each
# pass over the trigrams in `losses`.
losses = []
loss_function = nn.NLLLoss()
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)

for epoch in range(10):
    # Fresh accumulator per epoch, so each entry of `losses` is its own tensor
    total_loss = torch.Tensor([0])
    for context, target in trigrams:  # one trigram at a time

        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in variables)
        context_idxs = [word_to_ix[w] for w in context]
        context_var = autograd.Variable(torch.LongTensor(context_idxs))

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old
        # instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting log probabilities over next
        # words
        log_probs = model(context_var)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a variable)
        loss = loss_function(log_probs, autograd.Variable(torch.LongTensor([word_to_ix[target]])))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # .data gives the loss value without its autograd history
        total_loss += loss.data
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!

[(['When', 'forty'], 'winters'), (['forty', 'winters'], 'shall'), (['winters', 'shall'], 'besiege')]
[
 521.1187
[torch.FloatTensor of size 1]
, 
 518.6967
[torch.FloatTensor of size 1]
, 
 516.2914
[torch.FloatTensor of size 1]
, 
 513.9013
[torch.FloatTensor of size 1]
, 
 511.5252
[torch.FloatTensor of size 1]
, 
 509.1629
[torch.FloatTensor of size 1]
, 
 506.8138
[torch.FloatTensor of size 1]
, 
 504.4771
[torch.FloatTensor of size 1]
, 
 502.1527
[torch.FloatTensor of size 1]
, 
 499.8381
[torch.FloatTensor of size 1]
]


### Word Embeddings: Continuous Bag-of-Words
- tries to predict words given the context of a few words before and a few words after the target word
- distinct from language modeling, since CBOW is not sequential and does not have to be probabilistic.
-  CBOW is used to quickly train word embeddings, and these embeddings are used to initialize the embeddings of some more complicated model
- referred to as `pretraining embeddings`

In [42]:
CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right
EMBEDDING_DIM = 10

raw_text = """We are about to study the idea of a computational process.
Computational processes are abstract beings that inhabit computers.
As they evolve, processes manipulate other abstract things called data.
The evolution of a process is directed by a pattern of rules
called a program. People create programs to direct processes. In effect,
we conjure the spirits of the computer with our spells.""".split()

# By deriving a set from `raw_text`, we deduplicate the array
vocab = set(raw_text)
vocab_size = len(vocab)

# Number the words in sorted order.  Iteration order of a set of strings can
# change from one interpreter run to the next, which would give every word a
# different index (and so different results) on each run despite the fixed seed.
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}

# Every position that has CONTEXT_SIZE words on both sides becomes one
# (context, target) example.  The window is driven by CONTEXT_SIZE, so
# changing the constant above changes the examples consistently.
data = []
for i in range(CONTEXT_SIZE, len(raw_text) - CONTEXT_SIZE):
    context = raw_text[i - CONTEXT_SIZE:i] + raw_text[i + 1:i + CONTEXT_SIZE + 1]
    target = raw_text[i]
    data.append((context, target))
print(data[:5])



class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Predicts a target word from the embeddings of the words surrounding it:
    the context embeddings are concatenated, passed through a 128-unit ReLU
    layer and mapped to log probabilities over the vocabulary.
    """

    def __init__(self, vocab_size, embedding_dim, context_size):
        """
        Args:
            vocab_size: number of distinct words.
            embedding_dim: length of each word embedding.
            context_size: number of context words taken on EACH side of the
                target, so 2 * context_size embeddings are fed in.
        """
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(2*context_size * embedding_dim, 128) # 2 for context from both the sides
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return a 1 x vocab_size row of log probabilities for the target.

        Args:
            inputs: LongTensor holding the indices of the context words.
        """
        # Flatten the looked-up embeddings into a single row.
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        # Normalise across the vocabulary axis; `dim` is explicit because the
        # implicit dimension choice is deprecated.
        log_probs = F.log_softmax(out, dim=1)
        return log_probs
# create your model and train.  here are some functions to help you make
# the data ready for use by your module


def make_context_vector(context, word_to_ix):
    """Turn a list of context words into a Variable of their indices.

    Raises:
        KeyError: if a word is not a key of word_to_ix.
    """
    return autograd.Variable(torch.LongTensor([word_to_ix[w] for w in context]))


# make_context_vector(data[0][0], word_to_ix)  # example

# Train the CBOW model with plain SGD, recording the summed loss of each
# pass over `data` in `losses`.
losses = []
loss_function = nn.NLLLoss()
model = CBOW(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)


for epochs in range(10):
    # Fresh accumulator per epoch, so each entry of `losses` is its own tensor
    total_loss = torch.Tensor([0])
    for context, target in data:
        # Indices of the surrounding words, wrapped in a Variable
        context_var = make_context_vector(context, word_to_ix)
        # Gradients accumulate, so clear them before each instance
        model.zero_grad()
        log_probs = model(context_var)
        # NLLLoss takes the log probabilities and the index of the target word
        loss = loss_function(log_probs, autograd.Variable(torch.LongTensor([word_to_ix[target]])))
        loss.backward()
        optimizer.step()
        # .data gives the loss value without its autograd history
        total_loss +=loss.data
    losses.append(total_loss)
print(losses)

[(['We', 'are', 'to', 'study'], 'about'), (['are', 'about', 'study', 'the'], 'to'), (['about', 'to', 'the', 'idea'], 'study'), (['to', 'study', 'idea', 'of'], 'the'), (['study', 'the', 'of', 'a'], 'idea')]
[
 227.7815
[torch.FloatTensor of size 1]
, 
 226.2712
[torch.FloatTensor of size 1]
, 
 224.7702
[torch.FloatTensor of size 1]
, 
 223.2798
[torch.FloatTensor of size 1]
, 
 221.7987
[torch.FloatTensor of size 1]
, 
 220.3266
[torch.FloatTensor of size 1]
, 
 218.8619
[torch.FloatTensor of size 1]
, 
 217.4039
[torch.FloatTensor of size 1]
, 
 215.9532
[torch.FloatTensor of size 1]
, 
 214.5096
[torch.FloatTensor of size 1]
]


## Sequence Models and Long Short Term Memory Networks

- in feed forward network, no state between the input is maintained
- what if we want our model to consider some sort of dependencies through time between our input
- Examples of these models: 
    - Hidden markov models for Part of Speech tagging
    - Conditional Random Field

#### RNN: recurrent neural network
- RNN stores some kind of state
- its output could be used as part of next input
 
#### LSTM in PyTorch
- expects each input to be a 3D Tensor
    - first axis: the sequence itself
    - second axis: instances in the mini-batch
    - third axis: element of the input

In [43]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x10d1ee390>

In [54]:
lstm = nn.LSTM(3, 3)  # Input dim is 3, output dim is 3
# A sequence of length 5; each element is a 1x3 row
inputs = [autograd.Variable(torch.randn(1, 3)) for _ in range(5)]

# Initialize the hidden state (a pair of 1x1x3 tensors).
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn(1, 1, 3)))

# Option 1: step through the sequence one element at a time.
# After each step, hidden contains the hidden state.
for step_input in inputs:
    out, hidden = lstm(step_input.view(1, 1, -1), hidden)

# Option 2: process the entire sequence in a single call.
# The first value returned by the LSTM holds all of the hidden states
# throughout the sequence; the second is just the most recent hidden state
# (compare the last slice of "out" with "hidden" below, they are the same).
# Why both?
#   "out" gives access to every hidden state in the sequence
#   "hidden" lets you continue the sequence and backpropagate, by passing it
#   as an argument to the lstm at a later time
# The view adds the extra 2nd (mini-batch) dimension.
inputs = torch.cat(inputs).view(len(inputs), 1, -1)
# Start again from a fresh random hidden state
hidden = (autograd.Variable(torch.randn(1, 1, 3)),
          autograd.Variable(torch.randn(1, 1, 3)))

out, hidden = lstm(inputs, hidden)
print(out)
print("-"*10)
print(hidden)

Variable containing:
(0 ,.,.) = 
 -0.3343  0.1457  0.1078

(1 ,.,.) = 
 -0.2018  0.1239  0.0529

(2 ,.,.) = 
 -0.3114  0.1585  0.1186

(3 ,.,.) = 
 -0.4305  0.1928  0.1592

(4 ,.,.) = 
 -0.2115  0.0799  0.0866
[torch.FloatTensor of size 5x1x3]

----------
(Variable containing:
(0 ,.,.) = 
 -0.2115  0.0799  0.0866
[torch.FloatTensor of size 1x1x3]
, Variable containing:
(0 ,.,.) = 
 -0.6231  0.1859  0.3601
[torch.FloatTensor of size 1x1x3]
)


### LSTM for Part of Speech Tagging

In [56]:
# Let's prepare the data

def prepare_sequence(seq, to_ix):
    """Map every item of `seq` through `to_ix` and wrap the indices.

    Args:
        seq: sequence of words (or tags).
        to_ix: mapping from item to integer index.

    Returns:
        An autograd.Variable around a LongTensor of the indices, in order.

    Raises:
        KeyError: if an item is not a key of to_ix.
    """
    indices = []
    for item in seq:
        indices.append(to_ix[item])
    return autograd.Variable(torch.LongTensor(indices))


# Each instance pairs a tokenised sentence with one tag per token.
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

# Give every distinct word the next free index, in order of first appearance.
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        word_to_ix.setdefault(word, len(word_to_ix))
print(word_to_ix)

tag_to_ix = {"DET": 0, "NN": 1, "V": 2}
print(tag_to_ix)

# Real models would usually use something like 32 or 64 dimensions.
# They are kept small here so we can see how the weights change as we train.
HIDDEN_DIM = 6
EMBEDDING_DIM = 6

{'The': 0, 'dog': 1, 'ate': 2, 'the': 3, 'apple': 4, 'Everybody': 5, 'read': 6, 'that': 7, 'book': 8}
{'DET': 0, 'NN': 1, 'V': 2}


In [57]:
# Create the Model

class LSTMTagger(nn.Module):
    """LSTM part-of-speech tagger.

    Words are embedded, run through an LSTM one sentence at a time, and each
    LSTM output is mapped to log probabilities over the tag set.

    The LSTM state is kept in `self.hidden` and is overwritten by every
    forward pass; callers reset it with `init_hidden()` between sentences.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """
        Args:
            embedding_dim: length of each word embedding.
            hidden_dim: size of the LSTM hidden state.
            vocab_size: number of distinct words.
            tagset_size: number of distinct tags.
        """
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero LSTM state (a pair of tensors)."""
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        """Score every tag for every word of one sentence.

        Args:
            sentence: LongTensor of word indices.

        Returns:
            A len(sentence) x tagset_size tensor of log probabilities; row i
            holds the tag scores of word i.
        """
        embeds = self.word_embeddings(sentence)
        # Reshape to (sequence length, mini-batch of 1, embedding) for the LSTM
        # and remember the resulting state on the module.
        lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        # Normalise across the tag axis of each row; `dim` is explicit because
        # the implicit dimension choice is deprecated.
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [59]:
# Train the model

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print("tag_scores before optimization")
print(tag_scores)

for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tags in training_data:
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Variables of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)

        # Step 3. Run our forward pass.
        tag_scores = model(sentence_in)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# See what the scores are after training
inputs = prepare_sequence(training_data[0][0], word_to_ix)  # word indices of the first training sentence
tag_scores = model(inputs)
# The sentence is "the dog ate the apple".  i,j corresponds to score for tag j
#  for word i. The predicted tag is the maximum scoring tag.
# Here, we can see the predicted sequence below is 0 1 2 0 1
# since 0 is index of the maximum value of row 1,
# 1 is the index of maximum value of row 2, etc.
# Which is DET NOUN VERB DET NOUN, the correct sequence!
print(tag_scores)

tag_scores before optimization
Variable containing:
-1.2316 -0.8447 -1.2783
-1.3461 -0.7961 -1.2425
-1.2964 -0.8080 -1.2704
-1.1575 -0.8825 -1.3020
-1.1765 -0.8615 -1.3127
[torch.FloatTensor of size 5x3]

Variable containing:
-0.0905 -3.0901 -3.1928
-5.5411 -0.0562 -2.9813
-2.4526 -2.2690 -0.2101
-0.0999 -4.5885 -2.4659
-5.6176 -0.0134 -4.6370
[torch.FloatTensor of size 5x3]



# IMPLEMENT: HomeWork

### LSTM PoS Tagger with Character level Features
- Two LSTMs in the models
    - The original one outputs the PoS tag scores
    - One new LSTM: to get a character-level representation of words, run an LSTM over the characters of each word
        - To do a character-level sequence, we need to embed characters
        - these character embeddings will be the input to the character-level LSTM
        - it outputs a character-level representation of each word
        
- Augment the word embeddings with a representations derived from characters of the word

## Making Dynamic Decisions and the Bi-LSTM CRF