### Libraries

In [1]:
import os
import time
import math
import pickle
from datetime import datetime

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from tqdm import tqdm

from utils.parser_utils import minibatches, load_and_preprocess_data, AverageMeter

### Model

In [2]:
class ParserModel(nn.Module):
    """ Feedforward neural network with an embedding layer and single hidden layer.
    The ParserModel will predict which transition should be applied to a
    given partial parse configuration.

    Data flow (see `forward`):
        token ids -> embedding lookup -> flatten -> linear -> ReLU -> dropout -> linear -> logits

    PyTorch Notes:
        - "ParserModel" is a subclass of "nn.Module", like every PyTorch network.
        - Layers are created in "__init__" and stored with a "self." prefix so that
            the other methods (and the optimizer, through `parameters()`) can reach them.
        - For further documentation on "nn.Module" please see https://pytorch.org/docs/stable/nn.html.
    """
    def __init__(self, embeddings, n_features=36,
        hidden_size=200, n_classes=3, dropout_prob=0.5):
        """ Initialize the parser model.

        @param embeddings (Tensor or array-like): word embeddings (num_words, embedding_size).
                            The values are copied, so the caller's object is never modified
                            by training.
        @param n_features (int): number of input features
        @param hidden_size (int): number of hidden units
        @param n_classes (int): number of output classes
        @param dropout_prob (float): dropout probability
        """
        super(ParserModel, self).__init__()
        self.n_features = n_features
        self.n_classes = n_classes
        self.dropout_prob = dropout_prob
        self.hidden_size = hidden_size

        # Copy the pretrained matrix into a fresh tensor:
        #   - `as_tensor(...).clone().detach()` accepts both Tensors and numpy arrays
        #     without the "copy construct" UserWarning that `torch.tensor(tensor)` emits;
        #   - casting to the default dtype keeps the embeddings compatible with the
        #     Linear layers below (a float64 matrix would otherwise fail in `forward`).
        weights = torch.as_tensor(embeddings, dtype=torch.get_default_dtype()).clone().detach()
        self.embed_size = weights.shape[1]

        # The Embedding module is still constructed normally (and then has its weight
        # replaced) so that the random-number stream consumed before the Xavier
        # initialisations below is the same as it has always been.
        self.pretrained_embeddings = nn.Embedding(weights.shape[0], self.embed_size)
        self.pretrained_embeddings.weight = nn.Parameter(weights)

        # Hidden layer: consumes the concatenation of all feature embeddings.
        # Weights use Xavier uniform initialisation with the default gain of 1;
        # biases keep nn.Linear's default initialisation.
        #   Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        #   Xavier Init: https://pytorch.org/docs/stable/nn.html#torch.nn.init.xavier_uniform_
        #   Dropout: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
        self.embed_to_hidden = nn.Linear(n_features*self.embed_size, hidden_size)
        nn.init.xavier_uniform_(self.embed_to_hidden.weight)
        self.dropout = nn.Dropout(dropout_prob)
        self.hidden_to_logits = nn.Linear(hidden_size, n_classes)
        nn.init.xavier_uniform_(self.hidden_to_logits.weight)


    def embedding_lookup(self, t):
        """ Utilize `self.pretrained_embeddings` to map input `t` from input tokens (integers)
            to embedding vectors.

            @param t (Tensor): input tensor of tokens (batch_size, n_features)

            @return x (Tensor): tensor of embeddings for words represented in t
                                (batch_size, n_features * embed_size)
        """
        # Lookup gives (batch_size, n_features, embed_size) ...
        x = self.pretrained_embeddings(t)
        # ... which is flattened per example so it can feed a Linear layer.
        x = x.view(x.size(0), -1)

        return x


    def forward(self, t):
        """ Run the model forward.

            Note that we will not apply the softmax function here because it is included
            in the loss function nn.CrossEntropyLoss.

            Calling the module, e.g. `output = model(t)`, invokes this method.

        @param t (Tensor): input tensor of tokens (batch_size, n_features)

        @return logits (Tensor): tensor of predictions (output after applying the layers of the network)
                                 without applying softmax (batch_size, n_classes)
        """
        embs = self.embedding_lookup(t)
        hidden = F.relu(self.embed_to_hidden(embs))
        # Dropout is only active in train mode (`model.train()`); it is a no-op after `model.eval()`.
        logits = self.hidden_to_logits(self.dropout(hidden))

        return logits

### Train Functions

In [3]:
def train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size):
    """ Train the neural dependency parser for single epoch.

    Note: In PyTorch we can signify train versus test and automatically have
    the Dropout Layer applied and removed, accordingly, by specifying
    whether we are training, `model.train()`, or evaluating, `model.eval()`

    @param parser (Parser): Neural Dependency Parser; must expose `.model` (the network)
                            and `.parse(data)` whose first return value is the UAS
    @param train_data (): training examples, consumed through `minibatches`
    @param dev_data (): validation data, passed unchanged to `parser.parse`
    @param optimizer (nn.Optimizer): Adam Optimizer
    @param loss_func (nn.CrossEntropyLoss): Cross Entropy Loss Function
    @param batch_size (int): batch size

    @return dev_UAS (float): Unlabeled Attachment Score (UAS) for dev data

    Side effect: the model is left in eval mode when this function returns.
    """
    parser.model.train() # Places model in "train" mode, i.e. apply dropout layer
    n_minibatches = math.ceil(len(train_data) / batch_size)
    loss_meter = AverageMeter()

    with tqdm(total=(n_minibatches)) as prog:
        for train_x, train_y in minibatches(train_data, batch_size):
            optimizer.zero_grad()   # gradients accumulate by default; clear the previous batch's
            train_x = torch.from_numpy(train_x).long()
            # Column index of each non-zero entry becomes the class id.
            # NOTE(review): this assumes `train_y` is a one-hot (batch, n_classes) numpy
            # array with exactly one non-zero per row - confirm against `minibatches`.
            train_y = torch.from_numpy(train_y.nonzero()[1]).long()

            # Forward pass -> cross-entropy on raw logits -> backprop -> parameter update.
            logits = parser.model(train_x)
            loss = loss_func(logits, train_y)
            loss.backward()
            optimizer.step()

            prog.update(1)
            loss_meter.update(loss.item())

    print ("Average Train Loss: {}".format(loss_meter.avg))

    print("Evaluating on Validation Set",)
    parser.model.eval() # Places model in "eval" mode, i.e. don't apply dropout layer
    # Evaluation never back-propagates, so skip building the autograd graph.
    with torch.no_grad():
        dev_UAS, _ = parser.parse(dev_data)
    print("- Val UAS: {:.2f}".format(dev_UAS * 100.0))
    return dev_UAS

In [4]:
def train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.0005):
    """ Train the neural dependency parser.

    Runs `n_epochs` epochs with Adam and cross-entropy loss, and checkpoints the
    model's `state_dict` to `output_path` every time the dev UAS improves. The first
    epoch is always checkpointed, so `output_path` exists after any run with
    `n_epochs >= 1`, even if the dev UAS never rises above zero.

    @param parser (Parser): Neural Dependency Parser
    @param train_data ():
    @param dev_data ():
    @param output_path (str): Path to which model weights and results are written.
                              Missing parent directories are created.
    @param batch_size (int): Number of examples in a single batch
    @param n_epochs (int): Number of training epochs
    @param lr (float): Learning rate

    @return best_dev_UAS (float or None): best dev UAS seen, or None if no epoch was run
    """
    # None (rather than 0) so that the very first epoch always counts as "best".
    best_dev_UAS = None

    # Make sure the checkpoint can be written before spending time on training.
    if isinstance(output_path, (str, os.PathLike)):
        checkpoint_dir = os.path.dirname(output_path)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)

    # Adam Optimizer: https://pytorch.org/docs/stable/optim.html
    # Cross Entropy Loss: https://pytorch.org/docs/stable/nn.html#crossentropyloss
    optimizer = optim.Adam(parser.model.parameters(), lr=lr)
    loss_func = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        print("Epoch {:} out of {:}".format(epoch + 1, n_epochs))

        dev_UAS = train_for_epoch(parser, train_data, dev_data, optimizer, loss_func, batch_size)

        if best_dev_UAS is None or dev_UAS > best_dev_UAS:
            best_dev_UAS = dev_UAS
            print("New best dev UAS! Saving model.")
            torch.save(parser.model.state_dict(), output_path)
        print("")

    return best_dev_UAS

### TRAIN AND TEST

In [5]:
# Section banner: an 80-character rule above and below the stage name.
print("=" * 80, "INITIALIZING", "=" * 80, sep="\n")

# Load the parser, the pretrained embedding matrix and the three data splits.
# NOTE(review): the positional `True` presumably selects a reduced/debug dataset -
# confirm against `utils.parser_utils.load_and_preprocess_data`.
parser, embeddings, train_data, dev_data, test_data = load_and_preprocess_data(True)

# Time only the model construction, then attach the network to the parser so that
# the training and evaluation code can reach it as `parser.model`.
start = time.time()
model = ParserModel(embeddings)
parser.model = model
print("took {:.2f} seconds\n".format(time.time() - start))

INITIALIZING
Loading data...
took 1.81 seconds
Building parser...
took 0.02 seconds
Loading pretrained embeddings...
took 2.37 seconds
Vectorizing data...
took 0.06 seconds
Preprocessing training data...
took 0.94 seconds
took 0.01 seconds



In [7]:
print(80 * "=")
print("TRAINING")
print(80 * "=")
# Every run writes to its own timestamped directory, so earlier checkpoints are kept.
# `output_path` is read again by the testing cell to restore the best weights.
output_dir = "results/{:%Y%m%d_%H%M%S}/".format(datetime.now())
output_path = output_dir + "model.weights"

# `exist_ok=True` replaces the separate exists() check: one call, no check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Note: lr=0.001 here overrides `train`'s default learning rate.
train(parser, train_data, dev_data, output_path, batch_size=1024, n_epochs=10, lr=0.001)

TRAINING
Epoch 1 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 34.32it/s]


Average Train Loss: 0.27971369586884975
Evaluating on Validation Set


125250it [00:00, 8806245.51it/s]                                                                                       


- Val UAS: 65.06
New best dev UAS! Saving model.

Epoch 2 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 33.10it/s]


Average Train Loss: 0.21410703317572674
Evaluating on Validation Set


125250it [00:00, 10773703.90it/s]                                                                                      


- Val UAS: 67.46
New best dev UAS! Saving model.

Epoch 3 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 30.87it/s]


Average Train Loss: 0.18367853667587042
Evaluating on Validation Set


125250it [00:00, 9421556.63it/s]                                                                                       


- Val UAS: 69.97
New best dev UAS! Saving model.

Epoch 4 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 34.71it/s]


Average Train Loss: 0.16109992299849787
Evaluating on Validation Set


125250it [00:00, 8193022.08it/s]                                                                                       


- Val UAS: 72.63
New best dev UAS! Saving model.

Epoch 5 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 32.51it/s]


Average Train Loss: 0.14576697566856942
Evaluating on Validation Set


125250it [00:00, 15743723.81it/s]                                                                                      


- Val UAS: 72.80
New best dev UAS! Saving model.

Epoch 6 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 35.76it/s]


Average Train Loss: 0.12828764785081148
Evaluating on Validation Set


125250it [00:00, 7860672.08it/s]                                                                                       


- Val UAS: 73.46
New best dev UAS! Saving model.

Epoch 7 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 33.87it/s]


Average Train Loss: 0.11631873187919457
Evaluating on Validation Set


125250it [00:00, 8533039.49it/s]                                                                                       


- Val UAS: 74.81
New best dev UAS! Saving model.

Epoch 8 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 33.26it/s]


Average Train Loss: 0.11130194707463185
Evaluating on Validation Set


125250it [00:00, 17774879.92it/s]                                                                                      


- Val UAS: 74.52

Epoch 9 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 32.32it/s]


Average Train Loss: 0.10017089343940218
Evaluating on Validation Set


125250it [00:00, 7338742.96it/s]                                                                                       


- Val UAS: 75.28
New best dev UAS! Saving model.

Epoch 10 out of 10


100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:01<00:00, 36.39it/s]


Average Train Loss: 0.0942362720767657
Evaluating on Validation Set


125250it [00:00, 10610073.64it/s]                                                                                      

- Val UAS: 74.40






In [8]:
print(80 * "=")
print("TESTING")
print(80 * "=")
print("Restoring the best model weights found on the Validation set")
# `output_path` comes from the training cell; that cell must have run in this kernel.
# The weights after the final epoch are not necessarily the best ones, hence the reload.
parser.model.load_state_dict(torch.load(output_path))
print("Final Evaluation on Test set",)
parser.model.eval()  # disable dropout for evaluation
# Evaluation never back-propagates, so skip building the autograd graph.
with torch.no_grad():
    UAS, dependencies = parser.parse(test_data)
print("- Test UAS: {:.2f}".format(UAS * 100.0))

TESTING
Restoring the best model weights found on the Validation set
Final Evaluation on Test set


125250it [00:00, 14759962.24it/s]                                                                                      

- Test UAS: 76.61



