## Simple Named Entity Recognition

In [2]:
# Mount Google Drive at /content/drive; the dataset paths used further down live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [63]:
import os
import sys
import utils 
import torch
import logging
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random

from tqdm import trange
from torch.autograd import Variable
sys.path.append(".")

### 1) Building the model

In [64]:
class Model(nn.Module):
    """Recurrent tagger: embedding -> LSTM -> linear layer -> log-softmax over NER tags."""

    def __init__(self, params):
        """
        Build the three layers of the tagger.

        - embedding: maps each index in range(params.vocab_size) to a params.embedding_dim vector
        - lstm: consumes the embedded sequence (batch first) and emits one hidden state per token
        - fc: projects each hidden state onto params.number_of_tags scores

        Args:
            params: (Params) must expose vocab_size, embedding_dim, lstm_hidden_dim and number_of_tags
        """
        super().__init__()
        self.embedding = nn.Embedding(params.vocab_size, params.embedding_dim)
        self.lstm = nn.LSTM(params.embedding_dim, params.lstm_hidden_dim,
                            batch_first=True)
        self.fc = nn.Linear(params.lstm_hidden_dim, params.number_of_tags)

    def forward(self, s):
        """
        Compute per-token log-probabilities over the tag set.

        Args:
            s: batch of token indices, batch_size x seq_len

        Returns:
            tensor of dimension batch_size*seq_len x number_of_tags holding log-softmax scores
        """
        embedded = self.embedding(s)            # batch_size x seq_len x embedding_dim
        hidden_states, _ = self.lstm(embedded)  # batch_size x seq_len x lstm_hidden_dim

        # flatten the batch and time axes so every token becomes one row;
        # view() needs contiguous memory, hence the explicit contiguous()
        hidden_dim = hidden_states.shape[2]
        token_rows = hidden_states.contiguous().view(-1, hidden_dim)

        tag_scores = self.fc(token_rows)
        # normalise over the tag axis, independently for every token
        return F.log_softmax(tag_scores, dim=1)

### 2) Loss Function
Cross Entropy Loss

In [65]:

def loss_fn(outputs, labels):
    """Masked negative log-likelihood averaged over the non-padding tokens of a batch.

    Args:
        outputs: (Variable) dimension batch_size*seq_len x num_tags - log softmax output of the model
        labels: (Variable) dimension batch_size x seq_len where each element is either a label in
            [0, 1, ... num_tag-1], or -1 in case it is a PADding token.
    Returns:
        loss: (Variable) cross entropy loss for all tokens in the batch
    """
    num_tags = outputs.shape[1]

    # one label per row of `outputs`
    flat_labels = labels.view(-1)

    # 1.0 for real tokens, 0.0 for padding (label -1)
    keep = (flat_labels >= 0).float()
    num_tokens = int(keep.sum())

    # -1 cannot be used as a column index, so wrap it into the valid range;
    # the wrapped value is irrelevant because `keep` zeroes those rows out
    safe_labels = flat_labels % num_tags

    # log-probability the model assigned to the (wrapped) gold tag of every token
    picked = outputs.gather(1, safe_labels.unsqueeze(1)).squeeze(1)

    return -torch.sum(picked * keep) / num_tokens



### 3) Accuracy

In [66]:
def accuracy(outputs, labels):
    """Token-level accuracy over a batch, with padding positions left out of the denominator.

    Args:
        outputs: numpy array with one row of tag scores per token
        labels: numpy array of gold tag indices, -1 marking padding

    Returns:
        number of tokens whose arg-max tag equals the gold tag, divided by the
        number of tokens with a label >= 0
    """
    gold = labels.ravel()
    # predicted tag = highest-scoring column of each row
    predicted = np.argmax(outputs, axis=1)

    # padding carries label -1, which an arg-max index can never equal,
    # so padded positions never count as hits
    num_hits = np.sum(predicted == gold)
    num_real_tokens = np.sum(gold >= 0)
    return num_hits / float(num_real_tokens)



In [67]:
# Registry of metric name -> function(outputs, labels). The training and evaluation loops call
# every entry on the numpy-converted model outputs and labels of a batch.
metrics = {
    'accuracy': accuracy,
    # add more metrics if required for each token type
}

### 4) Data Preparation

##### 1) Creating data loader

In [68]:

class DataLoader(object):
    """Stores dataset_params, vocabulary and tags with their mapping to indices."""

    def __init__(self, data_dir, params):
        """Load dataset_params, vocabulary and tags from data_dir.

        Ensure you have already run build_vocab.py on data_dir.

        Args:
            data_dir: (string) directory containing dataset_params.json, words.txt and tags.txt
            params: (Params) hyperparameters object; it is updated in place with the dataset parameters
        """
        json_path = os.path.join(data_dir, 'dataset_params.json')
        assert os.path.isfile(
            json_path), "No json file found at {}, run build_vocab.py".format(json_path)
        self.dataset_params = utils.Params(json_path)

        # vocabulary: one word per line, the line number is the word's index
        vocab_path = os.path.join(data_dir, 'words.txt')
        with open(vocab_path) as f:
            self.vocab = {word: idx for idx,
                          word in enumerate(f.read().splitlines())}

        # indices of the UNKnown-word and PADding symbols
        self.unk_ind = self.vocab[self.dataset_params.unk_word]
        self.pad_ind = self.vocab[self.dataset_params.pad_word]

        # tags: one tag per line, the line number is the tag's index
        tags_path = os.path.join(data_dir, 'tags.txt')
        with open(tags_path) as f:
            self.tag_map = {tag: idx for idx,
                            tag in enumerate(f.read().splitlines())}

        # adding dataset parameters to param
        params.update(json_path)

    def load_sentences_labels(self, sentences_file, labels_file, d):
        """
        Loads sentences and labels from their corresponding files. Maps tokens and tags to their indices and stores
        them in the provided dict d.
        Args:
                sentences_file: (string) file with sentences with tokens space-separated
                labels_file: (string) file with NER tags for the sentences in sentences_file
                d: (dict) a dictionary in which the loaded data is stored under 'data', 'labels' and 'size'
        """
        with open(sentences_file) as f:
            # tokens missing from the vocabulary fall back to the UNK index
            sentences = [[self.vocab.get(token, self.unk_ind) for token in line.split(' ')]
                         for line in f.read().splitlines()]

        with open(labels_file) as f:
            labels = [[self.tag_map[label] for label in line.split(' ')]
                      for line in f.read().splitlines()]

        # ensure there is a tag for each token
        assert len(labels) == len(sentences)
        for tags, tokens in zip(labels, sentences):
            assert len(tags) == len(tokens)

        # storing sentences and labels in a dict
        d['data'] = sentences
        d['labels'] = labels
        d['size'] = len(sentences)

    def load_data(self, types, data_dir):
        """Loads data for each type in types from data_dir.

        Args:
            types: (list) any of 'train', 'val', 'test'
            data_dir: (string) directory containing one sub-directory per split

        Returns:
            data: (dict) split name -> dict with keys 'data', 'labels' and 'size'
        """
        data = {}
        for split in ['train', 'val', 'test']:
            if split in types:
                sentences_file = os.path.join(data_dir, split, 'sentences.txt')
                labels_file = os.path.join(data_dir, split, 'labels.txt')
                data[split] = {}
                self.load_sentences_labels(
                    sentences_file, labels_file, data[split])

        return data

    def data_iterator(self, data, params, shuffle=False):
        """
        Returns a generator that yields batches of data with labels. Batch size is params.batch_size. Expires after one
        pass over the data.
        Args:
            data: (dict) contains data which has keys 'data', 'labels' and 'size'
            params: (Params) hyperparameters of the training process; batch_size and cuda are read here.
            shuffle: (bool) whether the data should be shuffled
        Yields:
            batch_data: (LongTensor) dimension batch_size x seq_len with the sentence data, padded with pad_ind
            batch_labels: (LongTensor) dimension batch_size x seq_len with the corresponding labels, padded with -1
        """
        # make a list that decides the order in which we go over the data - this avoids explicit shuffling of data
        order = list(range(data['size']))
        if shuffle:
            # NOTE: the fixed seed means every shuffled pass visits the data in the same order
            random.seed(230)
            random.shuffle(order)

        batch_size = params.batch_size
        # Parenthesised on purpose: without the parentheses `1 // batch_size` is evaluated first,
        # which made the loop run data['size'] times and crash on the first empty batch.
        # This matches the step count computed as (size + 1) // batch_size by the training loop.
        num_batches = (data['size'] + 1) // batch_size

        # one pass over data
        for i in range(num_batches):
            batch_indices = order[i * batch_size:(i + 1) * batch_size]
            if not batch_indices:
                # the formula over-counts by one when batch_size == 1; nothing is left to yield
                break

            # fetch sentences and tags
            batch_sentences = [data['data'][idx] for idx in batch_indices]
            batch_tags = [data['labels'][idx] for idx in batch_indices]

            # compute length of longest sentence in the batch
            batch_max_len = max(len(s) for s in batch_sentences)

            # integer arrays pre-filled with the padding values: pad_ind for tokens, -1 for labels.
            # -1 differentiates tokens with tags from PADding tokens.
            shape = (len(batch_sentences), batch_max_len)
            batch_data = np.full(shape, self.pad_ind, dtype=np.int64)
            batch_labels = np.full(shape, -1, dtype=np.int64)

            # copy the data to the numpy array
            for j, (tokens, tags) in enumerate(zip(batch_sentences, batch_tags)):
                cur_len = len(tokens)
                batch_data[j, :cur_len] = tokens
                batch_labels[j, :cur_len] = tags

            # int64 numpy arrays map directly onto LongTensors
            batch_data = torch.from_numpy(batch_data)
            batch_labels = torch.from_numpy(batch_labels)

            # shift tensors to GPU if available
            if params.cuda:
                batch_data, batch_labels = batch_data.cuda(), batch_labels.cuda()

            yield batch_data, batch_labels

### 5) Training

In [69]:
def train(model, optimizer, loss_fn, data_iterator, metrics, params, num_steps):
    """Train the model on `num_steps` batches.

    Args:
        model: network to optimise; it is switched to training mode
        optimizer: optimizer whose zero_grad() and step() are called once per batch
        loss_fn: function(outputs, labels) returning the loss for a batch
        data_iterator: generator yielding (batch, labels) pairs; advanced `num_steps` times
        metrics: dict mapping a metric name to function(outputs, labels) on numpy arrays
        params: hyperparameters; save_summary_steps is read here
        num_steps: number of batches to train on
    """
    # set model to training mode
    model.train()

    # summary for current training loop and a running average object for loss
    summ = []
    loss_avg = utils.RunningAverage()

    # Use tqdm for progress bar
    t = trange(num_steps)
    for i in t:
        train_batch, labels_batch = next(data_iterator)

        output_batch = model(train_batch)
        loss = loss_fn(output_batch, labels_batch)

        optimizer.zero_grad()   # clear gradients left over from the previous step
        loss.backward()         # compute gradients wrt loss
        optimizer.step()        # update the parameters using those gradients

        # metrics are only sampled every save_summary_steps batches
        if i % params.save_summary_steps == 0:
            # convert tensor data to numpy
            output_batch = output_batch.data.cpu().numpy()
            labels_batch = labels_batch.data.cpu().numpy()

            # compute all metrics on this batch
            summary_batch = {metric: metrics[metric](
                output_batch, labels_batch) for metric in metrics}
            summary_batch['loss'] = loss.item()
            summ.append(summary_batch)

        # update the average loss
        loss_avg.update(loss.item())
        t.set_postfix(loss='{:05.3f}'.format(loss_avg()))

    # Aggregate once, after the loop: doing this inside the loop re-averaged the whole
    # summary and emitted a log line on every single step.
    if summ:
        # compute mean of all metrics in summary
        metrics_mean = {metric: np.mean(
            [x[metric] for x in summ]) for metric in summ[0]}
        metrics_string = " ; ".join("{}: {:05.3f}".format(k, v)
                                    for k, v in metrics_mean.items())
        logging.info('- Train metrics: ' + metrics_string)



In [70]:
def train_and_evaluate(model, train_data, val_data, optimizer, loss_fn, metrics, params, model_dir, restore_file=None):
    """Run params.num_epochs epochs of training, validating after each one.

    Relies on the module-level `data_loader`, `train` and `evaluate`.

    Args:
        model: network to train
        train_data: (dict) training split with keys 'data', 'labels' and 'size'
        val_data: (dict) validation split with the same keys
        optimizer: optimizer for the model parameters
        loss_fn: function(outputs, labels) returning the batch loss
        metrics: dict of metric name -> function(outputs, labels)
        params: hyperparameters; num_epochs, batch_size, train_size and val_size are read here
        model_dir: directory where the metrics json files are written and checkpoints are looked up
        restore_file: optional checkpoint name (without '.pth.tar') to restore before training
    """
    # optionally resume from an existing checkpoint
    if restore_file is not None:
        checkpoint_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info('Restoring parameters from {}'.format(checkpoint_path))
        utils.load_checkpoint(checkpoint_path, model, optimizer)

    # destinations of the validation metric dumps
    best_json_path = os.path.join(model_dir, 'metrics_val_best_weights.json')
    last_json_path = os.path.join(model_dir, 'metrics_val_last_weights.json')

    best_val_acc = 0.0
    total_epochs = params.num_epochs

    for epoch in range(total_epochs):
        logging.info('Epoch {}/{}'.format(epoch + 1, params.num_epochs))

        # training pass: number of batches in one epoch, fresh shuffled iterator
        train_steps = (params.train_size + 1) // params.batch_size
        train_iter = data_loader.data_iterator(train_data, params, shuffle=True)
        train(model, optimizer, loss_fn, train_iter, metrics, params, train_steps)

        # validation pass over the validation set, in file order
        val_steps = (params.val_size + 1) // params.batch_size
        val_iter = data_loader.data_iterator(val_data, params, shuffle=False)
        val_metrics = evaluate(model, loss_fn, val_iter, metrics, params, val_steps)

        val_acc = val_metrics['accuracy']

        # ties count as a new best, so the best-metrics file follows the latest equal score
        if val_acc >= best_val_acc:
            logging.info('- Found new best accuracy')
            best_val_acc = val_acc
            utils.save_dict_to_json(val_metrics, best_json_path)

        # the latest validation metrics are always written
        utils.save_dict_to_json(val_metrics, last_json_path)


In [71]:
# Load the hyperparameters from params.json in the current working directory.
json_path = os.path.join("./", 'params.json')
assert os.path.isfile(
    json_path), "No json config file found at {}".format(json_path)
params = utils.Params(json_path)

# Record whether CUDA is available; the model and the batches are moved to the GPU based on this flag.
params.cuda = torch.cuda.is_available()

In [72]:
# Fix the torch random seed (and the CUDA seed when a GPU is used) for reproducible runs.
torch.manual_seed(230)

if params.cuda:
    torch.cuda.manual_seed(230)


In [73]:
# Set the logger
#utils.set_logger(os.path.join("./logs", 'train.log'))

In [74]:
# Create the input data pipeline
# NOTE(review): utils.set_logger is commented out in the previous cell, so this message is only
# visible if the root logger has been configured elsewhere to show INFO - confirm.
logging.info("Loading the datasets...")

In [75]:
# load data: build the vocabulary/tag maps from the dataset directory, then read the
# 'train' and 'val' splits from the same directory
data_loader = DataLoader('/content/drive/My Drive/Colab Notebooks/datasets and models/data/', params)
data = data_loader.load_data(['train', 'val'], '/content/drive/My Drive/Colab Notebooks/datasets and models/data/')
train_data = data['train']
val_data = data['val']

In [76]:
# specify the train and val dataset sizes; train_and_evaluate derives the number of
# batches per epoch from these values
params.train_size = train_data['size']
params.val_size = val_data['size']


In [77]:
# Marks the end of dataset loading in the log.
logging.info('- done.')

In [78]:
# Define the model and optimizer: the model is placed on the GPU when params.cuda is set,
# and Adam optimises all of its parameters with the configured learning rate
model = Model(params).cuda() if params.cuda else Model(params)
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)


In [79]:
# Show the layer structure of the model.
print(model)


Model(
  (embedding): Embedding(35180, 50)
  (lstm): LSTM(50, 50, batch_first=True)
  (fc): Linear(in_features=50, out_features=17, bias=True)
)


In [80]:
 # Train for params.num_epochs epochs; metrics json files are written to the current directory.
 # NOTE(review): train_and_evaluate calls evaluate(), which is defined in a later cell -
 # that cell has to be executed before this one.
 train_and_evaluate(model, train_data, val_data, optimizer,
                       loss_fn, metrics, params, "./")

100%|██████████| 6714/6714 [00:50<00:00, 133.01it/s, loss=0.353]


AttributeError: ignored

### 6) Evaluation

In [81]:
def evaluate(model, loss_fn, data_iterator, metrics, params, num_steps):
    """Evaluate the model on 'num_steps' batches.

    Args:
        model: network to evaluate; it is switched to evaluation mode
        loss_fn: function(outputs, labels) returning the loss for a batch
        data_iterator: generator yielding (batch, labels) pairs; advanced `num_steps` times
        metrics: dict mapping a metric name to function(outputs, labels) on numpy arrays
        params: hyperparameters (not read by this function)
        num_steps: number of batches to evaluate on

    Returns:
        dict mapping every metric name, plus 'loss', to its mean over the evaluated batches

    Raises:
        IndexError: if `num_steps` is 0, since there is no batch summary to average
    """
    model.eval()    # set model to evaluation mode
    summ = []       # summary for current eval loop

    # no gradients are needed for evaluation; skipping autograd avoids building a
    # computation graph for every batch
    with torch.no_grad():
        for _ in range(num_steps):
            data_batch, labels_batch = next(data_iterator)

            output_batch = model(data_batch)
            loss = loss_fn(output_batch, labels_batch)

            # metrics operate on numpy arrays living on the CPU
            output_batch = output_batch.detach().cpu().numpy()
            labels_batch = labels_batch.detach().cpu().numpy()

            summary_batch = {metric: metrics[metric](
                output_batch, labels_batch) for metric in metrics}
            summary_batch['loss'] = loss.item()
            summ.append(summary_batch)

    # compute mean of all metrics in summary
    metrics_mean = {metric: np.mean([x[metric]
                                     for x in summ]) for metric in summ[0]}
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v)
                                for k, v in metrics_mean.items())
    logging.info('- Eval metrics: ' + metrics_string)
    return metrics_mean

In [82]:
# Get the logger
#utils.set_logger(os.path.join("./logs", 'evaluate.log'))

In [83]:
# load data
# NOTE: despite its name, test_data holds the 'val' split - no 'test' split is loaded here.
data_loader = DataLoader('/content/drive/My Drive/Colab Notebooks/datasets and models/data/', params)
data = data_loader.load_data(['val'], '/content/drive/My Drive/Colab Notebooks/datasets and models/data/')
test_data = data['val']

In [84]:
# specify the test set size
params.test_size = test_data['size']
# one-pass generator over the data in file order (shuffle defaults to False)
test_data_iterator = data_loader.data_iterator(test_data, params)

In [85]:
# Printing the iterator only shows the generator object; no batch is consumed.
print(test_data_iterator)

<generator object DataLoader.data_iterator at 0x7f57585f4bf8>


In [86]:
# Consume one batch from the generator and show its padded token indices (element 0 of the pair).
print(next(test_data_iterator)[0])

tensor([[ 1020,    68,  5092,    50,     9, 29845,  1677, 18327,  1033,     9,
          4452,    13,   522, 29846,    45, 10314,   223,  6582,    21, 35178,
         35178, 35178],
        [ 6607, 10092,    31,    45,  2112,    80,     9,   580,   581,   855,
         20336,   857,    63,   363,    93,  2822,   347,  6657, 10314,    18,
          6599,    21],
        [ 1641,   151,   817,   120,  1354,  1790,  1053,  1054,   864,    21,
         35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178, 35178,
         35178, 35178],
        [ 1615,  1384,   125,   172,   107,    45,  7127,  1566,    18,  1724,
           337,  1002,   322,   116,    18,     9,  1065,  1052,    21, 35178,
         35178, 35178],
        [ 2798,    93,  5154,   125,   126,  1053,  1054,    24,   134, 14349,
          1559,    63,   882,  1466,     7,    45,   884,   107,    21, 35178,
         35178, 35178]], device='cuda:0')


In [87]:
# Consume one more batch and show its labels (element 1 of the pair, -1 marks padding).
# NOTE: every next() call advances the generator, so these labels belong to a different
# batch than any tokens printed by an earlier next() call.
print(next(test_data_iterator)[1])

tensor([[ 0,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  1,  0, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  0,  0,  0,  0,  0,  0,
          0,  0,  1,  0,  0,  0,  0, -1, -1, -1],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
        [ 5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  4,  0,
          0,  0,  0,  1,  0,  1,  0,  0,  0,  0],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  7, 12,  0, -1, -1, -1, -1, -1, -1, -1,
         -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]], device='cuda:0')


In [89]:
# Save only the model weights (state_dict) to NER-Epoch100.pth in the current working directory.
torch.save(model.state_dict(), "NER-Epoch100.pth")

In [90]:
# Forward pass on the tokens of the next batch; pred has one row of log-probabilities per token.
# NOTE(review): this call is not wrapped in torch.no_grad(), so autograd still tracks it.
pred = model(next(test_data_iterator)[0])

In [91]:
# Show the raw log-softmax scores.
print(pred)

tensor([[-4.9870e-02, -5.5602e+00, -7.4069e+00,  ..., -8.0736e+00,
         -8.2532e+00, -9.4013e+00],
        [-3.0100e-03, -8.1961e+00, -1.1800e+01,  ..., -1.1182e+01,
         -1.0437e+01, -1.3208e+01],
        [-3.6463e+00, -8.0811e-02, -5.0329e+00,  ..., -8.5043e+00,
         -1.0360e+01, -1.1429e+01],
        ...,
        [-5.1000e-01, -5.6407e+00, -5.2204e+00,  ..., -7.3921e+00,
         -7.7477e+00, -8.5167e+00],
        [-5.9379e-01, -5.7015e+00, -5.1903e+00,  ..., -7.4786e+00,
         -7.8078e+00, -8.5684e+00],
        [-6.7590e-01, -5.7780e+00, -5.1727e+00,  ..., -7.5779e+00,
         -7.8852e+00, -8.6467e+00]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward>)


In [92]:
# NOTE(review): argmax() without `dim` returns a single index into the flattened tensor,
# not one tag per token; pred.argmax(dim=1) would give the predicted tag index for each token.
print(pred.argmax())

tensor(2278, device='cuda:0')
