## Connection to Google Drive

In [None]:
# Mount the user's Google Drive at /content/drive so the notebook can read
# the dataset and write checkpoints there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Use the absolute path (Drive is mounted at /content/drive) so this cell can
# be re-run safely; a relative path only resolves on the first execution.
%cd /content/drive/MyDrive/Colab Notebooks/Temp/NER

## Importing necessary items

In [None]:
# %pip installs into the environment of the running kernel, whereas !pip may
# resolve to a different Python interpreter.
%pip install trax

In [None]:
import os 
import trax 
from trax import layers as tl
import numpy as np
import pandas as pd
import random as rnd

## Loading and Preprocessing Dataset


In [None]:
def get_vocab(vocab_path, tags_path):
    """Build word and tag lookup tables from two one-entry-per-line text files.

    Args:
        vocab_path: path of a text file holding one word per line.
        tags_path: path of a text file holding one tag per line.

    Returns:
        A tuple ``(vocab, idx2word, tag_map, idx2tag)`` where
        ``vocab`` maps word -> index (line number in ``vocab_path``),
        ``idx2word`` is the list of words in index order,
        ``tag_map`` maps tag -> index (line number in ``tags_path``) and
        ``idx2tag`` is the list of tags in index order.
        The special tokens ``'<PAD>'`` and ``'<UNK>'`` are appended, in that
        order, after the words read from the file.
    """
    vocab = {}
    idx2word = []
    with open(vocab_path) as f:
        for i, w in enumerate(f.read().splitlines()):
            vocab[w] = i
            idx2word.append(w)

    # Register the special tokens in both tables. The original appended the
    # misspelled '<UNK' to idx2word, so decoding the unknown-word index did
    # not give back the key stored in vocab.
    for special in ('<PAD>', '<UNK>'):
        vocab[special] = len(vocab)
        idx2word.append(special)

    tag_map = {}
    idx2tag = []
    with open(tags_path) as f:
        for i, t in enumerate(f.read().splitlines()):
            tag_map[t] = i
            idx2tag.append(t)

    return vocab, idx2word, tag_map, idx2tag




def preprocess(vocab, tag_map, sentences_file, labels_file):
    """Convert a sentences file and its labels file into lists of integer ids.

    Each line of both files is split on single spaces. Words are looked up in
    ``vocab`` (falling back to the ``'<UNK>'`` entry when absent); labels are
    looked up in ``tag_map`` (a missing label raises ``KeyError``).

    Args:
        vocab: dict mapping word -> index; must contain ``'<UNK>'`` if any
            word of the file is not in the dict.
        tag_map: dict mapping tag -> index.
        sentences_file: path of the file with one sentence per line.
        labels_file: path of the file with one label sequence per line.

    Returns:
        ``(sentences, labels, n)`` where ``sentences`` and ``labels`` are
        lists of lists of ints and ``n`` is the number of sentences.
    """
    encoded_sentences = []
    with open(sentences_file) as handle:
        for line in handle.read().splitlines():
            ids = []
            for word in line.split(' '):
                if word in vocab:
                    ids.append(vocab[word])
                else:
                    # out-of-vocabulary word
                    ids.append(vocab['<UNK>'])
            encoded_sentences.append(ids)

    encoded_labels = []
    with open(labels_file) as handle:
        for line in handle.read().splitlines():
            encoded_labels.append([tag_map[tag] for tag in line.split(' ')])

    return encoded_sentences, encoded_labels, len(encoded_sentences)

In [None]:
# Build the lookup tables once, then encode the train / validation / test
# splits with them. Paths are relative to the current working directory.
vocab, idx2word, tag_map, idx2tag = get_vocab('data/words.txt', 'data/tags.txt')
train_sen, train_labels, train_size = preprocess(vocab, tag_map, 'data/train/sentences.txt', 'data/train/labels.txt')
# NOTE(review): 'data//val/labels.txt' contains a doubled slash.
valid_sen, valid_labels, valid_size = preprocess(vocab, tag_map, 'data/val/sentences.txt', 'data//val/labels.txt')
test_sen, test_labels, test_size = preprocess(vocab, tag_map, 'data/test/sentences.txt', 'data/test/labels.txt')

## Batch Generator

In [None]:
def data_generator(batch_size, x, y, pad, shuffle=False, verbose=False):
    """Yield padded (X, Y) batches forever, cycling through the data.

    Args:
        batch_size: number of sentences per batch.
        x: list of sentences, each a list of integer word ids.
        y: list of tag-id lists, aligned with ``x``.
        pad: integer used to fill both X and Y beyond each sentence's length.
        shuffle: when True, the visiting order is shuffled before the first
            batch and again each time the data wraps around.
        verbose: when True, print the running position after every batch.

    Yields:
        A tuple ``(X, Y)`` of ``np.ndarray`` with shape
        ``(batch_size, max_len)``, where ``max_len`` is the length of the
        longest sentence in that batch.
    """
    total = len(x)
    order = list(range(total))

    if shuffle:
        rnd.shuffle(order)

    cursor = 0  # position inside `order` of the next sentence to emit
    while True:
        batch_x = []
        batch_y = []
        for _ in range(batch_size):
            # Wrap around (and optionally reshuffle) once the data is exhausted.
            if cursor >= total:
                cursor = 0
                if shuffle:
                    rnd.shuffle(order)

            line = order[cursor]
            batch_x.append(x[line])
            batch_y.append(y[line])
            cursor += 1

        longest = max((len(sent) for sent in batch_x), default=0)

        # Start from arrays filled with the pad value, then copy the real ids in.
        X = np.full((batch_size, longest), pad)
        Y = np.full((batch_size, longest), pad)
        for row, (sent, tags) in enumerate(zip(batch_x, batch_y)):
            for col in range(len(sent)):
                X[row, col] = sent[col]
                Y[row, col] = tags[col]

        if verbose:
            print("index=", cursor)
        yield X, Y

## Building the model

In [None]:
# Model hyperparameters derived from the loaded lookup tables.
vocab_size = len(vocab)      # number of entries in vocab, special tokens included
embedding_dims = 64          # passed to NER as d_model
num_tags = len(tag_map)      # number of distinct tags

In [None]:
def NER(vocab_size=35180, d_model=50, tags=tag_map):
    """Assemble the tagging network as a Trax ``Serial`` model.

    The layers are, in order: an embedding of size ``d_model`` over
    ``vocab_size`` ids, two LSTM layers of ``d_model`` units, a dense layer
    with one output per tag, and a log-softmax.

    Args:
        vocab_size: number of rows of the embedding table.
        d_model: embedding size and LSTM width.
        tags: collection of tags; only its length is used. The default is the
            module-level ``tag_map`` as it existed when this function was
            defined.

    Returns:
        The ``tl.Serial`` model.
    """
    n_classes = len(tags)
    return tl.Serial(
        tl.Embedding(vocab_size, d_model),
        tl.LSTM(d_model),
        tl.LSTM(d_model),
        tl.Dense(n_classes),
        tl.LogSoftmax(),
    )

In [None]:
# Instantiate the tagger with the hyperparameters defined above and print
# its layer structure.
model = NER(vocab_size=vocab_size, d_model=embedding_dims, tags=tag_map)
print(model)

Serial[
  Embedding_35180_64
  LSTM_64
  LSTM_64
  Dense_17
  LogSoftmax
]


## Getting training and validation batches

In [None]:
from trax.supervised import training
batch_size = 64

# Training batches: shuffled and padded with the <PAD> id; the same id is
# passed as id_to_mask so padded positions are masked in the loss weights.
train_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, train_sen, train_labels, vocab['<PAD>'], True), id_to_mask=vocab['<PAD>'])

# Validation batches: built the same way from the validation split (also
# shuffled), with the <PAD> id masked for evaluation.
eval_generator = trax.data.inputs.add_loss_weights(
    data_generator(batch_size, valid_sen, valid_labels, vocab['<PAD>'], True), id_to_mask=vocab['<PAD>'])

## Training

In [None]:
def train_model(NER, train_generator, eval_generator, train_steps=1, output_dir='model'):
    """Train a model with a Trax training loop and return the loop.

    Args:
        NER: the model to train (passed straight to ``training.Loop``).
        train_generator: generator supplying training batches.
        eval_generator: generator supplying evaluation batches.
        train_steps: number of steps to run.
        output_dir: directory handed to ``training.Loop`` as ``output_dir``.

    Returns:
        The ``training.Loop`` instance after ``run`` has completed.
    """
    loop = training.Loop(
        NER,
        # Training: cross-entropy loss optimised with Adam (learning rate 0.01).
        training.TrainTask(
            train_generator,
            loss_layer=tl.CrossEntropyLoss(),
            optimizer=trax.optimizers.Adam(0.01),
        ),
        # Evaluation: loss and accuracy over 10 batches.
        eval_tasks=[
            training.EvalTask(
                labeled_data=eval_generator,
                metrics=[tl.CrossEntropyLoss(), tl.Accuracy()],
                n_eval_batches=10,
            )
        ],
        output_dir=output_dir,
    )

    loop.run(n_steps=train_steps)
    return loop

In [None]:
train_steps = 1000
# Delete any checkpoint left by a previous run before training
# (presumably so the loop starts from scratch instead of resuming; confirm).
!rm -f 'model/model.pkl.gz' 
training_loop = train_model(model, train_generator, eval_generator, train_steps)

  "jax.host_id has been renamed to jax.process_index. This alias "
  "jax.host_count has been renamed to jax.process_count. This alias "



Step      1: Total number of trainable weights: 2318673
Step      1: Ran 1 train steps in 4.62 secs
Step      1: train CrossEntropyLoss |  3.25448155
Step      1: eval  CrossEntropyLoss |  1.34228257
Step      1: eval          Accuracy |  0.81889203

Step    100: Ran 99 train steps in 111.02 secs
Step    100: train CrossEntropyLoss |  0.75717318
Step    100: eval  CrossEntropyLoss |  0.55147647
Step    100: eval          Accuracy |  0.85894021

Step    200: Ran 100 train steps in 35.76 secs
Step    200: train CrossEntropyLoss |  0.35892054
Step    200: eval  CrossEntropyLoss |  0.22245454
Step    200: eval          Accuracy |  0.94019287

Step    300: Ran 100 train steps in 22.69 secs
Step    300: train CrossEntropyLoss |  0.20599809
Step    300: eval  CrossEntropyLoss |  0.17200291
Step    300: eval          Accuracy |  0.94976841

Step    400: Ran 100 train steps in 22.55 secs
Step    400: train CrossEntropyLoss |  0.16318233
Step    400: eval  CrossEntropyLoss |  0.13841280
Step   

## Model Evaluation (using test set)





In [None]:
# Rebuild the network with the same hyperparameters that were used for
# training before loading the checkpoint. A bare NER() would fall back to the
# default d_model=50, which does not match the trained model
# (embedding_dims=64).
model = NER(vocab_size=vocab_size, d_model=embedding_dims, tags=tag_map)
model.init(trax.shapes.ShapeDtype((1, 1), dtype=np.int32))
model.init_from_file('model/model.pkl.gz', weights_only=True)

In [None]:
# Ask the generator for a single unshuffled batch as large as the whole test
# set, so x and y hold every test sentence padded to the longest one.
x, y = next(data_generator(len(test_sen), test_sen, test_labels, vocab['<PAD>']))
print("input shapes", x.shape, y.shape)
# Run the model on the full test batch in one call.
test_pred = model(x)
print(f"test_pred shape: {test_pred.shape}")

input shapes (7194, 70) (7194, 70)
test_pred shape: (7194, 70, 17)


In [None]:
def evaluate_prediction(pred, labels, pad):
    """Compute token-level accuracy, ignoring padded positions.

    Args:
        pred: array of per-tag scores with the tag dimension on axis 2.
        labels: array of true tag ids with the same leading two dimensions as
            ``pred``; padded positions hold the value ``pad``.
        pad: the value that marks padding in ``labels``.

    Returns:
        Number of non-padded positions whose arg-max prediction equals the
        label, divided by the number of non-padded positions.
    """
    outputs = np.argmax(pred, axis=2)
    print("outputs shape:", outputs.shape)

    mask = labels != pad
    print("mask shape:", mask.shape)

    # Count matches only where the label is real. Without the mask, padded
    # positions are counted as correct whenever the predicted tag id equals
    # the pad value.
    correct = (outputs == labels) & mask
    accuracy = np.sum(correct) / float(np.sum(mask))
    return accuracy

In [None]:
# Score the test-set predictions; the <PAD> id marks the positions to ignore.
accuracy = evaluate_prediction(test_pred, y, vocab['<PAD>'])
print("accuracy: ", accuracy)

outputs shape: (7194, 70)
mask shape: (7194, 70)
accuracy:  0.9593302


## Predicting custom sentence

In [None]:
def predict(sentence, model, vocab, tag_map):
    """Tag every space-separated token of ``sentence`` with the model.

    Args:
        sentence: input text; it is split on single spaces, so punctuation
            stays attached to the neighbouring word.
        model: callable taking an int array of shape ``(1, n_tokens)`` and
            returning scores with the tag dimension on axis 2.
        vocab: dict mapping word -> index; words not found use ``'<UNK>'``.
        tag_map: dict mapping tag -> index.

    Returns:
        List with one predicted tag string per token.
    """
    token_ids = [vocab[token] if token in vocab else vocab['<UNK>']
                 for token in sentence.split(' ')]
    batch = np.asarray(token_ids, dtype=int).reshape(1, -1)  # batch of one sentence

    scores = model(batch)
    tag_ids = np.argmax(scores, axis=2)

    # Invert tag_map explicitly instead of relying on the order of its keys
    # matching the tag indices.
    index_to_tag = {index: tag for tag, index in tag_map.items()}
    return [index_to_tag[int(tag_id)] for tag_id in tag_ids[0]]

In [None]:
# Demo on a hand-written sentence. Tokens are split on single spaces, so
# punctuation stays attached to the words (e.g. "Tuhin,").
sentence = "Hello Tuhin, are you in Bangladesh? Come Dhaka and see how we ride the Rider"
predictions = predict(sentence, model, vocab, tag_map)
# Show only the tokens whose predicted tag is not 'O'.
# NOTE: the loop variables x and y rebind the test arrays x, y created earlier.
for x,y in zip(sentence.split(' '), predictions):
    if y != 'O':
        print(x,y)

Hello B-org
Tuhin, I-org
Bangladesh? B-geo
Dhaka B-geo
Rider B-org
