#### Here, I am trying am trying to perform Named Entity Recognition with RNNs.

In [1]:
# imports
import numpy as np
import pandas as pd
import math
from urllib.request import urlretrieve
import tensorflow as tf
import os
import random
seed=54321
% env TF_FORCE_GPU_ALLOW_GROWTH=true

UsageError: Line magic function `%` not found.


In [2]:
# Downloading the data
url='https://github.com/ZihanWangKi/CrossWeigh/raw/master/data/'
dir_name='data'
def download_data(url,filename,download_dir):
    """
    Download a file if its not present
    """
    # create directory is it not exists
    os.makedirs(download_dir,exist_ok=True)
    # if file do not exist , download
    if not os.path.exists(os.path.join(download_dir,filename)):
        filepath,_=urlretrieve(url+filename,os.path.join(download_dir,filename))
    else:
        filepath=os.path.join(download_dir,filename)
    return filepath

In [3]:
# creating train, test and validation sets.
train_filepath = download_data(url, 'conllpp_train.txt', dir_name)
dev_filepath = download_data(url, 'conllpp_dev.txt', dir_name)
test_filepath = download_data(url, 'conllpp_test.txt', dir_name)

##### data looks like:
 -DOCSTART- -X- -X- O  
 
 EU NNP B-NP B-ORG  

 rejects VBZ B-VP O  

 German JJ B-NP B-MISC  
 
 call NN I-NP O

In [4]:
# Reading the data
def read_data(filename):
    '''
    read data from a file with given filename.
    Return a list of sentence and list of ner labels
    '''
    print('Reading data')
    sentences,ner_labels=[],[]
    # open the file
    with open(filename,'r',encoding='latin-1') as f:
        # read each line
        is_sos=True # record at each sentence if it is start of sentence
        sentence_tokens=[]
        sentence_labels=[]
        i=0
        for row in f:
            # if there is empty line or 'DOCSTART' its a new line
            if len(row.strip())==0 or row.split(' ')[0]=='DOCSTART':
                is_sos=False
            else:
                is_sos=True
                token,_,_,ner_label=row.split()
                sentence_tokens.append(token)
                sentence_labels.append(ner_label.strip())
            # when end of line is there or beginning of next line is there,add data to main list and flush temporary one
            if not is_sos and len(sentence_tokens)>0:
                sentences.append(' '.join(sentence_tokens))
                ner_labels.append(sentence_labels)
                sentence_tokens,sentence_labels=[],[]
    print('Done')
    return sentences,ner_labels
            

In [5]:
train_sentences,train_labels=read_data(train_filepath)
test_sentences,test_labels=read_data(test_filepath)
valid_sentences,valid_labels=read_data(dev_filepath)

Reading data
Done
Reading data
Done
Reading data
Done


In [6]:
print(f"Train size: {len(train_labels)}")
print(f"Valid size: {len(valid_labels)}")
print(f"Test size: {len(test_labels)}")

Train size: 14987
Valid size: 3466
Test size: 3683


In [7]:
# Print some data
print('\nSample data\n')
for v_sent, v_labels in zip(valid_sentences[:5], valid_labels[:5]):
    print(f"Sentence: {v_sent}")
    print(f"Labels: {v_labels}")
    print('\n')


Sample data

Sentence: -DOCSTART-
Labels: ['O']


Sentence: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .
Labels: ['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Sentence: LONDON 1996-08-30
Labels: ['B-LOC', 'O']


Sentence: West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .
Labels: ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Sentence: Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .
Labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O

#### One of the main characteristics of NER is class imbalance, all classes will not have equal number of samples. One observation is that there are more non name entities than name entities. Using chain method: convert all NER labels to a series object but these are list of list and inner list has the NER tags for all tokens in a sentence, so using chain() method to create a flat list. It chains several lists together to make a single list

In [8]:
# Checking imbalamce of data
from itertools import chain
print("Training data label counts")
print(pd.Series(chain(*train_labels)).value_counts())
print("\nValidation data label counts")
print(pd.Series(chain(*valid_labels)).value_counts())
print("\nTest data label counts")
print(pd.Series(chain(*test_labels)).value_counts())

Training data label counts
O         170524
B-LOC       7140
B-PER       6600
B-ORG       6321
I-PER       4528
I-ORG       3704
B-MISC      3438
I-LOC       1157
I-MISC      1155
dtype: int64

Validation data label counts
O         42975
B-PER      1842
B-LOC      1837
B-ORG      1341
I-PER      1307
B-MISC      922
I-ORG       751
I-MISC      346
I-LOC       257
dtype: int64

Test data label counts
O         38374
B-ORG      1714
B-LOC      1645
B-PER      1617
I-PER      1161
I-ORG       881
B-MISC      722
I-LOC       259
I-MISC      252
dtype: int64


#### O labels are several magnitudes higher than the volume of other labels. Also padding will be required so analyzing the sequence length of each sentence.

In [9]:
# Analyzing the sequence length
pd.Series(train_sentences).str.split().str.len().describe(percentiles=[0.05,0.95])

count    14987.000000
mean        13.649630
std         11.700735
min          1.000000
5%           1.000000
50%          9.000000
95%         37.000000
max        113.000000
dtype: float64

##### 95% of the sentences have 37 tokens or less

#### Processing data: Labels need to be processed  
Convert class labels to class ID  
Pad sequence of labels to maximum length  
Generate a mask to indicate padded labels to use this info to disregard paddded labels during model training

In [12]:
# function for mapping
def get_label_id_map(train_labels):
    # unique list of lebels
    unique_train_labels=pd.Series(chain(*train_labels)).unique()
    # create mapping using dictionary
    labels_map=dict(zip(unique_train_labels,np.arange(unique_train_labels.shape[0])))
    print(f"label map:{labels_map}")
    return labels_map

In [13]:
labels_map=get_label_id_map(train_labels)

label map:{'O': 0, 'B-ORG': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}


In [21]:
def get_padded_int_labels(labels, labels_map, max_seq_length, return_mask=True):

    # Convert string labels to integers 
    int_labels = [[labels_map[x] for x in one_seq] for one_seq in labels]
    
    
    # Pad sequences
    if return_mask:
        # If we return mask, we first pad with a special value (-1) and 
        # use that to create the mask and later replace -1 with 'O'
        padded_labels = np.array(
            tf.keras.preprocessing.sequence.pad_sequences(
                int_labels, maxlen=max_seq_length, padding='post', truncating='post', value=-1
            )
        )
        
        # mask filter
        mask_filter = (padded_labels != -1)
        # replace -1 with 'O' s ID
        padded_labels[~mask_filter] = labels_map['O']        
        return padded_labels, mask_filter.astype('int')
    
    else:
        padded_labels = np.array(ner_pad_sequence_func(int_labels, value=labels_map['O']))
        return padded_labels

In [22]:
# generate processed labels and masks for train,test and valid data:
max_seq_length=40
padded_train_labels,train_mask=get_padded_int_labels(train_labels,labels_map,max_seq_length,return_mask=True)
padded_test_labels,test_mask=get_padded_int_labels(test_labels,labels_map,max_seq_length,return_mask=True)
padded_valid_labels,valid_mask=get_padded_int_labels(valid_labels,labels_map,max_seq_length,return_mask=True)

In [23]:
# Print some labels IDs
print(padded_train_labels[:2])
print(train_mask[:2])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [1 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]]
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]]


#### Defining the hyperparameters:  
The maximum length could be 40 as in previous code I saw that 95% sentences have 37 tokens or less

In [15]:
max_seq_length=40
embedding_size=64 # size of token embedding
rnn_hidden_size=64 # number of hidden unites in RNN layer
n_Classes=9 # number of o/p nodes
batch_size=64 # number of samples in a batch
epochs=4 # number of epochs to train
 

#### Defining the model: The model will have one embedding layer followed by a RNN layer and finally a Dense prediction layer, also integrating the tokenization in the model.This is done using TextVectorization layer

In [24]:
import tensorflow.keras.backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
K.clear_session()
# trying to fit on a small example
toy_corpus=["I went to the restaurent yesterday","The restaurent was full"]
toy_vectorization_layer=TextVectorization()
# fit it on sample data
toy_vectorization_layer.adapt(toy_corpus)
# generate output as
toy_vectorized_output=toy_vectorization_layer(toy_corpus)
# to see the vocabulary
toy_vocabulary=toy_vectorization_layer.get_vocabulary()
print("With default arguments\n")
print(f"Data: \n{toy_vectorized_output}")
print(f"Vocabulary: {toy_vocabulary}")

With default arguments

Data: 
[[8 5 7 2 3 4]
 [2 3 6 9 0 0]]
Vocabulary: ['', '[UNK]', 'the', 'restaurent', 'yesterday', 'went', 'was', 'to', 'i', 'full']


#### The tokenization has done some preprocessing also like: converting to lower case. Also the size of vocabulary can be limited using the max_tokens argument, also preprocessing can be avoided by setting standardize to None, also the padding/truncation of sequences can be set with the output_sequence_length.

#### Implement the rest of network:

In [25]:
import tensorflow.keras.layers as layers
import tensorflow.keras.backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
K.clear_session()
# define an input layer with 1 column
word_input=layers.Input(shape=(1,),dtype=tf.string)
# define a function that takes corpus,maximum sequence length and vocabulary size and returns trained TextVectorization layer 
# and the vocabulary size
def get_fitted_token_vectorization_layer(corpus,max_seq_length,vocabulary_size=None):
    '''Fit a text vectorization layer on data'''
    vectorization_layer=TextVectorization(max_tokens=vocabulary_size,standardize=None,output_sequence_length=max_seq_length)
    # setting standardize to none because in NER keeping the case of letters is important as typically entity start with upper case
    # fit on data
    vectorization_layer.adapt(corpus)
    # get vocabulary size
    n_vocab=len(vectorization_layer.get_vocabulary())
    return vectorization_layer,n_vocab

# text vectorized layer
vectorized_layer,n_vocab=get_fitted_token_vectorization_layer(train_sentences,max_seq_length)
# map each word to id : pass word input to vectorized layer to get vectorized output
vectorized_output=vectorized_layer(word_input)
# the output from vectorization layer is sent to embedding layer
# look up embeddings for the ids
embedding_layer=layers.Embedding(input_dim=n_vocab,output_dim=embedding_size,mask_zero=True)(vectorized_output)
# define a simple RNN layer
rnn_layer=layers.SimpleRNN(units=rnn_hidden_size,return_sequences=True)
# arguments: units(int)- hidden output size. more is desired
# return_sequences- whether to return sequence from all timesteps or to return last output, for NER every single
# token needs to be labelled so its required to return sequences at all time steps
rnn_out=rnn_layer(embedding_layer)
# Defining dense layer
dense_layer=layers.Dense(n_Classes,activation='softmax')
# time-distributed output from the RNN will go to a Dense layer
dense_out=dense_layer(rnn_out)
# define the model
model=tf.keras.Model(inputs=word_input,outputs=dense_out)

#### Evaluation metrics and loss function: As earlier I saw the distribution of labels, it appears to be a class imbalance problem, this needs to be considered while training and evaluating the model so either a new metric can be used or sample weights can be used to penalize more frequent classes and boost importance of rare classes, so using first one.A modified version of accuracy known as macro accuarcy can be used where accuracies for each class are calculated and then averaged so class imbalance is ignored.

In [26]:
# defining macro accuracy:
def macro_accuracy(y_true,y_pred):
    # y_pred has predictions for all classes so getting the predicted class from it using argmax
    # first convert y_true and y_pred to flattened vectors
    y_true=tf.cast(tf.reshape(y_true,[-1]),'int32')
    y_pred=tf.cast(tf.reshape(tf.argmax(y_pred,axis=-1),[-1]),'int32')
    # sort y_true so that same labels are together
    sorted_y_true=tf.sort(y_true)
    sorted_inds=tf.argsort(y_true)
    # use the tf.gather() function to order y_pred in the same order as y_true after sorting
    sorted_y_pred=tf.gather(y_pred,sorted_inds)
    # tf.gather() function takes a tensor and a set of indices and orders the passed tensor in the order of the indices.
    # calculated sorted_corrected that is true if a particular element is equal in sorted_y_true and sorted_y_pred
    sorted_corrected=tf.cast(tf.math.equal(sorted_y_true,sorted_y_pred),'int32')
    # then use segment sum to calculate segmented sum of correctly predicted samples,
    # samples belonging to each class considered a particular segment
    # segment sum has 2 parameters: data and segment id; data can be float or int and a tensor,
    # segment id is a tensor which can be int only, size should be equal to data first dimension,should be sorted
    # and values inside it can be repeated
    # ex= data=[5,1,7,2,3,4,1,3], segment_ids=[0,0,0,1,2,2,3,3] - if I start iterating 5,1,7 fall under same segment as 
    # the first 3 values in segment_ids tensor are same , so here 4 segments are there and in each segment perform sum 
    # of all element so here result is- [5+1+7,2,3+4,1+3] -> [13,2,7,4]
    correct_for_each_label=tf.cast(tf.math.segment_sum(sorted_corrected,sorted_y_true),'float32')+1
    # adding by 1 to avoid division by 0 error
    # do same for vectors of 1 - get numbers of true samples for each class in data
    all_for_each_label = tf.cast(tf.math.segment_sum(tf.ones_like(sorted_y_true),sorted_y_true),'float32')+1
    mean_accuracy=tf.reduce_mean(correct_for_each_label/all_for_each_label)
    return mean_accuracy


In [27]:
# Wrap this function in meanmetricwrapper to produce tf.keras.metrics.Metric object which can be passed to compile function
mean_accuracy_metric=tf.keras.metrics.MeanMetricWrapper(fn=macro_accuracy,name='macro_accuracy')
model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=[mean_accuracy_metric])
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 40)                0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 64)            1512064   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 40, 64)            8256      
_________________________________________________________________
dense (Dense)                (None, 40, 9)             585       
Total params: 1,520,905
Trainable params: 1,520,905
Non-trainable params: 0
_________________________________________________________________


#### Train the model: before training the model, class imbalance needs to be tackled so writing a function to compute class weightsto generate sample weights further.

In [28]:
def get_class_weights(train_labels):
    label_count=pd.Series(chain(*train_labels)).value_counts()
    # to compute weights divide minimum frequency with other frequencies
    label_count=label_count.min()/label_count
    label_id_map=get_label_id_map(train_labels)
    # output is converted into a dictionary that has class IDs as keys and class weights as values.
    label_count.index=label_count.index.map(label_id_map)
    return label_count.to_dict()

In [29]:
def get_sample_weights_from_class_weights(labels,class_weights):
    #perform a dictionary lookup element-wise on each label to generate a sample weight from class_weights
    return np.vectorize(class_weights.get)(labels)

In [30]:
# Generate actual weights:
train_class_weights=get_class_weights(train_labels)
print(f"Class Weights:{train_class_weights}")

label map:{'O': 0, 'B-ORG': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}
Class Weights:{0: 0.006773240130421524, 5: 0.16176470588235295, 3: 0.175, 1: 0.18272425249169436, 4: 0.25507950530035334, 6: 0.31182505399568033, 2: 0.33595113438045376, 8: 0.9982713915298185, 7: 1.0}


##### Other has the lowest weight as it’s the most frequent and the class I-LOC has the highest as it’s the least frequent.

In [31]:
# make train sequences an array
train_sentences=np.array(train_sentences)
# get sample weights
train_sample_weights=get_sample_weights_from_class_weights(padded_train_labels,train_class_weights)
# train the model
model.fit(
    train_sentences,padded_train_labels,sample_weight=train_sample_weights,batch_size=batch_size,epochs=epochs,
    validation_data=(np.array(valid_sentences),padded_valid_labels)
)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x22d82709220>

In [32]:
model.evaluate(np.array(test_sentences), padded_test_labels)



[0.09011910110712051, 0.7633246779441833]

In [33]:
# Analyzing outputs:
# using first 5 sentences
n_samples=5
visual_test_sentences=test_sentences[:n_samples]
visual_test_labels=padded_test_labels[:n_samples]
visual_test_predictions=model.predict(np.array(visual_test_sentences))
visual_test_pred_labels=np.argmax(visual_test_predictions,axis=-1)
# create a reversed labels_map that has a mapping from label ID to label string:
rev_labels_map=dict(zip(labels_map.values(),labels_map.keys()))
for i, (sentence, sent_labels, sent_preds) in enumerate(zip(visual_test_sentences, visual_test_labels, visual_test_pred_labels)):    
    n_tokens = len(sentence.split())
    print("Sample:\t","\t".join(sentence.split()))
    print("True:\t","\t".join([rev_labels_map[i] for i in sent_labels[:n_tokens]]))
    print("Pred:\t","\t".join([rev_labels_map[i] for i in sent_preds[:n_tokens]]))
    print("\n")

Sample:	 -DOCSTART-
True:	 O
Pred:	 O


Sample:	 SOCCER	-	JAPAN	GET	LUCKY	WIN	,	CHINA	IN	SURPRISE	DEFEAT	.
True:	 O	O	B-LOC	O	O	O	O	B-LOC	O	O	O	O
Pred:	 O	O	B-MISC	O	O	O	O	B-ORG	O	O	O	O


Sample:	 Nadim	Ladki
True:	 B-PER	I-PER
Pred:	 O	O


Sample:	 AL-AIN	,	United	Arab	Emirates	1996-12-06
True:	 B-LOC	O	B-LOC	I-LOC	I-LOC	O
Pred:	 O	O	B-LOC	I-LOC	I-LOC	O


Sample:	 Japan	began	the	defence	of	their	Asian	Cup	title	with	a	lucky	2-1	win	against	Syria	in	a	Group	C	championship	match	on	Friday	.
True:	 B-LOC	O	O	O	O	O	B-MISC	I-MISC	O	O	O	O	O	O	O	B-LOC	O	O	O	O	O	O	O	O	O
Pred:	 B-LOC	O	O	O	O	O	B-MISC	I-MISC	I-MISC	O	O	O	O	O	O	B-LOC	O	O	O	O	O	O	O	O	O


