# NER Tagger

Named Entity Recognition (NER) , also known as entity chunking/extraction , is a popular technique used in information extraction to identify and segment the named entities and classify or categorize them under various predefined classes.


In [2]:
#Importing libraries

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn_crfsuite import CRF, scorers, metrics
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import metrics as crf_metrics
import scipy.stats
import time

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/syedhadi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/syedhadi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True


### Dataset

The dataset an extract from GMB corpus which is tagged, annotated and built specifically to train the classifier to predict named entities such as name, location, etc. GMB is a fairly large corpus with a lot of annotations. 

In [3]:
#Loading dataset


#remove these comments below if SSL verification fails. (it sometimes fails if using MASON network)
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context


df = pd.read_csv('https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/Unit%2008%20-%20Project%206%20-%20Build%20your%20NER%20Tagger/ner_dataset.csv.gz', compression='gzip', encoding='ISO-8859-1')
df.T #it is easier to understand the data when we see the transpose 



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
Sentence #,Sentence: 1,,,,,,,,,,...,,,Sentence: 47959,,,,,,,
Word,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
POS,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
Tag,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


In [4]:
'''
We can handle NaN values by using the 'pandas.DataFrame.ffill' function. It replaces NaN values with values from 
the previous row. 

This is an informed decision. Because currently we have a sparse dataset where the Sentence # is populated only for 
the first word of the sentence. This function puts the relevant sentence number against each word of each sentence. 
This allows us to group sentences together much easier. 
'''

df = df.fillna(method='ffill')
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048565,1048566,1048567,1048568,1048569,1048570,1048571,1048572,1048573,1048574
Sentence #,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,Sentence: 1,...,Sentence: 47958,Sentence: 47958,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959,Sentence: 47959
Word,Thousands,of,demonstrators,have,marched,through,London,to,protest,the,...,impact,.,Indian,forces,said,they,responded,to,the,attack
POS,NNS,IN,NNS,VBP,VBN,IN,NNP,TO,VB,DT,...,NN,.,JJ,NNS,VBD,PRP,VBD,TO,DT,NN
Tag,O,O,O,O,O,O,B-geo,O,O,O,...,O,O,B-gpe,O,O,O,O,O,O,O


In [5]:
print('# of unique sentences, # of unique words, # of unique POS tags, # of unique NER tags')
df['Sentence #'].nunique(), df.Word.nunique(), df.POS.nunique(), df.Tag.nunique()

# of unique sentences, # of unique words, # of unique POS tags, # of unique NER tags


(47959, 35178, 42, 17)

### NER Tags

The tags in this dataset are explained as follows:

1. geo = Geographical Entity
2. org = Organization
3. per = Person
4. gpe = Geopolitical Entity
5. tim = Time indicator
6. art = Artifact
7. eve = Event
8. nat = Natural Phenomenon


Anything outside these classes is termed as other, denoted as O.

In [6]:
#Distribution of Tags
df.Tag.value_counts()

O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: Tag, dtype: int64

### Transforming the Dataset

Our dataset is built of words, POS tags and NER tags. However, we need to transform it into complete sentences
before we can train a model for NER tagging. This is because the position of a word within a sentence is important
for NER tagging. 

In [7]:
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O
...,...,...,...,...
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O


In [8]:
'''
A class to retrieve the sentences from the dataset. Group by the attribute Sentence# and create 1 observation for
each sentence. 

'''

class getsentence(object):    
    def __init__(self, data):
        self.n_sent = 1.0
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

In [9]:
getter = getsentence(df)
sentences = getter.sentences
#This is how a sentence (one observation) will look like in the dataset. 
print(sentences[0])

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [10]:
#Number of Sentences in the (transformed) dataset
len(sentences)

47959

## Conditional Random Field (CRF) Classifier

A Conditional Random Field (CRF) is a standard model for predicting the most likely sequence of labels that correspond to a sequence of inputs. It is a supervised learning method which has been proven to be better than the tree based models when it comes to NER. Whereas a discrete classifier predicts a label for a single sample without considering "neighboring" samples, a CRF can take context into account; e.g., the linear chain CRF (which is popular in natural language processing) predicts sequences of labels for sequences of input samples. 

### Feature Generation


In order to use CRF, we will enhance the feature set and create more features which can be used by the model to predict the tags correctly. Since we need to take into account the context as well, we create features which will provide consecutive POS tags for each word. Also, we add new features such as upper, lower, digit, title etc. for each word and also consider the consecutive words in the list. In short, we try to provide a sequence of features to the model for each word - the sequence containing POS tags, capitalisations, type of word(title) etc.

In [11]:
'''
A sentence and an index for a word within the sentence are passed as inputs. 
The output is a dictionary of features that contains information for the current, previous and next word. 

Sample Input: ('demonstrators', 'NNS', 'O')

Previous Word: ('of', 'IN', 'O')

Next Word: ('have', 'VBP', 'O')

Sample Output (Features generated by the function)
{'bias': 1.0,
 'word.lower()': 'demonstrators',
 'word[-3:]': 'ors',
 'word[-2:]': 'rs',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'NNS',
 'postag[:2]': 'NN',
 '-1:word.lower()': 'of',
 '-1:word.istitle()': False,
 '-1:word.isupper()': False,
 '-1:postag': 'IN',
 '-1:postag[:2]': 'IN',
 '+1:word.lower()': 'have',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'VBP',
 '+1:postag[:2]': 'VB'}


'''

# Creating Feature set
def word2features(sent, i):
    word = sent[i][0] #extract word from sentence using index i
    postag = sent[i][1] #extract POS tag of word at index i

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(), #lower case word
        'word[-3:]': word[-3:], #last 3 characters of word
        'word[-2:]': word[-2:], #last 2 characters of word
        'word.isupper()': word.isupper(), # is word upper case? 
        'word.istitle()': word.istitle(), #is the first character of word in upper case?
        'word.isdigit()': word.isdigit(), #is it a digit?
        'postag': postag, #POS tag of the word
        'postag[:2]': postag[:2], #prefix of POS tag
    }
    if i > 0:
        #expanding feature set by including features of previous word if not at the beginning of sentence
        word1 = sent[i-1][0] #previous word
        postag1 = sent[i-1][1] #POS tag of previous word
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True #specify 'Beginning Of Sentence' if no previous word exists
    
    #expanding feature set by including features of next word if not at the end of sentence
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })    
    else:
        features['EOS'] = True #specify 'End Of Sentence' if no previous word exists

    return features

In [12]:
'''
A sentence is passed as an input. The output is a list of dictionaries that contains features generated for
each word within the sentence through the word2features function

'''
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]


'''
Returns an array of labels for each word in the sentence that is passed as input. 
'''
def sent2labels(sent):
    return [label for token, postag, label in sent]

In [13]:
#Splitting dataset into features and labels
X = [sent2features(s) for s in sentences] #extracting features
y = [sent2labels(s) for s in sentences] #extracting labels 

In [14]:
'''
Split the data into train test and validation data. 80%, 10%, 10%
This can be done using the train_test_split function by sklearn. 

'''

# split the data in 80:10:10 for train:valid:test dataset
train_size=0.8

# In the first step we will split the data in training and remaining dataset
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)

# Now since we want the valid and test size to be equal (10% each of overall data). 
# we have to define valid_size=0.5 (that is 50% of remaining data)
test_size = 0.5
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

print(len(X_train)), print(len(y_train))
print(len(X_test)), print(len(y_test))
print(len(X_valid)), print(len(y_valid))

38367
38367
4796
4796
4796
4796


(None, None)

In [16]:
'''
Creating the CRF Model for NER Tagging. 

We will be using the LGBFS algorithm (Gradient descent using the L-BFGS method) and it works best using a 
limited amount of computer memory. It is a popular algorithm for parameter estimation in machine learning. 
Gradient Descent will be used as an optimization function.

'''

crf = CRF(algorithm='lbfgs',
          c1=1,#coefficient for L1 regularization,
          c2=1,#coefficients for L2 regularization,
          max_iterations=100,
          all_possible_transitions=False)


try:
    crf.fit(X_train, y_train)
except AttributeError:
    pass

In [17]:
#making predictions on test data
y_pred = crf.predict(X_test)



'''
We remove the class O from the labels. This is because it is the NER tag for most of the words within the sentences. 
We can think of the dataset as being heavily class imbalanced. If 90% of the dataset is class 'O' and our model 
predicts everything as having class 'O', we have an accuracy of 90% but we have a problem. 

If we remove class 'O' and evaluate our predictions on the remaining entities, we have a better understanding of
how our model performs. 

It is important to look at Recall and F1 score for a better evaluation as well. 

'''
labels = list(crf.classes_) #defining labels
labels.remove('O') #removing class O

In [32]:
#Evaluating on the test set. 

print("For Testing Set: ")
print("Accuracy score:      {}".format(metrics.flat_accuracy_score(y_test, y_pred), labels = labels))
print("F1 score:          {}".format(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels = labels)))
print("Recall score:      {}".format(metrics.flat_recall_score(y_test, y_pred, average='weighted', labels = labels)))

For Testing Set: 
Accuracy score:      0.9706208850099526
F1 score:          0.843561671750781
Recall score:      0.8393217347453353


## Hyperparameter Tuning

In [18]:
'''
It is good practice to tune the parameters of the model to obtain the best possible model. 
The crf model takes the following parameters that we will be tuning:

1. c1 : coefficient for L1 regularization
2. c2 : coefficient for L2 regularization

'''

#%%time  #prints time taken to run this block of cell
crf_ = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted')

# search
rs = RandomizedSearchCV(crf_, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=10,
                        scoring=f1_scorer)


rs.fit(X_valid, y_valid)
#try:
#    rs.fit(X_valid, y_valid)
#except AttributeError:
#    pass

Fitting 3 folds for each of 10 candidates, totalling 30 fits


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 max_iterations=100),
                   n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff74fc96940>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7ff6ffefec70>},
                   scoring=make_scorer(flat_f1_score, average=weighted),
                   verbose=1)

In [19]:
#extracting parameters of best model

print('Best parameters:', rs.best_params_)
print('Best CV score:', rs.best_score_)
print('Model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

Best parameters: {'c1': 0.5374501506049948, 'c2': 0.011822447399065664}
Best CV score: 0.9609731539876526
Model size: 0.47M


In [20]:
#re-training model with best parameters

crf_tuned = CRF(algorithm='lbfgs',
          c1=0.5374501506049948,#coefficient for L1 regularization,
          c2=0.011822447399065664,#coefficients for L2 regularization,
          max_iterations=100,
          all_possible_transitions=False)


#training the CRF model on the training set
#crf.fit(X_train, y_train)

try:
    crf_tuned.fit(X_train, y_train)
except AttributeError:
    pass

In [22]:
#making predictions on test data

y_pred = crf_tuned.predict(X_test)


#removing class O NER tags.
labels = list(crf_tuned.classes_) #extracting labels
labels.remove('O') #removing class O

In [23]:
#Evaluating on test data

print("For Testing Set: ")
print("Accuracy score:      {}".format(metrics.flat_accuracy_score(y_test, y_pred), labels = labels))
print("F1 score:          {}".format(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels = labels)))
print("Recall score:      {}".format(metrics.flat_recall_score(y_test, y_pred, average='weighted', labels = labels)))

For Testing Set: 
Accuracy score:      0.9712140175219024
F1 score:          0.8490730261358983
Recall score:      0.8468168944711243
