## Named Entity Recognition
The task in NER is to assign an entity type to every word in a text. Entities can be, for example, locations, time expressions or names; words that belong to no entity are tagged `O`.
##### Dataset used:
Annotated Corpus for NER (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus/downloads/entity-annotated-corpus.zip/4)

In [66]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [3]:
# Load the token-level corpus (one row per word); the file is decoded as latin1.
data = pd.read_csv("entity-annotated-corpus/ner_dataset.csv", encoding="latin1")
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [5]:
# The data is structured by sentences and only has the "Sentence #" column
# filled for the first word of each sentence, so forward-fill the NaNs to
# label every row. Note that this forward-fills every column, not just
# "Sentence #".
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
# deprecated in recent pandas releases.
data = data.ffill()
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [7]:
# Distinct tokens in the corpus; going through set() makes the list order arbitrary.
words = list(set(data.Word.values))
n_words = len(words)
n_words

35178

In [9]:
# Inspect the sentence ids after forward-filling: every row now carries one.
data["Sentence #"].values

array(['Sentence: 1', 'Sentence: 1', 'Sentence: 1', ...,
       'Sentence: 47959', 'Sentence: 47959', 'Sentence: 47959'],
      dtype=object)

In [20]:
class GetSentences():
    """Sequentially hand out sentences from a token-level data frame.

    The frame must have the columns "Sentence #", "Word", "POS" and "Tag",
    with "Sentence #" holding labels of the form "Sentence: <n>" on every
    row. Sentences are read in numeric order starting at 1; iteration stops
    at the first sentence number that has no rows.

    Attributes:
        data: the frame the sentences are read from.
        n_sent: number of the sentence the next get_next() call will return.
        empty: True once get_next() has run past the last sentence.
    """

    def __init__(self, data):
        self.data = data
        self.n_sent = 1
        self.empty = False

    def get_next(self):
        """Return the next sentence as (words, pos_tags, entity_tags) lists.

        Returns:
            Three parallel lists for the current sentence, or
            (None, None, None) when no rows exist for the current sentence
            number; in that case self.empty is set to True and the cursor
            is not advanced.

        Raises:
            KeyError: if a required column is missing from the frame.
        """
        sent_num = "Sentence: {}".format(self.n_sent)
        s = self.data[self.data["Sentence #"] == sent_num]
        # Boolean filtering never raises for an unknown sentence id; it just
        # yields an empty frame, so exhaustion has to be detected explicitly.
        if len(s) == 0:
            self.empty = True
            return None, None, None
        self.n_sent += 1
        return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()

In [21]:
# Sentence-by-sentence reader over the corpus frame, starting at sentence 1.
sentence_getter = GetSentences(data)

In [28]:
# Pull the next sentence and show its words, POS tags and entity tags,
# one list per line.
sent, pos, tag = sentence_getter.get_next()
for column in (sent, pos, tag):
    print(column)

['Families', 'of', 'soldiers', 'killed', 'in', 'the', 'conflict', 'joined', 'the', 'protesters', 'who', 'carried', 'banners', 'with', 'such', 'slogans', 'as', '"', 'Bush', 'Number', 'One', 'Terrorist', '"', 'and', '"', 'Stop', 'the', 'Bombings', '.', '"']
['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', 'VBD', 'DT', 'NNS', 'WP', 'VBD', 'NNS', 'IN', 'JJ', 'NNS', 'IN', '``', 'NNP', 'NN', 'CD', 'NN', '``', 'CC', '``', 'VB', 'DT', 'NNS', '.', '``']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-per', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [29]:
# Flatten the corpus into two parallel sequences (one entry per token).
# NOTE: this rebinds `words` from the vocabulary list built earlier to the
# full token sequence.
words = data["Word"].values.tolist()
tags = data["Tag"].values.tolist()

### Memorization
For every word, remember the entity tag it most often carries in the training data and predict that tag. If a word was not seen during training, predict 'O'.

In [34]:
# BaseEstimator provides among other things a default implementation 
# for the get_params and set_params methods. This is useful to make 
# the model grid search-able with GridSearchCV for automated 
# parameters tuning and behave well with others when combined in a 
# Pipeline.
class MemoryTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that memorizes the most frequent tag of each word.

    Attributes set by fit():
        tags: the distinct tags, in the order they were first seen.
        memory: dict mapping each training word to its most frequent tag
            (ties go to the tag that was seen first for that word).
    """

    # Expects a list of words as X and a list of tags as y
    def fit(self, X, y):
        """Count tag occurrences per word and keep the most common one.

        Args:
            X: iterable of words.
            y: iterable of tags, aligned with X.

        Returns:
            self, as the scikit-learn estimator contract requires so that
            calls can be chained and the tagger can be used in a Pipeline.
        """
        voc = {}
        self.tags = []
        seen_tags = set()  # constant-time membership; self.tags keeps the order
        for word, tag in zip(X, y):
            if tag not in seen_tags:
                seen_tags.add(tag)
                self.tags.append(tag)
            counts = voc.setdefault(word, {})
            counts[tag] = counts.get(tag, 0) + 1
        self.memory = {
            word: max(counts, key=counts.get) for word, counts in voc.items()
        }
        return self

    # Predicts the tag from self.memory
    def predict(self, X, y=None):
        """Return the memorized tag for each word in X, or 'O' if unseen.

        y is accepted but not used.
        """
        return [self.memory.get(x, 'O') for x in X]

In [35]:
# Unfitted baseline tagger; fit() below populates its memory.
tagger = MemoryTagger()

In [37]:
# Fit on the whole corpus, then list the distinct tags in first-seen order.
tagger.fit(words, tags)
tagger.tags

['O',
 'B-geo',
 'B-gpe',
 'B-per',
 'I-geo',
 'B-org',
 'I-org',
 'B-tim',
 'B-art',
 'I-art',
 'I-per',
 'I-gpe',
 'I-tim',
 'B-nat',
 'B-eve',
 'I-eve',
 'I-nat']

In [52]:
# 5-fold cross-validated predictions: each token is tagged by a fresh
# MemoryTagger that was fitted without that token's fold.
# NOTE(review): folds are cut at token level, so a sentence can straddle two
# folds — confirm this is acceptable for the baseline.
pred = cross_val_predict(estimator=MemoryTagger(), 
                      X=words, 
                      y=tags, 
                      cv=5,
                      n_jobs=-1)

In [53]:
# Recall suffers because words that were not seen during fitting always
# fall back to 'O', so entity words missing from the training folds are
# never tagged as entities.
report = classification_report(y_pred=pred, y_true=tags)
print(report)

              precision    recall  f1-score   support

       B-art       0.20      0.05      0.09       402
       B-eve       0.54      0.25      0.34       308
       B-geo       0.78      0.85      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.42      0.28      0.33       201
       B-org       0.67      0.49      0.56     20143
       B-per       0.78      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.01       297
       I-eve       0.39      0.12      0.18       253
       I-geo       0.73      0.58      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.73      0.65      0.69     17251
       I-tim       0.58      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

   micro avg       0.95   

### Machine Learning Approach
Every word is converted to a simple feature vector, and a random forest is then used to classify the words. This basic conversion is not unique: many different words map to the same feature vector.

In [61]:
def feature_map(word):
    """Encode a word's surface form as five boolean indicators.

    The returned array holds, in order: title-cased, all lower-case,
    all upper-case, digits only, alphabetic only.
    """
    is_title = word.istitle()
    is_lower = word.islower()
    is_upper = word.isupper()
    is_digit = word.isdigit()
    is_alpha = word.isalpha()
    return np.array([is_title, is_lower, is_upper, is_digit, is_alpha])

In [63]:
# One five-flag feature vector per token, in corpus order.
simple_feature_words = list(map(feature_map, words))

In [64]:
# 5-fold cross-validated predictions from a small random forest.
# random_state pins the forest's randomness so that a Restart & Run All
# reproduces the same predictions.
pred = cross_val_predict(estimator=RandomForestClassifier(n_estimators=20, random_state=42),
                        X=simple_feature_words, y=tags, cv=5, n_jobs=-1)

In [65]:
# Poor: the five casing / character-class flags carry almost no information
# about which word it is, so most entity classes are never predicted.
rf_report = classification_report(y_pred=pred, y_true=tags)
print(rf_report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.64      0.09      0.16     37644
       B-gpe       0.00      0.00      0.00     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.69      0.15      0.25     20143
       B-per       0.70      0.20      0.31     16990
       B-tim       0.00      0.00      0.00     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.00      0.00      0.00     16784
       I-per       0.46      0.01      0.02     17251
       I-tim       0.00      0.00      0.00      6528
           O       0.86      1.00      0.92    887908

   micro avg       0.85   