In [1]:
import pandas as pd
import numpy as np

# Load the token-level NER corpus; the CSV is decoded as Latin-1 rather than UTF-8.
data = pd.read_csv("../data/ner_dataset.csv", encoding="latin1")

In [2]:
# Forward-fill missing cells with the last value seen above them.
# NOTE(review): presumably "Sentence #" is only populated on the first token of
# each sentence in the raw CSV — verify against the file.
# `DataFrame.ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
data = data.ffill()
data.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [3]:
# Vocabulary: every distinct word form in the corpus (set order is arbitrary).
words = [*set(data['Word'].values)]

In [4]:
# Vocabulary size: number of distinct word forms collected in `words`.
n_words = len(words)
print(n_words)

35178


So we have 47959 sentences containing 35178 different words.

We start by writing a small class that retrieves one sentence at a time from the dataset.

In [5]:
class SentenceGetter(object):
    """Serve a token-level frame one sentence at a time.

    The frame must provide the columns "Sentence #" (values of the form
    "Sentence: <n>"), "Word", "POS" and "Tag". Sentences are requested by
    number, starting at 1 and increasing by one per successful call.

    Attributes:
        n_sent: number of the sentence the next ``get_next`` call asks for.
        data: the frame being read.
        empty: True once a requested sentence number matched no rows.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

    def get_next(self):
        """Return the next sentence as ``(words, pos_tags, entity_tags)``.

        Returns:
            Three parallel lists for the sentence numbered ``self.n_sent``, or
            ``(None, None, None)`` when no row carries that sentence number.
            In the latter case ``self.empty`` is set to True and ``n_sent`` is
            left unchanged, so repeated calls keep returning the sentinel.

        Raises:
            KeyError: if one of the required columns is missing from the frame.
        """
        label = "Sentence: {}".format(self.n_sent)
        s = self.data[self.data["Sentence #"] == label]
        if s.empty:
            # A boolean-mask filter never raises for an unknown sentence, it
            # simply yields zero rows; exhaustion has to be detected explicitly.
            self.empty = True
            return None, None, None
        self.n_sent += 1
        return s["Word"].values.tolist(), s["POS"].values.tolist(), s["Tag"].values.tolist()

In [6]:
# Wrap the full frame; the getter starts at sentence number 1.
getter = SentenceGetter(data)

In [7]:
# Pull the first sentence as three parallel lists: words, POS tags, entity tags.
sent, pos, tag = getter.get_next()

In [8]:
# Show the three parallel lists, one per line.
print(sent, pos, tag, sep="\n")

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']
['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


## A first idea: Memorization

A simple first idea, and a useful baseline, is to remember the most common named-entity tag for every word and predict that. If we do not know a word, we just predict 'O'. The following class does exactly that. It inherits from the scikit-learn base classes so that it can be used with scikit-learn's built-in cross-validation.

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

In [10]:
class MemoryTagger(BaseEstimator, TransformerMixin):
    """Baseline tagger that memorizes the most frequent tag of every word.

    Attributes set by ``fit``:
        tags: distinct tags seen during fitting, in order of first appearance.
        memory: dict mapping each training word to its most frequent tag.
    """

    def fit(self, X, y):
        """Count the tags of every word and remember the most frequent one.

        Args:
            X: iterable of words.
            y: iterable of tags, aligned with ``X``.

        Returns:
            self, following the scikit-learn estimator convention so that
            calls can be chained.
        """
        voc = {}
        seen_tags = {}  # dict used as an insertion-ordered set
        for x, t in zip(X, y):
            seen_tags.setdefault(t, None)
            tag_counts = voc.setdefault(x, {})
            tag_counts[t] = tag_counts.get(t, 0) + 1
        self.tags = list(seen_tags)
        # On a tie, max() keeps the tag that was counted first for that word.
        self.memory = {word: max(counts, key=counts.get)
                       for word, counts in voc.items()}
        return self

    def predict(self, X, y=None):
        """Return the memorized tag for each word in ``X``; unknown words get 'O'."""
        return [self.memory.get(x, 'O') for x in X]

In [11]:
# Sanity check of the arg-max idiom used by the tagger:
# max(d, key=d.get) returns the key holding the largest count.
temp_voc = {
    'a': {'B': 3, 'E': 2},
    'b': {'B': 5, 'E': 4},
}
for k in temp_voc:
    d = temp_voc[k]
    print(d, max(d, key=d.get))
    

{'B': 3, 'E': 2} B
{'B': 5, 'E': 4} B


In [12]:
# Fresh, unfitted baseline tagger for a quick sanity check.
tagger = MemoryTagger()

In [13]:
# Fit on the single example sentence only — enough to check the mechanics.
tagger.fit(sent, tag)

In [14]:
# Predictions on the training sentence (first line) against the gold tags (second line).
print(tagger.predict(sent), tag, sep="\n")

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [15]:
# Distinct tags seen during fit, in order of first appearance.
tagger.tags

['O', 'B-geo', 'B-gpe']

It looks like it basically works. Now we do a 5-fold cross-validation.

In [16]:
from sklearn.model_selection import  cross_val_predict
from sklearn.metrics import classification_report

In [17]:
# Flatten the whole corpus into two aligned token-level lists.
# Note: this rebinds `words`, which previously held the vocabulary.
words, tags = (data[column].values.tolist() for column in ('Word', 'Tag'))

We will use the scikit-learn classification report to evaluate the tagger, because we are basically interested in precision, recall, and the f1-score.

In [18]:
# Out-of-fold predictions of the memorization baseline (5 folds).
pred = cross_val_predict(MemoryTagger(), words, tags, cv=5)

In [19]:
# Per-tag precision / recall / F1 of the baseline.
report = classification_report(tags, pred)
print(report)

              precision    recall  f1-score   support

       B-art       0.20      0.05      0.09       402
       B-eve       0.54      0.25      0.34       308
       B-geo       0.78      0.85      0.81     37644
       B-gpe       0.94      0.93      0.94     15870
       B-nat       0.42      0.28      0.33       201
       B-org       0.67      0.49      0.56     20143
       B-per       0.78      0.65      0.71     16990
       B-tim       0.87      0.77      0.82     20333
       I-art       0.04      0.01      0.01       297
       I-eve       0.39      0.12      0.18       253
       I-geo       0.73      0.58      0.65      7414
       I-gpe       0.62      0.45      0.52       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.69      0.53      0.60     16784
       I-per       0.73      0.65      0.69     17251
       I-tim       0.58      0.13      0.21      6528
           O       0.97      0.99      0.98    887908

    accuracy              

This doesn't look too bad! The precision is quite reasonable, but as you might have guessed, the recall is pretty weak. This is because we cannot make a meaningful prediction for words we have not seen during training. To overcome this issue, we will now introduce a simple machine learning model to predict the named entities.

### A simple machine learning approach

To do machine learning, we convert the data to a simple feature vector for every word and then use a random forest to classify the words.

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
def feature_map(word):
    """Turn a single word into a small numeric feature vector.

    This simplest feature map only looks at the word itself. The returned
    array holds, in order: is-title-case, is-lower-case, is-upper-case,
    length, is-all-digits, is-all-alphabetic.
    """
    casing = [word.istitle(), word.islower(), word.isupper()]
    shape = [len(word), word.isdigit(), word.isalpha()]
    return np.array(casing + shape)

In [26]:
# One feature vector per token; `words` now holds features instead of strings.
words = list(map(feature_map, data['Word'].values.tolist()))

In [29]:
# First token next to its feature vector.
data['Word'].values[0], words[0]

('Thousands', array([1, 0, 0, 9, 0, 1]))

In [31]:
# Seed the forest so the cross-validated scores are reproducible across runs.
pred = cross_val_predict(RandomForestClassifier(n_estimators=20, random_state=42),
                         X=words, y=tags, cv=5)

In [33]:
# Per-tag precision / recall / F1 of the word-only random forest.
report = classification_report(tags, pred)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.79      0.40     37644
       B-gpe       0.26      0.06      0.09     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.47      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

    accuracy              

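That looks really bad. This is expected, since the features lack a lot of the information needed for the decision. So now we improve the features in two ways: on the one hand we enrich the features of the word itself (its memorized tag and its POS tag), and on the other hand we add context information from the neighboring words.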

In [34]:
from sklearn.preprocessing import LabelEncoder

In [35]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Build per-token feature vectors from word shape, POS and memorized tags.

    Each token is described by 12 integers, in this order: is-title-case,
    is-lower-case, is-upper-case, length, is-all-digits, is-all-alphabetic,
    encoded memorized tag of the word, encoded POS of the word, encoded
    memorized tag of the next word, encoded memorized tag of the previous
    word, encoded POS of the next word, encoded POS of the previous word.
    """

    def __init__(self):
        self.memory_tagger = MemoryTagger()
        self.tag_encoder = LabelEncoder()
        self.pos_encoder = LabelEncoder()

    def fit(self, X, y):
        """Fit the memory tagger and both label encoders.

        Args:
            X: frame with the columns "Word", "POS" and "Tag".
            y: unused; the tags are read from ``X["Tag"]``.

        Returns:
            self.
        """
        words = X["Word"].values.tolist()
        # Bug fix: this used to read X["Word"], so the POS encoder was fitted
        # on words and almost every POS feature fell back to -1.
        self.pos = X["POS"].values.tolist()
        tags = X["Tag"].values.tolist()
        self.memory_tagger.fit(words, tags)
        self.tag_encoder.fit(tags)
        self.pos_encoder.fit(self.pos)
        return self

    def transform(self, X, y=None):
        """Return a list with one int64 feature array per row of ``X``.

        Args:
            X: frame with the columns "Word" and "POS".
            y: unused.

        POS tags not seen during ``fit`` are encoded as -1. The first token,
        and any token whose previous word is ".", gets the 'O' tag and the "."
        POS as its "previous" context; the last token gets the same values as
        its "next" context.

        Raises:
            ValueError: from the tag encoder if 'O' was not among the tags
                seen during ``fit``.
        """
        pos = X["POS"].values.tolist()
        words = X["Word"].values.tolist()
        if not words:
            return []

        # Build the label -> code tables once. Calling LabelEncoder.transform
        # per token (and scanning the training POS list per lookup) made this
        # method intractably slow on the full corpus.
        tag_code = {t: i for i, t in enumerate(self.tag_encoder.classes_.tolist())}
        pos_code = {p: i for i, p in enumerate(self.pos_encoder.classes_.tolist())}
        outside_tag = int(self.tag_encoder.transform(['O'])[0])
        boundary_pos = pos_code.get(".", -1)

        memo_codes = [tag_code[t] for t in self.memory_tagger.predict(words)]
        pos_codes = [pos_code.get(p, -1) for p in pos]

        last = len(words) - 1
        out = []
        for i, w in enumerate(words):
            # Context from the following token.
            if i < last:
                wp, posp = memo_codes[i + 1], pos_codes[i + 1]
            else:
                wp, posp = outside_tag, boundary_pos
            # Context from the preceding token; a "." marks a sentence start.
            if i > 0 and words[i - 1] != ".":
                wm, posm = memo_codes[i - 1], pos_codes[i - 1]
            else:
                wm, posm = outside_tag, boundary_pos
            out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w),
                                 w.isdigit(), w.isalpha(),
                                 memo_codes[i], pos_codes[i],
                                 wp, wm, posp, posm], dtype=np.int64))
        return out
    

In [36]:
from sklearn.pipeline import Pipeline
# Context-aware features feeding a random forest; the forest is seeded so the
# cross-validated scores are reproducible across runs.
pred = cross_val_predict(Pipeline([("feature_map", FeatureTransformer()),
                                   ("clf", RandomForestClassifier(n_estimators=20, n_jobs=3,
                                                                  random_state=42))]),
                         X=data, y=tags, cv=5)

KeyboardInterrupt: 

In [None]:
# Per-tag precision / recall / F1 of the context-aware pipeline.
report = classification_report(tags, pred)
print(report)