In [1]:
import pandas as pd
import numpy as np
# Load the token-level NER table; the CSV is decoded as Latin-1.
data = pd.read_csv(
    "ner_dataset.csv",
    encoding="latin1",
)

In [2]:
# Forward-fill missing cells with the last non-null value above them in the
# same column (presumably "Sentence #" is only populated on the first token
# of each sentence in the raw file -- confirm against the CSV).
# `DataFrame.ffill()` is the supported spelling; `fillna(method="ffill")`
# is deprecated in recent pandas releases.
data = data.ffill()
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [3]:
# Vocabulary of distinct word forms. `Series.unique()` keeps first-appearance
# order, so the list is identical on every run; `list(set(...))` ordering
# depends on string hash randomization and changes between kernel restarts.
words = data["Word"].unique().tolist()
n_words = len(words)
n_words

35178

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class MajorityVotingTagger(BaseEstimator, TransformerMixin):
    """Memorization baseline: tag every word with its most frequent training tag.

    Attributes set by ``fit``:
        tags:   list of the distinct tags, in order of first appearance.
        mjvote: dict mapping each word seen in training to its majority tag.
    """

    def fit(self, X, y):
        """
        Count (word, tag) co-occurrences and remember the majority tag per word.

        X: list of words
        y: list of tags (aligned with X; extra items in the longer one are ignored)

        Returns self, following the scikit-learn estimator convention, so the
        call can be chained or used inside pipelines.
        """
        word2cnt = {}
        self.tags = []
        seen_tags = set()  # O(1) membership test; self.tags keeps first-seen order
        for x, t in zip(X, y):
            if t not in seen_tags:
                seen_tags.add(t)
                self.tags.append(t)
            counts = word2cnt.setdefault(x, {})
            counts[t] = counts.get(t, 0) + 1

        # On a tie, max() keeps the first tag inserted for that word.
        self.mjvote = {
            word: max(counts, key=counts.get)
            for word, counts in word2cnt.items()
        }
        return self

    def predict(self, X, y=None):
        '''
        Predict the tag of each word from memory. If a word is unknown, predict 'O'.

        X: iterable of words
        y: ignored; accepted for API compatibility

        Returns a list of tags, one per word in X.
        '''
        return [self.mjvote.get(x, 'O') for x in X]

In [5]:
from sklearn.ensemble import RandomForestClassifier
def get_feature(word):
    """Encode the surface shape of a single token as a 6-element numpy array.

    Layout: [istitle, islower, isupper, len(word), isdigit, isalpha].
    The booleans are stored next to the integer length, so numpy promotes
    the whole array to an integer dtype.
    """
    casing_flags = [word.istitle(), word.islower(), word.isupper()]
    char_class_flags = [word.isdigit(), word.isalpha()]
    return np.array(casing_flags + [len(word)] + char_class_flags)

In [6]:
# Baseline inputs: one shape-feature vector per token, plus the gold tag
# for the same row of `data`.
words = list(map(get_feature, data["Word"].values.tolist()))
tags = data["Tag"].tolist()

In [7]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
# 5-fold cross-validated predictions of a random forest trained on the
# word-shape features only. `random_state` pins the forest's internal
# randomness so the printed report is reproducible across runs.
pred = cross_val_predict(
    RandomForestClassifier(n_estimators=20, random_state=0),
    X=words,
    y=tags,
    cv=5,
)
report = classification_report(y_true=tags, y_pred=pred)
print(report)

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00       402
       B-eve       0.00      0.00      0.00       308
       B-geo       0.26      0.80      0.40     37644
       B-gpe       0.25      0.04      0.07     15870
       B-nat       0.00      0.00      0.00       201
       B-org       0.65      0.17      0.27     20143
       B-per       0.97      0.20      0.33     16990
       B-tim       0.29      0.32      0.30     20333
       I-art       0.00      0.00      0.00       297
       I-eve       0.00      0.00      0.00       253
       I-geo       0.00      0.00      0.00      7414
       I-gpe       0.00      0.00      0.00       198
       I-nat       0.00      0.00      0.00        51
       I-org       0.36      0.03      0.06     16784
       I-per       0.46      0.02      0.04     17251
       I-tim       0.50      0.06      0.11      6528
           O       0.97      0.98      0.97    887908

   micro avg       0.87   

In [8]:
def get_sentences(data):
    """Group the token table into sentences.

    Args:
        data: DataFrame with "Sentence #", "Word", "POS" and "Tag" columns,
            one row per token.

    Returns:
        A list with one entry per distinct "Sentence #" value. Each entry is
        a list of (word, pos, tag) tuples in row order. Groups come back in
        sorted key order (the groupby default), which for string keys is
        lexicographic: "Sentence: 10" sorts before "Sentence: 2".
        An empty frame yields an empty list.
    """
    # Iterate the groups directly instead of `groupby(...).apply(...)`:
    # apply() on the grouping columns is deprecated in recent pandas, and on
    # an empty frame it returns a DataFrame whose iteration yields column
    # labels rather than sentences.
    return [
        list(zip(group["Word"].values.tolist(),
                 group["POS"].values.tolist(),
                 group["Tag"].values.tolist()))
        for _, group in data.groupby("Sentence #")
    ]

In [9]:
# One list of (word, pos, tag) tuples per sentence in the token table.
sentences = get_sentences(data=data)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Build one feature row per token (`out`) and the aligned gold tags (`y`).
#
# Feature layout of each row:
#   [istitle, islower, isupper, len, isdigit, isalpha,
#    memory tag of the word, POS of the word,
#    memory tag of the next word, POS of the next word,
#    memory tag of the previous word, POS of the previous word]
# Sentence boundaries are padded with tag 'O' and POS '.'.
out = []
y = []
mv_tagger = MajorityVotingTagger()
tag_encoder = LabelEncoder()
pos_encoder = LabelEncoder()

words = data["Word"].values.tolist()
pos = data["POS"].values.tolist()
tags = data["Tag"].values.tolist()

# NOTE(review): the memory tagger is fit on the full dataset, so its
# predictions used as features below have already seen the labels of the
# folds they are later evaluated on -- cross-validated scores built on
# these features are optimistic.
mv_tagger.fit(words, tags)
tag_encoder.fit(tags)
pos_encoder.fit(pos)

# LabelEncoder codes are positions in the sorted `classes_` array. Building
# the lookup tables once avoids calling `transform` several times per token.
tag2idx = {tag: idx for idx, tag in enumerate(tag_encoder.classes_.tolist())}
pos2idx = {tag: idx for idx, tag in enumerate(pos_encoder.classes_.tolist())}
pad_tag = tag2idx['O']
pad_pos = pos2idx['.']


def _memory_tag_code(word):
    """Encoded majority-vote tag of `word` ('O' for unseen words)."""
    return tag2idx[mv_tagger.predict([word])[0]]


for sentence in sentences:
    last = len(sentence) - 1
    for i, (w, p, t) in enumerate(sentence):
        if i < last:
            # Not the last word: information from the following word is available.
            mem_tag_r = _memory_tag_code(sentence[i + 1][0])
            true_pos_r = pos2idx[sentence[i + 1][1]]
        else:
            mem_tag_r, true_pos_r = pad_tag, pad_pos

        if i > 0:
            # Not the first word: information from the preceding word is available.
            mem_tag_l = _memory_tag_code(sentence[i - 1][0])
            true_pos_l = pos2idx[sentence[i - 1][1]]
        else:
            mem_tag_l, true_pos_l = pad_tag, pad_pos

        # Every entry must be a scalar: the original put a 1-element array in
        # the memory-tag slot (missing `[0]`), which made the row ragged.
        out.append(np.array([w.istitle(), w.islower(), w.isupper(), len(w), w.isdigit(), w.isalpha(),
                             _memory_tag_code(w),
                             pos2idx[p], mem_tag_r, true_pos_r, mem_tag_l, true_pos_l]))
        y.append(t)

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
# 5-fold cross-validated predictions of a random forest trained on the
# context-enriched feature rows. `random_state` pins the forest's internal
# randomness so the printed report is reproducible across runs.
pred = cross_val_predict(
    RandomForestClassifier(n_estimators=20, random_state=0),
    X=out,
    y=y,
    cv=5,
)
report = classification_report(y_true=y, y_pred=pred)
print(report)