In [31]:
import nltk
import collections 

from nltk.corpus import conll2002
from sklearn.linear_model import Perceptron
from nltk.classify.scikitlearn import SklearnClassifier


In [39]:
class NamedEntityTagger(nltk.TaggerI):
    """Greedy left-to-right named-entity tagger built on a per-token classifier.

    Feature dictionaries are produced by the module-level function
    ``ne_features(sentence, i, history)``.  That name is looked up each time
    it is called, both while training and while tagging, so a tagger must be
    re-created after ``ne_features`` is redefined; otherwise the classifier is
    queried with features it was not trained on.
    """

    def __init__(self, train_sents, trainer=None):
        """Build the training set and fit the classifier.

        train_sents -- iterable of sentences, each a sequence of
            (word, pos_tag, ne_tag) triples.
        trainer -- optional callable taking the list of (featureset, ne_tag)
            pairs and returning a trained classifier that exposes
            ``classify(featureset)``.  Defaults to
            ``nltk.NaiveBayesClassifier.train``.  A scikit-learn model can be
            plugged in with e.g. ``SklearnClassifier(Perceptron()).train``.
        """
        train_set = []
        for sentence in train_sents:
            untagged_sent = [(word, pos) for (word, pos, _ne_tag) in sentence]
            # During training the history holds the gold tags of the
            # preceding tokens of the same sentence.
            history = []
            for i, (_word, _pos, ne_tag) in enumerate(sentence):
                featureset = ne_features(untagged_sent, i, history)
                train_set.append((featureset, ne_tag))
                history.append(ne_tag)

        # Train exactly one model; fitting a second classifier only to
        # overwrite it would waste a full pass over the training data.
        if trainer is None:
            trainer = nltk.NaiveBayesClassifier.train
        self.classifier = trainer(train_set)

    def tag(self, sentence):
        """Tag one sentence.

        sentence -- sequence of (word, pos_tag) pairs.

        Returns a list of ((word, pos_tag), ne_tag) pairs.  Each prediction is
        appended to the history that is handed to ``ne_features`` for the
        following tokens.
        """
        history = []
        for i in range(len(sentence)):
            featureset = ne_features(sentence, i, history)
            history.append(self.classifier.classify(featureset))
        # Materialise the pairs so the result can be iterated more than once.
        return list(zip(sentence, history))

In [20]:
def eval(nerctagger, test_sentences):
    """Print per-label and macro-averaged precision, recall and F-measure.

    NOTE: this function shadows the builtin ``eval``; the name is kept so
    existing calls keep working.

    nerctagger -- object whose ``tag(sentence)`` method takes a list of
        (word, pos_tag) pairs and returns an iterable of
        ((word, pos_tag), predicted_ne_tag) pairs.
    test_sentences -- iterable of sentences, each a sequence of
        (word, pos_tag, ne_tag) triples holding the reference tags.

    Scores are computed per token for the six labels B-LOC, I-LOC, B-ORG,
    I-ORG, B-PER and I-PER.  ``nltk.metrics`` returns ``None`` for a score
    that is undefined (precision when a label is never predicted, recall when
    it never occurs in the reference, F-measure in either case); such scores
    are reported and averaged as 0.0 instead of raising ``TypeError``.

    Returns None; all results are written to standard output.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    # Every token gets a corpus-wide running id; each label maps to the set
    # of token ids carrying it in the reference / in the prediction.
    token_id = 0
    for test_sentence in test_sentences:
        untagged = [(word, pos) for (word, pos, _ne_tag) in test_sentence]
        tagged_sentence = nerctagger.tag(untagged)
        for (_word, _pos, label), (_pair, predicted) in zip(test_sentence, tagged_sentence):
            refsets[label].add(token_id)
            testsets[predicted].add(token_id)
            token_id += 1

    tags = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

    def _score(metric, label_type):
        # Map an undefined (None) score to 0.0 so it can be summed.
        value = metric(refsets[label_type], testsets[label_type])
        return 0.0 if value is None else value

    ma_precision = ma_recall = ma_fmeasure = 0.0
    for label_type in tags:
        precision = _score(nltk.metrics.precision, label_type)
        recall = _score(nltk.metrics.recall, label_type)
        fmeasure = _score(nltk.metrics.f_measure, label_type)
        # Single-argument print calls emit the same text on Python 2 and 3.
        print('precision(%s): %s' % (label_type, precision))
        print('recall(%s): %s' % (label_type, recall))
        print('F-measure(%s): %s' % (label_type, fmeasure))
        print('\n')
        ma_precision += precision
        ma_recall += recall
        ma_fmeasure += fmeasure

    print("--------------------------------------------------------------------------------")
    print("Precision (Ma): %s" % (ma_precision / len(tags)))
    print("Recall (Ma): %s" % (ma_recall / len(tags)))
    print("F-measure (Ma): %s" % (ma_fmeasure / len(tags)))
    print("--------------------------------------------------------------------------------")

In [23]:
# Load the IOB-annotated sentences of the CoNLL-2002 corpus: 'esp.train' is
# used to fit the tagger and 'esp.testa' to score it.  The code that consumes
# these sentences unpacks every token as a (word, pos_tag, ne_tag) triple.
train_sentences = conll2002.iob_sents('esp.train')
test_sentences = conll2002.iob_sents('esp.testa')

In [33]:
def ne_features(sentence, i, history):
    """Baseline feature set: only the surface form of token ``i``.

    sentence -- sequence of (word, pos_tag) pairs.
    i -- index of the token to describe.
    history -- tags of the preceding tokens; not used by this variant.
    """
    token, _pos_tag = sentence[i]
    features = dict(word=token)
    return features

# Train a tagger with the ne_features defined just above and print its
# per-label and macro-averaged scores on test_sentences.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

precision(B-LOC): 0.607287449393
recall(B-LOC): 0.609756097561
F-measure(B-LOC): 0.608519269777


precision(I-LOC): 0.509345794393
recall(I-LOC): 0.323442136499
F-measure(I-LOC): 0.395644283122


precision(B-ORG): 0.782608695652
recall(B-ORG): 0.582352941176
F-measure(B-ORG): 0.667790893761


precision(I-ORG): 0.496124031008
recall(I-ORG): 0.234260614934
F-measure(I-ORG): 0.318249627051


precision(B-PER): 0.735457063712
recall(B-PER): 0.434533551555
F-measure(B-PER): 0.546296296296


precision(I-PER): 0.580952380952
recall(I-PER): 0.284051222352
F-measure(I-PER): 0.381548084441


--------------------------------------------------------------------------------
Precision (Ma): 0.618629235852
Recall (Ma): 0.411399427346
F-measure (Ma): 0.486341409075
--------------------------------------------------------------------------------


In [34]:
def ne_features(sentence, i, history):
    """Feature set made of the surface form and the POS tag of token ``i``.

    sentence -- sequence of (word, pos_tag) pairs.
    i -- index of the token to describe.
    history -- tags of the preceding tokens; not used by this variant.
    """
    token, pos_tag = sentence[i]
    return dict(word=token, pos=pos_tag)

# Re-train with the word + POS ne_features defined just above (the tagger
# must be rebuilt whenever ne_features changes) and print the scores.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

precision(B-LOC): 0.740585774059
recall(B-LOC): 0.359756097561
F-measure(B-LOC): 0.484268125855


precision(I-LOC): 0.572413793103
recall(I-LOC): 0.246290801187
F-measure(I-LOC): 0.344398340249


precision(B-ORG): 0.739130434783
recall(B-ORG): 0.62
F-measure(B-ORG): 0.674344209853


precision(I-ORG): 0.561739130435
recall(I-ORG): 0.236456808199
F-measure(I-ORG): 0.332818134982


precision(B-PER): 0.653209109731
recall(B-PER): 0.516366612111
F-measure(B-PER): 0.576782449726


precision(I-PER): 0.207865168539
recall(I-PER): 0.559953434226
F-measure(I-PER): 0.303183107469


--------------------------------------------------------------------------------
Precision (Ma): 0.579157235108
Recall (Ma): 0.423137292214
F-measure (Ma): 0.452632394689
--------------------------------------------------------------------------------


In [40]:
def ne_features(sentence, i, history):
    """Feature set: word and POS of token ``i`` plus the POS of the previous token.

    sentence -- sequence of (word, pos_tag) pairs.
    i -- index of the token to describe.
    history -- tags of the preceding tokens; not used by this variant.

    For the first token the previous POS is the sentinel "<START>".
    """
    token, pos_tag = sentence[i]
    # The previous word is unpacked too, but only its POS enters the features.
    _before_word, before_pos = ("<START>", "<START>") if i == 0 else sentence[i - 1]
    return dict(word=token, pos=pos_tag, prevpos=before_pos)

# Re-train with the word + POS + previous-POS ne_features defined just above
# and print the scores on test_sentences.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

precision(B-LOC): 0.392444910808
recall(B-LOC): 0.760162601626
F-measure(B-LOC): 0.517647058824


precision(I-LOC): 0.7
recall(I-LOC): 0.290801186944
F-measure(I-LOC): 0.410901467505


precision(B-ORG): 0.548181818182
recall(B-ORG): 0.709411764706
F-measure(B-ORG): 0.618461538462


precision(I-ORG): 0.565853658537
recall(I-ORG): 0.254758418741
F-measure(I-ORG): 0.351337708228


precision(B-PER): 0.68496849685
recall(B-PER): 0.622749590835
F-measure(B-PER): 0.652378911273


precision(I-PER): 0.398224476855
recall(I-PER): 0.731082654249
F-measure(I-PER): 0.515599343186


--------------------------------------------------------------------------------
Precision (Ma): 0.548278893538
Recall (Ma): 0.561494369517
F-measure (Ma): 0.511054337913
--------------------------------------------------------------------------------


In [47]:
def ne_features(sentence, i, history):
    """Feature set combining lexical, contextual and history information.

    sentence -- sequence of (word, pos_tag) pairs.
    i -- index of the token to describe.
    history -- tags already assigned to tokens 0..i-1 of this sentence.

    Returns a dict with the word, whether it is alphanumeric / all digits,
    its POS tag, and the word, POS tag and assigned tag of the previous
    token.  For the first token the three "previous" features are the
    sentinel "<START>".
    """
    token, pos_tag = sentence[i]
    if i != 0:
        before_word, before_pos = sentence[i - 1]
        before_tag = history[i - 1]
    else:
        before_word = before_pos = before_tag = "<START>"

    features = {}
    features["word"] = token
    features["isAlnum"] = token.isalnum()
    features["isDigit"] = token.isdigit()
    features["pos"] = pos_tag
    features["prevword"] = before_word
    features["prevpos"] = before_pos
    features["prevtag"] = before_tag
    return features

# Re-train with the full ne_features defined just above (lexical flags,
# previous word/POS and previously assigned tag) and print the scores.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

precision(B-LOC): 0.487297921478
recall(B-LOC): 0.643292682927
F-measure(B-LOC): 0.554533508541


precision(I-LOC): 0.302845528455
recall(I-LOC): 0.442136498516
F-measure(I-LOC): 0.359469240048


precision(B-ORG): 0.484848484848
recall(B-ORG): 0.715294117647
F-measure(B-ORG): 0.577946768061


precision(I-ORG): 0.431399631676
recall(I-ORG): 0.685944363104
F-measure(I-ORG): 0.529677784059


precision(B-PER): 0.556935817805
recall(B-PER): 0.660392798691
F-measure(B-PER): 0.604268064395


precision(I-PER): 0.617258883249
recall(I-PER): 0.707799767171
F-measure(I-PER): 0.659436008677


--------------------------------------------------------------------------------
Precision (Ma): 0.480097711252
Recall (Ma): 0.642476704676
F-measure (Ma): 0.547555228964
--------------------------------------------------------------------------------
