In [None]:
import nltk
import collections 

from nltk.corpus import conll2002
from sklearn.linear_model import Perceptron
from nltk.classify.scikitlearn import SklearnClassifier


In [None]:
class NamedEntityTagger(nltk.TaggerI):
    """Token-by-token named-entity tagger backed by an NLTK Naive Bayes classifier.

    Every token is classified from the feature dict returned by the
    module-level ``ne_features(sentence, i, history)`` function, which is
    looked up when the tagger is trained or used. ``history`` is the list of
    NE tags already assigned to the earlier tokens of the same sentence.
    """

    def __init__(self, train_sents):
        """Build the training set and train the classifier.

        train_sents: iterable of sentences, each a sequence of
            (word, pos_tag, ne_tag) triples.
        """
        train_set = []
        for sentence in train_sents:
            # The feature extractor only sees (word, pos) pairs, never the labels.
            untagged_sent = [(word, tag) for (word, tag, ne_tag) in sentence]
            history = []
            for i, (word, tag, ne_tag) in enumerate(sentence):
                featureset = ne_features(untagged_sent, i, history)
                train_set.append((featureset, ne_tag))
                # During training the history is made of the reference tags.
                history.append(ne_tag)

        # Train exactly one model. Fitting another classifier first and then
        # overwriting self.classifier would only waste a full training pass.
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

        # Alternatives with scikit-learn (use instead of the line above):
        #   self.classifier = SklearnClassifier(Perceptron())
        #   self.classifier.train(train_set)
        #
        #   self.classifier = SklearnClassifier(LinearSVC())
        #   self.classifier.train(train_set)

    def tag(self, sentence):
        """Tag one sentence.

        sentence: sequence of (word, pos_tag) pairs.
        Returns a list of ((word, pos_tag), ne_tag) pairs, one per token.
        """
        history = []
        for i, (_word, _pos) in enumerate(sentence):
            featureset = ne_features(sentence, i, history)
            predicted = self.classifier.classify(featureset)
            # At prediction time the history is made of the predicted tags.
            history.append(predicted)
        # Materialise the pairs so the result is a real list on Python 3 too.
        return list(zip(sentence, history))

In [None]:
def eval(nerctagger, test_sentences):
    """Print per-tag and macro-averaged precision, recall and F-measure.

    nerctagger: object whose ``tag(sentence)`` takes a list of (word, pos)
        pairs and returns an iterable of ((word, pos), predicted_tag) pairs.
    test_sentences: iterable of sentences, each a sequence of
        (word, pos, ne_tag) triples holding the reference tags.

    Scores are computed at token level for the six LOC/ORG/PER tags listed
    below. Returns None; results are only printed.

    Note: the name shadows the builtin ``eval``; it is kept because existing
    callers use it.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    # Give every token a corpus-wide index and file it under its reference
    # label and under its predicted label.
    token_index = 0
    for test_sentence in test_sentences:
        tagged_sentence = nerctagger.tag(
            [(word, tag) for (word, tag, ne_tag) in test_sentence])
        for (_word, _tag, label), (_pair, predicted) in zip(test_sentence, tagged_sentence):
            refsets[label].add(token_index)
            testsets[predicted].add(token_index)
            token_index += 1

    tags = ['B-LOC', 'I-LOC', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER']

    ma_precision, ma_recall, ma_fmeasure = 0.0, 0.0, 0.0
    for label_type in tags:
        reference = refsets[label_type]
        predicted_set = testsets[label_type]
        precision = nltk.metrics.precision(reference, predicted_set)
        recall = nltk.metrics.recall(reference, predicted_set)
        fmeasure = nltk.metrics.f_measure(reference, predicted_set)
        # Single pre-formatted argument: prints the same text on Python 2 and 3.
        print('precision(%s): %s' % (label_type, precision))
        print('recall(%s): %s' % (label_type, recall))
        print('F-measure(%s): %s' % (label_type, fmeasure))
        print('\n')
        # NLTK returns None for an undefined score (label never predicted or
        # never present in the reference). Count it as 0.0 in the macro
        # average instead of crashing on "float + None".
        ma_precision += precision or 0.0
        ma_recall += recall or 0.0
        ma_fmeasure += fmeasure or 0.0

    print("--------------------------------------------------------------------------------")
    print("Precision (Ma): %s" % (ma_precision / len(tags)))
    print("Recall (Ma): %s" % (ma_recall / len(tags)))
    print("F-measure (Ma): %s" % (ma_fmeasure / len(tags)))
    print("--------------------------------------------------------------------------------")

In [None]:
# Load the two CoNLL-2002 files used below: 'esp.train' for training and
# 'esp.testa' for evaluation. Downstream code consumes each sentence as a
# sequence of (word, pos, ne_tag) triples.
train_sentences, test_sentences = (
    conll2002.iob_sents(fileid) for fileid in ('esp.train', 'esp.testa')
)

In [None]:
def ne_features(sentence, i, history):
    """Baseline feature extractor: only the surface form of the current token.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: NE tags assigned to the preceding tokens (unused here).
    Returns a dict with the single key "word".
    """
    # Unpack as a pair so a malformed token still fails loudly.
    word, _pos = sentence[i]
    return dict(word=word)

# Train with the word-only ne_features defined in this cell and print the scores.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

In [None]:
def ne_features(sentence, i, history):
    """Feature extractor using the current token's word and part-of-speech tag.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: NE tags assigned to the preceding tokens (unused here).
    Returns a dict with the keys "word" and "pos".
    """
    current_word, current_pos = sentence[i]
    return dict(word=current_word, pos=current_pos)

# Retrain with the word + pos ne_features defined in this cell and print the scores.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

In [None]:
def ne_features(sentence, i, history):
    """Feature extractor: current word and POS tag plus the previous POS tag.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: NE tags assigned to the preceding tokens (unused here).
    Returns a dict with the keys "word", "pos" and "prevpos"; "prevpos" is
    "<START>" for the first token of the sentence.
    """
    word, pos = sentence[i]
    prevpos = "<START>"
    if i != 0:
        # Only the POS tag of the previous token is used as a feature.
        _prevword, prevpos = sentence[i - 1]
    features = {"word": word, "pos": pos}
    features["prevpos"] = prevpos
    return features

# Retrain with the ne_features defined in this cell (adds "prevpos") and print the scores.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)

In [None]:
def ne_features(sentence, i, history):
    """Richest feature extractor: current token, previous token and previous NE tag.

    sentence: sequence of (word, pos) pairs.
    i: index of the token to describe.
    history: NE tags already assigned to tokens 0..i-1 of this sentence.
    Returns a dict with the keys "word", "isAlnum", "isDigit", "pos",
    "prevword", "prevpos" and "prevtag"; the three "prev*" values are
    "<START>" for the first token of the sentence.
    """
    word, pos = sentence[i]
    if i == 0:
        prevword = prevpos = prevtag = "<START>"
    else:
        prevword, prevpos = sentence[i - 1]
        prevtag = history[i - 1]
    features = {
        "word": word,
        "isAlnum": word.isalnum(),
        "isDigit": word.isdigit(),
        "pos": pos,
        "prevword": prevword,
        "prevpos": prevpos,
        "prevtag": prevtag,
    }
    return features

# Retrain with the full ne_features defined in this cell (adds word-shape flags,
# the previous word and the previous NE tag) and print the scores.
nerctagger = NamedEntityTagger(train_sentences)
eval(nerctagger, test_sentences)