In [66]:
import nltk 

class NamedEntityTagger(nltk.TaggerI):
    """Token-by-token named-entity tagger built on a Naive Bayes classifier.

    Every token is described by the feature dict returned by ``ne_features``
    and classified independently; the labels already assigned earlier in the
    same sentence are handed to ``ne_features`` as ``history``.
    """

    def __init__(self, train_sents):
        """Train the underlying classifier.

        :param train_sents: iterable of sentences, each a sequence of
            ``(word, tag, ne_tag)`` triples; ``ne_tag`` is the label to learn.
        """
        train_set = []
        for sentence in train_sents:
            # Feature extraction only sees (word, tag) pairs, exactly what
            # ``tag`` will receive at prediction time.
            untagged_sent = [(word, tag) for (word, tag, _) in sentence]
            history = []
            for i, (_, _, ne_tag) in enumerate(sentence):
                featureset = ne_features(untagged_sent, i, history)
                train_set.append((featureset, ne_tag))
                # During training the history holds the reference labels.
                history.append(ne_tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Label every token of ``sentence``.

        :param sentence: sequence of ``(word, tag)`` pairs.
        :return: list of ``((word, tag), ne_tag)`` pairs, one per token.
        """
        history = []
        for i, _ in enumerate(sentence):
            featureset = ne_features(sentence, i, history)
            # Keep the prediction under its own name instead of rebinding the
            # loop's ``tag`` variable as the original did.
            predicted = self.classifier.classify(featureset)
            history.append(predicted)
        # Materialise the pairs: a bare zip() is a one-shot lazy iterator on
        # Python 3, whereas callers expect a list of tagged tokens.
        return list(zip(sentence, history))

In [41]:
from nltk.corpus import conll2002


# Take the sentence at index 2 of the 'esp.train' file of the CoNLL-2002
# corpus as a sample; each token is a (word, tag, ne_tag) triple.
sentence = conll2002.iob_sents('esp.train')[2]

# Show the sentence without its third element (ne_tag), i.e. as the
# (word, tag) pairs that feature extraction works on.
# NOTE(review): on Python 2 this comprehension leaks `word`, `tag` and
# `ne_tag` into the notebook namespace - confirm no later cell relies on that.
[(word, tag) for (word, tag, ne_tag) in sentence]

[(u'El', u'DA'),
 (u'Abogado', u'NC'),
 (u'General', u'AQ'),
 (u'del', u'SP'),
 (u'Estado', u'NC'),
 (u',', u'Fc'),
 (u'Daryl', u'VMI'),
 (u'Williams', u'NC'),
 (u',', u'Fc'),
 (u'subray\xf3', u'VMI'),
 (u'hoy', u'RG'),
 (u'la', u'DA'),
 (u'necesidad', u'NC'),
 (u'de', u'SP'),
 (u'tomar', u'VMN'),
 (u'medidas', u'NC'),
 (u'para', u'SP'),
 (u'proteger', u'VMN'),
 (u'al', u'SP'),
 (u'sistema', u'NC'),
 (u'judicial', u'AQ'),
 (u'australiano', u'AQ'),
 (u'frente', u'RG'),
 (u'a', u'SP'),
 (u'una', u'DI'),
 (u'p\xe1gina', u'NC'),
 (u'de', u'SP'),
 (u'internet', u'NC'),
 (u'que', u'PR'),
 (u'imposibilita', u'VMI'),
 (u'el', u'DA'),
 (u'cumplimiento', u'NC'),
 (u'de', u'SP'),
 (u'los', u'DA'),
 (u'principios', u'NC'),
 (u'b\xe1sicos', u'AQ'),
 (u'de', u'SP'),
 (u'la', u'DA'),
 (u'Ley', u'NC'),
 (u'.', u'Fp')]

In [94]:
def ne_features(sentence, i, history):
    """Build the feature dict describing the token at position ``i``.

    :param sentence: sequence of ``(word, pos)`` pairs.
    :param i: index of the token to describe.
    :param history: labels already assigned to the tokens before ``i``;
        accepted so callers can pass it, but not used by any feature yet.
    :return: dict with keys ``"word"`` and ``"pos"`` (the current token) and
        ``"prevpos"`` (the previous token's tag, or ``"<START>"`` when
        ``i == 0``).
    """
    # Bind the current token explicitly. Without this, ``word`` and ``pos``
    # are unbound inside the function and resolve to whatever globals happen
    # to exist in the kernel (or raise NameError on a fresh one).
    word, pos = sentence[i]
    if i == 0:
        prevpos = "<START>"
    else:
        _, prevpos = sentence[i - 1]
    return {"pos": pos, "word": word, "prevpos": prevpos}

# Sentences from the 'esp.train' file of the CoNLL-2002 corpus, used below to
# train the tagger.
trainSentences = conll2002.iob_sents('esp.train')

In [98]:
# Constructing the tagger trains it: the classifier is fitted on
# `trainSentences` inside __init__.
nerctagger = NamedEntityTagger(trainSentences)

In [108]:
# Sentences from the 'esp.testa' file of the same corpus, used for a quick
# look at the tagger's output.
tests_sentences = conll2002.iob_sents('esp.testa')

# Drop the third element (ne_tag) from the sentence at index 107 and let the
# trained tagger predict a label for each (word, tag) pair.
nerctagger.tag([(word,tag) for (word,tag, ne_tag) in tests_sentences[107]])



[((u'El', u'DA'), u'O'),
 ((u'miembro', u'NC'), u'O'),
 ((u'de', u'SP'), u'O'),
 ((u'la', u'DA'), u'O'),
 ((u'Comisi\xf3n', u'NC'), u'B-ORG'),
 ((u'Regional', u'AQ'), u'I-ORG'),
 ((u'de', u'SP'), u'O'),
 ((u'UCE', u'VMI'), u'O'),
 ((u'Emilio', u'NC'), u'B-PER'),
 ((u'Guerrero', u'NC'), u'I-PER'),
 ((u'expondr\xe1', u'VMI'), u'O'),
 ((u'la', u'DA'), u'O'),
 ((u'posici\xf3n', u'NC'), u'O'),
 ((u'de', u'SP'), u'O'),
 ((u'esta', u'DD'), u'O'),
 ((u'organizaci\xf3n', u'NC'), u'O'),
 ((u'agraria', u'AQ'), u'O'),
 ((u'respecto', u'NC'), u'O'),
 ((u'a', u'SP'), u'O'),
 ((u'la', u'DA'), u'O'),
 ((u'pr\xf3xima', u'AQ'), u'O'),
 ((u'campa\xf1a', u'NC'), u'O'),
 ((u'de', u'SP'), u'O'),
 ((u'tomate', u'NC'), u'O'),
 ((u'.', u'Fp'), u'O')]