# Load CoNLL 2002 for CRF

In [9]:
from nltk.corpus import conll2002
from nltk.stem import PorterStemmer
porter = PorterStemmer()

In [13]:
def load_conll02_crf():
    
    # import data 
    tagged_sents = conll2002.iob_sents()
    
    # stemming, lowercasing
    porter = PorterStemmer()
    norm_sents = [[(porter.stem(w).lower(),pos,ne) for w,pos,ne in sent] for sent in tagged_sents]
    
    # create data in the format
    #  X: [([w,...],[pos,...]),...]
    #  Y: [[ne,...],...]
    X = [([w for w,_,_ in norm_sent],[pos for _,pos,_ in norm_sent]) for norm_sent in norm_sents]
    Y = [[ne for _,_,ne in norm_sent] for norm_sent in norm_sents]
    
    # train-valid-test split
    def train_valid_test_split(X,Y):
        n = len(X)
        cutoff = int(n*.8)
        return (X[:cutoff],Y[:cutoff]), (X[cutoff:],Y[cutoff:])  
    train, test = train_valid_test_split(X,Y)
    
    return train, test   

In [14]:
%%time
(X_train, Y_train), (X_test, Y_test) = load_conll02_crf()

CPU times: user 9.08 s, sys: 183 ms, total: 9.26 s
Wall time: 9.32 s


In [16]:
datum = X_train[0]
print datum

([u'sao', u'paulo', u'(', u'brasil', u')', u',', u'23', u'may', u'(', u'efecom', u')', u'.'], [u'NC', u'VMI', u'Fpa', u'NC', u'Fpt', u'Fc', u'Z', u'NC', u'Fpa', u'NP', u'Fpt', u'Fp'])


In [18]:
for i,(word,pos) in enumerate(zip(X_train[0][0],X_train[0][1])):
    print i,word,pos

0 sao NC
1 paulo VMI
2 ( Fpa
3 brasil NC
4 ) Fpt
5 , Fc
6 23 Z
7 may NC
8 ( Fpa
9 efecom NP
10 ) Fpt
11 . Fp


In [19]:
def featurize(datum): # format: ([w,...],[pos,...])
    
    words, pos = datum
    len_sent = len(words)
    all_features = []
    for i in xrange(len_sent):
        features = [words[i],pos[i]]       
        if i > 0:
            features += ['-1'+words[i-1],'-1'+pos[i-1]]
            if i > 1:
                features += ['-2'+words[i-2],'-2'+pos[i-2]]
            else: features += ['BOS']
        else: features += ['BOS']
        if i < len_sent-1:
            features += ['+1'+words[i+1],'-1'+pos[i+1]]
            if i < len_sent-2:
                features += ['+2'+words[i+2],'+2'+pos[i+2]]
            else: features += ['EOS']
        else: features += ['EOS']
        all_features.append(features)
    
    return all_features
        

In [23]:
print featurize(datum)[0]
print featurize(datum)[1]

[u'sao', u'NC', 'BOS', u'+1paulo', u'-1VMI', u'+2(', u'+2Fpa']
[u'paulo', u'VMI', u'-1sao', u'-1NC', 'BOS', u'+1(', u'-1Fpa', u'+2brasil', u'+2NC']


In [24]:
train = (X_train, Y_train)
test = (X_test, Y_test)

# CRF

In [31]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite

In [32]:
class CRF:
    
    def __init__(self, train, test, verbose=0, # train/test format: (X,Y), 
                                               #  where X = [([w,...],[pos,...]),...]
                                               #        Y = [[ne,...],...]
                 config={'c1': 1.0, # coef for L1.
                         'c2': 1e-3, # coef for L2.
                         'max_iterations': 100,
                         'feature.possible_transitions':True}, # include unseen transitions.
                 tagger_name='crf.crfsuite', # name of generated tagger.
                 featurizer=featurize):
        
        print "... featurizing data"
        self.featurizer = featurizer
        self.X_train = [self.featurizer(datum) for datum in train[0]]
        self.Y_train = train[1]
        self.X_test = [self.featurizer(datum) for datum in test[0]]
        self.Y_test = test[1]

        print "... loading data into CRF"
        crf = pycrfsuite.Trainer(verbose=verbose)
        for x,y in zip(self.X_train, self.Y_train):
            crf.append(x,y)
        crf.set_params(config)
        print "... training"
        crf.train(tagger_name)
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open(tagger_name)
    
    def evaluate(self):
        y_true = self.Y_test
        y_pred = [self.tagger.tag(sent) for sent in self.X_test] # sent here is [[feat,...],...]
        lb = LabelBinarizer()
        y_true_in_tags = lb.fit_transform(list(chain.from_iterable(y_true))) # get a list of tags in 1-hot.
        y_pred_in_tags = lb.transform(list(chain.from_iterable(y_pred)))
        tagset = list(set(lb.classes_))
        class_indices = {cls:idx for idx,cls in enumerate(lb.classes_)}
        print classification_report(
            y_true_in_tags,
            y_pred_in_tags,
            labels = [class_indices[cls] for cls in tagset],
            target_names = tagset
        )
        

In [33]:
%%time
crf = CRF(train, test)

... featurizing data
... loading data into CRF
... training
CPU times: user 1min 25s, sys: 819 ms, total: 1min 26s
Wall time: 1min 26s


In [34]:
%%time
crf.evaluate()

             precision    recall  f1-score   support

          O       0.97      0.99      0.98     83525
      I-LOC       0.67      0.18      0.29       342
      B-ORG       0.75      0.48      0.58       642
      I-PER       0.74      0.85      0.79      1446
      B-PER       0.72      0.62      0.66      2075
     I-MISC       0.56      0.36      0.44       778
     B-MISC       0.78      0.44      0.57      1434
      I-ORG       0.62      0.64      0.63       417
      B-LOC       0.89      0.56      0.69      1615

avg / total       0.95      0.95      0.95     92274

CPU times: user 760 ms, sys: 23.8 ms, total: 783 ms
Wall time: 789 ms


In [35]:
from sklearn.metrics import accuracy_score

In [36]:
y_true = crf.Y_test
y_pred = [crf.tagger.tag(sent) for sent in crf.X_test]
y_true_merged = list(chain.from_iterable(y_true))
y_pred_merged = list(chain.from_iterable(y_pred))

In [37]:
print "Accuracy: %.2f" % accuracy_score(y_true_merged,y_pred_merged)

Accuracy: 0.95
