# Load CoNLL 2002 for RNN

In [194]:
from nltk.corpus import conll2002
from nltk.stem import PorterStemmer
from collections import defaultdict
from __future__ import division

In [195]:
def load_conll02_rnn():
    '''
    # Load the NLTK CoNLL-2002 IOB corpus and encode it for RNN training.
    # Words are Porter-stemmed and lowercased; POS tags are discarded.
    # Sentences with a single token are dropped.
    # return: a dictionary
        {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}
        train/valid/test are 2-tuples (encoded_sentences, encoded_labels),
        each a list of int32 arrays (one array per sentence), split
        70% / 15% / 15% in corpus order (no shuffling).
        dicts contains 'words2idx' and 'labels2idx'.
    # NB: both lookup dicts are defaultdict(int): an unseen key silently maps
        to index 0 (which is also the index of a real entry) and is inserted
        on lookup with [] -- use `in` / .get() to probe without inserting.
    '''
    # Local import: numpy is only imported in a later cell of this notebook,
    # so relying on the global name fails when cells are run top to bottom.
    import numpy as np

    # import data
    tagged_sents = conll2002.iob_sents()

    # stemming, lowercasing
    porter = PorterStemmer()
    norm_sents = [[(porter.stem(w).lower(), pos, ne) for w, pos, ne in sent]
                  for sent in tagged_sents if len(sent) > 1]
        # len=1 sents cause problem in rnn's batch training.

    # create look-up dicts
    # Sorted so that the word/label -> index mapping does not depend on set
    # iteration order and is therefore the same on every run.
    word_vocab = sorted({w for sent in norm_sents for w, _, _ in sent})
    label_vocab = sorted({ne for sent in norm_sents for _, _, ne in sent})
    words2idx = defaultdict(int, {w: i for i, w in enumerate(word_vocab)})
    labels2idx = defaultdict(int, {l: i for i, l in enumerate(label_vocab)})
    dicts = {'words2idx': words2idx, 'labels2idx': labels2idx}

    # convert [(w,pos,ne),...] format to [w,...],[ne,...] (integer-encoded)
    X = [np.asarray([words2idx[w] for w, _, _ in sent], dtype='int32')
         for sent in norm_sents]
    Y = [np.asarray([labels2idx[ne] for _, _, ne in sent], dtype='int32')
         for sent in norm_sents]

    # train-valid-test split
    n = len(X)
    cutoff1, cutoff2 = int(n*.7), int(n*.85)
    train = (X[:cutoff1], Y[:cutoff1])
    valid = (X[cutoff1:cutoff2], Y[cutoff1:cutoff2])
    test = (X[cutoff2:], Y[cutoff2:])

    return {'train': train, 'valid': valid, 'test': test, 'dicts': dicts}

In [196]:
%%time
# Encoded train/valid/test splits plus the word/label lookup dictionaries.
data = load_conll02_rnn()
# Directory where NER writes its checkpoints. File names are appended by plain
# string concatenation, so the trailing slash is required.
model_dir = "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS_PLUS/MODELS/"

CPU times: user 9.42 s, sys: 76.1 ms, total: 9.5 s
Wall time: 9.52 s


# RNN

In [246]:
import numpy as np
import gzip, cPickle, random, os

In [247]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Activation, TimeDistributed
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
from keras.regularizers import l1, l2

In [248]:
class NER(object):
    '''
    # Sequence tagger: Embedding -> recurrent layer -> per-timestep softmax.
    # Trains one sentence at a time and checkpoints the best model
    # (by validation loss and accuracy) to model_dir.
    '''

    def __init__(self, data, model_dir, mode='simple_rnn'):
        '''
        # data: a dictionary which contains ...
            {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}
            each value in the dictionary is a 2-tuple ...
            (encoded_sentences, encoded_labels)
            dicts includes ...
            'words2idx', 'labels2idx'
        # model_dir: directory prefix for checkpoints; file names are appended
            by string concatenation, so it must end with a path separator.
        # mode: 'simple_rnn', 'lstm', 'gru'.
        # raises: ValueError on an unknown mode or malformed data
            (instead of printing and leaving a half-initialised object).
        '''
        if mode not in ('simple_rnn', 'lstm', 'gru'):
            raise ValueError("MODE ERROR: only 'simple_rnn','lstm','gru'.")
        self.mode = mode
        self.model_dir = model_dir # NB: this is for temporary model saving pathing.

        try:
            # len(data['dicts']['labels2idx']): number of labels.
            nlabels = len(data['dicts']['labels2idx'])
            encoded = []
            for split in ('train', 'valid', 'test'):
                sents, labels = data[split]
                # Each sentence becomes its own batch of one: a leading axis
                # is added to the word indices and to the one-hot labels.
                xs = [np.asarray([s]) for s in sents]
                ys = [to_categorical(np.asarray(l)[:, np.newaxis],
                                     nlabels)[np.newaxis, :, :]
                      for l in labels]
                encoded.append((xs, ys))
            dicts = data['dicts']
        except (KeyError, IndexError, TypeError, ValueError) as err:
            raise ValueError(
                "DATA FORMAT ERROR: \n"
                "data = {'train':train, 'valid':valid, 'test':test, 'dicts':dicts} \n"
                "value = (encoded_sentences, encoded_labels) \n"
                "(%r)" % (err,))

        self.X_train, self.Y_train = encoded[0]
        self.X_valid, self.Y_valid = encoded[1]
        self.X_test, self.Y_test = encoded[2]
        self.dicts = dicts

    def __shuffle(self, X, Y, seed):
        '''
        # X, Y: data and corresponding labels (lists, shuffled in place).
        # seed: ensure the same after-shuffle order for X and Y.
        # NB: reseeds the module-level `random` generator as a side effect.
        '''
        random.seed(seed)
        random.shuffle(X)
        random.seed(seed)
        random.shuffle(Y)

    def __get_mean_evaluation(self, X, Y, model):
        '''
        # X, Y: data and corresponding labels.
        # model: compiled model whose evaluate() yields (loss, accuracy).
        # return: average loss and accuracy on X and Y
        '''
        losses, accuracies = [], []
        for x, y in zip(X, Y):
            loss, accuracy = model.evaluate(x, y, verbose=0)
            losses.append(loss)
            accuracies.append(accuracy)
        return (np.mean(losses), np.mean(accuracies))

    def __save_best_model(self):
        '''
        # save current model as the best when called.
        # writes best_model.p (pickled JSON architecture) and
        # best_weights.h5 under model_dir.
        '''
        model_path = self.model_dir+'best_model.p'
        weights_path = self.model_dir+'best_weights.h5'
        # Remove each stale file independently, so a missing architecture
        # file does not leave an old weights file behind.
        for path in (model_path, weights_path):
            try:
                os.remove(path)
            except OSError:
                pass
        model_json = self.model.to_json()
        with open(model_path, 'wb') as f:
            cPickle.dump(model_json, f)
        self.model.save_weights(weights_path)
        print("New Best Model Saved!")

    def __build_model(self, vocsize, nlabels, nhidden, emb_dim, lr,
                      regularize, reg_method, lmd):
        '''
        # build and compile a fresh, untrained network for the current mode.
        # return: the compiled Sequential model.
        '''
        if regularize:
            regularizer = l2(lmd) if reg_method=='l2' else l1(lmd)
        else:
            regularizer = None
        model_types = {'simple_rnn':SimpleRNN, 'lstm':LSTM, 'gru':GRU}
        model = Sequential()
        model.add(Embedding(input_dim=vocsize, output_dim=emb_dim))
        model.add(model_types[self.mode](output_dim=nhidden, activation='sigmoid',
                  return_sequences=True, W_regularizer=regularizer))
        model.add(TimeDistributed(Dense(output_dim=nlabels)))
        model.add(Activation('softmax'))
        sgd = SGD(lr=lr, momentum=.0, decay=.0, nesterov=False)
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
        return model

    def train(self, validation=False, validation_freq=1, verbose=0, verbose_freq=100,
                    lr=.1, nhidden=100, emb_dim=100, nepochs=1,
                    regularize=False, reg_method='l2',lmd=.1):
        '''
        # validation, validation_freq:
            if true, run validation whenever the sentence index within the
            epoch is a multiple of validation_freq (1 by default), on a
            random sample of up to 100 validation sentences.
        # verbose, verbose_freq:
            0: only print out simple messages (e.g. '... building models').
            1: print out epoch headers too.
            print training progress after training every verbose_freq sentences.
        # lr: learning rate.
        # nhidden: number of hidden neurons.
        # emb_dim: dimension of word embeddings.
        # nepochs: number of epochs.
        # regularize: regularize or not.
        # reg_method: 'l2' for L2, anything else for L1 (Lasso) regularization.
        # lmd: regularization hyperparam.
        # return: None. Training ends after nepochs, or earlier when the
            iteration count reaches the patience (checked at validation points
            only). Afterwards self.model holds the final weights and
            self.best_model the best validated weights (or self.model itself
            when no validation improvement was recorded).
        '''
        print("... configuring model")
        vocsize = len(self.dicts['words2idx'])
        nlabels = len(self.dicts['labels2idx'])
        nsents = len(self.X_train)
        self.dicts['idx2labels'] = {i:l for l,i in self.dicts['labels2idx'].items()}
        self.dicts['idx2words'] = {i:w for w,i in self.dicts['words2idx'].items()}

        print("... building model")
        self.model = self.__build_model(vocsize, nlabels, nhidden, emb_dim, lr,
                                        regularize, reg_method, lmd)

        self.best_model = self.model # replaced after training if a best checkpoint exists.
        best_weights = None # in-memory snapshot of the best validated weights.

        print("... training model")
        patience = 10000
        patience_increase_ratio = 2
        improvement_threshold = .995
        best_iter = 0
        best_loss = np.inf
        best_accuracy = 0.
        done_looping = False # stop training when patience are broken.
        num_iter = 0
        n_valid = len(self.X_valid)
        for e in range(1, nepochs+1):
            if done_looping: break
            if verbose: print("... Epoch: %d" % e)
            self.__shuffle(self.X_train,self.Y_train,seed=0)
            for i in range(nsents):
                num_iter += 1
                if verbose and i!=0 and i%verbose_freq==0:
                    print("    ... trained %d sentences" % i)
                # single-word sentences are skipped (they break batch training).
                if self.X_train[i].shape[1]==1: continue
                self.model.train_on_batch(self.X_train[i],self.Y_train[i])

                if validation and i%validation_freq==0:
                    # Cap the sample size so small validation sets do not
                    # make random.sample raise ValueError.
                    sample_idxs = random.sample(range(n_valid), min(100, n_valid))
                    X_valid_samples = [self.X_valid[k] for k in sample_idxs]
                    Y_valid_samples = [self.Y_valid[k] for k in sample_idxs]
                    avg_loss, avg_accuracy = self.__get_mean_evaluation(X_valid_samples,
                                                                        Y_valid_samples,
                                                                        self.model)
                    if avg_loss < best_loss*improvement_threshold and \
                       avg_accuracy > best_accuracy:
                        self.__save_best_model()
                        # Snapshot the weights: self.model keeps training, so a
                        # plain reference would not preserve the best state.
                        best_weights = self.model.get_weights()
                        patience = max(patience, num_iter*patience_increase_ratio)
                        best_loss = avg_loss
                        best_accuracy = avg_accuracy
                        best_iter = num_iter
                        print("Validation: Loss = %.6f | Accuracy = %.6f" % (avg_loss, avg_accuracy))
                    if patience <= num_iter:
                        done_looping = True
                        break

        if best_weights is not None:
            # Rebuild an identical network and load the best snapshot, so the
            # test below really scores the best model, not the final one.
            self.best_model = self.__build_model(vocsize, nlabels, nhidden, emb_dim, lr,
                                                 regularize, reg_method, lmd)
            self.best_model.set_weights(best_weights)

        mean_test_loss, mean_test_accuracy = self.__get_mean_evaluation(self.X_test,
                                                                        self.Y_test,
                                                                        self.best_model)
        print("TRAINING COMPLETE (at iteration %d)!" % num_iter)
        print("Test on Best Model: Loss = %.6f | Accuracy = %.6f" % (mean_test_loss,mean_test_accuracy))

    def predict(self, x):
        '''
        # x: sentences in string (e.g. "i'd like to book a flight from london to paris")
        # return: a list of predicted labels, one per *known* word of x.
            words missing from words2idx are dropped before prediction, so
            the result can be shorter than x.split(); it is [] when no word
            is known.
        # NB: uses self.model (the final weights) and the 'idx2labels'
            dictionary that train() creates, so train() must run first.
        '''
        words2idx = self.dicts['words2idx']
        # Membership test on the dict itself: O(1) and never inserts a key.
        x_filtered = [w for w in x.split() if w in words2idx]
        if not x_filtered:
            return []
        x_encoded = np.asarray([[words2idx.get(w) for w in x_filtered]])
        scores = self.model.predict_on_batch(x_encoded)[0]
        idx2labels = self.dicts['idx2labels']
        return [idx2labels.get(np.argmax(s)) for s in scores]

In [249]:
# Wrap the encoded corpus in a tagger; mode is left at its default ('simple_rnn').
ner = NER(data,model_dir=model_dir)

In [250]:
%%time
# Train for up to 20 epochs; validate and report progress every 5000 sentences.
ner.train(validation=1,validation_freq=5000,verbose=1,verbose_freq=5000,
          nhidden=100,nepochs=20)

... configuring model
... building model
... training model
... Epoch: 1
New Best Model Saved!
Validation: Loss = 0.935752 | Accuracy = 0.821422
    ... trained 5000 sentences
New Best Model Saved!
Validation: Loss = 0.509429 | Accuracy = 0.873451
    ... trained 10000 sentences
    ... trained 15000 sentences
New Best Model Saved!
Validation: Loss = 0.296714 | Accuracy = 0.921065
    ... trained 20000 sentences
... Epoch: 2
    ... trained 5000 sentences
    ... trained 10000 sentences
TRAINING COMPLETE (at iteration 30886)!
Test on Best Model: Loss = 0.328325 | Accuracy = 0.906164
CPU times: user 11min 23s, sys: 1min 28s, total: 12min 51s
Wall time: 12min 54s


In [209]:
# SUPER SLOW!! 
# LAST PERFORMANCE : Validation: Loss = 0.216919 | Accuracy = 0.946363 AT EPOCH 4
# %%time
# model = ner.train(validation=1,validation_freq=5000,verbose=1,verbose_freq=5000,
#           nhidden=300, emb_dim=300, nepochs=20)

In [251]:
from keras.models import model_from_json

In [253]:
# Reload the best checkpoint written during training.
# The architecture JSON was pickled in binary mode ('wb'), so it is read back
# in binary mode as well (text mode is not portable for pickles).
# NOTE(review): unpickling can execute arbitrary code -- only load checkpoint
# files that this notebook wrote itself.
with open(model_dir+'best_model.p', 'rb') as f:
    model_json = cPickle.load(f)
rnn_ner = model_from_json(model_json)
rnn_ner.load_weights(model_dir+'best_weights.h5')

In [260]:
# Compile the reloaded model with the same loss, optimizer settings and metric
# that NER.train uses by default.
sgd = SGD(lr=.1, momentum=.0, decay=.0, nesterov=False)
rnn_ner.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [None]:
# Pull the transformed test split and the lookup dictionaries out of the tagger.
# 'idx2labels' is only present in dicts after ner.train() has run.
X_test, Y_test = ner.X_test, ner.Y_test
dicts = ner.dicts

In [None]:
def to_labels(Y): # so we can observe predictions in tags.
    '''
    # Y: a sequence of score arrays; only the first element (y[0]) of each
         is used, and it is iterated row by row.
    # return: one list of tag names per element of Y, taking the argmax of
         every row and mapping it through the global dicts['idx2labels'].
    '''
    idx2label = dicts['idx2labels'].get
    tagged = []
    for y in Y:
        best_per_row = [np.argmax(row) for row in list(y[0])]
        tagged.append([idx2label(best) for best in best_per_row])
    return tagged

In [292]:
# Predict each test sentence separately, then turn both the predicted scores
# and the one-hot gold labels into tag names.
Y_pred = to_labels([rnn_ner.predict(x) for x in X_test])
Y_true = to_labels(Y_test)

In [295]:
from itertools import chain
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

In [296]:
# Binarizer used below to turn tag names into indicator columns.
lb = LabelBinarizer()

In [301]:
# Flatten the per-sentence tag lists into one token-level sequence each.
# The binarizer is fitted on the gold tags only and reused for the predictions.
y_true_in_tags = lb.fit_transform(list(chain.from_iterable(Y_true))) # one-hot tags
y_pred_in_tags = lb.transform(list(chain.from_iterable(Y_pred)))
# NB: going through set() means tagset (and thus the report rows) has no fixed order.
tagset = list(set(lb.classes_))
# Map each tag name to its position in lb.classes_.
class_indices = {cls:idx for idx,cls in enumerate(lb.classes_)}

In [303]:
# Per-tag precision / recall / F1 over all test tokens.
# `labels` picks the binarizer positions in tagset order so that each row
# matches the corresponding name in `target_names`.
print classification_report(
    y_true_in_tags,
    y_pred_in_tags,
    labels = [class_indices[cls] for cls in tagset],
    target_names = tagset
)

             precision    recall  f1-score   support

          O       0.93      0.99      0.96     61463
      I-LOC       0.00      0.00      0.00       313
      B-ORG       0.18      0.17      0.17       386
      I-PER       0.68      0.19      0.29      1037
      B-PER       0.45      0.29      0.35      1428
     I-MISC       0.00      0.00      0.00       561
     B-MISC       0.00      0.00      0.00       974
      I-ORG       0.68      0.26      0.38       282
      B-LOC       0.51      0.40      0.44      1250

avg / total       0.88      0.91      0.89     67694

