In [4]:
import numpy as np
import gzip, pickle, random, os

In [6]:
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Activation, TimeDistributed
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
from keras.regularizers import l1, l2

In [7]:
def load_atis(path, folder=0):
    '''
    Load one fold of the gzipped, pickled ATIS dataset.

    # path: directory prefix containing the 'atis.fold*.pkl.gz' files.
        It is joined to the file name by plain string concatenation,
        so it must end with a path separator (e.g. '/data/atis/').
    # folder: fold index, 0 to 4 (KeyError for anything else).
    # return: a dictionary ...
        {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}
        each split is cut down to a 2-tuple holding elements 0 and 2
        of the stored split; dicts is returned as stored.
    '''
    folders = {0: 'atis.fold0.pkl.gz',
               1: 'atis.fold1.pkl.gz',
               2: 'atis.fold2.pkl.gz',
               3: 'atis.fold3.pkl.gz',
               4: 'atis.fold4.pkl.gz'}

    # NB: pickle.load can run arbitrary code -- only load trusted files.
    # 'with' guarantees the handle is closed even if unpickling fails.
    with gzip.open(path+folders[folder], 'rb') as f:
        train, valid, test, dicts = pickle.load(f)

    # keep element 0 and element 2 of every split, drop element 1.
    train, valid, test = [(split[0], split[2]) for split in (train, valid, test)]

    return {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}

# ATIS PATH: /Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA

In [51]:
class NER(object):
    '''
    Recurrent sequence tagger (SimpleRNN / LSTM / GRU) over index-encoded
    sentences, trained one sentence per SGD step.
    '''

    def __init__(self, data, model_dir, mode='simple_rnn'):
        '''
        # data: a dictionary which contains ...
            {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}
            each value in the dictionary is a 2-tuple ...
            (encoded_sentences, encoded_labels)
            dicts includes ...
            'words2idx', 'labels2idx'
        # model_dir: prefix under which the best model is saved. It is
            concatenated directly with the file names, so it must end with
            a path separator.
        # mode: 'simple_rnn', 'lstm', 'gru'.
        # raises: ValueError on an unknown mode; on malformed data the hint
            below is printed and the underlying exception is re-raised.
        '''
        if mode not in ['simple_rnn','lstm','gru']:
            # fail loudly instead of handing back a half-initialised object.
            raise ValueError("MODE ERROR: only 'simple_rnn','lstm','gru'. \n")
        self.mode = mode
        self.model_dir = model_dir # NB: this is for temporary model saving pathing.

        try:
            nlabels = len(data['dicts']['labels2idx']) # number of labels.
            # every sentence becomes its own batch of size 1:
            #   x -> array with a leading axis of length 1,
            #   y -> one-hot labels with a leading axis of length 1.
            encode_x = lambda x: np.asarray([x])
            encode_y = lambda y: to_categorical(np.asarray(y)[:,np.newaxis],
                                                nlabels)[np.newaxis,:,:]
            data_transformed = [ ([encode_x(s) for s in xs], [encode_y(l) for l in ys])
                                 for xs,ys in [data['train'], data['valid'], data['test']] ]
            self.X_train, self.Y_train = data_transformed[0]
            self.X_valid, self.Y_valid = data_transformed[1]
            self.X_test, self.Y_test = data_transformed[2]
            self.dicts = data['dicts']
        except Exception:
            print("DATA FORMAT ERROR: \n"
                  "data = {'train':train, 'valid':valid, 'test':test, 'dicts':dicts} \n"
                  "value = (encoded_sentences, encoded_labels) \n")
            raise # keep the original exception and traceback.

    def __shuffle(self, X, Y, seed):
        '''
        # X, Y: data and corresponding labels (lists, shuffled in place).
        # seed: ensure the same after-shuffle order for X and Y.
        # NB: this reseeds the global `random` generator, so later calls to
            `random` (e.g. validation sampling) continue from that state.
        '''
        if len(X) != len(Y):
            # equal seeds only give equal permutations for equal lengths.
            raise ValueError("X and Y must have the same length to stay aligned.")
        random.seed(seed)
        random.shuffle(X)
        random.seed(seed)
        random.shuffle(Y)

    def __get_mean_evaluation(self, X, Y, model):
        '''
        # X, Y: data and corresponding labels.
        # model: object whose evaluate(x, y, verbose=0) returns (loss, accuracy).
        # return: average loss and accuracy on X and Y
        '''
        losses, accuracies = [], []
        for x, y in zip(X, Y):
            loss, accuracy = model.evaluate(x, y, verbose=0)
            losses.append(loss)
            accuracies.append(accuracy)
        return (np.mean(losses),np.mean(accuracies))

    def __save_best_model(self):
        '''
        # save current model as the best when called.
            writes 'best_model.json' and 'best_weights.h5' under model_dir.
        '''
        json_path = self.model_dir+'best_model.json'
        weights_path = self.model_dir+'best_weights.h5'
        # remove each stale file independently: a missing json file must not
        # prevent the old weights file from being removed.
        for stale in (json_path, weights_path):
            try:
                os.remove(stale)
            except OSError:
                pass
        with open(json_path,'w') as f:
            f.write(self.model.to_json())
        self.model.save_weights(weights_path)
        print("New Best Model Saved!")

    def __build_model(self, vocsize, nlabels, lr, nhidden, emb_dim,
                      regularize, reg_method, lmd):
        '''
        # build and compile: Embedding -> recurrent layer (per self.mode)
            -> per-timestep Dense -> softmax, optimised with plain SGD.
        # a fresh regularizer object is created on every call so that no
            regularizer instance is shared between two models.
        # return: the compiled model.
        '''
        if regularize:
            # anything other than 'l2' falls back to l1.
            regularizer = l2(lmd) if reg_method=='l2' else l1(lmd)
        else:
            regularizer = None
        model_types = {'simple_rnn':SimpleRNN, 'lstm':LSTM, 'gru':GRU}
        model = Sequential()
        model.add(Embedding(input_dim=vocsize, output_dim=emb_dim))
        model.add(model_types[self.mode](output_dim=nhidden, activation='sigmoid',
                  return_sequences=True, W_regularizer=regularizer))
        model.add(TimeDistributed(Dense(output_dim=nlabels)))
        model.add(Activation('softmax'))
        sgd = SGD(lr=lr, momentum=.0, decay=.0, nesterov=False)
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
        return model

    def train(self, validation=False, validation_freq=1, verbose=0, verbose_freq=100,
                    lr=.1, nhidden=100, emb_dim=100, nepochs=1,
                    regularize=False, reg_method='l2',lmd=.1):
        '''
        # validation, validation_freq:
            if true, run validation whenever the within-epoch sentence index
            is a multiple of validation_freq (1 by default), on a random
            sample of at most 100 validation sentences.
        # verbose, verbose_freq:
            0: only print out simple messages (e.g. '... building models').
            1: also print the epoch number and the training progress after
               every verbose_freq sentences.
            validation results are printed whenever a new best model is found.
        # lr: learning rate.
        # nhidden: number of hidden neurons.
        # emb_dim: dimension of word embeddings.
        # nepochs: number of epochs.
        # regularize: regularize or not.
        # reg_method: 'l2' for L2; any other value selects Lasso (L1).
        # lmd: regularization hyperparam.
        # return: None. training ends after nepochs, or earlier once the
            iteration count reaches the patience at a validation step.
            the test split is then evaluated on the best validated weights
            (on the final weights if no validation improvement was recorded).
        '''
        print("... configuring model")
        vocsize = len(self.dicts['words2idx'])
        nlabels = len(self.dicts['labels2idx'])
        nsents = len(self.X_train)
        self.dicts['idx2labels'] = {i:l for l,i in self.dicts['labels2idx'].items()}
        self.dicts['idx2words'] = {i:w for w,i in self.dicts['words2idx'].items()}

        print("... building model")
        self.model = self.__build_model(vocsize, nlabels, lr, nhidden, emb_dim,
                                        regularize, reg_method, lmd)

        self.best_model = self.model # initial model, replaced below if validation finds a best.

        print("... training model")
        patience = 3000
        patience_increase_ratio = 2
        improvement_threshold = .995
        best_loss = np.inf
        best_accuracy = 0.
        best_weights = None # in-memory snapshot of the best validated weights.
        done_looping = False # stop training when patience are broken.
        num_iter = 0
        # never ask for more validation sentences than exist.
        nvalid_samples = min(100, len(self.X_valid))
        for e in range(1, nepochs+1):
            if verbose: print("... Epoch: %d" % e)
            self.__shuffle(self.X_train,self.Y_train,seed=0)
            for i in range(nsents):
                num_iter += 1
                if verbose and i!=0 and i%verbose_freq==0:
                    print("    ... trained %d sentences" % i)
                # one-word sentences are not trained on (and skip validation too).
                if self.X_train[i].shape[1]==1: continue
                self.model.train_on_batch(self.X_train[i],self.Y_train[i])

                if validation and i%validation_freq==0:
                    sample_idxs = random.sample(range(len(self.X_valid)),nvalid_samples)
                    X_valid_samples = [self.X_valid[k] for k in sample_idxs]
                    Y_valid_samples = [self.Y_valid[k] for k in sample_idxs]
                    avg_loss, avg_accuracy = self.__get_mean_evaluation(X_valid_samples,
                                                                        Y_valid_samples,
                                                                        self.model)
                    if avg_loss < best_loss*improvement_threshold and \
                       avg_accuracy > best_accuracy:
                        self.__save_best_model()
                        # self.model keeps training, so copy its weights now.
                        best_weights = self.model.get_weights()
                        patience = max(patience, num_iter*patience_increase_ratio)
                        best_loss = avg_loss
                        best_accuracy = avg_accuracy
                        print("Validation: Loss = %.6f | Accuracy = %.6f" % (avg_loss, avg_accuracy))
                    if patience <= num_iter:
                        done_looping = True
                        break
            if done_looping: break

        if best_weights is not None:
            # rebuild a separate model for the best weights so that
            # best_model is not just another name for the final model.
            self.best_model = self.__build_model(vocsize, nlabels, lr, nhidden, emb_dim,
                                                 regularize, reg_method, lmd)
            self.best_model.set_weights(best_weights)

        mean_test_loss, mean_test_accuracy = self.__get_mean_evaluation(self.X_test,
                                                                        self.Y_test,
                                                                        self.best_model)
        print("TRAINING COMPLETE (at iteration %d)!" % num_iter)
        print("Test on Best Model: Loss = %.6f | Accuracy = %.6f" % (mean_test_loss,mean_test_accuracy))

    def predict(self, x):
        '''
        # x: sentences in string (e.g. "i'd like to book a flight from london to paris")
        # return: a list of predicted labels associated with the words in the sentence x.
            words missing from 'words2idx' are dropped before tagging, so
            the list can be shorter than x.split(); it is empty when no
            word of x is in the vocabulary.
        # NB: uses self.model and the 'idx2labels' table built by train().
        '''
        words2idx = self.dicts['words2idx']
        # dict membership test instead of scanning the list of keys.
        x_encoded = [words2idx[w] for w in x.split() if w in words2idx]
        if not x_encoded:
            return []
        probs = self.model.predict_on_batch(np.asarray([x_encoded]))[0]
        idx2labels = self.dicts['idx2labels']
        return [idx2labels.get(int(np.argmax(p))) for p in probs]

In [52]:
# Load ATIS fold 0 (load_atis' default fold). The path ends with '/' because
# load_atis appends the fold file name to it by plain string concatenation.
data = load_atis(path='/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA/')
# Prefix under which NER writes 'best_model.json' and 'best_weights.h5';
# it is concatenated with those names as well, hence the trailing '/'.
# NOTE(review): both paths are machine-specific absolute paths.
model_dir = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/ojo_ner/ojo_ner/models/'

In [53]:
# Build the tagger with the default recurrent layer (mode='simple_rnn').
ner = NER(data,model_dir=model_dir)

In [54]:
# Train the SimpleRNN tagger: 300 hidden units, up to 20 epochs,
# validation whenever the sentence index within an epoch is a multiple of
# 1000, and a progress line every 500 sentences.
ner.train(
    validation=1,
    validation_freq=1000,
    verbose=1,
    verbose_freq=500,
    nhidden=300,
    nepochs=20,
)

... configuring model
... building model
... training model
... Epoch: 1
New Best Model Saved!
Validation: Loss = 3.151388 | Accuracy = 0.646422
    ... trained 500 sentences
    ... trained 1000 sentences
New Best Model Saved!
Validation: Loss = 1.500056 | Accuracy = 0.662491
    ... trained 1500 sentences
    ... trained 2000 sentences
New Best Model Saved!
Validation: Loss = 0.876027 | Accuracy = 0.816586
    ... trained 2500 sentences
    ... trained 3000 sentences
    ... trained 3500 sentences
... Epoch: 2
    ... trained 500 sentences
    ... trained 1000 sentences
New Best Model Saved!
Validation: Loss = 0.804577 | Accuracy = 0.829873
    ... trained 1500 sentences
    ... trained 2000 sentences
New Best Model Saved!
Validation: Loss = 0.620858 | Accuracy = 0.864586
    ... trained 2500 sentences
    ... trained 3000 sentences
    ... trained 3500 sentences
... Epoch: 3
    ... trained 500 sentences
    ... trained 1000 sentences
New Best Model Saved!
Validation: Loss = 0.56417

In [92]:
# Tag a sample sentence. predict() drops words that are not in
# data['dicts']['words2idx'] before tagging, so the label list can be
# shorter than the sentence (11 tokens here, 10 labels in the output below;
# presumably "i'd" is out of vocabulary -- TODO confirm).
sent = "i'd like to book a flight from charlotte to las vegas"
print ner.predict(sent)

['O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name']


##### LSTM & GRU TESTS (WITHOUT REGULARIZATION)

In [67]:
%%time
model_dir = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/ojo_ner/ojo_ner/models/LSTM/'
ner_LSTM = NER(data,model_dir=model_dir,mode='lstm')
ner_LSTM.train(validation=1,validation_freq=10000,verbose=1,verbose_freq=5000,lr=1.,nhidden=100,nepochs=500)

... configuring model
... building model
... training model
... Epoch: 1
New Best Model Saved!
Validation: Loss = 2.352198 | Accuracy = 0.646422
... Epoch: 2
New Best Model Saved!
Validation: Loss = 0.344126 | Accuracy = 0.919698
... Epoch: 3
New Best Model Saved!
Validation: Loss = 0.195892 | Accuracy = 0.955229
... Epoch: 4
New Best Model Saved!
Validation: Loss = 0.168471 | Accuracy = 0.966054
... Epoch: 5
New Best Model Saved!
Validation: Loss = 0.121325 | Accuracy = 0.976052
... Epoch: 6
... Epoch: 7
... Epoch: 8
New Best Model Saved!
Validation: Loss = 0.105804 | Accuracy = 0.976203
... Epoch: 9
New Best Model Saved!
Validation: Loss = 0.104694 | Accuracy = 0.976819
... Epoch: 10
... Epoch: 11
... Epoch: 12
... Epoch: 13
New Best Model Saved!
Validation: Loss = 0.091405 | Accuracy = 0.984327
... Epoch: 14
... Epoch: 15
... Epoch: 16
New Best Model Saved!
Validation: Loss = 0.090592 | Accuracy = 0.984757
... Epoch: 17
... Epoch: 18
New Best Model Saved!
Validation: Loss = 0.088189

In [68]:
%%time
model_dir = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/ojo_ner/ojo_ner/models/GRU/'
ner_GRU = NER(data,model_dir=model_dir,mode='gru')
ner_GRU.train(validation=1,validation_freq=10000,verbose=1,verbose_freq=5000,lr=1.,nhidden=100,nepochs=500)

... configuring model
... building model
... training model
... Epoch: 1
New Best Model Saved!
Validation: Loss = 2.498494 | Accuracy = 0.646422
... Epoch: 2
New Best Model Saved!
Validation: Loss = 0.314833 | Accuracy = 0.937377
... Epoch: 3
New Best Model Saved!
Validation: Loss = 0.179257 | Accuracy = 0.957094
... Epoch: 4
New Best Model Saved!
Validation: Loss = 0.163287 | Accuracy = 0.966786
... Epoch: 5
New Best Model Saved!
Validation: Loss = 0.121273 | Accuracy = 0.973163
... Epoch: 6
... Epoch: 7
New Best Model Saved!
Validation: Loss = 0.116239 | Accuracy = 0.978026
... Epoch: 8
New Best Model Saved!
Validation: Loss = 0.094448 | Accuracy = 0.981240
... Epoch: 9
... Epoch: 10
... Epoch: 11
... Epoch: 12
... Epoch: 13
New Best Model Saved!
Validation: Loss = 0.085655 | Accuracy = 0.985407
... Epoch: 14
... Epoch: 15
... Epoch: 16
... Epoch: 17
... Epoch: 18
... Epoch: 19
... Epoch: 20
... Epoch: 21
... Epoch: 22
... Epoch: 23
... Epoch: 24
... Epoch: 25
... Epoch: 26
TRAINING 