In [249]:
import numpy as np
import gzip, os, random

In [250]:
from keras.models import Sequential, model_from_json
from keras.layers import Embedding, SimpleRNN, LSTM, GRU, Dense, Activation, TimeDistributed
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

In [251]:
def load_atis(path, folder=0):
    
    folders = {0: 'atis.fold0.pkl.gz',
               1: 'atis.fold1.pkl.gz',
               2: 'atis.fold2.pkl.gz',
               3: 'atis.fold3.pkl.gz',
               4: 'atis.fold4.pkl.gz'}
    
    f = gzip.open(path+folders[folder], 'rb')
    train, valid, test, dicts = pickle.load(f)
    f.close()
    
    train = (train[0],train[2])
    valid = (valid[0],valid[2])
    test = (test[0],test[2])

    return {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}

# ATIS PATH: /Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA

In [261]:
class NER:
    
    def __init__(self, data, model_dir, mode='SimpleRNN'):
        '''
        # data: a dictionary which contains ...
            {'train':train, 'valid':valid, 'test':test, 'dicts':dicts}
            each value in the dictionary is a 2-tuple ...
            (encoded_sentences, encoded_labels)
            dicts includes ...
            'words2idx', 'labels2idx'
        # mode: 'SimpleRNN', 'LSTM', 'GRU'.
        '''
        if mode not in ['SimpleRNN','LSTM','GRU']:
            print "MODE ERROR: only 'SimpleRNN', 'LSTM', 'GRU'. \n"
            return
        self.mode = mode
        try:
            self.model_dir = model_dir
        except IOError:
            print "IOError: Check if directory is correct."
        try:
            data_transformed = [ (map(self.__transform(len(data['dicts']['labels2idx']),'x'),x),
                                      map(self.__transform(len(data['dicts']['labels2idx']),'y'),y)) 
                                      for x,y in [data['train'], 
                                                  data['valid'], 
                                                  data['test']] ]
            self.X_train, self.Y_train = data_transformed[0]
            self.X_valid, self.Y_valid = data_transformed[1]
            self.X_test, self.Y_test = data_transformed[2]
            self.dicts = data['dicts']
        except:
            print "DATA FORMAT ERROR: \n", \
                  "data = {'train':train, 'valid':valid, 'test':test, 'dicts':dicts} \n", \
                  "value = (encoded_sentences, encoded_labels) \n"
        
    def __transform(self, nlabels, mode='x'):
        '''
        # x: (sent_len, ) => (1, sent_len)
        # y: (sent_len, ) => (1, sent_len, nlabels)
        '''
        if mode not in ['x','y']: 
            print "MODE ERROR: only 'x' and 'y'. \n"
            return
        if mode=='x': return lambda x: np.asarray([x])  
        if mode=='y': return lambda y: to_categorical(np.asarray(y)[:,np.newaxis],
                                                  nlabels)[np.newaxis,:,:] 
    
    def __shuffle(self, X, Y, seed):
        '''
        # X, Y: data and corresponding labels.
        # seed: ensure the same after-shuffle order for X and Y.
        '''
        random.seed(seed)
        random.shuffle(X)
        random.seed(seed)
        random.shuffle(Y)
    
    def __get_mean_evaluation(self, X, Y):
        '''
        # X, Y: data and corresponding labels.
        # return: average loss and accuracy on X and Y
        '''
        losses, accuracies = [], []
        for i in xrange(100):
            loss,accuracy = ner.model.evaluate(X[i],Y[i],verbose=0)
            losses.append(loss)
            accuracies.append(accuracy)
        return (np.mean(losses),np.mean(accuracies))
    
    def __save_best_model(self,mode):
        try:
            os.remove(self.model_dir+mode+'.json')
            os.remove(self.model_dir+mode+'.h5')
        except OSError:
            pass
        model_json = self.model.to_json()
        open(self.model_dir+mode+'.json','w').write(model_json)
        self.model.save_weights(self.model_dir+mode+'.h5')
        print "New %s Saved!" % mode  
        
    def train(self, validation=False, validation_freq=1, verbose=0, verbose_freq=100,
                    lr=.1, nhidden=100, emb_dim=100, nepochs=1):
        '''
        # validation, validation_freq: 
            if true, run validation at validation_freq epoch (1 by default).
        # verbose, verbose_freq: 
            0: only print out simple messages (e.g. '... building models').
            1: print out validation too.
            print training progress after training every verbose_freq sentences.
        # lr: learning rate.
        # nhidden: number of hidden neurons.
        # emb_dim: dimension of word embeddings.
        # nepochs: number of epochs.
        # return: end training when max patience is reached.
        '''
        print "... configuring model"
        # learning word embeddings using word2vec
        data_w = []
        for data in [self.X_train, self.X_valid, self.X_test]:
            data_w += [map(i2w.get,d) for datum in data for d in datum]
        self.w2v = Word2Vec(data_w,size=100,min_count=1)                
        # building customized word embedding weights
        index_dict = self.dicts['words2idx']
        word_vectors = {w:self.w2v[w] for w in index_dict.iterkeys()}
        n_symbols = len(index_dict) + 1
        embedding_weights = np.zeros((n_symbols+1,emb_dim))
        for word,index in index_dict.items():
            embedding_weights[index,:] = word_vectors[word] 
        # setting up params for model           
        vocsize = len(self.dicts['words2idx'])
        nlabels = len(self.dicts['labels2idx'])
        nsents = len(self.X_train)
        self.dicts['idx2labels'] = {i:l for l,i in self.dicts['labels2idx'].iteritems()}
        self.dicts['idx2words'] = {i:w for w,i in self.dicts['words2idx'].iteritems()}
        
        print "... building model"            
        self.model = Sequential()
        self.model.add(Embedding(output_dim=emb_dim, input_dim=n_symbols+1, mask_zero=True, weights=[embedding_weights]))
        if self.mode=='LSTM':
            self.model.add(LSTM(output_dim=nhidden, activation='sigmoid', return_sequences=True))
        elif self.mode=='GRU':
            self.model.add(GRU(output_dim=nhidden, activation='sigmoid', return_sequences=True))
        else: 
            self.model.add(SimpleRNN(output_dim=nhidden, activation='sigmoid', return_sequences=True))
        self.model.add(TimeDistributed(Dense(output_dim=nlabels)))
        self.model.add(Activation('softmax'))
        sgd = SGD(lr=lr, momentum=.0, decay=.0, nesterov=False)
        self.model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    
        print "... training model"
        patience = 3000
        patience_increase_ratio = 2
        improvement_threshold = .995
        best_validation_loss = np.inf
        best_iter = 0
        best_loss = np.inf
        best_accuracy = 0.
        for e in xrange(nepochs):
            e += 1
            if verbose: print "... Epoch: %d" % e
            self.__shuffle(self.X_train,self.Y_train,seed=0)
            for i in xrange(nsents):
                if verbose and i!=0 and i%verbose_freq==0:
                    print "    ... trained %d sentences" % i
                if self.X_train[i].shape[1]==1: continue
                self.model.train_on_batch(self.X_train[i],self.Y_train[i])
        
                if validation and i%validation_freq==0:
                    sample_idxs = random.sample(range(len(self.X_valid)),100)
                    X_valid_samples = [self.X_valid[k] for k in sample_idxs]
                    Y_valid_samples = [self.Y_valid[k] for k in sample_idxs]
                    avg_loss, avg_accuracy = self.__get_mean_evaluation(X_valid_samples, 
                                                                        Y_valid_samples)
                    current_iter = ((e-1)*nsents+i)
                    if avg_loss < best_validation_loss*improvement_threshold:
                        self.__save_best_model(mode='best_valid_model')
                        patience = max(patience, current_iter*patience_increase_ratio)
                        best_validation_loss = avg_loss
                        best_iter = current_iter
                        sample_idxs = random.sample(range(len(self.X_test)),100)
                        X_test_samples = [self.X_test[k] for k in sample_idxs]
                        Y_test_samples = [self.Y_test[k] for k in sample_idxs]
                        avg_test_loss, avg_test_accuracy = self.__get_mean_evaluation(X_test_samples, 
                                                                                      Y_test_samples)
                        print "Validation: Loss = %.6f | Accuracy = %.6f" % (avg_loss, avg_accuracy)
                        print "Test: Loss = %.6f | Accuracy = %.6f" % (avg_test_loss, avg_test_accuracy)
                        if avg_test_loss<best_loss and avg_test_accuracy<best_accuracy:
                            best_loss = avg_test_loss
                            best_accuracy = avg_test_accuracy
                            self.__save_best_model(mode='best_test_model')                            
                    if patience <= current_iter:
                        print "TRAINING COMPLETE (at iteration %d)" % current_iter
                        return
                
    def predict(self, x): 
        '''
        # x: sentences in string (e.g. "i'd like to book a flight from london to paris")
        # return: a list of predicted labels associated with the words in the sentence x.
        '''
        x_filtered = filter(lambda w:w in self.dicts['words2idx'].keys(), x.split())
        x_encoded = np.asarray([map(self.dicts['words2idx'].get,x_filtered)])
        pred_encoded = map(np.argmax,self.model.predict_on_batch(x_encoded)[0])
        return map(self.dicts['idx2labels'].get, pred_encoded)

In [262]:
data = load_atis(path='/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA/')
model_dir = '/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/ojo_ner/ojo_ner/models/'

In [263]:
ner = NER(data,model_dir=model_dir)

In [264]:
ner.train(validation=1,validation_freq=1000,verbose=1,verbose_freq=500,nepochs=20)

... configuring model
... building model
... training model
... Epoch: 1
New best_valid_model Saved!
Validation: Loss = 4.251051 | Accuracy = 0.000000
Test: Loss = 4.323856 | Accuracy = 0.000000
    ... trained 500 sentences
    ... trained 1000 sentences
New best_valid_model Saved!
Validation: Loss = 1.126817 | Accuracy = 0.723969
Test: Loss = 1.352461 | Accuracy = 0.669490
    ... trained 1500 sentences
    ... trained 2000 sentences
New best_valid_model Saved!
Validation: Loss = 0.874869 | Accuracy = 0.805185
Test: Loss = 1.124764 | Accuracy = 0.764178
    ... trained 2500 sentences
    ... trained 3000 sentences
New best_valid_model Saved!
Validation: Loss = 0.657061 | Accuracy = 0.849701
Test: Loss = 0.904144 | Accuracy = 0.806225
    ... trained 3500 sentences
... Epoch: 2
    ... trained 500 sentences
    ... trained 1000 sentences
New best_valid_model Saved!
Validation: Loss = 0.627164 | Accuracy = 0.870469
Test: Loss = 0.703038 | Accuracy = 0.857402
    ... trained 1500 senten

In [265]:
sent = "i'd like to book a flight from charlotte to las vegas"
print ner.predict(sent)

['O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name']
