# Data Augmentation

In [1]:
import gzip, pickle
import spacy
from spacy.en import English

In [2]:
parser = English()

In [3]:
def extract_info(sent): 
    # assuming parser = spacy.English()
    # sent is a list of words
    if type(sent)==list: sent = ' '.join(sent)
    parsed = parser(unicode(sent))# if type(sent)==str else parser(unicode(sent))
    pos = [token.pos_ for token in parsed]
    ner = ['none' if token.ent_type_=='' else token.ent_type_ for token in parsed]
    dep_rel = [token.dep_ for token in parsed]
    dep_head = [token.head.orth_ for token in parsed]
    return pos, ner, dep_rel, dep_head

In [4]:
path = "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA/"

In [5]:
f = gzip.open(path+'atis.fold0.pkl.gz','rb')
train, valid, test, dicts = pickle.load(f)

In [6]:
train[0][0]

array([554, 194, 268,  64,  62,  16,   8, 234, 481,  20,  40,  58, 234,
       415, 205], dtype=int32)

In [7]:
dicts.keys() # 'tables2idx' is not used, since it's not derivable generally (atis-specific info)

['labels2idx', 'tables2idx', 'words2idx']

In [8]:
i2w = {i:w for w,i in dicts['words2idx'].iteritems()}
i2l = {i:l for l,i in dicts['labels2idx'].iteritems()}

In [9]:
print map(i2w.get, train[0][0])

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']


In [10]:
X_train = [map(i2w.get, encoded_sent) for encoded_sent in train[0]]
X_test = [map(i2w.get, encoded_sent) for encoded_sent in test[0]]

In [11]:
print X_train[0]
print X_test[0]

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']
['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'st.', 'louis']


In [12]:
Y_train = [map(i2l.get, encoded_labels) for encoded_labels in train[2]]
Y_test = [map(i2l.get, encoded_sent) for encoded_sent in test[2]]

In [13]:
print Y_train[0]
print Y_test[0]

['O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time_relative', 'B-depart_time.time', 'O', 'O', 'B-depart_time.period_of_day', 'O', 'O', 'O', 'B-toloc.city_name', 'I-toloc.city_name']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']


In [14]:
len(dicts['labels2idx'].keys()) # 127 labels in total

127

In [15]:
def augment_info(sent): 
    # sent: a list of words.
    # return: (words, pos, ner, dep_rel, dep_head).
    pos, ner, dep_rel, dep_head = extract_info(sent)
    return (sent, pos, ner, dep_rel, dep_head)

In [16]:
print augment_info(X_train[0])

(['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco'], [u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN'], ['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none'], [u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj'], [u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in'])


In [17]:
X_train_augmented = [augment_info(sent) for sent in X_train]
X_test_augmented = [augment_info(sent) for sent in X_test]

In [18]:
for entry in X_train_augmented[0]:
    print entry

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']
[u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN']
['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none']
[u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj']
[u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in']


# Featurization

In [19]:
from pyparsing import StringEnd, oneOf, FollowedBy, Optional, ZeroOrMore, SkipTo

In [20]:
prefix = ['anti','de','dis','en','em','fore','in','im','il','ir',
          'inter','mid','mis','non','over','pre','re','semi','sub',
          'super','trans','un','under']
suffix = ['able','ible','al','ial','ed','en','er','est','ful','ic',
          'ing','ion','tion','ation','ition','ity','ty','ive','ative',
          'itive','less','ly','ment','ness','ous','eous','ious','s',
          'es','y','ism']

In [28]:
class Featurize:
    
    def __init__(self, prefix=[], suffix=[]): # lists of pfx/sfx.
        self.prefix = prefix
        self.suffix = suffix
        end_of_string = StringEnd()
        pfx_pyp_regex = oneOf(' '.join(prefix))
        sfx_pyp_regex = oneOf(' '.join(suffix)) + FollowedBy(end_of_string)
        self.template = (ZeroOrMore(pfx_pyp_regex)('prefix') +
                         SkipTo(sfx_pyp_regex | end_of_string)('root') + 
                         Optional(sfx_pyp_regex)('suffix'))              
        self.afx = lambda word: self.template.parseString(word)
        self.feat_set = {'pfx': lambda w_idx,datum: self.afx(datum[0][w_idx]).prefix[0] \
                                     if self.afx(datum[0][w_idx]).prefix!='' else 'no_pfx',
                         'sfx': lambda w_idx,datum: self.afx(datum[0][w_idx]).suffix[0] \
                                     if self.afx(datum[0][w_idx]).suffix!='' else 'no_sfx',
                         'root': lambda w_idx,datum: self.afx(datum[0][w_idx]).root,
                         'isdigit': lambda w_idx,datum: datum[0][w_idx].isdigit(),
                         'pos': lambda w_idx,datum: datum[1][w_idx],
                         'ner': lambda w_idx,datum: datum[2][w_idx],
                         'dep_rel': lambda w_idx,datum: datum[3][w_idx],
                         'dep_head': lambda w_idx,datum: datum[4][w_idx]} 
            # datum: (words, pos, ner, dep_rel, dep_head)
        
    def word_featurize(self, datum, i): 
        # datum: (sent, pos, ner, dep_rel, dep_head).
        # i: index of the token processed. 
        features = {}
        for feat in self.feat_set.keys():
            features[feat] = str(self.feat_set[feat](i,datum))
        features['boundary'] = 'No'
        if i > 0:
            for feat in self.feat_set.keys():
                features[feat+'-1'] = str(self.feat_set[feat](i-1,datum))
                if i > 1:
                    features[feat+'-2'] = str(self.feat_set[feat](i-2,datum))
        else: features['boundary'] = 'BOS'
        if i < len(datum[0])-1:
            for feat in self.feat_set.keys():
                features[feat+'+1'] = str(self.feat_set[feat](i+1,datum))
                if i < len(datum[0])-2:
                    features[feat+'+2'] = str(self.feat_set[feat](i+2,datum))
        else: features['boundary'] = 'EOS'
        
        return features
        
    def sent_featurize(self, datum):
        # datum: (sent, pos, ner, dep_rel, dep_head).   
        return [self.word_featurize(datum, i) for i in xrange(len(datum[0]))] 

In [29]:
featurizer = Featurize(prefix,suffix)

In [30]:
test_sent = X_train_augmented[0]
print test_sent

(['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco'], [u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN'], ['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none'], [u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj'], [u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in'])


In [31]:
print featurizer.word_featurize(test_sent,1)

{'pos': 'NOUN', 'pfx+2': 'no_pfx', 'dep_head+2': 'at', 'dep_head+1': 'leave', 'isdigit+2': 'False', 'isdigit': 'False', 'dep_head': 'leave', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'pfx-1': 'no_pfx', 'dep_head-1': 'flights', 'isdigit-1': 'False', 'dep_rel-1': 'det', 'dep_rel+2': 'advmod', 'ner+1': 'none', 'root-1': 'what', 'boundary': 'No', 'dep_rel': 'nsubj', 'root+2': 'atlanta', 'root+1': 'leave', 'ner+2': 'none', 'ner-1': 'none', 'dep_rel+1': 'ROOT', 'sfx': 's', 'sfx-1': 'no_sfx', 'sfx+1': 'no_sfx', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'pos-1': 'ADJ', 'ner': 'none', 'pos+2': 'ADV', 'pos+1': 'VERB', 'root': 'flight'}


In [32]:
print featurizer.sent_featurize(test_sent)[0]

{'pos': 'ADJ', 'isdigit': 'False', 'dep_head+2': 'leave', 'dep_head+1': 'leave', 'pfx+2': 'no_pfx', 'isdigit+2': 'False', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'dep_rel+2': 'ROOT', 'ner+1': 'none', 'boundary': 'BOS', 'dep_rel': 'det', 'root+2': 'leave', 'root+1': 'flight', 'ner+2': 'none', 'dep_head': 'flights', 'dep_rel+1': 'nsubj', 'sfx': 'no_sfx', 'sfx+1': 's', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'ner': 'none', 'pos+2': 'VERB', 'pos+1': 'NOUN', 'root': 'what'}


In [35]:
train_sents = [(x_augmented,y) for x_augmented,y in zip(X_train_augmented,Y_train)]
test_sents = [(x_augmented,y) for x_augmented,y in zip(X_test_augmented,Y_test)]

In [36]:
print train_sents[0][0]
print 
print train_sents[0][1]

(['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco'], [u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN'], ['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none'], [u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj'], [u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in'])

['O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time_relative', 'B-depart_time.time', 'O', 'O', 'B-depart_time.period_of_day', 'O', 'O', 'O', 'B-toloc.city_name', 'I-toloc.city_name']


In [37]:
%%time
X_train = [featurizer.sent_featurize(datum[0]) for datum in train_sents]
Y_train = [datum[1] for datum in train_sents]

CPU times: user 1min 35s, sys: 602 ms, total: 1min 36s
Wall time: 1min 36s


In [40]:
print X_train[0][0]
print 
print Y_train[0][0]

{'pos': 'ADJ', 'isdigit': 'False', 'dep_head+2': 'leave', 'dep_head+1': 'leave', 'pfx+2': 'no_pfx', 'isdigit+2': 'False', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'dep_rel+2': 'ROOT', 'ner+1': 'none', 'boundary': 'BOS', 'dep_rel': 'det', 'root+2': 'leave', 'root+1': 'flight', 'ner+2': 'none', 'dep_head': 'flights', 'dep_rel+1': 'nsubj', 'sfx': 'no_sfx', 'sfx+1': 's', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'ner': 'none', 'pos+2': 'VERB', 'pos+1': 'NOUN', 'root': 'what'}

O


In [41]:
train = [(w_feats,label) for X,Y in zip(X_train,Y_train) for w_feats,label in zip(X,Y)]

In [43]:
print train[0]
print
print train[1]

({'pos': 'ADJ', 'isdigit': 'False', 'dep_head+2': 'leave', 'dep_head+1': 'leave', 'pfx+2': 'no_pfx', 'isdigit+2': 'False', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'dep_rel+2': 'ROOT', 'ner+1': 'none', 'boundary': 'BOS', 'dep_rel': 'det', 'root+2': 'leave', 'root+1': 'flight', 'ner+2': 'none', 'dep_head': 'flights', 'dep_rel+1': 'nsubj', 'sfx': 'no_sfx', 'sfx+1': 's', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'ner': 'none', 'pos+2': 'VERB', 'pos+1': 'NOUN', 'root': 'what'}, 'O')

({'pos': 'NOUN', 'pfx+2': 'no_pfx', 'dep_head+2': 'at', 'dep_head+1': 'leave', 'isdigit+2': 'False', 'isdigit': 'False', 'dep_head': 'leave', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'pfx-1': 'no_pfx', 'dep_head-1': 'flights', 'isdigit-1': 'False', 'dep_rel-1': 'det', 'dep_rel+2': 'advmod', 'ner+1': 'none', 'root-1': 'what', 'boundary': 'No', 'dep_rel': 'nsubj', 'root+2': 'atlanta', 'root+1': 'leave', 'ner+2': 'none', 'ner-1': 'none', 'dep_rel+1': 'ROOT', 'sfx': 's', 'sfx-1': 'no_sfx', 'sfx+1': 'no_sfx', 'pfx': 'no_

In [44]:
%%time
X_test = [featurizer.sent_featurize(datum[0]) for datum in test_sents]
Y_test = [datum[1] for datum in test_sents]
test = [(w_feats,label) for X,Y in zip(X_test,Y_test) for w_feats,label in zip(X,Y)]

CPU times: user 19.3 s, sys: 144 ms, total: 19.5 s
Wall time: 19.5 s


# Maxent Classifier

In [46]:
import random
import numpy as np
from nltk import MaxentClassifier, classify

In [52]:
%%time
me_lbfgs = MaxentClassifier.train(train, algorithm='megam', trace=3, max_iter=10)

CPU times: user 3min 13s, sys: 2.67 s, total: 3min 16s
Wall time: 12min 55s


In [53]:
print "L-BFGS: %.2f%%" % (classify.accuracy(me_lbfgs,test)*100)

L-BFGS: 90.86%
