# Data Augmentation

In [21]:
import gzip, pickle
import spacy
from spacy.en import English

In [22]:
parser = English()

In [23]:
def extract_info(sent): 
    # assuming parser = spacy.English()
    # sent is a list of words
    if type(sent)==list: sent = ' '.join(sent)
    parsed = parser(unicode(sent))# if type(sent)==str else parser(unicode(sent))
    pos = [token.pos_ for token in parsed]
    ner = ['none' if token.ent_type_=='' else token.ent_type_ for token in parsed]
    dep_rel = [token.dep_ for token in parsed]
    dep_head = [token.head.orth_ for token in parsed]
    return pos, ner, dep_rel, dep_head

In [24]:
path = "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA/"

In [25]:
f = gzip.open(path+'atis.fold0.pkl.gz','rb')
train, valid, test, dicts = pickle.load(f)

In [26]:
train[0][0]

array([554, 194, 268,  64,  62,  16,   8, 234, 481,  20,  40,  58, 234,
       415, 205], dtype=int32)

In [27]:
dicts.keys() # 'tables2idx' is not used, since it's not derivable generally (atis-specific info)

['labels2idx', 'tables2idx', 'words2idx']

In [28]:
i2w = {i:w for w,i in dicts['words2idx'].iteritems()}
i2l = {i:l for l,i in dicts['labels2idx'].iteritems()}

In [29]:
print map(i2w.get, train[0][0])

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']


In [30]:
X_train = [map(i2w.get, encoded_sent) for encoded_sent in train[0]]
X_test = [map(i2w.get, encoded_sent) for encoded_sent in test[0]]

In [31]:
print X_train[0]
print X_test[0]

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']
['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'st.', 'louis']


In [32]:
Y_train = [map(i2l.get, encoded_labels) for encoded_labels in train[2]]
Y_test = [map(i2l.get, encoded_sent) for encoded_sent in test[2]]

In [33]:
print Y_train[0]
print Y_test[0]

['O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time_relative', 'B-depart_time.time', 'O', 'O', 'B-depart_time.period_of_day', 'O', 'O', 'O', 'B-toloc.city_name', 'I-toloc.city_name']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']


In [34]:
len(dicts['labels2idx'].keys()) # 127 labels in total

127

In [35]:
def augment_info(sent): 
    # sent: a list of words.
    # return: (words, pos, ner, dep_rel, dep_head).
    pos, ner, dep_rel, dep_head = extract_info(sent)
    return (sent, pos, ner, dep_rel, dep_head)

In [36]:
print augment_info(X_train[0])

(['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco'], [u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN'], ['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none'], [u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj'], [u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in'])


In [37]:
X_train_augmented = [augment_info(sent) for sent in X_train]
X_test_augmented = [augment_info(sent) for sent in X_test]

In [38]:
for entry in X_train_augmented[0]:
    print entry

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']
[u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN']
['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none']
[u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj']
[u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in']


# Featurization

In [39]:
from pyparsing import StringEnd, oneOf, FollowedBy, Optional, ZeroOrMore, SkipTo

In [40]:
prefix = ['anti','de','dis','en','em','fore','in','im','il','ir',
          'inter','mid','mis','non','over','pre','re','semi','sub',
          'super','trans','un','under']
suffix = ['able','ible','al','ial','ed','en','er','est','ful','ic',
          'ing','ion','tion','ation','ition','ity','ty','ive','ative',
          'itive','less','ly','ment','ness','ous','eous','ious','s',
          'es','y','ism']

In [41]:
class Featurize:
    
    def __init__(self, prefix=[], suffix=[]): # lists of pfx/sfx.
        self.prefix = prefix
        self.suffix = suffix
        end_of_string = StringEnd()
        pfx_pyp_regex = oneOf(' '.join(prefix))
        sfx_pyp_regex = oneOf(' '.join(suffix)) + FollowedBy(end_of_string)
        self.template = (ZeroOrMore(pfx_pyp_regex)('prefix') +
                         SkipTo(sfx_pyp_regex | end_of_string)('root') + 
                         Optional(sfx_pyp_regex)('suffix'))              
        self.afx = lambda word: self.template.parseString(word)
        self.feat_set = {'pfx': lambda w_idx,datum: self.afx(datum[0][w_idx]).prefix[0] \
                                     if self.afx(datum[0][w_idx]).prefix!='' else 'no_pfx',
                         'sfx': lambda w_idx,datum: self.afx(datum[0][w_idx]).suffix[0] \
                                     if self.afx(datum[0][w_idx]).suffix!='' else 'no_sfx',
                         'root': lambda w_idx,datum: self.afx(datum[0][w_idx]).root,
                         'isdigit': lambda w_idx,datum: datum[0][w_idx].isdigit(),
                         'pos': lambda w_idx,datum: datum[1][w_idx],
                         'ner': lambda w_idx,datum: datum[2][w_idx],
                         'dep_rel': lambda w_idx,datum: datum[3][w_idx],
                         'dep_head': lambda w_idx,datum: datum[4][w_idx]} 
            # datum: (words, pos, ner, dep_rel, dep_head)
        
    def word_featurize(self, datum, i): 
        # datum: (sent, pos, ner, dep_rel, dep_head).
        # i: index of the token processed. 
        features = {}
        for feat in self.feat_set.keys():
            features[feat] = str(self.feat_set[feat](i,datum))
        features['boundary'] = 'No'
        if i > 0:
            for feat in self.feat_set.keys():
                features[feat+'-1'] = str(self.feat_set[feat](i-1,datum))
                if i > 1:
                    features[feat+'-2'] = str(self.feat_set[feat](i-2,datum))
        else: features['boundary'] = 'BOS'
        if i < len(datum[0])-1:
            for feat in self.feat_set.keys():
                features[feat+'+1'] = str(self.feat_set[feat](i+1,datum))
                if i < len(datum[0])-2:
                    features[feat+'+2'] = str(self.feat_set[feat](i+2,datum))
        else: features['boundary'] = 'EOS'
        
        return features
        
    def sent_featurize(self, datum):
        # datum: (sent, pos, ner, dep_rel, dep_head).   
        return [self.word_featurize(datum, i) for i in xrange(len(datum[0]))] 

In [42]:
featurizer = Featurize(prefix,suffix)

In [43]:
test_sent = X_train_augmented[0]
print test_sent

(['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco'], [u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN'], ['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none'], [u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj'], [u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in'])


In [44]:
print featurizer.word_featurize(test_sent,1)

{'pos': 'NOUN', 'pfx+2': 'no_pfx', 'dep_head+2': 'at', 'dep_head+1': 'leave', 'isdigit+2': 'False', 'isdigit': 'False', 'dep_head': 'leave', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'pfx-1': 'no_pfx', 'dep_head-1': 'flights', 'isdigit-1': 'False', 'dep_rel-1': 'det', 'dep_rel+2': 'advmod', 'ner+1': 'none', 'root-1': 'what', 'boundary': 'No', 'dep_rel': 'nsubj', 'root+2': 'atlanta', 'root+1': 'leave', 'ner+2': 'none', 'ner-1': 'none', 'dep_rel+1': 'ROOT', 'sfx': 's', 'sfx-1': 'no_sfx', 'sfx+1': 'no_sfx', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'pos-1': 'ADJ', 'ner': 'none', 'pos+2': 'ADV', 'pos+1': 'VERB', 'root': 'flight'}


In [45]:
print featurizer.sent_featurize(test_sent)[0]

{'pos': 'ADJ', 'isdigit': 'False', 'dep_head+2': 'leave', 'dep_head+1': 'leave', 'pfx+2': 'no_pfx', 'isdigit+2': 'False', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'dep_rel+2': 'ROOT', 'ner+1': 'none', 'boundary': 'BOS', 'dep_rel': 'det', 'root+2': 'leave', 'root+1': 'flight', 'ner+2': 'none', 'dep_head': 'flights', 'dep_rel+1': 'nsubj', 'sfx': 'no_sfx', 'sfx+1': 's', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'ner': 'none', 'pos+2': 'VERB', 'pos+1': 'NOUN', 'root': 'what'}


In [46]:
train_sents = [(x_augmented,y) for x_augmented,y in zip(X_train_augmented,Y_train)]
test_sents = [(x_augmented,y) for x_augmented,y in zip(X_test_augmented,Y_test)]

In [47]:
print train_sents[0][0]
print 
print train_sents[0][1]

(['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco'], [u'ADJ', u'NOUN', u'VERB', u'ADV', u'ADP', u'ADP', u'PROPN', u'ADP', u'DET', u'NOUN', u'CONJ', u'VERB', u'ADP', u'PROPN', u'NOUN'], ['none', 'none', 'none', 'none', 'none', 'none', u'ORG', 'none', 'none', u'TIME', 'none', 'none', 'none', 'none', 'none'], [u'det', u'nsubj', u'ROOT', u'advmod', u'prep', u'nmod', u'pobj', u'prep', u'det', u'pobj', u'cc', u'conj', u'prep', u'compound', u'pobj'], [u'flights', u'leave', u'leave', u'at', u'leave', u'DIGIT', u'at', u'DIGIT', u'afternoon', u'in', u'leave', u'leave', u'arrive', u'francisco', u'in'])

['O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time_relative', 'B-depart_time.time', 'O', 'O', 'B-depart_time.period_of_day', 'O', 'O', 'O', 'B-toloc.city_name', 'I-toloc.city_name']


In [48]:
%%time
X_train = [featurizer.sent_featurize(datum[0]) for datum in train_sents]
Y_train = [datum[1] for datum in train_sents]

CPU times: user 1min 31s, sys: 704 ms, total: 1min 32s
Wall time: 1min 32s


In [49]:
print X_train[0][0]
print 
print Y_train[0][0]

{'pos': 'ADJ', 'isdigit': 'False', 'dep_head+2': 'leave', 'dep_head+1': 'leave', 'pfx+2': 'no_pfx', 'isdigit+2': 'False', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'dep_rel+2': 'ROOT', 'ner+1': 'none', 'boundary': 'BOS', 'dep_rel': 'det', 'root+2': 'leave', 'root+1': 'flight', 'ner+2': 'none', 'dep_head': 'flights', 'dep_rel+1': 'nsubj', 'sfx': 'no_sfx', 'sfx+1': 's', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'ner': 'none', 'pos+2': 'VERB', 'pos+1': 'NOUN', 'root': 'what'}

O


In [50]:
# train = [(w_feats,label) for X,Y in zip(X_train,Y_train) for w_feats,label in zip(X,Y)] # this is for CRF/Maxent

In [61]:
X_train_words = [w_feats for X in X_train for w_feats in X]
Y_train_words = [label for Y in Y_train for label in Y]

In [63]:
print X_train_words[0]
print
print Y_train_words[0]

{'pos': 'ADJ', 'isdigit': 'False', 'dep_head+2': 'leave', 'dep_head+1': 'leave', 'pfx+2': 'no_pfx', 'isdigit+2': 'False', 'isdigit+1': 'False', 'pfx+1': 'no_pfx', 'dep_rel+2': 'ROOT', 'ner+1': 'none', 'boundary': 'BOS', 'dep_rel': 'det', 'root+2': 'leave', 'root+1': 'flight', 'ner+2': 'none', 'dep_head': 'flights', 'dep_rel+1': 'nsubj', 'sfx': 'no_sfx', 'sfx+1': 's', 'pfx': 'no_pfx', 'sfx+2': 'no_sfx', 'ner': 'none', 'pos+2': 'VERB', 'pos+1': 'NOUN', 'root': 'what'}

O


In [52]:
%%time
X_test = [featurizer.sent_featurize(datum[0]) for datum in test_sents]
Y_test = [datum[1] for datum in test_sents]
# test = [(w_feats,label) for X,Y in zip(X_test,Y_test) for w_feats,label in zip(X,Y)] # this is for CRF/Maxent

CPU times: user 18.3 s, sys: 139 ms, total: 18.4 s
Wall time: 18.5 s


In [64]:
X_test_words = [w_feats for X in X_test for w_feats in X]
Y_test_words = [label for Y in Y_test for label in Y]

# SVM

** Training **

In [65]:
from sklearn.preprocessing import MultiLabelBinarizer

In [132]:
X = X_train_words + X_test_words
Y = Y_train_words + Y_test_words

In [134]:
X_Binarizer = MultiLabelBinarizer()
Y_Binarizer = MultiLabelBinarizer()

In [135]:
%%time
X_binarized = X_Binarizer.fit_transform(X)
Y_binarized = Y_Binarizer.fit_transform(Y)

CPU times: user 1.13 s, sys: 111 ms, total: 1.24 s
Wall time: 1.25 s


In [138]:
int(len(X_binarized)*.8)

43571

In [139]:
X_train, X_test = X_binarized[:43571], X_binarized[43571:]
Y_train, Y_test = Y_binarized[:43571], Y_binarized[43571:]

In [140]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

In [141]:
%%time
svm = OneVsRestClassifier(LinearSVC()).fit(X_train, Y_train)

  str(classes[c]))


CPU times: user 4min 49s, sys: 1.09 s, total: 4min 50s
Wall time: 4min 51s


** Evaluation **

In [142]:
from itertools import chain

In [143]:
%%time
Y_pred = svm.predict(X_test)

CPU times: user 32.4 ms, sys: 4.39 ms, total: 36.8 ms
Wall time: 32.9 ms


In [144]:
Y_true = Y_test

In [146]:
print Y_pred[0]
print
print Y_true[0]

[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [147]:
from sklearn.metrics import accuracy_score

In [148]:
print "Accuracy: %.2f" % accuracy_score(Y_true, Y_pred)

Accuracy: 0.62
