# Load Data

In [4]:
import gzip, pickle

In [5]:
path = "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA/"

In [6]:
f = gzip.open(path+'atis.fold0.pkl.gz','rb')
train, valid, test, dicts = pickle.load(f)

In [12]:
w2i = dicts['words2idx']
l2i = dicts['labels2idx']
i2w = {i:w for w,i in dicts['words2idx'].iteritems()}
i2l = {i:l for l,i in dicts['labels2idx'].iteritems()}
word_vocab = w2i.keys()
label_vocab = l2i.keys()

In [9]:
X_train = [map(i2w.get, encoded_sent) for encoded_sent in train[0]]
X_test = [map(i2w.get, encoded_sent) for encoded_sent in test[0]]
Y_train = [map(i2l.get, encoded_labels) for encoded_labels in train[2]]
Y_test = [map(i2l.get, encoded_sent) for encoded_sent in test[2]]

In [10]:
print X_train[0]
print Y_train[0]
print
print X_test[0]
print Y_test[0]

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']
['O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time_relative', 'B-depart_time.time', 'O', 'O', 'B-depart_time.period_of_day', 'O', 'O', 'O', 'B-toloc.city_name', 'I-toloc.city_name']

['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'st.', 'louis']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']


In [14]:
print len(word_vocab)
print len(label_vocab)

572
127


In [18]:
print len(X_train), ' ', len(Y_train)

3983   3983


In [16]:
tw = X_train[0]
tl = Y_train[0]

In [17]:
for i in range(2,len(tl)):
    print tw[i-2], ' ', 

what   flights   leave   atlanta   at   about   DIGIT   in   the   afternoon   and   arrive   in  


# Build Raw Cooccurrence Matrices

In [19]:
import numpy as np

In [27]:
def build_w2l_matrix(pos):
    m = np.zeros((len(word_vocab),len(label_vocab)))
    for X,Y in zip(X_train,Y_train):
        for i in xrange(abs(pos),min(len(Y),len(Y)-pos)):
            m[w2i[X[i+pos]]][l2i[Y[i]]] += 1
    return m

In [28]:
%%time
w2l_ms = {'neg2':build_w2l_matrix(pos=-2),
          'neg1':build_w2l_matrix(pos=-1),
          'zero':build_w2l_matrix(pos=0),
          'pos1':build_w2l_matrix(pos=1),
          'pos2':build_w2l_matrix(pos=2)}

CPU times: user 154 ms, sys: 9.06 ms, total: 163 ms
Wall time: 157 ms


# Compute SPS & Asscoiation Strength

**Informal Definition**

* **Selectional Preference Strength (SPS)**: The extent to which the probability of observing a feature $ft$ increases (from its marginal probability distribution on a dataset) on conditioning $ft$ on a particular label $l$.

* **Selectional Association Strength (SAS)**: The contribution of a feature value $ft_i$ to the overall $SPS$ of a label $l$ with respect to $ft$.


**Math**

* **SPS**

    * $SPS(l) = D_{KL}(P(ft|l)||P(ft)) = \sum_{i}P(ft_i|l)\cdot log\frac{P(ft_i|l)}{P(ft_i)}$.
    

* **SAS**

    * $SAS(ft_i,l) = \frac{P(ft_i|l)\cdot log\frac{P(ft_i|l)}{P(ft_i)}}{SPS(l)}$.

In [29]:
from __future__ import division

In [30]:
log = lambda x: np.log(x) if x>0 else np.log(1e-20)
div = lambda x,y: x/y if y!=0 else 0

In [34]:
# resnik association matrix
def resnik_association_matrix(w2l):
    
    total_count = w2l.sum()
    p_w = lambda w: div(w2l[w2i[w],:].sum(), total_count)
    p_w_given_l = lambda w,l: div(w2l[w2i[w]][l2i[l]], w2l[:,l2i[l]].sum())
    sps_w_l = lambda w,l: p_w_given_l(w,l)*log(div(p_w_given_l(w,l),p_w(w)))
    sps_l = lambda l: sum(p_w_given_l(w,l)*log(div(p_w_given_l(w,l),p_w(w))) 
                          for w in word_vocab)
    
    asso_w_l = np.zeros((len(word_vocab),len(label_vocab)))
    for l in label_vocab:
        s_l = sps_l(l)
        for w in word_vocab:
            asso_w_l[w2i[w]][l2i[l]] = div(sps_w_l(w,l),s_l)
    
    return asso_w_l
    

In [64]:
%%time
asso_ms = {'neg2':resnik_association_matrix(w2l_ms['neg2']),
           'neg1':resnik_association_matrix(w2l_ms['neg1']),
           'zero':resnik_association_matrix(w2l_ms['zero']),
           'pos1':resnik_association_matrix(w2l_ms['pos1']),
           'pos2':resnik_association_matrix(w2l_ms['pos2'])}

CPU times: user 10.5 s, sys: 31.3 ms, total: 10.5 s
Wall time: 10.5 s


In [116]:
def top_k(asso, l, k=5):
    
    l_vec = asso[:,l2i[l]]
    k_ws = np.argsort(l_vec)[::-1][:k]
    
    return zip(map(i2w.get, k_ws),[l_vec[w_idx] for w_idx in k_ws])

In [97]:
l1 = 'B-fromloc.city_name'
l2 = 'B-toloc.city_name'

In [98]:
top_k(asso_ms['neg1'], l1)

[('from', 0.91907842123589178), ('between', 0.071307412020743574), ('leaving', 0.021747763665529767), ('leave', 0.010534667010072322), ('leaves', 0.00080508222757163454)]


In [99]:
top_k(asso_ms['neg1'], l2)

[('to', 0.92025153953489625), ('and', 0.051358698459713206), ('in', 0.019955171600037791), ('downtown', 0.0059494972721757028), ('into', 0.0052958856826928287)]


##### WEAK FEATURES IN CRF FOR ATIS

In [103]:
# F1 <= 50 in CRF 
L = ['I-restriction_code','B-arrive_time.period_of_day','I-return_date.date_relative','I-airport_name',
     'B-airport_code','I-city_name','B-restriction_code','B-depart_time.start_time','I-arrive_time.time_relative',
     'B-arrive_date.day_number','B-arrive_date.day_name','I-fromloc.state_name','B-mod','B-toloc.country_name',
     'B-return_date.date_relative','I-transport_type','I-arrive_time.start_time','B-airport_name',
     'B-toloc.airport_code','B-days_code','I-state_name','I-flight_number','B-state_name','B-depart_time.end_time',
     'B-depart_time.end_time','I-depart_time.end_time']

In [104]:
l = 'I-restriction_code'

In [106]:
for m in asso_ms.iterkeys():
    top_k(asso_ms[m], l, k=1)

[('restriction', 0.9934885679918003)]
[('DIGITDIGIT', 1.0)]
[('ap', 1.0)]
[('your', 0.0)]
[('mean', 0.81521683319047966)]


In [109]:
0.9934885679918003*.2 + .2 + .2 + 0 + 0.81521683319047966*.2

0.761741080236456

In [110]:
for m in asso_ms.iterkeys():
    top_k(asso_ms[m], l, k=5)

[('restriction', 0.9934885679918003), ('the', 0.006511432008199646), ('express', -0.0), ('flight', -0.0), ('flies', -0.0)]
[('DIGITDIGIT', 1.0), ('your', -0.0), ('fly', -0.0), ('flights', -0.0), ('flight', -0.0)]
[('ap', 1.0), ('your', -0.0), ('goes', -0.0), ('flights', -0.0), ('flight', -0.0)]
[('your', 0.0), ('fifteenth', 0.0), ('flight', 0.0), ('flies', 0.0), ('fit', 0.0)]
[('mean', 0.81521683319047966), ('restriction', 0.18478316680952037), ('flights', -0.0), ('flight', -0.0), ('flies', -0.0)]


In [111]:
0.9934885679918003*.2 + .2 + .2 + 0 + 0.18478316680952037*.2

0.6356543469602641

In [112]:
from collections import defaultdict

In [119]:
# GET [-2,+2] WORD-PROB LISTS FOR LABELS IN L (k=5)
def get_w2p_dic(L):
    
    L_dic = defaultdict(lambda : defaultdict(list))
    for l in L:
        for pos in asso_ms.keys(): # pos = ['neg2','neg1','zero','pos1','pos2']
            L_dic[l][pos] = top_k(asso_ms[pos],l)
    
    return L_dic
    

In [262]:
# L_dic = get_w2p_dic(L) # condition: retagging on f1<.5 label types
L_dic = get_w2p_dic(l2i.keys()) # condition: retagging on all label types 

In [263]:
L_dic.items()[0]

('B-time_relative',
 defaultdict(list,
             {'neg1': [('morning', 1.0),
               ('go', -0.0),
               ('flight', -0.0),
               ('flies', -0.0),
               ('fit', -0.0)],
              'neg2': [('the', 1.0),
               ('your', -0.0),
               ('fifteenth', -0.0),
               ('flight', -0.0),
               ('flies', -0.0)],
              'pos1': [('DIGIT', 1.0),
               ('fly', -0.0),
               ('flights', -0.0),
               ('flight', -0.0),
               ('flies', -0.0)],
              'pos2': [('am', 1.0),
               ('goes', -0.0),
               ('flights', -0.0),
               ('flight', -0.0),
               ('flies', -0.0)],
              'zero': [('around', 1.0),
               ('goes', -0.0),
               ('florida', -0.0),
               ('flights', -0.0),
               ('flight', -0.0)]}))

In [124]:
sample = X_train[0]
print sample

['what', 'flights', 'leave', 'atlanta', 'at', 'about', 'DIGIT', 'in', 'the', 'afternoon', 'and', 'arrive', 'in', 'san', 'francisco']


In [141]:
def get_window(w_idx, X): # X: X_train[0]
    features = [X[w_idx]]
    if w_idx > 0:
        features.insert(0,X[w_idx-1])
        if w_idx > 1:
            features.insert(0,X[w_idx-2])
        else: features.insert(0,'BOS')
    else: features.insert(0,'BOS'); features.insert(0,'BOS')
    if w_idx < len(X)-1:
        features.append(X[w_idx+1])
        if w_idx < len(X)-2:
            features.append(X[w_idx+2])
        else: features.append('EOS')
    else: features += ['EOS','EOS']
    return features

In [142]:
print get_window(0,sample)
print get_window(1,sample)
print get_window(len(sample)-1,sample)

['BOS', 'BOS', 'what', 'flights', 'leave']
['BOS', 'what', 'flights', 'leave', 'atlanta']
['in', 'san', 'francisco', 'EOS', 'EOS']


In [261]:
def most_likely_relabel(ws): # ws: [-2,+2], 5-word window.
    
    ml_new_l = {'l':'','lin_comb':0}
    pos_list = ['neg2','neg1','zero','pos1','pos2']
    
    for l,w_dic in L_dic.iteritems():
        l_lin_comb = 0
        for pos,w in zip(pos_list,ws):
            for wl,prob_wl in w_dic[pos]:
                if w==wl: 
                    l_lin_comb += .2*prob_wl
                    break
        if l_lin_comb > ml_new_l['lin_comb']:
            ml_new_l['l'] = l
            ml_new_l['lin_comb'] = l_lin_comb
    
    return (ml_new_l['l'],ml_new_l['lin_comb'])
    

In [154]:
# Experiment: The following label entry should be returned!!
# 
# ('B-airport_code',
#  defaultdict(list,
#              {'neg1': [('is', 0.53537965145354793),
#                ('does', 0.46462034854645201),
#                ('your', -0.0),
#                ('fifth', -0.0),
#                ('flight', -0.0)],
#               'neg2': [('what', 0.90244454901210958),
#                ('where', 0.06343329517761026),
#                ('city', 0.034122155810280101),
#                ('your', -0.0),
#                ('fifteenth', -0.0)],
#               'pos1': [('mean', 0.5608967426638114),
#                ('stand', 0.40210857992976262),
#                ('washington', 0.036994677406426035),
#                ('flies', -0.0),
#                ('fit', -0.0)],
#               'pos2': [('for', 1.0),
#                ('your', -0.0),
#                ('florida', -0.0),
#                ('flight', -0.0),
#                ('flies', -0.0)],
#               'zero': [('mco', 0.33784216297980474),
#                ('ewr', 0.28957899683983263),
#                ('<UNK>', 0.20429812405875766),
#                ('ord', 0.096526332279944199),
#                ('dfw', 0.039392220834579349)]}))
ws = ['what', 'is', 'mco', 'mean', 'for']
most_likely_relabel(ws)

B-airport_code 0.667312621222
I-arrive_time.start_time 0
B-airport_name 0
B-depart_time.start_time 0
I-arrive_time.time_relative 0
B-arrive_date.day_number 0
I-restriction_code 0.163043366638
B-arrive_date.day_name 0
I-fromloc.state_name 0
B-mod 0
B-toloc.airport_code 0
I-city_name 0
B-days_code 0.4
I-depart_time.end_time 0
B-state_name 0
I-state_name 0
I-flight_number 0
B-toloc.country_name 0
B-arrive_time.period_of_day 0
B-return_date.date_relative 0
I-transport_type 0
I-return_date.date_relative 0
B-depart_time.end_time 0
B-restriction_code 0
I-airport_name 0


('B-airport_code', 0.66731262122185475)

In [167]:
sample_X = X_test[0] # note that this is the original word list, not featurized list.
sample_Y = y_pred[0] # y_pred = [tagger.tag(sent) for sent in X_test_featurized]
print sample_X
print
print sample_Y

['i', 'would', 'like', 'to', 'find', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'st.', 'louis']

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']


In [269]:
def retag_check(X,Y,threshold=.5): # X,Y in X_test, pred(X_test_featurized)
    
    for w_idx in xrange(len(X)):
        if Y[w_idx]!='O': continue
        ws = get_window(w_idx,X)
        l, prob_l = most_likely_relabel(ws)
        if prob_l > threshold:
            Y[w_idx] = l
    
    return Y


In [194]:
print retag_check(sample_X,sample_Y)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']


# Good Old 5-Gram CRF

In [132]:
def word_window(w_idx, X): # X: X_train[0]
    features = ['0'+X[w_idx]]
    if w_idx > 0:
        features.append('-1'+X[w_idx-1])
        if w_idx > 1:
            features.append('-2'+X[w_idx-2])
        else: features.append('BOS')
    else: features += ['BOS','BOS']
    if w_idx < len(X)-1:
        features.append('+1'+X[w_idx+1])
        if w_idx < len(X)-2:
            features.append('+2'+X[w_idx+2])
        else: features.append('EOS')
    else: features += ['EOS','EOS']
    return features

def sent_window(X):
    return [word_window(w_idx, X) for w_idx in xrange(len(X))]

In [133]:
print word_window(0,X_train[0])
print word_window(1,X_train[0])
print word_window(len(X_train[0])-1,X_train[0])

['0what', 'BOS', 'BOS', '+1flights', '+2leave']
['0flights', '-1what', 'BOS', '+1leave', '+2atlanta']
['0francisco', '-1san', '-2in', 'EOS', 'EOS']


In [78]:
X_train_featurized = [sent_window(X) for X in X_train]
X_test_featurized = [sent_window(X) for X in X_test]

In [79]:
X_train_featurized[0]

[['0what', 'BOS', '+1flights', '+2leave'],
 ['0flights', '-1what', '+1leave', '+2atlanta'],
 ['0leave', '-1flights', '-2what', '+1atlanta', '+2at'],
 ['0atlanta', '-1leave', '-2flights', '+1at', '+2about'],
 ['0at', '-1atlanta', '-2leave', '+1about', '+2DIGIT'],
 ['0about', '-1at', '-2atlanta', '+1DIGIT', '+2in'],
 ['0DIGIT', '-1about', '-2at', '+1in', '+2the'],
 ['0in', '-1DIGIT', '-2about', '+1the', '+2afternoon'],
 ['0the', '-1in', '-2DIGIT', '+1afternoon', '+2and'],
 ['0afternoon', '-1the', '-2in', '+1and', '+2arrive'],
 ['0and', '-1afternoon', '-2the', '+1arrive', '+2in'],
 ['0arrive', '-1and', '-2afternoon', '+1in', '+2san'],
 ['0in', '-1arrive', '-2and', '+1san', '+2francisco'],
 ['0san', '-1in', '-2arrive', '+1francisco'],
 ['0francisco', '-1san', '-2in', 'EOS']]

In [80]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import os, cPickle, time, random
from sklearn.metrics import accuracy_score
from itertools import chain

In [81]:
crf_ner = pycrfsuite.Trainer(verbose=0)

In [82]:
for X,Y in zip(X_train_featurized, Y_train):
    crf_ner.append(X,Y)

In [83]:
config={'c1': 1.0, # coef for L1.
        'c2': 1e-3, # coef for L2.
        'max_iterations': 50,
        'feature.possible_transitions':True}
crf_ner.set_params(config)

In [84]:
%%time
crf_ner.train('words_window.crfsuite')

CPU times: user 1min 43s, sys: 273 ms, total: 1min 44s
Wall time: 1min 44s


In [85]:
tagger = pycrfsuite.Tagger()
tagger.open('words_window.crfsuite')

<contextlib.closing at 0x10ef83a50>

### A. No Retagging

In [181]:
y_true = Y_test
y_pred = [tagger.tag(sent) for sent in X_test_featurized]

In [183]:
y_true_merged = list(chain.from_iterable(y_true))
y_pred_merged = list(chain.from_iterable(y_pred))

In [184]:
from sklearn.metrics import accuracy_score
print "Accuracy: %.6f" % accuracy_score(y_true_merged,y_pred_merged)

Accuracy: 0.960426


In [186]:
lb = LabelBinarizer()
y_true_in_tags = lb.fit_transform(list(chain.from_iterable(y_true))) # get a list of tags in 1-hot.
y_pred_in_tags = lb.transform(list(chain.from_iterable(y_pred)))
tagset = list(set(lb.classes_))
class_indices = {cls:idx for idx,cls in enumerate(lb.classes_)}
print classification_report(
    y_true_in_tags,
    y_pred_in_tags,
    labels = [class_indices[cls] for cls in tagset],
    target_names = tagset
)

                              precision    recall  f1-score   support

B-depart_date.today_relative       0.89      0.89      0.89         9
 B-arrive_date.date_relative       1.00      0.50      0.67         2
 B-depart_date.date_relative       0.85      1.00      0.92        17
          I-restriction_code       1.00      0.33      0.50         3
      B-depart_date.day_name       0.95      0.99      0.97       212
      I-arrive_time.end_time       0.55      0.75      0.63         8
      B-fromloc.airport_code       1.00      0.60      0.75         5
             B-cost_relative       1.00      0.97      0.99        37
                   B-connect       1.00      1.00      1.00         6
             B-flight_number       0.92      1.00      0.96        11
 B-depart_time.time_relative       0.92      0.92      0.92        65
           I-toloc.city_name       0.92      0.97      0.95       265
 B-arrive_time.period_of_day       0.00      0.00      0.00         6
 B-depart_time.peri

### B. Retagging on F1<.5 Label Types (Threshold = .5, only retag 'O')

In [250]:
y_true = Y_test
y_pred = [tagger.tag(sent) for sent in X_test_featurized]

In [251]:
for i,(X,Y) in enumerate(zip(X_test,y_pred)):
    y_pred[i] = retag_check(X,Y,.5)

In [252]:
y_true_merged = list(chain.from_iterable(y_true))
y_pred_merged = list(chain.from_iterable(y_pred))

In [253]:
from sklearn.metrics import accuracy_score
print "Accuracy: %.6f" % accuracy_score(y_true_merged,y_pred_merged)

Accuracy: 0.960970


In [254]:
lb = LabelBinarizer()
y_true_in_tags = lb.fit_transform(list(chain.from_iterable(y_true))) # get a list of tags in 1-hot.
y_pred_in_tags = lb.transform(list(chain.from_iterable(y_pred)))
tagset = list(set(lb.classes_))
class_indices = {cls:idx for idx,cls in enumerate(lb.classes_)}
print classification_report(
    y_true_in_tags,
    y_pred_in_tags,
    labels = [class_indices[cls] for cls in tagset],
    target_names = tagset
)

                              precision    recall  f1-score   support

B-depart_date.today_relative       0.89      0.89      0.89         9
 B-arrive_date.date_relative       1.00      0.50      0.67         2
 B-depart_date.date_relative       0.85      1.00      0.92        17
          I-restriction_code       1.00      1.00      1.00         3
      B-depart_date.day_name       0.95      0.99      0.97       212
      I-arrive_time.end_time       0.55      0.75      0.63         8
      B-fromloc.airport_code       1.00      0.60      0.75         5
             B-cost_relative       1.00      0.97      0.99        37
                   B-connect       1.00      1.00      1.00         6
             B-flight_number       0.92      1.00      0.96        11
 B-depart_time.time_relative       0.92      0.92      0.92        65
           I-toloc.city_name       0.92      0.97      0.95       265
 B-arrive_time.period_of_day       0.00      0.00      0.00         6
 B-depart_time.peri

### C. Retagging on All Label Types (Thresold = .5, retag all)

In [299]:
y_true = Y_test
y_pred = [tagger.tag(sent) for sent in X_test_featurized]

In [300]:
for i,(X,Y) in enumerate(zip(X_test,y_pred)):
    y_pred[i] = retag_check(X,Y,.50)

In [301]:
y_true_merged = list(chain.from_iterable(y_true))
y_pred_merged = list(chain.from_iterable(y_pred))

In [302]:
from sklearn.metrics import accuracy_score
print "Accuracy: %.6f" % accuracy_score(y_true_merged,y_pred_merged)

Accuracy: 0.960970


In [274]:
lb = LabelBinarizer()
y_true_in_tags = lb.fit_transform(list(chain.from_iterable(y_true))) # get a list of tags in 1-hot.
y_pred_in_tags = lb.transform(list(chain.from_iterable(y_pred)))
tagset = list(set(lb.classes_))
class_indices = {cls:idx for idx,cls in enumerate(lb.classes_)}
print classification_report(
    y_true_in_tags,
    y_pred_in_tags,
    labels = [class_indices[cls] for cls in tagset],
    target_names = tagset
)

                              precision    recall  f1-score   support

B-depart_date.today_relative       0.89      0.89      0.89         9
 B-arrive_date.date_relative       1.00      0.50      0.67         2
 B-depart_date.date_relative       0.85      1.00      0.92        17
          I-restriction_code       1.00      1.00      1.00         3
      B-depart_date.day_name       0.95      0.99      0.97       212
      I-arrive_time.end_time       0.50      0.75      0.60         8
      B-fromloc.airport_code       1.00      0.60      0.75         5
             B-cost_relative       1.00      0.97      0.99        37
                   B-connect       1.00      1.00      1.00         6
             B-flight_number       0.92      1.00      0.96        11
 B-depart_time.time_relative       0.92      0.92      0.92        65
           I-toloc.city_name       0.92      0.97      0.95       265
 B-arrive_time.period_of_day       0.00      0.00      0.00         6
 B-depart_time.peri