# Load Data

In [2]:
import gzip, pickle

In [3]:
path = "/Users/jacobsw/Desktop/IMPLEMENTATION_CAMP/CODE/OJO/LUIS/DATA/"

In [4]:
def load_atis():

    f = gzip.open(path+'atis.fold0.pkl.gz','rb')
    train, valid, test, dicts = pickle.load(f)
    
    w2i = dicts['words2idx']
    l2i = dicts['labels2idx']
    i2w = {i:w for w,i in dicts['words2idx'].iteritems()}
    i2l = {i:l for l,i in dicts['labels2idx'].iteritems()}  

    X_train = [map(i2w.get, encoded_sent) for encoded_sent in train[0]]
    X_valid = [map(i2w.get, encoded_sent) for encoded_sent in valid[0]]
    X_test = [map(i2w.get, encoded_sent) for encoded_sent in test[0]]
    Y_train = [map(i2l.get, encoded_labels) for encoded_labels in train[2]]
    Y_valid = [map(i2l.get, encoded_labels) for encoded_labels in valid[2]]
    Y_test = [map(i2l.get, encoded_sent) for encoded_sent in test[2]]

    X_train = X_train + X_valid
    Y_train = Y_train + Y_valid
    
    return X_train, Y_train, X_test, Y_test, {'i2w':i2w,'w2i':w2i,'i2l':i2l,'l2i':l2i}

In [5]:
%%time
X_train, Y_train, X_test, Y_test, dicts = load_atis()

CPU times: user 1.7 s, sys: 47 ms, total: 1.74 s
Wall time: 1.82 s


In [68]:
label_set = list(dicts['l2i'].keys())

# Featurization (CRF & MaxEnt)

In [7]:
def crf_word_window(w_idx, X): # X: X_train[0]
    features = ['0'+X[w_idx]]
    if w_idx > 0:
        features.append('-1'+X[w_idx-1])
        if w_idx > 1:
            features.append('-2'+X[w_idx-2])
        else: features.append('BOS')
    else: features += ['BOS','BOS']
    if w_idx < len(X)-1:
        features.append('+1'+X[w_idx+1])
        if w_idx < len(X)-2:
            features.append('+2'+X[w_idx+2])
        else: features.append('EOS')
    else: features += ['EOS','EOS']
    return features

def maxent_word_window(w_idx, X): # X: X_train[0]
    features = {'0word':X[w_idx]}
    if w_idx > 0:
        features['-1word'] = X[w_idx-1]
        if w_idx > 1:
            features['-2word'] = X[w_idx-2]
        else: features['-2word'] = 'BOS'
    else: 
        features['-2word'] = features['-1word'] = 'BOS'
    if w_idx < len(X)-1:
        features['+1word'] = X[w_idx+1]
        if w_idx < len(X)-2:
            features['+2word'] = X[w_idx+2]
        else: features['+2word'] = 'EOS'
    else: features['+2word'] = features['+1word'] = 'EOS'
    return features

def sent_window(X, word_window):
    return [word_window(w_idx, X) for w_idx in xrange(len(X))]

In [8]:
sample = X_train[0]

In [9]:
sent_window(sample,word_window=crf_word_window)

[['0what', 'BOS', 'BOS', '+1flights', '+2leave'],
 ['0flights', '-1what', 'BOS', '+1leave', '+2atlanta'],
 ['0leave', '-1flights', '-2what', '+1atlanta', '+2at'],
 ['0atlanta', '-1leave', '-2flights', '+1at', '+2about'],
 ['0at', '-1atlanta', '-2leave', '+1about', '+2DIGIT'],
 ['0about', '-1at', '-2atlanta', '+1DIGIT', '+2in'],
 ['0DIGIT', '-1about', '-2at', '+1in', '+2the'],
 ['0in', '-1DIGIT', '-2about', '+1the', '+2afternoon'],
 ['0the', '-1in', '-2DIGIT', '+1afternoon', '+2and'],
 ['0afternoon', '-1the', '-2in', '+1and', '+2arrive'],
 ['0and', '-1afternoon', '-2the', '+1arrive', '+2in'],
 ['0arrive', '-1and', '-2afternoon', '+1in', '+2san'],
 ['0in', '-1arrive', '-2and', '+1san', '+2francisco'],
 ['0san', '-1in', '-2arrive', '+1francisco', 'EOS'],
 ['0francisco', '-1san', '-2in', 'EOS', 'EOS']]

In [10]:
sent_window(sample,word_window=maxent_word_window)

[{'+1word': 'flights',
  '+2word': 'leave',
  '-1word': 'BOS',
  '-2word': 'BOS',
  '0word': 'what'},
 {'+1word': 'leave',
  '+2word': 'atlanta',
  '-1word': 'what',
  '-2word': 'BOS',
  '0word': 'flights'},
 {'+1word': 'atlanta',
  '+2word': 'at',
  '-1word': 'flights',
  '-2word': 'what',
  '0word': 'leave'},
 {'+1word': 'at',
  '+2word': 'about',
  '-1word': 'leave',
  '-2word': 'flights',
  '0word': 'atlanta'},
 {'+1word': 'about',
  '+2word': 'DIGIT',
  '-1word': 'atlanta',
  '-2word': 'leave',
  '0word': 'at'},
 {'+1word': 'DIGIT',
  '+2word': 'in',
  '-1word': 'at',
  '-2word': 'atlanta',
  '0word': 'about'},
 {'+1word': 'in',
  '+2word': 'the',
  '-1word': 'about',
  '-2word': 'at',
  '0word': 'DIGIT'},
 {'+1word': 'the',
  '+2word': 'afternoon',
  '-1word': 'DIGIT',
  '-2word': 'about',
  '0word': 'in'},
 {'+1word': 'afternoon',
  '+2word': 'and',
  '-1word': 'in',
  '-2word': 'DIGIT',
  '0word': 'the'},
 {'+1word': 'and',
  '+2word': 'arrive',
  '-1word': 'the',
  '-2word': '

# CRF Tagging

In [26]:
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import os, cPickle, time, random
from sklearn.metrics import accuracy_score
from itertools import chain

In [93]:
class CRF:
    
    def __init__(self, X_train, Y_train, X_test, Y_test, 
                 config={'c1': 1.0,'c2': 1e-3,
                         'max_iterations': 50,
                         'feature.possible_transitions':True}):
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        self.config = config
    
    def train(self):
        
        crf_ner = pycrfsuite.Trainer(verbose=0)
        crf_ner.set_params(self.config)
        X_train_featurized = [sent_window(X,word_window=crf_word_window) 
                              for X in X_train]
        for X,Y in zip(X_train_featurized, self.Y_train):
            crf_ner.append(X,Y)
        crf_ner.train('words_window.crfsuite')
        self.tagger = pycrfsuite.Tagger()
        self.tagger.open('words_window.crfsuite')
        
    def get_confusion_by_cv(self, k=5):
        
        def cv_partition(X, Y, k=k):
            k = max(1,k)
            step = int(len(X)/k)
            chunked = []
            for i in xrange(0, len(X), step):
                if i+2*step<len(X):
                    chunked.append((X[i:i+step],Y[i:i+step]))
                else:
                    chunked.append((X[i:],Y[i:]))
            return chunked  
        
        train_chunks = cv_partition(self.X_train,self.Y_train)
        
        cms = {}
        
        for i in xrange(k):
            print "... cross validation %dth round" % (i+1)
            x_valid, y_valid = train_chunks[i]
            x_train, y_train = [], []
            for x,y in train_chunks[:i]+train_chunks[i+1:]:
                x_train += x
                y_train += y
            crf = pycrfsuite.Trainer(verbose=0)
            crf.set_params(self.config)
            x_train_featurized = [sent_window(x,word_window=crf_word_window)
                                  for x in x_train]
            for x,y in zip(x_train_featurized, y_train):
                crf.append(x,y)
            crf.train('cv_crf.crfsuite')
            tagger = pycrfsuite.Tagger()
            tagger.open('cv_crf.crfsuite')
            x_valid_featurized = [sent_window(x,word_window=crf_word_window)
                                 for x in x_valid]
            y_pred = [tagger.tag(sent) for sent in x_valid_featurized]
            y_true = y_valid
            y_true_merged = list(chain.from_iterable(y_true))
            y_pred_merged = list(chain.from_iterable(y_pred))  
            labels = list(set(y_true_merged))
            label2cmidx = {l:i for i,l in enumerate(labels)}
            cms[i] = {'cm':confusion_matrix(y_true_merged,y_pred_merged,labels),
                      'l2i':label2cmidx, 'i2l':labels}
        
        return cms     
        
    def tag(self):
        
        X_test_featurized = [sent_window(X,word_window=crf_word_window) 
                             for X in X_test]
        y_pred = [self.tagger.tag(sent) for sent in X_test_featurized]
        
        return y_pred
    
    def evaluate(self):
        
        y_true = self.Y_test
        y_pred = self.tag()
        y_true_merged = list(chain.from_iterable(y_true))
        y_pred_merged = list(chain.from_iterable(y_pred))
        
        print "Accuracy: %.6f%%" % (accuracy_score(y_true_merged,y_pred_merged)*100)
        

In [94]:
%%time
crf = CRF(X_train,Y_train,X_test,Y_test)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 8.82 µs


In [87]:
crf.evaluate() # result from running .train()

Accuracy: 96.216569%


In [95]:
%%time
cms = crf.get_confusion_by_cv()

... cross validation 1th round
... cross validation 2th round
... cross validation 3th round
... cross validation 4th round
... cross validation 5th round
CPU times: user 7min 54s, sys: 1.74 s, total: 7min 56s
Wall time: 7min 57s


##### GET THE LIST CONFUSED LABEL-TYPE (EXPERIMENTING RADICAL SET NOW)

In [97]:
from collections import defaultdict
from __future__ import division

In [102]:
div = lambda x,y: 0 if y==0 else x/y

In [109]:
def get_confusion(cm, l2i, i2l):
    
    confused = defaultdict(list)
    for l in l2i.iterkeys():
        l_vec = cm[:,l2i[l]]
        if div(l_vec[l2i[l]],l_vec.sum())<.5:
            label_list = [i2l[i] for i,l_i in enumerate(l_vec) if l_i!=0]
            if len(label_list) > 0:
                confused[l] = label_list
    
    return confused
            

In [110]:
%%time
cms_confused = []
for i in cms.iterkeys():
    cms_confused.append(get_confusion(cms[i]['cm'],cms[i]['l2i'],cms[i]['i2l']))

CPU times: user 4.21 ms, sys: 1.68 ms, total: 5.89 ms
Wall time: 4.62 ms


In [113]:
for confused in cms_confused:
    print confused
    print

defaultdict(<type 'list'>, {'I-toloc.airport_name': ['I-airport_name', 'O', 'I-toloc.airport_name', 'I-fromloc.airport_name'], 'B-arrive_time.period_of_day': ['B-depart_time.period_of_day'], 'B-toloc.airport_name': ['B-fromloc.airport_name', 'B-airport_name', 'B-toloc.airport_name']})

defaultdict(<type 'list'>, {'I-toloc.airport_name': ['I-airport_name', 'I-fromloc.airport_name', 'I-toloc.airport_name'], 'B-arrive_time.period_of_day': ['B-depart_time.period_of_day', 'B-arrive_time.time'], 'I-flight_mod': ['B-fromloc.city_name']})

defaultdict(<type 'list'>, {})

defaultdict(<type 'list'>, {'B-arrive_time.period_of_day': ['B-arrive_time.period_of_day', 'B-depart_time.period_of_day'], 'I-flight_mod': ['B-arrive_time.time', 'O']})

defaultdict(<type 'list'>, {'I-city_name': ['I-toloc.city_name', 'B-city_name', 'I-city_name', 'O', 'B-toloc.city_name'], 'I-flight_mod': ['B-flight_time', 'O', 'I-flight_time']})



In [None]:
# def conservative_set(cms_confused): # takes the intersection of all confused label types.
    
#     all_confused = defaultdict(list)
#     keys = list({key for confused in cms_confused for key in confused.keys()})
#     for key in keys:
#         for confused in cms_confused:
#             if key in confused.keys():

# this doesn't make much sense actually, because we'd often get {} as the intersection with high-performance models.
                

In [114]:
def radical_set(cms_confused): # takes the union of all confused label types.
    
    all_confused = defaultdict(list)
    for confused in cms_confused:
        for label in confused.iterkeys():
            if label in all_confused.keys():
                all_confused[label] += confused[label]
            else:
                all_confused[label] = confused[label]
    
    for label in all_confused.iterkeys():
        all_confused[label] = list(set(all_confused[label]))
    
    return all_confused

In [115]:
radical_confused = radical_set(cms_confused)

In [116]:
radical_confused

defaultdict(list,
            {'B-arrive_time.period_of_day': ['B-depart_time.period_of_day',
              'B-arrive_time.time',
              'B-arrive_time.period_of_day'],
             'B-toloc.airport_name': ['B-airport_name',
              'B-fromloc.airport_name',
              'B-toloc.airport_name'],
             'I-city_name': ['B-toloc.city_name',
              'B-city_name',
              'I-toloc.city_name',
              'I-city_name',
              'O'],
             'I-flight_mod': ['B-arrive_time.time',
              'I-flight_time',
              'B-flight_time',
              'O',
              'B-fromloc.city_name'],
             'I-toloc.airport_name': ['I-toloc.airport_name',
              'I-airport_name',
              'O',
              'I-fromloc.airport_name']})

##### TRAIN MODEL TO GET PRED(X_TEST)

In [118]:
%%time
crf.train()

CPU times: user 2min 7s, sys: 461 ms, total: 2min 7s
Wall time: 2min 8s


In [159]:
X_test_featurized_crf = [sent_window(X,word_window=crf_word_window)
                     for X in crf.X_test]
X_test_featurized_maxent = [] # a list of featurized x's
for X in crf.X_test:
    X_test_featurized_maxent += sent_window(X,word_window=maxent_word_window)
y_pred = [crf.tagger.tag(sent) for sent in X_test_featurized_crf]

In [160]:
y_true = crf.Y_test

In [161]:
y_true_merged = list(chain.from_iterable(y_true)) # word/sent order preserved
y_pred_merged = list(chain.from_iterable(y_pred))

In [162]:
# BEFORE RETAGGING
print "Accuracy: %.6f%%" % (accuracy_score(y_true_merged,y_pred_merged)*100)

Accuracy: 96.216569%


In [163]:
def retag_check(X,Y): # X: featurized X_test for maxent; Y: y_pred_merged from crf
    
    for i,y in enumerate(Y):
        if y in radical_confused.keys():
            max_label = ('',0)
            maxent_probdist = maxent.tagger.prob_classify(X[i])
            for label in radical_confused[y]:
                p_label = maxent_probdist.prob(label)
                if p_label >= max_label[1]:
                    max_label = (label,p_label)
            print 'old tag: ', Y[i], '; new tag: ', max_label[0], '(',max_label[1],')', '; true tag: ', y_true_merged[i]
            Y[i] = max_label[0]
    

In [164]:
retag_check(X_test_featurized_maxent,y_pred_merged)

old tag:  I-city_name ; new tag:  I-city_name ( 0.727243481069 ) ; true tag:  I-city_name
old tag:  B-toloc.airport_name ; new tag:  B-toloc.airport_name ( 0.80861774295 ) ; true tag:  B-toloc.airport_name
old tag:  I-toloc.airport_name ; new tag:  I-toloc.airport_name ( 0.625527480815 ) ; true tag:  I-toloc.airport_name
old tag:  I-city_name ; new tag:  I-city_name ( 0.235407372141 ) ; true tag:  I-airport_name
old tag:  I-city_name ; new tag:  I-city_name ( 0.21746858435 ) ; true tag:  I-airport_name
old tag:  I-city_name ; new tag:  I-toloc.city_name ( 0.566995195562 ) ; true tag:  I-city_name
old tag:  I-city_name ; new tag:  I-city_name ( 0.161607521087 ) ; true tag:  B-period_of_day
old tag:  I-city_name ; new tag:  O ( 0.0848734115861 ) ; true tag:  B-period_of_day
old tag:  I-city_name ; new tag:  I-toloc.city_name ( 0.150202467149 ) ; true tag:  I-city_name
old tag:  B-toloc.airport_name ; new tag:  B-toloc.airport_name ( 0.0913130872196 ) ; true tag:  B-depart_time.time
old t

In [165]:
# AFTER RETAGGING
print "Accuracy: %.6f%%" % (accuracy_score(y_true_merged,y_pred_merged)*100)

Accuracy: 96.151337%


# MaxEnt Retagging

In [14]:
import random
import numpy as np
from nltk import MaxentClassifier, classify

In [37]:
class MaxEnt:
    
    def __init__(self,X_train, Y_train, X_test, Y_test):
        self.X_train, self.Y_train = X_train, Y_train
        self.X_test, self.Y_test = X_test, Y_test
        self.train()
    
    def train(self):
        
        X_train_featurized = [sent_window(X,word_window=maxent_word_window) 
                              for X in self.X_train]
        train_featurized = [(w_fts,label) 
                            for X,Y in zip(X_train_featurized,self.Y_train)
                            for w_fts,label in zip(X,Y)]
        self.tagger = MaxentClassifier.train(train_featurized, 
                                             algorithm='megam', 
                                             trace=3, max_iter=10)
    
    def tag(self):
        return
    
    def evaluate(self):
        X_test_featurized = [sent_window(X,word_window=maxent_word_window) 
                              for X in self.X_test]
        test_featurized = [(w_fts,label) 
                           for X,Y in zip(X_test_featurized,self.Y_test) 
                           for w_fts,label in zip(X,Y)]
        print "Accuracy: %.6f%%" % (classify.accuracy(self.tagger,test_featurized)*100)
        

In [38]:
%%time
maxent = MaxEnt(X_train,Y_train,X_test,Y_test)

CPU times: user 29.7 s, sys: 373 ms, total: 30 s
Wall time: 9min 56s


In [39]:
maxent.evaluate()

Accuracy: 95.596869%
