# ATIS in LUIS

* **Task I:** ATIS raw data => LUIS-json app format (importable to LUIS).
* **Task II:** 
    * a. Validation: using luis.Luis(..).analyze to predict on valid/test set (i.e. y_hat), then compare that with the actual labeled data in atis pickle (i.e. y_true).
    * b. Evaluation: Jaccard and F1-score.

### A. Labeled Data $\Rightarrow$ LUIS JSON

In [139]:
import json, gzip, pickle

In [140]:
import os
os.chdir('/Users/jacobsw/Desktop')

In [141]:
with open('ojoatis.json') as json_file:
    ojoatis = json.load(json_file)
ojoatis['entities'][0]['children'].append(u'arrive_time.time')
ojoatis['entities'][2]['children'][5] = u'return_time.period_mod'
ojoatis['entities'][2]['children'][6] = u'return_time.period_of_day'
joshentities = ojoatis['entities']
''' FORMAT:
[{u'children': [u'arrive_time.end_time',
   u'arrive_time.period_mod',
   u'arrive_time.period_of_day',
   u'arrive_time.start_time',
   u'arrive_time.time_relative'],
  u'name': u'arrive_time'},
 {u'children': [u'depart_time.end_time',
   u'depart_time.period_mod',
   u'depart_time.period_of_day',
   u'depart_time.start_time',
   u'depart_time.time',
   u'depart_time.time_relative',
   u'meal',
   u'meal_description'],
  u'name': u'depart_time_meal'},
  ...
'''
print 'DONE'

DONE


In [142]:
def entities_dict_transform(entities):
    new_dict = {}
    for entry in entities:
        for child in entry['children']:
            new_dict[child] = entry['name']
    return new_dict
''' TRANSFORMED:
{u'aircraft_code': u'codes_types',
 u'airline_code': u'codes_types',
 u'airline_name': u'from_loc',
 u'airport_code': u'codes_types',
 u'airport_name': u'from_loc',
 u'arrive_date.date_relative': u'arrive_date',
 u'arrive_date.day_name': u'arrive_date',
 u'arrive_date.day_number': u'arrive_date',
 ...
'''
print 'DONE'

DONE


In [143]:
joshentities = entities_dict_transform(joshentities)

In [145]:
class ATIS:
    # takes an atis pickle, and extract utterances in luis-json format.
    
    def __init__(self, filename='/Users/jacobsw/Downloads/atis.fold1.pkl.gz',
                       entities=joshentities):
        self.filename = filename
        self.entities = entities
        self.__load()
    
    def __load(self):
        f = gzip.open(self.filename, 'rb')
        self.train_set, self.valid_set, self.test_set, self.dicts = pickle.load(f)
        f.close()
        self.idx2labels = {idx:label for (label,idx) in self.dicts['labels2idx'].iteritems()}
        self.idx2words = {idx:word for (word,idx) in self.dicts['words2idx'].iteritems()}
    
    def get_annotated_sent(self, sent_idx, source='train'):
        assert source in ['train','valid','test']
        if source=='train': src = self.train_set
        elif source=='valid': src = self.valid_set
        else: src = self.test_set
        annotated = zip(map(self.idx2words.get,src[0][sent_idx]),
                        map(self.idx2labels.get,src[2][sent_idx]))
        return annotated
    
    def get_utterance_json(self, sent_idx, source='train'):
        annotated = self.get_annotated_sent(sent_idx, source)
        utterance = {u'text':unicode(' '.join([word for word,label in annotated])),
                 u'intent':'None',
                 u'entities':[]}
        entity = {u'entity':'',u'startPos':0,u'endPos':0}
        for i,item in enumerate(annotated):
            current_entity = ''
            if item[1].startswith('B'):
                entity['startPos'] = i
                current_entity = item[1].split('-')[1]
                entity['entity'] = self.entities[current_entity] + '::' + current_entity
                if i+1>=len(annotated) or annotated[i+1][1]=='O':
                    entity['endPos'] = i
                    utterance['entities'].append(entity)
                    entity = {u'entity':'',u'startPos':0,u'endPos':0}
                    current_entity = ''
                    continue
            elif item[1].startswith('I'):
                if i+1>=len(annotated) or annotated[i+1][1]=='O':
                    entity['endPos'] = i
                    utterance['entities'].append(entity)
                    entity = {u'entity':'',u'startPos':0,u'endPos':0}
                    current_entity = ''
                else: continue
            else: continue
        return utterance 
    
    def populate_utterances(self, luis_json, num_utterances=2000):
        # assuming luis_json is an exported app which doesn't have many labeled sents.
        for i in xrange(num_utterances):
            luis_json['utterances'].append(self.get_utterance_json(i))
        return luis_json
        

In [146]:
atis = ATIS()

In [147]:
atis.get_utterance_json(0)

{u'entities': [{u'endPos': 5,
   u'entity': u'from_loc::airline_name',
   u'startPos': 5},
  {u'endPos': 7, u'entity': u'flight::flight_number', u'startPos': 7},
  {u'endPos': 10, u'entity': u'from_loc::fromloc.city_name', u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 12}],
 u'intent': 'None',
 u'text': u'what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city'}

In [148]:
new_ojoatis = atis.populate_utterances(ojoatis)
''' FORMAT:
{u'actions': [],
 u'bing_entities': [],
 u'composites': [],
 u'culture': u'en-us',
 u'desc': u'ner',
 u'entities': [{u'children': [u'arrive_time.end_time',
    u'arrive_time.period_mod',
    u'arrive_time.period_of_day',
    u'arrive_time.start_time',
    u'arrive_time.time_relative',
    u'arrive_time.time'],
   u'name': u'arrive_time'},
'''

In [150]:
with open('new_ojoatis.json', 'w') as f:
     json.dump(new_ojoatis, f)

In [308]:
# EXAMPLE: LABELED DATA
zip(map(atis.idx2words.get,atis.valid_set[0][0]),
    map(atis.idx2labels.get,atis.valid_set[2][0]))

[('what', 'O'),
 ('flights', 'O'),
 ('leave', 'O'),
 ('atlanta', 'B-fromloc.city_name'),
 ('at', 'O'),
 ('about', 'B-depart_time.time_relative'),
 ('DIGIT', 'B-depart_time.time'),
 ('in', 'O'),
 ('the', 'O'),
 ('afternoon', 'B-depart_time.period_of_day'),
 ('and', 'O'),
 ('arrive', 'O'),
 ('in', 'O'),
 ('san', 'B-toloc.city_name'),
 ('francisco', 'I-toloc.city_name')]

### B. Facilities for Importing New Trained Data

In [309]:
import luis

In [167]:
l = luis.Luis(url="https://api.projectoxford.ai/luis/v1/application?id=e13e2ede-be9b-46cf-a568-7a3d92c4fcba&subscription-key=952396816ccf4d869657b9ef49533fb6")

#### TEST: valid sentence 0

In [154]:
val0_true = atis.get_utterance_json(0,source='valid')
val0_true

{u'entities': [{u'endPos': 3,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 3},
  {u'endPos': 6,
   u'entity': u'depart_time_meal::depart_time.time',
   u'startPos': 6},
  {u'endPos': 9,
   u'entity': u'depart_time_meal::depart_time.period_of_day',
   u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 13}],
 u'intent': 'None',
 u'text': u'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco'}

In [173]:
val0_luispy = l.analyze(val0_raw).entities

In [174]:
val0_luispy

[<Entity entity=u'atlanta' type=u'from_loc::fromloc.city_name' score=0.9590335 start_index=19 end_index=25>,
 <Entity entity=u'afternoon' type=u'depart_time_meal::depart_time.period_of_day' score=0.856796861 start_index=49 end_index=57>,
 <Entity entity=u'san francisco' type=u'stop_to_loc::toloc.city_name' score=0.871793866 start_index=73 end_index=85>]

In [231]:
val0_true

{u'entities': [{u'endPos': 3,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 3},
  {u'endPos': 6,
   u'entity': u'depart_time_meal::depart_time.time',
   u'startPos': 6},
  {u'endPos': 9,
   u'entity': u'depart_time_meal::depart_time.period_of_day',
   u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 13}],
 u'intent': 'None',
 u'text': u'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco'}

##### ================================ ATIS => [ (ent_txt: '..', ent_lb: '..'), (..), ..] ================================

In [262]:
val0_true = atis.get_utterance_json(0,source='valid')
val0_true

{u'entities': [{u'endPos': 3,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 3},
  {u'endPos': 6,
   u'entity': u'depart_time_meal::depart_time.time',
   u'startPos': 6},
  {u'endPos': 9,
   u'entity': u'depart_time_meal::depart_time.period_of_day',
   u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 13}],
 u'intent': 'None',
 u'text': u'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco'}

In [246]:
def utterance_json_to_entities_list(utterance):
    text = utterance['text'].split()
    pairs = []
    for entity in utterance['entities']:
        pairs.append((' '.join(text[entity['startPos']:entity['endPos']+1]),
                      entity['entity']))
    return pairs

In [247]:
utterance_json_to_entities_list(val0_true)

[(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]

##### ================================================================================================

##### ================================ LUISPY => [ (ent_txt: '..', ent_lb: '..'), (..), ..] ================================

In [263]:
val0_luispy = l.analyze(val0_raw).entities
val0_luispy

[<Entity entity=u'atlanta' type=u'from_loc::fromloc.city_name' score=0.9590335 start_index=19 end_index=25>,
 <Entity entity=u'afternoon' type=u'depart_time_meal::depart_time.period_of_day' score=0.856796861 start_index=49 end_index=57>,
 <Entity entity=u'san francisco' type=u'stop_to_loc::toloc.city_name' score=0.871793866 start_index=73 end_index=85>]

In [228]:
def luispy_to_entities_list(luispy_entities):
    pairs = []
    for entity in luispy_entities:
        entity_info = str(entity).split("=") 
            # ['<Entity entity',
            #  "u'san francisco' type",
            #  "u'stop_to_loc::toloc.city_name' score",
            #  '0.871793866 start_index',
            #  '73 end_index',
            #  '85>']
        pairs.append((entity_info[1].split('\'')[1],
                      entity_info[2].split()[0][2:-1]))
            # tuple: (ent_text: ..., ent_label: ...)
            # entity_info[1].split('\''): ['u', 'san francisco', ' type']
            # entity_info[2].split()[0][2:-1]: 'stop_to_loc::toloc.city_name'
    return pairs

In [230]:
luispy_to_entities_list(val0_luispy)

[('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]

##### ================================================================================================

##### SIMILARITY between true and hat: JACCARD

In [None]:
# TRUE:
'''
[(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]
'''
# HAT:
'''
[('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]
'''

In [249]:
(u'atlanta', u'from_loc::fromloc.city_name') == ('atlanta', 'from_loc::fromloc.city_name')

True

In [251]:
true = [(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]
hat = [('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]

In [260]:
def jaccard(true,hat):
    return len(set(true).intersection(set(hat))) / \
               float(len(set(true).union(set(hat))))

In [261]:
jaccard(true,hat)

0.75

In [256]:
set(true).intersection(set(hat))

{('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('atlanta', 'from_loc::fromloc.city_name'),
 ('san francisco', 'stop_to_loc::toloc.city_name')}

In [257]:
set(true).union(set(hat))

{(u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'atlanta', u'from_loc::fromloc.city_name'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')}

### C. 100 Validation

#### a. Jaccard

In [292]:
val100_raw = [atis.get_utterance_json(i,source='valid') for i in xrange(100)] # from atis (train=998)

In [293]:
val100_true = [utterance_json_to_entities_list(utterance) for utterance in val100_raw]

In [294]:
%%time
val100_hat = []
for i in xrange(100):
    val_hat = l.analyze(val100_raw[i]['text']).entities
    val100_hat.append(luispy_to_entities_list(val_hat))
    if i%10==0:
        print "... processed %d sentences" % i

... processed 0 sentences
... processed 10 sentences
... processed 20 sentences
... processed 30 sentences
... processed 40 sentences
... processed 50 sentences
... processed 60 sentences
... processed 70 sentences
... processed 80 sentences
... processed 90 sentences
CPU times: user 2.22 s, sys: 104 ms, total: 2.32 s
Wall time: 28.9 s


In [295]:
import numpy as np
val100_jaccard = [jaccard(true,hat) for true,hat in zip(val100_true,val100_hat)]
print np.mean(val100_jaccard)

0.737619047619


In [310]:
val100_true[:5]

[[(u'atlanta', u'from_loc::fromloc.city_name'),
  (u'DIGIT', u'depart_time_meal::depart_time.time'),
  (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
  (u'san francisco', u'stop_to_loc::toloc.city_name')],
 [(u'canadian airlines international', u'from_loc::airline_name')],
 [(u'earliest', u'flight::flight_mod'),
  (u'boston', u'from_loc::fromloc.city_name'),
  (u'atlanta', u'stop_to_loc::toloc.city_name')],
 [(u'us air', u'from_loc::airline_name'),
  (u'atlanta', u'from_loc::fromloc.city_name'),
  (u'boston', u'stop_to_loc::toloc.city_name')],
 [(u'round trips', u'flight::round_trip'),
  (u'dallas', u'from_loc::fromloc.city_name'),
  (u'baltimore', u'stop_to_loc::toloc.city_name')]]

In [311]:
val100_hat[:5]

[[('atlanta', 'from_loc::fromloc.city_name'),
  ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
  ('san francisco', 'stop_to_loc::toloc.city_name')],
 [],
 [('earliest', 'flight::flight_mod'),
  ('boston', 'from_loc::fromloc.city_name'),
  ('atlanta', 'stop_to_loc::toloc.city_name')],
 [('us air', 'from_loc::airline_name'),
  ('atlanta', 'from_loc::fromloc.city_name'),
  ('boston', 'stop_to_loc::toloc.city_name')],
 [('dallas', 'from_loc::fromloc.city_name'),
  ('baltimore', 'stop_to_loc::toloc.city_name')]]

#### b. F1

In [301]:
from __future__ import division

In [306]:
def f1(trues,hats):
    tp, fp, fn = 0, 0, 0
    for true,hat in zip(trues,hats):
        entries = set(true+hat) # all unique entries for this pair
        for entry in entries:
            if entry in hat and entry in true: tp += 1
            elif entry in hat and entry not in true: fp += 1
            elif entry in true and entry not in hat: fn += 1
            else: pass
    prec = tp/(tp+fp)
    rec = tp/(tp+fn)
    print "Precision: %.2f%% | Recall: %.2f%% | F1: %.2f" % (prec*100, rec*100, ((2*prec*rec)/(prec+rec))*100)

In [307]:
f1(val100_true,val100_hat)

Precision: 88.00% | Recall: 78.29% | F1: 82.86


In [299]:
val100_true[0]

[(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]

In [298]:
val100_hat[0]

[('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]

In [300]:
set(val100_true[0]+val100_hat[0])

{(u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'atlanta', u'from_loc::fromloc.city_name'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')}

# Pipeline: LUIS => RNN

* **Task I:** Take LUIS output (list of labeled entities with sents), translate it into RNN-input.
* **Task II:** Evaluation (MAYBE)