# ATIS in LUIS

* **Task I:** ATIS raw data => LUIS-json app format (importable to LUIS).
* **Task II:** 
    * a. Validation: using luis.Luis(..).analyze to predict on valid/test set (i.e. y_hat), then compare that with the actual labeled data in atis pickle (i.e. y_true).
    * b. Evaluation: Jaccard and F1-score.

### A. Labeled Data $\Rightarrow$ LUIS JSON

In [439]:
import json, gzip, pickle

In [440]:
import os
os.chdir('/Users/jacobsw/Desktop')

In [441]:
with open('ojoatis.json') as json_file:
    ojoatis = json.load(json_file)
joshentities = ojoatis['entities']
''' FORMAT:
[{u'children': [u'arrive_time.end_time',
   u'arrive_time.period_mod',
   u'arrive_time.period_of_day',
   u'arrive_time.start_time',
   u'arrive_time.time_relative'],
  u'name': u'arrive_time'},
 {u'children': [u'depart_time.end_time',
   u'depart_time.period_mod',
   u'depart_time.period_of_day',
   u'depart_time.start_time',
   u'depart_time.time',
   u'depart_time.time_relative',
   u'meal',
   u'meal_description'],
  u'name': u'depart_time_meal'},
  ...
'''
print 'DONE'

DONE


In [None]:
ojoatis

In [12]:
def entities_dict_transform(entities):
    new_dict = {}
    for entry in entities:
        for child in entry['children']:
            new_dict[child] = entry['name']
    return new_dict
''' TRANSFORMED:
{u'aircraft_code': u'codes_types',
 u'airline_code': u'codes_types',
 u'airline_name': u'from_loc',
 u'airport_code': u'codes_types',
 u'airport_name': u'from_loc',
 u'arrive_date.date_relative': u'arrive_date',
 u'arrive_date.day_name': u'arrive_date',
 u'arrive_date.day_number': u'arrive_date',
 ...
'''
print 'DONE'

DONE


In [137]:
joshentities

[{u'children': [u'arrive_time.end_time',
   u'arrive_time.period_mod',
   u'arrive_time.period_of_day',
   u'arrive_time.start_time',
   u'arrive_time.time_relative',
   u'arrive_time.time'],
  u'name': u'arrive_time'},
 {u'children': [u'depart_time.end_time',
   u'depart_time.period_mod',
   u'depart_time.period_of_day',
   u'depart_time.start_time',
   u'depart_time.time',
   u'depart_time.time_relative',
   u'meal',
   u'meal_description'],
  u'name': u'depart_time_meal'},
 {u'children': [u'return_date.date_relative',
   u'return_date.day_name',
   u'return_date.day_number',
   u'return_date.month_name',
   u'return_date.today_relative',
   u'return_time.period_mod',
   u'return_time.period_of_day',
   u'cost_relative',
   u'fare_amount'],
  u'name': u'return_cost'},
 {u'children': [u'flight',
   u'flight_days',
   u'flight_mod',
   u'flight_number',
   u'flight_stop',
   u'flight_time',
   u'round_trip',
   u'compartment',
   u'economy',
   u'connect'],
  u'name': u'flight'},
 {u'c

In [13]:
joshentities = entities_dict_transform(joshentities)

In [14]:
class ATIS:
    # takes an atis pickle, and extract utterances in luis-json format.
    
    def __init__(self, filename='/Users/jacobsw/Downloads/atis.fold1.pkl.gz',
                       entities=joshentities):
        self.filename = filename
        self.entities = entities
        self.__load()
    
    def __load(self):
        f = gzip.open(self.filename, 'rb')
        self.train_set, self.valid_set, self.test_set, self.dicts = pickle.load(f)
        f.close()
        self.idx2labels = {idx:label for (label,idx) in self.dicts['labels2idx'].iteritems()}
        self.idx2words = {idx:word for (word,idx) in self.dicts['words2idx'].iteritems()}
    
    def get_annotated_sent(self, sent_idx, source='train'):
        assert source in ['train','valid','test']
        if source=='train': src = self.train_set
        elif source=='valid': src = self.valid_set
        else: src = self.test_set
        annotated = zip(map(self.idx2words.get,src[0][sent_idx]),
                        map(self.idx2labels.get,src[2][sent_idx]))
        return annotated
    
    def get_utterance_json(self, sent_idx, source='train'):
        annotated = self.get_annotated_sent(sent_idx, source)
        utterance = {u'text':unicode(' '.join([word for word,label in annotated])),
                 u'intent':'None',
                 u'entities':[]}
        entity = {u'entity':'',u'startPos':0,u'endPos':0}
        for i,item in enumerate(annotated):
            current_entity = ''
            if item[1].startswith('B'):
                entity['startPos'] = i
                current_entity = item[1].split('-')[1]
                entity['entity'] = self.entities[current_entity] + '::' + current_entity
                if i+1>=len(annotated) or annotated[i+1][1]=='O':
                    entity['endPos'] = i
                    utterance['entities'].append(entity)
                    entity = {u'entity':'',u'startPos':0,u'endPos':0}
                    current_entity = ''
                    continue
            elif item[1].startswith('I'):
                if i+1>=len(annotated) or annotated[i+1][1]=='O':
                    entity['endPos'] = i
                    utterance['entities'].append(entity)
                    entity = {u'entity':'',u'startPos':0,u'endPos':0}
                    current_entity = ''
                else: continue
            else: continue
        return utterance 
    
    def populate_utterances(self, luis_json, num_utterances=2000):
        # assuming luis_json is an exported app which doesn't have many labeled sents.
        for i in xrange(num_utterances):
            luis_json['utterances'].append(self.get_utterance_json(i))
        return luis_json
        

In [17]:
atis = ATIS()

In [147]:
atis.get_utterance_json(0)

{u'entities': [{u'endPos': 5,
   u'entity': u'from_loc::airline_name',
   u'startPos': 5},
  {u'endPos': 7, u'entity': u'flight::flight_number', u'startPos': 7},
  {u'endPos': 10, u'entity': u'from_loc::fromloc.city_name', u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 12}],
 u'intent': 'None',
 u'text': u'what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city'}

In [148]:
new_ojoatis = atis.populate_utterances(ojoatis)
''' FORMAT:
{u'actions': [],
 u'bing_entities': [],
 u'composites': [],
 u'culture': u'en-us',
 u'desc': u'ner',
 u'entities': [{u'children': [u'arrive_time.end_time',
    u'arrive_time.period_mod',
    u'arrive_time.period_of_day',
    u'arrive_time.start_time',
    u'arrive_time.time_relative',
    u'arrive_time.time'],
   u'name': u'arrive_time'},
'''

In [150]:
with open('new_ojoatis.json', 'w') as f:
     json.dump(new_ojoatis, f)

In [308]:
# EXAMPLE: LABELED DATA
zip(map(atis.idx2words.get,atis.valid_set[0][0]),
    map(atis.idx2labels.get,atis.valid_set[2][0]))

[('what', 'O'),
 ('flights', 'O'),
 ('leave', 'O'),
 ('atlanta', 'B-fromloc.city_name'),
 ('at', 'O'),
 ('about', 'B-depart_time.time_relative'),
 ('DIGIT', 'B-depart_time.time'),
 ('in', 'O'),
 ('the', 'O'),
 ('afternoon', 'B-depart_time.period_of_day'),
 ('and', 'O'),
 ('arrive', 'O'),
 ('in', 'O'),
 ('san', 'B-toloc.city_name'),
 ('francisco', 'I-toloc.city_name')]

### B. Facilities for Importing New Trained Data

In [374]:
import luis

In [167]:
l = luis.Luis(url="https://api.projectoxford.ai/luis/v1/application?id=e13e2ede-be9b-46cf-a568-7a3d92c4fcba&subscription-key=952396816ccf4d869657b9ef49533fb6")

#### TEST: valid sentence 0

In [154]:
val0_true = atis.get_utterance_json(0,source='valid')
val0_true

{u'entities': [{u'endPos': 3,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 3},
  {u'endPos': 6,
   u'entity': u'depart_time_meal::depart_time.time',
   u'startPos': 6},
  {u'endPos': 9,
   u'entity': u'depart_time_meal::depart_time.period_of_day',
   u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 13}],
 u'intent': 'None',
 u'text': u'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco'}

In [173]:
val0_luispy = l.analyze(val0_raw).entities

In [174]:
val0_luispy

[<Entity entity=u'atlanta' type=u'from_loc::fromloc.city_name' score=0.9590335 start_index=19 end_index=25>,
 <Entity entity=u'afternoon' type=u'depart_time_meal::depart_time.period_of_day' score=0.856796861 start_index=49 end_index=57>,
 <Entity entity=u'san francisco' type=u'stop_to_loc::toloc.city_name' score=0.871793866 start_index=73 end_index=85>]

In [231]:
val0_true

{u'entities': [{u'endPos': 3,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 3},
  {u'endPos': 6,
   u'entity': u'depart_time_meal::depart_time.time',
   u'startPos': 6},
  {u'endPos': 9,
   u'entity': u'depart_time_meal::depart_time.period_of_day',
   u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 13}],
 u'intent': 'None',
 u'text': u'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco'}

##### ================================ ATIS => [ (ent_txt: '..', ent_lb: '..'), (..), ..] ================================

In [262]:
val0_true = atis.get_utterance_json(0,source='valid')
val0_true

{u'entities': [{u'endPos': 3,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 3},
  {u'endPos': 6,
   u'entity': u'depart_time_meal::depart_time.time',
   u'startPos': 6},
  {u'endPos': 9,
   u'entity': u'depart_time_meal::depart_time.period_of_day',
   u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 13}],
 u'intent': 'None',
 u'text': u'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco'}

In [380]:
def utterance_json_to_entities_list(utterance):
    text = utterance['text'].split()
    pairs = []
    for entity in utterance['entities']:
        pairs.append((' '.join(text[entity['startPos']:entity['endPos']+1]),
                      entity['entity']))
    return pairs

In [247]:
utterance_json_to_entities_list(val0_true)

[(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]

##### ================================================================================================

##### ================================ LUISPY => [ (ent_txt: '..', ent_lb: '..'), (..), ..] ================================

In [263]:
val0_luispy = l.analyze(val0_raw).entities
val0_luispy

[<Entity entity=u'atlanta' type=u'from_loc::fromloc.city_name' score=0.9590335 start_index=19 end_index=25>,
 <Entity entity=u'afternoon' type=u'depart_time_meal::depart_time.period_of_day' score=0.856796861 start_index=49 end_index=57>,
 <Entity entity=u'san francisco' type=u'stop_to_loc::toloc.city_name' score=0.871793866 start_index=73 end_index=85>]

In [379]:
def luispy_to_entities_list(luispy_entities):
    pairs = []
    for entity in luispy_entities:
        entity_info = str(entity).split("=") 
            # ['<Entity entity',
            #  "u'san francisco' type",
            #  "u'stop_to_loc::toloc.city_name' score",
            #  '0.871793866 start_index',
            #  '73 end_index',
            #  '85>']
        pairs.append((entity_info[1].split('\'')[1],
                      entity_info[2].split()[0][2:-1]))
            # tuple: (ent_text: ..., ent_label: ...)
            # entity_info[1].split('\''): ['u', 'san francisco', ' type']
            # entity_info[2].split()[0][2:-1]: 'stop_to_loc::toloc.city_name'
    return pairs

In [230]:
luispy_to_entities_list(val0_luispy)

[('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]

##### ================================================================================================

##### SIMILARITY between true and hat: JACCARD

In [None]:
# TRUE:
'''
[(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]
'''
# HAT:
'''
[('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]
'''

In [249]:
(u'atlanta', u'from_loc::fromloc.city_name') == ('atlanta', 'from_loc::fromloc.city_name')

True

In [251]:
true = [(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]
hat = [('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]

In [384]:
def jaccard(true,hat):
    return len(set(true).intersection(set(hat))) / \
               float(len(set(true).union(set(hat))))

In [261]:
jaccard(true,hat)

0.75

In [256]:
set(true).intersection(set(hat))

{('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('atlanta', 'from_loc::fromloc.city_name'),
 ('san francisco', 'stop_to_loc::toloc.city_name')}

In [257]:
set(true).union(set(hat))

{(u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'atlanta', u'from_loc::fromloc.city_name'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')}

### C. 100 Validation

#### a. Jaccard

In [292]:
val100_raw = [atis.get_utterance_json(i,source='valid') for i in xrange(100)] # from atis (train=998)

In [293]:
val100_true = [utterance_json_to_entities_list(utterance) for utterance in val100_raw]

In [294]:
%%time
val100_hat = []
for i in xrange(100):
    val_hat = l.analyze(val100_raw[i]['text']).entities
    val100_hat.append(luispy_to_entities_list(val_hat))
    if i%10==0:
        print "... processed %d sentences" % i

... processed 0 sentences
... processed 10 sentences
... processed 20 sentences
... processed 30 sentences
... processed 40 sentences
... processed 50 sentences
... processed 60 sentences
... processed 70 sentences
... processed 80 sentences
... processed 90 sentences
CPU times: user 2.22 s, sys: 104 ms, total: 2.32 s
Wall time: 28.9 s


In [295]:
import numpy as np
val100_jaccard = [jaccard(true,hat) for true,hat in zip(val100_true,val100_hat)]
print np.mean(val100_jaccard)

0.737619047619


In [310]:
val100_true[:5]

[[(u'atlanta', u'from_loc::fromloc.city_name'),
  (u'DIGIT', u'depart_time_meal::depart_time.time'),
  (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
  (u'san francisco', u'stop_to_loc::toloc.city_name')],
 [(u'canadian airlines international', u'from_loc::airline_name')],
 [(u'earliest', u'flight::flight_mod'),
  (u'boston', u'from_loc::fromloc.city_name'),
  (u'atlanta', u'stop_to_loc::toloc.city_name')],
 [(u'us air', u'from_loc::airline_name'),
  (u'atlanta', u'from_loc::fromloc.city_name'),
  (u'boston', u'stop_to_loc::toloc.city_name')],
 [(u'round trips', u'flight::round_trip'),
  (u'dallas', u'from_loc::fromloc.city_name'),
  (u'baltimore', u'stop_to_loc::toloc.city_name')]]

In [311]:
val100_hat[:5]

[[('atlanta', 'from_loc::fromloc.city_name'),
  ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
  ('san francisco', 'stop_to_loc::toloc.city_name')],
 [],
 [('earliest', 'flight::flight_mod'),
  ('boston', 'from_loc::fromloc.city_name'),
  ('atlanta', 'stop_to_loc::toloc.city_name')],
 [('us air', 'from_loc::airline_name'),
  ('atlanta', 'from_loc::fromloc.city_name'),
  ('boston', 'stop_to_loc::toloc.city_name')],
 [('dallas', 'from_loc::fromloc.city_name'),
  ('baltimore', 'stop_to_loc::toloc.city_name')]]

#### b. F1

In [387]:
from __future__ import division

In [388]:
def f1(trues,hats):
    tp, fp, fn = 0, 0, 0
    for true,hat in zip(trues,hats):
        entries = set(true+hat) # all unique entries for this pair
        for entry in entries:
            if entry in hat and entry in true: tp += 1
            elif entry in hat and entry not in true: fp += 1
            elif entry in true and entry not in hat: fn += 1
            else: pass
    prec = tp/(tp+fp)
    rec = tp/(tp+fn)
    print "Precision: %.2f%% | Recall: %.2f%% | F1: %.2f" % (prec*100, rec*100, ((2*prec*rec)/(prec+rec))*100)

##### TRAINING ON LUIS Lite (997 sents)

In [307]:
f1(val100_true,val100_hat)

Precision: 88.00% | Recall: 78.29% | F1: 82.86


In [299]:
val100_true[0]

[(u'atlanta', u'from_loc::fromloc.city_name'),
 (u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')]

In [298]:
val100_hat[0]

[('atlanta', 'from_loc::fromloc.city_name'),
 ('afternoon', 'depart_time_meal::depart_time.period_of_day'),
 ('san francisco', 'stop_to_loc::toloc.city_name')]

In [300]:
set(val100_true[0]+val100_hat[0])

{(u'DIGIT', u'depart_time_meal::depart_time.time'),
 (u'afternoon', u'depart_time_meal::depart_time.period_of_day'),
 (u'atlanta', u'from_loc::fromloc.city_name'),
 (u'san francisco', u'stop_to_loc::toloc.city_name')}

##### TRAINING ON LUIS Medium (4332 sents)

In [434]:
l = luis.Luis(url="https://api.projectoxford.ai/luis/v1/application?id=b08f059d-299a-4223-aa3d-06d19964d2e9&subscription-key=952396816ccf4d869657b9ef49533fb6")

In [435]:
%%time
tst100_raw = [atis.get_utterance_json(i,source='test') for i in xrange(100)] # from atis (train=998)
tst100_true = [utterance_json_to_entities_list(utterance) for utterance in tst100_raw]
tst100_hat = []
for i in xrange(100):
    tst_hat = l.analyze(tst100_raw[i]['text']).entities
    tst100_hat.append(luispy_to_entities_list(tst_hat))
    if i%10==0:
        print "... processed %d sentences" % i

... processed 0 sentences
... processed 10 sentences
... processed 20 sentences
... processed 30 sentences
... processed 40 sentences
... processed 50 sentences
... processed 60 sentences
... processed 70 sentences
... processed 80 sentences
... processed 90 sentences
CPU times: user 2.32 s, sys: 106 ms, total: 2.42 s
Wall time: 35.6 s


In [436]:
import numpy as np
tst100_jaccard = [jaccard(true,hat) for true,hat in zip(tst100_true,tst100_hat)]
print np.mean(tst100_jaccard)

0.716166666667


In [437]:
f1(tst100_true,tst100_hat)

Precision: 81.82% | Recall: 78.79% | F1: 80.27


# LUIS <=> RNN Data

* **Task I:** Translating between LUIS utterances and RNN data.
* **Task II:** Evaluation (MAYBE)

In [223]:
entities = [{u'children': [u'arrive_time.end_time',
                   u'arrive_time.period_mod',
                   u'arrive_time.period_of_day',
                   u'arrive_time.start_time',
                   u'arrive_time.time_relative',
                   u'arrive_time.time'],
                  u'name': u'arrive_time'},
                 {u'children': [u'depart_time.end_time',
                   u'depart_time.period_mod',
                   u'depart_time.period_of_day',
                   u'depart_time.start_time',
                   u'depart_time.time',
                   u'depart_time.time_relative',
                   u'meal',
                   u'meal_description'],
                  u'name': u'depart_time_meal'},
                 {u'children': [u'return_date.date_relative',
                   u'return_date.day_name',
                   u'return_date.day_number',
                   u'return_date.month_name',
                   u'return_date.today_relative',
                   u'return_time.period_mod',
                   u'return_time.period_of_day',
                   u'cost_relative',
                   u'fare_amount'],
                  u'name': u'return_cost'},
                 {u'children': [u'flight',
                   u'flight_days',
                   u'flight_mod',
                   u'flight_number',
                   u'flight_stop',
                   u'flight_time',
                   u'round_trip',
                   u'compartment',
                   u'economy',
                   u'connect'],
                  u'name': u'flight'},
                 {u'children': [u'fromloc.airport_code',
                   u'fromloc.airport_name',
                   u'fromloc.city_name',
                   u'fromloc.state_code',
                   u'fromloc.state_name',
                   u'airline_name',
                   u'airport_name',
                   u'city_name',
                   u'state_name'],
                  u'name': u'from_loc'},
                 {u'children': [u'aircraft_code',
                   u'airline_code',
                   u'airport_code',
                   u'booking_class',
                   u'fare_basis_code',
                   u'meal_code',
                   u'restriction_code',
                   u'state_code',
                   u'transport_type',
                   u'class_type'],
                  u'name': u'codes_types'},
                 {u'children': [u'day_name',
                   u'day_number',
                   u'days_code',
                   u'month_name',
                   u'period_of_day',
                   u'time',
                   u'time_relative',
                   u'today_relative'],
                  u'name': u'misc_date_time'},
                 {u'children': [u'depart_date.date_relative',
                   u'depart_date.day_name',
                   u'depart_date.day_number',
                   u'depart_date.month_name',
                   u'depart_date.today_relative',
                   u'depart_date.year',
                   u'mod',
                   u'or'],
                  u'name': u'depart_date_mod_or'},
                 {u'children': [u'stoploc.airport_code',
                   u'stoploc.airport_name',
                   u'stoploc.city_name',
                   u'stoploc.state_code',
                   u'toloc.airport_code',
                   u'toloc.airport_name',
                   u'toloc.city_name',
                   u'toloc.country_name',
                   u'toloc.state_code',
                   u'toloc.state_name'],
                  u'name': u'stop_to_loc'},
                 {u'children': [u'arrive_date.date_relative',
                   u'arrive_date.day_name',
                   u'arrive_date.day_number',
                   u'arrive_date.month_name',
                   u'arrive_date.today_relative'],
                  u'name': u'arrive_date'}
                ]

### A. Translator: Babel!

In [224]:
import pickle, gzip
import numpy as np

In [292]:
class Babel:
    
    def __init__(self, dicts_path='/Users/jacobsw/Downloads/atis.fold1.pkl.gz'):
        f = gzip.open(dicts_path,'rb')
        _,_,_,self.dicts = pickle.load(f)
        f.close()
    
    def luis2data(self, luis_utterances):
        # IN: luis utterances
        #  {u'entities': [{u'endPos': 5,
        #    u'entity': u'from_loc::airline_name',
        #    u'startPos': 5},
        #   {u'endPos': 7, u'entity': u'flight::flight_number', u'startPos': 7},
        #   {u'endPos': 10, u'entity': u'from_loc::fromloc.city_name', u'startPos': 9},
        #   {u'endPos': 14,
        #    u'entity': u'stop_to_loc::toloc.city_name',
        #    u'startPos': 12}],
        #  u'intent': u'None',
        #  u'text': u'what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city'}
        # OUT: training data (w/ labels)
        sents, labels = [],[]
        for utterance in luis_utterances:
            sents.append(np.asarray(map(self.dicts['words2idx'].get,
                                        utterance['text'].split()),dtype='int32'))
            label = ['O' for _ in utterance['text'].split()]
            for entity in utterance['entities']:
                label[entity['startPos']]='B-'+entity['entity'].split('::')[1]
                if entity['startPos']!=entity['endPos']: 
                    for i in range(entity['startPos']+1,entity['endPos']+1):
                        label[i] = 'I-'+entity['entity'].split('::')[1]
            labels.append(label)        
        return (sents,labels)
    
    def data2luis(self, data, entities):
        def entities_dict_transform(entities):
            new_dict = {}
            for entry in entities:
                for child in entry['children']:
                    new_dict[child] = entry['name']
            return new_dict
        entities = entities_dict_transform(entities)
        self.idx2labels = {idx:label for (label,idx) in self.dicts['labels2idx'].iteritems()}
        self.idx2words = {idx:word for (word,idx) in self.dicts['words2idx'].iteritems()}
        utterances = []
        for sent_idx in xrange(len(data[0])):
            annotated = zip(map(self.idx2words.get,data[0][sent_idx]),
                            map(self.idx2labels.get,data[2][sent_idx]))
            utterance = {u'text':unicode(' '.join([word for word,label in annotated])),
                         u'intent':'None', u'entities':[]}
            entity = {u'entity':'',u'startPos':0,u'endPos':0}
            for i,item in enumerate(annotated):
                current_entity = ''
                if item[1].startswith('B'):
                    entity['startPos'] = i
                    current_entity = item[1].split('-')[1]
                    entity['entity'] = entities[current_entity] + '::' + current_entity
                    if i+1>=len(annotated) or annotated[i+1][1]=='O':
                        entity['endPos'] = i
                        utterance['entities'].append(entity)
                        entity = {u'entity':'',u'startPos':0,u'endPos':0}
                        current_entity = ''
                        continue
                elif item[1].startswith('I'):
                    if i+1>=len(annotated) or annotated[i+1][1]=='O':
                        entity['endPos'] = i
                        utterance['entities'].append(entity)
                        entity = {u'entity':'',u'startPos':0,u'endPos':0}
                        current_entity = ''
                    else: continue
                else: continue
            utterances.append(utterance)
        return utterances
    
    def data2uploadable(self, data, entities):
        def entities_dict_transform(entities):
            new_dict = {}
            for entry in entities:
                for child in entry['children']:
                    new_dict[child] = entry['name']
            return new_dict
        entities = entities_dict_transform(entities)
        self.idx2labels = {idx:label for (label,idx) in self.dicts['labels2idx'].iteritems()}
        self.idx2words = {idx:word for (word,idx) in self.dicts['words2idx'].iteritems()}
        def word_index_in_string(w_idx, w, s, mode='start'):
            if mode=='start': return len(' '.join(s[:w_idx]))+1
            elif mode=='end': return len(' '.join(s[:w_idx]))+len(w)
            else: return (len(' '.join(s[:w_idx]))+1, len(' '.join(s[:w_idx]))+len(w))
        def make_label(entity_type, start_token, end_token):
            return {"EntityType": entity_type,"StartToken": start_token,"EndToken": end_token,"IsBuiltIn": 'false'}
            # label_template = {"EntityType": "","StartToken": 0,"EndToken": 0,"IsBuiltIn": 'false'}      
        def make_sent(example_text):
            return {"SelectedIntentName": "None","ExampleText": example_text,"EntityLabels": []}
            # sent_template = {"SelectedIntentName": "None","ExampleText": "","EntityLabels": []}
        sents = []
        for sent_idx in xrange(len(data[0])):
            text = map(self.idx2words.get,data[0][sent_idx])
            labels = map(self.idx2labels.get,data[2][sent_idx])
            EntityType = ""
            StartToken, EndToken = 0, 0
            sent = make_sent(' '.join(text))
            for i,l in enumerate(labels):
                if l.startswith('B'):
                    EntityType = str(entities[l.split('-')[1]] + '::' + l.split('-')[1])
                    if i==len(labels)-1 or labels[i+1]=='O':
                        StartToken, EndToken = word_index_in_string(i,text[i],text,mode='both')
                        sent['EntityLabels'].append(make_label(EntityType,StartToken,EndToken))
                        EntityType,StartToken,EndToken = "",0,0
                    StartToken = word_index_in_string(i,text[i],text,mode='start')
                elif l.startswith('I'):
                    if i==len(labels)-1 or labels[i+1]=='O':
                        EndToken = word_index_in_string(i,text[i],text,mode='end') 
                        sent['EntityLabels'].append(make_label(EntityType,StartToken,EndToken))
                        EntityType,StartToken,EndToken = "",0,0
                    else: pass
                else: pass
            sents.append(sent)
        return sents
    

In [293]:
# LOAD TESTORS
f = gzip.open('/Users/jacobsw/Downloads/atis.fold1.pkl.gz', 'rb')
train_set, valid_set, test_set, dicts = pickle.load(f)
f.close()
import os
with open('/Users/jacobsw/Desktop/new_ojoatis.json') as json_file:
    ojoatis = json.load(json_file)

In [294]:
bbl = Babel()

In [295]:
# RNN DATA => LUIS UTTERANCES
data = train_set
bbl.data2luis(data, entities)[1]

{u'entities': [{u'endPos': 5,
   u'entity': u'from_loc::fromloc.city_name',
   u'startPos': 5},
  {u'endPos': 7, u'entity': u'stop_to_loc::toloc.city_name', u'startPos': 7},
  {u'endPos': 9,
   u'entity': u'depart_date_mod_or::depart_date.day_name',
   u'startPos': 9}],
 u'intent': 'None',
 u'text': u'i want to go from boston to atlanta on monday'}

In [296]:
# RNN DATA => LUIS UPLOADABLES
bbl.data2uploadable(data, entities)[1]

{'EntityLabels': [{'EndToken': 23,
   'EntityType': 'from_loc::fromloc.city_name',
   'IsBuiltIn': 'false',
   'StartToken': 18},
  {'EndToken': 34,
   'EntityType': 'stop_to_loc::toloc.city_name',
   'IsBuiltIn': 'false',
   'StartToken': 28},
  {'EndToken': 44,
   'EntityType': 'depart_date_mod_or::depart_date.day_name',
   'IsBuiltIn': 'false',
   'StartToken': 39}],
 'ExampleText': 'i want to go from boston to atlanta on monday',
 'SelectedIntentName': 'None'}

In [297]:
# LUIS UTTERANCES => RNN DATA
luis_utterances = ojoatis['utterances']
print bbl.luis2data(luis_utterances)[0][0]
print bbl.luis2data(luis_utterances)[0][1]

[554  23 241 534 358 136 193  11 208 251 104 502 413 256 104]
[232 542 502 213 208  77 502  64 358 317]


### B. Upload to LUIS App

In [334]:
# import httplib, urllib, base64
import requests

In [None]:
class LuisUploader:
    
    def __init__(self, appid, subscription_key):
        self.appid = appid
        self.subscription_key = subscription_key
    
    def upload(self, luis_uploadables):
        # luis_uploadables: a list of {}'s in the form of 
        #     {
        #      "SelectedIntentName": "None",
        #      "ExampleText": "I want to fly to london",
        #      "EntityLabels": [
        #        {
        #         "EntityType": "stop_to_loc::toloc.city_name",
        #         "StartToken": 17,
        #         "EndToken": 22,
        #         "IsBuiltIn": 'false'
        #        }
        #      ]
        #    }
        requests.post("https://api.projectoxford.ai/luis/v1.0/prog/apps/%s/examples" % self.appid, 
             headers={"Ocp-Apim-Subscription-Key": self.subscription_key},
             json=luis_uploadables)       

In [308]:
f = gzip.open('/Users/jacobsw/Downloads/atis.fold1.pkl.gz', 'rb')
train_set, valid_set, test_set, dicts = pickle.load(f)
f.close()

In [337]:
appid = 'b08f059d-299a-4223-aa3d-06d19964d2e9'
subscription_key = '952396816ccf4d869657b9ef49533fb6'
uploader = LuisUploader(appid, subscription_key)

In [315]:
bbl = Babel()

In [316]:
# LOAD TRAIN
uploadables = bbl.data2uploadable(train_set, entities)

In [317]:
uploadables[0]

{'EntityLabels': [{'EndToken': 29,
   'EntityType': 'from_loc::airline_name',
   'IsBuiltIn': 'false',
   'StartToken': 25},
  {'EndToken': 57,
   'EntityType': 'flight::flight_number',
   'IsBuiltIn': 'false',
   'StartToken': 38},
  {'EndToken': 74,
   'EntityType': 'from_loc::fromloc.city_name',
   'IsBuiltIn': 'false',
   'StartToken': 64},
  {'EndToken': 92,
   'EntityType': 'stop_to_loc::toloc.city_name',
   'IsBuiltIn': 'false',
   'StartToken': 79}],
 'ExampleText': 'what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city',
 'SelectedIntentName': 'None'}

In [390]:
# LOAD VALID TOO!
uploadables_valid = bbl.data2uploadable(valid_set, entities)

In [391]:
uploadables_valid[0]

{'EntityLabels': [{'EndToken': 25,
   'EntityType': 'from_loc::fromloc.city_name',
   'IsBuiltIn': 'false',
   'StartToken': 19},
  {'EndToken': 40,
   'EntityType': 'depart_time_meal::depart_time.time',
   'IsBuiltIn': 'false',
   'StartToken': 36},
  {'EndToken': 57,
   'EntityType': 'depart_time_meal::depart_time.period_of_day',
   'IsBuiltIn': 'false',
   'StartToken': 49},
  {'EndToken': 85,
   'EntityType': 'stop_to_loc::toloc.city_name',
   'IsBuiltIn': 'false',
   'StartToken': 73}],
 'ExampleText': 'what flights leave atlanta at about DIGIT in the afternoon and arrive in san francisco',
 'SelectedIntentName': 'None'}

In [438]:
# %%time
# counter = 0
# for uploadable in uploadables0[8:]:
#     uploader.upload([uploadable])
#     counter += 1
#     if counter % 10 == 0:
#         print "... %d uploaded" % counter

In [None]:
# USEFUL FUNCTIONS
# 
# def post_luis_examples(app_id, subscription_key, obj):
#     return requests.post("https://api.projectoxford.ai/luis/v1.0/prog/apps/%s/examples" % app_id, 
#              headers={"Ocp-Apim-Subscription-Key": subscription_key},
#              json=obj)
# def get_luis_examples(app_id, subscription_key, skip=0, count=5):
#     return requests.get("https://api.projectoxford.ai/luis/v1.0/prog/apps/%s/examples" % app_id,
#             headers={"Ocp-Apim-Subscription-Key": subscription_key},
#             params={"skip":skip, "count": count})
# from itertools import groupby
# from numpy.random import randn
# x = randn(100)
# groups = groupby(enumerate(x), lambda t:t[0]/25)
# g = next(groups)
# g[0], list(g[1])

### B. RNN NER Learner

** TO RUN IN TERMINAL **

In [None]:
# export PYTHONPATH=$PWD

In [144]:
import os
os.chdir('/Users/jacobsw/Desktop/OJO/LUIS/is13-NER_MODELS')

In [145]:
import numpy as np
import time
import sys
import subprocess
import os
import random

from is13.data import load
from is13.rnn.elman import model
from is13.metrics.accuracy import conlleval
from is13.utils.tools import shuffle, minibatch, contextwin

from keras.models import Sequential
from keras.layers import (Input, Embedding, SimpleRNN, Dense, Activation,
                          TimeDistributed)
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical

In [146]:
def load_atis(k=3):    
    train, valid, test, dic = load.atisfold(k)
    idx2label = {i:l for l,i in dic['labels2idx'].iteritems()}
    idx2word = {i:w for w,i in dic['words2idx'].iteritems()}
    return (train, valid, test, dic, idx2label, idx2word)

In [147]:
train, valid, test, dic, idx2label, idx2word = load_atis()
data = (train, valid, test, dic)
indexation = (idx2label, idx2word)

In [158]:
class RNNNER:
    
    def __init__(self, data, indexation=None, 
                 folder='/Users/jacobsw/Desktop/OJO/LUIS/is13-NER_MODELS/elman-forward',
                 config={'lr':0.1,'verbose':1,'nhidden':100,'seed':345,
                         'emb_dimension':100,'nepochs':50}):
            # data = (train, valid, test, dic)
            # indexation = (idx2label, idx2word)
        self.config = config
        self.folder = folder
        if not os.path.exists(folder): os.mkdir(folder)          
        self.train, self.valid, self.test, self.dic = data
        if indexation:
            self.idx2label, self.idx2word = indexation
        self.__train()
        
    def __train(self):
        
        print "... configuring data"
        train_lex, train_ne, train_y = self.train
        valid_lex, valid_ne, valid_y = self.valid
        test_lex, test_ne, test_y = self.test
        vocsize = len(self.dic['words2idx'])
        nclasses = len(dic['labels2idx'])
        nsentences = len(train_lex)
    
        print "... building model"
        np.random.seed(self.config['seed'])
        random.seed(self.config['seed'])
        model = Sequential()
        model.add(Embedding(vocsize, self.config['emb_dimension']))
        model.add(SimpleRNN(self.config['nhidden'],activation='sigmoid',
                            return_sequences=True))
        model.add(TimeDistributed(Dense(output_dim=nclasses)))
        model.add(Activation('softmax'))
        sgd = SGD(lr=self.config['lr'], momentum=.0, decay=.0, nesterov=False)
        model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
        
        print "... training model"
        best_f1 = -np.inf
        for e in xrange(self.config['nepochs']):
            shuffle([train_lex, train_ne, train_y], self.config['seed'])
            self.config['ce'] = e
#             tic = time.time()

            if self.config['verbose']:
                print "... running epoch %i" % e

            for i in xrange(nsentences):
                X = np.asarray([train_lex[i]])
                Y = to_categorical(np.asarray(train_y[i])[:,np.newaxis],
                                              nclasses)[np.newaxis,:,:]
                if X.shape[1]==1: continue
                model.train_on_batch(X,Y)
#                 if self.config['verbose']:         
#                     print '[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./nsentences), \
#                           'completed in %.2f (sec) <<\r'%(time.time()-tic), \
#                           sys.stdout.flush()
                    
                            
        predictions_test = [map(lambda x: self.idx2label[x], \
            model.predict_on_batch( \
            np.asarray([x])).argmax(2)[0]) \
            for x in test_lex]
        groundtruth_test = [ map(lambda x: idx2label[x], y) for y in test_y ]
        words_test = [ map(lambda x: idx2word[x], w) for w in test_lex]

        predictions_valid = [map(lambda x: idx2label[x], \
            model.predict_on_batch( \
            np.asarray([x])).argmax(2)[0]) \
            for x in valid_lex]
        groundtruth_valid = [ map(lambda x: self.idx2label[x], y) for y in valid_y ]
        words_valid = [ map(lambda x: self.idx2word[x], w) for w in valid_lex]

        # evaluation // compute the accuracy using conlleval.pl
        res_test  = conlleval(predictions_test, groundtruth_test, words_test, self.folder + '/current.test.txt')
        res_valid = conlleval(predictions_valid, groundtruth_valid, words_valid, self.folder + '/current.valid.txt')

        if res_valid['f1'] > best_f1:
            model.save_weights('best_model.h5', overwrite=True)
            best_f1 = res_valid['f1']
            if self.config['verbose']: 
                print 'NEW BEST: epoch', e, 'valid F1', res_valid['f1'], 'best test F1', res_test['f1'], ' '*20 
            self.config['vf1'], self.config['vp'], self.config['vr'] = res_valid['f1'], res_valid['p'], res_valid['r'] 
            self.config['tf1'], self.config['tp'], self.config['tr'] = res_test['f1'],  res_test['p'],  res_test['r'] 
            self.config['be'] = e
            subprocess.call(['mv', self.folder + '/current.test.txt', self.folder + '/best.test.txt'])
            subprocess.call(['mv', self.folder + '/current.valid.txt', self.folder + '/best.valid.txt'])
        else:
            print ''

        print 'BEST RESULT: epoch', e, 'valid F1', self.config['vf1'], 'best test F1', self.config['tf1'], 'with the model', self.folder         
    
    
    

In [160]:
rnn = RNNNER(data,indexation)