In [20]:
import numpy as np

In [54]:
class Babel:
    """
    Converts between LUIS-style utterance dictionaries and index-encoded
    (sentence, BIO-label) sequences.

    # self.dicts:
        the lookup tables given to the constructor.
    # self.entities:
        a {child_entity: parent_entity} mapping built by entities_normalize().
    """

    def __init__(self, dicts, entities):
        """
        # dicts:
            a dictionary of lookup tables with
            keys = {'idx2word', 'word2idx', 'idx2label', 'label2idx'}.
        # entities:
            a list of dictionaries, one per parent entity, each with
            keys = {'name', 'children'}.
          EXAMPLE:
            [{u'children': [u'arrive_time.end_time',
               u'arrive_time.period_mod',
               u'arrive_time.period_of_day',
               u'arrive_time.start_time',
               u'arrive_time.time_relative'],
              u'name': u'arrive_time'},
              ...
            ]
        """
        self.dicts = dicts
        # Stored flattened as child -> parent so data2luis can rebuild 'parent::child'.
        self.entities = entities_normalize(entities)

    def luis2data(self, luis_utterances):
        """
        Encode LUIS utterances as word-index arrays and BIO label-index lists.

        # luis_utterances:
            a list of dictionaries, each with keys = {'text', 'intent', 'entities'}.
            'startPos' / 'endPos' are inclusive positions into text.split().
            EXAMPLE (one element of the list):
           {u'entities': [{u'endPos': 5,
                           u'entity': u'from_loc::airline_name',
                           u'startPos': 5},
                          {u'endPos': 7,
                           u'entity': u'flight::flight_number',
                           u'startPos': 7},
                          {u'endPos': 10,
                           u'entity': u'from_loc::fromloc.city_name',
                           u'startPos': 9},
                          {u'endPos': 14,
                           u'entity': u'stop_to_loc::toloc.city_name',
                           u'startPos': 12}],
           u'intent': u'None',
           u'text': u'what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city'}
        # returns:
            a 2-tuple (sents, labels):
              sents  - one int32 numpy array of word indices per utterance.
              labels - one list of label indices per utterance
                       (None for a label missing from 'label2idx').
        """
        word2idx = self.dicts['word2idx']
        label2idx = self.dicts['label2idx']
        sents, labels = [], []
        for utterance in luis_utterances:
            tokens = utterance['text'].split()
            # A list comprehension (rather than map) yields a real list on
            # Python 3 as well. A token missing from word2idx becomes None,
            # which numpy cannot cast to int32, so asarray raises.
            sents.append(np.asarray([word2idx.get(token) for token in tokens],
                                    dtype='int32'))
            tags = ['O'] * len(tokens)
            for entity in utterance['entities']:
                # 'parent::child' -> 'child'; a name without '::' is used as-is
                # instead of raising IndexError.
                parts = entity['entity'].split('::')
                child = parts[1] if len(parts) > 1 else parts[0]
                start, end = entity['startPos'], entity['endPos']
                tags[start] = 'B-' + child
                # Empty range when start == end, so single-token entities
                # get only the B- tag.
                for i in range(start + 1, end + 1):
                    tags[i] = 'I-' + child
            labels.append([label2idx.get(tag) for tag in tags])
        return (sents, labels)

    def data2luis(self, data):
        """
        Decode index-encoded sentences and BIO labels back into LUIS utterances.

        # data:
            a 2-tuple (encoded_sents, encoded_labels), index-aligned.
            EXAMPLE:
            (
             [array([554,  23, 241, 534, 358, 136, 193,  11, 208, 251, 104, 502, 413,
                     256, 104], dtype=int32), ... ],
             [[126, 126, 126, 126, 126, 2, 126, 43, 126, 48, 109, 126, 78, 123, 123], ...]
            )
        # returns:
            a list of dictionaries with keys = {'text', 'intent', 'entities'};
            'intent' is always 'None' and each entity is
            {'entity': 'parent::child', 'startPos': i, 'endPos': j} (inclusive).
        # raises:
            KeyError if a decoded entity name has no parent in self.entities.
        """
        idx2word = self.dicts['idx2word']
        idx2label = self.dicts['idx2label']
        utterances = []
        for sent_idx in range(len(data[0])):
            sent = [idx2word.get(idx) for idx in data[0][sent_idx]]
            labels = [idx2label.get(idx) for idx in data[1][sent_idx]]
            # u' '.join gives unicode on Python 2 (as unicode(...) did) and str on Python 3.
            utterance = {u'text': u' '.join(sent),
                         u'intent': 'None', u'entities': []}
            entity = None  # the entity currently being extended, if any
            for i, label in enumerate(labels):
                if label == 'O':
                    continue
                # Open a new entity on a B- tag. An I- tag with nothing open is
                # treated the same way rather than emitting an entity with an
                # empty name and startPos 0.
                if label.startswith('B') or entity is None:
                    # maxsplit=1 keeps hyphens that belong to the entity name.
                    current_entity = label.split('-', 1)[1]
                    entity = {u'entity': self.entities[current_entity] + '::' + current_entity,
                              u'startPos': i,
                              u'endPos': i}
                # Close the entity unless the next tag continues it. Testing for
                # "not I-" (instead of "== 'O'") also closes it when another
                # B- tag follows immediately, so adjacent entities are not lost.
                if i + 1 >= len(labels) or not labels[i + 1].startswith('I'):
                    entity['endPos'] = i
                    utterance['entities'].append(entity)
                    entity = None
            utterances.append(utterance)
        return utterances

In [45]:
def entities_normalize(entities):
    """
    Flatten a parent/children entity listing into a child -> parent lookup.

    # entities:
        a list of dictionaries, one per parent entity, each with
        keys = {'name', 'children'}.
    # returns:
        a dictionary mapping every child name to the 'name' of the entry that
        lists it. If a child appears under several entries, the last one wins.
      TRANSFORMED EXAMPLE:
        {u'aircraft_code': u'codes_types',
         u'airline_code': u'codes_types',
         u'airline_name': u'from_loc',
         u'airport_code': u'codes_types',
         u'airport_name': u'from_loc',
         u'arrive_date.date_relative': u'arrive_date',
         u'arrive_date.day_name': u'arrive_date',
         u'arrive_date.day_number': u'arrive_date',
         ...
    """
    return {
        child_name: parent['name']
        for parent in entities
        for child_name in parent['children']
    }


In [36]:
import json, gzip, pickle, os
# NOTE(review): the working directory and the ATIS path below are absolute,
# machine-specific locations; consider moving them to a configurable data dir.
os.chdir('/Users/jacobsw/Desktop')
# IMPORT LUIS
with open('new_ojoatis.json') as json_file:
    ojoatis = json.load(json_file)
entities = ojoatis['entities']
luis_utterances = ojoatis['utterances']
# IMPORT DATA (ATIS)
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point this at a file you trust.
# The `with` block closes the gzip handle even if unpickling raises.
with gzip.open('/Users/jacobsw/Downloads/atis.fold1.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set, atis_dicts = pickle.load(f)
# Keep only elements 0 and 2 of each split
# (presumably the word-index and label-index sequences -- TODO confirm).
train_set, valid_set, test_set = (train_set[0],train_set[2]), (valid_set[0],valid_set[2]), (test_set[0],test_set[2])
# Build both directions of the word and label lookups from the ATIS tables.
dicts = {
    'idx2word': {i: w for w, i in atis_dicts['words2idx'].items()},
    'word2idx': atis_dicts['words2idx'],
    'idx2label': {i: l for l, i in atis_dicts['labels2idx'].items()},
    'label2idx': atis_dicts['labels2idx'],
}

In [55]:
# Build the converter from the lookup tables (`dicts`) and the LUIS entity
# hierarchy (`entities`) prepared in the loading cell.
bbl = Babel(dicts, entities)

In [42]:
# Encode the LUIS utterances as (sentences, labels), then show the first
# sentence and its labels both decoded through the idx2* tables and as raw indices.
atis_data = bbl.luis2data(luis_utterances)
print "ATIS sentence: ", map(bbl.dicts['idx2word'].get, atis_data[0][0])
print "ATIS sentence encoded: ", atis_data[0][0]
print "ATIS labels: ", map(bbl.dicts['idx2label'].get, atis_data[1][0])
print "ATIS labels encoded: ", atis_data[1][0]

ATIS sentence:  ['what', 'aircraft', 'is', 'used', 'on', 'delta', 'flight', 'DIGITDIGITDIGITDIGIT', 'from', 'kansas', 'city', 'to', 'salt', 'lake', 'city']
ATIS sentence encoded:  [554  23 241 534 358 136 193  11 208 251 104 502 413 256 104]
ATIS labels:  ['O', 'O', 'O', 'O', 'O', 'B-airline_name', 'O', 'B-flight_number', 'O', 'B-fromloc.city_name', 'I-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'I-toloc.city_name']
ATIS labels encoded:  [126, 126, 126, 126, 126, 2, 126, 43, 126, 48, 109, 126, 78, 123, 123]


In [59]:
# Round trip: decode the encoded data back into LUIS utterance dictionaries.
# NOTE(review): this rebinds `luis_utterances`, replacing the list loaded from
# the JSON file with the reconstructed one -- a distinct name would keep the
# original input available when cells are re-run.
luis_utterances = bbl.data2luis(atis_data)
print "LUIS utterances: "
# Bare last expression so the notebook displays the first reconstructed utterance.
luis_utterances[0]

LUIS utterances: 


{u'entities': [{u'endPos': 5,
   u'entity': u'from_loc::airline_name',
   u'startPos': 5},
  {u'endPos': 7, u'entity': u'flight::flight_number', u'startPos': 7},
  {u'endPos': 10, u'entity': u'from_loc::fromloc.city_name', u'startPos': 9},
  {u'endPos': 14,
   u'entity': u'stop_to_loc::toloc.city_name',
   u'startPos': 12}],
 u'intent': 'None',
 u'text': u'what aircraft is used on delta flight DIGITDIGITDIGITDIGIT from kansas city to salt lake city'}