In [3]:
import json
import string
import nltk

In [4]:
# Load the raw dialogue corpus. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('data.json', 'r', encoding='utf-8') as f_in:
    data = json.load(f_in)

In [5]:
# Number of top-level entries in the loaded corpus.
len(data)

10438

In [6]:
# Keep the dialogues whose id starts with 'SNG' (presumably the single-domain
# dialogues -- TODO confirm) and whose goal has a non-empty 'restaurant' entry.
# The id test comes first so the goal is only inspected for 'SNG' dialogues.
domain_stories = [
    dialogue
    for dialogue_id, dialogue in data.items()
    if dialogue_id.startswith('SNG') and dialogue['goal']['restaurant']
]

In [7]:
# Number of dialogues collected in domain_stories.
len(domain_stories)

442

In [8]:
def annotate_utterance(utter, spans):
    """Mark up slot values in an utterance as ``[text](slot)`` annotations.

    The utterance is lower-cased and tokenized with NLTK. Each span is then
    annotated in one of two ways:

    * if the span value does not occur verbatim as a whole word/phrase, the
      tokens at the span's indices are wrapped as ``[tokens](slot:value)``;
    * otherwise every whole-word occurrence of the value is rewritten as
      ``[value](slot)``.

    Finally the text is detokenized, the annotation markers are glued back
    together, and all punctuation except ``:()[]`` is removed.

    Args:
        utter: Raw utterance text.
        spans: Iterable of span records. The code reads ``span[1]`` as the
            slot name, ``span[2]`` as the slot value, and ``span[-2]`` /
            ``span[-1]`` as the inclusive first/last token index of the value
            (presumably a MultiWOZ-style ``span_info`` entry -- TODO confirm).

    Returns:
        The lower-cased, punctuation-stripped utterance with annotations.

    Raises:
        IndexError: If a span's token indices fall outside the token list.
    """
    # Tokenize exactly once. The span indices refer to this token list, so it
    # must not be rebuilt from text that already contains annotations
    # (re-tokenizing would split the brackets and shift every later index).
    tokens = nltk.word_tokenize(utter.lower())
    utter = ' '.join(tokens)

    # Pass 1: values that do not appear verbatim are annotated by position,
    # keeping the surface tokens and recording the normalized value.
    for span in spans:
        normalized_token = span[2].lower()

        appears_verbatim = (
            (' %s ' % normalized_token) in utter
            or utter.endswith(' %s' % normalized_token)
            or utter.startswith('%s ' % normalized_token)
        )
        if appears_verbatim:
            continue

        first, last = span[-2], span[-1]
        tokens[first] = '[%s' % tokens[first]
        # When first == last this wraps the token that was just prefixed.
        tokens[last] = '%s](%s:%s)' % (tokens[last], span[1].lower(), normalized_token)
        utter = ' '.join(tokens)

    # Pass 2: values that appear verbatim are annotated by text match.
    for span in spans:
        normalized_token = span[2].lower()
        annotation = ' [%s](%s) ' % (normalized_token, span[1].lower())

        # Space-delimited occurrences inside the utterance.
        utter = utter.replace(' %s ' % normalized_token, annotation)

        # Occurrence at the very end: rewrite only that suffix. A global
        # replace here would also hit word fragments such as ' 2' in ' 20:30'.
        suffix = ' %s' % normalized_token
        if utter.endswith(suffix):
            utter = utter[:len(utter) - len(suffix)] + annotation

        # Occurrence at the very start: rewrite only that prefix. A global
        # replace here would also hit word endings such as 'east ' in
        # 'northeast '.
        prefix = '%s ' % normalized_token
        if utter.startswith(prefix):
            utter = annotation + utter[len(prefix):]

    # Normalize spacing via a tokenize/detokenize round trip, then re-join the
    # annotation markers that the round trip may have separated.
    tokens = nltk.word_tokenize(utter)
    detokenized = nltk.tokenize.treebank.TreebankWordDetokenizer().detokenize(tokens)
    fixed_markers = detokenized.replace('] (', '](').replace('[ ', '[').replace(': ', ':')

    # Drop every punctuation character except the ones the markup needs.
    fixed_punctuation = string.punctuation.translate(str.maketrans('', '', ':()[]'))
    stripped = fixed_markers.translate(str.maketrans('', '', fixed_punctuation))
    return stripped.replace(' ]', ']').replace('  ', ' ')

In [9]:
# Group the annotated utterances by the first dialogue act of each turn.
utterances_by_intent = {}
skipped_turns = 0  # turns dropped because their annotation could not be used

for story in domain_stories:
    for turn in story['log']:
        # Only turns with empty metadata are processed (presumably the user
        # side of the dialogue -- TODO confirm against the dataset format).
        if turn['metadata']:
            continue
        try:
            intent = list(turn['dialog_act'].keys())[0]
            # setdefault registers the intent before the annotation runs, so
            # an intent whose annotation fails still gets an (empty) entry.
            utterances_by_intent.setdefault(intent, []).append(
                annotate_utterance(turn['text'], turn['span_info'])
            )
        except (KeyError, IndexError, AttributeError, TypeError):
            # Best effort: skip turns with missing or malformed annotations,
            # but let unrelated failures (interrupts, missing tokenizer
            # resources, ...) surface instead of silently emptying the result.
            skipped_turns += 1

In [10]:
# Report how many utterances were collected for each intent.
for intent, utterances in utterances_by_intent.items():
    print('For intent %s, we have %s utterances.' % (intent, len(utterances)))

For intent Restaurant-Inform, we have 1155 utterances.
For intent general-thank, we have 449 utterances.
For intent general-bye, we have 48 utterances.
For intent Restaurant-Request, we have 206 utterances.
For intent Train-Inform, we have 1 utterances.
For intent general-greet, we have 1 utterances.
For intent Taxi-Inform, we have 1 utterances.
For intent Hotel-Inform, we have 1 utterances.


In [11]:
# Display the annotated utterances collected for the 'Restaurant-Inform' intent.
utterances_by_intent['Restaurant-Inform']

['i am looking for a particular restaurant it is called [pizza hut city centre](name)',
 'that sounds like just the place can you book me a table for [2](people) at [19:45](time) on [thursday](day)',
 'i am looking for an [expensive](price) [italian](food) restaurant',
 'great yeah that sounds great can you book a table for [5](people) people at [11:30](time) on [sunday](day)',
 'how about [10:30](time) on sunday and may i have a reference number',
 'are there any [portuguese](food) restaurants in cambridge',
 'if one of them has a [moderate](price) price range please give me that address if not tell me about [turkish](food) restaurants instead',
 'actually i need a [moderately](price:moderate) priced restaurant are there any fitting that description',
 'yes i need a reservation for [1](people) people at [14:00](time) on [monday](day)',
 'i am looking for a [cheap](price) restaurant in the [centre](area) of town',
 'i want a restaurant that serves [molecular gastronomy](food) food',
 '

In [12]:
# Display the annotated utterances collected for the 'Restaurant-Request' intent.
utterances_by_intent['Restaurant-Request']

['actually scratch that i dont want to book the restaurant could you give me the phone number for the [golden house](name)',
 'can you give me the address and postcode as well',
 'no i can do that could you give me the phone number also the address with postcode',
 'what about the postcode and the address',
 'yes please can you also give me the address phone number and postcode',
 'that place sounds great can i get the phone number please',
 'can you give me their phone number address and price range please',
 'reference number zmzlmlr9 got it thank you',
 'can i get the address of [anatolia](name)',
 'may i have the reference number please',
 'could you give me the address and phone number for [the slug and lettuce](name) please',
 'i wish you could provide their phone number but other than that youve given me everything i need to know thank you goodbye',
 'how about the moderate one may i have their address please',
 'can you give me the postcode for it please',
 'which of those nine

In [13]:
# Drop the intents that only occur once in this subset. pop() with a default
# keeps the cell re-runnable: a plain `del` raises KeyError on the second run
# or when an intent is absent from the data.
for off_domain_intent in ('Train-Inform', 'general-greet', 'Taxi-Inform', 'Hotel-Inform'):
    utterances_by_intent.pop(off_domain_intent, None)

In [14]:
# Report the per-intent utterance counts that remain.
for intent, utterances in utterances_by_intent.items():
    print('For intent %s, we have %s utterances.' % (intent, len(utterances)))

For intent Restaurant-Inform, we have 1155 utterances.
For intent general-thank, we have 449 utterances.
For intent general-bye, we have 48 utterances.
For intent Restaurant-Request, we have 206 utterances.


In [15]:
# Write one '## <intent>' section per intent, each followed by its utterances
# as '- ' list items and a blank line. The encoding is explicit so that
# non-ASCII characters in utterances are written the same way on every platform.
with open('restaurant_utterances.txt', 'w', encoding='utf-8') as f_out:
    for intent, utterances in utterances_by_intent.items():
        f_out.write('## %s\n' % intent.lower())
        f_out.writelines('- %s\n' % utter for utter in utterances)
        f_out.write('\n')

In [25]:
# For every dialogue, collect the lower-cased first dialogue act of each
# processed turn, in turn order.
seqs_of_intents = []

for story in domain_stories:
    intents = []

    for turn in story['log']:
        # Only turns with empty metadata are processed (presumably the user
        # side of the dialogue -- TODO confirm against the dataset format).
        if turn['metadata']:
            continue
        try:
            intents.append(list(turn['dialog_act'].keys())[0].lower())
        except (KeyError, IndexError, AttributeError, TypeError):
            # Best effort: skip turns with a missing or empty dialogue act,
            # without hiding unrelated errors the way a bare `except` would.
            continue
    seqs_of_intents.append(intents)

In [32]:
# Write one '## path nr <i>' section per dialogue: each intent as a ' * ' item
# followed by the fixed '  - utter_std' response line, then a blank line.
# The encoding is explicit so the output does not depend on the platform default.
with open('restaurant_stories.txt', 'w', encoding='utf-8') as f_out:
    for path_nr, intent_seq in enumerate(seqs_of_intents):
        f_out.write('## path nr %s\n' % path_nr)
        for intent in intent_seq:
            f_out.write(' * %s\n' % intent)
            f_out.write('  - utter_std\n')
        f_out.write('\n')