## Multiwoz Analysis
source: https://www.repository.cam.ac.uk/handle/1810/280608

place this file under the directory.

This file contains:
- database load and analyse
- dialog act load and analyse
- user goal analysis
- user data annotation

In [1]:
import numpy as np
import json
from collections import Counter
import pprint
import re

### database load and analyse
load *_db.json, and print attributes

In [2]:
def read_json(filename):
    with open(filename, 'r') as f:
        return json.load(f)

In [3]:
database = read_json('restaurant_db.json')

In [4]:
len(database)

110

In [5]:
# domain attributes (*_db.json)
ls = []
attributes = []
for i in database:
    ls.append(len(i))
    attributes.append(tuple(sorted(i.keys())))
print(Counter(ls))
for k,v in dict(Counter(attributes)).items():
    print(k,v)
# print(Counter(attributes))

Counter({11: 77, 10: 17, 12: 16})
('address', 'area', 'food', 'id', 'introduction', 'location', 'name', 'phone', 'postcode', 'pricerange', 'type') 77
('address', 'area', 'food', 'id', 'introduction', 'location', 'name', 'phone', 'postcode', 'pricerange', 'signature', 'type') 16
('address', 'area', 'food', 'id', 'location', 'name', 'phone', 'postcode', 'pricerange', 'type') 14
('address', 'area', 'food', 'id', 'introduction', 'location', 'name', 'postcode', 'pricerange', 'type') 3


### dialog act load and analyse
load dialog_acts.json and create dialog_act-slot-values(list)

In [6]:
# dialog act
dialog_acts = read_json('dialogue_acts.json')

In [7]:
len(dialog_acts)

10438

In [8]:
das = []
da2slot = {}
for item in dialog_acts.values():
    for turn_das in item.values():
        if type(turn_das)==type(""):
            das.append(turn_das)
        else:
            das.extend(turn_das.keys())
            for da,svs in turn_das.items():
                # (da,list of [slot, value])
                if da not in da2slot:
                    da2slot[da] = {}
                for sv in svs:
                    if sv[0] not in da2slot[da]:
                        da2slot[da][sv[0]]=[]
                    da2slot[da][sv[0]].append(sv[1])
da_count = Counter(das)

In [9]:
# number of sentences that dialog act occurence
da_count

Counter({'Attraction-Inform': 6976,
         'Attraction-NoOffer': 490,
         'Attraction-Recommend': 1451,
         'Attraction-Request': 1641,
         'Attraction-Select': 438,
         'Booking-Book': 5256,
         'Booking-Inform': 5703,
         'Booking-NoBook': 1313,
         'Booking-Request': 2708,
         'Hotel-Inform': 8224,
         'Hotel-NoOffer': 914,
         'Hotel-Recommend': 1501,
         'Hotel-Request': 3215,
         'Hotel-Select': 1005,
         'No Annotation': 1933,
         'Restaurant-Inform': 8071,
         'Restaurant-NoOffer': 1453,
         'Restaurant-Recommend': 1495,
         'Restaurant-Request': 3083,
         'Restaurant-Select': 918,
         'Taxi-Inform': 2087,
         'Taxi-Request': 1613,
         'Train-Inform': 7204,
         'Train-NoOffer': 117,
         'Train-OfferBook': 3032,
         'Train-OfferBooked': 2309,
         'Train-Request': 5522,
         'Train-Select': 389,
         'general-bye': 9107,
         'general-greet': 

In [10]:
# slots of specific dialog act
da2slot['Restaurant-Inform'].keys()

dict_keys(['Post', 'Food', 'Name', 'Price', 'Addr', 'Phone', 'Area', 'Choice', 'Ref', 'none'])

In [11]:
len(da2slot['Hotel-Inform']['Type'])

2944

In [12]:
# print all dialog-act-slot and their occurence
for k,v in sorted(da2slot.items()):
    print(k)
    for slot in v.keys():
        print('\t'+slot+'\t'+str(len(da2slot[k][slot])))

Attraction-Inform
	Area	2365
	Type	2592
	Choice	2243
	Post	1579
	Name	3073
	Phone	1742
	Addr	2665
	Fee	1923
	none	28
	Price	48
	Open	13
Attraction-NoOffer
	Area	324
	Type	325
	none	109
	Name	5
	Choice	1
	Addr	1
	Fee	1
Attraction-Recommend
	Post	165
	Name	1461
	Area	338
	Fee	396
	Phone	153
	Addr	530
	Type	217
	Choice	13
	none	6
	Price	9
	Open	3
Attraction-Request
	Area	798
	Type	823
	Price	86
	Name	110
Attraction-Select
	Type	286
	none	243
	Area	66
	Name	57
	Fee	47
	Choice	7
	Phone	1
	Price	4
	Addr	1
Booking-Book
	Ref	4913
	Name	1265
	People	789
	Time	593
	Day	899
	Stay	481
	none	107
Booking-Inform
	none	5317
	Stay	31
	Name	277
	Day	89
	People	71
	Ref	18
	Time	31
Booking-NoBook
	none	1086
	Ref	14
	Stay	41
	Day	82
	People	28
	Time	78
	Name	72
Booking-Request
	Time	916
	Day	1633
	People	993
	Stay	889
Hotel-Inform
	Name	3678
	Ref	50
	Type	2944
	Choice	3856
	Addr	1158
	Post	648
	Area	2399
	Internet	1446
	Parking	1373
	Stars	1785
	none	44
	Price	2282
	Phone	704
Hotel-NoOffer
	none	322
	Type	

### user goal analysis
load all data and extract user goal, to see all attributes in goal.

In [13]:
data = read_json('data.json')

In [14]:
len(data)

10438

In [15]:
user_goals = {}
for _,item in data.items():
    goal = item['goal']
    for domain,subgoal in goal.items():
        if domain=='message' or domain=='topic':
            continue
        if domain not in user_goals:
            user_goals[domain] = {}
        for k,v in subgoal.items():
            if type(v)==type({}):
                user_goals[domain][k] = user_goals[domain].setdefault(k,set([]))|set(v.keys())
#                 user_goals[domain].setdefault(k,{}).update(v)
            elif type(v)==type([]):
                user_goals[domain][k] = user_goals[domain].setdefault(k,set([]))|set(v)
#             else:
#                 print(_)
#                 print(k,v,type(v))
#                 print(subgoal)
#         if domain=='hospital' and len(subgoal)>1:
#             print(_)
for domain, subgoal in sorted(user_goals.items()):
    print(domain)
    print(subgoal)
    print()

attraction
{'info': {'area', 'name', 'type'}, 'reqt': {'area', 'postcode', 'phone', 'address', 'entrance fee', 'type'}, 'fail_info': {'area', 'name', 'type'}}

hospital
{'info': {'department'}, 'reqt': {'postcode', 'phone', 'address'}, 'fail_info': set()}

hotel
{'info': {'stars', 'area', 'parking', 'internet', 'pricerange', 'type', 'name'}, 'fail_info': {'stars', 'area', 'parking', 'internet', 'pricerange', 'type', 'name'}, 'book': {'pre_invalid', 'stay', 'day', 'invalid', 'people'}, 'fail_book': {'stay', 'day'}, 'reqt': {'stars', 'area', 'postcode', 'phone', 'parking', 'pricerange', 'internet', 'type', 'address'}}

police
{'info': set(), 'reqt': {'postcode', 'phone', 'address'}, 'fail_info': set()}

restaurant
{'info': {'area', 'food', 'name', 'pricerange'}, 'reqt': {'food', 'area', 'postcode', 'phone', 'address', 'pricerange'}, 'fail_info': {'area', 'pricerange', 'name', 'food'}, 'book': {'pre_invalid', 'day', 'time', 'invalid', 'people'}, 'fail_book': {'day', 'time'}}

taxi
{'info'

### Reorganize data
merge data.json and dialog_acts.json

In [16]:
sessions_key = list(map(lambda x: x.split('.')[0],data.keys()))
len(sessions_key)

10438

In [17]:
dialog_acts[sessions_key[0]]

{'1': {'Hotel-Request': [['Area', '?']]},
 '2': {'Booking-Inform': [['none', 'none']],
  'Hotel-Inform': [['Price', 'cheap'], ['Choice', '1'], ['Parking', 'none']]},
 '3': {'Booking-NoBook': [['Day', 'Tuesday']],
  'Booking-Request': [['Stay', '?'], ['Day', '?']]},
 '4': {'Booking-Book': [['Ref', '7GAWK763']],
  'general-reqmore': [['none', 'none']]},
 '5': {'general-bye': [['none', 'none']]}}

In [18]:
data[sessions_key[0]+'.json']

{'goal': {'attraction': {},
  'hospital': {},
  'hotel': {'book': {'day': 'tuesday',
    'invalid': False,
    'people': '6',
    'pre_invalid': True,
    'stay': '2'},
   'fail_book': {'stay': '3'},
   'fail_info': {},
   'info': {'internet': 'yes',
    'parking': 'yes',
    'pricerange': 'cheap',
    'type': 'hotel'}},
  'message': ["You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>cheap</span> price range and should be in the type of <span class='emphasis'>hotel</span>",
   "The hotel should <span class='emphasis'>include free parking</span> and should <span class='emphasis'>include free wifi</span>",
   "Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>6 people</span> and <span class='emphasis'>3 nights</span> starting from <span class='emphasis'>tuesday</span>",
   "If the booking fails how about <span class='emphasis'>2 nights</span>",
   "Make sure you get the 

In [19]:
all_data = {}
for session in sessions_key:
    all_data[session] = data[session+'.json']
    if len(all_data[session]['log'])-len(dialog_acts[session])*2>1:
        # some annotation are incomplete
        all_data.pop(session)
        continue
    for i,turn in enumerate(all_data[session]['log']):
        if i%2==0:
            turn['dialog_act'] = {}
        else:
            turn['dialog_act'] = dialog_acts[session]['%d'%((i+1)/2)]

In [20]:
# example
all_data[sessions_key[0]]

{'goal': {'attraction': {},
  'hospital': {},
  'hotel': {'book': {'day': 'tuesday',
    'invalid': False,
    'people': '6',
    'pre_invalid': True,
    'stay': '2'},
   'fail_book': {'stay': '3'},
   'fail_info': {},
   'info': {'internet': 'yes',
    'parking': 'yes',
    'pricerange': 'cheap',
    'type': 'hotel'}},
  'message': ["You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>cheap</span> price range and should be in the type of <span class='emphasis'>hotel</span>",
   "The hotel should <span class='emphasis'>include free parking</span> and should <span class='emphasis'>include free wifi</span>",
   "Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>6 people</span> and <span class='emphasis'>3 nights</span> starting from <span class='emphasis'>tuesday</span>",
   "If the booking fails how about <span class='emphasis'>2 nights</span>",
   "Make sure you get the 

### Automatic annotate user dialog
- Inform
- Request
- General

#### Inform
Infer from dialog state table change. 

In [21]:
# empty initial state
init_state = {'taxi': {'book': {'booked': []}, 'semi': {'leaveAt': '', 'destination': '', 'departure': '', 'arriveBy': ''}}, 'police': {'book': {'booked': []}, 'semi': {}}, 'restaurant': {'book': {'booked': [], 'time': '', 'day': '', 'people': ''}, 'semi': {'food': '', 'pricerange': '', 'name': '', 'area': ''}}, 'hospital': {'book': {'booked': []}, 'semi': {'department': ''}}, 'hotel': {'book': {'booked': [], 'stay': '', 'day': '', 'people': ''}, 'semi': {'name': '', 'area': '', 'parking': '', 'pricerange': '', 'stars': '', 'internet': '', 'type': ''}}, 'attraction': {'book': {'booked': []}, 'semi': {'type': '', 'name': '', 'area': ''}}, 'train': {'book': {'booked': [], 'people': '', 'ticket': ''}, 'semi': {'leaveAt': '', 'destination': '', 'day': '', 'arriveBy': '', 'departure': ''}}}
init_state

{'attraction': {'book': {'booked': []},
  'semi': {'area': '', 'name': '', 'type': ''}},
 'hospital': {'book': {'booked': []}, 'semi': {'department': ''}},
 'hotel': {'book': {'booked': [], 'day': '', 'people': '', 'stay': ''},
  'semi': {'area': '',
   'internet': '',
   'name': '',
   'parking': '',
   'pricerange': '',
   'stars': '',
   'type': ''}},
 'police': {'book': {'booked': []}, 'semi': {}},
 'restaurant': {'book': {'booked': [], 'day': '', 'people': '', 'time': ''},
  'semi': {'area': '', 'food': '', 'name': '', 'pricerange': ''}},
 'taxi': {'book': {'booked': []},
  'semi': {'arriveBy': '', 'departure': '', 'destination': '', 'leaveAt': ''}},
 'train': {'book': {'booked': [], 'people': '', 'ticket': ''},
  'semi': {'arriveBy': '',
   'day': '',
   'departure': '',
   'destination': '',
   'leaveAt': ''}}}

In [68]:
# translate attribute in state table to slot
attribute2slot = {
    'area': 'Area',
    'type': 'Type',
    'name': 'Name',
    'internet': 'Internet',
    'parking': 'Parking',
    'stars': 'Stars',
    'pricerange': 'Price',
    'food': 'Food',
    'arriveBy': 'Arrive',
    'leaveAt': 'Leave',
    'departure': 'Depart',
    'destination': 'Dest',
    'department': 'Department',
    'day': 'Day',
    'people': 'People',
    'stay': 'Stay',
    'time': 'Time',
    'car type': 'Car',
    'phone': 'Phone',
    'postcode': 'Post',
    'address': 'Addr',
    'entrance fee': 'Fee',
    'price': 'Price',
    'duration': 'Duration',
    'trainID': 'TrainID',
    'ref': 'Ref'
}
slot2attribute = {v:k for k,v in attribute2slot.items()}

digit2word = {
    '0': 'zero',
    '1': 'one',
    '2': 'two',
    '3': 'three',
    '4': 'four',
    '5': 'five',
    '6': 'six',
    '7': 'seven',
    '8': 'eight',
    '9': 'nine',
    '10': 'ten'
}
# user da(dict)-slot(set)
user_da_slot = {}
domains = []
for domain, subgoal in user_goals.items():
    domains.append(domain)
    user_da_slot[domain.capitalize()+'-Inform'] = set()
    for slot in subgoal['info']:
        user_da_slot[domain.capitalize()+'-Inform'].add(slot)
    if 'book' in subgoal:
        for slot in subgoal['book']:
            if slot not in ['invalid', 'pre_invalid']:
                user_da_slot[domain.capitalize()+'-Inform'].add(slot)
    
    user_da_slot[domain.capitalize()+'-Request'] = set()
    for slot in subgoal['reqt']:
        user_da_slot[domain.capitalize()+'-Request'].add(slot)
print(domains)
pprint.pprint(user_da_slot)

['taxi', 'police', 'hospital', 'hotel', 'attraction', 'train', 'restaurant']
{'Attraction-Inform': {'area', 'name', 'type'},
 'Attraction-Request': {'address',
                        'area',
                        'entrance fee',
                        'phone',
                        'postcode',
                        'type'},
 'Hospital-Inform': {'department'},
 'Hospital-Request': {'postcode', 'address', 'phone'},
 'Hotel-Inform': {'area',
                  'day',
                  'internet',
                  'name',
                  'parking',
                  'people',
                  'pricerange',
                  'stars',
                  'stay',
                  'type'},
 'Hotel-Request': {'address',
                   'area',
                   'internet',
                   'parking',
                   'phone',
                   'postcode',
                   'pricerange',
                   'stars',
                   'type'},
 'Police-Inform': set(),
 'Police

In [23]:
sample = all_data[sessions_key[0]]
sample

{'goal': {'attraction': {},
  'hospital': {},
  'hotel': {'book': {'day': 'tuesday',
    'invalid': False,
    'people': '6',
    'pre_invalid': True,
    'stay': '2'},
   'fail_book': {'stay': '3'},
   'fail_info': {},
   'info': {'internet': 'yes',
    'parking': 'yes',
    'pricerange': 'cheap',
    'type': 'hotel'}},
  'message': ["You are looking for a <span class='emphasis'>place to stay</span>. The hotel should be in the <span class='emphasis'>cheap</span> price range and should be in the type of <span class='emphasis'>hotel</span>",
   "The hotel should <span class='emphasis'>include free parking</span> and should <span class='emphasis'>include free wifi</span>",
   "Once you find the <span class='emphasis'>hotel</span> you want to book it for <span class='emphasis'>6 people</span> and <span class='emphasis'>3 nights</span> starting from <span class='emphasis'>tuesday</span>",
   "If the booking fails how about <span class='emphasis'>2 nights</span>",
   "Make sure you get the 

In [24]:
def dict_diff(dict1,dict2):
    # compare two dict
    # two exceptions:
    # 1) 'bus' domain unuse
    # 2) 'ticket' and 'people' attr for 'train-book' domain may be missing
    diff_dict = {}
    for k,v2 in dict2.items():
        if k in dict1:
            assert type(v2)==type(dict1[k])
            v1 = dict1[k]
            if v1 != v2:
                if type(v2) != type({}):
                    diff_dict[k] = v2
                else:
                    if dict_diff(v1,v2)!={}:
                        diff_dict[k] = dict_diff(v1,v2)
        else:
            if k!='bus':
                assert k=='people'
                # people attribute for train domain
                if v2!='':
                    diff_dict[k] = v2
    return diff_dict

In [90]:
da2usersen = {}
cnt0 = 0
cnt1 = 0
cnt2 = 0
cnt3 = 0
cnt4 = 0
for no,session in all_data.items():
    usersen = [session['log'][i]['text'] for i in range(0,len(session['log'])-1,2)]
    
    user_das = []
    user_goal = session['goal']
    for i in range(1,len(session['log']),2):
        prev_state = init_state if i==1 else session['log'][i-2]['metadata']
        next_state = session['log'][i]['metadata']
        prev_utterance = '' if i==1 else session['log'][i-2]['text']
        next_utterance = session['log'][i]['text']
        user_utterance = session['log'][i-1]['text']
        if i==1 or 'dialog_act' not in session['log'][i-2]:
            prev_da = {}
        else:
            prev_da = session['log'][i-2]['dialog_act']
        next_da = session['log'][i]['dialog_act']
        diff_table = dict_diff(prev_state, next_state)
        
        # user da annotate, Inform
        da = {}
        for domain in domains:
            if len(user_goal[domain])!=0:
                da[domain.capitalize()+'-Inform'] = []
                for slot in user_da_slot[domain.capitalize()+'-Inform']:
                    value_state = ''
                    if domain in diff_table:
                        for subtable in diff_table[domain].values():
                            if slot in subtable and subtable[slot] != 'not mentioned':
                                value_state = subtable[slot]                    
                    # state for that slot change
                    
                    value_goal = ''
                    if slot in user_goal[domain]['info']:
                        value_goal = user_goal[domain]['info'][slot]
                    elif 'book' in user_goal[domain] and slot in user_goal[domain]['book']:
                        value_goal = user_goal[domain]['book'][slot]
                    # slot-value appear in goal
                    
                    if value_state!='':
                        # state change
                        if value_state.lower() in user_utterance.lower():
                            # value in user utterance
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot.lower() in user_utterance.lower():
                            # slot in user utterance
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot=='people' and 'one person' in user_utterance.lower():
                            # keyword 'person' for people
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],"1"])
                        elif slot=='stay' and 'night' in user_utterance.lower():
                            # keyword 'night' for stay
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot=='internet' and 'wifi' in user_utterance.lower():
                            # alias 'wifi' for internet
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot=='pricerange' and 'price' in user_utterance.lower():
                            # alias 'price' for pricerange
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot=='arriveBy' and 'arrive' in user_utterance.lower():
                            # alias 'arrive' for arriveBy
                            if value_state == value_goal:
                                da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot=='leaveAt' and 'leave' in user_utterance.lower():
                            # alias 'leave' for leaveAt
                            if value_state != value_goal:
                                da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif slot=='name' and (value_state.lower() in prev_utterance.lower()):
                            # coreference
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif domain.capitalize()+'-Request' in prev_da and [attribute2slot[slot],'?'] in prev_da[domain.capitalize()+'-Request']:
                            # answer system request
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif value_goal!='' and value_goal.lower() in user_utterance.lower():
                            # wrong state update
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                        elif value_state in digit2word and (' '+digit2word[value_state]) in user_utterance.lower():
                            # alias for digital value
                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_state])
                    elif value_goal!='':
                        # state don't change but value is in goal
                        if value_goal=='yes' or value_goal=='no':
                            # binary value
                            if '?' not in user_utterance:
                                # not for acknowledgement
                                if slot.lower() in user_utterance.lower():
                                    da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                elif slot=='internet' and 'wifi' in user_utterance.lower():
                                    da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                        elif value_goal.isdigit():
                            # digital value
                            if (' '+value_goal.lower()+' ') in user_utterance.lower() or (' '+digit2word[value_goal]) in user_utterance.lower():
                                if slot=='stars':
                                    if 'stars' in user_utterance.lower():
                                        da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                    elif 'star' in user_utterance.lower() and 'start' not in user_utterance.lower():
                                        da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                elif slot=='stay':
                                    if 'stay' in user_utterance.lower():
                                        da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                    elif 'night' in user_utterance.lower():
                                        da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                elif slot=='people':
                                    if 'people' in user_utterance.lower() or 'person' in user_utterance.lower():
                                        if domain in user_utterance.lower() or domain in prev_utterance.lower():
                                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                    elif 'ticket' in user_utterance.lower():
                                        if domain=='train':
                                            da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                                else:
                                    assert 0
                        elif value_goal in user_utterance.lower():
                            # string value
                            if domain in user_utterance.lower() or domain in prev_utterance.lower():
                                da[domain.capitalize()+'-Inform'].append([attribute2slot[slot],value_goal])
                    
                if len(da[domain.capitalize()+'-Inform'])==0:
                    da.pop(domain.capitalize()+'-Inform')

        # Request
        for domain in domains:
            if len(user_goal[domain])!=0:
                da[domain.capitalize()+'-Request'] = []
                slots = user_da_slot[domain.capitalize()+'-Request']
                for slot in slots:
                    # for each possible request in goal
                    if 'reqt' in user_goal[domain] and slot in user_goal[domain]['reqt']:
                        # if actually in request goal
                        if slot.lower() in user_utterance:
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])                              
                        elif slot=='internet' and 'wifi' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='postcode' and 'post' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='pricerange' and 'price' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='trainID' and ' id' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='arriveBy' and 'arrive' in user_utterance.lower():
                            if 'when' in user_utterance.lower() or 'what time' in user_utterance.lower():
                                da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='leaveAt' and 'leave' in user_utterance.lower():
                            if 'when' in user_utterance.lower() or 'what time' in user_utterance.lower():
                                da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='duration' and 'travel time' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='arriveBy' and 'arrival time' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
                        elif slot=='leaveAt' and 'departure time' in user_utterance.lower():
                            da[domain.capitalize()+'-Request'].append([attribute2slot[slot],'?'])
#                             print(user_utterance)
                
                # for request not in goal
                if ' ref' in user_utterance.lower():
                    if 'book' in user_goal[domain] and len(user_goal[domain]['book'])>0:
                        da[domain.capitalize()+'-Request'].append(['Ref','?'])

                        
                if len(da[domain.capitalize()+'-Request'])==0:
                    da.pop(domain.capitalize()+'-Request')
                                
        # Clarify the domain of request slot (for address, postcode, area, phone,...)
        slot2domain = {}
#         print(user_utterance)
        for domain_da, svs in da.items():
            if 'Request' in domain_da:
                for (s,v) in svs:
                    slot2domain.setdefault(s,[])
                    slot2domain[s].append(domain_da.split('-')[0])
#         print(slot2domain)
        for s,d in slot2domain.items():
            if len(d)>1:
                # several request for same slot
                # note that in data no slot alias appear twice
                if len(re.findall(s, user_utterance))<=1:
                    # for slot don't appear twice
                    system_ack = []
                    for each in d:
                        if each+'-Inform' in next_da:
                            if s in map(lambda x:x[0],next_da[each+'-Inform']):
                                # system Inform the value of slot:
                                system_ack.append(each)
                        elif each+'-Recommend' in next_da:
                            if s in map(lambda x:x[0],next_da[each+'-Recommend']):
                                # system Recommend the value of slot:
                                system_ack.append(each)
                            
                    if len(system_ack)==0:
                        # not informed or recommended by system, abort. 227 samples
                        for each in d:
                            for request_slot_value in da[each+'-Request']:
                                if s==request_slot_value[0]:
                                    da[each+'-Request'].remove(request_slot_value)
                            if len(da[each+'-Request'])==0:
                                da.pop(each+'-Request')
                    elif len(system_ack)==1:
                        # one of domain informed or recommended by system. 1441 samples
                        for each in d:
                            if each in system_ack:
                                continue
                            for request_slot_value in da[each+'-Request']:
                                if s==request_slot_value[0]:
                                    da[each+'-Request'].remove(request_slot_value)
                            if len(da[each+'-Request'])==0:
                                da.pop(each+'-Request')
                    elif len(system_ack)==2:
                        # two of domain informed or recommended by system. 3 samples
                        pass
                    else:
                        # no >2 sample
                        assert 0
                        
        # General
        if len(da)==0:
            for domain in domains:
                if domain in user_utterance.lower():
                    da.setdefault(domain.capitalize()+'-Inform',[])
                    da[domain.capitalize()+'-Inform'].append(['none','none'])
#                     cnt3+=1
        if len(da)==0:
            if 'bye' in user_utterance.lower():
#                 cnt0+=1
                da['general-bye'] = [['none','none']]
#                 print(user_utterance)
            elif 'thank' in user_utterance.lower():
#                 cnt1+=1
                da['general-thank'] = [['none','none']]
#                 print(user_utterance)
            elif sum([1 if x in user_utterance.lower() else 0 for x in ['hello','hi ']])>0:
#                 cnt2+=1
                da['general-greet'] = [['none','none']]
#                 print(user_utterance)
            
        
        user_das.append(da)
        
        if len(da)==0:
            cnt3+=1
#             print(user_utterance)
        else:
            cnt4+=1
        
#         if no=='PMUL1635':
#             print(user_utterance)
#             print(da)
#             print(next_da)
#             print(diff_table)
    
    assert len(usersen) == len(user_das)
    for j,user_da in enumerate(user_das):
        session['log'][j*2]['dialog_act'] = user_das[j]
    for da,sen in zip(user_das,usersen):
        da2usersen.setdefault(tuple(da),[])
        da2usersen[tuple(da)].append(sen)
print(cnt0,cnt1,cnt2,cnt3,cnt4)

0 0 0 7091 64396


In [670]:
8068+63419

71487

In [115]:
# da statistics
cnt = [0]*10
user_da2slot2val = {}
sys_da2slot2val = {}
for no,session in all_data.items():
    for i in range(0,len(session['log']),2):
        user_da = session['log'][i]['dialog_act']
        for da,svs in user_da.items():
            user_da2slot2val.setdefault(da,{})
            for s,v in svs:
                user_da2slot2val[da].setdefault(s,[])
                user_da2slot2val[da][s].append(v)
    for i in range(1,len(session['log']),2):
        sys_da = session['log'][i]['dialog_act']
        if type(sys_da)!=type({}):
            assert sys_da=='No Annotation'
            session['log'][i]['dialog_act'] = {}
            continue
        for da,svs in sys_da.items():
            sys_da2slot2val.setdefault(da,{})
            for s,v in svs:
                sys_da2slot2val[da].setdefault(s,[])
                sys_da2slot2val[da][s].append(v)
print('User Dialog Act:')
for k,v in sorted(user_da2slot2val.items()):
    print(k)
    for s in sorted(v.keys()):
        print('\t',len(v[s]),'\t',s)
print()
print('System Dialog Act:')
for k,v in sorted(sys_da2slot2val.items()):
    print(k)
    for s in sorted(v.keys()):
        print('\t',len(v[s]),'\t',s)

User Dialog Act:
Attraction-Inform
	 2084 	 Area
	 1447 	 Name
	 2477 	 Type
	 269 	 none
Attraction-Request
	 1236 	 Addr
	 379 	 Area
	 1109 	 Fee
	 1314 	 Phone
	 1401 	 Post
	 258 	 Type
Hospital-Inform
	 95 	 Department
	 231 	 none
Hospital-Request
	 101 	 Addr
	 123 	 Phone
	 150 	 Post
Hotel-Inform
	 2241 	 Area
	 2513 	 Day
	 1808 	 Internet
	 1996 	 Name
	 1894 	 Parking
	 2373 	 People
	 2375 	 Price
	 2335 	 Stars
	 3161 	 Stay
	 2731 	 Type
	 642 	 none
Hotel-Request
	 518 	 Addr
	 190 	 Area
	 235 	 Internet
	 228 	 Parking
	 478 	 Phone
	 511 	 Post
	 243 	 Price
	 453 	 Ref
	 61 	 Stars
	 141 	 Type
Police-Inform
	 161 	 none
Police-Request
	 101 	 Addr
	 53 	 Phone
	 95 	 Post
Restaurant-Inform
	 3225 	 Area
	 2582 	 Day
	 4661 	 Food
	 1790 	 Name
	 2489 	 People
	 3434 	 Price
	 3007 	 Time
	 855 	 none
Restaurant-Request
	 1178 	 Addr
	 153 	 Area
	 138 	 Food
	 1156 	 Phone
	 832 	 Post
	 186 	 Price
	 515 	 Ref
Taxi-Inform
	 638 	 Arrive
	 887 	 Depart
	 831 	 Des

In [269]:
# span annotate
cnt = [0]*10
span_da = {}
no_span_sentence = []
for no,session in all_data.items():
    for i in range(0,len(session['log']),1):
        user_da = session['log'][i]['dialog_act']
        user_utterance = session['log'][i]['text']
        span_info = []
        for da,svs in user_da.items():
            if 'Inform' in da:
                span_da.setdefault(da,{})
                for s,v in svs:
                    if s!='Internet' and s!='Parking' and s!='none':
                        cnt[-1]+=1
                        is_annotated = False
                        span_da[da].setdefault(s,set([]))
                        span_da[da][s].add(v)
                        if v.lower() in user_utterance.lower():
                            # mentioned explicitly
                            cnt[0]+=1
                            is_annotated = True
                            char_index_begin = user_utterance.lower().index(v.lower())
                            char_index_end = char_index_begin+len(v)
                            word_index_begin = len(user_utterance[:char_index_begin].split())
                            word_index_end = len(user_utterance[:char_index_end].split())-1
                            span_info.append((da,s,v,word_index_begin,word_index_end))
                        elif v in digit2word and digit2word[v] in user_utterance.lower():
                            # digit alias
                            cnt[1]+=1
                            is_annotated = True
                            char_index_begin = user_utterance.lower().index(digit2word[v].lower())
                            char_index_end = char_index_begin+len(digit2word[v])
                            word_index_begin = len(user_utterance[:char_index_begin].split())
                            word_index_end = len(user_utterance[:char_index_end].split())-1
                            span_info.append((da,s,v,word_index_begin,word_index_end))
                        elif s=='Name':
                            # coreference
                            cnt[2]+=1
                        elif 'same' in user_utterance.lower() and s.lower() in user_utterance.lower():
                            # coreference-'same'
                            if 'same '+s.lower() in user_utterance.lower():
                                cnt[3]+=1
                                is_annotated = True
                                assert len(s.split())==1
                                char_index_begin = user_utterance.lower().index('same '+s.lower())
                                char_index_end = char_index_begin+len('same '+s.lower())
                                word_index_begin = len(user_utterance[:char_index_begin].split())
                                word_index_end = len(user_utterance[:char_index_end].split())-1
                                span_info.append((da,s,v,word_index_begin,word_index_end))
                            elif s=='People':
                                cnt[3]+=1
                                is_annotated = True
                                if  'same group of people' in user_utterance.lower():
                                    pattern = 'same group of people'
                                elif 'same number of people' in user_utterance.lower():
                                    pattern = 'same number of people'
                                elif 'same amount of people' in user_utterance.lower():
                                    pattern = 'same amount of people'
                                elif 'same quantity of people' in user_utterance.lower():
                                    pattern = 'same quantity of people'
                                else:
                                    assert 0
                                char_index_begin = user_utterance.lower().index(pattern)
                                word_index_begin = len(user_utterance[:char_index_begin].split())
                                span_info.append((da,s,v,word_index_begin,word_index_begin+3))
                            else:
                                char_index_begin = user_utterance.lower().index('same')
                                word_index_begin = len(user_utterance[:char_index_begin].split())
                                shift = len(user_utterance[:user_utterance.lower().index(s.lower())].split())-word_index_begin
                                if 0<shift<=3:
                                    cnt[3]+=1
                                    is_annotated = True
                                    span_info.append((da,s,v,word_index_begin,word_index_begin+shift))
                            
                        elif 'care' in v:
                            # value: don't care 
                            key_phrases = ["not particular", "no particular", "any ","not really", "don't matter", "don't care", "do not care", "don't really care", "dont care", "doesn't matter", "doesnt matter", "don't have a preference", "do not have a preference", "doesn't really matter", "does not matter"]
                            flag = 0
                            for key_phrase in key_phrases:
                                if key_phrase in user_utterance.lower():
                                    flag=1
                                    break
                            if flag==1:
                                char_index_begin = user_utterance.lower().index(key_phrase)
                                word_index_begin = len(user_utterance[:char_index_begin].split())
                                span_info.append((da,s,v,word_index_begin,word_index_begin+len(key_phrase.split())-1))
                                cnt[4]+=1
                                is_annotated = True
                        elif ':' in v and ':' in user_utterance.lower():
                            # time value
                            char_index_begin = user_utterance.lower().index(':')
                            word_index_begin = len(user_utterance[:char_index_begin].split())-1
                            if user_utterance.split()[word_index_begin-1]=='after' or user_utterance.split()[word_index_begin-1]=='before':
                                span_info.append((da,s,v,word_index_begin-1,word_index_begin))
                            else:
                                span_info.append((da,s,v,word_index_begin,word_index_begin))
                            cnt[5]+=1
                            is_annotated = True
                        elif v=='centre' and 'center' in user_utterance.lower():
                            char_index_begin = user_utterance.lower().index('center')
                            word_index_begin = len(user_utterance[:char_index_begin].split())
                            span_info.append((da,s,v,word_index_begin,word_index_begin))
                            cnt[6]+=1
                            is_annotated = True
                        else:
                            cnt[7]+=1
                        if is_annotated:
                            cnt[-2]+=1
                        else:
                            no_span_sentence.append((da,s,v,user_utterance))
        session['log'][i]['span_info'] = span_info
print(cnt)

[142535, 1428, 2525, 1002, 285, 358, 57, 1538, 145665, 149890]


In [273]:
no_span_sentence

[('Hotel-Inform',
  'Price',
  'dont care',
  'no, but it should have free parking, please.'),
 ('Restaurant-Inform',
  'Name',
  'meze bar restaurant',
  'Can I have the phone number, address and postcode for the restaurant please?'),
 ('Attraction-Inform',
  'Name',
  'funky fun house',
  'Could i also get the phone number and postcode?'),
 ('Attraction-Inform',
  'Name',
  "christ's college",
  'Can I please get their phone number and postcode? '),
 ('Restaurant-Inform',
  'Name',
  'the good luck chinese food takeaway',
  'Yes, that sounds good! Book a table for 3 at 13:45 please. '),
 ('Train-Inform', 'Dest', 'cambridge', 'I will be heading to cabridge'),
 ('Attraction-Inform',
  'Name',
  'the cherry hinton village centre',
  'I would like to get the phone number, please.'),
 ('Restaurant-Inform',
  'Name',
  'the gardenia',
  'Yes, please book a table for 1 people at 19:15 on monday.'),
 ('Hotel-Inform', 'Name', 'el shaddai', 'Yes, can you book the room for me?'),
 ('Train-Infor

In [91]:
for k,v in da2usersen.items():
    print(k,'\t',len(v))
    for sen in v[:3]:
        print('\t',sen)

('Hotel-Inform',) 	 11515
	 am looking for a place to to stay that has cheap price range it should be in a type of hotel
	 no, i just need to make sure it's cheap. oh, and i need parking
	 Yes, please. 6 people 3 nights starting on tuesday.
('general-bye',) 	 2714
	 No, that will be all. Good bye.
	 You were great. Goodbye.
	 Thank you very much, goodbye.
('Police-Inform',) 	 160
	 Hello, I have been robbed.  Can you please help me get in touch with the police?
	 Can you tell me the address to the police station in Cambridge?
	 Hi, I would like to find Parkside Police Station please.
('Police-Request',) 	 181
	 Can I please have the postcode as well?
	 Was Parkside the address of the police station? If not, can I have the address please?
	 I need the address including postcode of the police station and also the phone number.
('general-thank',) 	 9026
	 Thank you that will be all for now.
	 Thank you for all the help! I appreciate it.
	 Thank you. That's all I needed.
() 	 7091
	 Could 

In [271]:
145665/149890

0.9718126626192541

In [270]:
sum([142535, 1428, 1002, 285, 358, 57])

145665

In [277]:
json.dump(all_data,open('annotated_user_da_with_span_full.json','w'),indent=4)

In [276]:
json.dump(dict(list(all_data.items())[:100]),open('annotated_user_da_with_span_100sample.json','w'),indent=4)

In [275]:
json.dump(no_span_sentence,open('no_span_sentence.json','w'),indent=4)

In [274]:
149890-145665

4225