In [1]:
import os,sys
import csv
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Constants
# Attribute names of a meaning representation (MR), in canonical order.
MR_FIELDS = [
    "name",
    "familyFriendly",
    "eatType",
    "food",
    "priceRange",
    "near",
    "area",
    "customer rating",
]
# Attribute name -> its position in MR_FIELDS.
MR_KEYMAP = {field: position for position, field in enumerate(MR_FIELDS)}
MR_KEY_NUM = len(MR_KEYMAP)

# Mappings for delexicalization
NAME_TOKEN = '<name>'  # placeholder substituted for the 'name' value
NEAR_TOKEN = '<near>'  # placeholder substituted for the 'near' value
PAD_TOKEN = '<blank>'
PAD_ID = 0

# Regex pattern for tokenization: the capture group keeps each matched
# punctuation character as a separate piece when used with .split().
_WORD_SPLIT = re.compile(r"([.,!?\"':;)(])")

# Misc
# Captures the text between the brackets of a `food[...]` item.
FOOD_TYPE_PAT = re.compile(r'food\[(.*?)\]')
# NOTE(review): presumably surface words treated as Japanese cuisine — confirm against usage.
JAP_CUISINE = {'sushi', 'sushi.', 'asian', 'oriental'}

In [2]:
# Enable matplotlib's interactive support with the default GUI backend
# (the recorded run reported TkAgg).
%matplotlib
# Raise pandas display limits so long sequences and wide text cells are
# shown with less truncation.
pd.options.display.max_seq_items = 2000
pd.set_option('max_colwidth', 200)

Using matplotlib backend: TkAgg


In [3]:
def process_e2e_text(s):
    """Tokenize an E2E Challenge reference text.

    The text is stripped and split on whitespace; every chunk is then split
    around the punctuation characters matched by ``_WORD_SPLIT`` (the
    punctuation itself is kept as its own token). Empty pieces are dropped.

    Args:
        s: the reference text as a string.

    Returns:
        A list of token strings.
    """
    return [
        piece
        for chunk in s.strip().split()
        for piece in _WORD_SPLIT.split(chunk)
        if piece
    ]

In [4]:
def process_e2e_mr(s):
    """Parse an E2E Challenge meaning representation (MR) string.

    The input is a ", "-separated list of ``key[value]`` items. The result
    maps every attribute in ``MR_FIELDS`` to its value, or to ``None`` when
    the attribute does not occur in the MR.

    Args:
        s: the MR string.

    Returns:
        A dict keyed by the entries of ``MR_FIELDS``.

    Raises:
        KeyError: if an item's key is not in ``MR_KEYMAP``.
        ValueError: if an item does not split into exactly two parts on "[".
    """
    slots = [None for _ in range(MR_KEY_NUM)]
    for attribute in s.split(", "):
        key, bracketed = attribute.split("[")
        # Drop the last character of the value part (the closing "]").
        slots[MR_KEYMAP[key]] = bracketed[:-1]
    return {field: value for field, value in zip(MR_FIELDS, slots)}

In [5]:
def process_e2e_mr_delex(s):
    """Parse an E2E Challenge MR string with 'name' and 'near' delexicalized.

    Works like a plain MR parse (", "-separated ``key[value]`` items mapped
    onto the attributes in ``MR_FIELDS``), except that the values of the
    'name' and 'near' attributes are replaced by ``NAME_TOKEN`` and
    ``NEAR_TOKEN`` respectively. The original (lexicalized) values are not
    returned.

    Args:
        s: the MR string.

    Returns:
        A dict keyed by the entries of ``MR_FIELDS``; attributes absent from
        the MR map to ``None``.

    Raises:
        KeyError: if an item's key is not in ``MR_KEYMAP``.
        ValueError: if an item does not split into exactly two parts on "[".
    """
    mr_data = [None] * MR_KEY_NUM
    for item in s.split(", "):
        key, raw_val = item.split("[")
        # Delexicalization: swap the concrete value for a placeholder token.
        if key == 'name':
            mr_val = NAME_TOKEN
        elif key == 'near':
            mr_val = NEAR_TOKEN
        else:
            # Drop the last character of the value part (the closing "]").
            mr_val = raw_val[:-1]
        mr_data[MR_KEYMAP[key]] = mr_val
    return dict(zip(MR_FIELDS, mr_data))


def cnt_bins_and_cnts(lens=None, lengths_to_consider=(0, 10, 20, 30, 40, 50, 60, 70, 80)):
    """Count how many lengths fall into each half-open bin ``(low, high]``.

    Args:
        lens: iterable of lengths to bin. When omitted, the notebook-level
            ``references_lens`` list is used, which keeps the original
            zero-argument call working.
        lengths_to_consider: ascending bin edges; consecutive pairs form the
            bins.

    Returns:
        A tuple ``(bins, cnts)`` where ``bins`` is a list of ``(low, high)``
        tuples and ``cnts[i]`` is the number of lengths with
        ``low < length <= high`` for ``bins[i]``. Lengths outside every bin
        (e.g. 0, or above the last edge) are not counted.
    """
    if lens is None:
        # Backward-compatible fallback to the notebook global.
        lens = references_lens
    edges = list(lengths_to_consider)
    bins = list(zip(edges[:-1], edges[1:]))
    cnts = [0] * len(bins)
    for length in lens:
        for bin_idx, (low, high) in enumerate(bins):
            if low < length <= high:
                cnts[bin_idx] += 1
                break
    return (bins, cnts)

def plot_len_hist(lens, fname):
    """Plot a histogram of reference lengths and save it to a PDF file.

    The mean, standard deviation, minimum and maximum of ``lens`` are
    annotated on the figure.

    Args:
        lens: non-empty sequence of reference lengths.
        fname: path of the PDF file to write.

    Raises:
        ValueError: if ``lens`` is empty.
    """
    if len(lens) == 0:
        raise ValueError("lens must contain at least one length")

    # Statistics are computed from the `lens` argument itself, so the
    # annotation always matches the data being plotted.
    lens_series = pd.Series(lens)
    mean = float(lens_series.mean())
    std = float(lens_series.std())
    min_len = int(lens_series.min())
    max_len = int(lens_series.max())

    # The context manager closes the PDF even if plotting raises.
    with PdfPages(fname) as pp:
        # plot the histogram of the length distribution
        plt.hist(lens, 20, facecolor='b', alpha=0.55)
        plt.xlabel('Sentence Length')
        plt.ylabel('Number of sentences')
        plt.title('Sentence length distribution')
        plt.axis([0, 80, 0, 10000])

        plt.text(40, 7500, r'$mean={:.2f},\ std={:.2f}$'.format(mean, std))
        plt.text(40, 6800, r'$min={},\ max={}$'.format(min_len, max_len))

        plt.grid(True)
        plt.tight_layout()
        # Save before showing: some backends discard the figure once
        # plt.show() returns, which would leave an empty page in the PDF.
        pp.savefig()
    plt.show()

In [7]:
# Input/output locations and the raw training data.
FNAME = os.path.abspath("e2e_dataset/trainset.csv") # absolute path of the training data CSV
DATA_LEN_HIST_FN = 'data_len_hist.pdf' # file name for the length-distribution histogram
data = pd.read_csv(FNAME,dtype=str) # pandas DataFrame holding the data; every column is read as str

In [14]:
# Parse every MR string into a dict. Iterating the column directly avoids
# building a row Series per record with `.iloc`.
mr_dict = [process_e2e_mr(mr) for mr in data['mr']]
# Preview only the first few records instead of dumping the whole list.
mr_dict[:5]

[{'name': 'The Vaults',
  'familyFriendly': None,
  'eatType': 'pub',
  'food': None,
  'priceRange': 'more than £30',
  'near': 'Café Adriatic',
  'area': None,
  'customer rating': '5 out of 5'},
 {'name': 'The Cambridge Blue',
  'familyFriendly': None,
  'eatType': 'pub',
  'food': 'English',
  'priceRange': 'cheap',
  'near': 'Café Brazil',
  'area': None,
  'customer rating': None},
 {'name': 'The Eagle',
  'familyFriendly': 'yes',
  'eatType': 'coffee shop',
  'food': 'Japanese',
  'priceRange': 'less than £20',
  'near': 'Burger King',
  'area': 'riverside',
  'customer rating': 'low'},
 {'name': 'The Mill',
  'familyFriendly': None,
  'eatType': 'coffee shop',
  'food': 'French',
  'priceRange': '£20-25',
  'near': 'The Sorrento',
  'area': 'riverside',
  'customer rating': None},
 {'name': 'Loch Fyne',
  'familyFriendly': None,
  'eatType': None,
  'food': 'French',
  'priceRange': None,
  'near': 'The Rice Boat',
  'area': 'riverside',
  'customer rating': 'high'},
 {'name': 

In [16]:
# Tokenize every reference text. Iterating the column directly avoids
# building a row Series per record with `.iloc`.
ref_dict = [process_e2e_text(ref) for ref in data['ref']]
ref_dict[0]

['The',
 'Vaults',
 'pub',
 'near',
 'Café',
 'Adriatic',
 'has',
 'a',
 '5',
 'star',
 'rating',
 '.',
 'Prices',
 'start',
 'at',
 '£30',
 '.']

In [8]:
# One row per parsed MR, one column per MR attribute. Iterating the column
# directly avoids building a row Series per record with `.iloc`.
mr_df = pd.DataFrame([process_e2e_mr(mr) for mr in data['mr']])

In [9]:
# Summary statistics for every MR column (the recorded output shows
# count / unique / top / freq).
# NOTE(review): `exclude=[None]` is an unusual dtype filter — confirm it
# behaves as intended on the pandas version in use.
mr_df.describe(exclude=[None])

Unnamed: 0,name,familyFriendly,eatType,food,priceRange,near,area,customer rating
count,42061,26295,20111,35126,29127,20546,24716,28090
unique,34,2,3,7,6,19,2,6
top,The Rice Boat,yes,coffee shop,Japanese,moderate,Café Rouge,riverside,average
freq,2826,17564,10396,5996,5650,2052,16894,5495


In [10]:
# Tokenized reference texts, one token list per row of `data`. Iterating the
# column directly avoids building a row Series per record with `.iloc`.
references = [process_e2e_text(ref) for ref in data['ref']]

In [19]:
# Number of tokens in each tokenized reference; preview the first ten.
references_lens = list(map(len, references))
references_lens[:10]

[17, 26, 27, 29, 19, 30, 18, 30, 19, 27]

In [22]:
import matplotlib
# Set the default font size to 12 before drawing the histogram.
matplotlib.rcParams['font.size'] = 12
plot_len_hist(references_lens, DATA_LEN_HIST_FN)

In [13]:
# Show the distinct values observed for each MR attribute
for colname, column in mr_df.items():
    print(colname, ": ", column.unique())

name :  ['The Vaults' 'The Cambridge Blue' 'The Eagle' 'The Mill' 'Loch Fyne'
 'Bibimbap House' 'The Rice Boat' 'The Wrestlers' 'Aromi' 'The Phoenix'
 'Browns Cambridge' 'Taste of Cambridge' 'Cocum' 'The Dumpling Tree'
 'The Punter' 'The Golden Curry' 'Alimentum' 'Midsummer House'
 'Blue Spice' 'Strada' 'The Waterman' 'Zizzi' 'Green Man' 'Clowns'
 'Giraffe' 'The Olive Grove' 'The Twenty Two' 'The Cricketers' 'Wildwood'
 'The Golden Palace' 'The Plough' 'Cotto' 'Fitzbillies'
 'Travellers Rest Beefeater']
familyFriendly :  [None 'yes' 'no']
eatType :  ['pub' 'coffee shop' None 'restaurant']
food :  [None 'English' 'Japanese' 'French' 'Fast food' 'Italian' 'Indian'
 'Chinese']
priceRange :  ['more than £30' 'cheap' 'less than £20' '£20-25' None 'moderate' 'high']
near :  ['Café Adriatic' 'Café Brazil' 'Burger King' 'The Sorrento'
 'The Rice Boat' 'Clare Hall' None 'Raja Indian Cuisine' 'Café Rouge'
 'Yippee Noodle Bar' 'The Portland Arms' 'All Bar One'
 'Express by Holiday Inn' 'The Baker

In [23]:
# Display the MR DataFrame (the recorded output is truncated in the middle).
mr_df

Unnamed: 0,name,familyFriendly,eatType,food,priceRange,near,area,customer rating
0,The Vaults,,pub,,more than £30,Café Adriatic,,5 out of 5
1,The Cambridge Blue,,pub,English,cheap,Café Brazil,,
2,The Eagle,yes,coffee shop,Japanese,less than £20,Burger King,riverside,low
3,The Mill,,coffee shop,French,£20-25,The Sorrento,riverside,
4,Loch Fyne,,,French,,The Rice Boat,riverside,high
...,...,...,...,...,...,...,...,...
42056,The Rice Boat,yes,,Indian,cheap,Express by Holiday Inn,city centre,5 out of 5
42057,The Vaults,,restaurant,Chinese,,,,
42058,The Cambridge Blue,,restaurant,Italian,,,,high
42059,The Eagle,yes,coffee shop,Italian,less than £20,Burger King,riverside,low
