In [1]:
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

In [2]:
# Root of the project's data folder, resolved relative to the working directory.
DATA_PATH = Path("../data")
# Location of the raw Frames JSON export.
FRAMES_JSON_PATH = DATA_PATH / "raw" / "frames.json"

# Load the export into a DataFrame and summarise every column, numeric or not.
raw_data = pd.read_json(FRAMES_JSON_PATH)
raw_data.describe(include="all")

Unnamed: 0,user_id,turns,wizard_id,id,labels
count,1369,1369,1369,1369,1369
unique,11,1369,12,1369,16
top,U22K1SX9N,[{'text': 'I'd like to book a trip to Atlantis...,U21T9NMKM,e2c0fc6c-2134-4891-8353-ef16d8412c9a,"{'userSurveyRating': 5.0, 'wizardSurveyTaskSuc..."
freq,345,1,301,1,929


**1369 annotated dialogs** (composed of several turns) between a bot and a human user trying to **book a flight**

In [3]:
# Identifier columns kept as-is from the raw export.
raw = raw_data[["id", "wizard_id", "user_id"]]

# Survey answers pulled out of each row's `labels` dict, mapped to the dtype
# they are cast to once extracted.
survey_dtypes = {"userSurveyRating": "float", "wizardSurveyTaskSuccessful": "bool"}

frames = raw.copy()
for survey_field in survey_dtypes:
    # One value per dialogue, in the same row order as `raw_data`.
    frames[survey_field] = [labels[survey_field] for labels in raw_data.labels]
frames = frames.astype(survey_dtypes)

frames.describe(include="all")

Unnamed: 0,id,wizard_id,user_id,userSurveyRating,wizardSurveyTaskSuccessful
count,1369,1369,1369,1366.0,1369
unique,1369,12,11,,2
top,e2c0fc6c-2134-4891-8353-ef16d8412c9a,U21T9NMKM,U22K1SX9N,,True
freq,1,301,345,,1287
mean,,,,4.573419,
std,,,,0.839596,
min,,,,1.0,
25%,,,,4.0,
50%,,,,5.0,
75%,,,,5.0,


In [4]:
# Preview the first rows of `frames` (identifiers plus the two survey columns).
frames.head()

Unnamed: 0,id,wizard_id,user_id,userSurveyRating,wizardSurveyTaskSuccessful
0,e2c0fc6c-2134-4891-8353-ef16d8412c9a,U21DKG18C,U22HTHYNP,4.0,True
1,4a3bfa39-2c22-42c8-8694-32b4e34415e9,U21DMV0KA,U21E41CQP,3.0,True
2,6e67ed28-e94c-4fab-96b6-68569a92682f,U21E0179B,U21RP4FCY,2.0,False
3,5ae76e50-5b48-4166-9f6d-67aaabd7bcaa,U21DKG18C,U22HTHYNP,5.0,True
4,24603086-bb53-431e-a0d8-1dcc63518ba9,U21DMV0KA,U21E41CQP,5.0,True


In [5]:
# Exploration only: replay the first five turns of the first dialogue, printing
# each utterance followed by the slot values accumulated up to that point.
for dialogue in raw_data["turns"]:
    known_facts = {}
    for position, utterance in enumerate(dialogue[:5]):
        author, text = utterance["author"], utterance["text"]
        print(f'{position} - {author} says : \n"{text}"')

        for annotated_frame in utterance["labels"]["frames"]:
            for slot, mentions in annotated_frame["info"].items():
                # Only the last annotation of a slot counts; a negated one
                # resets the slot to None.
                latest = mentions[-1]
                known_facts[slot] = None if latest["negated"] else latest["val"]

        print(f"Known facts : \n{known_facts}")
        print()

    # Stop after the first dialogue.
    break

0 - user says : 
"I'd like to book a trip to Atlantis from Caprica on Saturday, August 13, 2016 for 8 adults. I have a tight budget of 1700."
Known facts : 
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '8'}

1 - wizard says : 
"Hi...I checked a few options for you, and unfortunately, we do not currently have any trips that meet this criteria.  Would you like to book an alternate travel option?"
Known facts : 
{'intent': 'book', 'budget': '1700.0', 'dst_city': 'Atlantis', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '8', 'NO_RESULT': True}

2 - user says : 
"Yes, how about going to Neverland from Caprica on August 13, 2016 for 5 adults. For this trip, my budget would be 1900."
Known facts : 
{'intent': 'book', 'budget': '1900.0', 'dst_city': 'Neverland', 'or_city': 'Caprica', 'str_date': 'august 13', 'n_adults': '5', 'NO_RESULT': True}

3 - wizard says : 
"I checked the availability for this date 

In [6]:
def _latest_slot_values(turn):
    """Return ``{slot: value}`` for every slot annotated on ``turn``.

    Only the last annotation of each slot is used; if that annotation is
    negated the slot maps to ``None``. When several frames of the turn carry
    the same slot, the frame listed last wins.
    """
    return {
        slot: None if mentions[-1]["negated"] else mentions[-1]["val"]
        for annotated_frame in turn["labels"]["frames"]
        for slot, mentions in annotated_frame["info"].items()
    }


def _user_turn_records(dialogue):
    """Yield one flat record per user turn of ``dialogue``.

    Each record holds the utterance ``text``, the facts known *before* the turn
    (``old_<slot>``) and the facts known *after* it (``new_<slot>``).
    """
    known_facts = {}
    for turn in dialogue:
        # Wizard turns are skipped entirely: they produce no row and their
        # annotations are not folded into the known facts.
        if turn["author"] == "wizard":
            continue

        record = {"text": turn["text"]}
        record.update({f"old_{slot}": value for slot, value in known_facts.items()})
        known_facts.update(_latest_slot_values(turn))
        record.update({f"new_{slot}": value for slot, value in known_facts.items()})
        yield record


# Single definition of the cache location used for both reading and writing.
turns_csv_path = Path(DATA_PATH, "processed/turns.csv")

if turns_csv_path.exists():
    # Reuse the previously exported table instead of re-walking every dialogue.
    turns = pd.read_csv(turns_csv_path)

else:
    # Collect plain dicts and build the frame once: `DataFrame.append` was
    # removed in pandas 2.0 and re-copied the whole frame on every row.
    records = [
        record
        for dialogue in tqdm(raw_data["turns"])
        for record in _user_turn_records(dialogue)
    ]
    turns = pd.DataFrame(records)

    # Make sure the output folder exists before writing the cache.
    turns_csv_path.parent.mkdir(parents=True, exist_ok=True)
    turns.to_csv(turns_csv_path, index=False)

turns

Unnamed: 0,text,old_or_city,new_or_city,old_dst_city,new_dst_city,old_str_date,new_str_date,old_end_date,new_end_date,old_budget,new_budget
0,I'd like to book a trip to Atlantis from Capri...,,Caprica,,Atlantis,,august 13,,,,1700.0
1,"Yes, how about going to Neverland from Caprica...",Caprica,Caprica,Atlantis,Neverland,august 13,august 13,,,1700.0,1900.0
2,I have no flexibility for dates... but I can l...,Caprica,Atlantis,Neverland,Atlantis,august 13,august 13,,,1900.0,1700.0
3,I suppose I'll speak with my husband to see if...,Atlantis,Atlantis,Atlantis,Atlantis,august 13,august 13,,,1700.0,1700.0
4,"Hello, I am looking to book a vacation from Go...",,Gotham City,,Mos Eisley,,,,,,2100.0
...,...,...,...,...,...,...,...,...,...,...,...
10402,"5 adults and 7 kids! Yup, the lot of us. We wa...",Tampa,Tampa,-1,-1,,,,,,32800.0
10403,Oh yes! Between September 12 and 26!,Tampa,Tampa,-1,-1,,september 12,,26,32800.0,32800.0
10404,"That sounds amazing, and it's within those dat...",Tampa,Tampa,-1,Queenstown,september 12,september 12,26,26,32800.0,32800.0
10405,"Ok perfect, book me!",Tampa,Tampa,Queenstown,Queenstown,september 12,september 12,26,25,32800.0,32800.0


In [7]:
# Summarise every column of `turns` (include="all" covers non-numeric columns too).
turns.describe(include="all")

Unnamed: 0,text,old_or_city,new_or_city,old_dst_city,new_dst_city,old_str_date,new_str_date,old_end_date,new_end_date,old_budget,new_budget
count,10407,8287,9620,8307,9631,6287,7430,4787,5734,5255,6229
unique,9695,332,339,382,392,151,155,129,131,225,228
top,Thanks!,-1,-1,-1,Punta Cana,-1,-1,-1,-1,-1,-1
freq,73,158,174,257,283,567,655,344,404,1469,1704


In [8]:
# Slots kept in the exported table; each contributes an old_/new_ column pair.
slot_names = ["or_city", "dst_city", "str_date", "end_date", "budget"]

columns = ["text"]
for slot in slot_names:
    columns += [f"old_{slot}", f"new_{slot}"]

# Restrict the table to those columns and overwrite the processed CSV with it.
new_turns = turns[columns]
new_turns.to_csv(DATA_PATH / "processed" / "turns.csv", index=False)
new_turns

Unnamed: 0,text,old_or_city,new_or_city,old_dst_city,new_dst_city,old_str_date,new_str_date,old_end_date,new_end_date,old_budget,new_budget
0,I'd like to book a trip to Atlantis from Capri...,,Caprica,,Atlantis,,august 13,,,,1700.0
1,"Yes, how about going to Neverland from Caprica...",Caprica,Caprica,Atlantis,Neverland,august 13,august 13,,,1700.0,1900.0
2,I have no flexibility for dates... but I can l...,Caprica,Atlantis,Neverland,Atlantis,august 13,august 13,,,1900.0,1700.0
3,I suppose I'll speak with my husband to see if...,Atlantis,Atlantis,Atlantis,Atlantis,august 13,august 13,,,1700.0,1700.0
4,"Hello, I am looking to book a vacation from Go...",,Gotham City,,Mos Eisley,,,,,,2100.0
...,...,...,...,...,...,...,...,...,...,...,...
10402,"5 adults and 7 kids! Yup, the lot of us. We wa...",Tampa,Tampa,-1,-1,,,,,,32800.0
10403,Oh yes! Between September 12 and 26!,Tampa,Tampa,-1,-1,,september 12,,26,32800.0,32800.0
10404,"That sounds amazing, and it's within those dat...",Tampa,Tampa,-1,Queenstown,september 12,september 12,26,26,32800.0,32800.0
10405,"Ok perfect, book me!",Tampa,Tampa,Queenstown,Queenstown,september 12,september 12,26,25,32800.0,32800.0
