## Function to create train/test data from the Commonsense-Dialogues dataset

For use in Team #2 Project for AIT 726

https://github.com/alexa/Commonsense-Dialogues

In [4]:
import json
import pandas as pd


In [5]:
# Small hand-copied sample in the Commonsense-Dialogues JSON layout: each key is
# a dialog id and each value holds a "context" sentence, the "speaker" name and
# the ordered list of "turns". It is passed to parse_dialogs() further down.
dialogs = """
{
"1": {
        "context": "Austin left in a huff of rage after they were beaten in the video game competition.",
        "speaker": "Austin",
        "turns": [
            "I got so mad, I couldn't contain it anymore",
            "Did you huff off?",
            "I did, I flared up into anger",
            "You need to calm down, it's just a video game",
            "I know, I should not let it get to me like this.",
            "blow off some steam and come back"
        ]
    },
    "2": {
        "context": "Austin left in a huff of rage after they were beaten in the video game competition.",
        "speaker": "Austin",
        "turns": [
            "I was so devastated after the defeat.",
            "You should have put more effort into it!",
            "yes, I know! I left in a huff of rage after I was beaten in the video game competition.",
            "You need to buckle up next time!",
            "I know, I will."
        ]
    },
    "3": {
        "context": "Austin left in a huff of rage after they were beaten in the video game competition.",
        "speaker": "Austin",
        "turns": [
            "I was so upset that I lost yesterday.",
            "What did you play?",
            "It was a video game competition.",
            "Who did you play against?",
            "They were players form the town next to us.",
            "That happened sometimes. Better luck next time."
        ]
    },
    "4": {
        "context": "Cameron had accidentally broken the door, so they decided to tell their mom about it.",
        "speaker": "Cameron",
        "turns": [
            "Remember when I was playing with the guys while you were leaving?",
            "Did you break something?",
            "Yeah, the boys tackled me and we broke the door. So I called my mom and told her.",
            "What did she say?",
            "She was really mad and told me to call the repair man.",
            "Well at least you aren't grounded, yet."
        ]
    }
}"""

In [7]:
def parse_dialogs(dialogs: str) -> pd.DataFrame:
    """Build training/testing rows from a Commonsense-Dialogues JSON string.

    The JSON must be an object mapping a dialog id to an object that has a
    ``context`` string and a ``turns`` list of strings. For every dialog the
    following rows are produced, in this order (``incoming`` is always the
    context and one turn joined by a single space):

    1. first turn            -> reply needed, response = second turn
    2. second-to-last turn   -> reply needed, response = last turn
    3. last turn             -> no reply needed, response = None
    4. every turn but the last that contains "?"
                             -> reply needed, response = the following turn

    Rows are not de-duplicated: a question in the first or second-to-last
    turn yields the same row from case 4 as from case 1 or 2. Dialogs with
    fewer than two turns only produce the rows that exist (a one-turn dialog
    yields just the case-3 row, an empty one yields nothing).

    Example output::

           incoming                                            reply_needed  response
        0  Austin left in a huff of rage after they were ...  True          Did you huff off?
        1  Austin left in a huff of rage after they were ...  True          blow off some steam and come back
        2  Austin left in a huff of rage after they were ...  False         None

    Args:
        dialogs: JSON text in the Commonsense-Dialogues layout.

    Returns:
        DataFrame with the columns ``incoming``, ``reply_needed`` and
        ``response``.

    Raises:
        TypeError: if ``dialogs`` is not a string.
        ValueError: if ``dialogs`` is not valid JSON.
        KeyError: if a dialog lacks the ``context`` or ``turns`` key.
    """
    if not isinstance(dialogs, str):
        raise TypeError(
            f'dialogs must be a JSON string, got {type(dialogs).__name__}')
    try:
        our_json = json.loads(dialogs)
    except json.JSONDecodeError as e:
        # Fail here with the real cause instead of printing and then crashing
        # later on an unbound variable.
        raise ValueError(f'failed json.loading dialogs: {e}') from e

    new_data = []

    for dialog in our_json.values():
        # bonus preprocessing # TODO:
        # replace speaker in any of these messages with "I"?? (maybe not...)
        context = dialog['context']
        turns = dialog['turns']

        def add_row(message, response, reply_needed=True):
            new_data.append({
                'incoming': ' '.join([context, message]),
                'reply_needed': reply_needed,
                'response': response,
            })

        if len(turns) >= 2:
            # Case 1. context + first line needs a response: the second line
            add_row(turns[0], turns[1])
            # Case 2. context + line n-1 needs a response: line n (the last)
            add_row(turns[-2], turns[-1])

        if turns:
            # Case 3. context + last line does not need a response
            add_row(turns[-1], None, reply_needed=False)

        # Case 4. context + any non-final line containing "?" needs a
        # response, and the line that follows it is the target.
        for question, answer in zip(turns, turns[1:]):
            if "?" in question:
                add_row(question, answer)

    # print out all of our content -- uncomment this block to see full values
    #for element in new_data:
    #    print(f'\nincoming: {element["incoming"]}\n\nresponse: {element["response"]}\n\n{"---"*5}')

    return pd.DataFrame(new_data, columns=['incoming', 'reply_needed', 'response'])
    

# Parse the sample dialogs above into the incoming / reply_needed / response table.
df = parse_dialogs(dialogs)


In [10]:
# Show the full (untruncated) incoming text of the first generated row.
df.loc[0, 'incoming']

"Austin left in a huff of rage after they were beaten in the video game competition. I got so mad, I couldn't contain it anymore"