In [2]:
from pathlib import Path

import pandas as pd
from ydata_profiling import ProfileReport

In [3]:
# Load every client export once and stack them row-wise into a single frame
# with a fresh 0..n-1 index.
data = pd.concat(
    [
        pd.read_json('./data/de_client_01.json'),
        pd.read_json('./data/de_client_02.json'),
        pd.read_json('./data/pt_br_client_02.json'),
        pd.read_json('./data/pt_pt_client_03.json'),
    ],
    axis=0,
).reset_index(drop=True)

# TODO: check data['id'] for duplicates (e.g. data['id'].duplicated().any());
# this cell does not deduplicate the concatenated exports.
data

Unnamed: 0,id,turns,dialog
0,en_de_#CLIENT-01#_default_2020-12-20-4,[{'text_mt': ['Visitor uploaded: #ALPHANUMERIC...,"{'Dropped conversation': 0, 'Task Sucess': 5}"
1,en_de_#CLIENT-01#_default_2020-12-20-13,"[{'text_mt': ['Hello,', 'I am interested in th...","{'Dropped conversation': 0, 'Task Sucess': 3}"
2,en_de_#CLIENT-01#_default_2020-12-20-12,"[{'text_mt': ['Hello, I have an article that s...","{'Dropped conversation': 1, 'Task Sucess': 3}"
3,en_de_#CLIENT-01#_default_2020-12-20-1,[{'text_mt': ['When does my order #NUMBER# fin...,"{'Dropped conversation': 1, 'Task Sucess': 2}"
4,en_de_#CLIENT-01#_default_2020-12-20-0,"[{'text_mt': ['Hello, I would like to cancel a...","{'Dropped conversation': 0, 'Task Sucess': 5}"
...,...,...,...
496,en_pt_#CLIENT-03#_2021-02-14-0,"[{'text_mt': ['Good morning, I would like to k...","{'Dropped conversation': 0.0, 'Task Sucess': 5.0}"
497,en_pt_#CLIENT-03#_2021-02-14-2,[{'text_mt': ['Good morning my watchman is not...,"{'Dropped conversation': 0.0, 'Task Sucess': 2.0}"
498,en_pt_#CLIENT-03#_2021-02-14-1,"[{'text_mt': ['Good afternoon, no longer use m...","{'Dropped conversation': 0.0, 'Task Sucess': 3.0}"
499,en_pt_#CLIENT-03#_2021-02-14-3,[{'text_mt': ['I acquired #PRS_ORG# #NUMBER# 1...,"{'Dropped conversation': 0.0, 'Task Sucess': 5.0}"


In [4]:
# Column-oriented accumulator for the turn-level table: one empty list per
# column, in the order the columns should appear in the resulting DataFrame.
turn_dict = {
    column: []
    for column in (
        'id',
        'turn_order',
        'sentences',
        'sentences_text',
        'source',
        'understanding',
        'sensibleness',
        'politeness',
        'IQ',
    )
}

def _label_at(labels, position):
    """Return ``labels[position]``, or ``None`` when that lookup is not possible."""
    try:
        return labels[position]
    except (LookupError, TypeError):
        # LookupError: fewer labels than sentences (IndexError) or a missing key (KeyError).
        # TypeError: ``labels`` is not subscriptable at all (e.g. None or a float NaN).
        # Anything else is unexpected and is allowed to propagate.
        return None


def craft_sentences(sentences, emotions, engagements):
    """Pair every sentence of a turn with its emotion and engagement labels.

    Args:
        sentences: Iterable of sentence strings, in order.
        emotions: Per-sentence emotion labels, looked up by sentence position.
        engagements: Per-sentence engagement labels, looked up by sentence position.

    Returns:
        A tuple ``(res_sentences, sentences_text)`` where ``res_sentences`` is a
        list of ``{'sentence', 'emotion', 'engagement'}`` dicts (a label is
        ``None`` when it cannot be looked up for that position) and
        ``sentences_text`` is every sentence followed by a newline, concatenated.
    """
    res_sentences = [
        {
            'sentence': sentence,
            'emotion': _label_at(emotions, position),
            'engagement': _label_at(engagements, position),
        }
        for position, sentence in enumerate(sentences)
    ]
    # Built from the collected records so ``sentences`` is only iterated once.
    sentences_text = "".join(item['sentence'] + "\n" for item in res_sentences)

    return res_sentences, sentences_text


# Flatten the nested dialogs into one row per turn.
# The field holding a turn's sentences depends on the speaker: 'inbound' turns
# are labelled 'client' and read 'text_mt'; every other floor value is labelled
# 'agent' and reads 'text_src'.
for dialog_id, turns in zip(data['id'], data['turns']):
    for turn_order, turn in enumerate(turns):
        if turn['floor'] == 'inbound':
            source, text_key = 'client', 'text_mt'
        else:
            source, text_key = 'agent', 'text_src'

        turn_sentences, sentences_text = craft_sentences(
            turn[text_key], turn['Emotion'], turn['Engagement']
        )

        turn_dict['id'].append(dialog_id)
        turn_dict['turn_order'].append(turn_order)
        turn_dict['source'].append(source)
        turn_dict['sentences'].append(turn_sentences)
        turn_dict['sentences_text'].append(sentences_text)

        # Turn-level quality annotations, copied through unchanged.
        turn_dict['understanding'].append(turn['Understanding'])
        turn_dict['sensibleness'].append(turn['Sensibleness'])
        turn_dict['IQ'].append(turn['IQ'])
        turn_dict['politeness'].append(turn['Politeness'])

turn_df = pd.DataFrame.from_dict(turn_dict)

turn_df

Unnamed: 0,id,turn_order,sentences,sentences_text,source,understanding,sensibleness,politeness,IQ
0,en_de_#CLIENT-01#_default_2020-12-20-4,0,[{'sentence': 'Visitor uploaded: #ALPHANUMERIC...,Visitor uploaded: #ALPHANUMERIC_ID#\nURL: #URL...,client,,,,
1,en_de_#CLIENT-01#_default_2020-12-20-4,1,"[{'sentence': 'Good Evening #NAME#', 'emotion'...",Good Evening #NAME#\nThanks for contacting #PR...,agent,1.0,1.0,1.0,5.0
2,en_de_#CLIENT-01#_default_2020-12-20-4,2,"[{'sentence': '#ADDRESS#, #EMAIL#', 'emotion':...","#ADDRESS#, #EMAIL#\n#PRS_ORG# seater sofa retr...",client,1.0,1.0,1.0,5.0
3,en_de_#CLIENT-01#_default_2020-12-20-4,3,[{'sentence': 'I will just look into this for ...,I will just look into this for you.\nMay I ask...,agent,1.0,1.0,1.0,5.0
4,en_de_#CLIENT-01#_default_2020-12-20-4,4,[{'sentence': 'Will I receive an email tomorro...,Will I receive an email tomorrow?\nA discount ...,client,1.0,1.0,1.0,5.0
...,...,...,...,...,...,...,...,...,...
7452,en_pt_#CLIENT-03#_2021-02-21-0,19,[{'sentence': 'Yes as the payment was not succ...,Yes as the payment was not successful.\n,agent,1.0,1.0,1.0,5.0
7453,en_pt_#CLIENT-03#_2021-02-21-0,20,[{'sentence': 'And how do I cancel the order I...,And how do I cancel the order I made and that ...,client,1.0,1.0,1.0,4.0
7454,en_pt_#CLIENT-03#_2021-02-21-0,21,[{'sentence': 'Previously the order payment ha...,Previously the order payment has been failed s...,agent,1.0,1.0,1.0,5.0
7455,en_pt_#CLIENT-03#_2021-02-21-0,22,[{'sentence': 'So I will proceed to another or...,So I will proceed to another order\n,client,1.0,1.0,1.0,5.0


In [5]:
# Column-oriented accumulator for the sentence-level table.
sentences_dict = {
    column: []
    for column in ("id", "turn_order", "source", "sentence", "emotion", "engagement")
}

# Explode each turn into one row per sentence: the turn-level fields are
# repeated, the sentence-level fields come from the per-sentence dicts.
for _, turn_row in turn_df.iterrows():
    for sentence_entry in turn_row['sentences']:
        for column in ("id", "turn_order", "source"):
            sentences_dict[column].append(turn_row[column])
        for column in ("sentence", "emotion", "engagement"):
            sentences_dict[column].append(sentence_entry[column])

sentences_df = pd.DataFrame.from_dict(sentences_dict)

In [None]:
dialog_dict = {
    "id": [],
    "dialog_text": [],
    "dialog": []
}

# Rebuild each conversation from its turns. groupby() yields the groups sorted
# by id, which is not the row order of `data`, so anything taken from `data`
# must be matched by id rather than by position.
for dialog_id, group in turn_df.groupby('id'):
    text_parts = []
    dialog = []
    for _, row in group.iterrows():
        # "<source>: <sentences>" followed by a separator line after every turn.
        text_parts.append(f"{row['source']}: {row['sentences_text']}---\n")
        dialog.append({row['source']: {"sentences": row['sentences'],
                                       "iq": row['IQ'],
                                       "sensibleness": row['sensibleness'],
                                       "understanding": row['understanding'],
                                       "politeness": row['politeness']
                                       }})
    dialog_dict['id'].append(dialog_id)
    dialog_dict['dialog'].append(dialog)
    dialog_dict['dialog_text'].append("".join(text_parts))

# Dialog-level labels, indexed by dialog id. Only the first row is kept for a
# repeated id so the join below cannot multiply dialog rows.
dialog_labels = pd.DataFrame(
    data['dialog'].to_list(), index=pd.Index(data['id'], name='id')
)
dialog_labels = dialog_labels[~dialog_labels.index.duplicated(keep='first')]

# Attach the labels by id (left join on the 'id' column); a positional concat
# would pair each dialog with the labels of whichever row shares its position.
dialog_df = pd.DataFrame.from_dict(dialog_dict).join(
    dialog_labels, on='id', rsuffix='_label'
)
dialog_df.iloc[0]['dialog']

In [None]:
# Build a ydata-profiling ProfileReport over the sentence-level table.
# The bare name on the last line is the cell's output, so the notebook renders it.
sentences_profile = ProfileReport(sentences_df)
sentences_profile

In [None]:
# Build a ydata-profiling ProfileReport over the turn-level table.
# The bare name on the last line is the cell's output, so the notebook renders it.
turn_profile = ProfileReport(turn_df)
turn_profile

In [None]:
# Build a ydata-profiling ProfileReport over the dialog-level table.
# The bare name on the last line is the cell's output, so the notebook renders it.
dialog_profile = ProfileReport(dialog_df)
dialog_profile

In [10]:
# Persist the three derived tables as JSON. The output directory is created
# first so the export also works on a fresh checkout where it does not exist.
output_dir = Path('./created_data')
output_dir.mkdir(parents=True, exist_ok=True)

turn_df.to_json(output_dir / 'turn_df.json')
sentences_df.to_json(output_dir / 'sentences_df.json')
dialog_df.to_json(output_dir / 'dialog_df.json')