In [None]:
import json
import plotly.express as px
import pandas as pd

In [None]:
# Load the whole dataset file. The context manager closes the handle as soon
# as parsing is done instead of leaving it open until garbage collection.
with open('data_level0.json', encoding="utf8") as data_file:
    json_file = json.load(data_file)

# Top-level keys of the loaded JSON document (a live view on json_file).
keys = json_file.keys()
print(keys)

In [None]:
def extract_data(key: str, json_file: dict) -> None:
    """Print how many utterances are stored under ``key``.

    Args:
        key: Top-level key of the loaded JSON to summarise.
        json_file: The parsed JSON document (a mapping, not the ``json`` module).

    Returns:
        None; the count is only printed.

    Raises:
        KeyError: If ``key`` is not present in ``json_file``.
    """
    print(f"Total number of Utterances: {len(json_file[key])}")

# Print a short report for every top-level key of the loaded JSON.
for key in keys:
    print(key)
    # extract_data prints the number of entries stored under this key.
    extract_data(key, json_file)
    # Blank lines to visually separate one key's report from the next.
    print("\n\n")

# Pre-processing Rules

- Replace the typographic apostrophe (`’`) with the plain ASCII apostrophe (`'`). (Issue: the dataset mixes both apostrophe styles.)

# Layer 1

- Count the distribution of emotion
- Count the number of sentiments

In [None]:
# Label tallies over all processed utterances: label -> number of utterances.
emotion_count = dict()
sentiment_count = dict()

# Emotion tallies nested under each sentiment label.
sentiment_emotion_count = {
    label: {} for label in ('positive', 'negative', 'neutral')
}

# Emotion tallies split by the parity of a dialogue's utterance count.
dialog_length_emotion = {parity: {} for parity in ('even', 'odd')}

# Dialogue ID -> number of utterances seen for that dialogue.
dialogue_dict = dict()

# Token count of every utterance, and summed token count of every dialogue.
utterance_length, dialog_length_list = [], []

# Number of distinct speakers found in each dialogue.
speakers_per_dialogue = list()

# Emotion label -> token counts of the utterances carrying that label.
emotion_utter_dict = dict()

def concat_dialogue_dict(dialogue_dict: dict, dialog_id: str, utterance: str) -> None:
    """Increment the utterance counter stored for ``dialog_id``.

    ``dialogue_dict`` is updated in place: a dialogue seen for the first time
    starts at 1, otherwise its existing count grows by one. ``utterance`` is
    accepted but not used.
    """
    if dialog_id in dialogue_dict:
        dialogue_dict[dialog_id] += 1
    else:
        dialogue_dict[dialog_id] = 1

def preprocessing_utterance(utterance: str) -> str:
    """Return ``utterance`` with every ’ character swapped for a plain '."""
    apostrophe_table = str.maketrans("’", "'")
    return utterance.translate(apostrophe_table)

# Use Train dataset for testing
for key in ['train']:

    # Every utterance of this split, keyed by an ID of the form
    # dia0_utt0 -> Dialogue 0, Utterance 0
    utterance_dialogue_list = json_file[key]

    # Per-dialogue statistics, keyed by the dialogue part of the utterance ID.
    # Accumulating per ID (instead of flushing whenever the ID changes) keeps a
    # dialogue in one piece even when its utterances are not stored next to
    # each other. Dicts preserve insertion order, so dialogues are reported in
    # order of first appearance. The dict is rebuilt for every split so equal
    # IDs in different splits are not merged.
    dialog_stats = {}

    # Go through one utterance at a time
    for utterance_id, utter_dict in utterance_dialogue_list.items():

        # Raises ValueError if the ID does not contain exactly one "_".
        dialog_num, _utt_num = utterance_id.split("_")

        if dialog_num not in dialog_stats:
            dialog_stats[dialog_num] = {'speakers': set(), 'tokens': 0, 'emotions': []}
        stats = dialog_stats[dialog_num]

        # Preprocessing
        utterance = preprocessing_utterance(utter_dict['Utterance'])

        # Count this utterance towards its dialogue (keyed by Dialogue_ID).
        concat_dialogue_dict(dialogue_dict, utter_dict['Dialogue_ID'], utterance)

        # Length of the utterance in single-space-separated tokens
        utterance_tokens = len(utterance.split(" "))
        utterance_length.append(utterance_tokens)
        stats['tokens'] += utterance_tokens

        # For emotion and sentiment labels
        emotion = utter_dict['Emotion']
        sentiment = utter_dict['Sentiment']

        stats['emotions'].append(emotion)

        # Count number of emotions per sentiment; setdefault tolerates a
        # sentiment label that was not pre-seeded in sentiment_emotion_count.
        per_sentiment = sentiment_emotion_count.setdefault(sentiment, {})
        per_sentiment[emotion] = per_sentiment.get(emotion, 0) + 1

        # Utterance lengths grouped by emotion
        emotion_utter_dict.setdefault(emotion, []).append(utterance_tokens)

        emotion_count[emotion] = emotion_count.get(emotion, 0) + 1
        sentiment_count[sentiment] = sentiment_count.get(sentiment, 0) + 1

        # Speaker information: the set keeps each speaker once per dialogue
        stats['speakers'].add(utter_dict['Speaker'])

    # Emit one entry per dialogue. There is no placeholder entry in front of
    # the first dialogue and the last dialogue is included, so
    # speakers_per_dialogue and dialog_length_list need no .pop(0) afterwards.
    for stats in dialog_stats.values():
        speakers_per_dialogue.append(len(stats['speakers']))
        dialog_length_list.append(stats['tokens'])

        # One emotion is recorded per utterance, so the number of emotions is
        # the number of utterances in the dialogue.
        value = 'odd' if len(stats['emotions']) % 2 != 0 else 'even'
        parity_counts = dialog_length_emotion[value]

        for emotion in stats['emotions']:
            parity_counts[emotion] = parity_counts.get(emotion, 0) + 1

In [None]:
# Tabulate the emotion tallies as (emotion, count) rows for plotting.
emotion_count_df = pd.DataFrame(
    list(emotion_count.items()),
    columns=['emotion', 'count'],
)

# Bar chart with one bar per emotion label.
fig = px.bar(
    data_frame=emotion_count_df,
    x='emotion',
    y='count',
    title='Emotion Counts',
)
fig.update_layout(hovermode="x")
fig.show()

In [None]:
# Tabulate the sentiment tallies as (sentiment, count) rows for plotting.
sentiment_count_df = pd.DataFrame(sentiment_count.items(), columns=['sentiment', 'count'])

# Bar chart with one bar per sentiment label. The title names what is actually
# plotted (sentiments), not topics.
fig = px.bar(sentiment_count_df, x='sentiment', y='count', title='Sentiment Counts')
fig.update_layout(hovermode="x")
fig.show()

# Layer 2

- Count the number of utterances and distribution (Histogram)
- Count the number of tokens and distribution (Histogram)

In [None]:
# One row per dialogue holding its utterance count.
dialogue_df = pd.DataFrame(
    list(dialogue_dict.values()),
    columns=['utterance_per_dialogue'],
)

# Histogram of how many utterances the dialogues contain.
fig = px.histogram(data_frame=dialogue_df, x="utterance_per_dialogue")
fig.update_layout(
    bargap=0.5,
    hovermode="x unified",
)
fig.show()

In [None]:
# One row per utterance holding its token count.
utterance_length_df = pd.DataFrame(
    utterance_length,
    columns=['utterance_length'],
)

# Histogram of utterance lengths measured in tokens.
fig = px.histogram(data_frame=utterance_length_df, x="utterance_length")
fig.update_layout(
    bargap=0.5,
    hovermode="x",
)
fig.show()

# Self-Analysis

- Starting line number in JSON: Dev(2), Train(48340), Valid(14408)
- There are 1038 dialogues in the training dataset, which consists of 9989 utterances
- The utterances need to be re-sorted, as they appear in random order in the dataset (they do not follow the dialogue sequence)
- There are 80,258 tokens from 9989 utterances in the training dataset

# Layer 3

- Break down sentiments to find distribution of emotions
- Find the number of speakers per dialog
- Cross check utterance length and emotion (Even number length vs odd number length)
- Distribution of total utterance length in one dialog
- Cross check utterance length and emotions
- Check the nature of dialogue (DailyDialog (Simulated) vs MELD vs EmoryNLP) -> More in-depth analysis

In [None]:
# Break down sentiments to find distribution of emotions
#print(sentiment_emotion_count)

In [None]:
# Find the number of speakers per dialog

#speakers_per_dialogue.pop(0)
#print(speakers_per_dialogue)

In [None]:
# Cross check dialog length and emotion
#print(dialog_length_emotion)

In [None]:
# Find the total length of utterances in one dialog
#dialog_length_list.pop(0)
#print(dialog_length_list)

In [None]:
# Check correlation between utterance length and emotions
#print(emotion_utter_dict)