In [1]:
# Switch the kernel's working directory to the repository checkout; the data-loading
# cell globs the relative path `Datasets/<dataset>/raw-texts/<split>/*.json`.
# NOTE(review): this is an absolute, machine-specific path - adjust it for your checkout.
%cd /home/tk/repos/erc/

/home/tk/repos/erc


In [54]:
import json
import os
from collections import Counter
from glob import glob

import numpy as np
from scipy.spatial import distance

def get_emotion2num(DATASET):
    """Return the emotion-label -> integer-index mapping for one dataset.

    Args:
        DATASET: dataset name; one of 'MELD', 'IEMOCAP', 'EmoryNLP' or
            'DailyDialog'.

    Returns:
        dict mapping each emotion label of that dataset to its position in
        the dataset's label list (0-based, in the order listed below).

    Raises:
        KeyError: if DATASET is not one of the four known names.
    """
    label_table = {
        # MELD has 7 classes
        'MELD': [
            'neutral', 'joy', 'surprise', 'anger',
            'sadness', 'disgust', 'fear',
        ],
        # IEMOCAP originally has 11 classes but only 6 of them are used here.
        'IEMOCAP': [
            'neutral', 'frustration', 'sadness',
            'anger', 'excited', 'happiness',
        ],
        # EmoryNLP has 7 classes
        'EmoryNLP': [
            'neutral', 'joyful', 'scared', 'mad',
            'peaceful', 'powerful', 'sad',
        ],
        # DailyDialog originally has 7 classes, but be sure not to include the
        # neutral class, which accounts for 80% of the data, in calculating
        # the micro f1_score.
        'DailyDialog': [
            'neutral', 'happiness', 'surprise', 'sadness',
            'anger', 'disgust', 'fear',
        ],
    }

    # Plain indexing on purpose: an unknown dataset name surfaces as KeyError.
    labels = label_table[DATASET]
    return dict(zip(labels, range(len(labels))))

def get_prob_distributions(list_of_elements, keys=None):
    """Estimate the empirical probability distribution of a list of labels.

    Args:
        list_of_elements: iterable of hashable labels (labels must be
            mutually sortable).
        keys: optional iterable with every category the output must cover.
            When given, the result has exactly one entry per distinct key,
            in sorted key order, with 0.0 for categories that never occur.
            Pass the same `keys` for every sample you intend to compare so
            the resulting vectors share one support. When None (default),
            only the labels present in `list_of_elements` are used.

    Returns:
        1-D float numpy array of relative frequencies ordered by sorted key.
        It sums to 1 unless the input is empty, in which case an all-zero
        array (length 0 when `keys` is None) is returned instead of NaNs.

    Raises:
        ValueError: if `keys` is given and `list_of_elements` contains a
            label that is not among them.
    """
    key2count = Counter(list_of_elements)

    if keys is None:
        unique_keys = sorted(key2count)
    else:
        unique_keys = sorted(set(keys))
        unexpected = set(key2count) - set(unique_keys)
        if unexpected:
            raise ValueError(
                f'labels not present in keys: {sorted(unexpected, key=repr)}')

    # Counter yields 0 for keys that were never observed, so absent
    # categories become explicit zero-probability entries.
    counts = np.array([key2count[key] for key in unique_keys], dtype=float)
    total = counts.sum()
    if total == 0:
        # Nothing observed: avoid 0/0 and hand back the all-zero vector.
        return counts

    return counts / total


In [55]:
# Collect the raw emotion labels per dataset and split:
#   emotions[<dataset>][<split>] -> list of label strings.
# Each JSON file under Datasets/<dataset>/raw-texts/<split>/ is read for its
# 'Emotion' field; labels outside the dataset's class list are dropped.
emotions = {}
for DATASET in ['MELD', 'IEMOCAP', 'EmoryNLP', 'DailyDialog']:
    emotion2num = get_emotion2num(DATASET)
    # Build the list of accepted labels once per dataset, not once per file.
    accepted_labels = list(emotion2num)

    labels_by_split = {}
    for SPLIT in ['train', 'val', 'test']:
        kept = []
        for json_path in glob(f'Datasets/{DATASET}/raw-texts/{SPLIT}/*.json'):
            with open(json_path, 'r') as handle:
                label = json.load(handle)['Emotion']
            if label in accepted_labels:
                kept.append(label)
        labels_by_split[SPLIT] = kept

    emotions[DATASET] = labels_by_split

In [59]:
# Compare the label distributions of the train/val/test splits of each dataset
# with the Jensen-Shannon distance (0 means identical distributions).
for DATASET in ['MELD', 'IEMOCAP', 'EmoryNLP', 'DailyDialog']:
    # Every split is expressed over the dataset's full, sorted label list so
    # that position i means the same class in all three vectors, even when a
    # split happens to contain no sample of some class.
    label_set = sorted(get_emotion2num(DATASET))

    distributions = {}
    for SPLIT in ['train', 'val', 'test']:
        label_counts = Counter(emotions[DATASET][SPLIT])
        frequencies = np.array(
            [label_counts[label] for label in label_set], dtype=float)
        total = frequencies.sum()
        # A split without any labels has no distribution; mark it as None
        # rather than dividing by zero.
        distributions[SPLIT] = frequencies / total if total > 0 else None

    print(DATASET)

    empty_splits = [name for name, dist in distributions.items() if dist is None]
    if empty_splits:
        # Report missing data explicitly instead of printing a distance
        # computed from empty vectors.
        print(f'skipped: no labelled samples for split(s) {empty_splits}')
        print()
        continue

    distance_train_val = distance.jensenshannon(distributions['train'], distributions['val'])
    distance_train_test = distance.jensenshannon(distributions['train'], distributions['test'])
    distance_val_test = distance.jensenshannon(distributions['val'], distributions['test'])

    print(distance_train_val, distance_train_test, distance_val_test)
    print()

MELD
0.06550474955037426 0.03914670471976914 0.062252950989606703

IEMOCAP
0.12312271388269282 0.0664251319487302 0.15926022732486805

EmoryNLP
0.03255742801480734 0.062374036142908755 0.04773720840015923

DailyDialog
0.06008652875385612 0.02632099827929686 0.07226251887251979



In [57]:
# Display the per-split distributions currently bound to `distributions`.
# The comparison loop rebinds this name for every dataset, so only the last
# dataset processed is shown here (presumably DailyDialog - verify, since the
# execution counts indicate the cells were run out of order).
distributions

{'train': array([0.00948721, 0.00347597, 0.00167489, 0.12827808, 0.82761271,
        0.01111621, 0.01835494]),
 'val': array([9.54269426e-03, 3.71793283e-04, 1.36324204e-03, 8.47688685e-02,
        8.80902218e-01, 9.79055645e-03, 1.32606271e-02]),
 'test': array([0.01524548, 0.00607235, 0.00219638, 0.13165375, 0.81666667,
        0.01317829, 0.01498708])}