In [2]:
def make_markdown_table(array):
    """Render a list of rows as a Markdown table string.

    Args:
        array: Sequence of rows, each row itself a sequence of cell values.
            The first row is used as the table header. Every cell is
            converted with ``str()``.

    Returns:
        The table as a string that starts with a blank line and ends with a
        newline, ready to be written into a .md file. An empty ``array``
        yields an empty string.

    Example input:
        [["Name", "Age", "Height"],
         ["Jake", 20, "5'10"],
         ["Mary", 21, "5'7"]]
    """
    # Nothing to render; indexing array[0] below would raise IndexError.
    if not array:
        return ""

    def _cell(value):
        # A raw "|" inside a cell would be parsed as a column boundary and
        # shift every following column, so it has to be backslash-escaped.
        return str(value).replace("|", "\\|")

    header = array[0]
    lines = [
        # Spacing of header, separator and data rows is kept exactly as the
        # tables already generated with this function.
        "| " + "".join(" " + _cell(e) + " |" for e in header),
        "|" + "-------------- | " * len(header),
    ]
    lines.extend(
        "| " + "".join(_cell(e) + " | " for e in entry) for entry in array[1:]
    )

    return "\n" + "\n".join(lines) + "\n"

In [3]:
from glob import glob
import json
from collections import Counter
import numpy as np

# Collect, for every dataset and split, how often each emotion label occurs.
# Result layout: stats[dataset][split] -> {label: count}.
stats = {}
for DATASET in ['MELD', 'IEMOCAP', 'EmoryNLP', 'DailyDialog']:
    stats[DATASET] = {}
    with open(f"../{DATASET}/labels.json", 'r') as stream:
        labels = json.load(stream)
    for SPLIT in ('train', 'val', 'test'):
        split_labels = labels[SPLIT].values()
        stats[DATASET][SPLIT] = dict(Counter(split_labels))

# Print one two-row Markdown table (header + counts) per dataset and split.
# `stats` was filled in dataset/split order above, so iterating it visits the
# same datasets and splits in the same order.
for DATASET, per_split in stats.items():
    print(f"### {DATASET}")
    for SPLIT, label_counts in per_split.items():
        # Alphabetical pre-sort first: the sort below is stable, so labels
        # with equal counts stay in alphabetical order.
        bar = dict(sorted(label_counts.items()))
        bar['SUM'] = sum(bar.values())
        ranked = sorted(bar.items(), key=lambda pair: -pair[1])

        header = ['SPLIT'] + [name for name, _ in ranked]
        row = [f"{SPLIT}"] + [count for _, count in ranked]
        table = [header, row]

        print(make_markdown_table(table))
        

### MELD

|  SPLIT | SUM | neutral | joy | surprise | anger | sadness | disgust | fear |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| train | 9989 | 4710 | 1743 | 1205 | 1109 | 683 | 271 | 268 | 


|  SPLIT | SUM | neutral | joy | anger | surprise | sadness | fear | disgust |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| val | 1109 | 470 | 163 | 153 | 150 | 111 | 40 | 22 | 


|  SPLIT | SUM | neutral | joy | anger | surprise | sadness | disgust | fear |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| test | 2610 | 1256 | 402 | 345 | 281 | 208 | 68 | 50 | 

### IEMOCAP

|  SPLIT | SUM | undecided | neutral | frustration | sadness | anger | excited | happiness |

In [4]:
from glob import glob
import json
from collections import Counter

# Dialogue / utterance count statistics, one Markdown table per dataset split.
stats = {}
for DATASET in ['MELD', 'IEMOCAP', 'EmoryNLP', 'DailyDialog']:
    stats[DATASET] = {}
    with open(f"../{DATASET}/utterance-ordered.json", 'r') as stream:
        diautt_ordered = json.load(stream)

    print(f"### {DATASET}")
    for SPLIT in ['train', 'val', 'test']:
        dialogues = diautt_ordered[SPLIT]

        # Number of utterances in each dialogue of this split.
        num_utts_all = [len(uttids) for uttids in dialogues.values()]
        utts_per_dialogue = np.array(num_utts_all)

        header = ['SPLIT',
                  'number of dialogues in total',
                  'number of utterances in total',
                  'number of utterances per dialogue (mean)',
                  'number of utterances per dialogue (std.)',
                  'number of utterances per dialogue (min)',
                  'number of utterances per dialogue (max)']

        # Aggregates are listed in the same order as the header columns.
        row = [SPLIT, len(dialogues), sum(num_utts_all)]
        row += [round(aggregate(utts_per_dialogue), 3)
                for aggregate in (np.mean, np.std, np.min, np.max)]

        table = [header, row]
        print(make_markdown_table(table))
        

### MELD

|  SPLIT | number of dialogues in total | number of utterances in total | number of utterances per dialogue (mean) | number of utterances per dialogue (std.) | number of utterances per dialogue (min) | number of utterances per dialogue (max) |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| train | 1038 | 9989 | 9.623 | 5.843 | 1 | 24 | 


|  SPLIT | number of dialogues in total | number of utterances in total | number of utterances per dialogue (mean) | number of utterances per dialogue (std.) | number of utterances per dialogue (min) | number of utterances per dialogue (max) |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| val | 114 | 1109 | 9.728 | 5.434 | 1 | 23 | 


|  SPLIT | number of dialogues in total | number of utterances in total | number of utterances per dialogue (mean) | number of utterances per dialogue (std.) |

In [5]:
import nltk
from glob import glob
import json
from collections import Counter
from tqdm.notebook import tqdm
import os

import numpy as np

def load_utt(path):
    """Read one utterance JSON file and return its text.

    Args:
        path: Path to a JSON file whose top-level object has an
            ``'Utterance'`` key.

    Returns:
        The value stored under ``'Utterance'``.

    Raises:
        KeyError: If the parsed JSON has no ``'Utterance'`` key.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's locale encoding, which can mis-decode non-ASCII utterances.
    with open(path, 'r', encoding='utf-8') as stream:
        return json.load(stream)['Utterance']


# Token count statistics (NLTK word tokenizer), one Markdown table per split.
for DATASET in tqdm(['MELD', 'IEMOCAP', 'EmoryNLP', 'DailyDialog']):
    print(f"### {DATASET}")
    with open(f"../{DATASET}/utterance-ordered.json", 'r') as stream:
        diautt_ordered = json.load(stream)

    num_tokens = {}
    for SPLIT in ['train', 'val', 'test']:
        # num_tokens[SPLIT][dialogue id] -> token count of each utterance.
        num_tokens[SPLIT] = split_tokens = {}

        for diaid, uttids in diautt_ordered[SPLIT].items():
            utts = [
                load_utt(os.path.join(f"../{DATASET}/raw-texts/{SPLIT}/{uttid}.json"))
                for uttid in uttids
            ]
            split_tokens[diaid] = [len(nltk.word_tokenize(utt)) for utt in utts]

        per_dialogue = list(split_tokens.values())

        # Every utterance's token count in one flat array.
        all_counts = np.array([count for counts in per_dialogue for count in counts])
        # Total tokens of each dialogue, and mean tokens per utterance of each dialogue.
        dialogue_totals = np.array([np.sum(np.array(counts)) for counts in per_dialogue])
        dialogue_means = np.array([np.mean(np.array(counts)) for counts in per_dialogue])

        aggregates = (np.mean, np.std, np.min, np.max)

        header = ['SPLIT',
                  'number of tokens in total',
                  'number of tokens per dialogue (mean)',
                  'number of tokens per dialogue (std.)',
                  'number of tokens per dialogue (min)',
                  'number of tokens per dialogue (max)',
                  'number of tokens per utterance per dialogue (mean)',
                  'number of tokens per utterance per dialogue (std.)',
                  'number of tokens per utterance per dialogue (min)',
                  'number of tokens per utterance per dialogue (max)']

        # Aggregates follow the header's column order.
        row = [SPLIT, np.sum(all_counts)]
        row += [round(aggregate(dialogue_totals), 3) for aggregate in aggregates]
        row += [round(aggregate(dialogue_means), 3) for aggregate in aggregates]

        table = [header, row]
        print(make_markdown_table(table))

  0%|          | 0/4 [00:00<?, ?it/s]

### MELD

|  SPLIT | number of tokens in total | number of tokens per dialogue (mean) | number of tokens per dialogue (std.) | number of tokens per dialogue (min) | number of tokens per dialogue (max) | number of tokens per utterance per dialogue (mean) | number of tokens per utterance per dialogue (std.) | number of tokens per utterance per dialogue (min) | number of tokens per utterance per dialogue (max) |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| train | 110476 | 106.432 | 70.518 | 1 | 419 | 11.097 | 3.958 | 1.0 | 43.0 | 


|  SPLIT | number of tokens in total | number of tokens per dialogue (mean) | number of tokens per dialogue (std.) | number of tokens per dialogue (min) | number of tokens per dialogue (max) | number of tokens per utterance per dialogue (mean) | number of tokens per utterance per dialogue (std.) | number of tokens per utterance per 

In [32]:
# Imported in this cell so it also runs on a fresh kernel: no cell before
# this one imports `av`.
import av

# Display the exception class PyAV exposes under the `av.InvalidDataError` name.
av.InvalidDataError

av.error.InvalidDataError

In [38]:
from glob import glob
import json
from collections import Counter
from glob import glob
import json
import av
import os
from tqdm.notebook import tqdm
import numpy as np



# Video statistics (frame count, fps, duration), one Markdown table per split.
for DATASET in ['MELD', 'IEMOCAP']:
    print(f"### {DATASET}")
    for SPLIT in ['train', 'val', 'test']:
        vidpaths = glob(f"../{DATASET}/raw-videos/{SPLIT}/*")

        stats = {'num_frames': [], 'fps': [], 'duration_sec': []}
        skipped = []  # files that could not be measured

        for path in vidpaths:
            try:
                # The context manager closes the container after each file;
                # otherwise one open handle per video leaks for the whole run.
                with av.open(path) as container:
                    videostream = container.streams.video[0]
                    num_frames = videostream.frames
                    rate = videostream.average_rate
            except (av.InvalidDataError, IndexError):
                # Unreadable container, or a file without any video stream.
                skipped.append(path)
                continue

            # A missing or zero average rate would otherwise crash in float()
            # or in the division below.
            fps = int(round(float(rate))) if rate else 0
            if fps == 0:
                skipped.append(path)
                continue

            # NOTE(review): duration uses the rounded integer fps, as the
            # tables generated so far did; confirm that this is intended.
            duration_sec = num_frames / fps

            stats['num_frames'].append(num_frames)
            stats['fps'].append(fps)
            stats['duration_sec'].append(duration_sec)

        # Report dropped files instead of ignoring them silently.
        if skipped:
            print(f"Skipped {len(skipped)} unreadable video file(s) in {DATASET}/{SPLIT}")

        # np.min / np.max raise on empty input, so there is nothing to tabulate.
        if not stats['num_frames']:
            print(f"No readable videos found for {DATASET}/{SPLIT}")
            continue

        stats['num_frames'] = np.array(stats['num_frames'])
        stats['fps'] = np.array(stats['fps'])
        stats['duration_sec'] = np.array(stats['duration_sec'])

        header = ['SPLIT',
                  'num_frames (mean)',
                  'num_frames (std.)',
                  'num_frames (min)',
                  'num_frames (max)',

                  'fps (mean)',
                  'fps (std.)',
                  'fps (min)',
                  'fps (max)',

                  'duration in sec (mean)',
                  'duration in sec (std.)',
                  'duration in sec (min)',
                  'duration in sec (max)']

        # Same column order as the header: for each measure, mean/std/min/max.
        row = [SPLIT]
        for measure in ('num_frames', 'fps', 'duration_sec'):
            row += [round(aggregate(stats[measure]), 3)
                    for aggregate in (np.mean, np.std, np.min, np.max)]

        table = [header, row]
        print(make_markdown_table(table))


### MELD


Format mov,mp4,m4a,3gp,3g2,mj2 detected only with low score of 1, misdetection possible!
moov atom not found



|  SPLIT | num_frames (mean) | num_frames (std.) | num_frames (min) | num_frames (max) | fps (mean) | fps (std.) | fps (min) | fps (max) | duration in sec (mean) | duration in sec (std.) | duration in sec (min) | duration in sec (max) |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | 
| train | 75.175 | 58.239 | 2 | 984 | 24.005 | 0.068 | 24 | 25 | 3.132 | 2.425 | 0.083 | 41.0 | 


|  SPLIT | num_frames (mean) | num_frames (std.) | num_frames (min) | num_frames (max) | fps (mean) | fps (std.) | fps (min) | fps (max) | duration in sec (mean) | duration in sec (std.) | duration in sec (min) | duration in sec (max) |
|-------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- | -------------- 