In [1]:
import yaml

# Load the project's path configuration; later cells look up dataset
# locations in PATHS by section name and split name.
with open('configs/paths.yaml') as paths_file:
    PATHS = yaml.safe_load(paths_file)

# Speakers that get their own row in the per-split breakdown tables.
MAIN_ACTORS = [
    'Chandler',
    'Joey',
    'Monica',
    'Phoebe',
    'Rachel',
    'Ross',
]

## Get the stats of the original annotations

In [2]:
from IPython.display import HTML, display
# !pip3 install tabulate
import tabulate
from collections import Counter

# Read the annotation file; `datasets` is indexed by split name below.
with open(PATHS['ANNOTATIONS']['all']) as annotation_file:
    datasets = yaml.safe_load(annotation_file)

header = ['DATASET', 'number of dialogues', 'number of utterances', 'number of unique speakers']
separator = ['-' * len(title) for title in header]
table = [header, separator]

for split_name in ('train', 'dev', 'test'):
    split = datasets[split_name]
    # The part of each key before the first '_' is treated as the dialogue id.
    dialogue_ids = {utt_id.split('_')[0] for utt_id in split}
    unique_speakers = {utt['Speaker'] for utt in split.values()}
    table.append([split_name, len(dialogue_ids), len(split), len(unique_speakers)])

# Close the table with the same dashed row used under the header.
table.append(list(separator))
display(HTML(tabulate.tabulate(table, tablefmt='html')))


# For every split, show how many utterances each main actor speaks and what
# share of the split's utterances that is, followed by a SUM row.
for DATASET in ['train', 'dev', 'test']:

    header = [DATASET, 'num_utts', 'proportion']
    separator = ['-' * len(title) for title in header]
    table = [header, separator]

    num_utts = len(datasets[DATASET])
    # Counter preserves first-seen order, so the rows follow the order in
    # which speakers first appear in this split's annotations.
    speaker_counts = Counter(utt['Speaker'] for utt in datasets[DATASET].values())

    accu_utts = 0
    accu_proportion = 0
    for spk, num_occur in speaker_counts.items():
        if spk not in MAIN_ACTORS:
            continue
        accu_utts += num_occur
        # Accumulate the unrounded share; rounding happens once, after the loop.
        accu_proportion += num_occur / num_utts
        table.append([spk, num_occur, round(num_occur / num_utts, 3)])

    accu_proportion = round(accu_proportion, 3)

    table.append(['SUM', accu_utts, accu_proportion])
    table.append(list(separator))
    display(HTML(tabulate.tabulate(table, tablefmt='html')))


0,1,2,3
DATASET,number of dialogues,number of utterances,number of unique speakers
-------,-------------------,--------------------,-------------------------
train,1038,9989,260
dev,114,1108,47
test,280,2610,100
-------,-------------------,--------------------,-------------------------


0,1,2
train,num_utts,proportion
-----,--------,----------
Chandler,1283,0.128
Monica,1299,0.13
Rachel,1435,0.144
Joey,1510,0.151
Ross,1458,0.146
Phoebe,1321,0.132
SUM,8306,0.832
-----,--------,----------


0,1,2
dev,num_utts,proportion
---,--------,----------
Phoebe,184,0.166
Monica,137,0.124
Rachel,164,0.148
Chandler,101,0.091
Joey,149,0.134
Ross,217,0.196
SUM,952,0.859
---,--------,----------


0,1,2
test,num_utts,proportion
----,--------,----------
Rachel,356,0.136
Phoebe,291,0.111
Monica,346,0.133
Ross,373,0.143
Joey,411,0.157
Chandler,379,0.145
SUM,2156,0.826
----,--------,----------


## Get the stats of the raw video data

In [3]:
import os
from glob import glob

# Count the .mp4 files found for each split under PATHS['ORIGINAL_VIDEOS'].
header = ['DATASET', 'number of vids (mp4) ']
video_rows = [
    [split, len(glob(os.path.join(PATHS['ORIGINAL_VIDEOS'][split], '*.mp4')))]
    for split in ('train', 'dev', 'test')
]

# Header, a dashed row as wide as each header cell, then one row per split.
table = [header, ['-' * len(title) for title in header], *video_rows]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1
DATASET,number of vids (mp4)
-------,---------------------
train,9989
dev,1112
test,2747


## Get the stats of the extracted visual features

In [4]:
# Count the .npy files found for each split under PATHS['VISUAL_FEATURES'].
column_titles = ['DATASET', 'number of npys']
table = [column_titles, ['-' * len(title) for title in column_titles]]

for split in ['train', 'dev', 'test']:
    npy_pattern = os.path.join(PATHS['VISUAL_FEATURES'][split], '*.npy')
    table.append([split, len(glob(npy_pattern))])

display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1
DATASET,number of npys
-------,--------------
train,9988
dev,1108
test,2610


## Get the stats of the extracted face videos

In [5]:
def _count_face_videos(split):
    """Return the number of ``*.mp4`` paths matched under ``PATHS['FACE_VIDEOS'][split]``."""
    return len(glob(os.path.join(PATHS['FACE_VIDEOS'][split], '*.mp4')))


titles = ['DATASET', 'number of face-vids']
table = [titles]
# Dashed row whose cells are as wide as the corresponding header cell.
table.append(['-' * len(title) for title in titles])
table.extend([split, _count_face_videos(split)] for split in ['train', 'dev', 'test'])

display(HTML(tabulate.tabulate(table, tablefmt='html')))

0,1
DATASET,number of face-vids
-------,-------------------
train,7523
dev,860
test,1983
