In [1]:
import os
# Move the working directory two levels up so that project-relative imports
# (`configs`) and paths (`data/...`) used by the later cells resolve.
# The root is resolved only once per kernel session: re-running this cell
# would otherwise recompute it from the already-changed cwd and climb two
# more directory levels on every execution.
if "root_dir" not in globals():
    current_dir = os.getcwd()
    root_dir = os.path.dirname(os.path.dirname(current_dir))
os.chdir(root_dir)

In [2]:
import os
import json
import gzip
import configs
from collections import defaultdict

def gather_all_captions(path, splits=['train', 'val', 'test']):
    """Collect every caption found in the per-split JSON files under `path`.

    For each name in `splits`, the file `<path>/<split>.json` is read if it
    exists; missing files are silently skipped. Each file is expected to hold
    an iterable of items with a 'caption' entry that is either a single
    caption or a list of captions.

    Args:
        path: Directory containing the `<split>.json` files.
        splits: Split names to look for (the default list is never mutated).

    Returns:
        A flat list with all captions, in split order and then file order.
        Empty when none of the split files exist.
    """
    all_captions = []
    for split in splits:
        this_path = os.path.join(path, f'{split}.json')
        if not os.path.exists(this_path):
            continue
        # Close the handle deterministically and decode as UTF-8 regardless of
        # the platform locale (the captions are multilingual).
        with open(this_path, encoding='utf8') as f:
            data = json.load(f)
        for item in data:
            caption = item['caption']
            if isinstance(caption, list):
                all_captions.extend(caption)
            else:
                all_captions.append(caption)
    return all_captions


def get_captions_from_tsv_gz_file(file, position=-1):
    """Read one column from a gzip-compressed, tab-separated UTF-8 text file.

    Args:
        file: Path to the `.tsv.gz` file.
        position: Index of the column to extract from every line
            (default -1, i.e. the last column).

    Returns:
        A list with one entry per line: the selected field with surrounding
        whitespace removed.

    Raises:
        IndexError: If a line has fewer columns than `position` requires.
    """
    captions = []
    with gzip.open(file, 'rt', encoding='utf8') as f:
        for line in f:
            # Remove only the line terminator before splitting. A plain
            # strip() would also eat leading/trailing tabs, so an empty first
            # or last column would shift the indices and return the wrong
            # field.
            fields = line.rstrip('\r\n').split('\t')
            captions.append(fields[position].strip())
    return captions


def run(dataset='xm3600', diction=None, print_info=True):
    """Compute per-language caption statistics for a dataset.

    For every language in `configs.xm3600_langs`, three values are appended
    to `diction[lang]`, in this order: the number of captions (int), the
    average number of words per caption and the average number of characters
    per caption (floats).

    Words are counted by splitting on a single space character, so the word
    average is not meaningful for text that is not space-delimited.

    Args:
        dataset: 'xm3600' or 'coco'. For 'coco', captions come from the
            translated annotation files plus the `coco_vision-en-<lang>`
            `.tsv.gz` corpus file.
        diction: Optional mapping from language to list that the statistics
            are appended to; a fresh `defaultdict(list)` is used when omitted.
        print_info: When true, print one line of statistics per language.

    Returns:
        The mapping holding the statistics (the same object as `diction`
        when one was passed in).
    """
    if diction is None:
        diction = defaultdict(list)

    assert dataset in ['xm3600', 'coco']

    for lang in configs.xm3600_langs:
        if dataset == 'xm3600':
            captions = gather_all_captions(f'data/annotations/{dataset}/{lang}')
        else:
            # val, test captions
            captions = gather_all_captions(f'data/annotations/{dataset}/translated/{lang}')
            captions.extend(
                get_captions_from_tsv_gz_file(f'data/corpus/multilingual_coco/36langs/triplet/coco_vision-en-{lang}.tsv.gz')
            )

        num_captions = len(captions)
        num_words = sum(len(caption.split(" ")) for caption in captions)
        num_chars = sum(len(caption) for caption in captions)

        # gather_all_captions returns [] when no annotation file exists for a
        # language; report zero averages instead of raising ZeroDivisionError.
        avg_words = num_words / num_captions if num_captions else 0.0
        avg_chars = num_chars / num_captions if num_captions else 0.0

        diction[lang].append(num_captions)
        diction[lang].append(avg_words)
        diction[lang].append(avg_chars)
        if print_info:
            print(lang, num_captions, avg_words, avg_chars)
    return diction


def print_latex_table_info(dictions=None, keys=['name', 'script', 'family', 'branch']):
    """Print one LaTeX table row per language in `configs.xm3600_langs`.

    Each row starts with the language code, followed by the
    `configs.language_info` fields named in `keys`, followed by the values
    stored under that language in every mapping of `dictions`. Cells are
    joined with a tab plus '&' and the row ends with a LaTeX line break.

    Args:
        dictions: Optional mapping, or list of mappings, from language code
            to a sequence of numbers. Plain ints are printed with thousands
            separators; every other value is printed with one decimal place.
        keys: Names of the `configs.language_info[lang]` fields to include.
    """
    # Accept a single mapping as a convenience: wrap it into a list once.
    if dictions is not None and not isinstance(dictions, list):
        dictions = [dictions]

    for lang in configs.xm3600_langs:
        cells = [lang] + [configs.language_info[lang][key] for key in keys]

        for stats in (dictions or []):
            for value in stats[lang]:
                if type(value) is int:
                    cells.append(format(value, ','))
                else:
                    cells.append('%.1f' % value)

        row = '\t&'.join(cells)
        print(row + '\\\\')

In [3]:
# Caption statistics (count, average words, average characters) per language for the 'coco' dataset.
coco_diction = run(dataset='coco', print_info=True)

ar 616767 7.57328618424786 41.17360688882511
bn 616767 8.623934159901552 50.937923073056766
cs 616767 7.603200236069699 45.864389631741
da 616767 9.622653287221917 51.634586480794205
de 616767 9.454635218810346 60.28532168549874
el 616767 10.157325213573358 61.90445824760404
en 616767 10.480586672114429 52.350302464301755
es 616767 11.051148975220789 59.902989297417015
fa 616767 10.871907867963104 48.85080265318994
fi 616767 6.095961684071943 50.71441727589187
fil 616767 11.701319947403153 66.60320510014317
fr 616767 10.905677508686425 60.682187925099754
he 616767 6.954697641086504 36.58122921621941
hi 616767 10.892756908200342 50.382820416786245
hr 616767 7.8033973931808935 47.373247920203255
hu 616767 7.742838057159348 49.76929699546182
id 616767 8.70650018564547 56.96618496125765
it 616767 10.733751967923057 60.038345112497915
ja 616767 1.3081374327744513 22.18303022048845
ko 616767 6.874897651787466 24.94046374076434
mi 616767 13.719128941723536 61.63063198906556
nl 616767 9.783310

In [4]:
# Caption statistics (count, average words, average characters) per language for the 'xm3600' dataset.
xm3600_diction = run(dataset='xm3600', print_info=True)

ar 7367 7.7112800325777116 42.18840776435455
bn 3600 11.2775 62.117222222222225
cs 7207 6.50839461634522 39.13819897322048
da 7264 8.704157488986784 48.32433920704846
de 8643 11.158509776697906 76.45377762351036
el 7204 7.742087729039422 51.410744031093834
en 7200 9.370416666666667 49.51222222222222
es 8614 9.77490132342698 56.30415602507546
fa 7245 12.747550034506556 59.35886818495514
fi 7127 7.514943173845938 65.15139609934053
fil 7109 12.165986777324518 67.56041637361092
fr 8562 12.330296659658957 69.59074982480729
he 7200 11.931111111111111 63.579861111111114
hi 8503 13.403034223215336 59.87204516053158
hr 7280 9.041208791208792 57.77403846153846
hu 7216 8.539634146341463 60.51510532150776
id 7126 14.282486668537748 93.5148751052484
it 8471 12.09373155471609 71.84559083933419
ja 7185 1.0019485038274183 25.980793319415447
ko 7650 6.964575163398693 24.681307189542483
mi 4732 11.74323753169907 55.53677092138631
nl 8059 7.9954088596600075 45.94006700583199
no 7213 9.579786496603354 54.

In [16]:
# Print only the language metadata columns (default keys); no caption statistics are passed.
print_latex_table_info()

ar	&Arabic	&Arabic	&Afro-Asiatic	&\\
bn	&Bengali	&Bengali	&Indo-European	&Indo-Iranian\\
cs	&Czech	&Latin	&Indo-European	&Balto-Slavic\\
da	&Danish	&Latin	&Indo-European	&North Germanic\\
de	&German	&Latin	&Indo-European	&West Germanic\\
el	&Greek	&Latin	&Indo-European	&Hellenic\\
en	&English	&Latin	&Indo-European	&West Germanic\\
es	&Spanish	&Latin	&Indo-European	&Italic\\
fa	&Persian	&Arabic	&Indo-European	&Indo-Iranian\\
fi	&Finnish	&Latin	&Uralic	&Finnic\\
fil	&Filipino	&Latin	&Austronesian	&Malayo-Polynesian\\
fr	&French	&Latin	&Indo-European	&Italic\\
he	&Hebrew	&Hebrew	&Afro-Asiatic	&Semitic\\
hi	&Hindi	&Devanagari	&Indo-European	&Indo-Iranian\\
hr	&Croatian	&Latin	&Indo-European	&Balto-Slavic\\
hu	&Hungarian	&Latin	&Uralic	&\\
id	&Indonesian	&Latin	&Austronesian	&Malayo-Polynesian\\
it	&Italian	&Latin	&Indo-European	&Italic\\
ja	&Japanese	&Kanji	&Japonic	&\\
ko	&Korean	&Hangul	&Koreanic	&\\
mi	&Māori	&Latin	&Austronesian	&Malayo-Polynesian\\
nl	&Dutch	&Latin	&Indo-European	&Wes

In [5]:
# Print only the statistics columns (keys=[] drops the metadata): coco values first, then xm3600.
print_latex_table_info(dictions=[coco_diction, xm3600_diction], keys=[])

ar	&616,767	&7.6	&41.2	&7,367	&7.7	&42.2\\
bn	&616,767	&8.6	&50.9	&3,600	&11.3	&62.1\\
cs	&616,767	&7.6	&45.9	&7,207	&6.5	&39.1\\
da	&616,767	&9.6	&51.6	&7,264	&8.7	&48.3\\
de	&616,767	&9.5	&60.3	&8,643	&11.2	&76.5\\
el	&616,767	&10.2	&61.9	&7,204	&7.7	&51.4\\
en	&616,767	&10.5	&52.4	&7,200	&9.4	&49.5\\
es	&616,767	&11.1	&59.9	&8,614	&9.8	&56.3\\
fa	&616,767	&10.9	&48.9	&7,245	&12.7	&59.4\\
fi	&616,767	&6.1	&50.7	&7,127	&7.5	&65.2\\
fil	&616,767	&11.7	&66.6	&7,109	&12.2	&67.6\\
fr	&616,767	&10.9	&60.7	&8,562	&12.3	&69.6\\
he	&616,767	&7.0	&36.6	&7,200	&11.9	&63.6\\
hi	&616,767	&10.9	&50.4	&8,503	&13.4	&59.9\\
hr	&616,767	&7.8	&47.4	&7,280	&9.0	&57.8\\
hu	&616,767	&7.7	&49.8	&7,216	&8.5	&60.5\\
id	&616,767	&8.7	&57.0	&7,126	&14.3	&93.5\\
it	&616,767	&10.7	&60.0	&8,471	&12.1	&71.8\\
ja	&616,767	&1.3	&22.2	&7,185	&1.0	&26.0\\
ko	&616,767	&6.9	&24.9	&7,650	&7.0	&24.7\\
mi	&616,767	&13.7	&61.6	&4,732	&11.7	&55.5\\
nl	&616,767	&9.8	&56.0	&8,059	&8.0	&45.9\\
no	&616,767	&9.6	&51.4	&7,213	&9.6