# 1. XM3600
```
@inproceedings{thapliyal2022XM3600,
  title = {Crossmodal-3600: A Massively Multilingual Multimodal Evaluation Dataset},
  booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing},
  author = {Thapliyal, Ashish V. and Pont Tuset, Jordi and Chen, Xi and Soricut, Radu},
  year = {2022},
  pages = {715--729}
}
```

## 1.1 Official Annotations

In [1]:
# Fetch the official XM3600 caption archive, extract captions.jsonl from it,
# then delete the archive.
# NOTE(review): --no-check-certificate turns off TLS certificate verification
# for this download — confirm that is acceptable.
!wget -c https://google.github.io/crossmodal-3600/web-data/captions.zip --no-check-certificate
!unzip captions.zip
!rm captions.zip

--2023-12-25 18:10:52--  https://google.github.io/crossmodal-3600/web-data/captions.zip
Resolving google.github.io (google.github.io)... 2606:50c0:8002::153, 2606:50c0:8003::153, 2606:50c0:8000::153, ...
Connecting to google.github.io (google.github.io)|2606:50c0:8002::153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16778794 (16M) [application/zip]
Saving to: ‘captions.zip’


2023-12-25 18:10:55 (9.32 MB/s) - ‘captions.zip’ saved [16778794/16778794]

Archive:  captions.zip
  inflating: captions.jsonl          


In [8]:
# !wget -c https://open-images-dataset.s3.amazonaws.com/crossmodal-3600/images.tgz --no-check-certificate
# !mkdir XM3600
# !tar -xvzf images.tgz -C XM3600
# !rm images.tgz

In [3]:
import os
# Root directory that receives one sub-directory of annotation files per language.
SAVE_PATH = './annotations/xm3600'
# exist_ok=True keeps the cell re-runnable when the directory is already there.
os.makedirs(SAVE_PATH, exist_ok=True)

In [4]:
# Parse captions.jsonl (one JSON object per line) into a dict keyed by the
# 0-based line index.
# json.loads replaces the original eval(): the file is downloaded data, and
# eval() would execute any Python expression it happened to contain.
import json

# The context manager closes the file handle, which the original left open.
with open('captions.jsonl', encoding='utf8') as f:
    lines = f.read().strip().split('\n')
data = {i: json.loads(line) for i, line in enumerate(lines)}

In [5]:
# Show the fields of the first record; the loop that writes the annotations
# iterates over exactly these keys.
data[0].keys()

dict_keys(['image/key', 'image/locale', 'ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'hi', 'hr', 'hu', 'id', 'it', 'he', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh'])

In [6]:
import json

# For every language key of the XM3600 records, write three files under
# SAVE_PATH/<key>/:
#   test.json              - one entry per image holding all of its captions
#   test_gt.json           - {'annotations': [...], 'images': [...]} built from 'caption'
#   test_tokenized_gt.json - same layout, built from 'caption/tokenized/lowercase'
for key in data[0].keys():
    # Keys containing 'image' ('image/key', 'image/locale') are metadata, not languages.
    if 'image' in key:
        continue

    save_path = os.path.join(SAVE_PATH, key)
    os.makedirs(save_path, exist_ok=True)

    # image_id is the record's index in `data` (its line number in captions.jsonl).
    test_data = [
        {
            'image': data[i]['image/key'] + '.jpg',
            'image/locale': data[i]['image/locale'],
            'caption': data[i][key]['caption'],
            'image_id': i,
        }
        for i in range(len(data))
    ]

    with open(os.path.join(save_path, 'test.json'), 'w') as wf:
        json.dump(test_data, wf)

    for caption_key, name in zip(['caption', 'caption/tokenized/lowercase'], ['test_gt.json', 'test_tokenized_gt.json']):
        gt = {
            'annotations': [],
            'images': [],
        }

        # Annotation ids run consecutively across all images of this file.
        caption_id = 0
        for item in test_data:
            assert isinstance(item['caption'], (list, tuple))
            image_id = item['image_id']
            for caption in data[image_id][key][caption_key]:
                # Named `annotation` so the outer loop variable `item` is no
                # longer rebound inside its own loop body.
                annotation = dict(
                    image_id=image_id,
                    caption=caption,
                    id=caption_id,
                )
                caption_id += 1
                gt['annotations'].append(annotation)
            gt['images'].append({'id': image_id})

        with open(os.path.join(save_path, name), 'w') as wf:
            json.dump(gt, wf)

In [7]:
# Delete the extracted captions file now that the annotation files are written.
!rm captions.jsonl

## 1.2 Translating XM3600's annotations into English for the translate-test evaluation
Here are the steps:
1. Write the test captions from, e.g., `annotations/xm3600/ar/test.json` into a `.docx` file, one caption per line (see 1.2.1)
2. Translate the `.docx` file from step 1 with [Google Translate](https://translate.google.com)
3. Convert the translated file from step 2 into, e.g., `annotations/xm3600/translated_to_en/ar/test.json` (see 1.2.3)

**Note:** `annotations/xm3600/en/test.json` is identical to `annotations/xm3600/translated_to_en/en/test.json`

### 1.2.1 `annotations/xm3600/{lang}/test.json` --> `dummy_xm3600/{lang}.docx`

In [12]:
!pip install python-docx

Collecting python-docx
  Downloading python_docx-1.1.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.6/239.6 kB[0m [31m718.9 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.0


In [27]:
import os
import json
import re
from  tqdm import tqdm
from docx import Document

# Export each language's test captions to dummy_xm3600/<lang>.docx, one caption
# per paragraph, so the documents can be translated as a whole.
annotations_root = "annotations/xm3600"
save_root = 'dummy_xm3600'
os.makedirs(save_root, exist_ok=True)
fn = 'test.json'

langs = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh']

# Control characters stripped from a caption that python-docx refuses to store.
# \x0c (form feed) is included: it is rejected like the other control characters,
# so leaving it in would make the retry below raise again.
invalid_xml_chars = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]")

for lang in tqdm(langs):
    save_path = os.path.join(save_root, f'{lang}.docx')
    if os.path.exists(save_path):
        # Already exported by an earlier run.
        continue
    with open(os.path.join(annotations_root, lang, fn), 'r', encoding='utf8') as rf:
        json_data = json.load(rf)
    # Flatten: every image contributes all of its captions, in file order.
    captions = [caption.strip() for item in json_data for caption in item['caption']]
    print(lang, len(captions))

    doc = Document()
    for caption in captions:
        try:
            doc.add_paragraph(caption)
        except ValueError:
            # Raised for text that is not XML compatible (control characters).
            # Show the caption before and after cleaning, then retry.
            print(caption)
            caption = invalid_xml_chars.sub("", caption)
            print(caption)
            doc.add_paragraph(caption)

    doc.save(save_path)

  0%|                                                                                                                                                                    | 0/36 [00:00<?, ?it/s]

ar 7367


  3%|████▎                                                                                                                                                       | 1/36 [00:01<00:59,  1.71s/it]

bn 3600


  6%|████████▋                                                                                                                                                   | 2/36 [00:02<00:34,  1.02s/it]

cs 7207


  8%|█████████████                                                                                                                                               | 3/36 [00:03<00:34,  1.05s/it]

da 7264


 11%|█████████████████▎                                                                                                                                          | 4/36 [00:04<00:40,  1.27s/it]

de 8643


 14%|█████████████████████▋                                                                                                                                      | 5/36 [00:06<00:46,  1.49s/it]

el 7204


 17%|██████████████████████████                                                                                                                                  | 6/36 [00:08<00:41,  1.39s/it]

en 7200


 19%|██████████████████████████████▎                                                                                                                             | 7/36 [00:09<00:37,  1.29s/it]

es 8614


 22%|██████████████████████████████████▋                                                                                                                         | 8/36 [00:10<00:37,  1.35s/it]

fa 7245


 25%|███████████████████████████████████████                                                                                                                     | 9/36 [00:11<00:35,  1.31s/it]

fi 7127


 28%|███████████████████████████████████████████                                                                                                                | 10/36 [00:12<00:32,  1.25s/it]

fil 7109


 31%|███████████████████████████████████████████████▎                                                                                                           | 11/36 [00:14<00:30,  1.20s/it]

fr 8562


 33%|███████████████████████████████████████████████████▋                                                                                                       | 12/36 [00:15<00:31,  1.30s/it]

he 7200


 36%|███████████████████████████████████████████████████████▉                                                                                                   | 13/36 [00:16<00:29,  1.27s/it]

hi 8503


 39%|████████████████████████████████████████████████████████████▎                                                                                              | 14/36 [00:18<00:29,  1.36s/it]

hr 7280


 42%|████████████████████████████████████████████████████████████████▌                                                                                          | 15/36 [00:19<00:27,  1.29s/it]

hu 7216


 44%|████████████████████████████████████████████████████████████████████▉                                                                                      | 16/36 [00:20<00:24,  1.24s/it]

id 7126


 47%|█████████████████████████████████████████████████████████████████████████▏                                                                                 | 17/36 [00:21<00:22,  1.21s/it]

it 8471


 50%|█████████████████████████████████████████████████████████████████████████████▌                                                                             | 18/36 [00:23<00:23,  1.29s/it]

ja 7185


 53%|█████████████████████████████████████████████████████████████████████████████████▊                                                                         | 19/36 [00:24<00:20,  1.23s/it]

ko 7650


 56%|██████████████████████████████████████████████████████████████████████████████████████                                                                     | 20/36 [00:25<00:19,  1.23s/it]

mi 4732


 58%|██████████████████████████████████████████████████████████████████████████████████████████▍                                                                | 21/36 [00:26<00:16,  1.11s/it]

nl 8059


 61%|██████████████████████████████████████████████████████████████████████████████████████████████▋                                                            | 22/36 [00:28<00:18,  1.30s/it]

no 7213


 64%|███████████████████████████████████████████████████████████████████████████████████████████████████                                                        | 23/36 [00:29<00:16,  1.24s/it]

pl 7141


 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                   | 24/36 [00:30<00:14,  1.19s/it]

pt 7243


 69%|███████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                               | 25/36 [00:31<00:12,  1.16s/it]

quz 7200


 72%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                                           | 26/36 [00:32<00:11,  1.13s/it]

ro 7123


 75%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                      | 27/36 [00:33<00:10,  1.13s/it]

ru 7200


 78%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 28/36 [00:35<00:10,  1.31s/it]

sv 7273


 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                              | 29/36 [00:36<00:09,  1.30s/it]

sw 7046


 83%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏                         | 30/36 [00:37<00:07,  1.23s/it]

te 7200


 86%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                     | 31/36 [00:38<00:06,  1.21s/it]

th 7200


 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 32/36 [00:39<00:04,  1.21s/it]

tr 7233


 92%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████             | 33/36 [00:41<00:03,  1.19s/it]

uk 7215


 94%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍        | 34/36 [00:42<00:02,  1.20s/it]

vi 7350
Dòngchâm ngôn chống tư bản gắn trên tường có khung được kết từ rất nhiều mảnh thẻ Visa vụn, bên dưới có mã QR
Dòng châm ngôn chống tư bản gắn trên tường có khung được kết từ rất nhiều mảnh thẻ Visa vụn, bên dưới có mã QR
Chiếc bànxếp chân sắt phủ tấm trải viền cờ mỹ, trên mặt bàn có 2 hộp bánh rán vòng, dưới chân có lốc nước khoáng và thùng màu trắng, bàn đặt trong căn phòng có sàn màu trắng
Chiếc bàn xếp chân sắt phủ tấm trải viền cờ mỹ, trên mặt bàn có 2 hộp bánh rán vòng, dưới chân có lốc nước khoáng và thùng màu trắng, bàn đặt trong căn phòng có sàn màu trắng
nam thanh niên trẻ mặc áo phông đỏ, đeo kính, tóc húi cua
nam thanh niên trẻ mặc áo phông đỏ, đeo kính, tóc húi cua
Món chân gà rán phủ sốt cay ngon tuyệtbày trên đĩa trắng
Món chân gà rán phủ sốt cay ngon tuyệt bày trên đĩa trắng
nam doanh nhân da màu trung niên tươi cười bên bức tường có chữ Ushahidi
nam doanh nhân da màu trung niên tươi cười bên bức tường có chữ Ushahidi
ĐườngThống nhất tại Sài Gòn khá đông đúc ô 

 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋    | 35/36 [00:43<00:01,  1.22s/it]

zh 7174


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:44<00:00,  1.24s/it]


In [28]:
from docx import Document
# Sanity check: print the first paragraph of the exported Chinese document.
doc = Document(os.path.join(save_root, 'zh.docx'))
first_paragraph = next(iter(doc.paragraphs), None)
if first_paragraph is not None:
    print(first_paragraph.text)

在山里中站着两只鸡，一只黄色另一只黑黄色，它们俩站着看向同一个方向


### 1.2.2 Translate `dummy_xm3600/{lang}.docx` into English by yourselves!

In [30]:
from docx import Document
# Quick check: print the first paragraph of each language's .docx.
save_root = 'dummy_xm3600'
langs = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh']
for lang in langs:
    paragraphs = Document(os.path.join(save_root, f'{lang}.docx')).paragraphs
    if paragraphs:
        print(lang, '\t', paragraphs[0].text)

ar 	 Rooster and chick on the ground
bn 	 There are two brown and black colored chickens in the Jungle Madha
cs 	 Rooster and hen in the grass
da 	 A brown hen and a multicolored rooster in the forest floor
de 	 A hen and a hen in the stony garden in the grass.
el 	 Rooster and hen
en 	 A rooster and hens surrounded by green leaves.
es 	 A rooster and a hen among rocks and grass
fa 	 Rooster and chicken in the garden on a clear day
fi 	 Two chickens and a rooster walking in the forest
fil 	 Two roosters and another chicken with incomplete picture
fr 	 A hen and a rooster in the woods
he 	 Chickens walk on the ground with green vegetation around.
hi 	 view of two chickens among small plants on the ground
hr 	 A hen with brown feathers and a rooster with black and brown feathers in nature
hu 	 Rooster and hen.
id 	 2 chickens nestling in an open garden area among the weeds
it 	 Roosters in the wild, natural environment, dry leaves, weeds, uneven ground
ja 	 Two roosters walking in the gr

### 1.2.3 `dummy_xm3600/{lang}.docx` --> `annotations/xm3600/translated_to_en/{lang}/test.json`

In [31]:
import os
import json
import copy
from  tqdm import tqdm
from docx import Document

# Rebuild annotations/xm3600/translated_to_en/<lang>/test.json from the
# translated documents: each image keeps its metadata and receives as many
# translated captions as it had original captions, in the same order.
annotations_root = 'annotations/xm3600'
save_root = os.path.join(annotations_root, 'translated_to_en')
os.makedirs(save_root, exist_ok=True)
read_root = 'dummy_xm3600'
fn = 'test.json'

langs = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh']

for lang in tqdm(langs):
    this_save_path = os.path.join(save_root, lang, fn)
    if os.path.exists(this_save_path):
        # Already converted by an earlier run.
        continue

    # Context manager closes the handle the original left open.
    with open(os.path.join(annotations_root, lang, fn), 'r', encoding='utf8') as rf:
        json_data = json.load(rf)

    if lang == 'en':
        # English needs no translation; the original annotations are reused as-is.
        save_data = json_data
    else:
        save_data = []
        read_path = os.path.join(read_root, f'{lang}.docx')
        captions = [p.text for p in Document(read_path).paragraphs]
        # Blank paragraphs carry no caption and are dropped before alignment.
        captions = [c.strip() for c in captions if c.strip()]
        caption_idx = 0
        for item in json_data:
            num_captions = len(item['caption'])
            new_item = copy.deepcopy(item)
            # Captions were exported image by image, so consecutive slices map back.
            new_item['caption'] = captions[caption_idx:caption_idx+num_captions]
            save_data.append(new_item)
            caption_idx += num_captions

        # Every translated paragraph must be consumed exactly once; a mismatch
        # means the translation added or lost lines and the alignment is wrong.
        assert caption_idx == len(captions), f'{lang} {caption_idx} {len(captions)}'
    assert len(save_data) == len(json_data)

    os.makedirs(os.path.dirname(this_save_path), exist_ok=True)
    with open(this_save_path, 'w') as wf:
        json.dump(save_data, wf)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 82062.47it/s]


In [32]:
# Remove the intermediate .docx directory once the translated JSON files exist.
!rm -r dummy_xm3600

# 2. MSCOCO (Karpathy's splits)

```bibtex
@misc{chen2015MSCOCO,
  title = {Microsoft COCO Captions: Data Collection and Evaluation Server},
  author = {Chen, Xinlei and Fang, Hao and Lin, Tsung-Yi and Vedantam, Ramakrishna and Gupta, Saurabh and Dollar, Piotr and Zitnick, C. Lawrence},
  year = {2015},
  number = {arXiv:1504.00325},
  eprint = {1504.00325},
  doi = {10.48550/arXiv.1504.00325},
  archiveprefix = {arxiv}
}
@inproceedings{karpathy2015deep,
  title={Deep visual-semantic alignments for generating image descriptions},
  author={Karpathy, Andrej and Fei-Fei, Li},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={3128--3137},
  year={2015}
}
```

## 2.1 Official Annotations

Please refer to [yangbang18/ZeroNLG/data/prepare_text_data.ipynb](https://github.com/yangbang18/ZeroNLG/blob/master/data/prepare_text_data.ipynb)

## 2.2 Translating MSCOCO's English annotations into 36 languages
Here are the steps:
1. Write the val and test captions into separate `.docx` files, one caption per line (see 2.2.1)
2. Translate the `.docx` files from step 1 with [Google Translate](https://translate.google.com)
3. Convert the translated files from step 2 into, e.g., `annotations/coco/translated/ar/val.json` and `annotations/coco/translated/ar/test.json` (see 2.2.3)

**Note:** `annotations/coco/en/*` is identical to `annotations/coco/translated/en/*`

### 2.2.1 `annotations/coco/en/val.json` --> `dummy_val/coco_en.docx` && `annotations/coco/en/test.json` --> `dummy_test/coco_en.docx`

In [23]:
import os
import json
from docx import Document

# Export the English COCO val/test captions to dummy_<mode>/coco_en.docx,
# one caption per paragraph, ready to be translated.
annotations_root = "annotations/coco"
save_root_format = 'dummy_{}'

for mode in ['val', 'test']:
    save_root = save_root_format.format(mode)
    os.makedirs(save_root, exist_ok=True)
    save_path = os.path.join(save_root, 'coco_en.docx')

    # Context manager closes the handle the original left open.
    with open(os.path.join(annotations_root, 'en', f'{mode}.json')) as rf:
        json_data = json.load(rf)
    # Flatten: every image contributes all of its captions, in file order.
    captions = [caption.strip() for item in json_data for caption in item['caption']]
    print(mode, len(captions), captions[:3])

    doc = Document()
    for caption in captions:
        doc.add_paragraph(caption)
    doc.save(save_path)

val 25010 ['A child holding a flowered umbrella and petting a yak.', 'A young man holding an umbrella next to a herd of cattle.', 'a young boy barefoot holding an umbrella touching the horn of a cow']
test 25010 ['A man with a red helmet on a small moped on a dirt road.', 'Man riding a motor bike on a dirt road on the countryside.', 'A man riding on the back of a motorcycle.']


### 2.2.2 Translate `dummy_val/coco_en.docx` and `dummy_test/coco_en.docx` into other 35 languages by yourselves!

In [33]:
from docx import Document
# Quick check of the val documents: print the first paragraph per language.
save_root = 'dummy_val'
langs = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh']
for lang in langs:
    paragraphs = Document(os.path.join(save_root, f'coco_{lang}.docx')).paragraphs
    if paragraphs:
        print(lang, '\t', paragraphs[0].text)

ar 	 طفل يحمل مظلة مزهرة ويأكل ثورًا.
bn 	 একটি শিশু একটি ফুলের ছাতা ধরে একটি ইয়াক পোষাচ্ছে।
cs 	 Dítě drží květovaný deštník a hladí jaka.
da 	 Et barn holder en blomstret paraply og klapper en yak.
de 	 Ein Kind hält einen geblümten Regenschirm und streichelt ein Yak.
el 	 Ένα παιδί που κρατά μια ανθισμένη ομπρέλα και χαϊδεύει ένα γιακ.
en 	 A child holding a flowered umbrella and petting a yak.
es 	 Un niño sosteniendo un paraguas floreado y acariciando un yak.
fa 	 کودکی که چتر گلدار را در دست گرفته و سگی را نوازش می کند.
fi 	 Lapsi pitelee kukkaista sateenvarjoa ja silittää jakkia.
fil 	 Isang bata na may hawak na bulaklak na payong at hinahaplos ang isang yak.
fr 	 Un enfant tenant un parapluie fleuri et caressant un yak.
he 	 ילד מחזיק מטריה פרחונית ומלטף יאק.
hi 	 फूल छाता लिए एक बच्चा और याक को सहला रहा है।
hr 	 Dijete drži kišobran s cvjetovima i mazi jaka.
hu 	 Egy gyerek, aki virágos esernyőt tart, és egy jakot simogat.
id 	 Seorang anak memegang payung berbunga dan mengel

In [34]:
from docx import Document
# Quick check of the test documents: print the first paragraph per language.
save_root = 'dummy_test'
langs = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh']
for lang in langs:
    paragraphs = Document(os.path.join(save_root, f'coco_{lang}.docx')).paragraphs
    if paragraphs:
        print(lang, '\t', paragraphs[0].text)

ar 	 رجل يرتدي خوذة حمراء على دراجة بخارية صغيرة على طريق ترابي.
bn 	 একটি কাঁচা রাস্তায় একটি ছোট মোপেডে লাল হেলমেট পরা একজন ব্যক্তি।
cs 	 Muž s červenou helmou na malém mopedu na polní cestě.
da 	 En mand med rød hjelm på en lille knallert på en grusvej.
de 	 Ein Mann mit rotem Helm auf einem kleinen Moped auf einer unbefestigten Straße.
el 	 Ένας άντρας με ένα κόκκινο κράνος σε ένα μικρό μοτοποδήλατο σε έναν χωματόδρομο.
en 	 A man with a red helmet on a small moped on a dirt road.
es 	 Un hombre con casco rojo en un pequeño ciclomotor en un camino de tierra.
fa 	 مردی با کلاه ایمنی قرمز روی یک موپد کوچک در جاده خاکی.
fi 	 Mies punaisella kypärällä pienellä mopolla hiekkatiellä.
fil 	 Isang lalaking may pulang helmet sa isang maliit na moped sa isang maruming kalsada.
fr 	 Un homme avec un casque rouge sur un petit cyclomoteur sur un chemin de terre.
he 	 אדם עם קסדה אדומה על טוסטוס קטן בדרך עפר.
hi 	 गंदगी भरी सड़क पर छोटी मोपेड पर लाल हेलमेट पहने एक आदमी।
hr 	 Čovjek s crvenom kac

### 2.2.3 `dummy_val/coco_{lang}.docx` --> `annotations/coco/translated/{lang}/val.json` && `dummy_test/coco_{lang}.docx` --> `annotations/coco/translated/{lang}/test.json`

In [36]:
import os
import json
import copy
from  tqdm import tqdm
from docx import Document

# Rebuild, for every language and for both val and test, two files under
# annotations/coco/translated/<lang>/:
#   <mode>.json    - the English items with their captions replaced by translations
#   <mode>_gt.json - {'annotations': [...], 'images': [...]} built from those captions
annotations_root = 'annotations/coco'
save_root = os.path.join(annotations_root, 'translated')
os.makedirs(save_root, exist_ok=True)
read_path_format = 'dummy_{}/coco_{}.docx'

langs = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fi', 'fil', 'fr', 'he', 'hi', 'hr', 'hu', 'id', 'it', 'ja', 'ko', 'mi', 'nl', 'no', 'pl', 'pt', 'quz', 'ro', 'ru', 'sv', 'sw', 'te', 'th', 'tr', 'uk', 'vi', 'zh']

for mode in tqdm(['val', 'test']):
    # Context managers close the handles the original left open.
    with open(os.path.join(annotations_root, 'en', f'{mode}.json')) as rf:
        json_data = json.load(rf)
    with open(os.path.join(annotations_root, 'en', f'{mode}_gt.json')) as rf:
        gt_data = json.load(rf)

    for lang in tqdm(langs):
        save_path = os.path.join(save_root, lang)
        os.makedirs(save_path, exist_ok=True)
        path1 = os.path.join(save_path, f'{mode}.json')
        path2 = os.path.join(save_path, f'{mode}_gt.json')
        if os.path.exists(path1) and os.path.exists(path2):
            # Both outputs already exist from an earlier run.
            continue

        if lang == 'en':
            # English needs no translation; the original files are reused as-is.
            save_data = json_data
            gt = gt_data
        else:
            save_data = []
            read_path = read_path_format.format(mode, lang)
            captions = [p.text for p in Document(read_path).paragraphs]
            caption_idx = 0
            caption_id = 0
            gt = {
                'annotations': [],
                'images': [],
            }
            for item in json_data:
                num_captions = len(item['caption'])
                new_item = copy.deepcopy(item)
                # Captions were exported image by image, so consecutive slices map back.
                new_item['caption'] = captions[caption_idx:caption_idx+num_captions]
                save_data.append(new_item)
                caption_idx += num_captions

                image_id = item['image_id']
                for caption in new_item['caption']:
                    gt['annotations'].append(
                        dict(
                            image_id=image_id,
                            caption=caption,
                            id=caption_id,
                    ))
                    caption_id += 1
                gt['images'].append({'id': image_id})

            # Slicing past the end of `captions` silently yields short lists, so
            # verify the document held enough paragraphs and that nothing but
            # blank paragraphs is left over; otherwise the alignment is wrong.
            assert caption_idx <= len(captions), f'{mode} {lang} {caption_idx} {len(captions)}'
            assert not any(c.strip() for c in captions[caption_idx:]), f'{mode} {lang} {caption_idx} {len(captions)}'

        if not os.path.exists(path1):
            with open(path1, 'w') as wf:
                json.dump(save_data, wf)

        if not os.path.exists(path2):
            with open(path2, 'w') as wf:
                json.dump(gt, wf)

  0%|                                                                                                                                                                     | 0/2 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 24999.16it/s][A

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:00<00:00, 20532.36it/s][A
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 18.88it/s]


In [37]:
# Remove the intermediate .docx directories for both splits.
!rm -r dummy_val
!rm -r dummy_test

## 2.3 Multilingual Corpora

Similar to Section 2.2:
1. Write the train captions (`annotations/coco/en/train.json`) into a `.docx` file, one caption per line
2. Translate the `.docx` file from step 1 with [Google Translate](https://translate.google.com)
3. Convert the translated file from step 2 into, e.g., `corpus/multilingual_coco/36langs/coco_en.tsv.gz` and `corpus/multilingual_coco/36langs/coco_en-zh.tsv.gz`

**Note:** captions in `annotations/coco/en/train.json` are identical to `corpus/multilingual_coco/36langs/coco_en.tsv.gz`

In [15]:
import json
# Load the English training captions. Each caption is stripped and any embedded
# newline is replaced by a space.
# Context manager closes the handle the original left open.
with open("annotations/coco/en/train.json") as rf:
    json_data = json.load(rf)
json_captions = [item['caption'].strip().replace('\n', ' ') for item in json_data]
print(len(json_captions))

566747


In [11]:
import gzip
# Read the English corpus file: one stripped caption per line of the gzip'd TSV.
gzip_captions = []
with gzip.open('corpus/multilingual_coco/36langs/coco_en.tsv.gz', 'rt', encoding='utf8') as f:
    gzip_captions.extend(line.strip() for line in f)
print(len(gzip_captions))

566747


In [17]:
# True only if both lists hold the same captions in the same order.
json_captions == gzip_captions

True

# 3. CC3M
```bibtex
@inproceedings{sharma2018conceptual,
  title={Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning},
  author={Sharma, Piyush and Ding, Nan and Goodman, Sebastian and Soricut, Radu},
  booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={2556--2565},
  year={2018}
}
```

## 3.1 Official Annotations

Place data from: https://ai.google.com/research/ConceptualCaptions/download in this folder

- Train_GCC-training.tsv Training Split (3,318,333)

- Validation_GCC-1.1.0-Validation.tsv Validation Split (15,840)

In [1]:
# Download the CC3M training split as GCC-training.tsv (-c resumes a partial file).
# The URL is quoted so the shell does not treat '?' as a glob character.
# NOTE(review): --no-check-certificate turns off TLS certificate verification — confirm.
!wget -c -O GCC-training.tsv 'https://storage.googleapis.com/gcc-data/Train/GCC-training.tsv?_ga=2.191230122.-1896153081.1529438250' --no-check-certificate

--2024-01-31 16:20:12--  https://storage.googleapis.com/gcc-data/Train/GCC-training.tsv?_ga=2.191230122.-1896153081.1529438250
Resolving storage.googleapis.com (storage.googleapis.com)... 2404:6800:4005:804::201b, 2404:6800:4005:80c::201b, 2404:6800:4005:80a::201b, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:804::201b|:443... failed: Connection timed out.
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:80c::201b|:443... failed: Connection timed out.
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:80a::201b|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 564607502 (538M) [application/octet-stream]
Saving to: ‘GCC-training.tsv’


2024-01-31 16:26:08 (5.75 MB/s) - ‘GCC-training.tsv’ saved [564607502/564607502]



In [18]:
# Peek at the first rows of the downloaded file.
!head GCC-training.tsv

a very typical bus station	http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/AAAAAAAAM6o/_11MuAAKalQ/IMG_3422.JPG?imgmax=800
sierra looked stunning in this top and this skirt while performing with person at their former university	http://78.media.tumblr.com/3b133294bdc7c7784b781b45eb9af7be/tumblr_nbirmjpEme1tkk0fco1_500.jpg
young confused girl standing in front of a wardrobe	https://media.gettyimages.com/photos/young-confused-girl-standing-in-front-of-a-wardrobe-picture-id511063329?s=612x612
interior design of modern living room with fireplace in a new house	https://thumb1.shutterstock.com/display_pic_with_logo/152074/125938838/stock-photo-interior-design-of-modern-living-room-with-fireplace-in-a-new-house-125938838.jpg
cybernetic scene isolated on white background .	https://thumb1.shutterstock.com/display_pic_with_logo/324673/177023534/stock-photo-cybernetic-scene-isolated-on-white-background-sci-fi-robot-arm-made-of-compound-metallic-as-a-177023534.jpg
gangsta rap artist attends sports t

In [2]:
# Download the CC3M validation split as GCC-1.1.0-Validation.tsv (-c resumes a partial file).
# The URL is quoted so the shell does not treat '?' as a glob character.
# NOTE(review): --no-check-certificate turns off TLS certificate verification — confirm.
!wget -c -O GCC-1.1.0-Validation.tsv 'https://storage.googleapis.com/gcc-data/Validation/GCC-1.1.0-Validation.tsv?_ga=2.141047602.-1896153081.1529438250' --no-check-certificate

--2024-01-31 16:27:05--  https://storage.googleapis.com/gcc-data/Validation/GCC-1.1.0-Validation.tsv?_ga=2.141047602.-1896153081.1529438250
Resolving storage.googleapis.com (storage.googleapis.com)... 2404:6800:4005:804::201b, 2404:6800:4005:805::201b, 2404:6800:4005:80c::201b, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:804::201b|:443... failed: Connection timed out.
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:805::201b|:443... failed: Connection timed out.
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:80c::201b|:443... failed: Connection timed out.
Connecting to storage.googleapis.com (storage.googleapis.com)|2404:6800:4005:80a::201b|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2603670 (2.5M) [text/tab-separated-values]
Saving to: ‘GCC-1.1.0-Validation.tsv’


2024-01-31 16:33:38 (2.58 MB/s) - ‘GCC-1.1.0-Validation.tsv’ saved [2603670/2603670]



In [30]:
import pandas as pd
# Training split: tab-separated, read without a header row; columns are named here.
train_df = pd.read_csv(
    'GCC-training.tsv',
    sep='\t',
    names=['caption', 'url'],
)
train_df.head()

Unnamed: 0,caption,url
0,a very typical bus station,http://lh6.ggpht.com/-IvRtNLNcG8o/TpFyrudaT6I/...
1,sierra looked stunning in this top and this sk...,http://78.media.tumblr.com/3b133294bdc7c7784b7...
2,young confused girl standing in front of a war...,https://media.gettyimages.com/photos/young-con...
3,interior design of modern living room with fir...,https://thumb1.shutterstock.com/display_pic_wi...
4,cybernetic scene isolated on white background .,https://thumb1.shutterstock.com/display_pic_wi...


In [50]:
# Validation split: same tab-separated caption/url layout as the training file.
val_df = pd.read_csv(
    'GCC-1.1.0-Validation.tsv',
    sep='\t',
    names=['caption', 'url'],
)
val_df.head()

Unnamed: 0,caption,url
0,author : a life in photography -- in pictures,https://i.pinimg.com/736x/66/01/6c/66016c3ba27...
1,an angler fishes river on a snowy day .,http://www.standard.net/image/2015/02/04/800x_...
2,photograph of the sign being repaired by brave...,http://indianapolis-photos.funcityfinder.com/f...
3,the player staring intently at a computer scre...,http://www.abc.net.au/news/image/9066492-3x2-7...
4,globes : the green 3d person carrying in hands...,https://www.featurepics.com/StockImage/2009031...


In [32]:
import os
image_root = 'cc3m'

# Files already present on disk. The integer before the first '_' in each file name
# is collected into a set so membership checks later are O(1).
existed_train_images = os.listdir(os.path.join(image_root, 'images'))
existed_train_lines = {int(name.split('_')[0]) for name in existed_train_images}
print(len(existed_train_images), existed_train_images[:5])

existed_val_images = os.listdir(os.path.join(image_root, 'validation'))
existed_val_lines = {int(name.split('_')[0]) for name in existed_val_images}
print(len(existed_val_images), existed_val_images[:5])

3035376 ['2165410_2644318879', '1276069_1690728551', '2377826_3789062313', '3063989_128336495', '2421608_3546761199']
12881 ['1323_1011433307', '3333_3463765184', '10458_1850217442', '15135_3012636910', '13395_3305295462']


In [52]:
from tqdm import tqdm
import zlib

# Output directory for the English CC3M annotation JSON files; created if missing.
SAVE_PATH = './annotations/cc3m/en'
os.makedirs(SAVE_PATH, exist_ok=True)

def _file_name(folder_name, line_id, url):
    return "%s/%s_%s" % (folder_name, line_id, (zlib.crc32(url.encode('utf-8')) & 0xffffffff))

# Export one annotation JSON per split, keeping only rows whose image is on disk.
# Relies on `json` having been imported by an earlier cell of this notebook.
for mode, df, existed_lines in zip(['train', 'val'], [train_df, val_df], [existed_train_lines, existed_val_lines]):
    folder_name = 'images' if mode == 'train' else 'validation'

    # Pull both columns out once: `df.iloc[i][col]` builds a row Series on every
    # access, which dominates the runtime over millions of rows.
    urls = df['url'].tolist()
    captions = df['caption'].tolist()

    json_data = []
    for i in tqdm(range(len(df))):
        if i not in existed_lines:
            continue
        fn = _file_name(folder_name, i, urls[i])
        image_abs_path = os.path.join(image_root, fn)
        # Fail loudly if the expected file name does not match what is on disk.
        assert os.path.exists(image_abs_path), image_abs_path
        json_data.append(dict(
            image=fn,
            caption=captions[i],
            image_id=i,
        ))

    with open(os.path.join(SAVE_PATH, f'{mode}.json'), 'w') as wf:
        json.dump(json_data, wf)

    if mode == 'val':
        # Ground-truth file for the validation split: one annotation (with a
        # running caption id) and one image entry per kept row.
        gt = {
            'annotations': [],
            'images': [],
        }
        for caption_id, record in enumerate(json_data):
            gt['annotations'].append(dict(
                image_id=record['image_id'],
                caption=record['caption'],
                id=caption_id,
            ))
            gt['images'].append({'id': record['image_id']})

        with open(os.path.join(SAVE_PATH, f"{mode}_gt.json"), 'w') as wf:
            json.dump(gt, wf)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3318333/3318333 [05:45<00:00, 9613.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15840/15840 [00:01<00:00, 10089.31it/s]


## 3.2 Multilingual Corpora

Similar to Section 2.2:
1. put the train captions (`GCC-training.tsv`) into a `.docx` file line by line (we drop duplicates to save costs)
2. get the translated results of the 1st step's file via [Google Translator](https://translate.google.com)
3. convert the 2nd step's results to, e.g., `corpus/multilingual_cc3m/36langs/cc3m_en.tsv.gz` and `corpus/multilingual_cc3m/36langs/cc3m_en-zh.tsv.gz`

**Note:** After dropping duplicates, the captions in `GCC-training.tsv` are identical to those in `corpus/multilingual_cc3m/36langs/cc3m_en.tsv.gz`.

In [58]:
import os
# Rebinds SAVE_PATH to the multilingual corpus output directory; created if missing.
SAVE_PATH = 'corpus/multilingual_cc3m/36langs'
os.makedirs(SAVE_PATH, exist_ok=True)

In [59]:
# Keep only the first row for each distinct caption, then report row counts before/after.
cleaned_df = train_df.drop_duplicates(subset=['caption'])
print(len(train_df), len(cleaned_df))

3318333 2348709


In [62]:
import gzip
# Write the de-duplicated captions as gzip-compressed text, one caption per line
# (newline-joined, so there is no trailing newline after the last caption).
en_corpus_path = os.path.join(SAVE_PATH, 'cc3m_en.tsv.gz')
unique_captions = cleaned_df['caption'].tolist()
with gzip.open(en_corpus_path, 'wt', encoding='utf8') as wf:
    wf.write('\n'.join(unique_captions))

In [69]:
import json

# Map every row number of the training frame to the index of its caption among
# the distinct captions, numbered in order of first appearance.
caption2idx = {}
line2idx = {}
for line, caption in enumerate(train_df['caption'].tolist()):
    # A caption seen for the first time gets the next free index, which is
    # exactly the current number of distinct captions.
    line2idx[line] = caption2idx.setdefault(caption, len(caption2idx))
idx = len(caption2idx)  # total number of distinct captions

mapping_path = os.path.join(SAVE_PATH, 'line2idx.json')
with open(mapping_path, 'w') as wf:
    # json.dump serialises the integer row numbers as string keys.
    json.dump(line2idx, wf)