In [1]:
import os
# Root directory that receives every annotation file written by this notebook;
# it is created up front so later cells can write into it.
SAVE_PATH = './annotations'
os.makedirs(SAVE_PATH, exist_ok=True)

# 1. Flickr30k

```bibtex
@article{young2014image,
  title={From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions},
  author={Young, Peter and Lai, Alice and Hodosh, Micah and Hockenmaier, Julia},
  journal={Transactions of the Association for Computational Linguistics},
  volume={2},
  pages={67--78},
  year={2014},
  publisher={MIT Press}
}
```

In [9]:
# Dataset key: later cells use it as the sub-directory name under SAVE_PATH.
DATASET = 'flickr30k'
# Template of the relative image path stored in each record; `{}` receives the image id.
RELATIVE_PATH_FORMAT = 'flickr30k-images/{}.jpg'

# 1.1 Flickr30k-EN (Karpathy's splits)
```bibtex
@inproceedings{karpathy2015deep,
  title={Deep visual-semantic alignments for generating image descriptions},
  author={Karpathy, Andrej and Fei-Fei, Li},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={3128--3137},
  year={2015}
}
```

In [39]:
# Download Karpathy's Flickr30k split archive (flickr30k.zip) into the working directory.
# NOTE: --no-check-certificate turns off TLS certificate verification for this request.
!wget https://cs.stanford.edu/people/karpathy/deepimagesent/flickr30k.zip --no-check-certificate

--2023-02-14 21:00:07--  https://cs.stanford.edu/people/karpathy/deepimagesent/flickr30k.zip
正在解析主机 cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
正在连接 cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... 已连接。
警告: 无法验证 cs.stanford.edu 的由 “CN=InCommon RSA Server CA,OU=InCommon,O=Internet2,L=Ann Arbor,ST=MI,C=US” 颁发的证书:
  出现了自己签名的证书。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：197966511 (189M) [application/zip]
正在保存至: “flickr30k.zip”


2023-02-14 21:00:43 (5.30 MB/s) - 已保存 “flickr30k.zip” [197966511/197966511])



In [1]:
# Extract the archive (-o: overwrite existing files without prompting), then delete it.
!unzip -o flickr30k.zip
!rm flickr30k.zip

Archive:  flickr30k.zip
  inflating: flickr30k/dataset.json  
  inflating: flickr30k/vgg_feats.mat  
  inflating: flickr30k/readme.txt    


In [3]:
import json

# Load the list stored under the 'images' key of Karpathy's Flickr30k split file.
# The context manager closes the file handle as soon as parsing is done instead of
# leaving it open until garbage collection.
with open('flickr30k/dataset.json', 'r') as f:
    data = json.load(f)['images']

In [4]:
data[0]

{'sentids': [0, 1, 2, 3, 4],
 'imgid': 0,
 'sentences': [{'tokens': ['two',
    'young',
    'guys',
    'with',
    'shaggy',
    'hair',
    'look',
    'at',
    'their',
    'hands',
    'while',
    'hanging',
    'out',
    'in',
    'the',
    'yard'],
   'raw': 'Two young guys with shaggy hair look at their hands while hanging out in the yard.',
   'imgid': 0,
   'sentid': 0},
  {'tokens': ['two',
    'young',
    'white',
    'males',
    'are',
    'outside',
    'near',
    'many',
    'bushes'],
   'raw': 'Two young, White males are outside near many bushes.',
   'imgid': 0,
   'sentid': 1},
  {'tokens': ['two',
    'men',
    'in',
    'green',
    'shirts',
    'are',
    'standing',
    'in',
    'a',
    'yard'],
   'raw': 'Two men in green shirts are standing in a yard.',
   'imgid': 0,
   'sentid': 2},
  {'tokens': ['a',
    'man',
    'in',
    'a',
    'blue',
    'shirt',
    'standing',
    'in',
    'a',
    'garden'],
   'raw': 'A man in a blue shirt standing in

In [5]:
from collections import defaultdict

# Lookup tables built from the split file:
#   imageid2captions: image id   -> raw caption strings
#   imageid2split:    image id   -> split name
#   split2imageids:   split name -> image ids, in file order
imageid2captions = defaultdict(list)
imageid2split = {}
split2imageids = defaultdict(list)

for entry in data:
    # the part of the file name before the first '.' is the numeric image id
    stem, _, _ = entry['filename'].partition('.')
    imageid = int(stem)
    for sent in entry['sentences']:
        imageid2captions[imageid].append(sent['raw'])
    split_name = entry['split']
    imageid2split[imageid] = split_name
    split2imageids[split_name].append(imageid)

In [6]:
set(list(imageid2split.values()))

{'test', 'train', 'val'}

In [18]:
import os, json

save_path = os.path.join(SAVE_PATH, DATASET, 'en')
os.makedirs(save_path, exist_ok=True)

# Ids of ground-truth captions; the counter keeps running across the val and test splits.
caption_id = 0

for mode in ['train', 'val', 'test']:
    is_train = mode == 'train'

    json_data = []
    for image_id in split2imageids[mode]:
        rpath = RELATIVE_PATH_FORMAT.format(image_id)
        if is_train:
            # training split: one record per (image, caption) pair
            json_data.extend(
                {'image': rpath, 'caption': text, 'image_id': image_id}
                for text in imageid2captions[image_id]
            )
        else:
            # val / test split: one record per image, holding the list of all its captions
            json_data.append(
                {'image': rpath, 'caption': imageid2captions[image_id], 'image_id': image_id}
            )

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(json_data, wf)

    if is_train:
        continue

    # Ground-truth file for val / test: a flat list of captions (each with a unique id)
    # plus the list of image ids.
    gt = {'annotations': [], 'images': []}
    for record in json_data:
        assert isinstance(record['caption'], (list, tuple))
        image_id = record['image_id']
        for text in record['caption']:
            gt['annotations'].append({'image_id': image_id, 'caption': text, 'id': caption_id})
            caption_id += 1
        gt['images'].append({'id': image_id})

    with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
        json.dump(gt, wf)

# 1.2 Flickr30k-ZH

```bibtex
@inproceedings{lan2017fluency,
  title={Fluency-guided cross-lingual image captioning},
  author={Lan, Weiyu and Li, Xirong and Dong, Jianfeng},
  booktitle={Proceedings of the 25th ACM international conference on Multimedia},
  pages={1549--1557},
  year={2017}
}
```

**Note**: We found that Flickr30k-CN does not follow the splits proposed by Karpathy et al. (https://cs.stanford.edu/people/karpathy/deepimagesent/flickr30k.zip). Besides, some images required by Flickr30k-CN do not exist in Karpathy's splits (31,014 images) but do exist in the official splits (31,783 images).

**Please do the following manually:**
1. Download Flickr30k-CN from https://github.com/li-xirong/cross-lingual-cap;
2. Get the file named `seg.flickr30kzhmbosontest.caption.txt`;
3. Put the file in the same directory as this notebook.

In [14]:
from collections import defaultdict

# The annotation file holds Chinese text: decode it explicitly as UTF-8 instead of relying
# on the platform's default encoding, and close the handle once the file has been read.
with open('seg.flickr30kzhmbosontest.caption.txt', 'r', encoding='utf8') as f:
    zh_data = f.read().strip().split('\n')

# image id -> [(caption index, caption), ...] in file order
zh_indexed_captions = defaultdict(list)
for line in zh_data:
    splits = line.split(' ')

    tag_fields = splits[0].split('#') # e.g., 1009692167#zhm#1 -> ['1009692167', 'zhm', '1']
    imageid = int(tag_fields[0])

    # The last field of the tag is read as the caption index. When it is missing or not a
    # number, fall back to the caption's position in the file (the previous behaviour).
    if len(tag_fields) > 1 and tag_fields[-1].isdecimal():
        caption_index = int(tag_fields[-1])
    else:
        caption_index = len(zh_indexed_captions[imageid])

    # every remaining field is `token:tag`; keep the token only
    tokens = [item.split(':')[0] for item in splits[1:]] # e.g., 一个:m -> 一个
    # tokens are concatenated without a separator
    caption = ''.join(tokens)

    zh_indexed_captions[imageid].append((caption_index, caption))

# Store each image's captions ordered by caption index rather than by file order, so that
# list position i is meaningful for the position-based EN-ZH pairing done in Section 1.4.1.
# NOTE(review): this assumes the caption tagged `...#zhm#i` translates the English caption
# `...jpg#i` -- confirm against the Flickr30k-CN documentation.
zh_imageid2captions = defaultdict(list)
for imageid, indexed in zh_indexed_captions.items():
    ordered = sorted(indexed, key=lambda pair: pair[0])  # stable: ties keep file order
    zh_imageid2captions[imageid] = [caption for _, caption in ordered]

In [19]:
import os, json

save_path = os.path.join(SAVE_PATH, DATASET, 'zh')
os.makedirs(save_path, exist_ok=True)

# Only test.json is produced here: one record per image carrying the list of its captions.
json_data = [
    {'image': RELATIVE_PATH_FORMAT.format(image_id), 'caption': captions, 'image_id': image_id}
    for image_id, captions in zh_imageid2captions.items()
]

with open(os.path.join(save_path, 'test.json'), 'w') as wf:
    json.dump(json_data, wf)

# Ground-truth file: a flat list of captions (each with a unique id) plus the image ids.
gt = {'annotations': [], 'images': []}

caption_id = 0
for record in json_data:
    assert isinstance(record['caption'], (list, tuple))
    image_id = record['image_id']
    for text in record['caption']:
        gt['annotations'].append({'image_id': image_id, 'caption': text, 'id': caption_id})
        caption_id += 1
    gt['images'].append({'id': image_id})

with open(os.path.join(save_path, 'test_gt.json'), 'w') as wf:
    json.dump(gt, wf)

# 1.3 Flickr30k-DE, FR, CS (Multi30K)

```bibtex
@inproceedings{elliott2016multi30k,
  title={Multi30K: Multilingual English-German Image Descriptions},
  author={Elliott, Desmond and Frank, Stella and Sima’an, Khalil and Specia, Lucia},
  booktitle={Proceedings of the 5th Workshop on Vision and Language},
  pages={70--74},
  year={2016}
}
```


In [50]:
# Clone the Multi30K repository into ./dataset; the cells below read files from dataset/data/.
!git clone https://github.com/multi30k/dataset.git

Cloning into 'dataset'...
remote: Enumerating objects: 313, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 313 (delta 17), reused 21 (delta 16), pack-reused 281[K
Receiving objects: 100% (313/313), 18.21 MiB | 16.00 KiB/s, done.
Resolving deltas: 100% (69/69), done.


# 1.3.1 Flickr30k-DE

In [52]:
import os, json
import gzip

save_path = os.path.join(SAVE_PATH, DATASET, 'de')
os.makedirs(save_path, exist_ok=True)

# Ids of ground-truth captions; the counter keeps running across the val and test splits.
caption_id = 0
for mode in ['train', 'val', 'test_2016']:
    # One image file name per line; line k pairs with line k of every caption file.
    # The context manager closes the handle once the list has been read.
    with open(f'dataset/data/task2/image_splits/{mode}_images.txt', 'r') as f:
        images_list = f.read().strip().split('\n')

    # Five caption files are read per split: {mode}.1.de.gz ... {mode}.5.de.gz
    captions_list = []
    for i in range(5):
        with gzip.open(f'dataset/data/task2/raw/{mode}.{i+1}.de.gz', 'rt', encoding='utf8') as f:
            data = [line.strip() for line in f]
        assert len(data) == len(images_list)
        captions_list.append(data)

    mode = mode.split('_')[0] # 'test_2016' -> 'test'

    # image id -> its five captions (one taken from each caption file)
    imageid2captions = {}
    for fn, captions in zip(images_list, zip(*captions_list)):
        image_id = int(fn.split('.')[0])
        imageid2captions[image_id] = list(captions)

    json_data = []
    for image_id, captions in imageid2captions.items():
        if mode == 'train':
            # training split: one record per (image, caption) pair
            for caption in captions:
                item = dict(
                    image=RELATIVE_PATH_FORMAT.format(image_id),
                    caption=caption,
                    image_id=image_id,
                )
                json_data.append(item)
        else:
            # val / test split: one record per image, holding the list of all its captions
            item = dict(
                image=RELATIVE_PATH_FORMAT.format(image_id),
                caption=captions,
                image_id=image_id,
            )
            json_data.append(item)

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(json_data, wf)

    if mode != 'train':
        # Ground-truth file: a flat list of captions (each with a unique id) plus the image ids.
        gt = {
            'annotations': [],
            'images': [],
        }

        for record in json_data:
            assert isinstance(record['caption'], (list, tuple))
            image_id = record['image_id']
            for caption in record['caption']:
                # a separate name avoids rebinding the record that is being iterated
                annotation = dict(
                    image_id=image_id,
                    caption=caption,
                    id=caption_id,
                )
                caption_id += 1
                gt['annotations'].append(annotation)
            gt['images'].append({'id': image_id})

        with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
            json.dump(gt, wf)

# 1.3.2 Flickr30k-FR, CS

In [53]:
def replace_special_tokens(data):
    """Return a new list with escaped quote entities turned back into quote characters.

    Args:
        data: iterable of strings (one sentence per element).

    Returns:
        A list of the same length in which every `&apos;` has been replaced
        by `'` and every `&quot;` by `"`. The input is not modified.
    """
    # in translation data, `man 's` is written as `man &apos;s`
    # and `" 92 "` is written as `&quot; 92 &quot;`
    return [sentence.replace("&apos;", "'").replace('&quot;', '"') for sentence in data]

In [56]:
import os, json
import gzip

for lang in ['fr', 'cs']:
    save_path = os.path.join(SAVE_PATH, DATASET, lang)
    os.makedirs(save_path, exist_ok=True)

    # Ground-truth caption ids restart at 0 for each language and keep running across val/test.
    caption_id = 0
    for mode in ['train', 'val', 'test_2016_flickr']:
        # One image file name per line; line k pairs with line k of the caption file.
        # The context manager closes the handle once the list has been read.
        with open(f'dataset/data/task1/image_splits/{mode}.txt', 'r') as f:
            images_list = f.read().strip().split('\n')

        # A single caption file is read per split and language, i.e. one caption per image.
        with gzip.open(f'dataset/data/task1/raw/{mode}.{lang}.gz', 'rt', encoding='utf8') as f:
            data = [line.strip() for line in f]
        assert len(data) == len(images_list)
        data = replace_special_tokens(data)

        mode = mode.split('_')[0] # 'test_2016_flickr' -> 'test'

        # image id -> single-element caption list (same record layout as the other languages)
        imageid2captions = {}
        for fn, caption in zip(images_list, data):
            image_id = int(fn.split('.')[0])
            imageid2captions[image_id] = [caption]

        json_data = []
        for image_id, captions in imageid2captions.items():
            if mode == 'train':
                # training split: one record per (image, caption) pair
                for caption in captions:
                    item = dict(
                        image=RELATIVE_PATH_FORMAT.format(image_id),
                        caption=caption,
                        image_id=image_id,
                    )
                    json_data.append(item)
            else:
                # val / test split: one record per image, holding its caption list
                item = dict(
                    image=RELATIVE_PATH_FORMAT.format(image_id),
                    caption=captions,
                    image_id=image_id,
                )
                json_data.append(item)

        with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
            json.dump(json_data, wf)

        if mode != 'train':
            # Ground-truth file: a flat list of captions (each with a unique id) plus the image ids.
            gt = {
                'annotations': [],
                'images': [],
            }

            for record in json_data:
                assert isinstance(record['caption'], (list, tuple))
                image_id = record['image_id']
                for caption in record['caption']:
                    # a separate name avoids rebinding the record that is being iterated
                    annotation = dict(
                        image_id=image_id,
                        caption=caption,
                        id=caption_id,
                    )
                    caption_id += 1
                    gt['annotations'].append(annotation)
                gt['images'].append({'id': image_id})

            with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
                json.dump(gt, wf)

# 1.4 Translation

# 1.4.1 Flickr30k EN-ZH Pairs

**Note**: please run `Section 1.2 Flickr30k-ZH` first

In [29]:
# Download the official Flickr30k annotations and unpack them; the file needed below is
# results_20130124.token. The archive and the extracted readme.txt are removed afterwards.
# NOTE: --no-check-certificate turns off TLS certificate verification (this URL is plain http).
!wget http://shannon.cs.illinois.edu/DenotationGraph/data/flickr30k.tar.gz --no-check-certificate
!tar -xzf flickr30k.tar.gz
!rm flickr30k.tar.gz
!rm readme.txt

--2023-03-23 01:22:34--  http://shannon.cs.illinois.edu/DenotationGraph/data/flickr30k.tar.gz
正在解析主机 shannon.cs.illinois.edu (shannon.cs.illinois.edu)... 18.220.149.166
正在连接 shannon.cs.illinois.edu (shannon.cs.illinois.edu)|18.220.149.166|:80... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：3652513 (3.5M) [application/x-gzip]
正在保存至: “flickr30k.tar.gz”


2023-03-23 01:23:28 (67.5 KB/s) - 已保存 “flickr30k.tar.gz” [3652513/3652513])



In [57]:
from collections import defaultdict

# image id -> English captions, in the order they appear in the official token file
en_imageid2captions = defaultdict(list)

# Decode explicitly as UTF-8 (instead of the platform default) and close the handle
# as soon as the file has been read.
with open('results_20130124.token', 'r', encoding='utf8') as f:
    official_data = f.read().strip().split('\n')

for line in official_data:
    # split on the first tab only, so a tab inside the caption cannot break the unpacking
    tag, caption = line.split('\t', 1)
    imageid = int(tag.split('.')[0]) # e.g., 1000092795.jpg#0 -> 1000092795
    en_imageid2captions[imageid].append(caption)

In [58]:
# Ids of the images that have Chinese captions (dict key order); preview the first ten.
zh_imageids = list(zh_imageid2captions.keys())
zh_imageids[:10]

[1009692167,
 1021439420,
 1032122270,
 1043819504,
 1095580424,
 11034843,
 11214470,
 1128230658,
 1132772170,
 1143882946]

In [59]:
zh_imageid2captions[zh_imageids[0]]

['在警车前，一条训练有素的警犬坐在它的警官身旁。',
 '一名警察站着，身边有一只德国牧羊犬',
 '一位安保人员带着他的狗正在寻找某些东西',
 '一名穿着反光背心的军官和他的狗站在他的车前面',
 '一个警察和一只搜索犬在街上']

In [60]:
en_imageid2captions[zh_imageids[0]]

['An officer in a reflective vest stands at the front of his van with his dog .',
 'A trained police dog sits next to his handler in front of the police van .',
 'A security man with his watch dog is looking for something .',
 'A policeman is standing with a German Shepherd dog .',
 'A policeman stops on a street with a search dog .']

In [61]:
import os
path = os.path.join(SAVE_PATH, DATASET, 'en-zh')
os.makedirs(path, exist_ok=True)

# Sentence pairs are matched purely by position: the i-th English caption of an image is
# paired with the i-th Chinese caption of the same image (five per image).
# NOTE(review): this assumes both caption lists of an image are in the same order -- verify;
# the sample outputs displayed above for the first image suggest the orders can differ.
en_captions = [en_imageid2captions[image_id][i] for image_id in zh_imageids for i in range(5)]
zh_captions = [zh_imageid2captions[image_id][i] for image_id in zh_imageids for i in range(5)]

# add a full stop at the end of each sentence
for i in range(len(en_captions)):
    en_captions[i] = (en_captions[i] + ' .') if not en_captions[i].endswith(' .') else en_captions[i]
    zh_captions[i] = (zh_captions[i] + '。') if not zh_captions[i].endswith('。') else zh_captions[i]

# Write with an explicit UTF-8 encoding: the Chinese text must not depend on the platform's
# default encoding, which may be unable to represent it or may produce non-UTF-8 files.
with open(os.path.join(path, 'test.en'), 'w', encoding='utf8') as wf:
    wf.write('\n'.join(en_captions))

with open(os.path.join(path, 'test.zh'), 'w', encoding='utf8') as wf:
    wf.write('\n'.join(zh_captions))

# line k of test_images.txt is the image that line k of test.en / test.zh describes
image_rpaths = [RELATIVE_PATH_FORMAT.format(image_id) for image_id in zh_imageids for _ in range(5)]
with open(os.path.join(path, 'test_images.txt'), 'w', encoding='utf8') as wf:
    wf.write('\n'.join(image_rpaths))

# 1.4.2 Flickr30k EN-DE, EN-FR, DE-FR Pairs

**Note**: please run `Section 1.3 Flickr30k-DE, FR, CS` first

In [62]:
import os

for folder in ['en-de', 'en-fr', 'de-fr']:
    path = os.path.join(SAVE_PATH, DATASET, folder)
    os.makedirs(path, exist_ok=True)

    for mode in ['train', 'val', 'test_2016_flickr']:
        split_name = mode.split('_')[0] # 'test_2016_flickr' -> 'test'

        for lang in folder.split('-'):
            src = f'dataset/data/task1/tok/{mode}.lc.norm.tok.{lang}'
            # German / French text contains non-ASCII characters: read and write with an
            # explicit UTF-8 encoding instead of the platform default, and close each handle.
            with open(src, 'r', encoding='utf8') as rf:
                data = rf.read().strip().split('\n')
            data = replace_special_tokens(data)

            trg = os.path.join(path, f'{split_name}.{lang}')
            with open(trg, 'w', encoding='utf8') as wf:
                wf.write('\n'.join(data))

        # line k of {split}_images.txt is the image described by line k of the sentence files
        with open(f'dataset/data/task1/image_splits/{mode}.txt', 'r', encoding='utf8') as rf:
            image_ids = [item.split('.')[0] for item in rf.read().strip().split('\n')]
        image_rpaths = [RELATIVE_PATH_FORMAT.format(image_id) for image_id in image_ids]

        with open(os.path.join(path, f'{split_name}_images.txt'), 'w', encoding='utf8') as wf:
            wf.write('\n'.join(image_rpaths))

# 1.4.3 Flickr30k ZH-DE, ZH-FR Pairs

**Note**: please run `Section 1.2 Flickr30k-ZH` and `Section 1.3 Flickr30k-DE, FR, CS` first

In [73]:
import json

# Image ids of the exported Chinese test set and of the exported German test set.
zh_test_file = os.path.join(SAVE_PATH, DATASET, 'zh', 'test.json')
de_test_file = os.path.join(SAVE_PATH, DATASET, 'de', 'test.json')
zh_imageids = {record['image_id'] for record in json.load(open(zh_test_file, 'r'))}
de_fr_imageids = {record['image_id'] for record in json.load(open(de_test_file, 'r'))}

In [74]:
print(len(zh_imageids), len(de_fr_imageids))

1000 1000


In [75]:
len(zh_imageids & de_fr_imageids)

36

In [76]:
en_zh_path = os.path.join(SAVE_PATH, DATASET, 'en-zh')

# Read both sides of the EN-ZH test pairs. The Chinese file needs an explicit UTF-8 decode
# rather than the platform default; the context managers close the handles.
with open(os.path.join(en_zh_path, 'test.en'), 'r', encoding='utf8') as f:
    en_captions = f.read().strip().split('\n')
with open(os.path.join(en_zh_path, 'test.zh'), 'r', encoding='utf8') as f:
    zh_captions = f.read().strip().split('\n')

# lower-cased English sentence -> Chinese sentence on the same line
# (when an English sentence occurs more than once, the last occurrence wins)
en2zh = {en.lower(): zh for en, zh in zip(en_captions, zh_captions)}

In [77]:
def_encoding = 'utf8'  # German / French text is non-ASCII: never rely on the platform default

with open(os.path.join(SAVE_PATH, DATASET, 'en-de', 'test.en'), 'r', encoding=def_encoding) as f:
    en_captions = f.read().strip().split('\n')

for i in range(len(en_captions)):
    # add a full stop to each sentence of en_captions
    en_captions[i] = (en_captions[i] + ' .') if not en_captions[i].endswith(' .') else en_captions[i]

with open(os.path.join(SAVE_PATH, DATASET, 'en-de', 'test.de'), 'r', encoding=def_encoding) as f:
    de_captions = f.read().strip().split('\n')
with open(os.path.join(SAVE_PATH, DATASET, 'en-fr', 'test.fr'), 'r', encoding=def_encoding) as f:
    fr_captions = f.read().strip().split('\n')

# lower-cased English sentence -> German / French sentence on the same line
en2de = {en.lower(): de for en, de in zip(en_captions, de_captions)}
en2fr = {en.lower(): fr for en, fr in zip(en_captions, fr_captions)}

with open(os.path.join(SAVE_PATH, DATASET, 'en-de', 'test_images.txt'), 'r', encoding=def_encoding) as f:
    imageids = f.read().strip().split('\n')
# lower-cased English sentence -> image id, e.g. 'flickr30k-images/123.jpg' -> '123' (kept as str)
en2imageid = {en.lower(): rpath.split('/')[-1].split('.')[0] for en, rpath in zip(en_captions, imageids)}

In [78]:
# English sentences that occur in both the EN-ZH and the EN-DE test pairs.
# Sorting gives the collection a fixed order, so the files written from it in the next cell
# have the same line order on every run (plain set iteration order is not stable across runs).
common_en = sorted(set(en2zh) & set(en2de))
len(common_en)

36

In [79]:
# For each target language, write three line-aligned files: the Chinese sentences, the target
# sentences and the image paths, all looked up through the shared English sentence.
for trg_lang, trg_map in zip(['de', 'fr'], [en2de, en2fr]):
    path = os.path.join(SAVE_PATH, DATASET, f'zh-{trg_lang}')
    os.makedirs(path, exist_ok=True)

    # Explicit UTF-8: Chinese / German / French text must not depend on the platform's
    # default encoding.
    with open(os.path.join(path, 'test.zh'), 'w', encoding='utf8') as wf:
        wf.write('\n'.join([en2zh[en] for en in common_en]))

    with open(os.path.join(path, f'test.{trg_lang}'), 'w', encoding='utf8') as wf:
        wf.write('\n'.join([trg_map[en] for en in common_en]))

    image_rpaths = [RELATIVE_PATH_FORMAT.format(en2imageid[en]) for en in common_en]
    with open(os.path.join(path, 'test_images.txt'), 'w', encoding='utf8') as wf:
        wf.write('\n'.join(image_rpaths))

# 2. MS-COCO

```bibtex
@article{chen2015microsoft,
  title={Microsoft coco captions: Data collection and evaluation server},
  author={Chen, Xinlei and Fang, Hao and Lin, Tsung-Yi and Vedantam, Ramakrishna and Gupta, Saurabh and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  journal={arXiv preprint arXiv:1504.00325},
  year={2015}
}
@inproceedings{lin2014microsoft,
  title={Microsoft coco: Common objects in context},
  author={Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Doll{\'a}r, Piotr and Zitnick, C Lawrence},
  booktitle={Computer Vision--ECCV 2014: 13th European Conference, Zurich, Switzerland, September 6-12, 2014, Proceedings, Part V 13},
  pages={740--755},
  year={2014},
  organization={Springer}
}
```

In [2]:
# Dataset key: the cells of Section 2 use it as the sub-directory name under SAVE_PATH.
DATASET = 'coco'

# 2.1 COCO-EN (Karpathy's splits)

```bibtex
@inproceedings{karpathy2015deep,
  title={Deep visual-semantic alignments for generating image descriptions},
  author={Karpathy, Andrej and Fei-Fei, Li},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={3128--3137},
  year={2015}
}
```

In [99]:
# Download and unpack Karpathy's caption split archive; dataset_coco.json is the file used below.
# NOTE: --no-check-certificate turns off TLS certificate verification for this request.
!wget https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip --no-check-certificate
!unzip caption_datasets.zip

--2023-03-23 14:19:46--  https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
正在解析主机 cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
正在连接 cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... 已连接。
警告: 无法验证 cs.stanford.edu 的由 “CN=InCommon RSA Server CA,OU=InCommon,O=Internet2,L=Ann Arbor,ST=MI,C=US” 颁发的证书:
  出现了自己签名的证书。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：36745453 (35M) [application/zip]
正在保存至: “caption_datasets.zip”


2023-03-23 14:20:06 (1.83 MB/s) - 已保存 “caption_datasets.zip” [36745453/36745453])

Archive:  caption_datasets.zip
  inflating: dataset_coco.json       
  inflating: dataset_flickr30k.json  
  inflating: dataset_flickr8k.json   


In [3]:
import json

# Load the list stored under the 'images' key of Karpathy's COCO split file.
# The context manager closes the file handle as soon as parsing is done instead of
# leaving it open until garbage collection.
with open('dataset_coco.json', 'r') as f:
    data = json.load(f)['images']

In [4]:
data[0]

{'filepath': 'val2014',
 'sentids': [770337, 771687, 772707, 776154, 781998],
 'filename': 'COCO_val2014_000000391895.jpg',
 'imgid': 0,
 'split': 'test',
 'sentences': [{'tokens': ['a',
    'man',
    'with',
    'a',
    'red',
    'helmet',
    'on',
    'a',
    'small',
    'moped',
    'on',
    'a',
    'dirt',
    'road'],
   'raw': 'A man with a red helmet on a small moped on a dirt road. ',
   'imgid': 0,
   'sentid': 770337},
  {'tokens': ['man',
    'riding',
    'a',
    'motor',
    'bike',
    'on',
    'a',
    'dirt',
    'road',
    'on',
    'the',
    'countryside'],
   'raw': 'Man riding a motor bike on a dirt road on the countryside.',
   'imgid': 0,
   'sentid': 771687},
  {'tokens': ['a',
    'man',
    'riding',
    'on',
    'the',
    'back',
    'of',
    'a',
    'motorcycle'],
   'raw': 'A man riding on the back of a motorcycle.',
   'imgid': 0,
   'sentid': 772707},
  {'tokens': ['a',
    'dirt',
    'path',
    'with',
    'a',
    'young',
    'person',

In [5]:
import os
from collections import defaultdict

# Lookup tables built from the split file:
#   imageid2rpath:    image id   -> '<filepath>/<filename>'
#   imageid2captions: image id   -> raw caption strings
#   split2imageids:   split name -> image ids, in file order
imageid2rpath = {}
imageid2captions = defaultdict(list)
split2imageids = defaultdict(list)

for entry in data:
    # e.g. 'COCO_val2014_000000391895.jpg' -> 391895
    stem = entry['filename'].split('.')[0]
    imageid = int(stem.rsplit('_', 1)[-1])
    for sent in entry['sentences']:
        imageid2captions[imageid].append(sent['raw'])
    split2imageids[entry['split']].append(imageid)
    imageid2rpath[imageid] = os.path.join(entry['filepath'], entry['filename'])

In [6]:
# Print the number of images in each split.
for k, v in split2imageids.items():
    print(k, len(v))

test 5000
restval 30504
val 5000
train 82783


In [7]:
import os, json

save_path = os.path.join(SAVE_PATH, DATASET, 'en')
os.makedirs(save_path, exist_ok=True)

# Ids of ground-truth captions; the counter keeps running across the val and test splits.
caption_id = 0
for mode in ['train', 'val', 'test']:
    is_train = mode == 'train'

    # The 'restval' images are added to the training split (restval ids come first).
    if is_train:
        image_ids_of_this_split = split2imageids['restval'] + split2imageids[mode]
    else:
        image_ids_of_this_split = split2imageids[mode]

    json_data = []
    for image_id in image_ids_of_this_split:
        rpath = imageid2rpath[image_id]
        if is_train:
            # training split: one record per (image, caption) pair
            json_data.extend(
                {'image': rpath, 'caption': text, 'image_id': image_id}
                for text in imageid2captions[image_id]
            )
        else:
            # val / test split: one record per image, holding the list of all its captions
            json_data.append(
                {'image': rpath, 'caption': imageid2captions[image_id], 'image_id': image_id}
            )

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(json_data, wf)

    if is_train:
        continue

    # Ground-truth file: a flat list of captions (each with a unique id) plus the image ids.
    gt = {'annotations': [], 'images': []}
    for record in json_data:
        assert isinstance(record['caption'], (list, tuple))
        image_id = record['image_id']
        for text in record['caption']:
            gt['annotations'].append({'image_id': image_id, 'caption': text, 'id': caption_id})
            caption_id += 1
        gt['images'].append({'id': image_id})

    with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
        json.dump(gt, wf)

# 2.2 COCO-JA

```bibtex
@InProceedings{Yoshikawa2017,
  title     = {STAIR Captions: Constructing a Large-Scale Japanese Image Caption Dataset},
  booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = {July},
  year      = {2017},
  address   = {Vancouver, Canada},
  publisher = {Association for Computational Linguistics},
  pages     = {417--421},
  url       = {http://www.aclweb.org/anthology/P17-2066}
}
```

Please run `Section 2.1 COCO-EN` first

In [113]:
# Download the STAIR Captions v1.2 archive (Japanese captions) into the working directory.
# NOTE: --no-check-certificate turns off TLS certificate verification for this request.
!wget https://github.com/STAIR-Lab-CIT/STAIR-captions/raw/master/stair_captions_v1.2.tar.gz --no-check-certificate

--2023-03-23 15:06:17--  https://github.com/STAIR-Lab-CIT/STAIR-captions/raw/master/stair_captions_v1.2.tar.gz
正在解析主机 github.com (github.com)... 140.82.113.4
正在连接 github.com (github.com)|140.82.113.4|:443... 已连接。
警告: 无法验证 github.com 的由 “CN=DigiCert TLS Hybrid ECC SHA384 2020 CA1,O=DigiCert Inc,C=US” 颁发的证书:
  无法本地校验颁发者的权限。
已发出 HTTP 请求，正在等待回应... 302 Found
位置：https://raw.githubusercontent.com/STAIR-Lab-CIT/STAIR-captions/master/stair_captions_v1.2.tar.gz [跟随至新的 URL]
--2023-03-23 15:06:18--  https://raw.githubusercontent.com/STAIR-Lab-CIT/STAIR-captions/master/stair_captions_v1.2.tar.gz
正在解析主机 raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.96.133
正在连接 raw.githubusercontent.com (raw.githubusercontent.com)|199.232.96.133|:443... ^C


In [114]:
# Unpack the archive; the cell below reads stair_captions_v1.2_train.json and
# stair_captions_v1.2_val.json.
!tar -xzvf stair_captions_v1.2.tar.gz

x stair_captions_v1.2_train.json
x stair_captions_v1.2_train_tokenized.json
x stair_captions_v1.2_val.json
x stair_captions_v1.2_val_tokenized.json


In [10]:
import json

# The STAIR files hold Japanese captions: decode them explicitly as UTF-8 instead of relying
# on the platform's default encoding, and close each handle once the JSON has been parsed.
with open('stair_captions_v1.2_train.json', 'r', encoding='utf8') as f:
    train_data = json.load(f)
with open('stair_captions_v1.2_val.json', 'r', encoding='utf8') as f:
    val_data = json.load(f)

# show the top-level sections of the annotation file
train_data.keys()

dict_keys(['info', 'images', 'licenses', 'annotations'])

In [11]:
train_data['annotations'][0]

{'image_id': 203312, 'id': 3, 'caption': '山の中を赤い電車が走っている'}

In [12]:
train_data['images'][0]

{'license': 5,
 'file_name': 'COCO_train2014_000000057870.jpg',
 'coco_url': 'http://mscoco.org/images/57870',
 'height': 480,
 'width': 640,
 'date_captured': '2013-11-14 16:28:13',
 'flickr_url': 'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg',
 'id': 57870}

In [13]:
from collections import defaultdict

# image id -> captions, pooled over the STAIR train and val annotation lists
imageid2captions = defaultdict(list)
for annotation in train_data['annotations'] + val_data['annotations']:
    captions_of_image = imageid2captions[annotation['image_id']]
    captions_of_image.append(annotation['caption'])

In [14]:
import os, json

save_path = os.path.join(SAVE_PATH, DATASET, 'ja')
os.makedirs(save_path, exist_ok=True)

# Ids of ground-truth captions; the counter keeps running across the val and test splits.
caption_id = 0
for mode in ['train', 'val', 'test']:
    training = (mode == 'train')

    # split2imageids is obtained by Section 2.1, so COCO-JA shares the same Karpathy's splits
    image_ids_of_this_split = split2imageids[mode]
    if training:
        # 'restval' images are added in front of the training images
        image_ids_of_this_split = split2imageids['restval'] + image_ids_of_this_split

    json_data = []
    for image_id in image_ids_of_this_split:
        if not training:
            # val / test split: one record per image, holding the list of all its captions
            json_data.append(dict(
                image=imageid2rpath[image_id],
                caption=imageid2captions[image_id],
                image_id=image_id,
            ))
            continue
        # training split: one record per (image, caption) pair
        for sentence in imageid2captions[image_id]:
            json_data.append(dict(
                image=imageid2rpath[image_id],
                caption=sentence,
                image_id=image_id,
            ))

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(json_data, wf)

    if not training:
        # Ground-truth file: a flat list of captions (each with a unique id) plus the image ids.
        annotations, images = [], []
        for record in json_data:
            assert isinstance(record['caption'], (list, tuple))
            image_id = record['image_id']
            for sentence in record['caption']:
                annotations.append(dict(image_id=image_id, caption=sentence, id=caption_id))
                caption_id += 1
            images.append({'id': image_id})
        gt = {'annotations': annotations, 'images': images}

        with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
            json.dump(gt, wf)

# 3. MSR-VTT

```bibtex
@inproceedings{xu2016msr,
  title={Msr-vtt: A large video description dataset for bridging video and language},
  author={Xu, Jun and Mei, Tao and Yao, Ting and Rui, Yong},
  booktitle={Proceedings of the IEEE conference on computer vision and pattern recognition},
  pages={5288--5296},
  year={2016}
}
```

In [142]:
# Dataset key: the cells of Section 3 use it as the sub-directory name under SAVE_PATH.
DATASET = 'msrvtt'
# Template of the relative video path stored in each record; `{vid}` receives the numeric video id.
RELATIVE_PATH_FORMAT = 'all_videos/video{vid}.mp4'

In [143]:
import os
import wget

# Directory for the MSR-VTT files, and the location of the downloaded annotation file.
save_path = os.path.join(SAVE_PATH, DATASET)
os.makedirs(save_path, exist_ok=True)

json_path = os.path.join(save_path, 'videodatainfo_2016.json')

# Download only when the file is not already present, so re-running the cell is cheap.
if not os.path.exists(json_path):
    # the official url is http://ms-multimedia-challenge.com/2016, but the website is out-of-date
    url = "https://github.com/ybCliff/VideoCaptioning/releases/download/v1.0/videodatainfo_2016.json"
    wget.download(url, json_path)

In [144]:
import os
import json
from collections import defaultdict

save_path = os.path.join(SAVE_PATH, DATASET, 'en')
os.makedirs(save_path, exist_ok=True)

data = json.load(open(json_path, 'r'))

# split name -> sorted numeric video ids ('video1000' -> 1000)
splits = defaultdict(list)
for video in data['videos']:
    splits[video['split']].append(int(video['video_id'][5:]))
for ids in splits.values():
    ids.sort()

# the annotation file names the validation split 'validate'; store it under 'val'
splits['val'] = splits.pop('validate')

# numeric video id -> captions, in file order
vid2captions = defaultdict(list)
for sentence in data['sentences']:
    vid2captions[int(sentence['video_id'][5:])].append(sentence['caption'])

# training split: one record per (video, caption) pair;
# we only record the relative path of a video
train_items = [
    {'image': RELATIVE_PATH_FORMAT.format(vid=vid), 'caption': caption, 'image_id': vid}
    for vid in splits['train']
    for caption in vid2captions[vid]
]

with open(os.path.join(save_path, 'train.json'), 'w') as wf:
    json.dump(train_items, wf)

# Ids of ground-truth captions; the counter keeps running across the val and test splits.
caption_id = 0
for mode in ['val', 'test']:
    video_ids = splits[mode]

    # val / test split: one record per video, holding the list of all its captions
    items = [
        {'image': RELATIVE_PATH_FORMAT.format(vid=vid), 'caption': vid2captions[vid], 'image_id': vid}
        for vid in video_ids
    ]

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(items, wf)

    # Ground-truth file: a flat list of captions (each with a unique id) plus the video ids.
    annotations = []
    for vid in video_ids:
        for caption in vid2captions[vid]:
            annotations.append({'image_id': vid, 'caption': caption, 'id': caption_id})
            caption_id += 1
    gt = {
        'annotations': annotations,
        'images': [{'id': vid} for vid in video_ids],
    }

    with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
        json.dump(gt, wf)

# 4. VATEX
```bibtex
@inproceedings{wang2019vatex,
  title={Vatex: A large-scale, high-quality multilingual dataset for video-and-language research},
  author={Wang, Xin and Wu, Jiawei and Chen, Junkun and Li, Lei and Wang, Yuan-Fang and Wang, William Yang},
  booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages={4581--4591},
  year={2019}
}
```

**Note: Some videos of this dataset can no longer be downloaded; we use the same set of videos as the following paper:**
```bibtex
@inproceedings{yang2022clip,
  title={CLIP Meets Video Captioning: Concept-Aware Representation Learning Does Matter},
  author={Yang, Bang and Zhang, Tong and Zou, Yuexian},
  booktitle={Pattern Recognition and Computer Vision: 5th Chinese Conference, PRCV 2022, Shenzhen, China, November 4--7, 2022, Proceedings, Part I},
  pages={368--381},
  year={2022},
}
```
url: https://github.com/yangbang18/CLIP-Captioner

In [1]:
# Dataset name; used below as the sub-directory of SAVE_PATH that holds the VATEX files.
DATASET = 'vatex'
# Template of a video's relative path; `{vid}` is filled with the integer video id.
RELATIVE_PATH_FORMAT = 'all_videos/video{vid}.mp4'

In [4]:
import os
import wget 

# Folder that receives every raw VATEX file fetched by this cell.
root = os.path.join(SAVE_PATH, DATASET)
os.makedirs(root, exist_ok=True)

# (base url, files hosted under it): the three annotation files come from the
# VATEX website, the id mapping and the list of existing videos from the
# CLIP-Captioner repository.
download_plan = [
    (
        "https://eric-xw.github.io/vatex-website/data/",
        ['vatex_training_v1.0.json', 'vatex_validation_v1.0.json', 'vatex_public_test_english_v1.1.json'],
    ),
    (
        "https://raw.githubusercontent.com/yangbang18/CLIP-Captioner/master/data/",
        ['vatex_mapping.txt', 'vatex_existed_videos.txt'],
    ),
]

for base_url, filenames in download_plan:
    for filename in filenames:
        now_path = os.path.join(root, filename)
        if os.path.exists(now_path):
            # already on disk, nothing to fetch
            continue
        wget.download(base_url + filename, out=now_path)

# 4.1 VATEX-EN

In [9]:
import json
import os
from collections import defaultdict

# Build the VATEX English annotation files under SAVE_PATH/DATASET/en:
#   train.json           one record per (video, caption) pair
#   {val,test}.json      one record per video, holding the list of all its captions
#   {val,test}_gt.json   {'annotations': [...], 'images': [...]} ground-truth files
# `root` (the folder with the downloaded raw files) comes from the download cell above.
save_path = os.path.join(SAVE_PATH, DATASET, 'en')
os.makedirs(save_path, exist_ok=True)

# every file is read inside a context manager so its handle is closed right away
with open(os.path.join(root, 'vatex_mapping.txt'), 'r') as rf:
    lines = rf.read().strip().split('\n')

id2vid = {}
for line in lines:
    # `raw_id` rather than `id`, so the builtin id() is not shadowed in the kernel
    raw_id, vid = line.split(' ')
    id2vid[raw_id] = vid # e.g., Ptf_2VRj-V0_000122_000132 -> video0

with open(os.path.join(root, 'vatex_existed_videos.txt'), 'r') as rf:
    existed_videos = rf.read().strip().split('\n')
# 'video0' -> 0
existed_videos = set([int(item[5:]) for item in existed_videos])

splits = defaultdict(list)   # split name -> integer video ids
vid2captions = {}            # integer video id -> item['enCap']
vid2captions_zh = {}         # integer video id -> item['chCap'] (train/val only)
for fn, split in zip(
    ['vatex_training_v1.0.json', 'vatex_validation_v1.0.json', 'vatex_public_test_english_v1.1.json'],
    ['train', 'val', 'test']):
    with open(os.path.join(root, fn), 'r') as rf:
        data = json.load(rf)

    failed_count = 0
    for item in data:
        # Ptf_2VRj-V0_000122_000132 -> video0
        vid = id2vid[item['videoID']]
        # video0 -> 0
        vid = int(vid[5:])
        if vid not in existed_videos:
            # we do not use the annotations of videos that are no longer accessible
            failed_count += 1
            continue

        splits[split].append(vid)
        vid2captions[vid] = item['enCap']
        if split == 'test':
            # sanity check: the public test file is expected to carry no Chinese captions
            assert 'chCap' not in item
        else:
            vid2captions_zh[vid] = item['chCap']

    print(f'The `{split}` split has {len(data) - failed_count} videos, {failed_count} of {len(data)} videos are unaccessible!')

for k in splits.keys():
    splits[k] = sorted(splits[k])

# training file: flat list with one record per (video, caption) pair
train_items = []
for vid in splits['train']:
    for caption in vid2captions[vid]:
        train_items.append(dict(
            image=RELATIVE_PATH_FORMAT.format(vid=vid),  # we only record the relative path of a video
            caption=caption,
            image_id=vid,
        ))

with open(os.path.join(save_path, 'train.json'), 'w') as wf:
    json.dump(train_items, wf)

# caption_id is deliberately not reset between modes, so annotation ids stay
# unique across the val and test ground-truth files
caption_id = 0
for mode in ['val', 'test']:
    # evaluation file: one record per video with all of its reference captions
    items = []
    for vid in splits[mode]:
        items.append(dict(
            image=RELATIVE_PATH_FORMAT.format(vid=vid),
            caption=vid2captions[vid],
            image_id=vid,
        ))

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(items, wf)

    # ground-truth file: one annotation per reference caption, plus the list of video ids
    gt = {
        'annotations': [],
        'images': [{'id': vid} for vid in splits[mode]],
    }

    for vid in splits[mode]:
        for caption in vid2captions[vid]:
            gt['annotations'].append(dict(
                image_id=vid,
                caption=caption,
                id=caption_id,
            ))
            caption_id += 1

    with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
        json.dump(gt, wf)

The `train` split has 25006 videos, 985 of 25991 videos are unaccessible!
The `val` split has 2893 videos, 107 of 3000 videos are unaccessible!
The `test` split has 5792 videos, 208 of 6000 videos are unaccessible!


# 4.2 VATEX-ZH

In the following, we construct VATEX-ZH splits identical to those used in the paper:
```bibtex
@article{Yang2023ZeroNLG,
   title={ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and Multilingual Natural Language Generation},
   author={Yang, Bang and Liu, Fenglin and Zou, Yuexian and Wu, Xian and Wang, Yaowei and Clifton, David A.},
   journal={arXiv preprint arXiv:2303.06458},
   year={2023}
}
```

In [10]:
import copy

# Build the VATEX Chinese annotation files under SAVE_PATH/DATASET/zh.
# NOTE: this cell reuses `root`, `id2vid`, `existed_videos` and `splits` created by the
# download and VATEX-EN cells above, so those cells must have been run first.
save_path = os.path.join(SAVE_PATH, DATASET, 'zh')
os.makedirs(save_path, exist_ok=True)

vid2captions_zh = {}  # integer video id -> item['chCap']
for fn, split in zip(
    ['vatex_training_v1.0.json', 'vatex_validation_v1.0.json', 'vatex_public_test_english_v1.1.json'],
    ['train', 'val', 'test']):
    # a context manager closes each annotation file as soon as it is parsed
    with open(os.path.join(root, fn), 'r') as rf:
        data = json.load(rf)

    failed_count = 0
    for item in data:
        # Ptf_2VRj-V0_000122_000132 -> video0
        vid = id2vid[item['videoID']]
        # video0 -> 0
        vid = int(vid[5:])
        if vid not in existed_videos:
            # we do not use the annotations of videos that are no longer accessible
            failed_count += 1
            continue

        if split == 'test':
            # sanity check: the public test file is expected to carry no Chinese captions
            assert 'chCap' not in item
        else:
            vid2captions_zh[vid] = item['chCap']

    if split != 'test':
        print(f'The `{split}` split has {len(data) - failed_count} videos, {failed_count} of {len(data)} videos are unaccessible!')

# training file: flat list with one record per (video, caption) pair
train_items_zh = []
for vid in splits['train']:
    for caption in vid2captions_zh[vid]:
        train_items_zh.append(dict(
            image=RELATIVE_PATH_FORMAT.format(vid=vid),  # we only record the relative path of a video
            caption=caption,
            image_id=vid,
        ))

with open(os.path.join(save_path, 'train.json'), 'w') as wf:
    json.dump(train_items_zh, wf)


# The public test file has no Chinese captions, so the last 1500 videos of the
# (sorted) validation split become the Chinese test split and the remaining
# videos stay in the validation split. A deep copy keeps `splits` itself untouched.
print('Constructing the test split by myself')
splits_zh = copy.deepcopy(splits)
splits_zh['test'] = splits_zh['val'][-1500:]
splits_zh['val'] = splits_zh['val'][:-1500]

print(f'Now, train: val: test == {len(splits_zh["train"])}: {len(splits_zh["val"])}: {len(splits_zh["test"])}')

# caption_id is deliberately not reset between modes, so annotation ids stay
# unique across the val and test ground-truth files
caption_id = 0
for mode in ['val', 'test']:
    # evaluation file: one record per video with all of its reference captions
    items = []
    for vid in splits_zh[mode]:
        items.append(dict(
            image=RELATIVE_PATH_FORMAT.format(vid=vid),
            caption=vid2captions_zh[vid],
            image_id=vid,
        ))

    with open(os.path.join(save_path, f'{mode}.json'), 'w') as wf:
        json.dump(items, wf)

    # ground-truth file: one annotation per reference caption, plus the list of video ids
    gt = {
        'annotations': [],
        'images': [{'id': vid} for vid in splits_zh[mode]],
    }

    for vid in splits_zh[mode]:
        for caption in vid2captions_zh[vid]:
            gt['annotations'].append(dict(
                image_id=vid,
                caption=caption,
                id=caption_id,
            ))
            caption_id += 1

    with open(os.path.join(save_path, f'{mode}_gt.json'), 'w') as wf:
        json.dump(gt, wf)

The `train` split has 25006 videos, 985 of 25991 videos are unaccessible!
The `val` split has 2893 videos, 107 of 3000 videos are unaccessible!
Constructing the test split by myself
Now, train: val: test == 25006: 1393: 1500


# 5. Subsets for Semi-Supervised Experiments

In [20]:
import os
import json
import random
from collections import defaultdict

SAVE_PATH = './annotations'

def run(dataset='coco', lang='en', ratios=[0.01, 0.1, 1, 10], seeds=[0, 1, 2]):
    """Write random training subsets of `SAVE_PATH/<dataset>/<lang>/train.json`.

    For every ratio (a percentage of the unique `image_id`s) and every seed, a
    subset is sampled and saved to
    `SAVE_PATH/<dataset>/<lang>/subsets/<ratio>%_<seed>.json`. A subset holds
    every record of each sampled id. Files that already exist are left untouched.

    Args:
        dataset: dataset folder name under SAVE_PATH.
        lang: language folder name under the dataset folder.
        ratios: percentages of unique ids to sample; a ratio that yields fewer
            than one id is reported and skipped. The default list is never mutated.
        seeds: random seeds, one subset file per (ratio, seed). The default list
            is never mutated.

    Raises:
        AssertionError: if the training file does not exist.
    """
    root = os.path.join(SAVE_PATH, dataset, lang)
    train_file = os.path.join(root, 'train.json')
    assert os.path.exists(train_file), train_file

    print('### load json path from', train_file)
    # a context manager closes the file handle as soon as the JSON is parsed
    with open(train_file, 'r') as rf:
        data = json.load(rf)

    # image/video id -> all of its records
    id2item = defaultdict(list)
    for item in data:
        id2item[item['image_id']].append(item)

    # sorted, so that a given seed always samples from the same sequence
    ids = sorted(id2item)

    print(f'### there are {len(id2item)} unique images/videos, {len(data)} vision-caption pairs')

    save_path = os.path.join(root, 'subsets')
    os.makedirs(save_path, exist_ok=True)

    for ratio in ratios:
        # `ratio` is a percentage, hence the division by 100
        n_unique_images = int(len(id2item) * ratio / 100)
        if n_unique_images < 1:
            print(f'{ratio} is not applicible')
            continue

        print(f'--- generating a training subset of {ratio}% ({n_unique_images}) unique images/videos')

        for seed in seeds:
            json_path = os.path.join(save_path, f'{ratio}%_{seed}.json')
            if os.path.exists(json_path):
                print(json_path, 'exists')
                continue

            # A private generator draws exactly what `random.seed(seed)` followed by
            # `random.sample(...)` would draw, without reseeding the global RNG.
            rng = random.Random(seed)

            this_ids = rng.sample(ids, n_unique_images)
            this_data = []
            for this_id in this_ids:
                this_data.extend(id2item[this_id])

            print(json_path)
            with open(json_path, 'w') as wf:
                json.dump(this_data, wf)

In [21]:
# COCO subsets; the Japanese (`ja`) ones are generated but not used at all.
for coco_lang in ('en', 'ja'):
    run('coco', coco_lang)

### load json path from ./annotations/coco/en/train.json
### there are 113287 unique images/videos, 566747 vision-caption pairs
--- generating a training subset of 0.01% (11) unique images/videos
./annotations/coco/en/subsets/0.01%_0.json exists
./annotations/coco/en/subsets/0.01%_1.json exists
./annotations/coco/en/subsets/0.01%_2.json exists
--- generating a training subset of 0.1% (113) unique images/videos
./annotations/coco/en/subsets/0.1%_0.json exists
./annotations/coco/en/subsets/0.1%_1.json exists
./annotations/coco/en/subsets/0.1%_2.json exists
--- generating a training subset of 1% (1132) unique images/videos
./annotations/coco/en/subsets/1%_0.json exists
./annotations/coco/en/subsets/1%_1.json exists
./annotations/coco/en/subsets/1%_2.json exists
--- generating a training subset of 10% (11328) unique images/videos
./annotations/coco/en/subsets/10%_0.json exists
./annotations/coco/en/subsets/10%_1.json exists
./annotations/coco/en/subsets/10%_2.json exists
### load json path

In [50]:
# Flickr30k subsets; the `en` and `cs` ones are generated but not used at all.
for flickr_lang in ('en', 'de', 'fr', 'cs'):
    run('flickr30k', flickr_lang)

### load json path from ./annotations/flickr30k/en/train.json
### there are 29000 unique images/videos, 145000 vision-caption pairs
--- generating a training subset of 0.01% (2) unique images/videos
./annotations/flickr30k/en/subsets/0.01%_0.json
./annotations/flickr30k/en/subsets/0.01%_1.json
./annotations/flickr30k/en/subsets/0.01%_2.json
--- generating a training subset of 0.1% (29) unique images/videos
./annotations/flickr30k/en/subsets/0.1%_0.json
./annotations/flickr30k/en/subsets/0.1%_1.json
./annotations/flickr30k/en/subsets/0.1%_2.json
--- generating a training subset of 1% (290) unique images/videos
./annotations/flickr30k/en/subsets/1%_0.json
./annotations/flickr30k/en/subsets/1%_1.json
./annotations/flickr30k/en/subsets/1%_2.json
--- generating a training subset of 10% (2900) unique images/videos
./annotations/flickr30k/en/subsets/10%_0.json
./annotations/flickr30k/en/subsets/10%_1.json
./annotations/flickr30k/en/subsets/10%_2.json
### load json path from ./annotations/flic

In [52]:
# MSR-VTT subsets (English only).
run(dataset='msrvtt', lang='en')

### load json path from ./annotations/msrvtt/en/train.json
### there are 6513 unique images/videos, 130260 vision-caption pairs
0.01 is not applicible
--- generating a training subset of 0.1% (6) unique images/videos
./annotations/msrvtt/en/subsets/0.1%_0.json
./annotations/msrvtt/en/subsets/0.1%_1.json
./annotations/msrvtt/en/subsets/0.1%_2.json
--- generating a training subset of 1% (65) unique images/videos
./annotations/msrvtt/en/subsets/1%_0.json
./annotations/msrvtt/en/subsets/1%_1.json
./annotations/msrvtt/en/subsets/1%_2.json
--- generating a training subset of 10% (651) unique images/videos
./annotations/msrvtt/en/subsets/10%_0.json
./annotations/msrvtt/en/subsets/10%_1.json
./annotations/msrvtt/en/subsets/10%_2.json


In [55]:
# VATEX subsets; the English (`en`) ones are generated but not used at all.
for vatex_lang in ('en', 'zh'):
    run('vatex', vatex_lang)

### load json path from ./annotations/vatex/en/train.json
### there are 25006 unique images/videos, 250060 vision-caption pairs
--- generating a training subset of 0.01% (2) unique images/videos
./annotations/vatex/en/subsets/0.01%_0.json
./annotations/vatex/en/subsets/0.01%_1.json
./annotations/vatex/en/subsets/0.01%_2.json
--- generating a training subset of 0.1% (25) unique images/videos
./annotations/vatex/en/subsets/0.1%_0.json
./annotations/vatex/en/subsets/0.1%_1.json
./annotations/vatex/en/subsets/0.1%_2.json
--- generating a training subset of 1% (250) unique images/videos
./annotations/vatex/en/subsets/1%_0.json
./annotations/vatex/en/subsets/1%_1.json
./annotations/vatex/en/subsets/1%_2.json
--- generating a training subset of 10% (2500) unique images/videos
./annotations/vatex/en/subsets/10%_0.json
./annotations/vatex/en/subsets/10%_1.json
./annotations/vatex/en/subsets/10%_2.json
### load json path from ./annotations/vatex/zh/train.json
### there are 25006 unique images/vi

# 6. How we obtain training corpora in 4 languages (`en`, `zh`, `de`, `fr`)

```bibtex
@inproceedings{sharma2018conceptual,
  title={Conceptual captions: A cleaned, hypernymed, image alt-text dataset for automatic image captioning},
  author={Sharma, Piyush and Ding, Nan and Goodman, Sebastian and Soricut, Radu},
  booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages={2556--2565},
  year={2018}
}
```

1. download the raw annotation files of Google's Conceptual Captions (GCC)
2. concat the training and validation files, yielding 3,334,173 English captions
3. pre-process captions by removing punctuation, etc.
4. equally divide captions into three parts (1,111,391 English captions each) from front to back, corresponding to English captions parallel to Chinese, German, and French, respectively
5. get the translated results of the 4th step's outputs via Google Translator (https://translate.google.com)
6. randomly sample 1,111,391 English captions from the 3rd step's outputs (i.e., the pre-processed captions)

Finally, we get 1,111,391 English-Chinese pairs, 1,111,391 English-German pairs, 1,111,391 English-French pairs, and 1,111,391 English-only sentences.

In [24]:
!wget -O GCC-training.tsv https://storage.googleapis.com/gcc-data/Train/GCC-training.tsv?_ga=2.191230122.-1896153081.1529438250 --no-check-certificate
!wget -O GCC-1.1.0-Validation.tsv https://storage.googleapis.com/gcc-data/Validation/GCC-1.1.0-Validation.tsv?_ga=2.141047602.-1896153081.1529438250 --no-check-certificate

--2023-04-13 00:08:18--  https://storage.googleapis.com/gcc-data/Train/GCC-training.tsv?_ga=2.191230122.-1896153081.1529438250
正在解析主机 storage.googleapis.com (storage.googleapis.com)... 142.251.42.240, 172.217.163.48, 172.217.160.112, ...
正在连接 storage.googleapis.com (storage.googleapis.com)|142.251.42.240|:443... 已连接。
警告: 无法验证 storage.googleapis.com 的由 “CN=GTS CA 1C3,O=Google Trust Services LLC,C=US” 颁发的证书:
  无法本地校验颁发者的权限。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：564607502 (538M) [application/octet-stream]
正在保存至: “GCC-training.tsv”


2023-04-13 00:10:02 (5.26 MB/s) - 已保存 “GCC-training.tsv” [564607502/564607502])

--2023-04-13 00:10:03--  https://storage.googleapis.com/gcc-data/Validation/GCC-1.1.0-Validation.tsv?_ga=2.141047602.-1896153081.1529438250
正在解析主机 storage.googleapis.com (storage.googleapis.com)... 142.251.43.16, 172.217.160.80, 172.217.163.48, ...
正在连接 storage.googleapis.com (storage.googleapis.com)|142.251.43.16|:443... 已连接。
警告: 无法验证 storage.googleapis.com 的由 “CN=GTS CA 1C3,O=Google T

In [30]:
# drew inspiration from the pre-processing of ALBEF (https://github.com/salesforce/ALBEF)
def clean(caption):
    """Normalize a raw caption into a lower-cased, punctuation-free string.

    The caption is lower-cased and stripped, split on single spaces, and every
    piece is split again on ``-`` and ``/``. The characters
    ``' , . ( ) ; ! : # " * ~ ?`` are then deleted from each piece, pieces that
    end up empty are dropped, and at most the first 50 pieces are kept.

    Args:
        caption (str): the raw caption text.

    Returns:
        str: the surviving pieces joined by single spaces ('' if none survive).
    """
    # NOTE (from the original author): these rules were derived step by step via
    # assertions and were reported to succeed on SBU.

    # deletion table for the punctuation characters removed from every piece
    strip_table = str.maketrans('', '', '\',.();!:#"*~?')

    tokens = []
    for chunk in caption.lower().strip().split(' '):
        # '-' and '/' both act as separators; mapping '/' to '-' first yields the
        # same pieces, in the same order, as splitting on one and then the other
        for piece in chunk.replace('/', '-').split('-'):
            # each piece is cleaned exactly once
            piece = piece.translate(strip_table)
            if piece:
                tokens.append(piece)

    return ' '.join(tokens[:50])

In [31]:
# Collect the cleaned caption of every `caption<TAB>url` row, reading the
# training file first and the validation file second.
captions = []

for tsv_name in ('GCC-training.tsv', 'GCC-1.1.0-Validation.tsv'):
    with open(tsv_name, 'r') as f:
        for line in f:
            # every row must hold exactly two tab-separated fields
            caption, url = line.strip().split('\t')
            captions.append(clean(caption))

In [32]:
# total number of cleaned captions collected from the two GCC files
len(captions)

3334173

In [33]:
import os
import numpy as np

langs = ['zh', 'de', 'fr']

# Cut `captions` front to back into len(langs) contiguous chunks with plain list
# slicing. The chunk sizes follow the np.array_split rule (the first `extra`
# chunks get one more item), but the captions stay ordinary Python strings
# instead of being copied into a fixed-width NumPy unicode array.
chunk_size, extra = divmod(len(captions), len(langs))
captions_splits = []
start = 0
for index in range(len(langs)):
    stop = start + chunk_size + (1 if index < extra else 0)
    captions_splits.append(captions[start:stop])
    start = stop

# one English file per target language, one caption per line
for lang, data in zip(langs, captions_splits):
    fn = f'cc3m_en_to_{lang}.txt'
    with open(fn, 'w') as f:
        f.write('\n'.join(data))

    print(lang, len(data), fn)

# now, you should translate these generated files to the specific target language by yourself
# we use Google Translator

zh 1111391 cc3m_en_to_zh.txt
de 1111391 cc3m_en_to_de.txt
fr 1111391 cc3m_en_to_fr.txt


In [35]:
import random
# Draw an English-only corpus as large as one translation chunk, with a fixed
# seed so the sample is reproducible, and store it one caption per line.
random.seed(0)
with open('cc3m_en.txt', 'w') as f:
    n_english_only = len(captions_splits[0])
    english_only = random.sample(captions, n_english_only)
    f.write('\n'.join(english_only))