In [1]:
import warnings
import os
warnings.filterwarnings("ignore")

# Directory (relative to the repo root) holding model checkpoints / caches.
ZERONLG_HOME = 'data/checkpoints'

# The working directory is moved to the parent of the directory the kernel was
# started in (assumes the notebook sits in a direct subdirectory of the repo
# root -- TODO confirm). REPO is resolved only once per kernel: recomputing it
# on every run of this cell would climb one directory higher each time, because
# the previous run already changed the working directory.
if 'REPO' not in globals():
    REPO = os.path.dirname(os.path.realpath('.'))
os.chdir(REPO)

# Where the caption / translation JSON files of this notebook are written.
save_path = 'output/coca_results'
os.makedirs(save_path, exist_ok=True)

# CoCa Inference

In [2]:
!pip install open_clip_torch==2.14.0



In [2]:
import open_clip
from open_clip.coca_model import CoCa as open_clip_coca
from open_clip.coca_model import prepare_inputs_for_generation
from transformers import (
    BeamSearchScorer,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
)
class CoCa(open_clip_coca):
    """open_clip CoCa subclass with an image encoder that also accepts 5-D
    inputs and a group beam search that calls ``BeamSearchScorer`` without
    ``beam_indices``.

    NOTE: the methods reference ``torch`` and ``F``, which are imported in a
    later cell of this notebook; that cell must have run before any method of
    this class is called.
    """
    def _encode_image(self, images, normalize=True):
        """Encode images (or a stack of frames) with ``self.visual``.

        Args:
            images: tensor whose first two dimensions are read as ``(B, T)``.
                A 5-D input with ``T > 1`` is treated as a video and flattened
                to ``B * T`` items; a 5-D input with ``T == 1`` has dimension 1
                squeezed away. Inputs of other rank are passed through as-is.
            normalize: when true, ``image_latent`` is passed through
                ``F.normalize`` along its last dimension.

        Returns:
            ``(image_latent, tokens_embs)``. For video input ``tokens_embs`` is
            replaced by the (un-normalized) per-frame ``image_latent`` viewed
            as ``(B, T, -1)``; ``image_latent`` itself keeps the shape
            returned by ``self.visual`` for the flattened frames.
        """
        is_video = False
        B, T = images.shape[:2]
        if images.dim() == 5:
            if T > 1:
                is_video = True
                # fold the frame axis into the batch axis for the visual tower
                images = images.view(B * T, *images.shape[2:])
            else:
                images = images.squeeze(1)

        image_latent, tokens_embs = self.visual(images)

        if is_video:
            # use one latent per frame as the token sequence
            tokens_embs = image_latent.view(B, T, -1)

        image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent
        return image_latent, tokens_embs

    def _generate_beamsearch(
            self,
            image_inputs,
            pad_token_id=None,
            eos_token_id=None,
            sot_token_id=None,
            num_beams=6,
            num_beam_groups=3,
            min_seq_len=5,
            stopping_criteria=None,
            logit_processor=None,
            logit_warper=None,
    ):
        """Generate token sequences with group beam search.

        Args:
            image_inputs: image tensor; its first dimension is the batch size
                and it is repeated ``num_beams`` times along that dimension.
            pad_token_id: forwarded to ``beam_scorer.process`` / ``finalize``.
            eos_token_id: forwarded to the beam scorer and to the default
                ``MinLengthLogitsProcessor``.
            sot_token_id: value every initial ``input_ids`` entry is set to.
            num_beams: beams per batch item.
            num_beam_groups: number of groups; each group holds
                ``num_beams // num_beam_groups`` beams.
            min_seq_len: minimum length used by the default logits processor.
            stopping_criteria: called as ``stopping_criteria(input_ids, None)``
                after every step and read for ``.max_length`` at the end.
                NOTE(review): the default ``None`` is not handled, so a value
                must always be supplied.
            logit_processor: replaces the default ``LogitsProcessorList`` when
                not ``None``.
            logit_warper: not referenced anywhere in this method.

        Returns:
            ``sequence_outputs['sequences']`` from ``beam_scorer.finalize``.

        Raises:
            ValueError: if ``input_ids`` does not have
                ``num_beams * batch_size`` rows.
        """
        device = image_inputs.device
        batch_size = image_inputs.shape[0]
        image_inputs = torch.repeat_interleave(image_inputs, num_beams, dim=0)
        # encode once up front; the latents are re-used at every decoding step
        image_latent, image_embs = self._encode_image(image_inputs)

        # every beam starts from a single start-of-text token
        input_ids = torch.ones((batch_size * num_beams, 1), device=device, dtype=torch.long)
        input_ids = input_ids * sot_token_id
        beam_scorer = BeamSearchScorer(
            batch_size=batch_size,
            num_beams=num_beams,
            device=device,
            num_beam_groups=num_beam_groups,
        )
        # instantiate logits processors
        logits_processor = (
            LogitsProcessorList([MinLengthLogitsProcessor(min_seq_len, eos_token_id=eos_token_id)])
            if logit_processor is None
            else logit_processor
        )

        # re-read the search geometry from the scorer
        batch_size = len(beam_scorer._beam_hyps)
        num_beams = beam_scorer.num_beams
        num_beam_groups = beam_scorer.num_beam_groups
        num_sub_beams = num_beams // num_beam_groups
        batch_beam_size, cur_len = input_ids.shape
        # never reassigned below, so the two `sum(beam_indices, ())` expressions
        # further down always evaluate to None
        beam_indices = None

        if num_beams * batch_size != batch_beam_size:
            raise ValueError(
                f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
            )

        beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device)
        # initialise score of first beam of each group with 0 and the rest with -1e9. This ensures that the beams in
        # the same group don't produce same tokens everytime.
        beam_scores[:, ::num_sub_beams] = 0
        beam_scores = beam_scores.view((batch_size * num_beams,))

        while True:

            # predicted tokens in cur_len step
            current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device)

            # indices which will form the beams in the next time step
            # NOTE(review): filled in below but never read in this method
            reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device)

            # do one decoder step on all beams of all sentences in batch
            model_inputs = prepare_inputs_for_generation(input_ids=input_ids, image_inputs=image_inputs)
            outputs = self(
                model_inputs['images'],
                model_inputs['text'],
                embed_cls=False,
                image_latent=image_latent,
                image_embs=image_embs
            )

            for beam_group_idx in range(num_beam_groups):
                group_start_idx = beam_group_idx * num_sub_beams
                group_end_idx = min(group_start_idx + num_sub_beams, num_beams)
                group_size = group_end_idx - group_start_idx

                # indices of beams of current group among all sentences in batch
                batch_group_indices = []

                for batch_idx in range(batch_size):
                    batch_group_indices.extend(
                        [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)]
                    )
                group_input_ids = input_ids[batch_group_indices]

                # select outputs of beams of current group only
                next_token_logits = outputs['logits'][batch_group_indices, -1, :]
                vocab_size = next_token_logits.shape[-1]

                next_token_scores_processed = logits_processor(
                    group_input_ids, next_token_logits, current_tokens=current_tokens, beam_group_idx=beam_group_idx
                )
                # add each beam's running score to its processed next-token scores
                next_token_scores = next_token_scores_processed + beam_scores[batch_group_indices].unsqueeze(-1)
                next_token_scores = next_token_scores.expand_as(next_token_scores_processed)

                # reshape for beam search
                next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size)

                # keep 2 * group_size candidates per batch item
                next_token_scores, next_tokens = torch.topk(
                    next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True
                )

                # split the flat candidate index into (beam within group, token id)
                next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
                next_tokens = next_tokens % vocab_size

                # stateless
                # NOTE(review): computed but not passed to beam_scorer.process
                process_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
                beam_outputs = beam_scorer.process(
                    group_input_ids,
                    next_token_scores,
                    next_tokens,
                    next_indices,
                    pad_token_id=pad_token_id,
                    eos_token_id=eos_token_id,
                )
                beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"]
                beam_next_tokens = beam_outputs["next_beam_tokens"]
                beam_idx = beam_outputs["next_beam_indices"]

                # reorder this group's histories to the surviving beams, then
                # record the token each of them emits at this step
                input_ids[batch_group_indices] = group_input_ids[beam_idx]
                group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
                current_tokens[batch_group_indices] = group_input_ids[:, -1]

                # (beam_idx // group_size) -> batch_idx
                # (beam_idx % group_size) -> offset of idx inside the group
                reordering_indices[batch_group_indices] = (
                    num_beams * torch.div(beam_idx, group_size, rounding_mode="floor") + group_start_idx + (beam_idx % group_size)
                )

            # append this step's tokens of all groups to the running sequences
            input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1)

            # increase cur_len
            cur_len = cur_len + 1
            if beam_scorer.is_done or stopping_criteria(input_ids, None):
                break

        # NOTE(review): computed but not passed to beam_scorer.finalize
        final_beam_indices = sum(beam_indices, ()) if beam_indices is not None else None
        # next_tokens / next_indices hold the values of the last group processed
        # in the final loop iteration
        sequence_outputs = beam_scorer.finalize(
            input_ids,
            beam_scores,
            next_tokens,
            next_indices,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            max_length=stopping_criteria.max_length,
        )
        return sequence_outputs['sequences']

In [3]:
import json
import logging
import os
import pathlib
import re
from copy import deepcopy
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union

import torch
import torch.nn.functional as F

from open_clip.constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
from open_clip.model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\
    resize_pos_embed, get_cast_dtype
from open_clip.loss import ClipLoss, DistillClipLoss, CoCaLoss
from open_clip.openai import load_openai_model
from open_clip.pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained, list_pretrained_tags_by_model, download_pretrained_from_hf
from open_clip.transform import image_transform, AugmentationCfg
from open_clip.tokenizer import HFTokenizer, tokenize
from open_clip.factory import HF_HUB_PREFIX, get_model_config, load_checkpoint


def create_model(
        model_name: str,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_patch_dropout: Optional[float] = None,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        pretrained_image: bool = False,
        pretrained_hf: bool = True,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
        require_pretrained: bool = False,
):
    """Build a CLIP-style model, using this notebook's ``CoCa`` class for CoCa configs.

    Args:
        model_name: config name, or an id starting with ``HF_HUB_PREFIX``, in
            which case the checkpoint and ``open_clip_config.json`` are fetched
            with ``download_pretrained_from_hf``.
        pretrained: ``'openai'`` (case-insensitive) to load OpenAI weights, a
            pretrained tag known for ``model_name``, or a path to an existing
            checkpoint file.
        precision: forwarded to ``get_cast_dtype``; ``'fp16'`` / ``'bf16'``
            additionally convert the weights with ``convert_weights_to_lp``.
        device: device (string or ``torch.device``) the model is moved to.
        jit: forwarded to ``load_openai_model``; on the non-OpenAI path the
            model is wrapped with ``torch.jit.script``.
        force_quick_gelu: sets ``quick_gelu`` in the model config.
        force_custom_text: build a custom-text model even if the config does
            not ask for one.
        force_patch_dropout: overrides ``vision_cfg.patch_dropout``.
        force_image_size: overrides ``vision_cfg.image_size``.
        pretrained_image: request pretrained timm weights for the image tower
            (only valid when ``vision_cfg`` names a timm model).
        pretrained_hf: value written to ``text_cfg.hf_model_pretrained`` for
            configs that name a HF text model.
        cache_dir: download cache directory forwarded to the download helpers.
        output_dict: when truthy, sets ``model.output_dict = True`` on models
            that expose that attribute.
        require_pretrained: raise if no pretrained weights ended up loaded.

    Returns:
        The constructed model, moved to ``device``.

    Raises:
        RuntimeError: the model config is unknown, the requested pretrained
            weights cannot be located, or ``require_pretrained`` is set and
            nothing was loaded.
        AssertionError: ``pretrained_image`` is set for a non-timm image tower.
    """
    has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX)
    if has_hf_hub_prefix:
        model_id = model_name[len(HF_HUB_PREFIX):]
        checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir)
        config_path = download_pretrained_from_hf(model_id, filename='open_clip_config.json', cache_dir=cache_dir)

        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        pretrained_cfg = config['preprocess_cfg']
        model_cfg = config['model_cfg']
    else:
        model_name = model_name.replace('/', '-')  # for callers using old naming with / in ViT names
        checkpoint_path = None
        pretrained_cfg = {}
        model_cfg = None

    if isinstance(device, str):
        device = torch.device(device)

    if pretrained and pretrained.lower() == 'openai':
        logging.info(f'Loading pretrained {model_name} from OpenAI.')
        model = load_openai_model(
            model_name,
            precision=precision,
            device=device,
            jit=jit,
            cache_dir=cache_dir,
        )

        # to always output dict even if it is clip
        if output_dict and hasattr(model, "output_dict"):
            model.output_dict = True
    else:
        model_cfg = model_cfg or get_model_config(model_name)
        if model_cfg is not None:
            logging.info(f'Loaded {model_name} model config.')
        else:
            # `list_models` is not part of this notebook's import cell; import it
            # here so this error path reports the available models instead of
            # failing with a NameError.
            from open_clip.factory import list_models
            logging.error(f'Model config for {model_name} not found; available models {list_models()}.')
            raise RuntimeError(f'Model config for {model_name} not found.')

        if force_quick_gelu:
            # override for use of QuickGELU on non-OpenAI transformer models
            model_cfg["quick_gelu"] = True

        if force_patch_dropout is not None:
            # override the default patch dropout value
            model_cfg["vision_cfg"]["patch_dropout"] = force_patch_dropout

        if force_image_size is not None:
            # override model config's image size
            model_cfg["vision_cfg"]["image_size"] = force_image_size

        if pretrained_image:
            if 'timm_model_name' in model_cfg.get('vision_cfg', {}):
                # pretrained weight loading for timm models set via vision_cfg
                model_cfg['vision_cfg']['timm_model_pretrained'] = True
            else:
                assert False, 'pretrained image towers currently only supported for timm models'

        cast_dtype = get_cast_dtype(precision)
        is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {})
        custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model

        if custom_text:
            if is_hf_model:
                model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf
            if "coca" in model_name:
                # the CoCa subclass defined earlier in this notebook
                model = CoCa(**model_cfg, cast_dtype=cast_dtype)
            else:
                model = CustomTextCLIP(**model_cfg, cast_dtype=cast_dtype)
        else:
            model = CLIP(**model_cfg, cast_dtype=cast_dtype)

        pretrained_loaded = False
        if pretrained:
            # an explicit `pretrained` takes precedence over any hub checkpoint
            checkpoint_path = ''
            pretrained_cfg = get_pretrained_cfg(model_name, pretrained)
            if pretrained_cfg:
                checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir)
            elif os.path.exists(pretrained):
                checkpoint_path = pretrained

            if checkpoint_path:
                logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
                load_checkpoint(model, checkpoint_path)
            else:
                error_str = (
                    f'Pretrained weights ({pretrained}) not found for model {model_name}. '
                    f'Available pretrained tags ({list_pretrained_tags_by_model(model_name)}).')
                logging.warning(error_str)
                raise RuntimeError(error_str)
            pretrained_loaded = True
        elif has_hf_hub_prefix:
            logging.info(f'Loading pretrained {model_name} weights ({pretrained}).')
            load_checkpoint(model, checkpoint_path)
            pretrained_loaded = True

        if require_pretrained and not pretrained_loaded:
            # callers of create_model_from_pretrained always expect pretrained weights
            raise RuntimeError(
                f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.')

        model.to(device=device)
        if precision in ("fp16", "bf16"):
            convert_weights_to_lp(model, dtype=torch.bfloat16 if precision == 'bf16' else torch.float16)

        # set image / mean metadata from pretrained_cfg if available, or use default
        model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN
        model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD

        # to always output dict even if it is clip
        if output_dict and hasattr(model, "output_dict"):
            model.output_dict = True

        if jit:
            model = torch.jit.script(model)

    return model

def create_model_and_transforms(
        model_name: str,
        pretrained: Optional[str] = None,
        precision: str = 'fp32',
        device: Union[str, torch.device] = 'cpu',
        jit: bool = False,
        force_quick_gelu: bool = False,
        force_custom_text: bool = False,
        force_patch_dropout: Optional[float] = None,
        force_image_size: Optional[Union[int, Tuple[int, int]]] = None,
        pretrained_image: bool = False,
        pretrained_hf: bool = True,
        image_mean: Optional[Tuple[float, ...]] = None,
        image_std: Optional[Tuple[float, ...]] = None,
        aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None,
        cache_dir: Optional[str] = None,
        output_dict: Optional[bool] = None,
):
    """Create a model with ``create_model`` plus its train / val image transforms.

    The model-building arguments are forwarded unchanged to ``create_model``.
    ``image_mean`` / ``image_std`` fall back to the attributes stored on
    ``model.visual`` when not given; ``aug_cfg`` only affects the training
    transform.

    Returns:
        ``(model, preprocess_train, preprocess_val)``.
    """
    build_options = dict(
        precision=precision,
        device=device,
        jit=jit,
        force_quick_gelu=force_quick_gelu,
        force_custom_text=force_custom_text,
        force_patch_dropout=force_patch_dropout,
        force_image_size=force_image_size,
        pretrained_image=pretrained_image,
        pretrained_hf=pretrained_hf,
        cache_dir=cache_dir,
        output_dict=output_dict,
    )
    model = create_model(model_name, pretrained, **build_options)

    # explicit statistics win; otherwise use whatever the model carries
    if not image_mean:
        image_mean = getattr(model.visual, 'image_mean', None)
    if not image_std:
        image_std = getattr(model.visual, 'image_std', None)
    norm_stats = dict(mean=image_mean, std=image_std)

    preprocess_train = image_transform(
        model.visual.image_size, is_train=True, aug_cfg=aug_cfg, **norm_stats)
    preprocess_val = image_transform(
        model.visual.image_size, is_train=False, **norm_stats)

    return model, preprocess_train, preprocess_val

In [4]:
# Build CoCa ViT-B/32 with the laion2B-s13B-b90k weights on the GPU; only the
# validation transform is kept, the training transform is discarded.
model, _, transform = create_model_and_transforms(
    "coca_ViT-B-32",
    "laion2B-s13B-b90k",
    device='cuda',
    cache_dir=ZERONLG_HOME,
)

'(MaxRetryError("HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /laion/CoCa-ViT-B-32-laion2B-s13B-b90k/resolve/main/open_clip_pytorch_model.bin (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f24fc4c80d0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))"), '(Request ID: 12615bbb-d178-4c70-b3d6-6a8eef6ef55f)')' thrown while requesting HEAD https://huggingface.co/laion/CoCa-ViT-B-32-laion2B-s13B-b90k/resolve/main/open_clip_pytorch_model.bin


LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on.

In [6]:
import open_clip
import configs
import json
import os

from tqdm import tqdm
from torch.utils.data import DataLoader
from zeronlg import CaptionDataset

# Datasets to caption and, for each of them, the annotation languages whose
# test split is used (the two lists are aligned by position).
all_datasets = [
    'msrvtt',
    'coco',
    'vatex',
    'flickr30k',
]
all_langs = [
    ['en'],
    ['en'],
    ['zh'],
    ['zh', 'de', 'fr'],
]

# Keyword arguments forwarded to `model.generate`.
generation_kwargs = {
    'num_beams': 3,
    'min_seq_len': 3,
    'seq_len': 20
}

for tag, langs in zip(all_datasets, all_langs):
    for lang in langs:
        dataset = CaptionDataset(
            vision_root=configs.image_video_root[tag],
            ann_rpath=f'{configs.annotation_root}/{tag}/{lang}/test.json',
            lang=lang,
            return_images=True,
        )
        # one sample per batch, in dataset order
        loader = DataLoader(
            dataset,
            batch_size=1,
            sampler=None,
            shuffle=False,
            collate_fn=dataset.collate_fn,
            drop_last=False,
        )

        results = []
        for batch in tqdm(loader):
            image_ids, images = batch
            image = images[0][len(images[0]) // 2] # get the middle frame
            # add the two leading dimensions the model's image encoder reads as (B, T)
            im = [transform(image).unsqueeze(0)]
            im = torch.stack(im, dim=1).to('cuda')

            with torch.no_grad(), torch.cuda.amp.autocast():
                generated = model.generate(im, **generation_kwargs)

            # keep the text before the end marker and drop the start marker
            caption = open_clip.decode(generated[0]).split("<end_of_text>")[0].replace("<start_of_text>", "")
            results.append({"image_id": image_ids[0], "caption": caption})

        print(tag, lang, results[:5])
        results_file = os.path.join(save_path, f'coca-b-32_{lang}_{tag}.json')
        # context manager so the file is flushed and closed before later cells read it
        with open(results_file, 'w') as wf:
            json.dump(results, wf)

100%|██████████| 2990/2990 [36:38<00:00,  1.36it/s]


msrvtt en [{'image_id': 7010, 'caption': 'how to download free mp 3 from youtube to your computer '}, {'image_id': 7011, 'caption': 'pink color in english '}, {'image_id': 7012, 'caption': 'watch this video of ed sheeran on the today show '}, {'image_id': 7013, 'caption': 'how to use the fv - 1 2 1 1 to decode a set of 2 2 '}, {'image_id': 7014, 'caption': 'the x - factor 2 0 1 5 : who is the new face of the show ? '}]


100%|██████████| 5000/5000 [1:07:22<00:00,  1.24it/s]


coco en [{'image_id': 391895, 'caption': 'a man on a mountain bike in the huanglong national park , huanglong , china '}, {'image_id': 60623, 'caption': 'a group of people are eating a big bowl of food . one is a woman is one of '}, {'image_id': 483108, 'caption': 'a man is waiting for a train at the train station in the city of luang prabang '}, {'image_id': 384213, 'caption': 'the kitchen is one of the more well - known in the world . '}, {'image_id': 386164, 'caption': 'photo of a collection of wooden kitchen tools '}]


100%|██████████| 1500/1500 [21:01<00:00,  1.19it/s]


vatex zh [{'image_id': 27439, 'caption': 'watch this video of a man who is a new day after a 2 1 - year - old '}, {'image_id': 27440, 'caption': 'watch this video of a man who is a black man who is a white man who is a '}, {'image_id': 27441, 'caption': 'watch this baby play with a bucket of water '}, {'image_id': 27442, 'caption': 'a white paint roller is used to paint a wooden wall in a house . - paint roller stock '}, {'image_id': 27443, 'caption': 'how to clean a white tiled floor with a mops '}]


100%|██████████| 1000/1000 [11:43<00:00,  1.42it/s]


flickr30k zh [{'image_id': 1009692167, 'caption': 'a new group of new canaan police department k - 9 s is set to be a part '}, {'image_id': 1021439420, 'caption': 'photo : the family is all set up for a day of family fun . this is one of '}, {'image_id': 1032122270, 'caption': 'two english cocker spaniels and a great dane in a field '}, {'image_id': 1043819504, 'caption': 'photo : the first day of the 2 0 1 2 ice age day camp . '}, {'image_id': 1095580424, 'caption': 'a black and white dog is running in a large bag of food . '}]


100%|██████████| 1000/1000 [11:48<00:00,  1.41it/s]


flickr30k de [{'image_id': 1007129816, 'caption': 'a man with a hat made out of a 1 2 - pack of 1 2 - pack of '}, {'image_id': 1009434119, 'caption': 'english : a boston is a small english dog . '}, {'image_id': 101362133, 'caption': '2 0 1 2 world cup of karate - day 1 '}, {'image_id': 102617084, 'caption': 'a group of people in a snow day . one is a white man in a red jacket and '}, {'image_id': 10287332, 'caption': 'new home construction - new home construction is a great time to do home projects . we can help '}]


100%|██████████| 1000/1000 [10:43<00:00,  1.55it/s]

flickr30k fr [{'image_id': 1007129816, 'caption': 'a man with a hat made out of a 1 2 - pack of 1 2 - pack of '}, {'image_id': 1009434119, 'caption': 'english : a boston is a small english dog . '}, {'image_id': 101362133, 'caption': '2 0 1 2 world cup of karate - day 1 '}, {'image_id': 102617084, 'caption': 'a group of people in a snow day . one is a white man in a red jacket and '}, {'image_id': 10287332, 'caption': 'new home construction - new home construction is a great time to do home projects . we can help '}]





# Translate CoCa's results with NLLB

In [2]:
# if your `transformers` version is low, e.g., 4.12.5
# then you should upgrade it to load the NLLB model
!pip install transformers==4.27.1



In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The NLLB checkpoint is looked up under ZERONLG_HOME, in a directory named
# after the Hub id with "/" replaced by "_" (expected to exist locally).
model_name = "facebook/nllb-200-distilled-600M"
model_name = "/".join((ZERONLG_HOME, model_name.replace('/', '_')))
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [5]:
import json
import torch
from zeronlg.utils import batch_to_device

# Suffix appended to the translated result files. Kept under its own name so
# the dataset loop variable `tag` below does not overwrite it.
translator_tag = 'NLLB'
# notebook language code -> NLLB language code
mapping = {
    'en': 'eng_Latn',
    'zh': 'zho_Hans',
    'de': 'deu_Latn',
    'fr': 'fra_Latn',
}
batch_size = 32
# Prefer GPU 7 when the host has one; otherwise fall back to the default CUDA
# device, or to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device("cuda:7" if torch.cuda.device_count() > 7 else "cuda")
else:
    device = torch.device("cpu")
model.to(device)
# the captions to translate are English
tokenizer.src_lang = mapping['en']

# Only the non-English test sets need translating (lists aligned by position).
all_datasets = [
    'vatex',
    'flickr30k',
]
all_langs = [
    ['zh'],
    ['zh', 'de', 'fr'],
]

for tag, langs in zip(all_datasets, all_langs):
    for lang in langs:
        src_path = f"{save_path}/coca-b-32_{lang}_{tag}.json"
        trg_path = f"{save_path}/coca-b-32_{lang}_{tag}_{translator_tag}.json"

        print(f'translating {src_path} ...')
        with open(src_path, 'r') as rf:
            data = json.load(rf)

        results = []

        # walk the captions in chunks of `batch_size`; the last chunk may be shorter
        for start in range(0, len(data), batch_size):
            chunk = data[start:start + batch_size]
            text = [line['caption'] for line in chunk]
            encoded_text = tokenizer(text, return_tensors='pt', padding=True)
            encoded_text = batch_to_device(encoded_text, device)
            # force decoding to start with the target-language token
            generated_tokens = model.generate(
                **encoded_text,
                forced_bos_token_id=tokenizer.lang_code_to_id[mapping[lang]]
            )
            res = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            # replace each caption in place, keeping the other fields of the record
            for line, caption in zip(chunk, res):
                line['caption'] = caption
                results.append(line)

        print(f'save results to {trg_path}')
        with open(trg_path, 'w') as wf:
            json.dump(results, wf)

translating output/coca_results/coca-b-32_zh_vatex.json ...
save results to output/coca_results/coca-b-32_zh_vatex_NLLB.json
translating output/coca_results/coca-b-32_zh_flickr30k.json ...
save results to output/coca_results/coca-b-32_zh_flickr30k_NLLB.json
translating output/coca_results/coca-b-32_de_flickr30k.json ...
save results to output/coca_results/coca-b-32_de_flickr30k_NLLB.json
translating output/coca_results/coca-b-32_fr_flickr30k.json ...
save results to output/coca_results/coca-b-32_fr_flickr30k_NLLB.json


# Evaluation

In [6]:
# Switch `transformers` back to 4.12.5 before running the evaluation script in
# the cells below (presumably the version that script expects -- TODO confirm).
!pip install transformers==4.12.5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting transformers==4.12.5
  Downloading transformers-4.12.5-py3-none-any.whl.metadata (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m335.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.11,>=0.10.1 (from transformers==4.12.5)
  Using cached tokenizers-0.10.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
Using cached transformers-4.12.5-py3-none-any.whl (3.1 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenize

In [7]:
# Score the English CoCa captions for MSR-VTT.
!python infer_caption.py --results_file output/coca_results/coca-b-32_en_msrvtt.json --dataset msrvtt --lang en

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-02-25 17:49:59 - results_file: output/coca_results/coca-b-32_en_msrvtt.json
2024-02-25 17:49:59 - gt_file: data/annotations/msrvtt/en/test_gt.json
loading annotations into memory...
Done (t=0.23s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 617049 tokens at 558328.57 tokens per second.
PTBTokenizer tokenized 48701 tokens at 144096.82 tokens per second.
setting up scorers...
computing Bleu score...
{'testlen': 41143, 'reflen': 37148, 'guess': [41143, 38153, 35163, 32174], 'correct': [11513, 2500, 562, 154]}
ratio: 1.1075428017658797
Bleu_1: 0.280
Bleu_2: 0.135
Bleu_3: 0.066
Bleu_4: 0.034
computing

In [8]:
# Score the English CoCa captions for COCO.
!python infer_caption.py --results_file output/coca_results/coca-b-32_en_coco.json --dataset coco --lang en

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-02-25 17:51:25 - results_file: output/coca_results/coca-b-32_en_coco.json
2024-02-25 17:51:25 - gt_file: data/annotations/coco/en/test_gt.json
loading annotations into memory...
Done (t=0.19s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
tokenization...
PTBTokenizer tokenized 307085 tokens at 573385.60 tokens per second.
Feb 25, 2024 5:51:26 PM edu.stanford.nlp.process.PTBLexer next
PTBTokenizer tokenized 80843 tokens at 189421.10 tokens per second.
setting up scorers...
computing Bleu score...
{'testlen': 69059, 'reflen': 58042, 'guess': [69059, 64059, 59061, 54080], 'correct': [20370, 5963, 1716, 512]}
ratio: 1.1898108266427554
Bleu_

In [9]:
# Score the NLLB-translated Chinese captions for VATEX.
!python infer_caption.py --results_file output/coca_results/coca-b-32_zh_vatex_NLLB.json --dataset vatex --lang zh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-02-25 17:52:18 - results_file: output/coca_results/coca-b-32_zh_vatex_NLLB.json
2024-02-25 17:52:18 - gt_file: data/annotations/vatex/zh/test_gt.json
loading annotations into memory...
Done (t=0.06s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
Building prefix dict from the default dictionary ...
2024-02-25 17:52:18 - Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2024-02-25 17:52:18 - Loading model from cache /tmp/jieba.cache
Loading model cost 1.215 seconds.
2024-02-25 17:52:19 - Loading model cost 1.215 seconds.
Prefix dict has been built successfully.
2024-02-25 17:52:19 - Prefix dict

In [10]:
# Score the NLLB-translated Chinese captions for Flickr30k.
!python infer_caption.py --results_file output/coca_results/coca-b-32_zh_flickr30k_NLLB.json --dataset flickr30k --lang zh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-02-25 17:52:53 - results_file: output/coca_results/coca-b-32_zh_flickr30k_NLLB.json
2024-02-25 17:52:53 - gt_file: data/annotations/flickr30k/zh/test_gt.json
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
Building prefix dict from the default dictionary ...
2024-02-25 17:52:53 - Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
2024-02-25 17:52:53 - Loading model from cache /tmp/jieba.cache
Loading model cost 1.243 seconds.
2024-02-25 17:52:54 - Loading model cost 1.243 seconds.
Prefix dict has been built successfully.
2024-02-25 17:52:54 - Pre

In [11]:
# Score the NLLB-translated German captions for Flickr30k.
!python infer_caption.py --results_file output/coca_results/coca-b-32_de_flickr30k_NLLB.json --dataset flickr30k --lang de

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-02-25 17:53:18 - results_file: output/coca_results/coca-b-32_de_flickr30k_NLLB.json
2024-02-25 17:53:18 - gt_file: data/annotations/flickr30k/de/test_gt.json
loading annotations into memory...
Done (t=0.01s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
2024-02-25 17:53:18 - Initializing native server...
2024-02-25 17:53:18 - java -Xmx4g -cp "/data/yb/checkpoints/stanford-corenlp-4.5.2/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
2024-02-25 17:53:18 - Server shell PID: 1668829
2024-02-25 17:53:19 - The server is available.
setting up scorers...
computing Bleu score...
{'testlen': 12443, 'reflen': 10754, 'guess': [12443,

In [12]:
# Score the NLLB-translated French captions for Flickr30k.
!python infer_caption.py --results_file output/coca_results/coca-b-32_fr_flickr30k_NLLB.json --dataset flickr30k --lang fr

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2024-02-25 17:54:22 - results_file: output/coca_results/coca-b-32_fr_flickr30k_NLLB.json
2024-02-25 17:54:22 - gt_file: data/annotations/flickr30k/fr/test_gt.json
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.00s)
creating index...
index created!
2024-02-25 17:54:22 - Initializing native server...
2024-02-25 17:54:22 - java -Xmx4g -cp "/data/yb/checkpoints/stanford-corenlp-4.5.2/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000
2024-02-25 17:54:22 - Server shell PID: 2092534
2024-02-25 17:54:23 - The server is available.
setting up scorers...
computing Bleu score...
{'testlen': 14221, 'reflen': 12839, 'guess': [14221,