# ImageCLEFmedical 2023 Task1-Caption

https://www.imageclef.org/2023/medical/caption

In [1]:
import os
import re
import json

def prepare_id_mapping(image_ids, save_path):
    """Return a mapping from image ID to a consecutive integer, cached on disk.

    If ``save_path`` already exists, the mapping stored there is loaded and
    returned as-is; ``image_ids`` is ignored in that case. Otherwise each ID
    is numbered 0, 1, 2, ... in the order given, the mapping is written to
    ``save_path`` as JSON, and then returned.

    Args:
        image_ids: Sequence of hashable, unique image identifiers.
        save_path: Path of the JSON cache file to read or create.

    Returns:
        dict mapping each image ID to its integer ID.

    Raises:
        AssertionError: If ``image_ids`` contains duplicates (only checked
            when the cache file does not exist yet).
    """
    if os.path.exists(save_path):
        # Use a context manager so the file handle is closed deterministically.
        with open(save_path, 'r') as rf:
            return json.load(rf)

    # Explicit raise (instead of ``assert``) so the check survives ``python -O``.
    if len(set(image_ids)) != len(image_ids):
        raise AssertionError("image_ids must not contain duplicates")

    id_mapping = {image_id: int_id for int_id, image_id in enumerate(image_ids)}

    with open(save_path, 'w') as wf:
        json.dump(id_mapping, wf)

    return id_mapping


def prepare_caption_gt_files(json_data, save_path, caption_id=0):
    """Write a ground-truth caption file and return the next unused caption ID.

    The output JSON has two keys: ``annotations`` (one entry per caption with
    ``image_id``, ``caption`` and a running ``id``) and ``images`` (one
    ``{'id': image_id}`` entry per input item).

    Args:
        json_data: Iterable of dicts with keys ``'caption'`` (a single caption
            or a list/tuple of captions) and ``'image_id'``.
        save_path: Destination JSON path; missing parent directories are
            created. A bare file name (no directory part) is also accepted.
        caption_id: First annotation ID to assign; lets several calls share
            one continuous ID sequence.

    Returns:
        The caption ID following the last one assigned.
    """
    annotations = []
    images = []

    for entry in json_data:
        raw_caption = entry['caption']
        # Normalise to a list so single- and multi-caption items share one path.
        if isinstance(raw_caption, (list, tuple)):
            captions = raw_caption
        else:
            captions = [raw_caption]

        image_id = entry['image_id']
        for caption in captions:
            annotations.append(
                dict(
                    image_id=image_id,
                    caption=caption,
                    id=caption_id,
                )
            )
            caption_id += 1
        images.append({'id': image_id})

    gt = {
        'annotations': annotations,
        'images': images,
    }

    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when save_path actually has a directory component.
    dirname = os.path.dirname(save_path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    with open(save_path, 'w') as wf:
        json.dump(gt, wf)

    return caption_id


def pre_caption(caption, max_words=None):
    """Normalise a caption: lowercase, drop punctuation, tidy whitespace.

    The characters ``. ! " ( ) * # : ; ~`` are each replaced by a space, runs
    of two or more whitespace characters collapse to one space, and trailing
    newlines plus surrounding spaces are removed. When ``max_words`` is
    truthy, the result is cut to at most that many space-separated words.

    Args:
        caption: Raw caption string.
        max_words: Optional word limit; ``None`` or ``0`` disables truncation.

    Returns:
        The cleaned (and possibly truncated) caption.
    """
    lowered = caption.lower()
    without_punct = re.sub(r"([.!\"()*#:;~])", " ", lowered)
    collapsed = re.sub(r"\s{2,}", " ", without_punct)
    cleaned = collapsed.rstrip("\n").strip(" ")

    # No limit requested: nothing more to do.
    if not max_words:
        return cleaned

    words = cleaned.split(" ")
    if len(words) <= max_words:
        return cleaned
    return " ".join(words[:max_words])

In [2]:
from transformers import BertTokenizer

# Pretrained uncased BERT tokenizer; in this notebook it is only used to
# report the average caption length in tokens for each split.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [110]:
import json
import pandas as pd

root = "./ImageCLEFmedical_2023/Task1-Caption"

# Collect the image IDs (file name without the ".jpg" extension) of all splits.
# os.path.splitext is used because str.rstrip('.jpg') strips any trailing
# '.', 'j', 'p' or 'g' characters rather than the suffix, which would corrupt
# IDs that happen to end in one of those characters.
# sorted() makes the integer IDs reproducible (os.listdir order is arbitrary);
# the '.jpg' filter mirrors the one applied when building the test split.
image_ids = [
    os.path.splitext(fn)[0]
    for split in ['train', 'val', 'test']
    for fn in sorted(os.listdir(f"{root}/images/{split}"))
    if fn.endswith('.jpg')
]
id_mapping = prepare_id_mapping(image_ids, f"{root}/id_mapping.json")


# Running annotation ID shared by all ground-truth files written below.
caption_id = 0
for csv_split in ['train', 'valid']:
    data = pd.read_csv(f"{root}/caption_prediction_{csv_split}_labels.csv", sep='\t')

    # The label file says "valid" while the image folder and outputs use "val".
    split = 'val' if csv_split == 'valid' else csv_split

    out = []
    total_len = 0   # whitespace-separated words, summed over raw captions
    total_len2 = 0  # tokenizer tokens, summed over cleaned captions
    # Iterate the two columns directly instead of per-row data.iloc[i] lookups.
    for image_name, caption in zip(data['ID'], data['caption']):
        new_item = dict(
            image=f"images/{split}/{image_name}.jpg",
            caption=caption,
            image_id=id_mapping[image_name],
        )
        total_len += len(caption.split())
        total_len2 += len(tokenizer.tokenize(pre_caption(caption)))
        out.append(new_item)

    print(f"{split} has {len(out)} samples, average length of captions is {total_len / len(out)}, average tokens: {total_len2 / len(out)}")
    print(out[-1])
    print('------')

    with open(f"{root}/{split}.json", 'w') as wf:
        json.dump(out, wf)

    # Ground-truth caption files are only written for non-training splits.
    if split != 'train':
        caption_id = prepare_caption_gt_files(out, f'{root}/{split}_gt.json', caption_id)


# The test split has no captions: record only image path and integer ID.
split = 'test'
out = []
for fn in os.listdir(f"{root}/images/{split}"):
    if not fn.endswith('.jpg'):
        continue

    new_item = dict(
        image=f"images/{split}/{fn}",
        # Same suffix-safe ID extraction as used when building id_mapping.
        image_id=id_mapping[os.path.splitext(fn)[0]],
    )
    out.append(new_item)

with open(f"{root}/{split}.json", 'w') as wf:
    json.dump(out, wf)

train has 60918 samples, average length of captions is 20.81668800682885, average tokens: 31.03590071899931
{'image': 'images/train/ImageCLEFmedical_Caption_2023_train_060918.jpg', 'caption': 'Aortic angiogram showing small aneurysm involving the aorta and left subclavian artery (red arrow head).', 'image_id': 60917}
------
val has 10437 samples, average length of captions is 22.44361406534445, average tokens: 33.59863945578231
{'image': 'images/val/ImageCLEFmedical_Caption_2023_valid_010437.jpg', 'caption': '12-year-old boy with known case of acute lymphoblastic leukemia, presented with fever for 4 days, conjunctivitis, maculopapular rash, hypotension and cardiogenic shock he was ventilated due respiratory distress, his COVID status was PCR swab positive , COVID IgM negative COVID IgG positive , ; axial chest CT shows extensive consolidation implicating the left lung (CT severity score= 13). Note the associated pleural effusion on both sides (asterisk). The patient was on ventilatory 