In [1]:
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import torchvision

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset root; each split lives in its own sub-directory underneath it.
data_dir = '../data/roco_dataset'

# <data_dir>/<split> for the train / validation / test splits.
train_path, val_path, test_path = (
    os.path.join(data_dir, split_name)
    for split_name in ('train', 'validation', 'test')
)

# Images for each split are stored under <split>/radiology/images.
train_img_dir, val_img_dir, test_img_dir = (
    os.path.join(split_path, 'radiology', 'images')
    for split_path in (train_path, val_path, test_path)
)

In [3]:
def has_unicode(text):
    """Report whether ``text`` contains a character outside the ASCII range.

    The first character whose code point is above 127 is printed together
    with the full text, and ``True`` is returned. If every character is
    ASCII (including the empty string) nothing is printed and ``False`` is
    returned.
    """
    first_non_ascii = next((ch for ch in text if ord(ch) > 127), None)
    if first_non_ascii is None:
        return False
    print(f"unicode: {first_non_ascii}", text)
    return True

In [4]:
import collections


def _clean_caption(raw_caption):
    """Normalise one raw caption; return ``None`` when it must be dropped.

    A caption is dropped when it is not a string, contains non-ASCII
    characters, or is shorter than 10 characters after cleaning.
    """
    if not isinstance(raw_caption, str):
        # An empty CSV cell is read by pandas as float NaN, which has no .lower().
        return None
    # The captions carry a literal backslash + "n" marker (two characters), not
    # a real newline. Remove it first so that the whitespace and the final
    # period sitting in front of it can be stripped as well.
    caption = raw_caption.lower().replace("\\n", "").rstrip().rstrip(".")
    try:
        # Validation only: the result is discarded so that `caption` stays a
        # str. Re-binding it to the encoded bytes would later serialise the
        # bytes repr ("b'...'") instead of the caption text.
        caption.encode('ascii')
    except UnicodeEncodeError:
        return None
    if len(caption) < 10:
        return None
    return caption


def read_lines(img_dir, csv_file, n_lines=0):
    """Build JSON-lines records that map each image to its cleaned captions.

    Args:
        img_dir: Directory containing the image files.
        csv_file: DataFrame with a ``name`` column (image file name) and a
            ``caption`` column (raw caption text).
        n_lines: Maximum number of CSV rows to examine; ``0`` (the default)
            means all rows. Rows that end up being skipped still count
            towards this limit.

    Returns:
        A list of JSON strings, one per image that has at least one usable
        caption, each of the form
        ``{"image_path": <path>, "captions": [<caption>, ...]}``.
        Images appear in the order they are first seen in ``csv_file``.

    Rows are skipped when the image file does not exist, cannot be read
    (its path is printed in that case), or when the caption is rejected by
    ``_clean_caption``.
    """
    image_path_to_caption = collections.defaultdict(list)

    # Count rows by position: the DataFrame index label is not guaranteed to
    # be 0..N-1 (e.g. after filtering), so it cannot serve as a row counter.
    for row_number, (_, row) in enumerate(csv_file.iterrows()):
        if n_lines > 0 and row_number >= n_lines:
            break

        img_path = os.path.join(img_dir, row['name'])
        if not os.path.exists(img_path):
            continue

        try:
            # Make sure the file can actually be read.
            # NOTE(review): read_file appears to load the raw bytes without
            # decoding them, so a corrupt-but-readable image would still pass
            # -- confirm whether a decode step is wanted here.
            torchvision.io.image.read_file(img_path)
        except Exception:
            print(img_path)
            continue
        # TODO check if the image is not empty

        caption = _clean_caption(row['caption'])
        if caption is None:
            continue
        image_path_to_caption[img_path].append(caption)

    return [
        json.dumps({"image_path": img_path, "captions": captions})
        for img_path, captions in image_path_to_caption.items()
    ]


In [5]:
# Load the training split's CSV and convert it into JSON-lines records.
train_csv_path = os.path.join(train_path, 'radiology', 'traindata.csv')
train_csv = pd.read_csv(train_csv_path)
lines = read_lines(img_dir=train_img_dir, csv_file=train_csv)

/home/kaushalya/downloads/roco-dataset/train/radiology/images/PMC4240561_MA-68-291-g002.jpg


In [6]:
# Write `lines` to disk, one JSON record per line (no newline after the last).
json_dir = '../../../data'
train_json_path = os.path.join(json_dir, "train_dataset_new.json")
with open(train_json_path, "w") as out_file:
    out_file.write("\n".join(lines))

In [7]:
# Load the validation split's CSV and convert it into JSON-lines records.
# Note that this re-binds `lines`, replacing the records built earlier.
val_csv_path = os.path.join(val_path, 'radiology', 'valdata.csv')
val_csv = pd.read_csv(val_csv_path)
lines = read_lines(img_dir=val_img_dir, csv_file=val_csv)

In [9]:
# Write `lines` to disk, one JSON record per line (no newline after the last).
valid_json_path = os.path.join(json_dir, "valid_dataset_new.json")
with open(valid_json_path, "w") as out_file:
    out_file.write("\n".join(lines))

In [None]:
# Used only for creating sample dataset
# The first 45000 records become the training sample, the remainder the
# validation sample.
# NOTE(review): `lines` is whatever the most recently executed read_lines cell
# left behind -- when the notebook is run top to bottom that is the validation
# records. Confirm which records this split is meant to operate on.
train_lines, val_lines = lines[:45000], lines[45000:]

json_dir = '../../../data'
for file_name, subset in (
    ("train_dataset.json", train_lines),
    ("valid_dataset.json", val_lines),
):
    with open(os.path.join(json_dir, file_name), "w") as f:
        f.write("\n".join(subset))

In [None]:
# A sample caption
caption = " axial computed tomography scan of the pelvis showing a diffuse infiltration of the bladder wall, catheter in situ (arrow).\\n"
# The caption ends with the two literal characters backslash + "n", not a real
# newline. str.rstrip("\\n") treats its argument as a *set* of characters, so
# it would also eat a genuine trailing "n" (e.g. from "...lesion"). Remove the
# marker itself instead, the same way read_lines does.
cleaned_caption = caption.replace("\\n", "")
cleaned_caption