In [5]:
import collections
import json
import os
import pandas as pd
import matplotlib.pyplot as plt
import torchvision

# Name of the dataset folder. It is used as the lookup key in filepath.json and as
# the name of the output subfolder created under ../Experiments.
dataset_folder_name = 'roco-dataset'


# The dataset folder is expected to have the following structure:
# dataset_folder/
#     └── data/
#         ├── train/
#         │   ├── non-radiology/
#         │   │   ├── captions.txt
#         │   │   └── images/
#         │   └── radiology/
#         │       ├── captions.txt
#         │       └── images/
#         ├── validation/
#         │   ├── non-radiology/
#         │   │   ├── captions.txt
#         │   │   └── images/
#         │   └── radiology/
#         │       ├── captions.txt
#         │       └── images/
#         └── test/
#             ├── non-radiology/
#             │   ├── captions.txt
#             │   └── images/
#             └── radiology/
#                 ├── captions.txt
#                 └── images/
# 
# In each "images" folder, filenames of images are: "ROCO_00020.jpg", "ROCO_00027.jpg", etc...
# 
# Each line of "captions.txt" holds an image ID and its caption, separated by a tab:
# ROCO_00020	 Axial computed tomography scan of the pelvis showing a diffuse infiltration of the bladder wall, catheter in situ (arrow).
# ROCO_00027	 Postoperative anteroposterior radiograph of the pelvis.

In [6]:
# Look up where this dataset lives: filepath.json is read as a mapping that is
# indexed by dataset_folder_name. Setting dataset_dir by hand works just as well.
with open('../Experiments/filepath.json', 'r') as f:
    data = json.load(f)

# The train/validation/test splits sit inside the "data" subfolder of that location.
dataset_dir = os.path.join(data[dataset_folder_name], 'data')

# The generated dataset JSON files are stored in "Experiments/<dataset_folder_name>";
# create that subfolder on first use and report it.
folder_temp = os.path.join('../Experiments', dataset_folder_name)
if not os.path.exists(folder_temp):
    os.makedirs(folder_temp)
    print(f"The folder '{folder_temp}' has been created.")

In [7]:
def _normalize_caption(raw_caption: str):
    """
    Converts a raw caption into the string stored in the output file.

    The caption is lower-cased, trailing whitespace is removed, every literal
    backslash-n sequence (the two characters '\\' and 'n') is deleted, and trailing
    periods are stripped. Leading whitespace is left untouched. The cleaned text is
    then wrapped as "b'<text>'".

    Returns:
    The wrapped caption, or None if the caption contains non-ASCII characters or
    if the wrapped string is shorter than 10 characters.
    """
    text = raw_caption.lower().rstrip().replace("\\n", "").rstrip(".")

    try:
        text.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII captions are dropped rather than transliterated.
        return None

    wrapped = "b'" + text + "'"

    # NOTE(review): the length is measured on the wrapped string, which is 3 characters
    # longer than the caption text, so the effective cut-off is 7 caption characters.
    # Kept as-is so the generated files do not change; confirm whether 10 characters
    # of actual caption text was intended.
    if len(wrapped) < 10:
        return None

    return wrapped


def generate_json_file(dataset_folder_name: str, dataset_dir: str, train_or_val: str) -> None:
    """
    Generates a newline-delimited JSON (NDJSON) file containing image paths and corresponding captions.
    The function reads the 'captions.txt' file of the 'radiology' subset of one split and links each
    caption with the matching '<image_id>.jpg' file in the sibling 'images' directory.

    A line of 'captions.txt' is skipped when:
    - it does not consist of exactly two tab-separated fields (image ID and caption),
    - the caption contains non-ASCII characters (including bytes that are not valid UTF-8),
    - the caption, once cleaned and wrapped as "b'<caption>'", is shorter than 10 characters
      (i.e. the cleaned caption text is shorter than 7 characters),
    - the image file does not exist,
    - the image file cannot be read (its path is printed in that case).

    Parameters:
    dataset_folder_name (str): Name of the dataset folder, e.g.: 'roco-dataset'. The output file is
    written to '../Experiments/<dataset_folder_name>/<train_or_val>_dataset.json'.

    dataset_dir (str): The base directory containing the split folders ('train', 'validation', 'test').

    train_or_val (str): Which split to process: 'train', 'validation' or 'test'.
    This string is used in the paths of 'captions.txt', the image directory and the output JSON file.

    The output JSON file has each line as a separate JSON object of the format:
    {"image_path": "<path_to_image>", "captions": ["b'<caption_for_image>'"]}

    The function does not return any value.

    Raises:
    OSError (e.g. FileNotFoundError): if 'captions.txt' cannot be opened or the output file cannot be
    created. 'captions.txt' is opened first, so a missing captions file no longer creates or truncates
    the output file.
    """

    json_filepath = os.path.join('../Experiments', dataset_folder_name, f'{train_or_val}_dataset.json')
    text_file_path = os.path.join(dataset_dir, train_or_val, 'radiology', 'captions.txt')
    image_dir_path = os.path.join(dataset_dir, train_or_val, 'radiology', 'images')

    # The input is opened before the output so that a missing 'captions.txt' raises before an
    # existing output file is overwritten. The encoding is explicit so the result does not depend
    # on the platform's locale; undecodable bytes become U+FFFD, which the ASCII filter rejects.
    with open(text_file_path, 'r', encoding='utf-8', errors='replace') as file, \
            open(json_filepath, 'w', encoding='utf-8') as json_file:
        for line in file:
            # Split the line into the image ID and caption; anything but two fields is malformed.
            try:
                image_id, caption = line.strip().split('\t')
            except ValueError:
                continue

            caption = _normalize_caption(caption)
            if caption is None:
                continue

            # Construct the path to the image file and make sure it exists
            image_path = os.path.join(image_dir_path, f'{image_id}.jpg')
            if not os.path.exists(image_path):
                continue

            # NOTE(review): read_file only loads the raw bytes and does not decode them, so this
            # catches unreadable files but not corrupt image data. Decoding would be needed to
            # truly validate the image; left unchanged to keep runtime and output the same.
            try:
                torchvision.io.image.read_file(image_path)
            except Exception:
                # "except Exception" (not a bare except) so Ctrl-C / kernel interrupt still stops the run.
                print(image_path)
                continue

            # Write one JSON object per line
            data = {
                'image_path': image_path,
                'captions': [caption]  # wrap caption with a list
            }
            json_file.write(json.dumps(data) + '\n')

In [8]:
# Generate the dataset JSON file for every split, in order: train, validation, test.
# The loop variable is the notebook-level name train_or_val, which ends up as 'test'.
for train_or_val in ('train', 'validation', 'test'):
    generate_json_file(dataset_folder_name, dataset_dir, train_or_val)