In [None]:
# importing prerequisites
import sys
import requests
import tarfile
import os
import json
import numpy as np
from pathlib import Path
from PIL import Image, ImageDraw, ImageFont
from glob import glob
from matplotlib import pyplot as plt
import pickle
import random
import shutil
from tqdm import tqdm
from PIL import Image
%matplotlib inline

### Helper Functions

In [None]:
# def list_files(directory, file_types=['*.pkl']):
#     files = []
#     for file_type in file_types:
#         files.extend(glob(os.path.join(directory, file_type)))
#     return files

# def list_image_files(directory, file_types=['*.jpg', '*.jpeg', '*.png']):
#     return list_files(directory, file_types=file_types)

### Create file hierarchy for the project

In [None]:
!mkdir yolov8/data/train yolov8/data/val

### Dataset Download & Extract

In [2]:
# Extract dataset 
# !tar -xvzf train-0.tar.gz
# or for entire dataset
# !tar -xvzf publaynet.tar.gz

In [3]:
# Setup base paths
data_path = Path("publaynet/")
train_data_path = data_path / "train/"

assert data_path.exists()
assert train_data_path.exists()

In [None]:
# Verifying the file was extracted properly

# os.chdir(train_data_path)
# train_img_names = list_image_files(".")
# train_img_names = [train_img[2:] for train_img in train_img_names]
# print(len(train_img_names), train_img_names[:5])
# os.chdir("../..")

In [4]:
# Extract labels
# !tar -xvzf labels.tar.gz

In [5]:
# Parse the JSON file and read all the images and labels
with open(f'{data_path}/train.json', 'r') as fp:
    train_labels = json.load(fp)

# with open(f'{data_path}/val.json', 'r') as fp:
#     val_labels = json.load(fp)

print(train_labels.keys())
# print(val_labels.keys())

dict_keys(['images', 'annotations', 'categories'])


### Create split from available train images - one time operation

Normally, the full dataset has its own split.  Given we're using train-0 (1 of 7) data archives. There are no images from `val.json` in the `train-0` folder, so we're going to create our own train-validation split from the train data.

In the future, we could also try to evaluate what the distribution of the classes for each split is, but is overkill for our use-case. Good to have in mind for a production use-case.

In [8]:
# seed = 42
# random.seed(seed)

# # Shuffle the list to ensure randomness
# random.shuffle(train_img_names)

# # Split the list into 80% training and 20% validation
# split_index = int(0.8 * len(train_img_names))
# train_images = train_img_names[:split_index]
# val_images = train_img_names[split_index:]

# # Save the splits into .pkl files
# with open('train_images.pkl', 'wb') as f:
#     pickle.dump(train_images, f)

# with open('val_images.pkl', 'wb') as f:
#     pickle.dump(val_images, f)

# # Print the results to verify
# print("Training images:", len(train_images), train_images[:5])
# print("Validation images:", len(val_images), val_images[:5])

#### Load image names so we have consistency through multiple runs

In [6]:
# Load image names cell
import pickle

with open('train_images.pkl', 'rb') as f:
    train_images = pickle.load(f)

with open('val_images.pkl', 'rb') as f:
    val_images = pickle.load(f)

print("Loaded training images:", len(train_images), train_images [:5])
print("Loaded validation images:", len(val_images), val_images [:5])

Loaded training images: ['PMC4826661_00004.jpg', 'PMC3214698_00011.jpg', 'PMC4573649_00006.jpg', 'PMC4255692_00007.jpg', 'PMC5886901_00001.jpg']
Loaded validation images: ['PMC5148792_00003.jpg', 'PMC5812507_00001.jpg', 'PMC3759700_00001.jpg', 'PMC3403926_00000.jpg', 'PMC5005041_00000.jpg']


#### We have to normalize pixel values before writing bounding boxes to label files

In [17]:
def normalize_bbox(obj_x, obj_y, obj_w, obj_h, img_w, img_h, verbose=False):
    # Step 1: Calculate the center coordinates of the bounding box
    x_center = obj_x + obj_w / 2
    y_center = obj_y + obj_h / 2
    
    # Step 2: Normalize the center coordinates and dimensions
    x_center_norm = x_center / img_w
    y_center_norm = y_center / img_h
    width_norm = obj_w / img_w
    height_norm = obj_h / img_h

    if verbose:
        # Print normalized values
        print(f"x_center_norm: {x_center_norm}")
        print(f"y_center_norm: {y_center_norm}")
        print(f"width_norm: {width_norm}")
        print(f"height_norm: {height_norm}")

    return min(x_center_norm, 1), min(y_center_norm, 1), min(width_norm, 1), min(height_norm, 1)

# Example label entry (assuming class_id = 0)
class_id = 0
normalized_bbox = normalize_bbox(obj_x=150, obj_y=200, obj_w=50, obj_h=100, img_w=640, img_h=640, verbose=True)
label_entry = f"{class_id} {normalized_bbox[0]} {normalized_bbox[1]} {normalized_bbox[2]} {normalized_bbox[3]}"
print("Normalized YOLO label entry:", label_entry)


x_center_norm: 0.2734375
y_center_norm: 0.390625
width_norm: 0.078125
height_norm: 0.15625
Normalized YOLO label entry: 0 0.2734375 0.390625 0.078125 0.15625


#### Copy images to COCO-style file structure; Create labels using class_id and normalized (0-1) bounding boxes

In [24]:
!rm -rf yolov8/data/train/*.txt yolov8/data/train/*.jpg
!rm -rf yolov8/data/val/*.txt yolov8/data/val/*.jpg

**This cell below takes approx 4h for 13GB of image data, despite certain optimizations.** 

Unfortunately, we must pass through every single train image  object (335,703 train images), and then iterate through all annotations given the image_id. We then save all annotations which have been 'visited' (parsed) into a set based on their id, so that we may skip them in the future. This approach was chosed due to how the COCO dataset convention works, where we have an "image" object(s) dictionary, and multiple annotations for any given image - but they are in separate JSON sub-objects (sub-dictionaries), in no particular order, with no ability of indexing given a certain id or filename. As such, the search is linear through all sub-objects, yielding complexity of O(M * N), where M is the number of images, and N is the number of annotations. Storing parsed annotations simply reduces the number of operations performed, but will not change complexity.

In [25]:
train_img_names_set = set(train_images)
val_img_names_set = set(val_images)
output_data_path = Path("yolov8/data/")
visited_entries_ann = set()

for idx, img_obj in tqdm(enumerate(train_labels['images'])):

    file_name = img_obj['file_name']
    if file_name not in train_img_names_set.union(val_img_names_set):
        continue

    full_img_path = train_data_path / file_name
    
    if file_name in train_img_names_set:
        img_output_path = output_data_path / "train" / file_name
        label_output_path = output_data_path / "train" / str(file_name[:-3] + "txt")
    else:
        img_output_path = output_data_path / "val" / file_name
        label_output_path = output_data_path / "val" / str(file_name[:-3] + "txt")

    shutil.copy(full_img_path, img_output_path)
    
    img_width = img_obj['width']
    img_height = img_obj['height']
    image_id = img_obj['id']
    label = None
    
    for ann_obj in train_labels['annotations']:
        if ann_obj['image_id'] != image_id or ann_obj['id'] in visited_entries_ann:
            continue

        visited_entries_ann.add(ann_obj['id'])
        class_id = int(ann_obj['category_id']) - 1
        bbox = ann_obj['bbox']

        normalized_bbox = normalize_bbox(*bbox, img_width, img_height)
        # print(normalized_bbox)
        label_entry = f'{class_id} ' + f'{normalized_bbox[0]} {normalized_bbox[1]} {normalized_bbox[2]} {normalized_bbox[3]}\n'

        # Write the content to the file
        with open(label_output_path, 'a') as file:
            file.write(label_entry)

335703it [4:07:23, 22.62it/s]  


We should now have:
- Created the appropriate file structure expected by YOLO, taking the COCO conventions into account  (file hierarchy + files + `custom.yaml`)
- All names of train/validation images in `train_images.pkl` and `val_images.pkl` - for reproducibility.
- After running the above cell, we should have all images in place, alongside their labels. We are now ready for fine-tuning.