This notebook standardizes the dataset on which we will conduct the study by converting it into a uniform format: a dataset consisting of
six folders, each containing images and their annotations in COCO format.
A large part of this notebook documents how I personally obtained the desired dataset format from the original dataset format I had available. It's up to you to decide whether certain cells in this notebook can also help you obtain the correct dataset format!


In [None]:
import os
import json
import math
import random
import shutil
import datetime
import argparse
from pathlib import Path
import cv2
import wandb
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
from sklearn.cluster import KMeans
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchsummary import summary
import albumentations as A
from albumentations.pytorch import ToTensorV2

I had a dataset consisting of six folders, each containing images and two JSON annotation files.
The first was in COCO format and only provided the positions of the annotations, without considering the cell class (black cell or white cell).
The second, in addition to the positions, included a binary grid indicating the two possible classes: black cell or white cell.
The two classes below are designed to handle the data from these two JSON files.

In [None]:
class KeypointDataset(Dataset):
    """Lookup tables built from a COCO keypoint annotation file.

    The JSON file is parsed once at construction time and exposed through:

    * ``coco_data``: the parsed JSON document.
    * ``liste_id``: set of all annotation ids.
    * ``id_to_id_image_id``: annotation id -> id of the annotated image.
    * ``img_id_to_file``: image id -> image file name.
    * ``id_to_keypoints``: annotation id -> flat ``keypoints`` list.

    Args:
        coco_json: Path to the COCO annotation file.
        img_dir: Image folder; stored as ``self.img_dir``.
        img_list: Accepted but not used by the code shown here.
        transform: Stored as ``self.transform``.
        sigma: Stored as ``self.sigma``.
        target_size: Stored as ``self.target_size``.
    """

    def __init__(self, coco_json, img_dir, img_list=None, transform=None, sigma=2, target_size=(512,512)):
        with open(coco_json, 'r') as handle:
            self.coco_data = json.load(handle)

        # Plain configuration values, kept as given.
        self.img_dir = img_dir
        self.transform = transform
        self.sigma = sigma
        self.target_size = target_size

        # Every annotation id present in the file.
        self.liste_id = set()
        for record in self.coco_data['annotations']:
            self.liste_id.add(record['id'])

        # Annotation id -> id of the image it belongs to.
        self.id_to_id_image_id = {}
        for record in self.coco_data['annotations']:
            self.id_to_id_image_id[record['id']] = record['image_id']

        # Image id -> file name of that image.
        self.img_id_to_file = {}
        for image in self.coco_data['images']:
            self.img_id_to_file[image['id']] = image['file_name']

        # Annotation id -> its flat keypoint list.
        self.id_to_keypoints = {}
        for record in self.coco_data['annotations']:
            self.id_to_keypoints[record['id']] = record['keypoints']



In [None]:
class BinaryGrid(Dataset):
    """Lookup tables built from the annotation file that carries binary grids.

    Attributes:
        data: The parsed JSON document.
        filename_to_id_dataset: Image file name -> image id in this file.
        id_dataset_to_binary_grid: Image id -> the ``binary_grid`` stored under
            the annotation's ``datamatrix`` entry.

    Args:
        json_dataset: Path to the JSON annotation file.
    """

    def __init__(self, json_dataset):
        with open(json_dataset, 'r') as handle:
            self.data = json.load(handle)

        # File name -> image id, as declared in the "images" section.
        self.filename_to_id_dataset = {}
        for image in self.data['images']:
            self.filename_to_id_dataset[image['file_name']] = image['id']

        # Image id -> binary grid; a later annotation for the same image
        # replaces an earlier one.
        self.id_dataset_to_binary_grid = {}
        for record in self.data['annotations']:
            self.id_dataset_to_binary_grid[record['image_id']] = record['datamatrix']['binary_grid']

I copy the dataset I was given to my personal workspace so that I can later convert it to the proper format and apply my transformations and other processing steps.

In [None]:
# Work on a private copy so the dataset that was handed over is never modified.
source_dir = '../../clone/NextGenCode_Dataset/NextGenCode_ROI_Dataset'
destination_dir = 'DATASET'

# Start from a clean slate: any previous copy is deleted before copying again.
if os.path.exists(destination_dir):
    shutil.rmtree(destination_dir)
shutil.copytree(source_dir, destination_dir)

# In every sub-folder, duplicate the COCO file under the name 'annotations.json'
# (the original 'processed_coco_annotations.json' is kept).
for element in os.listdir(destination_dir):
    dossier = os.path.join(destination_dir, element)
    if not os.path.isdir(dossier):
        continue

    coco = 'processed_coco_annotations.json'
    fichier = os.path.join(dossier, coco)
    if not os.path.isfile(fichier):
        continue

    shutil.copy(fichier, os.path.join(dossier, 'annotations.json'))

In the annotation file containing the binary grids, the way of distinguishing the annotation classes was not standardized.
For example, a 1 in the grid of one image could represent a black cell, while in another image it could represent a white cell.
Therefore, it was necessary to standardize the annotation encoding method across all images. The functions below rely on the two classes defined above.

In [None]:
def grille_binaire(id, file_name):
    """Return the binary grid of the image that annotation ``id`` belongs to.

    Both annotation files of the folder are loaded on every call:
    'processed_annotations.json' (binary grids) and 'annotations.json' (COCO).
    The two files are joined through the image file name.

    Args:
        id: Annotation id in the folder's 'annotations.json'.
        file_name: Path of the dataset folder holding both JSON files.

    Returns:
        The ``binary_grid`` stored for the matching image.

    Raises:
        KeyError: If the annotation, its image or its grid cannot be found.
    """
    grids = BinaryGrid(json_dataset=os.path.join(file_name, 'processed_annotations.json'))
    coco = KeypointDataset(
        coco_json=os.path.join(file_name, 'annotations.json'),
        img_dir=file_name,
    )

    # annotation id -> image id -> file name (COCO side) ...
    image_name = coco.img_id_to_file[coco.id_to_id_image_id[id]]
    # ... file name -> image id -> grid (binary-grid side).
    return grids.id_dataset_to_binary_grid[grids.filename_to_id_dataset[image_name]]


def liste_binaire(grille_binaire):
    """Flatten a grid (an iterable of rows) into a single list, row after row.

    Args:
        grille_binaire: Iterable of iterables, e.g. a list of rows.

    Returns:
        A new list holding every cell in row-major order.
    """
    return [cell for row in grille_binaire for cell in row]

def coherent(grille_binaire):
    """Tell whether two adjacent borders of the grid are made of ones only.

    The four borders (left column, top row, right column, bottom row) are
    examined as the adjacent pairs left/top, top/right, right/bottom and
    bottom/left. A pair qualifies when the sum of its cells equals its number
    of cells, which for 0/1 grids means every cell is 1.

    Args:
        grille_binaire: Non-empty list of non-empty rows.

    Returns:
        True if at least one adjacent pair of borders qualifies, else False.

    Raises:
        IndexError: If the grid or one of its rows is empty.
    """
    left = [row[0] for row in grille_binaire]
    top = grille_binaire[0]
    right = [row[-1] for row in grille_binaire]
    bottom = grille_binaire[-1]

    # Listed clockwise so that neighbours in the tuple are adjacent borders.
    borders = (left, top, right, bottom)
    totals = [sum(border) for border in borders]
    sizes = [len(border) for border in borders]

    return any(
        totals[k] + totals[(k + 1) % 4] == sizes[k] + sizes[(k + 1) % 4]
        for k in range(4)
    )

Here, we iterate through the annotations by ID and only update the keypoints in our COCO-format JSON file to follow this distinction:
v = 2 for black cells and v = 1 for white cells.

In [None]:
# Rewrite the visibility flag of every keypoint in each folder's COCO file so
# that v = 2 marks a black cell and v = 1 marks a white cell.
dataset = destination_dir

for element in os.listdir(dataset):
    image_dir = os.path.join(dataset, element)
    path_coco_json = os.path.join(image_dir, 'annotations.json')
    path_json = os.path.join(image_dir, 'processed_annotations.json')

    # Only folders that provide both annotation files can be converted.
    if not (os.path.isfile(path_coco_json) and os.path.isfile(path_json)):
        continue

    # Parse both files once per folder and reuse the indexes for every
    # annotation, instead of re-reading them from disk for each annotation.
    X = BinaryGrid(json_dataset=path_json)
    Y = KeypointDataset(coco_json=path_coco_json, img_dir=image_dir)

    # Independent copy of the COCO document: this is what gets written back,
    # so an annotation that fails half-way leaves its stored keypoints intact.
    with open(path_coco_json, 'r') as f:
        data_set = json.load(f)

    # Annotation id -> annotation dicts carrying that id, built once so that
    # no full scan of the annotations is needed per id.
    annotations_by_id = {}
    for ann in data_set['annotations']:
        annotations_by_id.setdefault(ann['id'], []).append(ann)

    for ann_id in Y.liste_id:
        # NOTE(review): a missing image or grid raises KeyError, which is not
        # caught below and stops the whole run - confirm that this is intended.
        try:
            keypoints = Y.id_to_keypoints[ann_id]

            # annotation -> image -> file name (COCO side), then
            # file name -> image id -> grid (binary-grid side).
            filename = Y.img_id_to_file[Y.id_to_id_image_id[ann_id]]
            binary_grid = X.id_dataset_to_binary_grid[X.filename_to_id_dataset[filename]]
            cells = liste_binaire(binary_grid)

            # In a coherent grid a 1 encodes a black cell; otherwise the
            # encoding is inverted and a 1 encodes a white cell.
            ones_are_black = coherent(binary_grid)

            # Keypoints are flat (x, y, v) triplets; the k-th triplet belongs
            # to the k-th cell of the flattened grid.
            for i in range(0, len(keypoints), 3):
                is_one = cells[i // 3] == 1
                keypoints[i + 2] = 2 if is_one == ones_are_black else 1

            for ann in annotations_by_id.get(ann_id, []):
                ann['keypoints'] = keypoints
        except (IndexError, ValueError) as e:
            print(f"Error processing id {ann_id}: {e}")

    with open(path_coco_json, 'w') as f:
        json.dump(data_set, f, indent=2)

At this stage, each dataset folder contains an "annotations.json" file in COCO format that distinguishes datamatrix cell classes (v=1 for white cells, v=2 for black cells).

In case your dataset is only a single folder of images, you can execute the next cell to obtain a dataset in the required format (one folder per group, each containing its images and an "annotations.json" file). Once this format is obtained, it's up to you to see if the cells above can be useful to you. Please specify: input_folder = your image folder, input_annotations = your COCO annotations file, output_folder = the path to your future dataset, n_clusters = the number of groups (folders) to create.

In [None]:

# The four values below are intentionally left blank and must be filled in
# before running this cell (it is not valid Python until they are).

# Folder containing the images to split into groups.
input_folder =
# Path to the COCO annotation file describing those images.
input_annotations =
# Folder in which the group sub-folders will be created.
output_folder =
# Number of groups requested from the clustering step.
n_clusters =


def segment_images(input_folder, output_folder="segmented_images", n_clusters=5):
    """Split a folder of images into colour-based groups with K-means.

    Every readable image is resized to 64x64 and summarised by the mean and
    the standard deviation of each colour channel. K-means clusters these
    feature vectors, and each image is copied (the source is left untouched)
    into ``<output_folder>/groupe_<k>``, where ``k`` starts at 1.

    Args:
        input_folder: Folder whose direct children are scanned for ``.jpg``,
            ``.jpeg``, ``.png`` and ``.bmp`` files; the extension is matched
            case-insensitively. Sub-folders are not visited.
        output_folder: Destination root; created, with its parents, if missing.
        n_clusters: Requested number of groups. Reduced to the number of
            readable images when there are fewer images than groups.

    Returns:
        None. When no readable image is found, a message is printed and
        nothing is clustered or copied.
    """
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp'}

    input_path = Path(input_folder)
    output_path = Path(output_folder)
    output_path.mkdir(parents=True, exist_ok=True)

    # Filter on the lower-cased suffix rather than globbing '*.jpg' and
    # '*.JPG' separately: the double glob can list a file twice where matching
    # is case-insensitive (e.g. Windows) and misses mixed-case extensions.
    # Sorting makes the clustering input, and so the result, reproducible.
    image_files = sorted(
        path for path in input_path.iterdir()
        if path.is_file() and path.suffix.lower() in image_suffixes
    )

    features = []
    valid_images = []

    for img_path in image_files:
        try:
            img = cv2.imread(str(img_path))
            if img is None:
                # Not decodable as an image: skip it silently.
                continue

            # One row per pixel, one column per channel.
            pixels = cv2.resize(img, (64, 64)).reshape(-1, 3)
            feature = np.concatenate([pixels.mean(axis=0), pixels.std(axis=0)])
        except Exception as e:
            # Best effort: one broken file must not abort the whole run.
            print(f"Erreur avec {img_path}: {e}")
            continue

        features.append(feature)
        valid_images.append(img_path)

    if not features:
        # K-means cannot be fitted on zero samples.
        print(f"No readable image found in {input_path}; nothing to cluster.")
        return

    # There cannot be more clusters than samples.
    n_clusters = min(n_clusters, len(features))

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(np.array(features))

    for i in range(n_clusters):
        (output_path / f"groupe_{i+1}").mkdir(exist_ok=True)

    for img_path, label in zip(valid_images, labels):
        dest_folder = output_path / f"groupe_{label+1}"
        dest_path = dest_folder / img_path.name

        # Never overwrite an existing file: append _1, _2, ... to the stem
        # until the name is free.
        counter = 1
        while dest_path.exists():
            dest_path = dest_folder / f"{img_path.stem}_{counter}{img_path.suffix}"
            counter += 1

        shutil.copy2(img_path, dest_path)



# Split the images into group folders, then write in each group folder an
# 'annotations.json' restricted to the images that ended up in that folder.
segment_images(input_folder, output_folder, n_clusters)


Z = KeypointDataset(coco_json=input_annotations, img_dir=input_folder)


path_coco_json = input_annotations

filename_to_id = {filename: img_id for img_id, filename in Z.img_id_to_file.items()}

# Index the source annotations once instead of rescanning them for every file.
# For a repeated image id the first image entry wins.
image_info_by_id = {}
for img in Z.coco_data['images']:
    image_info_by_id.setdefault(img['id'], img)

annotations_by_image_id = {}
for ann in Z.coco_data['annotations']:
    annotations_by_image_id.setdefault(ann['image_id'], []).append(ann)

image_suffixes = ('.jpg', '.jpeg', '.png', '.bmp')

output_path = Path(output_folder)
for cluster_folder in output_path.iterdir():
    if not cluster_folder.is_dir():
        continue

    annotations = {
        "images": [],
        "annotations": [],
        "categories": Z.coco_data.get("categories", []),
        "info": Z.coco_data.get("info", {}),
        "licenses": Z.coco_data.get("licenses", [])
    }

    for img_file in cluster_folder.iterdir():
        if not img_file.is_file() or img_file.suffix.lower() not in image_suffixes:
            continue

        original_filename = img_file.name

        # Only when the name as-is is unknown, assume the file is a copy that
        # got a '_<counter>' suffix to avoid a collision, and strip that
        # suffix. Trying the exact name first keeps images whose real name
        # already ends in '_<digits>' (e.g. 'img_001.jpg') from being dropped.
        if original_filename not in filename_to_id:
            stem, separator, counter = img_file.stem.rpartition('_')
            if separator and counter.isdigit():
                original_filename = stem + img_file.suffix

        if original_filename not in filename_to_id:
            # Image without any entry in the source annotations: skip it.
            continue

        img_id = filename_to_id[original_filename]
        image_info = image_info_by_id.get(img_id)
        if not image_info:
            continue

        # Shallow copies, so the source document is not shared with the output.
        annotations["images"].append(image_info.copy())
        for ann in annotations_by_image_id.get(img_id, []):
            annotations["annotations"].append(ann.copy())

    output_json_path = cluster_folder / "annotations.json"
    with open(output_json_path, 'w') as f:
        json.dump(annotations, f, indent=2)



Finally, we now have a dataset consisting of six folders, each containing DataMatrix images and a COCO-format JSON file with the annotations of the DataMatrix present in the images.