At this stage, you have a dataset consisting of six folders, each containing a JSON annotation file and the corresponding images.

We will restructure our dataset into a new format suitable for training, made of two folders: "TRAIN", on which our model will be trained (it contains the images from 5 of the 6 original folders and a single JSON file in COCO format combining all their annotations), and "VAL", which will be used to evaluate our model's performance after each training epoch (it contains the images and the COCO format JSON file from the one original folder not included in "TRAIN").

The ultimate objective is to implement K-fold cross-validation: we will perform 6 different training runs, rotating the "VAL" folder so that each of our original dataset folders serves as the evaluation set during exactly one training run. This lets us check that our model performs consistently across all 6 dataset configurations.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torchsummary import summary
import torchvision.models as models

import numpy as np
from PIL import Image
import cv2
from scipy.ndimage import map_coordinates
import albumentations as A
from albumentations.pytorch import ToTensorV2

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

import os
import json
import shutil
import math
import random
import datetime
import argparse

import wandb
from noise import pnoise2
from sympy.benchmarks.bench_discrete_log import data_set_2

As always, we implement a class to handle annotations in COCO format:

In [None]:
class KeypointDataset(Dataset):
    """Dataset backed by a COCO-format keypoint annotation file.

    The constructor loads the JSON file and builds lookup tables keyed by
    annotation id and image id.
    """

    def __init__(self, coco_json, img_dir, img_list=None, transform=None, sigma=2, target_size=(512,512)):
        """Load the annotation file and index its content.

        Args:
            coco_json: Path to the COCO-format JSON annotation file.
            img_dir: Stored unchanged as ``self.img_dir``.
            img_list: Accepted but neither stored nor used by this constructor.
            transform: Stored unchanged as ``self.transform``.
            sigma: Stored unchanged as ``self.sigma``.
            target_size: Stored unchanged as ``self.target_size``.
        """
        with open(coco_json, 'r') as handle:
            self.coco_data = json.load(handle)

        self.img_dir = img_dir
        self.transform = transform
        self.sigma = sigma
        self.target_size = target_size

        annotations = self.coco_data['annotations']

        # Set of every annotation id found in the file.
        self.liste_id = {entry['id'] for entry in annotations}

        # Annotation id -> id of the image the annotation belongs to.
        self.id_to_id_image_id = {
            entry['id']: entry['image_id'] for entry in annotations
        }

        # Image id -> image file name.
        self.img_id_to_file = {
            image['id']: image['file_name'] for image in self.coco_data['images']
        }

        # Annotation id -> raw keypoint list of that annotation.
        self.id_to_keypoints = {
            entry['id']: entry['keypoints'] for entry in annotations
        }

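Please set `k_iter` to the index of the training run you are on, out of the 6 runs (an integer from 0 to 5). The dataset folder at this index will be used as the evaluation set for this run.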

In [None]:
# Index of the current cross-validation run, from 0 to 5.
# The dataset folder at this index is held out as the evaluation set;
# change the value before each of the 6 training runs.
k_iter = 0

Let's create a list containing all the paths to your annotations

Please specify the path to your dataset in 'racine ='

In [None]:
# Root folder of the dataset; each sub-folder is expected to hold the images
# and an 'annotations.json' file.
racine = 'DATASET'
fichiers = []       # paths of the usable dataset sub-folders
fichiers_json = []  # path of the annotation file of each sub-folder, same order

# os.listdir returns entries in arbitrary order. Sorting makes the index ->
# folder association reproducible, so that a given k_iter always designates
# the same folder from one run to the next.
for element in sorted(os.listdir(racine)):
    chemin_dossier = os.path.join(racine, element)
    if not os.path.isdir(chemin_dossier):
        continue

    chemin_json = os.path.join(chemin_dossier, 'annotations.json')
    if not os.path.isfile(chemin_json):
        # Skip the folder entirely: appending it to 'fichiers' only would
        # shift the two lists against each other and pair folders with the
        # wrong annotation file.
        print(f"Fichier introuvable : {chemin_json}")
        continue

    fichiers.append(chemin_dossier)
    fichiers_json.append(chemin_json)

print(fichiers)
print(fichiers_json)

Let's now create, with these next two cells, two annotation files in your dataset: one corresponding to the evaluation data and the other to the training data (which is a concatenation of annotations from 5 out of 6 of your original annotation files).

In [None]:
# Split the dataset folders into one validation folder (index k_iter) and the
# remaining training folders, then seed the two annotation files in 'racine'.

nb_dossiers = len(fichiers_json)
if len(fichiers) != nb_dossiers:
    raise ValueError(
        f"{len(fichiers)} dataset folders but {nb_dossiers} annotation files: "
        "both lists must describe the same folders in the same order"
    )
if nb_dossiers < 2:
    raise ValueError(
        "at least two dataset folders are required (one for validation, "
        "one or more for training)"
    )
# A negative index would not fail: the slices below would silently put the
# validation folder in the training list as well and duplicate the others.
if not 0 <= k_iter < nb_dossiers:
    raise IndexError(
        f"k_iter must be between 0 and {nb_dossiers - 1}, got {k_iter}"
    )

fichiers_json_train = fichiers_json[:k_iter] + fichiers_json[k_iter+1:]
fichiers_train = fichiers[:k_iter] + fichiers[k_iter+1:]

fichier_json_val = fichiers_json[k_iter]
fichier_val = fichiers[k_iter]

os.makedirs(racine, exist_ok=True)

# The first training annotation file is the base into which the other
# training annotation files are merged afterwards.
destination = os.path.join(racine, "annotations_train.json")
shutil.copyfile(fichiers_json_train[0], destination)

# The validation annotations are used as they are.
destination = os.path.join(racine, "annotations_val.json")
shutil.copyfile(fichier_json_val, destination)

print(fichiers_json_train)
print(fichier_json_val)
print(fichiers_train)
print(fichier_val)


In [None]:
# Merge every remaining training annotation file into annotations_train.json,
# renumbering image and annotation ids so that they stay unique.
annotations_path = os.path.join(racine, "annotations_train.json")

with open(annotations_path, 'r', encoding='utf-8') as f:
    data_main = json.load(f)

# Make sure both lists exist so that they can be extended below.
images_main = data_main.setdefault('images', [])
annotations_main = data_main.setdefault('annotations', [])

# New ids are allocated right after the highest id already present.
max_img_id = max((img['id'] for img in images_main), default=0)
max_ann_id = max((ann['id'] for ann in annotations_main), default=0)

# The first file is already the content of annotations_train.json.
for fichier_path in fichiers_json_train[1:]:
    if not os.path.exists(fichier_path):
        print(f"Fichier introuvable : {fichier_path}")
        continue

    try:
        with open(fichier_path, 'r', encoding='utf-8') as f:
            data_to_merge = json.load(f)

        if 'images' not in data_to_merge or 'annotations' not in data_to_merge:
            print(f"Structure invalide dans : {fichier_path}")
            continue

        # Old image id (local to this file) -> new id in the merged file.
        img_id_mapping = {}
        for img in data_to_merge['images']:
            max_img_id += 1
            img_id_mapping[img['id']] = max_img_id
            img['id'] = max_img_id

        annotations_valides = []
        nb_orphelines = 0
        for ann in data_to_merge['annotations']:
            new_img_id = img_id_mapping.get(ann['image_id'])
            if new_img_id is None:
                # The annotation points to an image absent from this file.
                # Keeping its old image id could attach it to an unrelated
                # image of the merged file, so it is dropped instead.
                nb_orphelines += 1
                continue

            max_ann_id += 1
            ann['id'] = max_ann_id
            ann['image_id'] = new_img_id
            annotations_valides.append(ann)

        if nb_orphelines:
            print(f"Annotations avec image introuvable dans {fichier_path} : {nb_orphelines}")

        images_main.extend(data_to_merge['images'])
        annotations_main.extend(annotations_valides)

    except json.JSONDecodeError:
        print(f"Erreur JSON dans : {fichier_path}")
    except Exception as e:
        # Best effort: a broken file is reported and skipped, the merge goes on.
        print(f"Erreur avec {fichier_path}: {e}")

with open(annotations_path, 'w', encoding='utf-8') as f:
    json.dump(data_main, f, indent=4)



Run this next cell to have two folders, one VAL and the other TRAIN

In [None]:
# Build the final layout: the validation folder becomes VAL, the first
# training folder becomes TRAIN and the other training folders are emptied
# into TRAIN.
train_dir = os.path.join(racine, "TRAIN")
val_dir = os.path.join(racine, "VAL")

# Refuse to run over a previous result: renaming onto an existing folder
# either fails halfway or mixes two datasets.
for origine, cible in ((fichiers_train[0], train_dir), (fichier_val, val_dir)):
    if os.path.exists(cible) and not os.path.samefile(origine, cible):
        raise FileExistsError(f"{cible} already exists; remove it before running this cell")

# Check for name clashes BEFORE touching anything: two training folders
# holding a file with the same name would otherwise overwrite each other
# silently once gathered in TRAIN.
provenance = {}
for dossier in fichiers_train:
    for item in os.listdir(dossier):
        if item == 'annotations.json':
            # Present in every folder; its content is already merged into
            # annotations_train.json, so it is never moved.
            continue
        if item in provenance:
            raise FileExistsError(
                f"'{item}' exists in both {provenance[item]} and {dossier}; "
                "rename one of them (and update its annotations) before merging"
            )
        provenance[item] = dossier

os.rename(fichiers_train[0], train_dir)
os.rename(fichier_val, val_dir)

for source_dir in fichiers_train[1:]:
    for item in os.listdir(source_dir):
        if item == 'annotations.json':
            continue
        # Destinations are known to be free thanks to the check above.
        shutil.move(os.path.join(source_dir, item), os.path.join(train_dir, item))

    # Only the per-folder annotations.json can remain at this point.
    shutil.rmtree(source_dir)



Now you just need to delete all the files that are not images in "TRAIN" and "VAL", then move the annotations_train.json file into "TRAIN" and the annotations_val.json file into "VAL", and your dataset will be ready for training!