In [1]:
import json
from collections import Counter

label_file = "large_rock_dataset.json"

with open(label_file, 'r') as f:
    data = json.load(f)

splits = [tile.get('split', 'train') for tile in data['dataset']]  # Default to 'train' if missing
split_counts = Counter(splits)

for split, count in split_counts.items():
    print(f"{split.capitalize()}: {count} images")
    print(f"Percentage: {count / len(splits) * 100:.2f}%")

Train: 640 images
Percentage: 64.52%
Test: 352 images
Percentage: 35.48%


In [2]:
import os
import shutil
from typing import List, Tuple
import json


class LargeRocksDatasetV2:
    def __init__(self, image_folder: str, json_dataset: str, output_path: str):
        """
        Initialize the dataset processor
        
        Args:
            image_folder (str): Path to folder containing `.tif` images
            json_dataset (str): Path to JSON dataset file
            output_path (str): Path to save YOLOv8 formatted dataset
        """
        self.image_folder = image_folder
        self.label_file = json_dataset
        self.output_path = output_path
        
        # Define directories for train and test splits
        self.splits = ["train", "test"]
        self.image_dir = output_path
        self.label_dir = output_path
        
        # Create directories for each split
        for split in self.splits:
            os.makedirs(os.path.join(self.image_dir, split, "images"), exist_ok=True)
            os.makedirs(os.path.join(self.label_dir, split, "labels"), exist_ok=True)
    
    def _convert_bbox(self, rel_loc: Tuple[float, float], bbox_size: Tuple[int, int], img_size: Tuple[int, int]) -> List[float]:
        """
        Convert bounding box info to YOLO format: [class_id, x_center, y_center, width, height].
        
        Args:
            rel_loc (Tuple[float, float]): Relative location of the object in the image (normalized).
            bbox_size (Tuple[int, int]): Size of the bounding box in pixels.
            img_size (Tuple[int, int]): Image size (width, height).
        
        Returns:
            List[float]: Bounding box in YOLO format.
        """
        x_center, y_center = rel_loc
        width = bbox_size[0] / img_size[0]
        height = bbox_size[1] / img_size[1]
        return [0, x_center, y_center, width, height]  # class_id = 0 for rocks
    
    def process_dataset(self):
        """
        Process the dataset and convert it to YOLOv8 format with train/test splits.
        """
        # Load the annotations JSON
        with open(self.label_file, 'r') as f:
            data = json.load(f)
        
        # Iterate over each image in the dataset
        for tile in data['dataset']:
            file_name = tile['file_name']
            img_path = os.path.join(self.image_folder, file_name)
            
            # Check if the image exists
            if not os.path.exists(img_path):
                print(f"Image {img_path} not found. Skipping.")
                continue
            
            img_width, img_height = tile['width'], tile['height']
            annotations = tile.get('rocks_annotations', [])
            split = tile.get('split', "train")  # Default to 'train' if no split is specified
            
            # Ensure split is either train or test
            if split not in self.splits:
                print(f"Skipping split '{split}' for file {file_name}.")
                continue
            
            # Copy the image to the appropriate YOLO image folder
            dst_img_path = os.path.join(self.image_dir, split, "images", file_name)
            shutil.copy(img_path, dst_img_path)
            
            # Prepare labels for this image
            label_lines = []
            for annotation in annotations:
                rel_loc = annotation['relative_within_patch_location']
                bbox_size = annotation.get('bbox_size', [30, 30])  # Default bbox size to 30x30
                yolo_bbox = self._convert_bbox(rel_loc, bbox_size, (img_width, img_height))
                label_lines.append(" ".join(map(str, yolo_bbox)))
            
            # Save labels to the appropriate folder
            label_file = os.path.join(self.label_dir, split, "labels", f"{os.path.splitext(file_name)[0]}.txt")
            with open(label_file, 'w') as lf:
                lf.write("\n".join(label_lines))
        
        print(f"Dataset ({self.image_folder}) converted to YOLO format with train/test splits at {self.output_path}")



In [3]:
# Example usage
image_folder = "swissImage_50cm_patches"  # Path to image folder
label_file = "large_rock_dataset.json"  # Path to JSON annotation file
output_path = "dataset_rgb_only"  # Path to save processed dataset

rocks_dataset = LargeRocksDatasetV2(image_folder, label_file, output_path)
rocks_dataset.process_dataset()

Dataset (swissImage_50cm_patches) converted to YOLO format with train/test splits at dataset_rgb_only


In [4]:
!pip install ultralytics



In [5]:
from ultralytics.data.utils import verify_image_label
import os

# Define the required arguments
image_file = "dataset_rgb_only/train/images/2581_1126_0_2.tif"  # Path to the image file
label_file = "dataset_rgb_only/train/labels/2581_1126_0_2.txt"  # Path to the corresponding label file
prefix = "[VERIFY] "  # Optional log message prefix
keypoint = False  # Whether the labels include keypoints
num_classes = 1  # Total number of classes in the dataset
nkpt = 0  # Number of keypoints (if keypoint is True)
ndim = 0  # Number of dimensions for keypoints

# Verify the image and its label
args = (image_file, label_file, prefix, keypoint, num_classes, nkpt, ndim)
result = verify_image_label(args)

# Output the result
print("Verification Results:")
print(f"Image File: {result[0]}")
print(f"Labels: {result[1]}")
print(f"Image Shape: {result[2]}")
print(f"Segments: {result[3]}")
print(f"Keypoints: {result[4]}")
print(f"Missing Labels: {result[5]}")
print(f"Found Labels: {result[6]}")
print(f"Empty Labels: {result[7]}")
print(f"Corrupt Files: {result[8]}")
print(f"Message: {result[9]}")


Verification Results:
Image File: dataset_rgb_only/train/images/2581_1126_0_2.tif
Labels: [[          0        0.12        0.39    0.046875    0.046875]]
Image Shape: (640, 640)
Segments: []
Keypoints: None
Missing Labels: 0
Found Labels: 1
Empty Labels: 0
Corrupt Files: 0
Message: 


In [6]:
import os

def remove_duplicates_in_labels(base_dir):
    """
    Traverse the labels directory and remove duplicate lines in each label file.
    Print a message only if duplicates were removed.
    """
    subfolders = ['train/labels', 'test/labels']
    
    for subfolder in subfolders:
        labels_path = os.path.join(base_dir, subfolder)
        
        if not os.path.exists(labels_path):
            print(f"Directory not found: {labels_path}")
            continue
        
        for label_file in os.listdir(labels_path):
            file_path = os.path.join(labels_path, label_file)
            
            if not label_file.endswith('.txt'):
                continue  # Skip non-label files
            
            try:
                # Read file and remove duplicates
                with open(file_path, 'r') as f:
                    lines = f.readlines()
                
                unique_lines = list(set(lines))  # Remove duplicates
                
                # Check if duplicates were removed
                if len(lines) != len(unique_lines):
                    # Write back the unique lines
                    with open(file_path, 'w') as f:
                        f.writelines(sorted(unique_lines))  # Sorting for consistency
                    
                    print(f"Duplicates removed in file: {file_path}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

# Specify the base directory of your dataset
base_dataset_dir = 'dataset_rgb_only'

# Call the function
remove_duplicates_in_labels(base_dataset_dir)


Duplicates removed in file: dataset_rgb_only/train/labels/2588_1133_0_2.txt
Duplicates removed in file: dataset_rgb_only/train/labels/2704_1127_3_3.txt
Duplicates removed in file: dataset_rgb_only/train/labels/2588_1133_1_2.txt
Duplicates removed in file: dataset_rgb_only/train/labels/2582_1127_0_1.txt
Duplicates removed in file: dataset_rgb_only/train/labels/2598_1132_1_3.txt
Duplicates removed in file: dataset_rgb_only/train/labels/2598_1132_0_3.txt
Duplicates removed in file: dataset_rgb_only/test/labels/2626_1102_2_0.txt


In [7]:
import yaml

def write_yaml_file(output_path, dataset_path, train_path, val_path, test_path, class_names):
    """
    Write a YAML file for the dataset configuration.
    
    Args:
        output_path (str): Path to save the YAML file.
        dataset_path (str): Base path to the dataset.
        train_path (str): Path to the training data folder.
        val_path (str): Path to the validation data folder.
        test_path (str): Path to the test data folder.
        class_names (dict): Dictionary with class names (e.g., {0: 'Rock'}).
    """
    data = {
        "path": dataset_path,
        "train": train_path,
        "val": val_path,
        "test": test_path,
        "names": class_names
    }
    
    # Write the YAML file
    with open(output_path, 'w') as yaml_file:
        yaml.dump(data, yaml_file, default_flow_style=False)
    
    print(f"YAML file written to: {output_path}")

In [8]:
# Example usage
output_yaml = "data.yaml"

## You should add your own entire paths here, there seem to have been some issues in the past

dataset_path = "/Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only" 
train_path = "/Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only/train"
val_path = "/Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only/train"
test_path = "/Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only/test"


class_names = {0: "Rock"}

write_yaml_file(output_yaml, dataset_path, train_path, val_path, test_path, class_names)


YAML file written to: data.yaml


In [9]:
from ultralytics import YOLO

# Load the YAML configuration
model = YOLO('yolov8n.pt')  # Load YOLOv8

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:02<00:00, 2.21MB/s]


In [10]:
## 1. Add your own paths
## 2. change deive, mps is specifically for MacBook with M1 chip

results = model.train(data="/Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/data.yaml", epochs=1, imgsz=640, device="mps")

New https://pypi.org/project/ultralytics/8.3.43 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.40 🚀 Python-3.11.5 torch-2.5.0 MPS (Apple M2 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.pt, data=/Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/data.yaml, epochs=1, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=mps, workers=8, project=None, name=train18, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embe

[34m[1mtrain: [0mScanning /Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only/train/labels... 640 images, 320 backgrounds, 0 corrupt: 100%|██████████| 640/640 [00:00<00:00, 2044.15it/s]

[34m[1mtrain: [0mNew cache created: /Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only/train/labels.cache



[34m[1mval: [0mScanning /Users/janclevorn/Desktop/EPFL/IPEO_Project_Group_4/dataset_rgb_only/train/labels.cache... 640 images, 320 backgrounds, 0 corrupt: 100%|██████████| 640/640 [00:00<?, ?it/s]






Plotting labels to /Users/janclevorn/runs/detect/train18/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1m/Users/janclevorn/runs/detect/train18[0m
Starting training for 1 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/1       4.9G      2.763      3.655      1.569         75        640: 100%|██████████| 40/40 [01:04<00:00,  1.61s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):   0%|          | 0/20 [00:00<?, ?it/s]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):   5%|▌         | 1/20 [00:14<04:26, 14.01s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  10%|█         | 2/20 [00:22<03:12, 10.67s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  15%|█▌        | 3/20 [00:31<02:51, 10.07s/it]



  bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt]  # b, max_num_obj, h*w




                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  25%|██▌       | 5/20 [00:49<02:22,  9.49s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  30%|███       | 6/20 [00:56<02:00,  8.60s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  35%|███▌      | 7/20 [01:07<02:01,  9.32s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  40%|████      | 8/20 [01:16<01:50,  9.21s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  45%|████▌     | 9/20 [01:23<01:34,  8.57s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  50%|█████     | 10/20 [01:32<01:28,  8.80s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  55%|█████▌    | 11/20 [01:41<01:18,  8.73s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  60%|██████    | 12/20 [01:50<01:09,  8.74s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  65%|██████▌   | 13/20 [02:00<01:04,  9.15s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  70%|███████   | 14/20 [02:07<00:50,  8.46s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  75%|███████▌  | 15/20 [02:16<00:43,  8.79s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  80%|████████  | 16/20 [02:23<00:32,  8.24s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  85%|████████▌ | 17/20 [02:31<00:24,  8.19s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  90%|█████████ | 18/20 [02:40<00:16,  8.39s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  95%|█████████▌| 19/20 [02:52<00:09,  9.60s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 20/20 [03:04<00:00,  9.20s/it]


                   all        640       3152    0.00925      0.144     0.0326     0.0129

1 epochs completed in 0.070 hours.
Optimizer stripped from /Users/janclevorn/runs/detect/train18/weights/last.pt, 6.2MB
Optimizer stripped from /Users/janclevorn/runs/detect/train18/weights/best.pt, 6.2MB

Validating /Users/janclevorn/runs/detect/train18/weights/best.pt...
Ultralytics 8.3.40 🚀 Python-3.11.5 torch-2.5.0 MPS (Apple M2 Pro)
Model summary (fused): 168 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):   0%|          | 0/20 [00:00<?, ?it/s]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):   5%|▌         | 1/20 [00:06<02:03,  6.53s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  10%|█         | 2/20 [00:11<01:36,  5.38s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  15%|█▌        | 3/20 [00:16<01:28,  5.20s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  20%|██        | 4/20 [00:20<01:20,  5.03s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  25%|██▌       | 5/20 [00:25<01:15,  5.03s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  30%|███       | 6/20 [00:30<01:07,  4.85s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  35%|███▌      | 7/20 [00:35<01:04,  4.96s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  40%|████      | 8/20 [00:40<00:58,  4.87s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  45%|████▌     | 9/20 [00:44<00:52,  4.79s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  50%|█████     | 10/20 [00:49<00:47,  4.75s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  55%|█████▌    | 11/20 [00:54<00:43,  4.79s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  60%|██████    | 12/20 [00:59<00:38,  4.82s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  65%|██████▌   | 13/20 [01:04<00:33,  4.86s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  70%|███████   | 14/20 [01:09<00:29,  4.85s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  75%|███████▌  | 15/20 [01:14<00:24,  4.95s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  80%|████████  | 16/20 [01:19<00:19,  4.94s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  85%|████████▌ | 17/20 [01:23<00:14,  4.89s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  90%|█████████ | 18/20 [01:28<00:09,  4.84s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95):  95%|█████████▌| 19/20 [01:33<00:04,  4.83s/it]



                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 20/20 [01:38<00:00,  4.93s/it]


                   all        640       3152    0.00891      0.103      0.026     0.0111
Speed: 1.4ms preprocess, 7.0ms inference, 0.0ms loss, 130.3ms postprocess per image
Results saved to [1m/Users/janclevorn/runs/detect/train18[0m
