In [2]:
!pip install ultralytics opencv-python pillow matplotlib tqdm pandas scikit-learn

Collecting ultralytics
  Downloading ultralytics-8.3.203-py3-none-any.whl.metadata (37 kB)
Collecting opencv-python
  Using cached opencv_python-4.12.0.88-cp37-abi3-macosx_13_0_arm64.whl.metadata (19 kB)
Collecting pillow
  Downloading pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.6-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting numpy>=1.23.0 (from ultralytics)
  Downloading numpy-2.3.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting pyyaml>=5.3.1 (from ultralytics)
  Downloading pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl.metadata (2.4 kB)
Collecting requests>=2.23.0 (from ultralytics)
  Downloading request

In [41]:
import shutil
from pathlib import Path
from PIL import Image
import random

In [19]:
data_dir = Path("dataset/images")
image_out = Path("dataset/clean_img")
def is_image_ok(path):
    try:
        with Image.open(path) as im:
            im.verify()
        return True
    except Exception as e:
        return False
    
hashes = {}
good, bad = 0,0
for img in data_dir.rglob("*.jpg"):
    if not is_image_ok(img):
        print("Corrupted",img)
        bad += 1
        continue
    h = hash(img.read_bytes())
    if h in hashes:
        print("Duplicate",img, "==",hashes[h])
        bad += 1
        continue
    hashes[h] = str(img)
    shutil.copy(img,image_out / img.name)
    good += 1
print(f"Clened images saved. Good: {good}, Bad: {bad}")

Clened images saved. Good: 32823, Bad: 0


In [20]:
import hashlib
hashes = {}
for img_path in data_dir.glob("*.jpg"):
    with open(img_path, "rb") as f:
        filehash = hashlib.md5(f.read()).hexdigest()
    if filehash in hashes:
        print(f"Duplicate: {img_path} and {hashes[filehash]}")
    else:
        hashes[filehash] = img_path

In [40]:
import json
from pprint import pprint

ann_json = "dataset/annotations.json"

with open(ann_json, 'r') as f:
    ann = json.load(f)

print("Top-level keys:", ann.keys())

print("\nNumber of annotations:", len(ann['annotations']))
print("Number of categories:", len(ann['categories']))

# Show a sample annotation
print("\nSample annotation:")
pprint(ann['annotations'][0])

# Show a sample category
print("\nSample category:")
pprint(ann['categories'][0])


Top-level keys: dict_keys(['info', 'licenses', 'categories', 'annotations'])

Number of annotations: 32823
Number of categories: 8

Sample annotation:
{'altitude': 19921.6,
 'angle_phi': -0.06713105738162994,
 'angle_psi': 1.1161083340644837,
 'angle_theta': 0.06894744634628296,
 'bbox': [{'class': 1, 'height': 185, 'left': 1098, 'top': 163, 'width': 420},
          {'class': 1, 'height': 176, 'left': 1128, 'top': 421, 'width': 393},
          {'class': 0, 'height': 153, 'left': 1703, 'top': 927, 'width': 183}],
 'image_height': 1080.0,
 'image_name': 'frame_20190829091111_x_0001973.jpg',
 'image_width:': 1920.0,
 'latitude': 56.20630134795274,
 'linear_x': 0.03130074199289083,
 'linear_y': 0.028357808757573367,
 'linear_z': 0.0744575835764408,
 'longtitude': 10.18798203255313,
 'platform': 'Parrot Bebop 2',
 'time': {'day': 29,
          'hour': 9,
          'min': 11,
          'month': 8,
          'ms': 394400.0,
          'sec': 11,
          'year': 2019}}

Sample category:
'Huma

In [42]:
# Paths
data_dir = Path("dataset/images")
image_out = Path("dataset/images_subset")
image_out.mkdir(parents=True, exist_ok=True)

# Load annotations
with open("dataset/annotations.json", "r") as f:
    ann = json.load(f)

annotations = ann['annotations']

# Take 15% of dataset
subset_size = int(0.15 * len(annotations))
subset_annotations = random.sample(annotations, subset_size)

# Collect unique image names
subset_image_names = set(a['image_name'] for a in subset_annotations)

# Copy images to new folder
for img_name in subset_image_names:
    src = data_dir / img_name
    dst = image_out / img_name
    if src.exists():
        shutil.copy(src, dst)

print(f"Copied {len(subset_image_names)} images to {image_out}")

# Save new subset JSON
subset_json = {
    "info": ann["info"],
    "licenses": ann["licenses"],
    "categories": ann["categories"],
    "annotations": subset_annotations
}

with open(image_out / "annotations_subset.json", "w") as f:
    json.dump(subset_json, f, indent=2)

print("Subset annotations saved.")


Copied 4923 images to dataset/images_subset
Subset annotations saved.


In [33]:
import xml.etree.ElementTree as ET

LABELS_DIR = Path("data/yolo/labels_all")
LABELS_DIR.mkdir(parents=True, exist_ok=True)

CLASSES = ["car","truck","person"]  # adjust based on dataset
class_map = {c:i for i,c in enumerate(CLASSES)}

def voc_to_yolo(xml_file, img_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    w = int(root.find("size/width").text)
    h = int(root.find("size/height").text)
    lines = []
    for obj in root.findall("object"):
        cls = obj.find("name").text
        if cls not in class_map:# =============================================
# 5. Create train/val/test splits (only 15% of dataset)
# =============================================
SPLIT_DIR = Path("data/yolo")
for split in ["train","val","test"]:
    (SPLIT_DIR / "images" / split).mkdir(parents=True, exist_ok=True)
    (SPLIT_DIR / "labels" / split).mkdir(parents=True, exist_ok=True)

# Take only 15% of cleaned images
images = list(IMAGES_OUT.glob("*.jpg"))
random.shuffle(images)
subset_size = int(0.15 * len(images))
images = images[:subset_size]
print(f"Using {len(images)} images out of {len(list(IMAGES_OUT.glob('*.jpg')))} (~15%)")

# Split into train (80%), val (10%), test (10%)
n = len(images)
train, val, test = np.split(images, [int(.8*n), int(.9*n)])

splits = {"train":train,"val":val,"test":test}

for split, imgs in splits.items():
    for img in imgs:
        shutil.copy(img, SPLIT_DIR/"images"/split/img.name)
        lbl = LABELS_DIR / (img.stem + ".txt")
        if lbl.exists():
            shutil.copy(lbl, SPLIT_DIR/"labels"/split/lbl.name)
        else:
            (SPLIT_DIR/"labels"/split/lbl.name).write_text("")

print("Filtered and split dataset saved in data/yolo/")

            continue
        cls_id = class_map[cls]
        b = obj.find("bndbox")
        xmin, ymin, xmax, ymax = [float(b.find(x).text) for x in ["xmin","ymin","xmax","ymax"]]
        x_center = ((xmin+xmax)/2)/w
        y_center = ((ymin+ymax)/2)/h
        bw = (xmax-xmin)/w
        bh = (ymax-ymin)/h
        lines.append(f"{cls_id} {x_center:.6f} {y_center:.6f} {bw:.6f} {bh:.6f}")
    return lines

ANN_DIR = data_dir / "annotations/voc"  # adjust path
for xml in ANN_DIR.glob("*.xml"):
    img_path = image_out / (xml.stem + ".jpg")
    if not img_path.exists():
        continue
    lines = voc_to_yolo(xml, img_path)
    (LABELS_DIR / f"{xml.stem}.txt").write_text("\n".join(lines))

with open("data/yolo/classes.txt","w") as f:
    f.write("\n".join(CLASSES))

print("YOLO labels created in", LABELS_DIR)


YOLO labels created in data/yolo/labels_all
