In [4]:
import json
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm import tqdm

# === Paths ===
csv_path = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/train/annotations.csv")
csv_image_root = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/train/images/")
json_path = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/annotations.json")
combined_output = Path("combined_annotations.json")

# === Load JSON ===
with open(json_path) as f:
    coco_aug = json.load(f)

# Build category lookup from JSON
name_to_id = {cat["name"]: cat["id"] for cat in coco_aug["categories"]}
categories = coco_aug["categories"]

# Get current max IDs
next_image_id = max(img["id"] for img in coco_aug["images"]) + 1
next_ann_id = max(ann["id"] for ann in coco_aug["annotations"]) + 1

# === Load CSV and generate annotations ===
df = pd.read_csv(csv_path, names=["file_name", "species"])

coco_csv = {
    "images": [],
    "annotations": [],
    "categories": categories
}

print("Converting CSV image-level annotations to bounding boxes...")
for i, row in tqdm(df.iterrows(), total=len(df)):
    image_path = csv_image_root / row["file_name"]
    if not image_path.exists():
        continue

    try:
        with Image.open(image_path) as im:
            width, height = im.size
    except:
        continue

    # Register image
    image_id = next_image_id
    coco_csv["images"].append({
        "id": image_id,
        "file_name": row["file_name"],
        "width": width,
        "height": height
    })
    next_image_id += 1

    # Map species to category_id
    species = row["species"].strip()
    if species not in name_to_id:
        continue  # Skip unknown categories

    category_id = name_to_id[species]

    # Add full-image bounding box
    coco_csv["annotations"].append({
        "id": next_ann_id,
        "image_id": image_id,
        "category_id": category_id,
        "bbox": [0, 0, width, height],
        "area": width * height,
        "iscrowd": 0
    })
    next_ann_id += 1

# === Combine both ===
print("Merging datasets...")
combined = {
    "images": coco_aug["images"] + coco_csv["images"],
    "annotations": coco_aug["annotations"] + coco_csv["annotations"],
    "categories": categories
}

# === Save combined output ===
with open(combined_output, "w") as f:
    json.dump(combined, f)
print(f"Combined dataset saved to {combined_output}")


Converting CSV image-level annotations to bounding boxes...


  0%|          | 0/23700 [00:00<?, ?it/s]

100%|██████████| 23700/23700 [00:01<00:00, 13385.76it/s]


Merging datasets...
Combined dataset saved to combined_annotations.json


In [5]:
from collections import Counter
import json

with open("combined_annotations.json") as f:
    data = json.load(f)

print(f"Total images: {len(data['images'])}")
print(f"Total annotations: {len(data['annotations'])}")
print(f"Total categories: {len(data['categories'])}")

category_ids = [ann["category_id"] for ann in data["annotations"]]
print(f"Category ID distribution: {Counter(category_ids)}")


Total images: 34352
Total annotations: 46962
Total categories: 79
Category ID distribution: Counter({2: 600, 3: 600, 4: 600, 5: 600, 6: 600, 7: 600, 8: 600, 9: 600, 10: 600, 11: 600, 12: 600, 13: 600, 14: 600, 15: 600, 16: 600, 17: 600, 18: 600, 19: 600, 20: 600, 22: 600, 23: 600, 24: 600, 25: 600, 26: 600, 28: 600, 29: 600, 30: 600, 31: 600, 32: 600, 33: 600, 34: 600, 35: 600, 36: 600, 37: 600, 38: 600, 39: 600, 40: 600, 41: 600, 42: 600, 43: 600, 44: 600, 45: 600, 47: 600, 48: 600, 49: 600, 50: 600, 51: 600, 52: 600, 53: 600, 54: 600, 55: 600, 56: 600, 57: 600, 58: 600, 59: 600, 60: 600, 61: 600, 62: 600, 63: 600, 64: 600, 65: 600, 66: 600, 67: 600, 68: 600, 69: 600, 70: 600, 71: 600, 72: 600, 73: 600, 74: 600, 75: 600, 76: 600, 77: 600, 78: 600, 79: 600, 21: 599, 46: 530, 1: 527, 27: 306})


In [8]:
import shutil
from pathlib import Path
from tqdm import tqdm

# Source and destination directories
src_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset")
dst_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")

# File extensions considered as images
image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}

# Make sure destination exists
dst_dir.mkdir(parents=True, exist_ok=True)

# Copy files
print(f"Copying images from {src_dir} to {dst_dir}...")
for file in tqdm(src_dir.iterdir(), desc="Copying"):
    if file.suffix.lower() in image_extensions and file.is_file():
        shutil.move(file, dst_dir / file.name)

print("✅ Copy complete.")


Copying images from /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset to /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images...


Copying: 23703it [00:00, 37825.27it/s]

✅ Copy complete.





In [3]:
import json
from pathlib import Path
from tqdm import tqdm

# === Config ===
json_path = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/combined_annotations.json")
image_root = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/")  # Update if needed

# === Load JSON ===
with open(json_path) as f:
    data = json.load(f)

# === Validate Image Existence ===
missing = []
print(f"Checking {len(data['images'])} image paths...")
for img in tqdm(data["images"]):
    img_path = image_root / img["file_name"]
    if not img_path.is_file():
        missing.append(img["file_name"])

# === Results ===
if missing:
    print(f"\n❌ {len(missing)} missing images:")
    for fname in missing[:10]:
        print(f" - {fname}")
    if len(missing) > 10:
        print("... (truncated)")
else:
    print("\n✅ All images found.")


Checking 34352 image paths...


100%|██████████| 34352/34352 [00:00<00:00, 179555.32it/s]


✅ All images found.





In [1]:
from PIL import Image
from pathlib import Path

broken = []

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")

for image_path in image_dir.glob("*.jpg"):
    try:
        with Image.open(image_path) as img:
            img.verify()
    except Exception:
        broken.append(str(image_path))

print(f"\nFound {len(broken)} broken images.")

# Optional: delete them
for path in broken:
    Path(path).unlink()


Found 9298 broken images.


In [2]:
from pathlib import Path

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")

stats = {
    "total": 0,
    "missing": 0,
    "zero_bytes": 0,
    "tiny_files": 0,
    "ok": 0
}

for path in image_dir.glob("*.jpg"):
    stats["total"] += 1
    if not path.exists():
        stats["missing"] += 1
    elif path.stat().st_size == 0:
        stats["zero_bytes"] += 1
    elif path.stat().st_size < 1024:
        stats["tiny_files"] += 1
    else:
        stats["ok"] += 1

from pprint import pprint
pprint(stats)


{'missing': 0, 'ok': 1355, 'tiny_files': 0, 'total': 1355, 'zero_bytes': 0}


In [3]:
from collections import Counter
from pathlib import Path

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
exts = Counter(p.suffix.lower() for p in image_dir.iterdir())

print(exts)


Counter({'.png': 23699, '.jpg': 1355})


In [4]:
import json
from pathlib import Path

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
json_path = "/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/combined_annotations.json"

with open(json_path) as f:
    data = json.load(f)

json_filenames = set(Path(img["file_name"]).stem for img in data["images"])
actual_files = {p.stem: p.suffix for p in image_dir.iterdir() if p.suffix.lower() in [".jpg", ".jpeg", ".png"]}

missing = [stem for stem in json_filenames if stem not in actual_files]

print(f"{len(missing)} annotation image references are missing")


9298 annotation image references are missing


In [5]:
import json
from pathlib import Path

# Paths
image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
json_path = "/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/combined_annotations.json"
fixed_json_path = json_path.replace(".json", "_fixed.json")

# Index all actual files by stem -> suffix
actual_files = {p.stem: p.suffix.lower() for p in image_dir.iterdir() if p.is_file()}

# Load JSON
with open(json_path) as f:
    data = json.load(f)

# Fix file extensions
fixed_count = 0
missing_count = 0

for img in data["images"]:
    stem = Path(img["file_name"]).stem
    if stem in actual_files:
        img["file_name"] = stem + actual_files[stem]
        fixed_count += 1
    else:
        missing_count += 1

# Save new JSON
with open(fixed_json_path, "w") as f:
    json.dump(data, f, indent=2)

print(f"✔ Fixed extensions for {fixed_count} images.")
print(f"❗ {missing_count} image references still missing.")
print(f"✅ Saved to: {fixed_json_path}")


✔ Fixed extensions for 25054 images.
❗ 9298 image references still missing.
✅ Saved to: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/combined_annotations_fixed.json


In [6]:
from pathlib import Path

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
png_files = list(image_dir.glob("*.png"))

print(f"Found {len(png_files)} .png files. Deleting...")

for f in png_files:
    try:
        f.unlink()
        print(f"🗑️ Deleted: {f}")
    except Exception as e:
        print(f"❌ Failed to delete {f}: {e}")

print("✅ All .png files removed.")


Found 23699 .png files. Deleting...
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/3057_10290.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/63_256.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/7772_21398.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/3995_12624.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/4433_13709.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/6503_18689.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/1029_4001.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/633_2569.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/6375_18410.png
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents

In [7]:
from pathlib import Path

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
jpg_files = list(image_dir.glob("*.jpg"))

print(f"📸 Found {len(jpg_files)} .jpg files.")


📸 Found 10653 .jpg files.


In [10]:
from pathlib import Path

image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
jpg_files = list(image_dir.glob("*.jpg"))

print(f"Found {len(jpg_files)} .jpg files. Deleting...")

for f in jpg_files:
    try:
        f.unlink()
        print(f"🗑️ Deleted: {f}")
    except Exception as e:
        print(f"❌ Failed to delete {f}: {e}")

print("✅ All .jpg files removed.")


Found 134 .jpg files. Deleting...
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/e0d80527-822e-4336-a7ac-51a96e138fae.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/96602728-7ed2-478d-9b4a-dfcf4564eadd.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/2fba62bd-8f2b-4947-89e7-6d03a65fa679.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/f0677792-160e-4f63-8406-47f654752296.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/4c0ffeee-251c-4874-ba9d-0c6471c90296.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/402f2b3e-8fdd-4251-a27d-26c4dd3c90cf.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images/ed8abed6-3310-46a4-b1d8-7d46176cc3aa.jpg
🗑️ Deleted: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augmen

In [3]:
from pathlib import Path
from PIL import Image

# Source directory
image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")

# Loop through all JPG files (case-insensitive)
jpg_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.JPG"))

for jpg_path in jpg_files:
    png_path = jpg_path.with_suffix(".png")
    try:
        with Image.open(jpg_path) as img:
            img.convert("RGB").save(png_path, "PNG")
        print(f"✅ Converted: {jpg_path.name} → {png_path.name}")
    except Exception as e:
        print(f"⚠️ Failed to convert {jpg_path.name}: {e}")


✅ Converted: 18d6a773-5196-4fd8-a4d4-aab6ed14c42f.jpg → 18d6a773-5196-4fd8-a4d4-aab6ed14c42f.png
✅ Converted: aba5dfe2-f7c5-4474-84ee-fc2c3162fe37.jpg → aba5dfe2-f7c5-4474-84ee-fc2c3162fe37.png
✅ Converted: bec89ecc-cf6c-47da-96d1-b303055078a6.jpg → bec89ecc-cf6c-47da-96d1-b303055078a6.png
✅ Converted: f43bca3a-7990-497a-825f-e64ac7c30ce0.jpg → f43bca3a-7990-497a-825f-e64ac7c30ce0.png
✅ Converted: 76523c8e-4a04-4ff6-9590-e5060db4bb69.jpg → 76523c8e-4a04-4ff6-9590-e5060db4bb69.png
✅ Converted: a8d533f7-b0c3-4185-b5ac-cada7425616f.jpg → a8d533f7-b0c3-4185-b5ac-cada7425616f.png
✅ Converted: 0e074c31-ecbd-4d7f-8542-7823b2329de8.jpg → 0e074c31-ecbd-4d7f-8542-7823b2329de8.png
✅ Converted: 306e8aef-1a77-4c58-98b5-73656144816c.jpg → 306e8aef-1a77-4c58-98b5-73656144816c.png
✅ Converted: 8f656ccf-2cf7-4685-a862-f648b8d7e01f.jpg → 8f656ccf-2cf7-4685-a862-f648b8d7e01f.png
✅ Converted: b72fe167-3b82-44cf-9a26-8a1e221c1d11.jpg → b72fe167-3b82-44cf-9a26-8a1e221c1d11.png
✅ Converted: 602353d8-35e0-445

In [9]:
from pathlib import Path

# Path to image folder
image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")

# Find all .jpg and .JPG files
jpg_files = list(image_dir.glob("*.jpg")) + list(image_dir.glob("*.JPG"))

deleted_count = 0

for jpg_file in jpg_files:
    png_file = jpg_file.with_suffix(".png")
    if png_file.exists():
        try:
            jpg_file.unlink()
            deleted_count += 1
            print(f"🗑️ Deleted: {jpg_file.name}")
        except Exception as e:
            print(f"⚠️ Could not delete {jpg_file.name}: {e}")

print(f"\n✅ Deleted {deleted_count} .jpg files that had matching .png files.")


🗑️ Deleted: 18d6a773-5196-4fd8-a4d4-aab6ed14c42f.jpg
🗑️ Deleted: aba5dfe2-f7c5-4474-84ee-fc2c3162fe37.jpg
🗑️ Deleted: bec89ecc-cf6c-47da-96d1-b303055078a6.jpg
🗑️ Deleted: f43bca3a-7990-497a-825f-e64ac7c30ce0.jpg
🗑️ Deleted: 76523c8e-4a04-4ff6-9590-e5060db4bb69.jpg
🗑️ Deleted: a8d533f7-b0c3-4185-b5ac-cada7425616f.jpg
🗑️ Deleted: 0e074c31-ecbd-4d7f-8542-7823b2329de8.jpg
🗑️ Deleted: 306e8aef-1a77-4c58-98b5-73656144816c.jpg
🗑️ Deleted: 8f656ccf-2cf7-4685-a862-f648b8d7e01f.jpg
🗑️ Deleted: b72fe167-3b82-44cf-9a26-8a1e221c1d11.jpg
🗑️ Deleted: 602353d8-35e0-445e-a2d1-dd5e20f9d47e.jpg
🗑️ Deleted: efc46bd0-ca92-4946-93c2-bd64d9020dc9.jpg
🗑️ Deleted: 396fa7c9-b749-4832-8fb5-f7aee9760e9f.jpg
🗑️ Deleted: 30ae91a4-086c-43a4-9f7e-dc727176d85c.jpg
🗑️ Deleted: 56b6083d-2eae-4172-989c-69f35fb7c30b.jpg
🗑️ Deleted: e0925ddb-9708-4762-9da6-ca57488eb6aa.jpg
🗑️ Deleted: ae50d750-f54e-4ac3-9e8f-66010136a2b5.jpg
🗑️ Deleted: fb30c525-4d49-49b8-9ff8-c48ca5bf450e.jpg
🗑️ Deleted: f2bf22d3-b988-4935-ba28-77d6e64c71

In [13]:
import json
from pathlib import Path

# Paths
image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
json_path = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/annotations.json")
output_json_path = json_path.with_name(json_path.stem + "_fixed.json")

# Index all available files by stem
actual_files = {p.stem: p.name for p in image_dir.iterdir() if p.is_file() and p.suffix.lower() in [".jpg", ".png"]}

# Load annotations
with open(json_path) as f:
    data = json.load(f)

# Fix file_name fields using actual file extensions
fixed_count = 0
missing_count = 0

for img in data["images"]:
    stem = Path(img["file_name"]).stem
    if stem in actual_files:
        img["file_name"] = actual_files[stem]
        fixed_count += 1
    else:
        missing_count += 1
        print(f"⚠️ Missing image file for: {img['file_name']}")

# Save new JSON
with open(output_json_path, "w") as f:
    json.dump(data, f, indent=2)

print(f"\n✅ Fixed extensions for {fixed_count} images.")
print(f"🚫 {missing_count} images could not be matched.")
print(f"📄 Saved to: {output_json_path}")


⚠️ Missing image file for: c336937a-6757-4af1-98f7-dec9fa9ad857.jpg
⚠️ Missing image file for: ee49c208-f4ab-4544-9087-539c8af96a08.jpg
⚠️ Missing image file for: b8baa81d-d261-4bf1-86fd-f68c7d41ffc1.jpg
⚠️ Missing image file for: 22444236-89af-4a81-b6ee-af1b38ac061b.jpg
⚠️ Missing image file for: e7526bab-d1e1-4043-abeb-b3ef46e085bf.jpg
⚠️ Missing image file for: e68a68e4-cae1-4417-9e39-19766ada5ffc.jpg
⚠️ Missing image file for: 1a7fd0b6-54d9-41b8-bfb6-0df71c0f12ce.jpg
⚠️ Missing image file for: bec621f6-708d-485f-9ff2-20b54f2bacfd.jpg
⚠️ Missing image file for: 3149a8d6-590f-4c58-a471-251f1705f117.jpg
⚠️ Missing image file for: 2a382ca1-cc9d-4688-9553-af1f90c13cb3.jpg
⚠️ Missing image file for: 8b0663c3-76c5-4f7d-ba30-495189a31d41.jpg
⚠️ Missing image file for: a65eaf42-7b7d-4a3c-874d-263964f8c4cc.jpg
⚠️ Missing image file for: bf435b62-536c-4148-ae39-4ec994d36579.jpg
⚠️ Missing image file for: 2827c325-eea9-459b-a7b7-913cb4826526.jpg
⚠️ Missing image file for: f0677792-160e-4f63-84

In [14]:
import json

with open("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/combined_annotations_fixed.json") as f:
    data = json.load(f)

print(f"🔢 Total bounding boxes: {len(data['annotations'])}")


🔢 Total bounding boxes: 46962


In [19]:
import os
import json
import csv
from pathlib import Path
from PIL import Image

# Paths
image_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/images")
json_path = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/annotations.json")
output_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/")
output_img_dir = output_dir / "rois"
output_csv_path = output_dir / "annotations.csv"

# Ensure output directories exist
output_img_dir.mkdir(parents=True, exist_ok=True)

# Load annotations
with open(json_path) as f:
    data = json.load(f)

# Build category ID to label name lookup
cat_id_to_name = {cat["id"]: cat["name"] for cat in data["categories"]}

# Build image ID to actual image path, using normalized filenames
# Index all actual image files by stem (ignoring extension)
actual_files = {
    p.stem: p for p in image_dir.iterdir()
    if p.is_file() and p.suffix.lower() in [".jpg", ".jpeg", ".png"]
}

# Build id_to_path by matching stem
id_to_path = {}
for img in data["images"]:
    stem = Path(img["file_name"]).stem
    if stem in actual_files:
        id_to_path[img["id"]] = actual_files[stem]
    else:
        print(f"⚠️ Could not find file for stem: {stem}")

# Crop and save ROI images
cropped_entries = []
counter = 0
skipped = 0

for ann in data["annotations"]:
    image_id = ann["image_id"]
    category_id = ann["category_id"]
    bbox = ann["bbox"]  # [x, y, width, height]

    if image_id not in id_to_path:
        print(f"⚠️ Skipping annotation: image_id {image_id} not found on disk.")
        skipped += 1
        continue

    input_path = id_to_path[image_id]
    label = cat_id_to_name[category_id]

    try:
        with Image.open(input_path) as img:
            x, y, w, h = map(int, bbox)
            cropped = img.crop((x, y, x + w, y + h))
            out_path = output_img_dir / f"{counter:06d}_{label.replace(' ', '_')}.png"
            cropped.save(out_path, format="PNG", compress_level=0)
            cropped_entries.append([str(out_path.resolve()), label])
            counter += 1
    except Exception as e:
        print(f"⚠️ Failed to process {input_path}: {e}")
        skipped += 1

# Save labels CSV
with open(output_csv_path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["path", "label"])
    writer.writerows(cropped_entries)

# Summary
print(f"\n✅ Cropped and saved {counter} ROIs.")
print(f"📂 ROI images saved to: {output_img_dir}")
print(f"📄 Labels CSV saved to: {output_csv_path}")
print(f"🚫 Skipped {skipped} annotations due to missing or unreadable files.")


⚠️ Could not find file for stem: c336937a-6757-4af1-98f7-dec9fa9ad857
⚠️ Could not find file for stem: ee49c208-f4ab-4544-9087-539c8af96a08
⚠️ Could not find file for stem: b8baa81d-d261-4bf1-86fd-f68c7d41ffc1
⚠️ Could not find file for stem: 22444236-89af-4a81-b6ee-af1b38ac061b
⚠️ Could not find file for stem: e7526bab-d1e1-4043-abeb-b3ef46e085bf
⚠️ Could not find file for stem: e68a68e4-cae1-4417-9e39-19766ada5ffc
⚠️ Could not find file for stem: 1a7fd0b6-54d9-41b8-bfb6-0df71c0f12ce
⚠️ Could not find file for stem: bec621f6-708d-485f-9ff2-20b54f2bacfd
⚠️ Could not find file for stem: 3149a8d6-590f-4c58-a471-251f1705f117
⚠️ Could not find file for stem: 2a382ca1-cc9d-4688-9553-af1f90c13cb3
⚠️ Could not find file for stem: 8b0663c3-76c5-4f7d-ba30-495189a31d41
⚠️ Could not find file for stem: a65eaf42-7b7d-4a3c-874d-263964f8c4cc
⚠️ Could not find file for stem: bf435b62-536c-4148-ae39-4ec994d36579
⚠️ Could not find file for stem: 2827c325-eea9-459b-a7b7-913cb4826526
⚠️ Could not find fi

In [20]:
import shutil
import pandas as pd
from pathlib import Path

# Input datasets
roi_dirs = [
    Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/train/rois"),
    Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/rois")
]
csv_paths = [
    Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/train/annotations.csv"),
    Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/augment_dataset/annotations.csv")
]

# Output location
combined_dir = Path("/mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/combined/rois")
combined_csv = combined_dir.parent / "annotations.csv"
combined_dir.mkdir(parents=True, exist_ok=True)

# Processing
combined_entries = []
counter = 0

for roi_dir, csv_path in zip(roi_dirs, csv_paths):
    df = pd.read_csv(csv_path)
    for _, row in df.iterrows():
        src_path = Path(row["path"])
        label = row["label"]
        ext = src_path.suffix.lower()
        dst_name = f"{counter:06d}_{label.replace(' ', '_')}{ext}"
        dst_path = combined_dir / dst_name

        try:
            shutil.copy2(src_path, dst_path)
            combined_entries.append([str(dst_path.resolve()), label])
            counter += 1
        except Exception as e:
            print(f"⚠️ Failed to copy {src_path}: {e}")

# Save combined annotations CSV
df_combined = pd.DataFrame(combined_entries, columns=["path", "label"])
df_combined.to_csv(combined_csv, index=False)

print(f"\n✅ Combined {counter} ROI images.")
print(f"📂 Saved to: {combined_dir}")
print(f"📄 Combined CSV saved to: {combined_csv}")



✅ Combined 46632 ROI images.
📂 Saved to: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/combined/rois
📄 Combined CSV saved to: /mnt/beegfs/home/dzimmerman2021/Documents/fathomnet/combined/annotations.csv
