In [None]:
# Root working directory; the download cell creates it and places the COCO files under BASE/coco.
BASE = "/content/coco_fruits"


In [None]:
import os, subprocess, pathlib

os.makedirs(BASE, exist_ok=True)
os.makedirs(f"{BASE}/coco", exist_ok=True)

# Install pycocotools
!pip -q install pycocotools tqdm

# Download train2017 images (18GB). Comment this if already present.
if not pathlib.Path(f"{BASE}/coco/train2017.zip").exists():
    !wget -q http://images.cocodataset.org/zips/train2017.zip -O "{BASE}/coco/train2017.zip"

# Download annotations
if not pathlib.Path(f"{BASE}/coco/annotations_trainval2017.zip").exists():
    !wget -q http://images.cocodataset.org/annotations/annotations_trainval2017.zip -O "{BASE}/coco/annotations_trainval2017.zip"

# Unzip
if not pathlib.Path(f"{BASE}/coco/train2017").exists():
    !unzip -q "{BASE}/coco/train2017.zip" -d "{BASE}/coco/"

if not pathlib.Path(f"{BASE}/coco/annotations").exists():
    !unzip -q "{BASE}/coco/annotations_trainval2017.zip" -d "{BASE}/coco/"


In [None]:
# ====== CONFIG ======
# Every path below derives from BASE; edit BASE to relocate the whole pipeline.
BASE = "/content/coco_fruits"  # or your Drive path
# Output root: the export loop creates one sub-folder per class under it.
ROOT_OUT = f"{BASE}/coco_fruits_by_class"
COCO_DIR = f"{BASE}/coco"  # where train2017 + annotations live (from your previous step)
# COCO instance-annotation JSON and the image folder the exporter copies from.
ANN = f"{COCO_DIR}/annotations/instances_train2017.json"
IMAGES = f"{COCO_DIR}/train2017"

# COCO category names to export; each becomes its own single-class dataset.
CLASSES = ["banana", "apple", "orange"]
PER_CLASS_MAX = 1000          # images per class
# Fraction of the selected images sent to the val split (never fewer than one).
VAL_RATIO = 0.10
# Passed to random.seed() so the shuffle and split are repeatable.
RANDOM_SEED = 42
# =====================

import json, os, random, shutil
from pathlib import Path
from collections import defaultdict
from tqdm import tqdm

random.seed(RANDOM_SEED)
Path(ROOT_OUT).mkdir(parents=True, exist_ok=True)

# Parse the COCO annotation file once; everything below is derived from it.
with open(ANN, "r") as ann_file:
    data = json.load(ann_file)

# Two-way lookup between COCO category ids and names.
cat_id_to_name = {}
for category in data["categories"]:
    cat_id_to_name[category["id"]] = category["name"]
name_to_cat_id = {cat_name: cat_id for cat_id, cat_name in cat_id_to_name.items()}

# Ids of the requested classes; names unknown to COCO are silently left out.
target_cat_ids = {name_to_cat_id[wanted] for wanted in CLASSES if wanted in name_to_cat_id}

# Image metadata (file name, width, height, ...) keyed by image id.
imginfo = {}
for image_record in data["images"]:
    imginfo[image_record["id"]] = image_record

# Group the usable annotations by image: crowd annotations are skipped and an
# annotation must have a non-empty segmentation and a target category.
anns_by_img = defaultdict(list)
for ann in data["annotations"]:
    is_crowd = ann.get("iscrowd", 0) == 1
    if not is_crowd and ann.get("segmentation") and ann["category_id"] in target_cat_ids:
        anns_by_img[ann["image_id"]].append(ann)

# For every class, list the images that contain at least one instance of it.
images_for_class = {cls_name: [] for cls_name in CLASSES}
for image_id, image_anns in anns_by_img.items():
    names_present = {cat_id_to_name[ann["category_id"]] for ann in image_anns}
    for cls_name in CLASSES:
        if cls_name in names_present:
            images_for_class[cls_name].append(image_id)

def coco_polys_from_ann(ann):
    """Return the usable polygons of one COCO annotation.

    Only list-type segmentations are considered; any other type (for example a
    dict) yields an empty list. Within the list, an entry is kept when it is
    itself a list with at least 6 values, i.e. three x/y pairs.
    """
    segmentation = ann.get("segmentation")
    if not isinstance(segmentation, list):
        return []
    polygons = []
    for ring in segmentation:
        if isinstance(ring, list) and len(ring) >= 6:
            polygons.append(ring)
    return polygons

def poly_to_yolo_line(poly, cls_id, W, H):
    """Format one polygon as a YOLO segmentation label line.

    Args:
        poly: flat coordinate list [x0, y0, x1, y1, ...] in pixels.
        cls_id: class id written as the first token of the line.
        W, H: image width and height in pixels, used to normalise x and y.

    Returns:
        "<cls_id> x0 y0 x1 y1 ..." with every coordinate normalised and
        printed with six decimals.

    Normalised values are clamped to [0, 1]: a vertex that lies slightly
    outside the image would otherwise produce an out-of-range coordinate in
    the label file.
    """
    coords = []
    # zip() pairs x with y; a trailing unpaired value is ignored.
    for x, y in zip(poly[0::2], poly[1::2]):
        coords.append(min(max(x / W, 0.0), 1.0))
        coords.append(min(max(y / H, 0.0), 1.0))
    return f"{cls_id} " + " ".join(f"{v:.6f}" for v in coords)

# Export one single-class YOLO-seg dataset per entry of CLASSES.
for cname in CLASSES:
    # De-duplicate (keeping first-seen order), shuffle, then cap the sample.
    selected = list(dict.fromkeys(images_for_class[cname]))
    random.shuffle(selected)
    selected = selected[:PER_CLASS_MAX]

    # The first val_n shuffled ids go to val (at least one), the rest to train.
    val_n = max(1, int(len(selected) * VAL_RATIO))
    val_ids, train_ids = set(selected[:val_n]), set(selected[val_n:])

    # Output layout: <ROOT_OUT>/<class>/{images,labels}/{train,val}
    for split in ["train", "val"]:
        for kind in ("images", "labels"):
            Path(f"{ROOT_OUT}/{cname}/{kind}/{split}").mkdir(parents=True, exist_ok=True)

    # YOLO class index: only this class = 0
    class_index = {cname: 0}

    def export_one(img_id, split):
        """Copy one image into `split` and write its label file for `cname` only.

        Nothing is written when the image yields no usable polygon for the class.
        """
        meta = imginfo[img_id]
        width, height = meta["width"], meta["height"]
        file_name = Path(meta["file_name"]).name
        out_img = Path(f"{ROOT_OUT}/{cname}/images/{split}") / file_name
        out_lbl = Path(f"{ROOT_OUT}/{cname}/labels/{split}") / (Path(file_name).stem + ".txt")

        # Annotations of other classes on the same image are ignored; every
        # kept polygon is written with class id 0.
        lines = [
            poly_to_yolo_line(poly, 0, width, height)
            for ann in anns_by_img[img_id]
            if cat_id_to_name[ann["category_id"]] == cname
            for poly in coco_polys_from_ann(ann)
        ]
        if not lines:
            return
        shutil.copy2(Path(IMAGES) / file_name, out_img)
        out_lbl.write_text("\n".join(lines), encoding="utf-8")

    print(f"Exporting {cname} (train: {len(train_ids)}, val: {len(val_ids)})...")
    for split_name, split_ids in (("train", train_ids), ("val", val_ids)):
        for img_id in tqdm(split_ids):
            export_one(img_id, split_name)

    # Minimal data.yaml describing this single-class dataset.
    yaml_text = f"""# {cname} (COCO subset) for YOLO segmentation
path: {Path(f"{ROOT_OUT}/{cname}").resolve()}
train: images/train
val: images/val
names: ['{cname}']
"""
    Path(f"{ROOT_OUT}/{cname}/data.yaml").write_text(yaml_text, encoding="utf-8")

print("Done. Root:", ROOT_OUT)


Exporting banana (train: 900, val: 100)...


100%|██████████| 900/900 [00:03<00:00, 229.16it/s]
100%|██████████| 100/100 [00:00<00:00, 280.14it/s]


Exporting apple (train: 900, val: 100)...


100%|██████████| 900/900 [00:02<00:00, 325.47it/s]
100%|██████████| 100/100 [00:00<00:00, 351.74it/s]


Exporting orange (train: 900, val: 100)...


100%|██████████| 900/900 [00:02<00:00, 374.31it/s]
100%|██████████| 100/100 [00:00<00:00, 398.72it/s]

Done. Root: /content/coco_fruits/coco_fruits_by_class





In [None]:

# 2) Choose a destination inside Drive (change the folder name if you want)
DEST = "/content/drive/MyDrive/3fruits"

# 3) Copy everything over (keeps subfolders and timestamps)
!mkdir -p "$DEST"
!rsync -ah --progress "/content/Strawberry" "$DEST/"


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Strawberry/train/images/fresa_375_jpg.rf.fc26e223c5fafdda226f55b7e2b2dfef.jpg
         45.59K 100%  947.31kB/s    0:00:00 (xfr#1346, ir-chk=2289/3643)
Strawberry/train/images/fresa_376_jpg.rf.16a3cd6612a804bea43c1f1020c40bb6.jpg
         39.86K 100%  505.52kB/s    0:00:00 (xfr#1347, ir-chk=2288/3643)
Strawberry/train/images/fresa_376_jpg.rf.bea465679691d2edcbbbbafc55f742cf.jpg
         40.33K 100%  315.06kB/s    0:00:00 (xfr#1348, ir-chk=2287/3643)
Strawberry/train/images/fresa_376_jpg.rf.ff0d20cc7a2db6d97ba5702f701111ce.jpg
         40.38K 100%  266.45kB/s    0:00:00 (xfr#1349, ir-chk=2286/3643)
Strawberry/train/images/fresa_377_jpg.rf.82df42497d6e3fd534ba18198cb34891.jpg
         41.67K 100%  239.36kB/s    0:00:00 (xfr#1350, ir-chk=2285/3643)
Strawberry/train/images/fresa_377_jpg.rf.8b729174c094eed13f6bdc55dfe0ca25.jpg
         39.48K 100%  199.78kB/s    0:00:00 (xfr#1351, ir-chk=2284/3643)
Strawberry/train/images/fresa

In [None]:
!pip install roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="SYEm6T8HXuCwRk6VO9Up")
project = rf.workspace("sannanabbasi").project("strawberry-gzxcf-pgnod")
version = project.version(3)
dataset = version.download("yolov11")


Collecting roboflow
  Downloading roboflow-1.2.9-py3-none-any.whl.metadata (9.7 kB)
Collecting idna==3.7 (from roboflow)
  Downloading idna-3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting opencv-python-headless==4.10.0.84 (from roboflow)
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting pi-heif<2 (from roboflow)
  Downloading pi_heif-1.1.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (6.5 kB)
Collecting pillow-avif-plugin<2 (from roboflow)
  Downloading pillow_avif_plugin-1.5.2-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting filetype (from roboflow)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Downloading roboflow-1.2.9-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.7/88.7 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading idna-3.7-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━

Downloading Dataset Version Zip in strawberry-3 to yolov11:: 100%|██████████| 92828/92828 [00:01<00:00, 55799.04it/s]





Extracting Dataset Version Zip to strawberry-3 in yolov11:: 100%|██████████| 3852/3852 [00:00<00:00, 6580.92it/s]


In [None]:
!pip install roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="SYEm6T8HXuCwRk6VO9Up")
project = rf.workspace("sannanabbasi").project("avocado-segmentation-wqss9")
version = project.version(3)
dataset = version.download("yolov11")


loading Roboflow workspace...
loading Roboflow project...
Exporting format yolov11 in progress : 85.0%
Version export complete for yolov11 format


Downloading Dataset Version Zip in Avocado-Segmentation-3 to yolov11:: 100%|██████████| 75507/75507 [00:01<00:00, 63515.69it/s]





Extracting Dataset Version Zip to Avocado-Segmentation-3 in yolov11:: 100%|██████████| 3172/3172 [00:00<00:00, 6753.02it/s]


In [None]:
# 2) CONFIG — point to your Roboflow YAML
from pathlib import Path
# Dataset YAML to filter; relative split paths in it are resolved against its folder.
YAML_PATH = Path("/content/strawberry-3/data.yaml")  # <-- CHANGE THIS

# Optional: if you want to remove images that end up with no labels
# (when True, both the image and its now-empty label file are deleted).
DELETE_EMPTY_LABEL_IMAGES = False

# 3) Filter out 'strawberry-stem' and keep only 'strawberry' (class 0)
import yaml, os, shutil

with open(YAML_PATH, "r") as f:
    y = yaml.safe_load(f)

# `names` may be a list (index == class id) or a mapping {class_id: name}.
# Both are reduced to (class_id, name) pairs so the ids used below are the
# ones that actually appear in the label files. A missing or null entry is
# treated as "no names".
raw_names = y.get("names") or []
if isinstance(raw_names, dict):
    id_name_pairs = sorted((int(k), str(v)) for k, v in raw_names.items())
else:
    id_name_pairs = list(enumerate(str(n) for n in raw_names))

names = [class_name for _, class_name in id_name_pairs]
if not names:
    raise RuntimeError("No 'names' found in YAML.")

# First occurrence wins when a name is listed more than once.
name_to_id = {}
for class_id, class_name in id_name_pairs:
    name_to_id.setdefault(class_name, class_id)

# Find indices
if "strawberry" not in name_to_id:
    raise RuntimeError("'strawberry' not found in names list.")
straw_idx = name_to_id["strawberry"]

# stem_idx stays None when the dataset has no 'strawberry-stem' class.
stem_idx = name_to_id.get("strawberry-stem")
if stem_idx is None:
    print("Note: 'strawberry-stem' not in names; nothing to remove.")

# Resolve split image/label dirs from YAML (train/val/test keys may vary)
def resolve_img_dir(key):
    """Return the image directory stored under `key` in the loaded YAML.

    Returns None when the YAML has no such key. Absolute paths are returned
    unchanged. Relative paths are resolved against the folder containing
    YAML_PATH; if that location does not exist, the entry is retried with its
    leading ".." components removed. The returned path is not guaranteed to
    exist; callers check that themselves.

    NOTE(review): the retry exists because the recorded run of this cell saw 0
    images although the split folders sit next to data.yaml, which suggests
    the YAML entries look like "../train/images". Confirm against the export.
    """
    if key not in y:
        return None
    p = Path(y[key])
    if p.is_absolute():
        return p

    base = YAML_PATH.parent
    direct = (base / p).resolve()
    if direct.exists():
        return direct

    # Drop leading ".." segments and look next to the YAML instead.
    trimmed = list(p.parts)
    while trimmed and trimmed[0] == "..":
        trimmed.pop(0)
    if trimmed:
        alternative = base.joinpath(*trimmed).resolve()
        if alternative.exists():
            return alternative

    # Nothing exists: keep the original answer so callers see the same path.
    return direct

# Map each split present in the YAML to (images_dir, labels_dir).
splits = {}
for k in ["train", "val", "valid", "test"]:
    img_dir = resolve_img_dir(k)
    if img_dir and img_dir.exists():
        # YOLO convention: labels folder mirrors images. Only the LAST path
        # component that is exactly "images" is swapped, so parents that merely
        # start with "images" (e.g. "images_v2") are left alone.
        parts = list(img_dir.parts)
        for i in range(len(parts) - 1, -1, -1):
            if parts[i] == "images":
                parts[i] = "labels"
                break
        lbl_dir = Path(*parts)
        if not lbl_dir.exists():
            print(f"Warning: labels dir not found for {k}: {lbl_dir}")
        splits[k] = (img_dir, lbl_dir)

# Run-wide counters; filter_label_file updates the last three through `global`.
total_files = 0
removed_lines = 0
kept_lines = 0
emptied = 0

def filter_label_file(lbl_path):
    """Rewrite one YOLO label file so that only strawberry rows remain.

    Rows whose class id equals `straw_idx` are kept and remapped to class 0.
    Rows of class `stem_idx`, rows of any other class and rows whose first
    token is not a number are dropped and counted in `removed_lines`.

    Updates the module-level counters `removed_lines`, `kept_lines` and
    `emptied`.

    Returns:
        (emptied?, remaining_count); (False, 0) when the file does not exist.
    """
    global removed_lines, kept_lines, emptied
    if not lbl_path.exists():
        return False, 0  # (emptied?, remaining_count)

    out = []
    for raw in lbl_path.read_text(encoding="utf-8").splitlines():
        parts = raw.split()
        if not parts:
            continue  # blank line: nothing to keep or count
        try:
            cls = int(float(parts[0]))
        except (ValueError, OverflowError):
            # Malformed class token: the row is dropped, so count it as removed.
            removed_lines += 1
            continue
        # Drop stems
        if stem_idx is not None and cls == stem_idx:
            removed_lines += 1
            continue
        # Keep strawberries, remap to 0
        if cls == straw_idx:
            parts[0] = "0"
            out.append(" ".join(parts))
            kept_lines += 1
        else:
            # Any unexpected class -> drop
            removed_lines += 1

    # Write back (empty file allowed = negative image)
    lbl_path.write_text("\n".join(out), encoding="utf-8")
    if len(out) == 0:
        emptied += 1
        return True, 0
    return False, len(out)

# Walk every split that has a labels folder and filter the label of each image.
for split, (img_dir, lbl_dir) in splits.items():
    if lbl_dir.exists():
        print(f"Processing split: {split}")
        # Collect all images first (recursively, by extension), then process them.
        img_files = [
            found
            for pattern in ("*.jpg","*.jpeg","*.png","*.bmp","*.webp")
            for found in img_dir.rglob(pattern)
        ]
        for img_p in img_files:
            total_files += 1
            lbl_p = lbl_dir / (img_p.stem + ".txt")
            emptied_flag, _remaining = filter_label_file(lbl_p)

            if not (DELETE_EMPTY_LABEL_IMAGES and emptied_flag):
                continue
            # Optional clean-up: the label is now empty, so remove the pair.
            try:
                img_p.unlink(missing_ok=True)
                lbl_p.unlink(missing_ok=True)
            except Exception as e:
                print("Delete error:", e)

# 4) Update YAML to single-class strawberry.
# The YAML is only rewritten when images were actually processed. Rewriting it
# after an empty run would discard the original class names while the label
# files still use the original class ids.
if total_files == 0:
    print("No images were found for any split; labels and YAML left untouched.\n"
          f"Check the split paths in: {YAML_PATH}")
else:
    y["names"] = ["strawberry"]
    y["nc"] = 1
    with open(YAML_PATH, "w") as f:
        yaml.safe_dump(y, f, sort_keys=False)

    print("Done.\n"
          f"Images seen: {total_files}\n"
          f"Label lines kept (strawberry): {kept_lines}\n"
          f"Label lines removed (stem/others): {removed_lines}\n"
          f"Empty label files after filter: {emptied}\n"
          f"YAML updated to nc=1, names=['strawberry'] at: {YAML_PATH}")


Done.
Images seen: 0
Label lines kept (strawberry): 0
Label lines removed (stem/others): 0
Empty label files after filter: 0
YAML updated to nc=1, names=['strawberry'] at: /content/strawberry-3/data.yaml


In [None]:


# === CONFIG: point to your YAML ===
from pathlib import Path
YAML_PATH = Path("/content/strawberry-3/data.yaml")  # <-- change if different
# When True, images whose label file ends up empty are deleted with the label.
DELETE_EMPTY_LABEL_IMAGES = False

import yaml, os, re
from glob import glob

# Dataset root = folder containing the YAML; split folders are searched under it.
ROOT = YAML_PATH.parent.resolve()

# Load YAML (still update to single-class later)
with open(YAML_PATH, "r") as f:
    y = yaml.safe_load(f)

# --- 1) Find real split folders robustly ---
def find_split_images_dir(split_name: str):
    """
    Locate the images directory of one split; returns a Path or None.

    Lookup order:
      1. the path stored in the YAML under `split_name` (relative entries are
         resolved against ROOT), if it exists on disk;
      2. ROOT/<split_name>/images;
      3. the first directory named "images" anywhere under ROOT whose parent
         folder name, lower-cased, equals `split_name`.
    """
    if split_name in y:
        yaml_entry = Path(y[split_name])
        if yaml_entry.is_absolute():
            from_yaml = yaml_entry
        else:
            from_yaml = (ROOT / yaml_entry).resolve()
        if from_yaml.exists():
            return from_yaml

    conventional = ROOT / split_name / "images"
    if conventional.exists():
        return conventional

    # Last resort: recursive search below ROOT.
    for found in glob(str(ROOT / "**/images"), recursive=True):
        if Path(found).parent.name.lower() == split_name:
            return Path(found)
    return None

# Probe every known split name and keep the ones that resolve to a folder.
splits_order = ["train", "valid", "val", "test"]
img_dirs = {}
for split_key in splits_order:
    located = find_split_images_dir(split_key)
    if located is not None:
        img_dirs[split_key] = located

if len(img_dirs) == 0:
    raise RuntimeError("Could not locate any split images directories under "
                       f"{ROOT}. Please check the dataset structure.")

print("Discovered images dirs:")
for split_key, located in img_dirs.items():
    print(f"  {split_key}: {located}")

# Labels are expected as a sibling of images: .../<split>/labels
label_dirs = {}
for split, img_dir in img_dirs.items():
    sibling_labels = img_dir.parent / "labels"
    if sibling_labels.exists():
        label_dirs[split] = sibling_labels

if len(label_dirs) == 0:
    raise RuntimeError("No labels directories found next to images. "
                       "Expected .../<split>/labels for each split.")

print("Discovered label dirs:")
for split_key, located in label_dirs.items():
    print(f"  {split_key}: {located}")

# --- 2) Scrub labels: keep only class 0; drop all others ---
# This works even if names included 'strawberry-stem' previously,
# because we force everything except class 0 to be removed.

def rewrite_label(lbl_path: Path):
    """Rewrite a YOLO label file in place, keeping only class-0 rows.

    Rows of any other class, and rows whose first token is not a number, are
    dropped. Blank lines are ignored and not counted.

    Returns:
        (kept, removed, is_empty): number of rows kept, number dropped, and
        whether the rewritten file has no rows. (0, 0, False) when the file
        does not exist.
    """
    if not lbl_path.exists():
        return (0, 0, False)

    out = []
    kept = removed = 0
    for raw in lbl_path.read_text(encoding="utf-8").splitlines():
        parts = raw.split()
        if not parts:
            continue
        # first token is class id (int)
        try:
            cls = int(float(parts[0]))
        except (ValueError, OverflowError):
            # Only parse failures are treated as "malformed row"; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            removed += 1
            continue
        if cls == 0:
            # keep strawberry as class 0
            out.append(" ".join(["0"] + parts[1:]))
            kept += 1
        else:
            # drop any other class (e.g., 1 = strawberry-stem)
            removed += 1
    lbl_path.write_text("\n".join(out), encoding="utf-8")
    return (kept, removed, len(out) == 0)

# Rewrite every label file of every split and accumulate the statistics.
kept_total = removed_total = emptied_total = files_total = 0
for split, lbl_dir in label_dirs.items():
    for lbl_path in lbl_dir.rglob("*.txt"):
        files_total += 1
        k, r, emptied = rewrite_label(lbl_path)
        kept_total += k
        removed_total += r
        if emptied and DELETE_EMPTY_LABEL_IMAGES:
            # Remove the image (whatever its extension) together with the
            # empty label. Failures are reported instead of silently ignored,
            # and do not abort the run.
            stem = lbl_path.stem
            img_dir = img_dirs[split]
            for ext in (".jpg",".jpeg",".png",".bmp",".webp"):
                img_path = img_dir / f"{stem}{ext}"
                try:
                    img_path.unlink(missing_ok=True)
                except OSError as e:
                    print("Delete error:", e)
            try:
                lbl_path.unlink(missing_ok=True)
            except OSError as e:
                print("Delete error:", e)
            emptied_total += 1

print(f"\nProcessed label files: {files_total}")
print(f"Kept class-0 (strawberry) lines: {kept_total}")
print(f"Removed non-zero class lines: {removed_total}")
print(f"Deleted empty-label images: {emptied_total} (DELETE_EMPTY_LABEL_IMAGES={DELETE_EMPTY_LABEL_IMAGES})")

# --- 3) Normalize YAML: single-class + portable split paths ---
# The YAML keeps all of its other keys; names/nc are forced to one class and
# the split entries are rewritten relative to ROOT.
new_yaml = {**y, "names": ["strawberry"], "nc": 1}

def rel_from_root(p: Path):
    """Return `p` relative to ROOT as a string with forward slashes."""
    relative = p.relative_to(ROOT)
    return str(relative).replace("\\", "/")

# Prefer 'train' and 'val' keys. If only 'valid' was discovered, it feeds 'val'.
val_source = "val" if "val" in img_dirs else "valid"
for yaml_key, split_key in (("train", "train"), ("val", val_source), ("test", "test")):
    if split_key in img_dirs:
        new_yaml[yaml_key] = rel_from_root(img_dirs[split_key])

# Write back
with open(YAML_PATH, "w") as yaml_out:
    yaml.safe_dump(new_yaml, yaml_out, sort_keys=False)

print("\nUpdated YAML written to:", YAML_PATH)
print("Final YAML keys:", list(new_yaml.keys()))

# --- 4) Verify no non-zero classes remain anywhere ---
# A line is an offender when its first token starts with a non-zero integer.
# `[1-9]\d*` consumes the whole integer before the word boundary, so
# multi-digit ids such as 10 or 23 are caught as well; `[1-9]\b` alone cannot
# match them because there is no boundary between two digits.
nonzero_class = re.compile(r'^\s*[1-9]\d*\b', flags=re.M)

offenders = []
for split, lbl_dir in label_dirs.items():
    for p in lbl_dir.rglob("*.txt"):
        txt = p.read_text(encoding="utf-8")
        if nonzero_class.search(txt):
            offenders.append(str(p))

print("\nVerification:", "CLEAN ✅ (only class 0 remains)" if not offenders else "Found non-zero classes ❌")
if offenders:
    print("\nSample offenders:")
    print("\n".join(offenders[:20]))


Discovered images dirs:
  train: /content/strawberry-3/train/images
  valid: /content/strawberry-3/valid/images
  test: /content/strawberry-3/test/images
Discovered label dirs:
  train: /content/strawberry-3/train/labels
  valid: /content/strawberry-3/valid/labels
  test: /content/strawberry-3/test/labels

Processed label files: 1920
Kept class-0 (strawberry) lines: 3225
Removed non-zero class lines: 1492
Deleted empty-label images: 0 (DELETE_EMPTY_LABEL_IMAGES=False)

Updated YAML written to: /content/strawberry-3/data.yaml
Final YAML keys: ['train', 'val', 'test', 'nc', 'names', 'roboflow']

Verification: CLEAN ✅ (only class 0 remains)


In [None]:
# Mount Drive only if your dataset is on Drive (skip if not needed)
# from google.colab import drive
# drive.mount('/content/drive')

from pathlib import Path
import yaml, re
from glob import glob

# === CONFIG: your YAML path ===
YAML_PATH = Path("/content/Avocado-Segmentation-3/data.yaml")  # <-- keep or change

# Dataset root = folder containing the YAML; split folders are searched under it.
ROOT = YAML_PATH.parent.resolve()

# Parsed YAML; read by find_split_images_dir and copied into the rewritten YAML.
with open(YAML_PATH, "r") as f:
    y = yaml.safe_load(f)

def find_split_images_dir(split_name: str):
    """Return the images folder for `split_name`, or None when none is found.

    Tried in order: the YAML entry (relative entries are resolved against
    ROOT), then ROOT/<split_name>/images, then the first "images" folder under
    ROOT whose parent folder name, lower-cased, equals `split_name`.
    """
    if split_name in y:
        declared = Path(y[split_name])
        if not declared.is_absolute():
            declared = (ROOT / declared).resolve()
        if declared.exists():
            return declared

    default_location = ROOT / split_name / "images"
    if default_location.exists():
        return default_location

    matches = glob(str(ROOT / "**/images"), recursive=True)
    same_split = [Path(m) for m in matches if Path(m).parent.name.lower() == split_name]
    if same_split:
        return same_split[0]
    return None

# Resolve each split name; names that resolve to nothing are dropped.
splits_order = ["train", "val", "valid", "test"]
img_dirs = {}
for split_key in splits_order:
    found_dir = find_split_images_dir(split_key)
    if found_dir is not None:
        img_dirs[split_key] = found_dir
if len(img_dirs) == 0:
    raise RuntimeError(f"Could not locate any images dirs under {ROOT}")

# labels dirs = sibling of images dir
label_dirs = {}
for split, img_dir in img_dirs.items():
    sibling_labels = img_dir.parent / "labels"
    if sibling_labels.exists():
        label_dirs[split] = sibling_labels

if len(label_dirs) == 0:
    raise RuntimeError("No labels directories found. Expected .../<split>/labels")

def rewrite_label(lbl_path: Path):
    """Rewrite a YOLO label file in place, forcing every row to class 0.

    All rows with a numeric first token are kept, with that token replaced by
    "0". Rows whose first token is not a number are dropped. Blank lines are
    ignored and not counted.

    Returns:
        (kept, removed, is_empty): number of rows kept, number dropped, and
        whether the rewritten file has no rows. (0, 0, False) when the file
        does not exist.
    """
    if not lbl_path.exists():
        return (0, 0, False)

    out = []
    kept = removed = 0
    for raw in lbl_path.read_text(encoding="utf-8").splitlines():
        parts = raw.split()
        if not parts:
            continue
        try:
            int(float(parts[0]))  # old class id (0/1/2); only validated here
        except (ValueError, OverflowError):
            # Only parse failures mark a row as malformed; other exceptions
            # (e.g. KeyboardInterrupt) are allowed to propagate.
            removed += 1
            continue
        # force single-class: set class id to 0 and keep the rest of coords
        parts[0] = "0"
        out.append(" ".join(parts))
        kept += 1
    lbl_path.write_text("\n".join(out), encoding="utf-8")
    return (kept, removed, len(out) == 0)

# Relabel every .txt file under every labels folder and total the counts.
files_total = kept_total = removed_total = emptied_total = 0
for lbl_dir in label_dirs.values():
    for lbl_path in lbl_dir.rglob("*.txt"):
        files_total += 1
        kept_here, removed_here, is_empty = rewrite_label(lbl_path)
        kept_total += kept_here
        removed_total += removed_here
        if is_empty:
            emptied_total += 1

print(f"Processed label files: {files_total}")
print(f"Kept lines (now class 0): {kept_total}")
print(f"Dropped malformed/blank lines: {removed_total}")

# Update YAML to single-class + normalize paths.
# Existing keys are kept; names/nc are overridden and the split entries are
# rewritten relative to ROOT.
new_yaml = dict(y)
new_yaml.update({"names": ["avocado"], "nc": 1})

def rel_from_root(p: Path):
    """Return `p` relative to ROOT as a forward-slash string."""
    as_text = str(p.relative_to(ROOT))
    return as_text.replace("\\","/")

if "train" in img_dirs:
    new_yaml["train"] = rel_from_root(img_dirs["train"])
# 'val' is taken from the 'val' split when found, otherwise from 'valid'.
val_key = "val" if "val" in img_dirs else ("valid" if "valid" in img_dirs else None)
if val_key is not None:
    new_yaml["val"] = rel_from_root(img_dirs[val_key])
if "test" in img_dirs:
    new_yaml["test"] = rel_from_root(img_dirs["test"])

with open(YAML_PATH, "w") as yaml_out:
    yaml.safe_dump(new_yaml, yaml_out, sort_keys=False)

# Verify no non-zero class ids remain.
# `[1-9]\d*` consumes the whole integer before the word boundary, so
# multi-digit ids (10, 23, ...) are detected too; `[1-9]\b` alone cannot match
# them because there is no boundary between two digits.
nonzero_class = re.compile(r'^\s*[1-9]\d*\b', flags=re.M)

offenders = []
for split, lbl_dir in label_dirs.items():
    for p in lbl_dir.rglob("*.txt"):
        txt = p.read_text(encoding="utf-8")
        if nonzero_class.search(txt):
            offenders.append(str(p))

print("Verification:", "CLEAN ✅ (only class 0 remains)" if not offenders else "Found non-zero classes ❌")
if offenders:
    print("\nSample offenders:\n", "\n".join(offenders[:20]))


Processed label files: 1580
Kept lines (now class 0): 2882
Dropped malformed/blank lines: 0
Verification: CLEAN ✅ (only class 0 remains)


In [None]:
import os, shutil, re
from pathlib import Path
from glob import glob
from tqdm import tqdm
import yaml

# ==== CONFIG ====
# Where to write the merged dataset
# (images/<split> and labels/<split> are created underneath it).
MERGED_OUT = Path("/content/drive/MyDrive/3fruits/All_Fruits")  # change if you like

# Class order in the final dataset (this defines class IDs 0..4)
# The merge loop uses each name's position in this list as its new class id.
CLASS_ORDER = ["apple","banana","orange","avocado","strawberry"]

# Map each class to its existing single-class YAML
# Update any paths if yours differ.
# A class whose YAML file is missing is skipped with a warning.
CLASS_YAMLS = {
    "apple":      "/content/drive/MyDrive/3fruits/apple/data.yaml",
    "banana":     "/content/drive/MyDrive/3fruits/banana/data.yaml",
    "orange":     "/content/drive/MyDrive/3fruits/orange/data.yaml",
    "avocado":    "/content/drive/MyDrive/3fruits/Avocado/data.yaml",
    "strawberry": "/content/drive/MyDrive/3fruits/Strawberry/data.yaml",
}

# If the source dataset has a "test" split, where should we put it?
# Options: None (skip), "val" (merge test into val), or "test" (keep as test).
MERGE_TEST_TO = "val"
# =================

# Prepare merged folders: train and val always, test only when it is kept.
merged_splits = ["train", "val"]
if MERGE_TEST_TO == "test":
    merged_splits.append("test")
for split in merged_splits:
    for kind in ("images", "labels"):
        (MERGED_OUT / kind / split).mkdir(parents=True, exist_ok=True)

def load_yaml(p: Path):
    """Parse the YAML file at `p` with yaml.safe_load and return the result."""
    with open(p, "r") as handle:
        parsed = yaml.safe_load(handle)
    return parsed

def resolve_img_dir(y, root: Path, split_name: str):
    """Find the images folder of `split_name` for the dataset rooted at `root`.

    Tried in order: the entry stored in `y` (relative entries are resolved
    against `root`), then root/<split_name>/images, then the first "images"
    folder under `root` whose parent folder name, lower-cased, equals
    `split_name`. Returns None when nothing matches.
    """
    if split_name in y:
        declared = Path(y[split_name])
        if not declared.is_absolute():
            declared = (root / declared).resolve()
        if declared.exists():
            return declared

    # common layouts
    conventional = root / split_name / "images"
    if conventional.exists():
        return conventional

    # recursive: any .../<split>/images under root
    for match in glob(str(root / "**/images"), recursive=True):
        candidate = Path(match)
        if candidate.parent.name.lower() == split_name:
            return candidate
    return None

# One running counter per class; merged files are named <class>_<counter> so
# files coming from different source datasets cannot collide.
name_counters = dict.fromkeys(CLASS_ORDER, 0)

def next_dst_name(cls: str, ext: str):
    """Advance the counter of `cls` and return "<cls>_<6-digit counter><ext lower-cased>"."""
    serial = name_counters[cls] + 1
    name_counters[cls] = serial
    return f"{cls}_{serial:06d}{ext.lower()}"

def remap_and_copy_split(cls_name: str, src_img_dir: Path, dst_split: str, new_cls_id: int):
    """Copy images; rewrite labels to new class id. Keeps empty labels as negatives.

    Every image found recursively under `src_img_dir` is copied into
    MERGED_OUT/images/<dst_split> under a fresh name from next_dst_name(). Its
    label is read from the sibling "labels" folder, and each row gets its first
    token replaced by `new_cls_id`. Rows whose first token is not numeric are
    dropped and counted, the same check rewrite_and_copy applies. An image
    without a label file gets an empty label file.

    Returns:
        (files, kept_lines, removed_lines); (0, 0, 0) when `src_img_dir` is
        falsy or its labels folder is missing.
    """
    if not src_img_dir:
        return 0,0,0
    src_lbl_dir = src_img_dir.parent / "labels"
    if not src_lbl_dir.exists():
        print(f"[{cls_name}] Warning: labels dir missing for {src_img_dir}")
        return 0,0,0

    kept_lines = removed_lines = files = 0
    imgs = []
    for ext in ("*.jpg","*.jpeg","*.png","*.bmp","*.webp"):
        imgs += list(src_img_dir.rglob(ext))

    for img_path in tqdm(imgs, desc=f"{cls_name}:{dst_split}", leave=False):
        files += 1
        lbl_src = src_lbl_dir / f"{img_path.stem}.txt"

        # Choose a unique destination filename
        dst_name = next_dst_name(cls_name, img_path.suffix)
        dst_img = MERGED_OUT / "images" / dst_split / dst_name
        dst_lbl = MERGED_OUT / "labels" / dst_split / (Path(dst_name).stem + ".txt")

        # Copy image
        shutil.copy2(img_path, dst_img)

        out = []
        if lbl_src.exists():
            for raw in lbl_src.read_text(encoding="utf-8").splitlines():
                parts = raw.split()
                if not parts:
                    continue  # blank line
                # The first token must be a class id; anything else is not a
                # label row and must not be turned into one by the remap.
                if not re.fullmatch(r"-?\d+(\.\d+)?", parts[0]):
                    removed_lines += 1
                    continue
                # old class id is ignored and replaced by the global id
                # (works for seg and det)
                parts[0] = str(new_cls_id)
                out.append(" ".join(parts))
                kept_lines += 1
        # An empty `out` (no label file or no valid rows) yields an empty
        # label file, which keeps the image as a negative sample.
        dst_lbl.write_text("\n".join(out), encoding="utf-8")

    return files, kept_lines, removed_lines

# Merge loop: one pass per class, in CLASS_ORDER.
total_files = 0
for cls in CLASS_ORDER:
    yaml_path = Path(CLASS_YAMLS[cls])
    if not yaml_path.exists():
        print(f"⚠️ YAML not found for {cls}: {yaml_path} (skipping)")
        continue

    y = load_yaml(yaml_path)
    root = yaml_path.parent.resolve()
    new_cls_id = CLASS_ORDER.index(cls)

    # Resolve all source folders up front; val falls back to a "valid" folder.
    img_train = resolve_img_dir(y, root, "train")
    img_val = resolve_img_dir(y, root, "val")
    if not img_val:
        img_val = resolve_img_dir(y, root, "valid")
    img_test = resolve_img_dir(y, root, "test")

    f1, k1, _ = remap_and_copy_split(cls, img_train, "train", new_cls_id)
    f2, k2, _ = remap_and_copy_split(cls, img_val,   "val",   new_cls_id)

    # The test split goes wherever MERGE_TEST_TO says, or is skipped.
    f3 = k3 = 0
    if img_test and MERGE_TEST_TO in ("val", "test"):
        split_name = MERGE_TEST_TO
        if split_name == "test" and not (MERGED_OUT/"images/test").exists():
            for kind in ("images/test", "labels/test"):
                (MERGED_OUT/kind).mkdir(parents=True, exist_ok=True)
        f3, k3, _ = remap_and_copy_split(cls, img_test, split_name, new_cls_id)

    total_files += (f1+f2+f3)
    print(f"[{cls}] train:{f1} val:{f2} test->{MERGE_TEST_TO}:{f3} (labels kept: {k1+k2+k3})")

print(f"\n✅ Merge complete. Total images copied: {total_files}")

# Write unified data.yaml (saved as fruits.yaml inside MERGED_OUT).
unified_yaml = dict(
    path=str(MERGED_OUT.resolve()),
    train="images/train",
    val="images/val",
    names=CLASS_ORDER,
)
# include test only if created
has_test_split = (MERGED_OUT/"images/test").exists()
if has_test_split:
    unified_yaml["test"] = "images/test"

unified_yaml_path = MERGED_OUT/"fruits.yaml"
with open(unified_yaml_path, "w") as yaml_out:
    yaml.safe_dump(unified_yaml, yaml_out, sort_keys=False)

print("Unified YAML:", unified_yaml_path)




[apple] train:900 val:100 test->val:0 (labels kept: 3884)




[banana] train:900 val:100 test->val:0 (labels kept: 4429)




[orange] train:900 val:100 test->val:0 (labels kept: 3889)




[avocado] train:1437 val:95 test->val:48 (labels kept: 2882)


                                                               

[strawberry] train:1764 val:104 test->val:52 (labels kept: 3225)

✅ Merge complete. Total images copied: 6500
Unified YAML: /content/drive/MyDrive/3fruits/All_Fruits/fruits.yaml




In [None]:
from pathlib import Path
import yaml, shutil, re

# ==== CONFIG ====
MERGE_ROOT = Path("/content/drive/MyDrive/3fruits/Yolo_seg2")  # unified dataset root
UNIFIED_YAML = MERGE_ROOT / "fruits.yaml"                      # unified yaml path

# Candidate locations of the avocado dataset root; the first one that exists
# is used. Only one candidate is listed at present; add alternatives here.
CANDIDATES = [Path("/content/drive/MyDrive/3fruits/Avocado/Avocado")]
AVO_ROOT = next((p for p in CANDIDATES if p.exists()), None)
# Fail early when no candidate exists or the chosen one has no data.yaml.
assert AVO_ROOT and (AVO_ROOT / "data.yaml").exists(), "Avocado folder or data.yaml not found."

# Desired unified class order (edit if your order differs)
UNIFIED_NAMES = ["apple", "banana", "orange", "avocado", "strawberry"]
# Class whose id (its position in the unified names list) is written into every merged label row.
TARGET_CLASS = "avocado"
# =================

def load_or_create_unified_yaml():
    """Load UNIFIED_YAML (or start from an empty mapping), complete it, and save it.

    Ensures `path`, `train`, `val` and `names` are present, converts a
    dict-style `names` ({id: name}) to a list ordered by id, appends any entry
    of UNIFIED_NAMES that is missing, sets `nc` to the number of names, writes
    the file back and returns the mapping.

    The names list is always a fresh list: UNIFIED_NAMES itself is never
    stored in the mapping, so appending to the result cannot modify the
    module-level constant.
    """
    MERGE_ROOT.mkdir(parents=True, exist_ok=True)
    y = {}
    if UNIFIED_YAML.exists():
        with open(UNIFIED_YAML, "r") as f:
            y = yaml.safe_load(f) or {}

    # Ensure minimal fields
    y.setdefault("path", str(MERGE_ROOT))
    y.setdefault("train", "images/train")
    y.setdefault("val", "images/val")

    # Normalize names to a simple list (a missing or null entry becomes []).
    names = y.get("names")
    if isinstance(names, dict):
        # convert {0:name0,1:name1,...} to list
        names = [name for _, name in sorted(names.items(), key=lambda kv: int(kv[0]))]
    else:
        names = list(names or [])

    # Ensure desired order (append any missing)
    for n in UNIFIED_NAMES:
        if n not in names:
            names.append(n)
    y["names"] = names
    y["nc"] = len(names)

    # Write back
    with open(UNIFIED_YAML, "w") as f:
        yaml.safe_dump(y, f, sort_keys=False)
    return y

def find_split_images_dir(root: Path, yml: dict, split: str):
    """Locate the images folder of `split` under `root`; returns a Path or None.

    Tried in order:
      1. the entry stored in `yml` (relative entries are resolved against `root`);
      2. root/<split>/images, then root/<alias>/images, where "val" is aliased
         to "valid" and every other split is its own alias;
      3. the first folder named "images" anywhere under `root` whose parent
         folder name, lower-cased, is the split or its alias.
    """
    alias = "valid" if split == "val" else split

    # 1) if YAML has split path, use it (relative to root if needed)
    if split in yml:
        declared = Path(yml[split])
        if not declared.is_absolute():
            declared = (root / declared).resolve()
        if declared.exists():
            return declared

    # 2) common patterns
    for folder_name in (split, alias):
        conventional = root / folder_name / "images"
        if conventional.exists():
            return conventional

    # 3) recursive search: .../<split>/images
    accepted_parents = (split, alias)
    return next(
        (Path(hit) for hit in root.rglob("images") if hit.parent.name.lower() in accepted_parents),
        None,
    )

def label_dir_for(img_dir: Path):
    """Return the "labels" folder next to `img_dir`, or None when it does not exist."""
    labels = img_dir.parent.joinpath("labels")
    if labels.exists():
        return labels
    return None

# Load/ensure unified YAML and look up the class id that merged rows will get.
unified = load_or_create_unified_yaml()
names = unified["names"]
if TARGET_CLASS not in names:
    # Register the class at the end of the list and persist the change.
    names.append(TARGET_CLASS)
    unified["names"] = names
    unified["nc"] = len(names)
    with open(UNIFIED_YAML, "w") as yaml_out:
        yaml.safe_dump(unified, yaml_out, sort_keys=False)
target_id = names.index(TARGET_CLASS)

# Load avocado YAML
with open(AVO_ROOT / "data.yaml", "r") as yaml_in:
    avo_yaml = yaml.safe_load(yaml_in)

# Locate avocado split dirs
split_map = {"train":"train", "val":"val", "valid":"val", "test":"val"}  # send test -> val
img_dirs = {}
# find_split_images_dir treats "val" and "valid" as aliases, so both names can
# resolve to the same folder. Each physical folder is kept once (under the
# first split name that reaches it) so its images are not processed and
# counted twice by the merge loop.
seen_dirs = set()
for s in ["train","val","valid","test"]:
    p = find_split_images_dir(AVO_ROOT, avo_yaml, s)
    if not p:
        continue
    resolved = p.resolve()
    if resolved in seen_dirs:
        continue
    seen_dirs.add(resolved)
    img_dirs[s] = p

assert img_dirs, "No avocado images dirs found."

# Prepare unified dirs: destination image/label folders for train and val.
dst_img_train = MERGE_ROOT / "images" / "train"
dst_lbl_train = MERGE_ROOT / "labels" / "train"
dst_img_val = MERGE_ROOT / "images" / "val"
dst_lbl_val = MERGE_ROOT / "labels" / "val"
for d in (dst_img_train, dst_lbl_train, dst_img_val, dst_lbl_val):
    d.mkdir(parents=True, exist_ok=True)

def rewrite_and_copy(img_path: Path, src_lbl_dir: Path, dst_img_dir: Path, dst_lbl_dir: Path, prefix: str):
    """Copy one image plus its label file into the unified dataset, remapping class ids.

    The label file ``<src_lbl_dir>/<image stem>.txt`` is read line by line; on
    every line whose first token looks numeric, that token is replaced with the
    module-level ``target_id``. Blank lines and lines with a non-numeric first
    token are dropped. Both outputs are named ``<prefix>___<image stem>`` so
    files coming from different sources cannot collide.

    Args:
        img_path: Source image file.
        src_lbl_dir: Directory holding the source label files.
        dst_img_dir: Destination directory for the copied image.
        dst_lbl_dir: Destination directory for the rewritten label file.
        prefix: Tag prepended to the output base name.

    Returns:
        True when the image was copied and a label file written; False when the
        source label is missing or contains no usable line (nothing is written).
    """
    stem = img_path.stem
    label_in = src_lbl_dir / f"{stem}.txt"
    if not label_in.exists():
        return False  # images without labels are skipped

    remapped = []
    for raw in label_in.read_text(encoding="utf-8").splitlines():
        tokens = raw.split()
        if not tokens:
            # blank / whitespace-only line
            continue
        # the first token must be a class id; anything else is discarded
        if re.match(r"^-?\d+(\.\d+)?$", tokens[0]) is None:
            continue
        remapped.append(" ".join([str(target_id)] + tokens[1:]))
    if not remapped:
        return False

    # Image first, then the label, both under the prefixed base name.
    tagged = f"{prefix}___{stem}"
    shutil.copy2(img_path, dst_img_dir / f"{tagged}{img_path.suffix}")
    (dst_lbl_dir / f"{tagged}.txt").write_text("\n".join(remapped), encoding="utf-8")
    return True

# Do the merge: walk every discovered avocado split and copy its labelled
# images into the unified split chosen by split_map.
image_exts = {".jpg",".jpeg",".png",".bmp",".tif",".tiff"}
kept = 0
for split, img_dir in img_dirs.items():
    lbl_dir = label_dir_for(img_dir)
    if lbl_dir is None:
        print(f"[avocado] Warning: labels dir missing for {img_dir}")
        continue

    target_split = split_map[split]
    if target_split == "train":
        dimg, dlbl = dst_img_train, dst_lbl_train
    else:
        dimg, dlbl = dst_img_val, dst_lbl_val

    # Snapshot the image list before copying anything.
    images = [p for p in img_dir.iterdir() if p.suffix.lower() in image_exts]
    for im in images:
        copied = rewrite_and_copy(im, lbl_dir, dimg, dlbl, prefix="avocado")
        if copied:
            kept += 1

print(f"✅ Avocado merge done. Images copied with labels: {kept}")
print("Unified YAML:", UNIFIED_YAML)


✅ Avocado merge done. Images copied with labels: 1342
Unified YAML: /content/drive/MyDrive/3fruits/Yolo_seg2/fruits.yaml


In [None]:
import os, glob

# Sanity check: for every class/split, count the files in images/<split> and
# report how many of them have no same-named .txt file in labels/<split>.
root = "/content/drive/MyDrive/3fruits"
classes = ["apple","banana","orange","avocado","strawberry"]

for cls in classes:
    for split in ["train","val"]:
        img_dir = os.path.join(root, cls, "images", split)
        lab_dir = os.path.join(root, cls, "labels", split)
        if not os.path.isdir(img_dir):
            # Report instead of skipping silently, so a class that was never
            # checked is not mistaken for a class that passed the check.
            print(f"[{cls}] Missing {img_dir}")
            continue
        if not os.path.isdir(lab_dir):
            print(f"[{cls}] Missing {lab_dir}")
            continue
        imgs = sorted(glob.glob(os.path.join(img_dir, "*.*")))
        missing = []
        for im in imgs:
            name = os.path.splitext(os.path.basename(im))[0]
            if not os.path.exists(os.path.join(lab_dir, name + ".txt")):
                missing.append(name)
        # Show at most five offending names to keep the output short.
        print(f"[{cls}] {split}: {len(imgs)} imgs, {len(missing)} without labels",
              ("-> e.g. " + ", ".join(missing[:5])) if missing else "")


[apple] train: 900 imgs, 0 without labels 
[apple] val: 100 imgs, 0 without labels 
[banana] train: 900 imgs, 0 without labels 
[banana] val: 100 imgs, 0 without labels 
[orange] train: 900 imgs, 0 without labels 
[orange] val: 100 imgs, 0 without labels 


In [None]:
from pathlib import Path
import shutil

# Relocate the orange validation labels from labels/val to val/labels.
src = Path("/content/drive/MyDrive/3fruits/orange/labels/val")
dst = Path("/content/drive/MyDrive/3fruits/orange/val/labels")

if src.is_dir():
    dst.mkdir(parents=True, exist_ok=True)

    # move all files/folders inside src into dst
    # (snapshot the listing first: the directory is modified while we loop)
    for item in list(src.iterdir()):
        shutil.move(str(item), str(dst / item.name))

    # (optional) remove now-empty source directory
    try:
        src.rmdir()  # only succeeds if it's empty
    except OSError:
        pass  # best effort: leaving a non-empty source directory behind is fine
else:
    # The source is gone after a successful first run; re-running the cell
    # must be a no-op rather than a FileNotFoundError.
    print(f"Nothing to move: {src} does not exist")


In [None]:
!pip install ultralytics


Collecting ultralytics
  Downloading ultralytics-8.3.193-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.193-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.17-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.193 ultralytics-thop-2.0.17
