## Merging Data for Origami Images from 2 Datasets

The following are the links to the kaggle datasets:

1. https://www.kaggle.com/datasets/caokhoihuynh/orgami-works-of-some-origamists
2. https://www.kaggle.com/datasets/karthikssalian/origami-models

### Dataset 1: Remove artist names and non animal models

In [3]:
#Setup & Config
from pathlib import Path
import os

RAW_DIR   = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/original data/origami-artist")
WORK_DIR  = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work")
CLEAN_DIR = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/clean")
LOG_DIR   = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs")

for p in [WORK_DIR, CLEAN_DIR, LOG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

print("RAW_DIR :", RAW_DIR.resolve())
print("WORK_DIR:", WORK_DIR.resolve())
print("CLEAN_DIR:", CLEAN_DIR.resolve())
print("LOG_DIR :", LOG_DIR.resolve())

IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tif", ".tiff", ".gif"}

def _is_image(p: Path) -> bool:
    return p.is_file() and p.suffix.lower() in IMAGE_EXTS

def count_images(root: Path) -> int:
    root = Path(root)
    n = 0
    for base, _, files in os.walk(root):
        for f in files:
            if Path(f).suffix.lower() in IMAGE_EXTS:
                n += 1
    return n



RAW_DIR : /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/original data/origami-artist
WORK_DIR: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work
CLEAN_DIR: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/clean
LOG_DIR : /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs


In [5]:
#image counts and folder names for tracking
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tif", ".tiff", ".gif"}

def _is_image(p: Path) -> bool:
    return p.is_file() and p.suffix.lower() in IMAGE_EXTS

def count_images(root: Path) -> int:
    root = Path(root)
    n = 0
    for base, _, files in os.walk(root):
        for f in files:
            if Path(f).suffix.lower() in IMAGE_EXTS:
                n += 1
    return n

def list_all_folders(path=WORK_DIR, limit=None):
    path = Path(path)
    print(f"\nüìÅ Listing all folders in: {path.resolve()}\n")
    if not path.exists():
        print("‚ö†Ô∏è Path not found.")
        return []
    dirs = [d for d in sorted(path.iterdir(), key=lambda x: x.name.lower()) if d.is_dir()]
    for i, d in enumerate(dirs):
        if limit and i >= limit:
            print(f"... (+{len(dirs)-limit} more)")
            break
        print(f"{i+1:3d}. {d.name}")
    print(f"\nTotal folders: {len(dirs)}")
    return [d.name for d in dirs]

def count_images_in_each_folder(work_dir=WORK_DIR, exts=(".jpg",".jpeg",".png",".webp",".bmp",".gif",".tiff",".tif")):
    """
    Returns a dict {folder_name: image_count} and prints the results.
    """
    work_dir = Path(work_dir)
    results = {}
    total = 0
    for folder in sorted([p for p in work_dir.iterdir() if p.is_dir()]):
        count = 0
        for _, _, files in os.walk(folder):
            for f in files:
                if f.lower().endswith(exts):
                    count += 1
        results[folder.name] = count
        total += count

    width = max((len(name) for name in results), default=10)
    print(f"\nüìÇ Image count per folder in {work_dir.name}:")
    for name, cnt in results.items():
        print(f"  {name.ljust(width)} : {cnt}")
    print(f"\nüñºÔ∏è Total images in WORK_DIR ({work_dir.name}): {total}")

    return results


In [13]:
raw_image_count = count_images(RAW_DIR)
print(f"üñºÔ∏è Total images in RAW_DIR ({RAW_DIR.name}): {raw_image_count}")

üñºÔ∏è Total images in RAW_DIR (origami-artist): 3902


In [16]:
#create a work data folder
import shutil, os

# Clear WORK safely
if WORK_DIR.exists():
    print(f"Clearing WORK: {WORK_DIR}")
    shutil.rmtree(WORK_DIR)
WORK_DIR.mkdir(parents=True, exist_ok=True)

# Copy tree RAW ‚Üí WORK
print("Copying RAW ‚Üí WORK")
shutil.copytree(RAW_DIR, WORK_DIR, dirs_exist_ok=True)

# Confirm
def count_files_dirs(root: Path):
    n_files = 0
    n_dirs = 0
    for base, dirs, files in os.walk(root):
        n_files += len(files)
        n_dirs += len(dirs)
    return n_files, n_dirs

work_files, work_dirs = count_files_dirs(WORK_DIR)
print(f"‚úÖ WORK ready. ~{work_dirs} dirs and {work_files} files")



Clearing WORK: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work
Copying RAW ‚Üí WORK
‚úÖ WORK ready. ~1371 dirs and 3921 files


In [65]:
#directory check
from pathlib import Path

def brief_tree(root: Path, max_models_per_artist=5, max_examples_per_model=3):
    artists = [d for d in root.iterdir() if d.is_dir()]
    print(f"Artists found: {len(artists)}\n")
    for a in sorted(artists)[:50]:
        print(f"[ARTIST] {a.name}")
        model_dirs = [d for d in a.iterdir() if d.is_dir()]
        files = [f for f in a.iterdir() if f.is_file()]
        if files:
            print(f"  (files directly under artist: {len(files)})")
        for m in sorted(model_dirs)[:max_models_per_artist]:
            print(f"  ‚îî‚îÄ [MODEL] {m.name}")
            imgs = [f for f in m.iterdir() if f.is_file()]
            subdirs = [d for d in m.iterdir() if d.is_dir()]
            if subdirs:
                print(f"      (nested folders under model: {len(subdirs)})")
            for img in imgs[:max_examples_per_model]:
                print(f"      ‚Ä¢ {img.name}")
        if len(model_dirs) > max_models_per_artist:
            print(f"  ... (+{len(model_dirs)-max_models_per_artist} more models)")
        print()

if WORK_DIR.exists() and any(WORK_DIR.iterdir()):
    brief_tree(WORK_DIR)
else:
    print("WORK_DIR is empty.")

curr_image_count = count_images(WORK_DIR)
print(f"üñºÔ∏è Total images in WORK_DIR ({WORK_DIR.name}): {curr_image_count}")


Artists found: 1090

[ARTIST] A Rose
  (files directly under artist: 4)

[ARTIST] AcomaPot
  (files directly under artist: 2)

[ARTIST] AcomanPot
  (files directly under artist: 1)

[ARTIST] Acrocinus longimanus
  (files directly under artist: 2)

[ARTIST] Actor
  (files directly under artist: 1)

[ARTIST] Aechmea Fasciata
  (files directly under artist: 1)

[ARTIST] Aedes aegypti
  (files directly under artist: 1)

[ARTIST] African Elephant
  (files directly under artist: 12)

[ARTIST] Aged dragon
  (files directly under artist: 1)

[ARTIST] Alamo Stallion
  (files directly under artist: 1)

[ARTIST] Allomyrina dichotoma
  (files directly under artist: 2)

[ARTIST] Allosaurus Skeleton
  (files directly under artist: 2)

[ARTIST] Amaryllis
  (files directly under artist: 2)

[ARTIST] Amatl Pot
  (files directly under artist: 1)

[ARTIST] Ammonite
  (files directly under artist: 1)

[ARTIST] Angel
  (files directly under artist: 6)

[ARTIST] Anna s Hummingbird Honeysuckle
  (files direc

In [18]:
#dropping non artist directories
from pathlib import Path
import shutil, csv, os
from datetime import datetime

def ruleA_drop(work_dir=WORK_DIR, log_dir=LOG_DIR, dry_run=True):
    drop_dir_names = {"test", "my_model"}
    drop_file_names = {"model.csv"}

    to_delete_dirs, to_delete_files = [], []

    for base, dirs, files in os.walk(work_dir):
        base_path = Path(base)
        for d in dirs:
            if d.lower() in drop_dir_names:
                to_delete_dirs.append(base_path / d)
        for f in files:
            if f.lower() in drop_file_names:
                to_delete_files.append(base_path / f)

    # --- Logging setup ---
    log_dir.mkdir(parents=True, exist_ok=True)
    mode = "DRY-RUN" if dry_run else "EXECUTE"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_path = log_dir / f"ruleA_drop_{mode}_{timestamp}.csv"

    # --- Print summary ---
    print(f"\n[{mode}]")
    print(f"Found {len(to_delete_dirs)} directories and {len(to_delete_files)} files matching pattern.\n")

    if to_delete_dirs:
        print("Directories to remove (sample):")
        for p in to_delete_dirs[:10]:
            print("  [dir]", p)
        if len(to_delete_dirs) > 10:
            print(f"  ... (+{len(to_delete_dirs)-10} more)")
    if to_delete_files:
        print("\nFiles to remove (sample):")
        for p in to_delete_files[:10]:
            print("  [file]", p)
        if len(to_delete_files) > 10:
            print(f"  ... (+{len(to_delete_files)-10} more)")

    # --- Write plan to CSV ---
    with open(log_path, "w", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(["type", "path"])
        for p in to_delete_files:
            writer.writerow(["file", str(p)])
        for p in to_delete_dirs:
            writer.writerow(["dir", str(p)])

    # --- Execute if not dry run ---
    if not dry_run:
        deleted_files = deleted_dirs = 0

        # Delete files first
        for p in to_delete_files:
            try:
                Path(p).unlink()
                deleted_files += 1
            except FileNotFoundError:
                pass

        # Delete dirs (deepest first)
        for p in sorted(to_delete_dirs, key=lambda x: len(Path(x).parts), reverse=True):
            try:
                shutil.rmtree(p)
                deleted_dirs += 1
            except FileNotFoundError:
                pass

        print(f"\nDeleted {deleted_files} files and {deleted_dirs} directories.")
    else:
        print(f"\nDry-run only. No files deleted.")

    print(f"üßæ Log saved to: {log_path}")
    return to_delete_dirs, to_delete_files


In [19]:
to_del_dirs, to_del_files = ruleA_drop(dry_run=True)


[DRY-RUN]
Found 2 directories and 1 files matching pattern.

Directories to remove (sample):
  [dir] /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Test
  [dir] /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/My_model

Files to remove (sample):
  [file] /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/model.csv

Dry-run only. No files deleted.
üßæ Log saved to: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/ruleA_drop_DRY-RUN_20251013_154501.csv


In [20]:
ruleA_drop(dry_run=False)


[EXECUTE]
Found 2 directories and 1 files matching pattern.

Directories to remove (sample):
  [dir] /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Test
  [dir] /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/My_model

Files to remove (sample):
  [file] /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/model.csv

Deleted 1 files and 2 directories.
üßæ Log saved to: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/ruleA_drop_EXECUTE_20251013_154502.csv


([PosixPath('/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Test'),
  PosixPath('/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/My_model')],
 [PosixPath('/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/model.csv')])

In [21]:
#drop artist name layer
from pathlib import Path
import os, csv, shutil, hashlib, re
from datetime import datetime

def _sha1_short(path: Path, chunk=1024*1024) -> str:
    h = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            b = f.read(chunk)
            if not b: break
            h.update(b)
    return h.hexdigest()[:10]

def drop_artist(work_dir=WORK_DIR, log_dir=LOG_DIR, dry_run=True):
    """
    Flatten in place:
        work/<artist>/<model>/...  ‚Üí  work/<model>/...
    - No normalization or fuzzy merging.
    - If two artists share the same <model> name, they merge into the same folder.
    - On filename collision inside a <model> folder, keep both by appending a short content hash.
    - Removes empty artist/model folders afterwards.
    - Writes a CSV plan/execution log to LOG_DIR.
    """
    work_dir = Path(work_dir)
    log_dir = Path(log_dir); log_dir.mkdir(parents=True, exist_ok=True)

    # Snapshot artist dirs (exclude hidden/underscore system dirs just in case)
    artist_dirs = [d for d in work_dir.iterdir()
                   if d.is_dir() and not d.name.startswith("_")]

    moves = []   # (src_file, dst_file, artist, model)
    scanned_models = 0

    # Build a stable snapshot before moving
    for artist in sorted(artist_dirs):
        # treat this as an "artist" dir only if it has subdirectories (models)
        model_dirs = [m for m in artist.iterdir() if m.is_dir()]
        if not model_dirs:
            continue
        for model in sorted(model_dirs):
            scanned_models += 1
            model_name = model.name  # keep as-is
            # take all files under the model (recursively)
            files = [p for p in model.rglob("*") if p.is_file()]
            for f in files:
                dst_folder = work_dir / model_name
                dst = dst_folder / f.name
                if dst.exists():
                    h = _sha1_short(f)
                    dst = dst_folder / f"{dst.stem}__{h}{dst.suffix}"
                moves.append((f, dst, artist.name, model_name))

    mode = "DRY-RUN" if dry_run else "EXECUTE"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_csv = log_dir / f"inplace_drop_artist_{mode}_{ts}.csv"

    # Log + preview
    with open(log_csv, "w", newline="") as fp:
        w = csv.writer(fp)
        w.writerow(["action","artist","model","src","dst"])
        print(f"\n[IN-PLACE DROP ARTIST ‚Äî {mode}]")
        print(f"Scanned model folders: {scanned_models}")
        print(f"Files to move        : {len(moves)}")
        for i, (src, dst, artist, model) in enumerate(moves[:12]):
            print("  MOVE:", src, "‚Üí", dst)
            w.writerow(["move", artist, model, str(src), str(dst)])
        if len(moves) > 12:
            print(f"  ... (+{len(moves)-12} more)")
            for (src, dst, artist, model) in moves[12:]:
                w.writerow(["move", artist, model, str(src), str(dst)])

    if dry_run:
        print(f"\nüí° Dry-run only. No files moved.")
        print(f"üßæ Plan saved to: {log_csv}")
        return {"planned_moves": len(moves), "log_csv": log_csv}

    # Execute moves
    moved = 0
    for src, dst, artist, model in moves:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))
        moved += 1

    # Cleanup: remove any empty dirs left under work/
    removed_dirs = 0
    # Walk bottom-up so we can remove parents after children
    for root, dirs, files in os.walk(work_dir, topdown=False):
        # Don't try to delete the WORK root itself
        if Path(root) == work_dir:
            continue
        if not dirs and not files:
            try:
                Path(root).rmdir()
                removed_dirs += 1
            except OSError:
                pass

    print(f"\nExecuted. Moved {moved} files into {work_dir}")
    print(f"üßπ Removed {removed_dirs} empty directories.")
    print(f"üßæ Log saved to: {log_csv}")
    return {"moved_files": moved, "removed_dirs": removed_dirs, "log_csv": log_csv}


In [22]:
_ = drop_artist(dry_run=True)


[IN-PLACE DROP ARTIST ‚Äî DRY-RUN]
Scanned model folders: 1337
Files to move        : 3713
  MOVE: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Beth Johnson/Acorns, 2010/acorns.jpg ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Acorns, 2010/acorns.jpg
  MOVE: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Beth Johnson/Acorns, 2012/Acorns_2.jpg ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Acorns, 2012/Acorns_2.jpg
  MOVE: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Beth Johnson/Acorns, 2012/Acorns+2.jpg ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Acorns, 2012/Acorns+2.jpg
  M

In [23]:
_ = drop_artist(dry_run=False)


[IN-PLACE DROP ARTIST ‚Äî EXECUTE]
Scanned model folders: 1337
Files to move        : 3713
  MOVE: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Beth Johnson/Acorns, 2010/acorns.jpg ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Acorns, 2010/acorns.jpg
  MOVE: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Beth Johnson/Acorns, 2012/Acorns_2.jpg ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Acorns, 2012/Acorns_2.jpg
  MOVE: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Beth Johnson/Acorns, 2012/Acorns+2.jpg ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work/Acorns, 2012/Acorns+2.jpg
  M

In [54]:
#normalize folder names and merge 
from pathlib import Path
import os, re, csv, shutil, hashlib
from datetime import datetime


def _sha1_short(path: Path, chunk=1024*1024) -> str:
    h = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            b = f.read(chunk)
            if not b: break
            h.update(b)
    return h.hexdigest()[:10]

def _canonicalize_name(name: str) -> str:
    # keep only alphabetic runs; lowercase; single spaces
    runs = re.findall(r"[A-Za-z]+", name)
    canon = " ".join(r.lower() for r in runs).strip()
    return canon or "unnamed"

def normalize_with_staging_buckets(work_dir=WORK_DIR, log_dir=LOG_DIR, dry_run=True):

    work_dir = Path(work_dir)
    log_dir = Path(log_dir); log_dir.mkdir(parents=True, exist_ok=True)
    staging_root = work_dir / "_lc_buckets"

    before_images = count_images(work_dir)
    src_dirs = [d for d in work_dir.iterdir() if d.is_dir() and d.name != staging_root.name]
    groups = {}
    for d in src_dirs:
        canon = _canonicalize_name(d.name)
        groups.setdefault(canon, []).append(d)

    mode = "DRY-RUN" if dry_run else "EXECUTE"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_csv = log_dir / f"normalize_staging_{mode}_{ts}.csv"

    planned_bucket_creates = set()
    planned_moves = []      # (src_file, dst_file, src_dir, bucket_dir)
    collisions = []         # (original_dst, new_dst)

    # Plan: build staging buckets and move all files into them
    for canon, dirs in sorted(groups.items(), key=lambda kv: kv[0]):
        bucket = staging_root / canon
        planned_bucket_creates.add(bucket)
        for d in dirs:
            for src in [p for p in d.rglob("*") if p.is_file()]:
                dst = bucket / src.name
                if dst.exists():
                    new_dst = bucket / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                    collisions.append((dst, new_dst))
                    dst = new_dst
                planned_moves.append((src, dst, d, bucket))

    # Log & preview
    with open(log_csv, "w", newline="") as fp:
        w = csv.writer(fp)
        w.writerow(["action","src","dst","extra"])
        for b in sorted(planned_bucket_creates):
            w.writerow(["create_bucket", "", str(b), ""])
        for (orig, new) in collisions:
            w.writerow(["collision_rename_file", str(orig), str(new), ""])
        for src, dst, sdir, bdir in planned_moves:
            w.writerow(["move_to_bucket", str(src), str(dst), f"{sdir.name} -> {bdir.name}"])

    print(f"\n[NORMALIZE via STAGING ‚Äî {mode}]")
    print(f"Top-level folders    : {len(src_dirs)}")
    print(f"Canonical groups     : {len(groups)}")
    print(f"Buckets to create    : {len(planned_bucket_creates)}")
    print(f"Planned file moves   : {len(planned_moves)}")
    print(f"Filename collisions  : {len(collisions)}")
    print(f"üßæ Log: {log_csv}")

    # Sample preview
    for i, b in enumerate(sorted(planned_bucket_creates)[:10]):
        print("  BUCKET:", b.name)
    for i, (src, dst, sdir, bdir) in enumerate(planned_moves[:10]):
        print("  MOVE:", src.name, "‚Üí", bdir.name)

    if dry_run:
        print(f"\nüí° Dry-run only. No changes applied.")
        print(f"üñºÔ∏è Image count (WORK_DIR): {before_images}")
        return {
            "planned_buckets": len(planned_bucket_creates),
            "planned_moves": len(planned_moves),
            "planned_collisions": len(collisions),
            "log_csv": log_csv,
            "images": before_images
        }

    for b in sorted(planned_bucket_creates):
        b.mkdir(parents=True, exist_ok=True)

    moved = 0
    for src, dst, sdir, bdir in planned_moves:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))
        moved += 1

    removed_dirs = 0
    for d in sorted(src_dirs, key=lambda p: len(p.parts), reverse=True):
        # Skip if it is the staging root or is inside staging root
        if d == staging_root or str(d).startswith(str(staging_root) + os.sep):
            continue
        # Remove empty trees bottom-up
        for root, dirs, files in os.walk(d, topdown=False):
            if not dirs and not files:
                try:
                    Path(root).rmdir()
                    removed_dirs += 1
                except OSError:
                    pass
        # Try remove the dir itself if empty
        try:
            d.rmdir()
            removed_dirs += 1
        except OSError:
            pass

    promoted = 0
    for b in sorted(staging_root.iterdir()):
        if not b.is_dir():
            continue
        final = work_dir / b.name  # lowercase canonical name
        if final.exists() and final != b:
            # Should not happen (we deleted originals), but guard: merge contents
            for src in [p for p in b.rglob("*") if p.is_file()]:
                dst = final / src.name
                if dst.exists():
                    dst = final / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(src), str(dst))
            shutil.rmtree(b)
            promoted += 1
        else:
            b.rename(final)
            promoted += 1

    try:
        staging_root.rmdir()
    except OSError:
        pass

    after_images = count_images(work_dir)
    print(f"\n‚úÖ Executed.")
    print(f" ‚Ä¢ Files moved into buckets : {moved}")
    print(f" ‚Ä¢ Original dirs removed    : {removed_dirs}")
    print(f" ‚Ä¢ Buckets promoted         : {promoted}")
    print(f"üñºÔ∏è Image count (WORK_DIR)  : {after_images}")
    print(f"üßæ Log: {log_csv}")

    return {
        "moved_files": moved,
        "removed_dirs": removed_dirs,
        "promoted_buckets": promoted,
        "log_csv": log_csv,
        "images": after_images
    }


In [56]:
_ = normalize_folder_names(WORK_DIR, LOG_DIR, dry_run=True)



[NORMALIZE (letters-only, lowercase) & MERGE ‚Äî DRY-RUN]
Folders before          : 1090
Canonical groups        : 1090
Dir renames (force lc)  : 0
File moves (merges)     : 2357
Filename collisions     : 2357
üßæ Log: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/normalize_folders_DRY-RUN_20251013_161733.csv
  MERGE: a_miuraken_beauty_rose_2__684326a641__684326a641__684326a641__684326a641.jpg ‚Üí a rose
  MERGE: a_miuraken_beauty_rose_1__e9a52899ba__e9a52899ba__e9a52899ba__e9a52899ba.jpg ‚Üí a rose
  MERGE: a_miuraken_beauty_rose_3__e04dce1168__e04dce1168__e04dce1168__e04dce1168.jpg ‚Üí a rose
  MERGE: a_rose_1__da60985597__da60985597__da60985597__da60985597.jpg ‚Üí a rose
  MERGE: acomanpot_1__a867b464f2__a867b464f2__a867b464f2__a867b464f2.jpg ‚Üí acomanpot
  MERGE: acomapot_1__a956b17bba__a956b17bba__a956b17bba__a956b17bba.jpg ‚Üí acomapot
  MERGE: acomapot_2__18f1e80b67__18f1e80b67__18f1e80b67__18f1e80b67.jpg ‚Üí

In [57]:
_ = normalize_folder_names(WORK_DIR, LOG_DIR, dry_run=False)



[NORMALIZE (letters-only, lowercase) & MERGE ‚Äî EXECUTE]
Folders before          : 1090
Canonical groups        : 1090
Dir renames (force lc)  : 0
File moves (merges)     : 2357
Filename collisions     : 2357
üßæ Log: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/normalize_folders_EXECUTE_20251013_161737.csv
  MERGE: a_miuraken_beauty_rose_2__684326a641__684326a641__684326a641__684326a641.jpg ‚Üí a rose
  MERGE: a_miuraken_beauty_rose_1__e9a52899ba__e9a52899ba__e9a52899ba__e9a52899ba.jpg ‚Üí a rose
  MERGE: a_miuraken_beauty_rose_3__e04dce1168__e04dce1168__e04dce1168__e04dce1168.jpg ‚Üí a rose
  MERGE: a_rose_1__da60985597__da60985597__da60985597__da60985597.jpg ‚Üí a rose
  MERGE: acomanpot_1__a867b464f2__a867b464f2__a867b464f2__a867b464f2.jpg ‚Üí acomanpot
  MERGE: acomapot_1__a956b17bba__a956b17bba__a956b17bba__a956b17bba.jpg ‚Üí acomapot
  MERGE: acomapot_2__18f1e80b67__18f1e80b67__18f1e80b67__18f1e80b67.jpg ‚Üí

In [64]:
folder_names = list_all_folders()


üìÅ Listing all folders in: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work

  1. A Rose
  2. AcomanPot
  3. AcomaPot
  4. acorns
  5. Acrocinus longimanus
  6. Actor
  7. Aechmea Fasciata
  8. Aedes aegypti
  9. aerial hunt
 10. African Elephant
 11. african penguin
 12. Aged dragon
 13. Alamo Stallion
 14. Allomyrina dichotoma
 15. Allosaurus Skeleton
 16. almiraj
 17. Amaryllis
 18. Amatl Pot
 19. Ammonite
 20. ancient dragon
 21. andira anteira taisho
 22. Angel
 23. ankylosaurus
 24. Anna s Hummingbird Honeysuckle
 25. Anna s Hummingbird Trumpet Blossoms
 26. ant
 27. antelope
 28. Anthurium
 29. Anzu wyliei
 30. Apatosaurus
 31. Aquarius
 32. Archaeopteryx
 33. Archangel Gabriel
 34. archangel gabriel second version
 35. archangel gabriel third version
 36. Archangel St Michael
 37. archeopteryx
 38. Argyrops bleekeri
 39. Arle head
 40. Armadillo
 41. Ash Wyrm
 42. Asian Elephant
 43. Asian elephants
 44. asiatic

In [68]:
# === WordNet-based canonical big-group classifier (minimal) === ## do this after dataset 2?

import re
from nltk.corpus import wordnet as wn

# ensure wordnet is loaded (only if not already done)
try:
    _ = wn.synsets("cat")
except LookupError:
    import nltk
    nltk.download("wordnet")
    nltk.download("omw-1.4")

# cached results for speed
_WN_CACHE = {}
_ANIMAL_ROOT = wn.synset("animal.n.01")

def _wn_is_animal_like(term: str) -> bool:
    """Check if a noun belongs under animal.n.01 in WordNet."""
    term = term.strip().lower().replace(" ", "_")
    if term in _WN_CACHE:
        return _WN_CACHE[term]
    syns = wn.synsets(term, pos=wn.NOUN)
    for s in syns:
        for anc in s.closure(lambda x: x.hypernyms()):
            if anc == _ANIMAL_ROOT:
                _WN_CACHE[term] = True
                return True
    _WN_CACHE[term] = False
    return False

DESCRIPTORS = {
    "african","asian","american","australian","arctic","antarctic","snow","white","black","red",
    "golden","silver","greater","lesser","giant","baby","wild","common","domestic","horned","tailed",
    "short","long","great","little","flying","sitting","standing","walking","running","resting",
    "in","on","and","of","the","with","for","from"
}

SYNONYMS = {
    "snow leopard": "leopard",
    "white leopard": "leopard",
    "panther": "leopard",
    "african elephant": "elephant",
    "asian elephant": "elephant",
    "asiatic elephant": "elephant",
    "flying fox": "fox",
    "ladybird": "ladybug",
    "ladybird beetle": "ladybug",
    "sea bream": "seabream",
    "red sea bream": "seabream",
    "orca": "whale",
    "killer whale": "whale",
}

MULTIWORD_KEEP = {
    "sea lion","sea otter","sea turtle","praying mantis","stick insect","wolf spider","garden spider"
}

def _letters_only_lower(s: str) -> str:
    runs = re.findall(r"[A-Za-z]+", s)
    return " ".join(r.lower() for r in runs).strip()

def _canonical_big_group(label: str) -> str | None:
    """Map folder name to big-group canonical animal name using WordNet."""
    s = _letters_only_lower(label)
    if not s:
        return None

    # synonym direct
    if s in SYNONYMS:
        s = SYNONYMS[s]
        return s if (_wn_is_animal_like(s) or s in MULTIWORD_KEEP) else None
    if s in MULTIWORD_KEEP:
        return s

    # remove descriptors
    tokens = [t for t in s.split() if t not in DESCRIPTORS]
    if not tokens:
        return None

    # bigram check (rightmost)
    if len(tokens) >= 2:
        bigram = f"{tokens[-2]} {tokens[-1]}"
        if bigram in SYNONYMS:
            cand = SYNONYMS[bigram]
            return cand if (_wn_is_animal_like(cand) or cand in MULTIWORD_KEEP) else None
        if bigram in MULTIWORD_KEEP:
            return bigram
        if _wn_is_animal_like(bigram):
            return bigram

    # head noun
    head = tokens[-1]
    if head in SYNONYMS:
        head = SYNONYMS[head]
    if _wn_is_animal_like(head):
        return head

    reduced = " ".join(tokens)
    if reduced in SYNONYMS:
        reduced = SYNONYMS[reduced]
    if _wn_is_animal_like(reduced):
        return reduced

    return None


In [73]:
# === FINAL VERSION: Filter to animals/birds/insects and DELETE all others ===
def filter_animals_biggroup_wordnet(work_dir=WORK_DIR, log_dir=LOG_DIR, dry_run=True):
    work_dir = Path(work_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    staging_root = work_dir / "_animals_buckets"

    before_images = count_images(work_dir)
    src_dirs = [d for d in work_dir.iterdir() if d.is_dir() and d.name != staging_root.name]

    mode = "DRY-RUN" if dry_run else "EXECUTE"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_csv = log_dir / f"filter_animals_wordnet_{mode}_{ts}.csv"

    planned_buckets, planned_moves, planned_drops, collisions = set(), [], [], []

    for d in sorted(src_dirs, key=lambda p: p.name.lower()):
        group = _canonical_big_group(d.name)
        if not group:
            planned_drops.append(d)
            continue
        bucket = staging_root / group
        planned_buckets.add(bucket)
        for src in [p for p in d.rglob("*") if p.is_file()]:
            dst = bucket / src.name
            if dst.exists():
                new_dst = bucket / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                collisions.append((dst, new_dst))
                dst = new_dst
            planned_moves.append((src, dst, d, bucket, group))

    print(f"\n[WORDNET FILTER + CLEANUP ‚Äî {mode}]")
    print(f"Folders scanned  : {len(src_dirs)}")
    print(f"Animal groups    : {len(planned_buckets)}")
    print(f"Files to keep    : {len(planned_moves)}")
    print(f"Non-animal drops : {len(planned_drops)}")
    print(f"üßæ Log: {log_csv}")

    if dry_run:
        print("\nüí° Dry-run only. No changes applied.")
        print(f"üñºÔ∏è Image count (WORK_DIR): {before_images}")
        return

    # --- EXECUTE ---
    # 1. Create staging buckets
    for b in sorted(planned_buckets):
        b.mkdir(parents=True, exist_ok=True)

    # 2. Move all animal/insect/bird files
    kept = 0
    for src, dst, sdir, bdir, grp in planned_moves:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))
        kept += 1

    # 3. DELETE all non-animal folders
    removed_non_animals = 0
    for d in planned_drops:
        try:
            shutil.rmtree(d)
            removed_non_animals += 1
        except Exception as e:
            print(f"‚ö†Ô∏è Could not delete {d}: {e}")

    # 4. Remove emptied originals (even if they had animals)
    for d in sorted(src_dirs, key=lambda p: len(p.parts), reverse=True):
        if not d.exists() or d == staging_root:
            continue
        try:
            if not any(d.iterdir()):
                d.rmdir()
        except OSError:
            pass

    # 5. Promote buckets to top level
    promoted = 0
    for b in sorted(staging_root.iterdir()):
        if not b.is_dir(): continue
        final = work_dir / b.name
        if final.exists() and final != b:
            for src in [p for p in b.rglob("*") if p.is_file()]:
                dst = final / src.name
                if dst.exists():
                    dst = final / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                shutil.move(str(src), str(dst))
            shutil.rmtree(b)
        else:
            b.rename(final)
        promoted += 1

    try:
        staging_root.rmdir()
    except OSError:
        pass

    after_images = count_images(work_dir)
    print(f"\n‚úÖ Executed.")
    print(f" ‚Ä¢ Files kept (animals/insects/birds): {kept}")
    print(f" ‚Ä¢ Non-animal folders deleted        : {removed_non_animals}")
    print(f" ‚Ä¢ Buckets promoted                  : {promoted}")
    print(f"üñºÔ∏è Image count (WORK_DIR): {after_images}")


In [74]:
filter_animals_biggroup_wordnet(dry_run=True)



[WORDNET FILTER + CLEANUP ‚Äî DRY-RUN]
Folders scanned  : 919
Animal groups    : 296
Files to keep    : 1722
Non-animal drops : 623
üßæ Log: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/filter_animals_wordnet_DRY-RUN_20251013_163556.csv

üí° Dry-run only. No changes applied.
üñºÔ∏è Image count (WORK_DIR): 3476


In [75]:
filter_animals_biggroup_wordnet(dry_run=False)



[WORDNET FILTER + CLEANUP ‚Äî EXECUTE]
Folders scanned  : 919
Animal groups    : 296
Files to keep    : 1722
Non-animal drops : 623
üßæ Log: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/filter_animals_wordnet_EXECUTE_20251013_163600.csv

‚úÖ Executed.
 ‚Ä¢ Files kept (animals/insects/birds): 1722
 ‚Ä¢ Non-animal folders deleted        : 623
 ‚Ä¢ Buckets promoted                  : 296
üñºÔ∏è Image count (WORK_DIR): 1721


### Dataset 2: removing all non animal folders

In [76]:
from pathlib import Path
import os, csv, shutil
from datetime import datetime

# RAW2 root (adjust if needed)
RAW2_DIR = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/original data/origami-model")

def import_dataset2_merge_simple(
    raw2_dir=RAW2_DIR,
    work_dir=WORK_DIR,
    log_dir=LOG_DIR,
    dry_run=True,
):
    """
    Dataset 2 import WITHOUT big-grouping:
      - Drop category layer (animals/, insects/, etc.)
      - Skip categories: characters, objects, shapes, unclassified
      - Canonicalize leaf names via _canonicalize_name(name)
      - Merge into WORK_DIR (keep all files; hash on collisions)
      - macOS-safe via staging buckets
      - Print image counts before/after
    """
    raw2_dir = Path(raw2_dir)
    work_dir = Path(work_dir)
    log_dir = Path(log_dir); log_dir.mkdir(parents=True, exist_ok=True)
    staging_root = work_dir / "_import2_buckets_simple"

    if not raw2_dir.exists():
        raise FileNotFoundError(f"RAW2_DIR does not exist: {raw2_dir}")

    # categories to ignore entirely
    drop_cats = {"characters", "objects", "shapes", "unclassified"}
    top_level = [d for d in raw2_dir.iterdir() if d.is_dir()]
    allowed_cats = [d for d in top_level if d.name.lower() not in drop_cats]

    before_images = count_images(work_dir)

    mode = "DRY-RUN" if dry_run else "EXECUTE"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_csv = log_dir / f"import_dataset2_simple_{mode}_{ts}.csv"

    planned_buckets = set()
    planned_moves = []   # (src_file, dst_file, src_leaf_dir, bucket_dir, canonical)
    planned_skips  = []  # (leaf_dir, reason)
    collisions = []

    # helper: canonicalize via your existing function
    def _canon_leaf(name: str) -> str | None:
        try:
            canon = _canonicalize_name(name)  # already in your notebook
            return canon or None
        except NameError:
            # minimal fallback if needed
            import re
            runs = re.findall(r"[A-Za-z]+", name)
            canon = " ".join(r.lower() for r in runs).strip()
            return canon or None

    # walk allowed categories; each subdir is a leaf (animal/insect/etc. name)
    for cat in sorted(allowed_cats, key=lambda p: p.name.lower()):
        for leaf in sorted([d for d in cat.iterdir() if d.is_dir()], key=lambda p: p.name.lower()):
            canon = _canon_leaf(leaf.name)
            if not canon:
                planned_skips.append((leaf, "canonicalization->None"))
                continue

            bucket = staging_root / canon
            planned_buckets.add(bucket)

            for src in [p for p in leaf.rglob("*") if p.is_file()]:
                dst = bucket / src.name
                if dst.exists():
                    # use your hash helper already defined
                    new_dst = bucket / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                    collisions.append((dst, new_dst))
                    dst = new_dst
                planned_moves.append((src, dst, leaf, bucket, canon))

    # log + preview
    with open(log_csv, "w", newline="") as fp:
        w = csv.writer(fp)
        w.writerow(["action","src","dst","extra"])
        for b in sorted(planned_buckets):
            w.writerow(["create_bucket", "", str(b), b.name])
        for src, dst, sdir, bdir, canon in planned_moves:
            w.writerow(["move_to_bucket", str(src), str(dst), canon])
        for (orig, new) in collisions:
            w.writerow(["collision_rename_file", str(orig), str(new), ""])
        for (leaf, reason) in planned_skips:
            w.writerow(["skip_leaf", str(leaf), "", reason])

    print(f"\n[IMPORT DATASET 2 (simple) ‚Äî {mode}]")
    print(f"Allowed categories        : {len(allowed_cats)}")
    print(f"Canonical buckets         : {len(planned_buckets)}")
    print(f"Files to move (keep)      : {len(planned_moves)}")
    print(f"Leaf folders skipped      : {len(planned_skips)}")
    print(f"Filename collisions       : {len(collisions)}")
    print(f"üñºÔ∏è Images in WORK before  : {before_images}")
    print(f"üßæ Log: {log_csv}")

    # sample
    for i, (src, dst, sdir, bdir, canon) in enumerate(planned_moves[:12]):
        print(f"  {sdir.name} ‚Üí {canon}: {src.name}")
    for i, (leaf, why) in enumerate(planned_skips[:8]):
        print(f"  SKIP: {leaf.name} ({why})")

    if dry_run:
        print("\nüí° Dry-run only. No changes applied.")
        return {
            "planned_buckets": len(planned_buckets),
            "planned_moves": len(planned_moves),
            "planned_skips": len(planned_skips),
            "collisions": len(collisions),
            "log_csv": log_csv,
            "images_before": before_images
        }

    # EXECUTE: create buckets & move files from RAW2 into staging buckets
    for b in sorted(planned_buckets):
        b.mkdir(parents=True, exist_ok=True)

    moved = 0
    for src, dst, sdir, bdir, canon in planned_moves:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))
        moved += 1

    # merge buckets into WORK_DIR
    promoted = 0
    for b in sorted(staging_root.iterdir()):
        if not b.is_dir():
            continue
        final = work_dir / b.name  # canonical (letters-only, lowercase)
        if final.exists() and final != b:
            # merge contents
            for src in [p for p in b.rglob("*") if p.is_file()]:
                dst = final / src.name
                if dst.exists():
                    dst = final / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                dst.parent.mkdir(parents=True, exist_ok=True)
                shutil.move(str(src), str(dst))
            shutil.rmtree(b)
            promoted += 1
        else:
            b.rename(final)
            promoted += 1

    # clean staging root if empty
    try:
        staging_root.rmdir()
    except OSError:
        pass

    after_images = count_images(work_dir)
    print(f"\n‚úÖ Executed.")
    print(f" ‚Ä¢ Files moved from RAW2 : {moved}")
    print(f" ‚Ä¢ Buckets merged/promoted: {promoted}")
    print(f"üñºÔ∏è Images in WORK after   : {after_images}")

    return {
        "moved_files": moved,
        "promoted_buckets": promoted,
        "log_csv": log_csv,
        "images_before": before_images,
        "images_after": after_images
    }


In [77]:
import_dataset2_merge_simple(dry_run=True)


[IMPORT DATASET 2 (simple) ‚Äî DRY-RUN]
Allowed categories        : 4
Canonical buckets         : 64
Files to move (keep)      : 3482
Leaf folders skipped      : 0
Filename collisions       : 0
üñºÔ∏è Images in WORK before  : 1721
üßæ Log: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/import_dataset2_simple_DRY-RUN_20251013_165403.csv
  armadillo ‚Üí armadillo: p_armadillo_szinger_sculpture.jpg
  armadillo ‚Üí armadillo: 34381.jpg
  armadillo ‚Üí armadillo: p_armadillo_wu.jpg
  armadillo ‚Üí armadillo: 2400.jpg
  armadillo ‚Üí armadillo: p_armadillo_yamaguchi_cute.jpg
  armadillo ‚Üí armadillo: 2147.jpg
  armadillo ‚Üí armadillo: 2133.jpg
  armadillo ‚Üí armadillo: p_armadillo_fuchimoto_pet_park.jpg
  bear ‚Üí bear: th - 2023-11-25T115230.138.jpeg
  bear ‚Üí bear: th - 2023-11-25T115229.619.jpeg
  bear ‚Üí bear: th - 2023-11-25T115227.764.jpeg
  bear ‚Üí bear: th - 2023-11-25T115230.883.jpeg

üí° Dry-run only. No c

{'planned_buckets': 64,
 'planned_moves': 3482,
 'planned_skips': 0,
 'collisions': 0,
 'log_csv': PosixPath('/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/import_dataset2_simple_DRY-RUN_20251013_165403.csv'),
 'images_before': 1721}

In [78]:
import_dataset2_merge_simple(dry_run=False)


[IMPORT DATASET 2 (simple) ‚Äî EXECUTE]
Allowed categories        : 4
Canonical buckets         : 64
Files to move (keep)      : 3482
Leaf folders skipped      : 0
Filename collisions       : 0
üñºÔ∏è Images in WORK before  : 1721
üßæ Log: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/import_dataset2_simple_EXECUTE_20251013_165425.csv
  armadillo ‚Üí armadillo: p_armadillo_szinger_sculpture.jpg
  armadillo ‚Üí armadillo: 34381.jpg
  armadillo ‚Üí armadillo: p_armadillo_wu.jpg
  armadillo ‚Üí armadillo: 2400.jpg
  armadillo ‚Üí armadillo: p_armadillo_yamaguchi_cute.jpg
  armadillo ‚Üí armadillo: 2147.jpg
  armadillo ‚Üí armadillo: 2133.jpg
  armadillo ‚Üí armadillo: p_armadillo_fuchimoto_pet_park.jpg
  bear ‚Üí bear: th - 2023-11-25T115230.138.jpeg
  bear ‚Üí bear: th - 2023-11-25T115229.619.jpeg
  bear ‚Üí bear: th - 2023-11-25T115227.764.jpeg
  bear ‚Üí bear: th - 2023-11-25T115230.883.jpeg

‚úÖ Executed.
 ‚Ä¢ File

{'moved_files': 3482,
 'promoted_buckets': 64,
 'log_csv': PosixPath('/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/import_dataset2_simple_EXECUTE_20251013_165425.csv'),
 'images_before': 1721,
 'images_after': 5203}

### Compiled Dataset: Remove scientific names and merge mirco classes into big groups

In [33]:
folder_names = list_all_folders()


üìÅ Listing all folders in: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work


Total folders: 0


#### at this point went manually in to converge groups

In [36]:
final_image_count = count_images(WORK_DIR)
print(f"üñºÔ∏è Total images in WORK_DIR ({WORK_DIR.name}): {final_image_count}")

üñºÔ∏è Total images in WORK_DIR (work): 4842


In [35]:
folder_image_counts = count_images_in_each_folder()



üìÇ Image count per folder in work:

üñºÔ∏è Total images in WORK_DIR (work): 0


In [46]:
from pathlib import Path
import shutil, os
from datetime import datetime
import csv

def transfer_to_clean_final(
    work_dir=WORK_DIR,
    clean_dir=CLEAN_DIR,
    log_dir=LOG_DIR,
    subfolder_name="origami_images",
    dry_run=True
):
    """
    Move all images/folders from WORK_DIR ‚Üí CLEAN_DIR/<subfolder_name>.
    - Creates subfolder under CLEAN_DIR (default: 'origami_images').
    - Preserves subfolder structure inside.
    - Appends hash on duplicate filenames.
    - Logs a CSV of all moves.
    - Deletes everything inside WORK_DIR after transfer (if not dry_run).
    """
    work_dir = Path(work_dir)
    clean_dir = Path(clean_dir)
    target_dir = clean_dir / subfolder_name
    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    target_dir.mkdir(parents=True, exist_ok=True)

    before_work = count_images(work_dir)
    before_clean = count_images(target_dir)
    mode = "DRY-RUN" if dry_run else "EXECUTE"
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_csv = log_dir / f"transfer_to_clean_{mode}_{ts}.csv"

    moved, collisions = 0, 0
    planned_moves = []

    # Plan every file move
    for root, _, files in os.walk(work_dir):
        rel = Path(root).relative_to(work_dir)
        for f in files:
            src = Path(root) / f
            dst = target_dir / rel / f
            if dst.exists():
                new_dst = target_dir / rel / f"{dst.stem}__{_sha1_short(src)}{dst.suffix}"
                dst = new_dst
                collisions += 1
            planned_moves.append((src, dst))

    # Write log CSV
    with open(log_csv, "w", newline="") as fp:
        w = csv.writer(fp)
        w.writerow(["src_path", "dst_path"])
        w.writerows([(str(s), str(d)) for s, d in planned_moves])

    print(f"\n[TRANSFER TO CLEAN/{subfolder_name} ‚Äî {mode}]")
    print(f"üñºÔ∏è Images in WORK_DIR before: {before_work}")
    print(f"üñºÔ∏è Images in CLEAN_DIR before: {before_clean}")
    print(f"Files to move: {len(planned_moves)}  (collisions handled: {collisions})")
    print(f"üßæ Log CSV: {log_csv}")

    if dry_run:
        print("\nüí° Dry-run only. No files moved.")
        return

    # Execute actual move
    for src, dst in planned_moves:
        dst.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(src), str(dst))
        moved += 1

    # Delete everything in WORK_DIR
    for item in work_dir.iterdir():
        try:
            if item.is_dir():
                shutil.rmtree(item)
            else:
                item.unlink()
        except Exception as e:
            print(f"‚ö†Ô∏è Could not delete {item}: {e}")

    after_work = count_images(work_dir)
    after_clean = count_images(target_dir)

    print(f"\n‚úÖ Executed transfer.")
    print(f" ‚Ä¢ Files moved        : {moved}")
    print(f" ‚Ä¢ Collisions handled : {collisions}")
    print(f" ‚Ä¢ Emptied WORK_DIR   : {work_dir}")
    print(f"üñºÔ∏è Images in CLEAN/origami_images now : {after_clean}")
    print(f"üßæ Log file saved at  : {log_csv}")


In [48]:
# 1Ô∏è‚É£ Dry-run first (just shows the plan and counts)
transfer_to_clean_final(dry_run=True)




[TRANSFER TO CLEAN/origami_images ‚Äî DRY-RUN]
üñºÔ∏è Images in WORK_DIR before: 4842
üñºÔ∏è Images in CLEAN_DIR before: 0
Files to move: 4849  (collisions handled: 0)
üßæ Log CSV: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/transfer_to_clean_DRY-RUN_20251023_143452.csv

üí° Dry-run only. No files moved.


In [50]:

# 2Ô∏è‚É£ Execute for real (move files and folders)
transfer_to_clean_final(dry_run=False)


[TRANSFER TO CLEAN/origami_images ‚Äî EXECUTE]
üñºÔ∏è Images in WORK_DIR before: 4842
üñºÔ∏è Images in CLEAN_DIR before: 0
Files to move: 4849  (collisions handled: 0)
üßæ Log CSV: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/transfer_to_clean_EXECUTE_20251023_143458.csv

‚úÖ Executed transfer.
 ‚Ä¢ Files moved        : 4849
 ‚Ä¢ Collisions handled : 0
 ‚Ä¢ Emptied WORK_DIR   : /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/work
üñºÔ∏è Images in CLEAN/origami_images now : 4842
üßæ Log file saved at  : /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/_logs/transfer_to_clean_EXECUTE_20251023_143458.csv


In [52]:
### now imagenet

In [54]:
# === IMAGENET MAPPING: folder names -> closest wnid from word.txt ===
from pathlib import Path
import re, csv, os
from collections import defaultdict

# ---- Config ----
WORD_TXT_PATH = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/words.txt")  # <-- set this!
CLASSES_DIR   = CLEAN_DIR / "origami_images"  # where your final class folders live
DATA_DIR      = CLEAN_DIR.parent             # "data folder" = the dataset root that contains 'clean'
OUT_CSV       = DATA_DIR / "imagenet_mappings.csv"

# ---- Optional: RapidFuzz (preferred), else fallback to difflib ----
try:
    from rapidfuzz import fuzz
    _USE_RF = True
except Exception:
    import difflib
    _USE_RF = False

# ---- Normalization helpers ----
def _letters_only_lower(s: str) -> str:
    runs = re.findall(r"[A-Za-z]+", s)
    return " ".join(r.lower() for r in runs).strip()

def _singularize_basic(tok: str) -> str:
    irr = {"wolves":"wolf","geese":"goose","mice":"mouse","teeth":"tooth","feet":"foot","oxen":"ox","deer":"deer","elk":"elk"}
    if tok in irr: return irr[tok]
    if len(tok) > 3 and tok.endswith("s") and not tok.endswith(("ss","us")):
        return tok[:-1]
    return tok

def _norm_phrase(s: str) -> str:
    toks = [_singularize_basic(t) for t in _letters_only_lower(s).split()]
    return " ".join(toks)

# ---- Parse ImageNet word.txt ----
def load_imagenet_labels(word_txt: Path):
    """
    Returns:
      entries: list of (wnid, raw_label, norm_label)
      by_wnid: dict wnid -> {"raw": raw_line, "labels": [raw_label, ...], "norm_labels":[...]}
    Accepts typical lines like:
      n01440764 tench, Tinca tinca
      n02119789 kit fox, Vulpes macrotis
    """
    entries = []
    by_wnid = defaultdict(lambda: {"raw": "", "labels": [], "norm_labels": []})
    with open(word_txt, "r", encoding="utf-8", errors="ignore") as fp:
        for line in fp:
            line = line.strip()
            if not line: continue
            # extract wnid (first token like n########)
            m = re.match(r"^(n\d{8})\s+(.+)$", line)
            if not m:
                # sometimes tab-separated
                m = re.match(r"^(n\d{8})\t(.+)$", line)
            if not m:
                continue
            wnid, labels_part = m.group(1), m.group(2)

            # split synonyms by comma
            raw_labels = [lbl.strip() for lbl in labels_part.split(",") if lbl.strip()]
            norm_labels = [_norm_phrase(lbl) for lbl in raw_labels]

            for rl, nl in zip(raw_labels, norm_labels):
                entries.append((wnid, rl, nl))

            rec = by_wnid[wnid]
            if not rec["raw"]: rec["raw"] = line
            rec["labels"].extend(raw_labels)
            rec["norm_labels"].extend(norm_labels)
    return entries, by_wnid

# ---- Build a flat search list and a quick inverted index by normalized label ----
def build_search_index(entries):
    """
    entries: list of (wnid, raw_label, norm_label)
    returns:
      flat: list of dicts with {"wnid","raw","norm"}
      exact_map: dict norm_label -> set(wnid)
    """
    flat = [{"wnid": wnid, "raw": raw, "norm": norm} for wnid, raw, norm in entries]
    exact_map = defaultdict(set)
    for e in flat:
        if e["norm"]:
            exact_map[e["norm"]].add(e["wnid"])
    return flat, exact_map

# ---- Similarity scoring ----
def _score(a: str, b: str) -> float:
    if not a or not b:
        return 0.0
    if _USE_RF:
        return float(fuzz.token_set_ratio(a, b))
    # difflib fallback (scale ~0..100)
    return 100.0 * difflib.SequenceMatcher(None, a, b).ratio()

# ---- Main mapping function ----



In [60]:
def map_folders_to_imagenet(
    classes_dir: Path = CLEAN_DIR / "origami_images",
    word_txt: Path = WORD_TXT_PATH,
    out_csv: Path = DATA_DIR / "imagenet_mappings_full.csv",
    fuzzy_threshold: float = 70.0,
    topk: int = 3
):
    """
    For each folder in classes_dir:
      - finds best matching ImageNet wnid
      - includes all labels (synonyms) for that wnid in the CSV
    Output columns:
      folder_name, wnid, main_label, all_labels, score, match_type, alt_candidates
    """
    assert word_txt.exists(), f"word.txt not found: {word_txt}"
    classes_dir.mkdir(parents=True, exist_ok=True)
    out_csv.parent.mkdir(parents=True, exist_ok=True)

    entries, by_wnid = load_imagenet_labels(word_txt)
    flat, exact_map = build_search_index(entries)

    folders = [p.name for p in classes_dir.iterdir() if p.is_dir()]
    rows = []

    for cls in sorted(folders, key=str.lower):
        q_raw  = cls
        q_norm = _norm_phrase(q_raw)

        # --- 1) Exact match ---
        exact_wnids = exact_map.get(q_norm, set())
        if exact_wnids:
            chosen = sorted(exact_wnids)[0]
            all_labels = ", ".join(by_wnid[chosen]["labels"])
            main_label = by_wnid[chosen]["labels"][0] if by_wnid[chosen]["labels"] else ""
            rows.append([q_raw, chosen, main_label, all_labels, 100.0, "exact", ""])
            continue

        # --- 2) Fuzzy match ---
        best = []
        for e in flat:
            sc = _score(q_norm, e["norm"])
            best.append((sc, e["wnid"], e["raw"]))
        best.sort(reverse=True)
        if best:
            top = best[0]
            score, wnid, rlabel = top
            all_labels = ", ".join(by_wnid[wnid]["labels"])
            alts = "; ".join([f"{wn}:{lb} ({int(sc)})" for sc, wn, lb in best[:topk]])
            if score >= fuzzy_threshold:
                rows.append([q_raw, wnid, rlabel, all_labels, float(score), "fuzzy", alts])
            else:
                rows.append([q_raw, "", "", "", float(score), "low_score", alts])
        else:
            rows.append([q_raw, "", "", "", 0.0, "no_candidates", ""])

    # --- Write CSV ---
    with open(out_csv, "w", newline="") as fp:
        w = csv.writer(fp)
        w.writerow([
            "folder_name", "wnid", "main_label", "all_labels",
            "score", "match_type", "alt_candidates"
        ])
        w.writerows(rows)

    # --- Summary ---
    exact = sum(1 for r in rows if r[5] == "exact")
    fuzzy = sum(1 for r in rows if r[5] == "fuzzy")
    low   = sum(1 for r in rows if r[5] == "low_score")
    none  = sum(1 for r in rows if r[5] == "no_candidates")
    print(f"\n[IMAGENET MAPPING ‚Äî FULL LABELS]")
    print(f"Classes scanned     : {len(rows)}")
    print(f"Exact matches       : {exact}")
    print(f"Fuzzy matches       : {fuzzy}")
    print(f"Below threshold     : {low}")
    print(f"No candidates       : {none}")
    print(f"üßæ CSV saved to     : {out_csv}")

    return {"csv": out_csv, "total": len(rows), "exact": exact, "fuzzy": fuzzy, "low": low, "none": none}


In [62]:
map_info = map_folders_to_imagenet()


[IMAGENET MAPPING ‚Äî FULL LABELS]
Classes scanned     : 119
Exact matches       : 114
Fuzzy matches       : 5
Below threshold     : 0
No candidates       : 0
üßæ CSV saved to     : /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/imagenet_mappings_full.csv


In [31]:
from pathlib import Path
import tarfile, tempfile, time, csv, hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# === CONFIG ===
PROJECT   = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
MAP_CSV   = PROJECT / "Data" / "dataset" / "imagenet_mappings_full.csv"   # <- uses columns: folder_name, wnid
OUT_DIR   = PROJECT / "data" / "dataset" / "clean"/ "animals"                                   # download target
BASE_URL  = "https://image-net.org/data/winter21_whole"

MAX_WORKERS = 4
TIMEOUT     = 45
RETRIES     = 3
PROBE_HEAD  = True   # try a HEAD first to quickly skip 404s

# === HELPERS ===
def ensure(p: Path): 
    p.mkdir(parents=True, exist_ok=True)

def has_files(p: Path) -> bool:
    return p.exists() and any(p.iterdir())

def _sha1_short(path: Path, chunk=1024*1024) -> str:
    h = hashlib.sha1()
    with open(path, "rb") as f:
        while True:
            b = f.read(chunk)
            if not b: break
            h.update(b)
    return h.hexdigest()[:10]

def _safe_extract_member(tf: tarfile.TarFile, member: tarfile.TarInfo, out_dir: Path):
    # prevent path traversal
    name = Path(member.name).name
    if not name:
        return None
    dest = out_dir / name
    # avoid overwriting
    if dest.exists():
        stem, suf = dest.stem, dest.suffix
        dest = out_dir / f"{stem}__dup{int(time.time())}{suf}"
    # extract file content manually to preserve safety
    src = tf.extractfile(member)
    if src is None:
        return None
    dest.parent.mkdir(parents=True, exist_ok=True)
    with open(dest, "wb") as f:
        f.write(src.read())
    return dest

def dl(url: str, dst: Path, session: requests.Session) -> bool:
    for _ in range(RETRIES):
        try:
            if PROBE_HEAD:
                rh = session.head(url, timeout=TIMEOUT, allow_redirects=True)
                if rh.status_code == 404:
                    return False
            r = session.get(url, stream=True, timeout=TIMEOUT)
            if r.status_code == 404:
                return False
            r.raise_for_status()
            with open(dst, "wb") as f:
                for chunk in r.iter_content(1024 * 256):
                    if chunk:
                        f.write(chunk)
            return True
        except requests.RequestException:
            time.sleep(2)
    return False

def extract(tp: Path, out: Path) -> int:
    n = 0
    with tarfile.open(tp, "r") as tf:
        for m in tf.getmembers():
            if not m.isfile():
                continue
            if _safe_extract_member(tf, m, out) is not None:
                n += 1
    return n

def load_mapping_from_csv(csv_path: Path):
    """
    Read folder_name -> wnid from imagenet_mappings_full.csv
    Returns dict {species_folder: wnid}, skipping empty wnids.
    """
    mapping = {}
    with open(csv_path, newline="") as fp:
        r = csv.DictReader(fp)
        for row in r:
            sp = (row.get("folder_name") or "").strip()
            wn = (row.get("wnid") or "").strip()
            if sp:
                mapping[sp] = wn
    return mapping

def process(species: str, wnid: str, session: requests.Session) -> str:
    if not wnid:
        return f"{species}: SKIP (no wnid)"
    out = OUT_DIR / species
    if has_files(out):
        return f"{species}: SKIP (exists)"
    ensure(out)
    url = f"{BASE_URL}/{wnid}.tar"
    try:
        with tempfile.TemporaryDirectory() as td:
            tp = Path(td) / f"{wnid}.tar"
            ok = dl(url, tp, session)
            if not ok:
                return f"{species}: FAILED (download/404)"
            try:
                n = extract(tp, out)
            except tarfile.ReadError:
                return f"{species}: FAILED (corrupt tar)"
    except Exception as e:
        return f"{species}: FAILED ({type(e).__name__})"
    return f"{species}: DONE ({n} images)"

def run_from_csv():
    ensure(OUT_DIR)
    mapping = load_mapping_from_csv(MAP_CSV)
    tasks = [(s, w) for s, w in mapping.items() if w]  # skip empty wnids
    print(f"Downloading {len(tasks)} species...\n")
    results = []
    with requests.Session() as session:
        session.headers.update({"User-Agent": "origami-imagenet-downloader/1.0"})
        with ThreadPoolExecutor(max_workers=max(1, MAX_WORKERS)) as ex:
            futs = {ex.submit(process, s, w, session): s for s, w in tasks}
            for fut in as_completed(futs):
                status = fut.result()
                print(status)
                results.append(status)
    done   = sum("DONE" in r for r in results)
    skip   = sum("SKIP" in r for r in results)
    failed = sum("FAILED" in r for r in results)
    print(f"\nSummary ‚Üí DONE: {done}  SKIP: {skip}  FAILED: {failed}")

# === run ===
run_from_csv()


Downloading 116 species...

ankylosaurus: DONE (6 images)
apatosaurus: DONE (2 images)
archaeopteryx: DONE (5 images)
ant: DONE (1656 images)
armadillo: DONE (1282 images)
bat: DONE (1304 images)
bird: DONE (2126 images)
anteater: DONE (1015 images)
bison: DONE (1625 images)
antelope: DONE (1282 images)
beetle: DONE (1488 images)
buffalo: DONE (1226 images)
boar: DONE (1290 images)
cardinal: DONE (1236 images)
camel: DONE (1428 images)
chameleon: DONE (1150 images)
cat: DONE (1485 images)
chipmunk: DONE (1255 images)
chicken: DONE (1125 images)
coelophysis: FAILED (download/404)
cockroach: DONE (1157 images)
cicada: DONE (1227 images)
crab: DONE (1192 images)
cricket: DONE (1308 images)
crocodile: DONE (1322 images)
crow: DONE (1435 images)
butterfly: DONE (2115 images)
deinonychus: FAILED (download/404)
dimetrodon: DONE (48 images)
diplodocus: FAILED (download/404)
deer: DONE (1680 images)
dolphin: DONE (930 images)
cow: DONE (1186 images)
crane: DONE (1355 images)
dragonfly: DONE (21

In [43]:
from pathlib import Path

CLEAN_DIR = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/clean/origami_images")

def count_images_in_dir(base: Path):
    image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"}
    folder_counts = {}
    total_images = 0

    for folder in sorted([p for p in base.iterdir() if p.is_dir()]):
        count = sum(1 for f in folder.rglob("*") if f.suffix.lower() in image_exts)
        folder_counts[folder.name] = count
        total_images += count

    print(f"\nüìÇ Total folders: {len(folder_counts)}")
    print(f"üñºÔ∏è Total images: {total_images}\n")
    print("Folder-wise counts:")
    for name, cnt in folder_counts.items():
        print(f"  {name:25s}: {cnt}")

    return folder_counts, total_images

folder_counts, total_images = count_images_in_dir(CLEAN_DIR)



üìÇ Total folders: 116
üñºÔ∏è Total images: 4822

Folder-wise counts:
  ankylosaurus             : 13
  ant                      : 8
  anteater                 : 4
  antelope                 : 24
  apatosaurus              : 2
  archaeopteryx            : 6
  armadillo                : 9
  bat                      : 117
  beetle                   : 185
  bird                     : 50
  bison                    : 17
  boar                     : 7
  buffalo                  : 6
  butterfly                : 116
  camel                    : 45
  cardinal                 : 9
  cat                      : 150
  chameleon                : 6
  chicken                  : 80
  chipmunk                 : 7
  cicada                   : 13
  cockroach                : 12
  coelophysis              : 2
  cow                      : 114
  crab                     : 95
  crane                    : 31
  cricket                  : 3
  crocodile                : 31
  crow                     : 6
  deer 

In [45]:
#!/usr/bin/env python3
"""
Simplified script:
Takes counts from origami_images/ folders,
computes how many animal images are needed for each label,
and copies that many random samples from animals_src_dir/<label>/ into output_dir/<label>/
"""

import random
import shutil
from pathlib import Path

# ---------------- CONFIG ----------------
ORIGAMI_DIR = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/clean/origami_images")       # path to origami folders
ANIMALS_SRC_DIR = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/clean/animals")      # path to real animal images
OUTPUT_DIR = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/clean/animals_balanced")      # where to save balanced data
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp"}  # allowed extensions
# ----------------------------------------


def iter_image_files(folder: Path):
    """Return list of image files in folder."""
    if not folder.exists():
        return []
    return [p for p in folder.iterdir() if p.is_file() and p.suffix.lower() in IMAGE_EXTS]


def count_origami_by_label(root: Path):
    """Return list of (label, count) for each origami subfolder."""
    rows = []
    for sub in sorted(root.iterdir()):
        if sub.is_dir():
            n = len(iter_image_files(sub))
            rows.append((sub.name, n))
    return rows


def recommend_real_target(n: int) -> int:
    """Apply balancing rules."""
    if n <= 0:
        target = 0
    elif n <= 10:
        target = min(n * 40, 400)
    elif n <= 50:
        target = min(n * 20, 800)
    elif n <= 150:
        target = min(n * 10, 1500)
    else:
        target = 1500
    return int(round(target / 10.0) * 10)


def main():
    random.seed(42)
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    label_counts = count_origami_by_label(ORIGAMI_DIR)
    if not label_counts:
        print(f"No labels found in {ORIGAMI_DIR}")
        return

    total_origami = sum(c for _, c in label_counts)
    print(f"Found {len(label_counts)} labels with {total_origami} origami images total.")

    total_target = 0
    for label, count in label_counts:
        target = recommend_real_target(count)
        total_target += target

        src_folder = ANIMALS_SRC_DIR / label
        dst_folder = OUTPUT_DIR / label
        dst_folder.mkdir(parents=True, exist_ok=True)

        if not src_folder.exists():
            print(f"‚ö†Ô∏è Missing animal folder: {src_folder}")
            continue

        files = iter_image_files(src_folder)
        if not files:
            print(f"‚ö†Ô∏è No images in {src_folder}")
            continue

        chosen = random.sample(files, min(target, len(files)))
        for f in chosen:
            shutil.copy2(f, dst_folder / f.name)

        print(f"{label:25s} | origami={count:4d} | target={target:4d} | picked={len(chosen):4d}")

    print(f"\n‚úÖ Done. Copied balanced animal images to: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()


Found 106 labels with 4397 origami images total.
ankylosaurus              | origami=  13 | target= 260 | picked=   6
ant                       | origami=   8 | target= 320 | picked= 320
anteater                  | origami=   4 | target= 160 | picked= 160
antelope                  | origami=  24 | target= 480 | picked= 480
apatosaurus               | origami=   2 | target=  80 | picked=   2
archaeopteryx             | origami=   6 | target= 240 | picked=   5
armadillo                 | origami=   9 | target= 360 | picked= 360
bat                       | origami= 117 | target=1170 | picked=1170
beetle                    | origami= 185 | target=1500 | picked=1488
bird                      | origami=  50 | target= 800 | picked= 800
bison                     | origami=  17 | target= 340 | picked= 340
boar                      | origami=   7 | target= 280 | picked= 280
buffalo                   | origami=   6 | target= 240 | picked= 240
butterfly                 | origami= 116 | target=1160