## 1. Merging Data for Origami Images from 2 Datasets

The two Kaggle datasets merged in this notebook are:

1. https://www.kaggle.com/datasets/caokhoihuynh/orgami-works-of-some-origamists
2. https://www.kaggle.com/datasets/karthikssalian/origami-models

### 1.1. Dataset 1: Remove artist names and non-animal models

In [35]:
from pathlib import Path

# Source tree for dataset 1; flatten_artists_to_dataset treats each sub-folder
# as an artist and each folder below that as a model.
ARTISTS_ROOT = Path(
    "/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/original data/origami artists"
).expanduser().resolve()

# Destination that receives one folder per model name.
DEST_ROOT = Path(
    "/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/origami images"
).expanduser().resolve()

# When True, model folder names are lowercased before merging.
CASE_INSENSITIVE = False
# When False, dot-prefixed files and folders are ignored.
INCLUDE_HIDDEN = False

# Explicit check rather than `assert`: assertions are removed under `python -O`,
# which would let the destructive move cells run against a missing source.
if not ARTISTS_ROOT.is_dir():
    raise NotADirectoryError(f"Missing dir: {ARTISTS_ROOT}")
DEST_ROOT.mkdir(parents=True, exist_ok=True)
print("‚úÖ ARTISTS_ROOT:", ARTISTS_ROOT)
print("‚úÖ DEST_ROOT:", DEST_ROOT)



‚úÖ ARTISTS_ROOT: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/original data/origami artists
‚úÖ DEST_ROOT: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/Data/dataset/origami images


In [37]:
import os, shutil
from typing import List, Tuple

def safe_target_path(dst_dir: Path, fname: str) -> Path:
    """Return a path inside ``dst_dir`` for ``fname`` that does not exist yet.

    If ``dst_dir / fname`` is free it is returned unchanged. Otherwise a
    `` (k)`` counter (k = 1, 2, ...) is inserted before the file extension
    until an unused name is found.

    The name is split at its final extension, so ``"a.b.jpg"`` becomes
    ``"a.b (1).jpg"`` (splitting at the first dot would produce
    ``"a (1).b.jpg"``), and dot-files such as ``".hidden"`` keep their name
    intact.
    """
    name = Path(fname)
    stem, suffix = name.stem, name.suffix
    candidate = dst_dir / fname
    k = 1
    while candidate.exists():
        candidate = dst_dir / f"{stem} ({k}){suffix}"
        k += 1
    return candidate

def merge_dir_into(src: Path, dst: Path, *, include_hidden: bool = False) -> List[Tuple]:
    """Move every file under ``src`` into the mirrored location under ``dst``.

    Existing files are never overwritten: on a name clash the target is
    renamed via ``safe_target_path``. Directories of ``src`` are left in
    place (only files are moved).

    Args:
        src: Directory whose tree is merged.
        dst: Directory receiving the files; created if missing.
        include_hidden: When False, dot-prefixed files are skipped and
            dot-prefixed directories are not descended into.

    Returns:
        Actions in execution order: ``("mkdir", path)`` for each directory
        that was actually created and ``("move", src_file, dst_file)`` for
        each file moved.
    """
    actions: List[Tuple] = []

    def ensure_dir(path: Path) -> None:
        # Record a "mkdir" only when the directory is really created, so the
        # log does not claim work for directories that already existed.
        if not path.exists():
            path.mkdir(parents=True, exist_ok=True)
            actions.append(("mkdir", path))

    ensure_dir(dst)

    for root, dirs, files in os.walk(src):
        if not include_hidden:
            # In-place edit so os.walk does not descend into hidden folders.
            dirs[:] = [d for d in dirs if not d.startswith(".")]

        here = Path(root)
        dst_here = dst / here.relative_to(src)
        ensure_dir(dst_here)

        for f in files:
            if not include_hidden and f.startswith("."):
                continue
            sfile = here / f
            tfile = dst_here / f
            if tfile.exists():
                tfile = safe_target_path(dst_here, f)
            shutil.move(str(sfile), str(tfile))
            actions.append(("move", sfile, tfile))
    return actions


In [27]:
def flatten_artists_to_dataset(
    artists_root: Path,
    dest_root: Path,
    *,
    case_insensitive: bool = False,
    include_hidden: bool = False
) -> List[Tuple]:
    """Merge every ``<artist>/<model>`` folder into ``dest_root/<model>``.

    Model folders that share a name (after stripping whitespace, and after
    lowercasing when ``case_insensitive`` is set) end up in one destination
    folder. Artist folders, and their sub-folders, that are empty afterwards
    are removed on a best-effort basis.

    Returns:
        The concatenated action lists returned by ``merge_dir_into``; an
        empty list when ``artists_root`` has no eligible sub-folder.
    """

    def visible_subdirs(parent: Path) -> List[Path]:
        # Sub-directories of `parent`, sorted, minus dot-folders unless requested.
        found = []
        for entry in parent.iterdir():
            if not entry.is_dir():
                continue
            if entry.name.startswith(".") and not include_hidden:
                continue
            found.append(entry)
        return sorted(found)

    artist_dirs = visible_subdirs(artists_root)
    if not artist_dirs:
        print("No artist folders found under:", artists_root)
        return []

    all_actions: List[Tuple] = []
    print(f"Found {len(artist_dirs)} artist folders.")

    for artist in artist_dirs:
        model_dirs = visible_subdirs(artist)
        if not model_dirs:
            continue

        print(f"\nArtist: {artist.name}  ({len(model_dirs)} models)")
        for model in model_dirs:
            name = model.name.strip()
            name = name.lower() if case_insensitive else name

            dst_model = dest_root / name
            acts = merge_dir_into(model, dst_model, include_hidden=include_hidden)
            all_actions += acts
            moved_ct = len([a for a in acts if a[0] == "move"])
            print(f"  ‚Üí {model.name} merged into {dst_model.name} | files moved: {moved_ct}")

        # Prune what is left: deepest folders first, the artist folder last.
        leftovers = [d for d in artist.rglob("*") if d.is_dir()]
        leftovers.sort(reverse=True)
        for folder in leftovers + [artist]:
            try:
                if not any(folder.iterdir()):
                    folder.rmdir()
            except OSError:
                # Best effort only: a folder that cannot be listed/removed stays.
                pass

    print("\n‚úÖ Done. Total actions:", len(all_actions))
    return all_actions


In [29]:
# Run the flatten/merge for real and report what was moved.
actions = flatten_artists_to_dataset(
    ARTISTS_ROOT,
    DEST_ROOT,
    case_insensitive=CASE_INSENSITIVE,
    include_hidden=INCLUDE_HIDDEN,
)

# Keep only the file moves; "mkdir" records are not listed here.
moves = [entry for entry in actions if entry[0] == "move"]
print(f"\nTotal files moved: {len(moves)}")

# Print at most the first 20 moves to keep the cell output short.
for _, moved_from, moved_to in moves[:20]:
    print("MOVE:", moved_from, "‚Üí", moved_to)
if len(moves) > 20:
    print("... (showing first 20)")

print("‚úÖ All artist model images are now under:", DEST_ROOT)


Found 5 artist folders.

Artist: animals  (36 models)
  ‚Üí armadillo  merged into  armadillo   | files moved: 8
  ‚Üí bear  merged into  bear   | files moved: 106
  ‚Üí camel  merged into  camel   | files moved: 34
  ‚Üí cat  merged into  cat   | files moved: 90
  ‚Üí chameleon  merged into  chameleon   | files moved: 5
  ‚Üí cow  merged into  cow   | files moved: 85
  ‚Üí crab  merged into  crab   | files moved: 81
  ‚Üí crocodile  merged into  crocodile   | files moved: 9
  ‚Üí deer  merged into  deer   | files moved: 16
  ‚Üí dog  merged into  dog   | files moved: 100
  ‚Üí elephant  merged into  elephant   | files moved: 67
  ‚Üí fish  merged into  fish   | files moved: 90
  ‚Üí fox  merged into  fox   | files moved: 19
  ‚Üí frog  merged into  frog   | files moved: 69
  ‚Üí giraffe  merged into  giraffe   | files moved: 54
  ‚Üí gorilla  merged into  gorilla   | files moved: 18
  ‚Üí grasshopper  merged into  grasshopper   | files moved: 59
  ‚Üí hippo  merged into  hippo   | fil

In [13]:
import os, re, shutil
from typing import List, Tuple, Optional
from pathlib import Path

YEAR_RE = re.compile(r"\b(19|20)\d{2}\b")

# Separators that are treated like a space when comparing names.
_SEPARATORS_TO_SPACE = str.maketrans({"_": " ", "-": " "})

def normalize_name(s: str) -> str:
    """Reduce a folder name to lowercase words separated by single spaces.

    Stand-alone years (19xx/20xx), all other digit runs, underscores,
    hyphens and punctuation are removed or turned into spaces, then
    whitespace is collapsed and trimmed.
    """
    text = YEAR_RE.sub("", s.lower())        # drop stand-alone years such as 2019
    text = re.sub(r"\d+", " ", text)         # drop remaining digits, e.g. "rabbit2"
    text = text.translate(_SEPARATORS_TO_SPACE)
    text = re.sub(r"[^\w\s]", " ", text)     # punctuation becomes a space
    return re.sub(r"\s+", " ", text).strip()

def singularize(token: str) -> str:
    """Strip a plural ending from ``token`` using three suffix rules.

    ``-ies`` becomes ``-y``, ``-ves`` becomes ``-f``, and a trailing ``s`` is
    dropped unless the word ends in ``ss``. Anything else is returned as is.
    The rules are purely textual, so they also fire on singular words that
    happen to end in ``s``.
    """
    for plural_end, singular_end in (("ies", "y"), ("ves", "f")):
        if token.endswith(plural_end):
            return token[:-3] + singular_end
    if token.endswith("s") and not token.endswith("ss"):
        return token[:-1]
    return token

def safe_target_path(dst_dir: Path, fname: str) -> Path:
    """Return a path inside ``dst_dir`` for ``fname`` that does not exist yet.

    If ``dst_dir / fname`` is free it is returned unchanged. Otherwise a
    `` (k)`` counter (k = 1, 2, ...) is inserted before the file extension
    until an unused name is found.

    The name is split at its final extension, so ``"a.b.jpg"`` becomes
    ``"a.b (1).jpg"`` (splitting at the first dot would produce
    ``"a (1).b.jpg"``), and dot-files such as ``".hidden"`` keep their name
    intact.
    """
    name = Path(fname)
    stem, suffix = name.stem, name.suffix
    candidate = dst_dir / fname
    k = 1
    while candidate.exists():
        candidate = dst_dir / f"{stem} ({k}){suffix}"
        k += 1
    return candidate

def move_dir_into(src: Path, dst: Path, *, include_hidden: bool = False) -> List[Tuple]:
    """Merge the file tree under ``src`` into ``dst`` by moving files.

    Nothing is overwritten: when a target name is taken, the file is moved to
    the alternative name chosen by ``safe_target_path``. Only files are
    moved; the (possibly emptied) source directories stay behind.

    Args:
        src: Root of the tree to move.
        dst: Destination root; created when missing.
        include_hidden: When False, dot-files are skipped and dot-folders
            are not walked.

    Returns:
        A log of ``("mkdir", path)`` entries for directories that had to be
        created and ``("move", source, target)`` entries for moved files,
        in the order they happened.
    """
    log: List[Tuple] = []

    if not dst.exists():
        dst.mkdir(parents=True, exist_ok=True)
        log.append(("mkdir", dst))

    for current, subdirs, filenames in os.walk(src):
        if not include_hidden:
            # Pruning in place stops os.walk from entering hidden folders.
            subdirs[:] = [name for name in subdirs if not name.startswith(".")]

        current_path = Path(current)
        mirror = dst / current_path.relative_to(src)
        if not mirror.exists():
            mirror.mkdir(parents=True, exist_ok=True)
            log.append(("mkdir", mirror))

        for filename in filenames:
            if filename.startswith(".") and not include_hidden:
                continue
            origin = current_path / filename
            target = mirror / filename
            if target.exists():
                target = safe_target_path(mirror, filename)
            shutil.move(str(origin), str(target))
            log.append(("move", origin, target))

    return log

def remove_empty_dirs(root: Path):
    """Delete every empty directory below ``root`` (``root`` itself is kept).

    Directories are visited in reverse sorted order so children are handled
    before their parents. Any error while listing or removing a directory is
    ignored: this is a best-effort clean-up.
    """
    subdirs = [entry for entry in root.rglob("*") if entry.is_dir()]
    subdirs.sort(reverse=True)
    for folder in subdirs:
        try:
            is_empty = not any(folder.iterdir())
        except Exception:
            # Could not list the directory; leave it alone.
            continue
        if not is_empty:
            continue
        try:
            folder.rmdir()
        except OSError:
            pass

# --- vocab & bucketing ---

# Multi-word names that pick_bucket checks before any single-token match.
# pick_bucket runs each phrase through normalize_name first, so the hyphenated
# and spaced spellings of the same phrase collapse to one key.
MULTIWORD = {
    "flying squirrel","walking stick","horse fly","ladybird beetle",
    "great horned owl","red tailed hawk","red-tailed hawk","giant stag beetle",
    "white rhinoceros","giant anteater","bactrian camel","humpback whale",
    "great white shark","peacock spider","paper wasp","thread-sail filefish","threadsail filefish",
    "long tailed tit","long-tailed tit","japanese macaque","giant water bug"
}

# Single-word vocabulary for the token-based match in pick_bucket.
# NOTE(review): "otter", "anteater" and "camel" are listed twice (harmless in
# a set literal), and "sea lion" contains a space, so it can never equal one
# whitespace-split token.
ANIMAL_TOKENS = {
    "rabbit","hare","lion","tiger","leopard","cheetah","panther","jaguar","cat","kitten","puma","lynx",
    "dog","wolf","fox","hound","coyote","jackal","bear","panda","raccoon","weasel","otter","skunk","badger",
    "elephant","mammoth","muskox","bison","buffalo","boar","hog","pig","hippopotamus","hippo",
    "deer","elk","moose","reindeer","antelope","gazelle","goat","sheep","ram","ewe","cattle","cow","bull","yak",
    "horse","stallion","mare","zebra","camel","llama","alpaca","giraffe",
    "monkey","macaque","gorilla","chimpanzee","orangutan","lemur",
    "mouse","rat","hamster","gerbil","squirrel","chipmunk","shrew","hedgehog","porcupine","beaver",
    "koala","kangaroo","wallaby","wombat","platypus","echidna","sloth","anteater","armadillo",
    "dolphin","whale","seal","sea lion","porpoise","otter",
    "rhinoceros","filefish","stag","anteater","camel"
}

# Bird vocabulary for the token-based match in pick_bucket.
BIRD_TOKENS = {
    "bird","eagle","hawk","falcon","buzzard","vulture","owl","heron","crane","swan","duck","goose","gull",
    "penguin","albatross","tern","cormorant","kingfisher","hummingbird","sparrow","finch","swallow","robin",
    "wren","warbler","jay","mockingbird","cardinal","toucan","woodpecker","ostrich","kiwi","puffin","kite","cuckoo",
    "tit"
}

# Insect / arthropod vocabulary for the token-based match in pick_bucket.
# NOTE(review): generic words such as "stick", "walking" and "leaf" are
# included, so any folder containing one of them is classified here.
INSECT_ARTHROPOD_TOKENS = {
    "insect","butterfly","moth","bee","wasp","hornet","ant","termite","beetle","cicada","dragonfly",
    "damselfly","grasshopper","katydid","locust","mantis","cockroach","stick","walking","leaf","silverfish",
    "scorpion","spider","tarantula","tick","mite","lobster","crab","prawn","shrimp","horsefly","ladybird","ladybug",
    "bug"
}

# Phrase -> canonical bucket name. Used twice by pick_bucket: to translate a
# matched MULTIWORD phrase, and as a second pass over phrases that are not in
# MULTIWORD (e.g. "lazy rabbit"). Keys are normalized before matching.
SPECIAL_ALIASES = {
    "lazy rabbit": "rabbit",
    "fox cub": "fox",
    "red fox": "fox",
    "flying squirrel": "squirrel",
    "japanese macaque": "macaque",
    "white rhinoceros": "rhinoceros",
    "great white shark": "shark",
    "humpback whale": "whale",
    "giant stag beetle": "stag beetle",
    "ladybird beetle": "ladybird",
    "paper wasp": "wasp",
    "peacock spider": "spider",
    "thread sail filefish": "filefish",
    "threadsail filefish": "filefish",
    "long tailed tit": "tit",
    "long-tailed tit": "tit",
}

def pick_bucket(folder_name: str) -> Optional[str]:
    """Choose the bucket (canonical species name) for a folder name.

    Matching is done on ``normalize_name(folder_name)`` in three passes:

    1. phrases from ``MULTIWORD``, longest first; a hit is translated through
       ``SPECIAL_ALIASES`` when an alias exists, otherwise the phrase itself
       is the bucket;
    2. phrases from ``SPECIAL_ALIASES``;
    3. single tokens found in one of the vocabularies; the longest matching
       token wins.

    Returns:
        The bucket name, or ``None`` when nothing matches.
    """
    norm = normalize_name(folder_name)

    def contains_phrase(phrase: str) -> bool:
        # Whole-word match of an already-normalized phrase inside `norm`.
        return re.search(rf"\b{re.escape(phrase)}\b", norm) is not None

    # Multiword phrases first
    for mw in sorted(MULTIWORD, key=len, reverse=True):
        key = normalize_name(mw)
        if contains_phrase(key):
            return SPECIAL_ALIASES.get(key, key)

    # Aliases
    for phrase, canon in SPECIAL_ALIASES.items():
        if contains_phrase(normalize_name(phrase)):
            return canon

    # Token-based match. The token is tried as written before its singular
    # form: singularize() also strips the final "s" of singular names such as
    # "rhinoceros", "hippopotamus", "platypus" or "mantis", which would
    # otherwise never be found in the vocabularies.
    best = None
    for tok in norm.split():
        for cand in (tok, singularize(tok)):
            if cand in ANIMAL_TOKENS or cand in BIRD_TOKENS or cand in INSECT_ARTHROPOD_TOKENS:
                if best is None or len(cand) > len(best):
                    best = cand
                break
    return best


Found 1182 artist folders.

Done (dry_run = True ). Total actions: 0

Planned moves: 0


In [15]:
# Collect the top-level directories of DEST_ROOT (dot-folders only on request).
dirs = []
for p in DEST_ROOT.iterdir():
    if not p.is_dir():
        continue
    if p.name.startswith(".") and not INCLUDE_HIDDEN:
        continue
    dirs.append(p)

if CASE_INSENSITIVE:
    # One entry per lowercase name; the first spelling encountered is kept.
    seen = {}
    for d in dirs:
        stripped = d.name.strip()
        seen.setdefault(stripped.lower(), stripped)
    unique_names = sorted(seen.values(), key=str.lower)
else:
    # Exact (whitespace-stripped) names, ordered case-insensitively for display.
    unique_names = sorted({d.name.strip() for d in dirs}, key=str.lower)

print(f"Found {len(unique_names)} unique folders in DEST_ROOT.\n")
for name in unique_names:
    print(name)



Found 1182 artist folders.

Done (dry_run = False ). Total actions: 0
‚úÖ Flatten & merge complete.


In [17]:
# Re-bucket DEST_ROOT in place: every top-level folder whose name pick_bucket
# can classify is merged into DEST_ROOT/<bucket>.

# Snapshot of the top-level dirs taken BEFORE any move, so bucket folders
# created during the loop are not revisited.
candidates = [
    p for p in DEST_ROOT.iterdir()
    if p.is_dir() and (INCLUDE_HIDDEN or not p.name.startswith("."))
]

actions: List[Tuple] = []
moved_files = 0
bucket_names_seen = set()

for src in sorted(candidates, key=lambda p: p.name.lower()):
    bucket = pick_bucket(src.name)
    if not bucket:
        # Could not classify: leave the folder where it is.
        continue

    dst_bucket = DEST_ROOT / bucket
    # Record the bucket whether it already existed or receives files now;
    # recording it only in the "already in place" branch under-reports the
    # buckets used in the summary below.
    bucket_names_seen.add(bucket)

    # If the source directory *is* the bucket directory already, there is nothing to move.
    if src.resolve() == dst_bucket.resolve():
        continue

    # Move the entire src tree into the bucket (creates bucket if needed)
    acts = move_dir_into(src, dst_bucket, include_hidden=INCLUDE_HIDDEN)
    actions.extend(acts)
    moved_files += sum(1 for a in acts if a[0] == "move")

    # Best-effort clean-up of the emptied source tree; a folder that still
    # holds something (e.g. skipped hidden files) is simply left in place.
    try:
        remove_empty_dirs(src)
        if src.is_dir() and not any(src.iterdir()):
            src.rmdir()
    except OSError:
        pass

print("\n‚úÖ In-place reorg complete.")
print("Files moved:", moved_files)
print("Buckets created/used (seen):", sorted({(DEST_ROOT / b).name for b in bucket_names_seen}))
print("Total actions recorded:", len(actions))

# Show a sample of moves
sample = [a for a in actions if a[0] == "move"][:20]
for a in sample:
    print("MOVE:", a[1], "‚Üí", a[2])
if len(sample) == 20:
    print("... (showing first 20)")


Found 1182 unique folders.


In [53]:
# List every folder found directly inside the animals_birds_insects directory.

# Must point at the same folder the earlier cells worked on.
ANIMAL_ROOT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/animals_birds_insects").expanduser().resolve()

if not ANIMAL_ROOT.is_dir():
    print(f"‚ùå Folder not found: {ANIMAL_ROOT}")
else:
    folders = sorted(p for p in ANIMAL_ROOT.iterdir() if p.is_dir())
    print(f"‚úÖ Found {len(folders)} folders inside '{ANIMAL_ROOT.name}'\n")

    # One folder name per line
    for entry in folders:
        print(entry.name)


‚úÖ Found 101 folders inside 'animals_birds_insects'

ant
antelope
armadillo
bear
beaver
bee
beetle
bird
bison
boar
buffalo
bull
butterfly
camel
cardinal
cat
chipmunk
cicada
cockroach
cow
crab
crane
cricket
cuckoo
deer
dog
dolphin
dragonfly
duck
eagle
elephant
elk
falcon
fish
fox
gerbil
giant anteater
giant water bug
giraffe
gorilla
grasshopper
hamster
hawk
hedgehog
heron
hippo
horse
horsefly
hummingbird
insect
kangaroo
katydid
kingfisher
koala
ladybird
leopard
lion
lobster
mantis
mockingbird
monkey
moose
moth
mouse
muskox
orangutan
ostrich
otter
owl
panda
penguin
pig
prawn
puffin
rabbit
raccoon
rat
reindeer
rhinoceros
scorpion
seal
shark
sheep
shrew
skunk
sparrow
spider
squirrel
swallow
swan
tarantula
tiger
tit
toucan
turtle
vulture
wasp
weasel
whale
wolf
woodpecker


In [55]:
# Phase 2: Group breed/variant folders into their species folder inside animals_birds_insects

# (the loop that follows reads ANIMAL_ROOT, which must already be defined by an earlier cell)


# Master species list: a folder whose lowercased name is in this list is
# treated as a species folder and left alone by the loop that follows.
SPECIES_LIST = [
    "ant","antelope","armadillo","bear","beaver","bee","beetle","bird","bison","boar","buffalo","bull","butterfly",
    "camel","cardinal","cat","chipmunk","cicada","cockroach","cow","crab","crane","cricket","cuckoo","deer","dog",
    "dolphin","dragonfly","duck","eagle","elephant","elk","falcon","fish","fox","gerbil","giant anteater",
    "giant water bug","giraffe","gorilla","grasshopper","hamster","hawk","hedgehog","heron","hippo","horse",
    "horsefly","hummingbird","insect","kangaroo","katydid","kingfisher","koala","ladybird","leopard","lion",
    "lobster","mantis","mockingbird","monkey","moose","moth","mouse","muskox","orangutan","ostrich","otter","owl",
    "panda","penguin","pig","prawn","puffin","rabbit","raccoon","rat","reindeer","rhinoceros","scorpion","seal",
    "shark","sheep","shrew","skunk","sparrow","spider","squirrel","swallow","swan","tarantula","tiger","tit",
    "toucan","turtle","vulture","wasp","weasel","whale","wolf","woodpecker"
]

# Lowercased name -> name as spelled in SPECIES_LIST, for membership tests.
SPECIES_MAP = {s.lower(): s for s in SPECIES_LIST}

# Species -> keywords that identify a breed/variant of it (can expand anytime).
# NOTE(review): the loop that follows matches these as plain substrings and
# takes the first species in dict order, so short or shared keywords ("red",
# "sea", "cub", "black", "arctic") can select an unintended species.
BREED_KEYWORDS = {
    "dog": ["hound","retriever","terrier","spaniel","poodle","beagle","mastiff","bulldog","dalmatian","greyhound","shepherd"],
    "cat": ["siamese","persian","tabby","ragdoll","maine","kitten"],
    "horse": ["stallion","mare","pony","foal","colt","filly"],
    "lion": ["cub","male lion","female lion"],
    "tiger": ["cub","bengal","siberian"],
    "fox": ["red","arctic","fennec","cub","kit"],
    "bear": ["polar","grizzly","black","brown"],
    "rabbit": ["bunny","hare"],
    "eagle": ["golden","bald","harpy","sea"],
    "owl": ["barn","snowy","horned"],
    "penguin": ["emperor","king","adelie"],
    "wolf": ["arctic","grey","gray","timber"],
    "deer": ["elk","reindeer","moose"],
    "squirrel": ["flying","ground"],
    "spider": ["tarantula","widow","orb","jumping"],
}

import os, re, shutil

DRY_RUN = True  # preview first; nothing on disk is touched while this is True

moved = []      # (folder name, target species) for every planned/performed move
unmatched = []  # folder names that matched neither a species nor a breed keyword

# loop through each folder in animals_birds_insects
for folder in sorted([p for p in ANIMAL_ROOT.iterdir() if p.is_dir()]):
    name = folder.name.lower()

    # Skip the main species folders themselves
    if name in SPECIES_MAP:
        continue

    # Direct match: the first species (in SPECIES_LIST order) that occurs as a
    # whole word in the folder name. re.escape keeps the species text literal.
    target = None
    for s in SPECIES_LIST:
        if re.search(rf"\b{re.escape(s)}\b", name):
            target = s
            break

    # Fallback: the first species (in BREED_KEYWORDS order) with a keyword
    # that occurs as a substring of the folder name.
    if not target:
        for species, words in BREED_KEYWORDS.items():
            if any(w in name for w in words):
                target = species
                break

    if target:
        dest = ANIMAL_ROOT / target
        print(f"‚Üí {folder.name}  ‚Üí  {target}")
        if not DRY_RUN:
            # Create the species folder only on a real run, so a dry run
            # leaves the filesystem untouched.
            dest.mkdir(exist_ok=True)
            shutil.move(str(folder), str(dest / folder.name))
        moved.append((folder.name, target))
    else:
        unmatched.append(folder.name)

print("\nSummary:")
print(f"  Total moved (planned): {len(moved)}")
print(f"  Unmatched folders: {len(unmatched)}")

if unmatched:
    print("\nUnmatched examples:")
    for u in unmatched[:20]:
        print(" ", u)

if DRY_RUN:
    print("\nüü° DRY_RUN=True ‚Äî only previewed.  Set DRY_RUN=False and re-run this cell to actually move them.")




Summary:
  Total moved (planned): 0
  Unmatched folders: 0

üü° DRY_RUN=True ‚Äî only previewed.  Set DRY_RUN=False and re-run this cell to actually move them.


In [57]:
# List every folder found directly inside the animals_birds_insects directory.

# Must point at the same folder the earlier cells worked on.
ANIMAL_ROOT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/animals_birds_insects").expanduser().resolve()

if not ANIMAL_ROOT.is_dir():
    print(f"‚ùå Folder not found: {ANIMAL_ROOT}")
else:
    folders = sorted(p for p in ANIMAL_ROOT.iterdir() if p.is_dir())
    print(f"‚úÖ Found {len(folders)} folders inside '{ANIMAL_ROOT.name}'\n")

    # One folder name per line
    for entry in folders:
        print(entry.name)

‚úÖ Found 96 folders inside 'animals_birds_insects'

ant
antelope
armadillo
bear
beaver
bee
beetle
bird
bison
boar
buffalo
butterfly
camel
cardinal
cat
chipmunk
cicada
cockroach
cow
crab
crane
cricket
cuckoo
deer
dog
dolphin
dragonfly
duck
eagle
elephant
falcon
fish
fox
gerbil
giant anteater
giraffe
gorilla
grasshopper
hamster
hawk
hedgehog
heron
hippo
horse
horsefly
hummingbird
insect
kangaroo
katydid
kingfisher
koala
ladybird
leopard
lion
lobster
mantis
mockingbird
monkey
moth
mouse
muskox
orangutan
ostrich
otter
owl
panda
penguin
pig
prawn
puffin
rabbit
raccoon
rat
rhinoceros
scorpion
seal
shark
sheep
shrew
skunk
sparrow
spider
squirrel
swallow
swan
tarantula
tiger
tit
toucan
turtle
vulture
wasp
weasel
whale
wolf
woodpecker


In [63]:
# With DRY_RUN False the loop that follows really moves files and removes
# emptied folders; set it to True to get a preview only.
DRY_RUN = False
# File suffixes (compared lowercased) that count as images.
IMAGE_EXTS = {".jpg",".jpeg",".png",".webp",".gif",".bmp",".tif",".tiff"}

# Species folders whose images may be flattened into the generic "bird" folder.
BIRD_SPECIES = {
    "bird","cardinal","crane","cuckoo","duck","eagle","falcon","hawk","heron","hummingbird",
    "kingfisher","mockingbird","ostrich","owl","penguin","puffin","sparrow","swallow","swan",
    "tit","toucan","vulture","woodpecker"
}
# Species folders that go to the generic "fish" folder. The set also holds
# non-fish water animals (crab, dolphin, lobster, prawn, seal, turtle, whale).
FISH_SPECIES = {"fish","crab","dolphin","lobster","prawn","seal","shark","turtle","whale"}
# Species folders that go to the generic "insect" folder (also holds
# arachnids: scorpion, spider, tarantula).
INSECT_SPECIES = {
    "insect","ant","bee","beetle","butterfly","cicada","cockroach","cricket","dragonfly",
    "grasshopper","horsefly","katydid","ladybird","mantis","scorpion","spider","tarantula",
    "wasp","giant water bug"
}

def count_images(p: Path) -> int:
    """Count the files under ``p`` (recursively) whose lowercased suffix is in IMAGE_EXTS."""
    total = 0
    for entry in p.rglob("*"):
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS:
            total += 1
    return total

def safe_target(base: Path, name: str) -> Path:
    """Return a path inside ``base`` for ``name`` that does not exist yet.

    ``base / name`` is returned when free; otherwise `` (k)`` (k = 1, 2, ...)
    is inserted before the file extension until the name is unused.

    The name is split at its final extension, so ``"a.b.jpg"`` becomes
    ``"a.b (1).jpg"`` (splitting at the first dot would produce
    ``"a (1).b.jpg"``).
    """
    parsed = Path(name)
    stem, suffix = parsed.stem, parsed.suffix
    candidate = base / name
    k = 1
    while candidate.exists():
        candidate = base / f"{stem} ({k}){suffix}"
        k += 1
    return candidate

# Flatten species folders holding fewer than 2 images into their generic
# category folder (bird/, fish/ or insect/) directly under ANIMAL_ROOT.
moved, skipped, errors = [], [], []

for species_dir in sorted([p for p in ANIMAL_ROOT.iterdir() if p.is_dir()], key=lambda x: x.name.lower()):
    name = species_dir.name.lower()
    # The generic category folders are destinations, never sources.
    if name in {"bird","fish","insect"}:
        continue

    # determine category
    if name in BIRD_SPECIES:
        cat = "bird"
    elif name in FISH_SPECIES:
        cat = "fish"
    elif name in INSECT_SPECIES:
        cat = "insect"
    else:
        skipped.append((species_dir.name,"not bird/fish/insect"))
        continue

    img_count = count_images(species_dir)
    if img_count >= 2:
        skipped.append((species_dir.name,f"{img_count} images (>=2)"))
        continue

    dest_folder = ANIMAL_ROOT / cat
    print(f"‚Üí {species_dir.name}: {img_count} image{'s' if img_count!=1 else ''} ‚Üí flatten into {cat}/")

    # Snapshot the files before moving anything: changing a directory while
    # it is still being scanned makes it unspecified which entries are seen.
    images = [
        f for f in species_dir.rglob("*")
        if f.is_file() and f.suffix.lower() in IMAGE_EXTS
    ]

    # The category folder may not exist yet; create it on a real run only.
    if images and not DRY_RUN:
        dest_folder.mkdir(parents=True, exist_ok=True)

    # move files directly into the category folder (no subfolder)
    for f in images:
        target = safe_target(dest_folder, f.name)
        if not DRY_RUN:
            shutil.move(str(f), str(target))
        moved.append((f.name, cat))

    # clean up empty dirs, deepest first, then the species folder itself
    if not DRY_RUN:
        for p in sorted(species_dir.rglob("*"), reverse=True):
            if p.is_dir() and not any(p.iterdir()):
                p.rmdir()
        try:
            if not any(species_dir.iterdir()):
                species_dir.rmdir()
        except OSError:
            pass

# --- Summary ---
print("\n=== Summary ===")
print(f"DRY_RUN: {DRY_RUN}")
print(f"Moved {len(moved)} image files into category folders.")
print(f"Skipped {len(skipped)} species folders (>=2 images or not bird/fish/insect).")
if DRY_RUN:
    print("üü° Preview only. Set DRY_RUN=False and re-run to apply.")

‚Üí bee: 1 image ‚Üí flatten into insect/
‚Üí cuckoo: 1 image ‚Üí flatten into bird/
‚Üí falcon: 1 image ‚Üí flatten into bird/
‚Üí horsefly: 1 image ‚Üí flatten into insect/
‚Üí ladybird: 1 image ‚Üí flatten into insect/
‚Üí mockingbird: 1 image ‚Üí flatten into bird/
‚Üí swan: 1 image ‚Üí flatten into bird/
‚Üí toucan: 1 image ‚Üí flatten into bird/
‚Üí vulture: 1 image ‚Üí flatten into bird/
‚Üí woodpecker: 1 image ‚Üí flatten into bird/

=== Summary ===
DRY_RUN: False
Moved 10 image files into category folders.
Skipped 83 species folders (>=2 images or not bird/fish/insect).


In [87]:
# List every folder found directly inside the animals_birds_insects directory.

# Must point at the same folder the earlier cells worked on.
ANIMAL_ROOT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/animals_birds_insects").expanduser().resolve()

if not ANIMAL_ROOT.is_dir():
    print(f"‚ùå Folder not found: {ANIMAL_ROOT}")
else:
    folders = sorted(p for p in ANIMAL_ROOT.iterdir() if p.is_dir())
    print(f"‚úÖ Found {len(folders)} folders inside '{ANIMAL_ROOT.name}'\n")

    # One folder name per line
    for entry in folders:
        print(entry.name)

‚úÖ Found 101 folders inside 'animals_birds_insects'

ant
antelope
armadillo
bat
bear
beaver
beetle
bird
bison
boar
buffalo
butterfly
camel
cardinal
cat
chameleon
chipmunk
cicada
cockroach
cow
crab
crane
cricket
crocodile
deer
dog
dolphin
dragonfly
duck
eagle
elephant
fish
fly
fox
frog
gerbil
giant anteater
giraffe
gorilla
grasshopper
hamster
hawk
hedgehog
heron
hippo
horse
hummingbird
insect
kangaroo
katydid
kingfisher
koala
leopard
lion
lizard
lobster
mantis
monkey
moth
mouse
muskox
orangutan
ostrich
otter
owl
panda
parrot
peacock
pelican
penguin
pig
prawn
puffin
rabbit
raccoon
rat
rhinoceros
rooster
scorpion
seal
shark
sheep
shrew
skunk
snail
snake
sparrow
spider
squirrel
swallow
swan
tarantula
tiger
tit
tortoise
turtle
wasp
weasel
whale
wolf
zebra


In [83]:
# File suffixes (compared lowercased) that count as images.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tif", ".tiff", ".gif"}

def count_images(folder):
    """Return how many files under ``folder`` (recursively) have a suffix in IMAGE_EXTS."""
    matches = [
        entry for entry in folder.rglob("*")
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS
    ]
    return len(matches)

# (folder name, image count) for each top-level folder, ordered by lowercased name.
totals = [
    (p.name, count_images(p))
    for p in sorted(
        (x for x in ANIMAL_ROOT.iterdir() if x.is_dir()),
        key=lambda x: x.name.lower(),
    )
]

# Per-folder table
print(f"üìÅ Image counts in '{ANIMAL_ROOT.name}':\n")
for name, n in totals:
    print(f"{name:<25}  {n:>5} images")

# Sum over all folders
grand = sum(count for _name, count in totals)
print("\nTotal images across all folders:", grand)

üìÅ Image counts in 'animals_birds_insects':

ant                           10 images
antelope                      11 images
armadillo                      9 images
bat                          114 images
bear                         113 images
beaver                         1 images
beetle                       194 images
bird                          39 images
bison                         28 images
boar                          13 images
buffalo                        6 images
butterfly                    115 images
camel                         45 images
cardinal                      12 images
cat                          162 images
chameleon                      5 images
chipmunk                       7 images
cicada                        23 images
cockroach                     12 images
cow                          116 images
crab                         108 images
crane                         31 images
cricket                        2 images
crocodile                      9 

In [69]:

# File suffixes (compared lowercased) that count as images.
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp", ".tif", ".tiff", ".gif"}

# Every image file anywhere below ARTISTS_ROOT, whatever its nesting depth.
total_images = len([
    f for f in ARTISTS_ROOT.rglob("*")
    if f.is_file() and f.suffix.lower() in IMAGE_EXTS
])

print(f"üìÇ Total images in '{ARTISTS_ROOT.name}': {total_images}")

üìÇ Total images in 'origami-artists': 3594


In [81]:

# Keep an ORIGAMI_ROOT that an earlier cell may already have defined; fall
# back to the hard-coded location only when the name does not exist yet.
try:
    _ = ORIGAMI_ROOT
except NameError:
    ORIGAMI_ROOT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/origami").expanduser().resolve()

# Normalize whatever value was found (str or Path) to an absolute Path.
ORIGAMI_ROOT = Path(ORIGAMI_ROOT).expanduser().resolve()
assert ORIGAMI_ROOT.exists() and ORIGAMI_ROOT.is_dir(), f"Missing source 'origami' folder: {ORIGAMI_ROOT}"

# Source category folders whose species sub-folders are merged into ANIMAL_ROOT.
# NOTE(review): the merge loop only uses the paths; the keys are not read there.
SRC_DIRS = {
    "animal": ORIGAMI_ROOT / "animals",
    "insect": ORIGAMI_ROOT / "insects",
    "birds":  ORIGAMI_ROOT / "birds",
}

DRY_RUN = False  # False: images are really copied; set True for a preview only
# File suffixes (compared lowercased) that count as images.
IMAGE_EXTS = {".jpg",".jpeg",".png",".webp",".gif",".bmp",".tif",".tiff"}

# --- Utility helpers ---
def normalize(s: str) -> str:
    """Trim and lowercase ``s``, turning each run of whitespace, "_" or "-" into one space."""
    lowered = s.strip().lower()
    return re.sub(r"[\s_-]+", " ", lowered)

def safe_target(dst: Path, fname: str) -> Path:
    """Return a path inside ``dst`` for ``fname`` that does not exist yet.

    ``dst / fname`` is returned when free; otherwise `` (1)``, `` (2)``, ...
    is inserted before the file extension until the name is unused.

    The name is split at its final extension, so ``"a.b.jpg"`` becomes
    ``"a.b (1).jpg"`` (splitting at the first dot would produce
    ``"a (1).b.jpg"``).
    """
    parsed = Path(fname)
    stem, suffix = parsed.stem, parsed.suffix
    candidate = dst / fname
    k = 1
    while candidate.exists():
        candidate = dst / f"{stem} ({k}){suffix}"
        k += 1
    return candidate

def existing_species_dir(dest_root: Path, species: str) -> Optional[Path]:
    """Return the first directory in ``dest_root`` whose name equals ``species`` after ``normalize``.

    Because both names go through ``normalize``, the comparison ignores case,
    surrounding whitespace and the difference between spaces, "_" and "-".
    Returns ``None`` when no directory matches.
    """
    wanted = normalize(species)
    matches = (
        entry for entry in dest_root.iterdir()
        if entry.is_dir() and normalize(entry.name) == wanted
    )
    return next(matches, None)

def copy_images(src_dir: Path, dst_dir: Path):
    """Copy every image found under ``src_dir`` (recursively) flat into ``dst_dir``.

    Target names come from ``safe_target``, so existing files are not
    overwritten. When the module-level ``DRY_RUN`` is true nothing is copied,
    but the files are still counted.

    Returns:
        The number of image files copied (or that would be copied).
    """
    images = (
        entry for entry in src_dir.rglob("*")
        if entry.is_file() and entry.suffix.lower() in IMAGE_EXTS
    )
    copied = 0
    for image in images:
        destination = safe_target(dst_dir, image.name)
        if not DRY_RUN:
            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(image, destination)
        copied += 1
    return copied

# --- Merge logic ---
# For every species folder of every source category, copy its images into the
# species folder of the same (normalized) name under ANIMAL_ROOT.
total_copied = 0
for src_path in SRC_DIRS.values():
    if not src_path.exists():
        print(f"‚ö†Ô∏è Skipping missing folder: {src_path}")
        continue

    species_dirs = sorted(
        (p for p in src_path.iterdir() if p.is_dir()),
        key=lambda x: x.name.lower(),
    )
    for species_dir in species_dirs:
        species_name = species_dir.name

        # Try to find existing destination folder
        dest_species = existing_species_dir(ANIMAL_ROOT, species_name)

        # If missing, use the source folder's own name for a new one.
        # NOTE: the message is printed in dry-run mode too, although the
        # folder is only created on a real run.
        if dest_species is None:
            dest_species = ANIMAL_ROOT / species_name
            if not DRY_RUN:
                dest_species.mkdir(parents=True, exist_ok=True)
            print(f"üÜï Created new species folder: {dest_species.name}")

        # Copy all images into this species folder
        print(f"üìÇ {species_dir.name}  ‚Üí  {dest_species.name}/")
        count = copy_images(species_dir, dest_species)
        total_copied += count
        print(f"   {count} image{'s' if count != 1 else ''} {'(copied)' if not DRY_RUN else '(would copy)'}")

# --- Summary ---
print("\n=== Summary ===")
print(f"DRY_RUN: {DRY_RUN}")
print(f"Total images {'to copy' if DRY_RUN else 'copied'}: {total_copied}")
if DRY_RUN:
    print("\nüü° Set DRY_RUN=False and re-run to actually copy images.")

üìÇ armadillo  ‚Üí  armadillo/
   8 images (copied)
üìÇ bear  ‚Üí  bear/
   106 images (copied)
üìÇ camel  ‚Üí  camel/
   34 images (copied)
üìÇ cat  ‚Üí  cat/
   90 images (copied)
üÜï Created new species folder: chameleon
üìÇ chameleon  ‚Üí  chameleon/
   5 images (copied)
üìÇ cow  ‚Üí  cow/
   85 images (copied)
üìÇ crab  ‚Üí  crab/
   81 images (copied)
üÜï Created new species folder: crocodile
üìÇ crocodile  ‚Üí  crocodile/
   9 images (copied)
üìÇ deer  ‚Üí  deer/
   16 images (copied)
üìÇ dog  ‚Üí  dog/
   100 images (copied)
üìÇ elephant  ‚Üí  elephant/
   67 images (copied)
üìÇ fish  ‚Üí  fish/
   90 images (copied)
üìÇ fox  ‚Üí  fox/
   19 images (copied)
üÜï Created new species folder: frog
üìÇ frog  ‚Üí  frog/
   69 images (copied)
üìÇ giraffe  ‚Üí  giraffe/
   54 images (copied)
üìÇ gorilla  ‚Üí  gorilla/
   18 images (copied)
üìÇ grasshopper  ‚Üí  grasshopper/
   59 images (copied)
üìÇ hippo  ‚Üí  hippo/
   11 images (copied)
üìÇ horse  ‚Üí  horse/
 

In [91]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Auto-map species folder names -> ImageNet WNIDs using words.txt.
Creates:
- species_to_wnid.json
- species_to_wnid_report.csv (with match status & candidate labels)
"""

from pathlib import Path
import csv, json, re, sys

# ==== CONFIG (edit paths if needed) ==========================================
# Project root; every input and output path below is built relative to it.
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# Tab-separated "<wnid>\t<label, label, ...>" file parsed by load_words().
WORDS_TXT = PROJECT / "words.txt"
# Directories scanned, in order, for species sub-folders. species_from_dirs()
# uses the first one that exists and contains at least one sub-directory.
CANDIDATE_DIRS = [
    PROJECT / "animals_birds_insects",                    # animals_birds_insects/<species>/
    PROJECT / "data" / "animals",          # or data/animals/<species>/
]
# Outputs written by main(): the folder -> WNID mapping and a per-folder report.
OUT_JSON = PROJECT / "species_to_wnid.json"
OUT_CSV  = PROJECT / "species_to_wnid_report.csv"

# Optional: hard overrides for ambiguous names you care about.
# Keys are compared against normalize(<folder name>) in main(), so write them
# lower-case with spaces instead of "_" or "-".
OVERRIDES = {
    # "seal": "n02442845",        # earless seal (phocid)
    # "tit": "n01534433",         # great tit
    # "mouse": "n02330245",       # mouse (rodent)
    # "crab": "n01976957",        # crab (generic)
    # "insect": "n02206856",      # insect (generic)
    # "bird": "n01503061",        # bird (generic)
}

# ============================================================================

def load_words(words_path: Path):
    """Parse a tab-separated ``<wnid>\\t<labels>`` file into lookup tables.

    Args:
        words_path: Path of the words file. Each non-empty line holds a WNID,
            optionally followed by a tab and a comma-separated label string.

    Returns:
        A 3-tuple ``(wnid_to_tokens, token_to_wnids, raw_labels)``:
          wnid_to_tokens: {wnid: [lower-cased, stripped label tokens...]}
          token_to_wnids: {token: set(wnids...)}
          raw_labels:     {wnid: "label, label2, ..."} (stripped, original case)
    """
    wnid_to_tokens, token_to_wnids, raw_labels = {}, {}, {}
    # Explicit UTF-8 keeps parsing independent of the platform locale.
    # QUOTE_NONE treats '"' as an ordinary character: with the csv default, a
    # label starting with a double quote would lose its quotes or, if the
    # quote is unbalanced, swallow the following lines into one field.
    with words_path.open("r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            if not row:
                continue  # blank line
            wnid = row[0].strip()
            labels = row[1].strip() if len(row) > 1 else ""
            raw_labels[wnid] = labels
            toks = [t.strip().lower() for t in labels.split(",") if t.strip()]
            wnid_to_tokens[wnid] = toks
            for t in toks:
                token_to_wnids.setdefault(t, set()).add(wnid)
    return wnid_to_tokens, token_to_wnids, raw_labels

def species_from_dirs():
    """Locate the species folders to map.

    Walks CANDIDATE_DIRS in order and returns ``(root, names)`` for the first
    existing directory that has at least one sub-directory, where ``names`` is
    the list of sub-directory names in sorted path order. If no candidate
    qualifies, prints the candidates and exits the process with status 1.
    """
    for candidate in CANDIDATE_DIRS:
        if not (candidate.exists() and candidate.is_dir()):
            continue
        names = [entry.name for entry in sorted(candidate.iterdir()) if entry.is_dir()]
        if names:
            return candidate, names
    print("No species folders found in:", *CANDIDATE_DIRS, sep="\n  ")
    sys.exit(1)

def normalize(name: str) -> str:
    """Return *name* lower-cased, with "_" and "-" turned into spaces,
    surrounding whitespace removed and inner whitespace runs collapsed."""
    spaced = name.replace("_", " ").replace("-", " ")
    lowered = spaced.strip().lower()
    return re.sub(r"\s+", " ", lowered)

def candidates_for(species: str, wnid_to_tokens, raw_labels):
    """Find WNID candidates for a species folder name, strictest match first.

    Args:
        species: Folder name; it is normalized before matching.
        wnid_to_tokens: {wnid: [lower-cased label tokens]} from load_words().
        raw_labels: {wnid: raw comma-separated label string} from load_words().

    Returns:
        ``(candidates, match_type)`` where ``candidates`` is a list of
        ``(wnid, raw_label_string)`` tuples and ``match_type`` is one of
        "exact_token", "exact_phrase", "substring", "loose" or "none".
        The first stage that yields any candidate wins; "none" always comes
        with an empty list.
    """
    sp = normalize(species)
    if not sp:
        # An empty query would match every entry in the later stages.
        return [], "none"

    # 1) exact token match across any label token
    exact = [(wnid, raw_labels[wnid]) for wnid, toks in wnid_to_tokens.items() if sp in toks]
    if exact:
        return exact, "exact_token"

    # 2) exact phrase match among comma-separated labels (after normalization,
    #    so "sea_lion" / "sea-lion" style labels also match)
    exact_phrase = [
        (wnid, labels)
        for wnid, labels in raw_labels.items()
        if sp in [normalize(t) for t in labels.split(",")]
    ]
    if exact_phrase:
        return exact_phrase, "exact_phrase"

    # 3) word-boundary substring search in the whole normalized label string
    pat = re.compile(rf"\b{re.escape(sp)}\b")
    sub = [(wnid, labels) for wnid, labels in raw_labels.items() if pat.search(normalize(labels))]
    if sub:
        return sub, "substring"

    # 4) looser: token startswith (e.g., "kingfish" -> "kingfisher")
    loose = [
        (wnid, raw_labels[wnid])
        for wnid, toks in wnid_to_tokens.items()
        if any(t.startswith(sp) or sp.startswith(t) for t in toks)
    ]
    if loose:
        return loose, "loose"
    # The original `return loose, "loose" if loose else ("", "none")` parsed as
    # `(loose, <conditional>)` and handed back a tuple as the match type.
    return [], "none"

def pick_best(species: str, cands):
    """Choose one WNID from ``cands`` (a list of ``(wnid, labels)`` tuples).

    Candidates whose first comma-separated label equals the normalized species
    name rank ahead of the rest; within each group the shortest label string
    wins, and remaining ties go to the earliest candidate. Returns None when
    ``cands`` is empty.
    """
    target = normalize(species)
    if not cands:
        return None

    def rank(candidate):
        _, labels = candidate
        primary = normalize(labels.split(",")[0]) if labels else ""
        return (int(primary != target), len(labels))

    # min() keeps the first of equally ranked items, like a stable sort would.
    best_wnid, _ = min(cands, key=rank)
    return best_wnid

def main():
    """Map every species folder to a WNID and write the JSON mapping + CSV report.

    Exits with status 1 if WORDS_TXT is missing (or, via species_from_dirs(),
    if no species folders are found). For each folder an OVERRIDES entry wins;
    otherwise candidates_for() / pick_best() choose a WNID. Folders with no
    candidate are stored as None in the mapping and as a "none" row in the
    report.
    """
    words_path = WORDS_TXT
    if not words_path.exists():
        print(f"words.txt not found at: {words_path}")
        sys.exit(1)

    root, species_dirs = species_from_dirs()
    print(f"Found {len(species_dirs)} species folders under: {root}")

    wnid_to_tokens, _token_to_wnids, raw_labels = load_words(words_path)

    mapping = {}
    rows = [("species_folder","match_type","picked_wnid","picked_labels","num_candidates","top_candidates")]
    for sp in species_dirs:
        sp_norm = normalize(sp)
        if sp_norm in OVERRIDES:
            wnid = OVERRIDES[sp_norm]
            labels = raw_labels.get(wnid, "")
            mapping[sp] = wnid
            rows.append((sp, "override", wnid, labels, 1, labels))
            continue

        cands, mtype = candidates_for(sp, wnid_to_tokens, raw_labels)
        if not cands:
            mapping[sp] = None
            rows.append((sp, "none", "", "", 0, ""))
            continue

        wnid = pick_best(sp, cands)
        labels = raw_labels.get(wnid, "")
        # Preview of the first five candidates as "wnid:labels | wnid:labels ...".
        top_preview = " | ".join([f"{w}:{raw_labels[w]}" for w,_ in cands[:5]])
        mapping[sp] = wnid
        rows.append((sp, mtype, wnid, labels, len(cands), top_preview))

    # write outputs -- explicitly UTF-8, because the follow-up cell reads the
    # report with encoding="utf-8" and the locale default may differ.
    OUT_JSON.write_text(json.dumps(mapping, indent=2), encoding="utf-8")
    with OUT_CSV.open("w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerows(rows)

    # summary
    unresolved = [k for k,v in mapping.items() if not v]
    print(f"\nWrote {OUT_JSON} and {OUT_CSV}")
    if unresolved:
        print(f"‚ö†Ô∏è  {len(unresolved)} folders had no match. Open {OUT_CSV} to see them:")
        for u in unresolved[:12]:
            print("  -", u)
        print("Tip: add them to OVERRIDES at top and re-run.")

if __name__ == "__main__":
    main()


Found 99 species folders under: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/animals_birds_insects

Wrote /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid.json and /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid_report.csv


In [95]:
#!/usr/bin/env python3
import csv, json, sys
from pathlib import Path

# --- Defaults (edit in place; this cell does not parse any CLI arguments) ---
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# Report to read and mapping file to (over)write.
CSV_PATH = PROJECT / "species_to_wnid_report.csv"
OUT_JSON = PROJECT / "species_to_wnid.json"

# --- CSV column names produced by the earlier mapper script ---
HDR_SPECIES = "species_folder"
HDR_PICKED  = "picked_wnid"
HDR_TOP     = "top_candidates"

# Optional: hard overrides take precedence (e.g., ambiguous classes).
# Keys are looked up with the lower-cased species folder name.
OVERRIDES = {
    # "seal": "n02442845",
    # "tit": "n01534433",
    # "mouse": "n02330245",
}

def parse_first_candidate(s: str):
    """Return the WNID of the first entry in a '|'-separated 'wnid:labels' preview.

    The text before the first '|' is stripped; if it contains ':', only the
    part before the first ':' is kept. The result is returned when it starts
    with "n"; otherwise (or for an empty/None input) None is returned.
    """
    if not s:
        return None
    entry = s.split("|", 1)[0].strip()
    head, colon, _ = entry.partition(":")
    candidate = head.strip() if colon else entry
    return candidate if candidate.startswith("n") else None

# Build {species_folder: wnid-or-None} from the report CSV.
mapping = {}
with open(CSV_PATH, newline="", encoding="utf-8", errors="ignore") as f:
    # If the header names differ, inspect csv.DictReader(f).fieldnames and
    # adjust the HDR_* constants above.
    for record in csv.DictReader(f):
        species = (record.get(HDR_SPECIES) or "").strip()
        if not species:
            continue

        override_key = species.lower()
        if override_key in OVERRIDES:
            mapping[species] = OVERRIDES[override_key]
            continue

        picked = (record.get(HDR_PICKED) or "").strip()
        if not picked.startswith("n"):
            # No usable picked WNID: fall back to the first previewed candidate.
            picked = parse_first_candidate((record.get(HDR_TOP) or "").strip())
        mapping[species] = picked or None

OUT_JSON.write_text(json.dumps(mapping, indent=2))

unmapped = [name for name, wnid in mapping.items() if not wnid]
total = len(mapping)
mapped = total - len(unmapped)

print(f"‚úÖ Wrote {mapped}/{total} mappings ‚Üí {OUT_JSON}")
if unmapped:
    print("Unmapped species (no candidate parsed):")
    for name in unmapped:
        print("  -", name)
    print("Tip: add these into OVERRIDES and re-run this cell.")


‚úÖ Wrote 99/99 mappings ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid.json


In [97]:
from pathlib import Path
import csv, json

# --- Paths ---
project = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# Report CSV that is read, and mapping JSON that this cell overwrites.
csv_path = project / "species_to_wnid_report.csv"
json_path = project / "species_to_wnid.json"

# --- Column names from your CSV ---
# Species folder name, the WNID picked by the mapper, and the
# " | "-separated "wnid:labels" preview used as a fallback.
HDR_SPECIES = "species_folder"
HDR_PICKED  = "picked_wnid"
HDR_TOP     = "top_candidates"

def parse_first_candidate(s):
    """Extract the leading WNID from a preview such as 'n02129165:lion,... | ...'.

    Returns the text before the first ':' of the first '|'-separated entry
    (or the whole entry when it has no ':'), provided it starts with "n";
    returns None otherwise, including for empty input.
    """
    if not s:
        return None
    head = s.split("|", 1)[0].strip()
    wnid = head.split(":", 1)[0].strip() if ":" in head else head
    if wnid.startswith("n"):
        return wnid
    return None

# --- Convert CSV -> JSON ---
# Each row contributes {species_folder: wnid}; rows without a usable
# picked_wnid fall back to the first entry of top_candidates, else None.
mapping = {}
with open(csv_path, newline="", encoding="utf-8", errors="ignore") as f:
    for record in csv.DictReader(f):
        species = (record.get(HDR_SPECIES) or "").strip()
        if not species:
            continue
        picked = (record.get(HDR_PICKED) or "").strip()
        if not picked.startswith("n"):
            picked = parse_first_candidate((record.get(HDR_TOP) or "").strip())
        mapping[species] = picked or None

# --- Write JSON ---
json_path.write_text(json.dumps(mapping, indent=2))
unmapped = [name for name, wnid in mapping.items() if not wnid]
n_total = len(mapping)
n_mapped = n_total - len(unmapped)
print(f"‚úÖ Wrote {n_mapped} of {n_total} species ‚Üí {json_path}")

if unmapped:
    print("‚ö†Ô∏è Unmapped species:")
    for name in unmapped:
        print("  -", name)


‚úÖ Wrote 99 of 99 species ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid.json


In [7]:
# JUPYTER CELL: simple one-line status per species (done / failed)

from pathlib import Path
import tarfile, tempfile, time, json
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# --- Paths (edit if needed) ---
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# JSON {species: wnid} loaded by run_all().
MAP_PATH = PROJECT / "species_to_wnid.json"
# Each species is extracted into OUT_DIR/<species>/.
OUT_DIR  = PROJECT / "data" / "animals"
# Archives are fetched from BASE_URL/<wnid>.tar.
BASE_URL = "https://image-net.org/data/winter21_whole"

# --- Settings ---
# Thread-pool size used by run_all().
MAX_WORKERS = 4
# Timeout passed to requests.get() for each download attempt.
TIMEOUT = 45
# Number of download attempts per archive before giving up.
RETRIES = 3

def ensure(p: Path):
    """Create directory *p* (and missing parents); do nothing if it already exists."""
    p.mkdir(exist_ok=True, parents=True)
def has_files(p: Path) -> bool:
    """Return True if *p* exists and contains at least one entry."""
    if not p.exists():
        return False
    return next(p.iterdir(), None) is not None

def download_tar(url: str, dst: Path) -> bool:
    """Download *url* to *dst*, retrying on request errors.

    Args:
        url: Archive URL to fetch.
        dst: Destination file; it is (re)opened in "wb" mode on every attempt,
            so a retry overwrites any partial data from the previous one.

    Returns:
        True once the whole body has been written. False immediately on an
        HTTP 404 (no retry), or after RETRIES attempts that each ended in a
        ``requests.RequestException`` (including non-2xx statuses raised by
        ``raise_for_status``). Always a bool, even when RETRIES <= 0.
    """
    for attempt in range(1, RETRIES + 1):
        try:
            # The context manager closes the streamed response, releasing the
            # connection on the 404 early return and on mid-stream failures.
            with requests.get(url, stream=True, timeout=TIMEOUT) as r:
                if r.status_code == 404:
                    return False
                r.raise_for_status()
                with open(dst, "wb") as f:
                    for chunk in r.iter_content(chunk_size=1024*256):
                        if chunk:
                            f.write(chunk)
            return True
        except requests.RequestException:
            if attempt < RETRIES:
                time.sleep(2)  # brief pause before the next attempt
    return False

def extract_tar(tar_path: Path, out_dir: Path) -> int:
    """Extract every regular file of the archive directly into *out_dir*.

    Directory components inside the archive are dropped (each member is
    renamed to its base name before extraction); non-file members are
    skipped. Returns the number of members extracted.
    """
    with tarfile.open(tar_path, "r") as archive:
        regular = [member for member in archive.getmembers() if member.isfile()]
        for member in regular:
            # flatten: keep only the final path component
            member.name = Path(member.name).name
            archive.extract(member, out_dir)
    return len(regular)

def process_species(species: str, wnid: str) -> str:
    """Download and extract one species; return a one-line status string.

    Args:
        species: Folder name; images are extracted into OUT_DIR/<species>/.
        wnid: WNID used to build the archive URL (BASE_URL/<wnid>.tar).

    Returns:
        "<species>: DONE (<n> images)", "<species>: SKIP (already exists)" or
        "<species>: FAILED (<reason>)". Never raises for download/extract
        errors; they are reported in the status instead.

    On any failure the (previously empty or missing) output directory is
    removed again. Otherwise a partially extracted archive would make
    has_files() true and the species would be skipped on the next run.
    """
    import shutil  # local import: only needed for failure cleanup

    if not wnid:
        return f"{species}: FAILED (no WNID)"
    out_dir = OUT_DIR / species
    if has_files(out_dir):
        return f"{species}: SKIP (already exists)"
    ensure(out_dir)

    url = f"{BASE_URL}/{wnid}.tar"
    failure = None
    n = 0
    try:
        # The tar lives in a temp dir so it is deleted whatever happens.
        with tempfile.TemporaryDirectory() as td:
            tar_path = Path(td) / f"{wnid}.tar"
            if not download_tar(url, tar_path):
                failure = "download"
            else:
                try:
                    n = extract_tar(tar_path, out_dir)
                except tarfile.ReadError:
                    failure = "corrupt tar"
    except Exception as e:
        # Worker boundary: report instead of propagating into the thread pool.
        failure = f"{e}"

    if failure is None:
        return f"{species}: DONE ({n} images)"

    # out_dir held no files before this call, so removing it loses nothing.
    shutil.rmtree(out_dir, ignore_errors=True)
    return f"{species}: FAILED ({failure})"

def run_all():
    """Process every species in MAP_PATH in a thread pool and print statuses.

    Prints one status line per species as each worker finishes, then a
    DONE / SKIP / FAILED tally based on the status strings.
    """
    ensure(OUT_DIR)
    mapping = json.loads(MAP_PATH.read_text())
    tasks = list(mapping.items())

    print(f"Starting downloads for {len(tasks)} species...\n")
    results = []

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = [pool.submit(process_species, species, wnid) for species, wnid in tasks]
        for finished in as_completed(pending):
            line = finished.result()
            print(line)
            results.append(line)

    print("\nAll done.")
    # Tally by the keyword embedded in each status line.
    done, skip, failed = (
        sum(tag in line for line in results) for tag in ("DONE", "SKIP", "FAILED")
    )
    print(f"Summary ‚Üí DONE: {done}, SKIP: {skip}, FAILED: {failed}")

run_all()


Starting downloads for 99 species...

antelope: SKIP (already exists)
ant: SKIP (already exists)
bat: DONE (1537 images)
armadillo: DONE (1282 images)
bear: DONE (1688 images)
beaver: DONE (1239 images)
boar: DONE (1233 images)
buffalo: FAILED (download)
beetle: DONE (1488 images)
bird: DONE (2126 images)
bison: DONE (1625 images)
cardinal: FAILED (download)
chameleon: FAILED (download)
cat: DONE (1485 images)
chipmunk: DONE (1255 images)
camel: DONE (1428 images)
butterfly: DONE (2115 images)
crab: FAILED (download)
cockroach: DONE (1157 images)
cicada: DONE (1227 images)
crocodile: SKIP (already exists)
deer: SKIP (already exists)
dog: FAILED (download)
dolphin: SKIP (already exists)
dragonfly: SKIP (already exists)
duck: SKIP (already exists)
eagle: FAILED (download)
elephant: SKIP (already exists)
fish: SKIP (already exists)
fly: SKIP (already exists)
fox: SKIP (already exists)
frog: FAILED (download)
gerbil: SKIP (already exists)
giant anteater: SKIP (already exists)
giraffe: SKIP

In [9]:
%pip install nltk
import nltk
nltk.download('wordnet')


Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.9.18-cp39-cp39-macosx_11_0_arm64.whl.metadata (40 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.5/1.5 MB[0m [31m15.3 MB/s[0m  [33m0:00:00[0m
[?25hDownloading regex-2025.9.18-cp39-cp39-macosx_11_0_arm64.whl (286 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
Downloading joblib-1.5.2-py3-none-any.whl (308 kB)
Installing collected packages: regex, joblib, click, nltk
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m4/4[0

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/antranakhasi/nltk_data...


True

In [19]:
# JUPYTER CELL — map folder names in animals_birds_insects -> WNID (WordNet)
from pathlib import Path
import json, re
from collections import defaultdict

# --- EDIT THIS PATH IF NEEDED ---
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
SOURCE_DIR = PROJECT / "animals_birds_insects"      # folder containing class subfolders
# Mapping {folder name: wnid or None} written at the end of this cell.
OUT_JSON   = PROJECT / "species_to_wnid_from_ABI.json"

# --- WordNet setup ---
# Probe the corpus with a throwaway lookup; if the data is not installed the
# lookup raises LookupError, in which case it is downloaded and re-imported.
import nltk
try:
    from nltk.corpus import wordnet as wn
    _ = wn.synsets("cat")
except LookupError:
    nltk.download("wordnet")
    from nltk.corpus import wordnet as wn

# ---------- Helpers ----------
# Lemma names that mark a synset as "biological": is_biological_synset()
# accepts a synset if it, or any synset in its hypernym closure, has one of
# these as a lemma name.
# NOTE(review): "organism" is a very broad head -- presumably it also sits
# above non-animal senses (people, plants) in WordNet; confirm this set is not
# letting such senses through.
BIO_HEADS = {
    "animal", "beast", "organism",
    "vertebrate", "invertebrate",
    "mammal", "reptile", "amphibian", "bird", "aves",
    "fish", "pisces", "insect", "arthropod", "arachnid", "crustacean",
    "mollusk", "gastropod", "cephalopod", "cnidarian", "annelid"
}

def normalize(name: str) -> str:
    """Turn a folder name into a WordNet query string.

    Strips and lower-cases the name, replaces "_" and "-" with spaces, then
    collapses whitespace runs into single spaces.
    """
    cleaned = name.strip().lower()
    for separator in ("_", "-"):
        cleaned = cleaned.replace(separator, " ")
    return re.sub(r"\s+", " ", cleaned)

def is_biological_synset(ss) -> bool:
    """
    Return True if the synset, or any synset reachable through its hypernyms,
    has a lemma name (lower-cased) listed in BIO_HEADS.
    """
    def has_bio_lemma(synset) -> bool:
        return any(lemma.name().lower() in BIO_HEADS for lemma in synset.lemmas())

    # Ancestors first, then the synset's own lemmas.
    if any(has_bio_lemma(ancestor) for ancestor in ss.closure(lambda s: s.hypernyms())):
        return True
    return has_bio_lemma(ss)

def score_synset(ss, folder_tokens):
    """
    Rank a candidate synset for a folder name; lower scores are better.

    Components:
      - +100 penalty when the synset is not a biological sense
      - -5 for every folder token contained in at least one lemma name
      - +(definition length capped at 80) / 80, a small bias towards
        shorter definitions
    """
    penalty = 0 if is_biological_synset(ss) else 100

    names = [lemma.lower() for lemma in ss.lemma_names()]
    # A token that is a prefix of a lemma is also contained in it, so plain
    # containment covers both cases.
    hits = sum(1 for token in folder_tokens if any(token in nm for nm in names))

    length_term = min(len(ss.definition()), 80) / 80.0

    return penalty - 5 * hits + length_term

def find_best_wnid_for_name(name: str):
    """
    Find the best-scoring WordNet noun synset for a folder name.

    Query variants are tried in a fixed order: the normalized name, the
    underscore-joined form for multi-word names, and a naive singular/plural
    variant. Candidates are ranked with score_synset() (lower is better);
    ties go to the synset found first.

    Returns:
        ``(wnid, synset)`` where ``wnid`` looks like 'n02129165', or None when
        the name is empty or no noun synset matches any variant.
    """
    q = normalize(name)
    if not q:
        return None
    tokens = q.split()

    # An ordered list (not a set) keeps candidate order, and therefore
    # tie-breaking, identical from run to run.
    queries = [q]
    if len(tokens) > 1:
        queries.append("_".join(tokens))
    # Very light plural/singular tweak
    queries.append(q[:-1] if q.endswith("s") else q + "s")

    candidates = {}  # offset -> synset, in first-seen order
    for query in queries:
        if not query:
            continue  # e.g. the name "s" with its trailing "s" removed
        for ss in wn.synsets(query, pos='n'):
            candidates.setdefault(ss.offset(), ss)

    if not candidates:
        return None

    # Score & pick best (min keeps the earliest of equally scored synsets)
    best = min(candidates.values(), key=lambda ss: score_synset(ss, tokens))
    wnid = f"n{best.offset():08d}"
    return wnid, best

# ---------- Main ----------
# Map every sub-folder of SOURCE_DIR to a WNID, save the mapping as JSON and
# print a short summary.
if not SOURCE_DIR.exists():
    raise SystemExit(f"Source dir not found: {SOURCE_DIR}")

folders = sorted(entry.name for entry in SOURCE_DIR.iterdir() if entry.is_dir())
mapping, report = {}, {}

for cls in folders:
    found = find_best_wnid_for_name(cls)
    if found is None:
        mapping[cls] = None
        report[cls] = {"status": "no_match"}
        continue
    wnid, ss = found
    mapping[cls] = wnid
    report[cls] = {
        "status": "ok",
        "wnid": wnid,
        "lemmas": ss.lemma_names(),
        "definition": ss.definition(),
    }

# Save JSON mapping
OUT_JSON.write_text(json.dumps(mapping, indent=2))
print(f"‚úÖ Wrote mapping for {len(mapping)} folders ‚Üí {OUT_JSON}")

# Print quick summary
unmapped = [name for name, wnid in mapping.items() if not wnid]
print(f"Resolved: {len(mapping) - len(unmapped)}   Unresolved: {len(unmapped)}")
if unmapped:
    print("Unresolved class names (edit folder name or add manual overrides):")
    for name in unmapped:
        print("  -", name)

# Optional: peek at the first ten mapped entries
print("\nSample mappings:")
for name, wnid in list(mapping.items())[:10]:
    print(f"  {name:20s} -> {wnid}")


‚úÖ Wrote mapping for 99 folders ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid_from_ABI.json
Resolved: 98   Unresolved: 1
Unresolved class names (edit folder name or add manual overrides):
  - muskox

Sample mappings:
  ant                  -> n02219486
  antelope             -> n02419796
  armadillo            -> n02454379
  bat                  -> n02139199
  bear                 -> n02131653
  beaver               -> n09745229
  beetle               -> n02164464
  bird                 -> n09989045
  bison                -> n02410509
  boar                 -> n02396014


In [21]:
# JUPYTER CELL — verify ImageNet winter21 tar availability (NO DOWNLOADS)

from pathlib import Path
import json, time
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# --- Paths ---
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# JSON {species: wnid}; entries with an empty/None wnid are not checked.
MAP_PATH = PROJECT / "species_to_wnid.json"

# --- Settings ---
# Each synset is probed at BASE_URL/<wnid>.tar.
BASE_URL = "https://image-net.org/data/winter21_whole"
# Timeout passed to every requests call.
TIMEOUT  = 12
# Attempts per HTTP method (HEAD first, then GET).
RETRIES  = 2
MAX_WORKERS = 8  # parallelism for faster checks

def exists_on_imagenet(wnid: str) -> tuple[int, str]:
    """
    Probe the tar URL for *wnid* without downloading its body.

    HEAD is tried up to RETRIES times, then a streamed GET (closed right away)
    up to RETRIES times. The first response whose status is 200, 301, 302 or
    404 decides the result.

    Returns (status_code, 'HEAD' or 'GET'), or (-1, 'ERROR') when no attempt
    produced one of those statuses.
    """
    url = f"{BASE_URL}/{wnid}.tar"

    def _head():
        return requests.head(url, timeout=TIMEOUT, allow_redirects=True)

    def _get():
        # stream=True defers the body; closing immediately avoids fetching it
        resp = requests.get(url, timeout=TIMEOUT, stream=True)
        resp.close()
        return resp

    # HEAD first; GET is the fallback for servers that reject HEAD
    for how, fetch in (("HEAD", _head), ("GET", _get)):
        for _ in range(RETRIES):
            try:
                code = fetch().status_code
            except requests.RequestException:
                time.sleep(1)
                continue
            if code in (200, 301, 302, 404):
                return code, how
    return -1, "ERROR"

# Load mapping
mapping = json.loads(MAP_PATH.read_text())

# Check all species that have a WNID, in parallel
results = {}
found_codes = (200, 301, 302)
print(f"Checking {len(mapping)} synsets on ImageNet (no downloads)...\n")
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
    futs = {}
    for species, wnid in mapping.items():
        if wnid:
            futs[ex.submit(exists_on_imagenet, wnid)] = species

    for fut in as_completed(futs):
        species = futs[fut]
        code, how = fut.result()
        results[species] = code
        if code in found_codes:
            print(f"‚úÖ {species:20s} -> AVAILABLE ({how})")
        elif code == 404:
            print(f"‚ùå {species:20s} -> NOT FOUND (404)")
        else:
            print(f"‚ö†Ô∏è  {species:20s} -> UNKNOWN (network/other)")

# Summary: bucket species by outcome
ok, miss, unk = [], [], []
for s, c in results.items():
    if c in found_codes:
        ok.append(s)
    elif c == 404:
        miss.append(s)
    else:
        unk.append(s)

print("\n--- Summary ---")
print(f"Available: {len(ok)}")
print(f"Not found: {len(miss)}")
print(f"Unknown:   {len(unk)}")

for heading, names in (
    ("\nMissing WNIDs (no winter21 tar):", miss),
    ("\nUnknown status (retry later or check network):", unk),
):
    if names:
        print(heading)
        for s in sorted(names):
            print("  -", s)


Checking 99 synsets on ImageNet (no downloads)...

‚úÖ bird                 -> AVAILABLE (HEAD)
‚úÖ ant                  -> AVAILABLE (HEAD)
‚úÖ armadillo            -> AVAILABLE (HEAD)
‚úÖ beetle               -> AVAILABLE (HEAD)
‚úÖ bear                 -> AVAILABLE (HEAD)
‚úÖ bat                  -> AVAILABLE (HEAD)
‚úÖ antelope             -> AVAILABLE (HEAD)
‚úÖ beaver               -> AVAILABLE (HEAD)
‚úÖ bison                -> AVAILABLE (HEAD)
‚úÖ butterfly            -> AVAILABLE (HEAD)
‚úÖ camel                -> AVAILABLE (HEAD)
‚úÖ cat                  -> AVAILABLE (HEAD)
‚úÖ boar                 -> AVAILABLE (HEAD)
‚úÖ chipmunk             -> AVAILABLE (HEAD)
‚ùå cardinal             -> NOT FOUND (404)
‚ùå chameleon            -> NOT FOUND (404)
‚úÖ cow                  -> AVAILABLE (HEAD)
‚ùå buffalo              -> NOT FOUND (404)
‚úÖ cockroach            -> AVAILABLE (HEAD)
‚úÖ cicada               -> AVAILABLE (HEAD)
‚úÖ crane                -> AVAILABLE (HEAD)
‚ùå cra

In [25]:
from pathlib import Path
import csv, re, json
from collections import defaultdict
import nltk
from nltk.corpus import wordnet as wn

# ---- Paths ----
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
SOURCE_DIR = PROJECT / "animals_birds_insects"     # <- your folders
CANDIDATES_CSV = PROJECT / "wnid_candidates.csv" # <- output to review

# The CSV header gets three columns (wnid, labels, definition) per option.
TOP_K = 6  # how many options per folder to list

# Lemma names that mark a synset as "biological": is_bio() accepts a synset
# if it, or any synset in its hypernym closure, has one of these lemma names.
# NOTE(review): "organism" is very broad -- presumably it also covers
# non-animal senses (people, plants) in WordNet; confirm that is intended.
BIO_HEADS = {
    "animal","beast","organism",
    "vertebrate","invertebrate",
    "mammal","reptile","amphibian","bird","aves",
    "fish","pisces","insect","arthropod","arachnid","crustacean",
    "mollusk","gastropod","cephalopod","cnidarian","annelid"
}

def normalize(name: str) -> str:
    """Strip and lower-case *name*, map "_" and "-" to spaces, then collapse
    whitespace runs into single spaces."""
    spaced = name.strip().lower().translate(str.maketrans("_-", "  "))
    return re.sub(r"\s+", " ", spaced)

def is_bio(ss) -> bool:
    """Return True if *ss* or any synset in its hypernym closure has a
    lower-cased lemma name listed in BIO_HEADS."""
    def tagged(synset) -> bool:
        return any(lemma.name().lower() in BIO_HEADS for lemma in synset.lemmas())

    # Walk the hypernyms first, then fall back to the synset's own lemmas.
    for ancestor in ss.closure(lambda s: s.hypernyms()):
        if tagged(ancestor):
            return True
    return tagged(ss)

def score(ss, tokens):
    """Rank a candidate synset; lower is better.

    +100 if the synset is not biological, -5 per token contained in some
    lemma name, plus (definition length capped at 80) / 80.
    """
    base = 0 if is_bio(ss) else 100
    names = [lemma.lower() for lemma in ss.lemma_names()]
    # prefix matches are a special case of containment
    matched = sum(1 for tok in tokens if any(tok in nm for nm in names))
    brevity = min(len(ss.definition()), 80)/80.0
    return base - 5*matched + brevity

def wnid_of(ss):
    """Format the synset's offset as a WNID: "n" plus the offset zero-padded to 8 digits."""
    return "n{:08d}".format(ss.offset())

# Collect folder names
if not SOURCE_DIR.exists():
    raise SystemExit(f"Source dir not found: {SOURCE_DIR}")
folders = sorted([p.name for p in SOURCE_DIR.iterdir() if p.is_dir()])
print(f"Found {len(folders)} folders.")

# Every data row has the header's width: species_folder, chosen_wnid, then
# (wnid, labels, definition) for each of the TOP_K options. The width was
# previously hard-coded to 12 for folders without candidates.
row_width = 2 + 3 * TOP_K

rows = []
for cls in folders:
    q = normalize(cls)
    tokens = q.split()
    # Ordered, de-duplicated query variants: exact name, underscore-joined
    # form, naive singular/plural. A fixed order keeps the ranking of
    # equally scored synsets identical between runs.
    queries = list(dict.fromkeys(
        [q, "_".join(tokens), q[:-1] if q.endswith("s") else q + "s"]
    ))

    cands = {}  # offset -> synset, in first-seen order
    for qv in queries:
        for ss in wn.synsets(qv, pos='n'):
            cands.setdefault(ss.offset(), ss)
    scored = sorted(cands.values(), key=lambda ss: score(ss, tokens))
    top = scored[:TOP_K]

    # Build a single row with multiple options
    row = [cls, ""]  # species_folder, chosen_wnid (to be filled later)
    for ss in top:
        row += [wnid_of(ss), ", ".join(ss.lemma_names()), ss.definition()]
    # pad to the full width (also covers folders with no candidates at all)
    row += [""] * (row_width - len(row))
    rows.append(row)

# Header
hdr = ["species_folder","chosen_wnid"]
for i in range(1, TOP_K+1):
    hdr += [f"opt{i}_wnid", f"opt{i}_labels", f"opt{i}_definition"]

with open(CANDIDATES_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(hdr)
    w.writerows(rows)

print(f"‚úÖ Wrote candidate list to: {CANDIDATES_CSV}\nOpen it, review, and fill the 'chosen_wnid' column.")


Found 99 folders.
‚úÖ Wrote candidate list to: /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/wnid_candidates.csv
Open it, review, and fill the 'chosen_wnid' column.


In [29]:
from pathlib import Path
import csv, json

PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# Reviewed candidate sheet (input) and the final mapping (output).
CANDIDATES_CSV = PROJECT / "wnid_candidates.csv"
OUT_JSON = PROJECT / "species_to_wnid.json"

AUTO_TAKE_FIRST = False  # set True to auto-pick opt1_wnid if chosen_wnid is empty

with open(CANDIDATES_CSV, newline="", encoding="utf-8") as f:
    rdr = csv.DictReader(f)
    # Discover the "opt<N>_wnid" columns from the header instead of assuming
    # six of them, so sheets produced with a different TOP_K are read in full.
    opt_cols = sorted(
        (
            col for col in (rdr.fieldnames or [])
            if col.startswith("opt") and col.endswith("_wnid") and col[3:-5].isdigit()
        ),
        key=lambda col: int(col[3:-5]),
    )
    mapping = {}
    bad = []  # (species, chosen) pairs whose choice is not among the options
    for row in rdr:
        species = row["species_folder"].strip()
        chosen = (row.get("chosen_wnid") or "").strip()
        opts = [(row.get(col) or "").strip() for col in opt_cols]
        opts = [o for o in opts if o]

        if not chosen and AUTO_TAKE_FIRST and opts:
            chosen = opts[0]

        if chosen:
            # A manual choice must be one of the listed options unless
            # AUTO_TAKE_FIRST is on, in which case any value is accepted.
            if chosen in opts or AUTO_TAKE_FIRST:
                mapping[species] = chosen
            else:
                mapping[species] = None
                bad.append((species, chosen))
        else:
            mapping[species] = None

OUT_JSON.write_text(json.dumps(mapping, indent=2))
total = len(mapping)
mapped = sum(1 for v in mapping.values() if v)
print(f"‚úÖ Wrote {mapped}/{total} mappings ‚Üí {OUT_JSON}")
if bad:
    print("‚ö†Ô∏è Some chosen_wnid values were not in the listed options:")
    for s,c in bad: print("  -", s, c)
unmapped = [k for k,v in mapping.items() if not v]
if unmapped:
    print("Unmapped species (no choice made):")
    for s in unmapped: print("  -", s)


‚úÖ Wrote 98/98 mappings ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid.json


In [37]:
from pathlib import Path
import csv, json

PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# Reviewed candidate sheet (input) and the final mapping (output).
CANDIDATES_CSV = PROJECT / "wnid_candidates.csv"
OUT_JSON = PROJECT / "species_to_wnid.json"

AUTO_TAKE_FIRST = False  # set True to auto-pick opt1_wnid if chosen_wnid is empty

with open(CANDIDATES_CSV, newline="", encoding="utf-8") as f:
    rdr = csv.DictReader(f)
    # Discover the "opt<N>_wnid" columns from the header instead of assuming
    # six of them, so sheets produced with a different TOP_K are read in full.
    opt_cols = sorted(
        (
            col for col in (rdr.fieldnames or [])
            if col.startswith("opt") and col.endswith("_wnid") and col[3:-5].isdigit()
        ),
        key=lambda col: int(col[3:-5]),
    )
    mapping = {}
    bad = []  # (species, chosen) pairs whose choice is not among the options
    for row in rdr:
        species = row["species_folder"].strip()
        chosen = (row.get("chosen_wnid") or "").strip()
        opts = [(row.get(col) or "").strip() for col in opt_cols]
        opts = [o for o in opts if o]

        if not chosen and AUTO_TAKE_FIRST and opts:
            chosen = opts[0]

        if chosen:
            # A manual choice must be one of the listed options unless
            # AUTO_TAKE_FIRST is on, in which case any value is accepted.
            if chosen in opts or AUTO_TAKE_FIRST:
                mapping[species] = chosen
            else:
                mapping[species] = None
                bad.append((species, chosen))
        else:
            mapping[species] = None

OUT_JSON.write_text(json.dumps(mapping, indent=2))
total = len(mapping)
mapped = sum(1 for v in mapping.values() if v)
print(f"‚úÖ Wrote {mapped}/{total} mappings ‚Üí {OUT_JSON}")
if bad:
    print("‚ö†Ô∏è Some chosen_wnid values were not in the listed options:")
    for s,c in bad: print("  -", s, c)
unmapped = [k for k,v in mapping.items() if not v]
if unmapped:
    print("Unmapped species (no choice made):")
    for s in unmapped: print("  -", s)


‚úÖ Wrote 98/98 mappings ‚Üí /Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN/species_to_wnid.json


In [39]:
from pathlib import Path
import json, time
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
# JSON {species: wnid}; each WNID is probed at BASE_URL/<wnid>.tar.
MAP_PATH = PROJECT / "species_to_wnid.json"
BASE_URL = "https://image-net.org/data/winter21_whole"
# Per-request timeout and thread-pool size.
TIMEOUT, MAX_WORKERS = 10, 8

# Loaded once at cell level; species whose WNID is empty/None are skipped below.
mapping = json.loads(MAP_PATH.read_text())

def probe(wnid):
    """Check whether the tarball for *wnid* exists under BASE_URL.

    A HEAD request is tried first; when it errors or gives an inconclusive
    status, a streamed GET (closed before its body is read) is tried.

    Returns 200 when either request succeeds, 404 when the server reports the
    file as missing, and -1 when neither request gives a usable answer.
    """
    url = f"{BASE_URL}/{wnid}.tar"

    def classify(resp):
        # 404 and "ok" are the only conclusive outcomes; anything else is None.
        if resp.status_code == 404:
            return 404
        if resp.ok:
            return 200
        return None

    try:
        verdict = classify(requests.head(url, timeout=TIMEOUT, allow_redirects=True))
    except requests.RequestException:
        verdict = None
    if verdict is not None:
        return verdict

    # HEAD was inconclusive: fall back to GET without downloading the body.
    try:
        resp = requests.get(url, timeout=TIMEOUT, stream=True)
        resp.close()
        verdict = classify(resp)
    except requests.RequestException:
        verdict = None
    return -1 if verdict is None else verdict

# Probe every mapped wnid concurrently and report which tarballs are missing.
print("Checking availability (no downloads)...")
avail = {}
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
    # Species without a wnid are not probed at all.
    pending = {}
    for species, wnid in mapping.items():
        if wnid:
            pending[pool.submit(probe, wnid)] = species

    for finished in as_completed(pending):
        species = pending[finished]
        status = finished.result()
        avail[species] = status
        if status == 200:
            label = "OK"
        elif status == 404:
            label = "404"
        else:
            label = "UNKNOWN"
        print(f"{species:20s} ‚Üí {label}")

missing = [name for name, status in avail.items() if status == 404]
print("\nMissing:", len(missing))
# Only the first 20 missing species are listed.
for name in missing[:20]:
    print("  -", name)


Checking availability (no downloads)...
beaver               ‚Üí OK
armadillo            ‚Üí OK
beetle               ‚Üí OK
antelope             ‚Üí OK
bat                  ‚Üí OK
bird                 ‚Üí OK
ant                  ‚Üí OK
bear                 ‚Üí OK
buffalo              ‚Üí OK
bison                ‚Üí OK
camel                ‚Üí OK
cat                  ‚Üí OK
cardinal             ‚Üí OK
boar                 ‚Üí OK
butterfly            ‚Üí OK
chameleon            ‚Üí OK
chipmunk             ‚Üí OK
cicada               ‚Üí OK
cow                  ‚Üí OK
crab                 ‚Üí OK
cockroach            ‚Üí OK
crocodile            ‚Üí OK
crane                ‚Üí OK
cricket              ‚Üí OK
deer                 ‚Üí OK
dog                  ‚Üí OK
dolphin              ‚Üí OK
dragonfly            ‚Üí OK
duck                 ‚Üí OK
elephant             ‚Üí OK
eagle                ‚Üí OK
fish                 ‚Üí OK
fly                  ‚Üí OK
frog                 ‚Üí OK
fox     

In [41]:
from pathlib import Path
import tarfile, tempfile, time, json
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# Locations and limits used by the download helpers in this cell.
PROJECT = Path("/Users/antranakhasi/Desktop/Projects/Origami model using CycleGAN/Origami-Model-using-CycleGAN")
MAP_PATH = PROJECT.joinpath("species_to_wnid.json")
OUT_DIR = PROJECT.joinpath("data", "animals")
BASE_URL = "https://image-net.org/data/winter21_whole"

MAX_WORKERS = 4   # thread pool size used by run()
TIMEOUT = 45      # passed as timeout= to requests.get in dl()
RETRIES = 3       # attempts dl() makes per URL

def ensure(p: Path):
    """Create directory *p* and any missing parents; a no-op if it exists."""
    p.mkdir(exist_ok=True, parents=True)
def has_files(p: Path) -> bool:
    """Return True when *p* is a directory containing at least one entry.

    A missing path, an empty directory, or a path that is not a directory
    (for example a regular file) all yield False.
    """
    # is_dir() rather than exists(): iterdir() raises NotADirectoryError on a file.
    return p.is_dir() and any(p.iterdir())

def dl(url, dst):
    """Stream *url* into the file *dst*, retrying on request errors.

    Makes up to RETRIES attempts, pausing two seconds between them.

    Returns True once a response body has been streamed to *dst* without a
    requests.RequestException; False when the server answers 404 or every
    attempt failed.
    """
    for attempt in range(1, RETRIES + 1):
        try:
            # The context manager closes the streamed response on every exit
            # path (404 return, HTTP error, mid-transfer failure).
            with requests.get(url, stream=True, timeout=TIMEOUT) as r:
                if r.status_code == 404:
                    return False  # treated as permanent: not retried
                r.raise_for_status()
                with open(dst, "wb") as f:
                    for chunk in r.iter_content(1024 * 256):
                        if chunk:
                            f.write(chunk)
            return True
        except requests.RequestException:
            # No point sleeping after the last attempt; just give up.
            if attempt < RETRIES:
                time.sleep(2)
    return False

def extract(tp: Path, out: Path):
    """Unpack the regular files of tar archive *tp* directly into *out*.

    Directory structure inside the archive is discarded: every file is written
    under its base name, so same-named files from different archive folders
    overwrite each other. Non-file members (directories, links, devices) are
    skipped.

    Returns the number of files extracted. Errors raised by tarfile (e.g.
    tarfile.ReadError for an unreadable archive) propagate to the caller.
    """
    # Use the "data" extraction filter where this Python provides it; the
    # hasattr check keeps older interpreters without filter support working.
    extra = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
    n = 0
    with tarfile.open(tp, "r") as tf:
        for m in tf.getmembers():
            if not m.isfile():
                continue
            flat = Path(m.name).name
            if flat in ("", ".."):
                continue  # no usable file name once the folders are stripped
            m.name = flat
            tf.extract(m, out, **extra)
            n += 1
    return n

def process(species, wnid):
    """Download and unpack the image tarball for one species.

    The archive BASE_URL/<wnid>.tar is fetched into a temporary directory and
    its files are extracted into OUT_DIR/<species>.

    Returns a one-line status string starting with the species name:
    "SKIP (no wnid)", "SKIP (exists)", "FAILED (...)" or "DONE (<n> images)".
    """
    import shutil  # local import: only needed for failure cleanup

    if not wnid:
        return f"{species}: SKIP (no wnid)"
    out = OUT_DIR / species
    if has_files(out):
        return f"{species}: SKIP (exists)"
    ensure(out)
    url = f"{BASE_URL}/{wnid}.tar"
    try:
        with tempfile.TemporaryDirectory() as td:
            tp = Path(td) / f"{wnid}.tar"
            if not dl(url, tp):
                return f"{species}: FAILED (download/404)"
            n = extract(tp, out)
    except tarfile.ReadError:
        # Drop partially extracted files; otherwise has_files() would make the
        # next run report "SKIP (exists)" for an incomplete species folder.
        shutil.rmtree(out, ignore_errors=True)
        return f"{species}: FAILED (corrupt tar)"
    except Exception as e:
        # Worker boundary: report the failure as a status instead of raising.
        shutil.rmtree(out, ignore_errors=True)
        return f"{species}: FAILED ({type(e).__name__})"
    return f"{species}: DONE ({n} images)"

def run():
    """Download every mapped species in parallel and print a summary.

    Reads the species -> wnid mapping from MAP_PATH, runs process() for each
    entry that has a wnid on a thread pool of MAX_WORKERS threads, prints each
    status line as it completes, then prints DONE/SKIP/FAILED totals.
    """
    ensure(OUT_DIR)
    species_to_wnid = json.loads(MAP_PATH.read_text())

    jobs = []
    for name, wnid in species_to_wnid.items():
        if wnid:
            jobs.append((name, wnid))
    print(f"Downloading {len(jobs)} species...\n")

    statuses = []
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = {pool.submit(process, name, wnid): name for name, wnid in jobs}
        for finished in as_completed(pending):
            line = finished.result()
            print(line)
            statuses.append(line)

    # Totals are counted by looking for the keyword anywhere in the status line.
    done = len([line for line in statuses if "DONE" in line])
    skip = len([line for line in statuses if "SKIP" in line])
    failed = len([line for line in statuses if "FAILED" in line])
    print(f"\nSummary ‚Üí DONE: {done}  SKIP: {skip}  FAILED: {failed}")

# Run the download for every mapped species when this cell executes.
run()


Downloading 98 species...

armadillo: SKIP (exists)
antelope: SKIP (exists)
ant: SKIP (exists)
bat: SKIP (exists)
beaver: SKIP (exists)
beetle: SKIP (exists)
bear: SKIP (exists)
bird: SKIP (exists)
boar: SKIP (exists)
bison: SKIP (exists)
buffalo: SKIP (exists)
butterfly: SKIP (exists)
cardinal: SKIP (exists)
camel: SKIP (exists)
cat: SKIP (exists)
chameleon: SKIP (exists)
chipmunk: SKIP (exists)
cicada: SKIP (exists)
cockroach: SKIP (exists)
cow: SKIP (exists)
crab: SKIP (exists)
crane: SKIP (exists)
cricket: SKIP (exists)
crocodile: SKIP (exists)
dolphin: SKIP (exists)
dog: SKIP (exists)
dragonfly: SKIP (exists)
deer: SKIP (exists)
duck: SKIP (exists)
eagle: SKIP (exists)
elephant: SKIP (exists)
fish: SKIP (exists)
fox: SKIP (exists)
fly: SKIP (exists)
frog: SKIP (exists)
giant anteater: SKIP (exists)
gerbil: SKIP (exists)
giraffe: SKIP (exists)
grasshopper: SKIP (exists)
hamster: SKIP (exists)
gorilla: SKIP (exists)
hedgehog: SKIP (exists)
heron: SKIP (exists)
hawk: SKIP (exists)
hi