In [None]:
from huggingface_hub import hf_hub_download   # docs: hf_hub_download() guide :contentReference[oaicite:5]{index=5}
import pandas as pd, re, pathlib
import os 

# 1) grab the CSV (~400 MB) once; HF Hub will cache it
cat_path = hf_hub_download(
    repo_id="imageomics/TreeOfLife-10M",
    filename="metadata/catalog.csv",
    repo_type="dataset",
    token = True
)

df = pd.read_csv(cat_path)

phylum_family_list = os.listdir('../cleaned_chordata_images/val')

# 2) normalise your list: drop .DS_Store and strip prefix
targets = [re.sub(r"^chordata_", "", f).lower()
           for f in phylum_family_list if not f.startswith(".")]



# 3) pick only Chordata rows AND families you care about
mask = (df.phylum == "Chordata") & df.family.str.lower().isin(targets)
subset_families = df.loc[mask].copy()

print(f"{len(subset_families):,} matching images")
subset_families.to_csv("chordate_families_subset.csv", index=False)

# delete the original dataframe to clear up RAM space
del df

In [12]:
subset_families = pd.read_csv('chordate_families_subset.csv')

subset_families.head()


  subset_families = pd.read_csv('chordate_families_subset.csv')


Unnamed: 0,split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common
0,train_small,a8b544e8-d02c-4b52-a267-8ffdf34e7bd0,30013717.0,65422934.0,,,,,,Animalia,Chordata,Aves,Apodiformes,Trochilidae,Amazilia,franciae,andean emerald
1,train_small,b7512b6c-c4a7-4f8e-b141-5a62489e2781,28486176.0,59052146.0,,,,,,Animalia,Chordata,Reptilia,Squamata,Diplodactylidae,Bavayia,cyclura,Forest Bavayia
2,train_small,acc400f5-9b01-4aae-90a8-e65b591f0d7a,21162185.0,49157640.0,,,,,,Animalia,Chordata,Reptilia,Squamata,Dactyloidae,Anolis,schiedii,Schiede's Anole
3,train_small,fa231162-eb5c-4803-937d-d52221c5ad79,22392467.0,1254561.0,,,,,,Animalia,Chordata,Reptilia,Testudines,Testudinidae,Testudo,hermanni,Eastern Hermann's Tortoise
4,train_small,26ad8438-dbbf-41e7-9d4b-8159a6991604,29586658.0,46559477.0,,,,,,Animalia,Chordata,Reptilia,Testudines,Cheloniidae,Lepidochelys,olivacea,Olive Ridley


In [13]:
# IDs of families that we need
subset_ids = subset_families.treeoflife_id.tolist()

In [14]:
from huggingface_hub import hf_hub_download

licenses_path = hf_hub_download(
    repo_id   = "imageomics/TreeOfLife-10M",
    filename  = "metadata/licenses.csv",
    repo_type = "dataset",      # tells HF you want a dataset file :contentReference[oaicite:0]{index=0}
    token     = True,           # uses your cached HF token
    force_download = False      # set True to re‑fetch a corrupt cache
)


df = pd.read_csv(licenses_path)           # load into DataFrame :contentReference[oaicite:1]{index=1}
# Suppose your IDs are integers in a Python set:

# Filter only the rows you need
subset = df[df["treeoflife_id"].isin(subset_ids)]   # Boolean mask :contentReference[oaicite:2]{index=2}

# Extract the EOL full‑size URLs
urls = subset["eol_full_size_copy_url"].tolist()

licenses.csv:   0%|          | 0.00/2.22G [00:00<?, ?B/s]

In [18]:
subset.head()

Unnamed: 0,treeoflife_id,eol_content_id,eol_page_id,md5,medium_source_url,eol_full_size_copy_url,license_name,copyright_owner,license_link,title
5,df4a72dc-42fb-4fdc-ae44-2c8c5ef0a1d3,21938526.0,1267996.0,b22abb0f4686e7195cc8bdbf0c8f79ab,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/5b/03/ac/18...,cc-by-nc-4.0,dianne49,https://creativecommons.org/licenses/by-nc/4.0,not provided
40,b5a32e39-fed8-4f5b-9afc-bb16a217852a,22769563.0,1055234.0,6b44b5632a59d45adae1ea10c18884e1,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/5f/9a/11/18...,cc-by-nc-4.0,mobinkargarfard,https://creativecommons.org/licenses/by-nc/4.0,not provided
48,89bcd3e0-c311-49ab-b95a-c7d9b5feba4c,14160409.0,1056414.0,d85c4ff1e152ca95cb536f8c2ad369f3,https://farm8.staticflickr.com/7270/7846311014...,https://content.eol.org/data/media/d7/c2/f8/54...,cc-by-sa-2.0,Cristopher Gonzalez,https://creativecommons.org/licenses/by-sa/2.0,not provided
60,347fee52-cf82-4043-b38b-0cece69e6d71,28678100.0,59052132.0,3c091da6690c25d38595980081ed7048,http://mczbase.mcz.harvard.edu/specimen_images...,https://content.eol.org/data/media/c7/f2/00/26...,cc-by-nc-sa-3.0,"Museum of Comparative Zoology, Harvard University",https://creativecommons.org/licenses/by-nc-sa/3.0,not provided
93,ddd8806a-4fac-4f29-a1ec-8f6d85449956,22694533.0,790725.0,9f210b8d658a0b32ee9f04a890d52dc8,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/5d/96/a2/18...,cc-by-nc-4.0,Daniel Montoya Ferrer,https://creativecommons.org/licenses/by-nc/4.0,not provided


In [22]:
display(subset_families.info())
display(subset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052342 entries, 0 to 1052341
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   split             1052342 non-null  object 
 1   treeoflife_id     1052342 non-null  object 
 2   eol_content_id    613585 non-null   float64
 3   eol_page_id       613585 non-null   float64
 4   bioscan_part      0 non-null        float64
 5   bioscan_filename  0 non-null        float64
 6   inat21_filename   438757 non-null   object 
 7   inat21_cls_name   438757 non-null   object 
 8   inat21_cls_num    438757 non-null   float64
 9   kingdom           1052342 non-null  object 
 10  phylum            1052342 non-null  object 
 11  class             1052342 non-null  object 
 12  order             1052342 non-null  object 
 13  family            1052342 non-null  object 
 14  genus             1047704 non-null  object 
 15  species           1034700 non-null  object 
 16  

None

<class 'pandas.core.frame.DataFrame'>
Index: 560555 entries, 5 to 6219617
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   treeoflife_id           560555 non-null  object 
 1   eol_content_id          560555 non-null  float64
 2   eol_page_id             560555 non-null  float64
 3   md5                     560555 non-null  object 
 4   medium_source_url       560555 non-null  object 
 5   eol_full_size_copy_url  560555 non-null  object 
 6   license_name            560555 non-null  object 
 7   copyright_owner         560555 non-null  object 
 8   license_link            559763 non-null  object 
 9   title                   560555 non-null  object 
dtypes: float64(2), object(8)
memory usage: 47.0+ MB


None

In [25]:
# Clean and standardize the treeoflife_id in both DataFrames
subset['treeoflife_id'] = subset['treeoflife_id'].astype(str).str.strip()
subset_families['treeoflife_id'] = subset_families['treeoflife_id'].astype(str).str.strip()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset['treeoflife_id'] = subset['treeoflife_id'].astype(str).str.strip()


In [27]:
df_url_family = pd.merge(subset, 
                        subset_families[['treeoflife_id', 'phylum', 'family']],
                        on='treeoflife_id',
                        how='inner')

In [28]:
df_url_family



Unnamed: 0,treeoflife_id,eol_content_id,eol_page_id,md5,medium_source_url,eol_full_size_copy_url,license_name,copyright_owner,license_link,title,phylum,family
0,df4a72dc-42fb-4fdc-ae44-2c8c5ef0a1d3,21938526.0,1267996.0,b22abb0f4686e7195cc8bdbf0c8f79ab,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/5b/03/ac/18...,cc-by-nc-4.0,dianne49,https://creativecommons.org/licenses/by-nc/4.0,not provided,Chordata,Columbidae
1,b5a32e39-fed8-4f5b-9afc-bb16a217852a,22769563.0,1055234.0,6b44b5632a59d45adae1ea10c18884e1,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/5f/9a/11/18...,cc-by-nc-4.0,mobinkargarfard,https://creativecommons.org/licenses/by-nc/4.0,not provided,Chordata,Colubridae
2,89bcd3e0-c311-49ab-b95a-c7d9b5feba4c,14160409.0,1056414.0,d85c4ff1e152ca95cb536f8c2ad369f3,https://farm8.staticflickr.com/7270/7846311014...,https://content.eol.org/data/media/d7/c2/f8/54...,cc-by-sa-2.0,Cristopher Gonzalez,https://creativecommons.org/licenses/by-sa/2.0,not provided,Chordata,Viperidae
3,347fee52-cf82-4043-b38b-0cece69e6d71,28678100.0,59052132.0,3c091da6690c25d38595980081ed7048,http://mczbase.mcz.harvard.edu/specimen_images...,https://content.eol.org/data/media/c7/f2/00/26...,cc-by-nc-sa-3.0,"Museum of Comparative Zoology, Harvard University",https://creativecommons.org/licenses/by-nc-sa/3.0,not provided,Chordata,Gekkonidae
4,ddd8806a-4fac-4f29-a1ec-8f6d85449956,22694533.0,790725.0,9f210b8d658a0b32ee9f04a890d52dc8,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/5d/96/a2/18...,cc-by-nc-4.0,Daniel Montoya Ferrer,https://creativecommons.org/licenses/by-nc/4.0,not provided,Chordata,Phrynosomatidae
...,...,...,...,...,...,...,...,...,...,...,...,...
613580,50c7ae23-bbb9-474d-8a5a-6260f2ca42bc,8910880.0,45511280.0,6dd0764f0dc60cea12f9712cbeb861f7,http://calphotos.berkeley.edu/imgs/512x768/666...,https://content.eol.org/data/media/88/1f/4e/7....,cc-by-nc-sa-3.0,2015 California Academy of Sciences,https://creativecommons.org/licenses/by-nc-sa/3.0,not provided,Chordata,Procellariidae
613581,91c0c7ec-0a41-4653-bd20-e69f878c9624,22092768.0,45509111.0,3233e26c2c3c269ebea2b00f14720973,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/76/51/fc/18...,cc-by-nc-4.0,Dra. Laura Elvia Uribe Lara,https://creativecommons.org/licenses/by-nc/4.0,not provided,Chordata,Rallidae
613582,41a74f83-50cf-4d1e-81bf-6cdf16e09c95,21697348.0,795204.0,5ed8a3241679bf01c9094f4b5946bfdd,https://inaturalist-open-data.s3.amazonaws.com...,https://content.eol.org/data/media/6c/df/c0/18...,cc-by-nc-4.0,Ray Turnbull,https://creativecommons.org/licenses/by-nc/4.0,not provided,Chordata,Scincidae
613583,e8a8cbf0-adc9-4310-948c-d71d7029102e,29560128.0,45509305.0,f4d297f0693524f08be61eec8b8db6d3,https://upload.wikimedia.org/wikipedia/commons...,https://content.eol.org/data/media/64/5d/8f/50...,cc-by-sa-3.0,Andreas Eichler,https://creativecommons.org/licenses/by-sa/3.0,not provided,Chordata,Laridae


In [35]:
#!/usr/bin/env python3
"""
flatten_chordata.py

For each directory named chordata_<family>/pictures, move all files
up one level into chordata_<family>/ and delete the empty pictures/ folder.
"""

import shutil
from pathlib import Path

def flatten_chordata(root: Path) -> None:
    """
    Walks through each chordata_<family>/pictures folder under `root`,
    moves its files up to chordata_<family>/, and removes the pictures folder.
    """
    # Glob for all 'pictures' subdirectories in any chordata_* folder
    for pics_dir in root.glob("additional_data/chordata_*/pictures"):
        if not pics_dir.is_dir():
            continue

        parent_dir = pics_dir.parent  # chordata_<family> directory
        print(f"▸ Processing {pics_dir} → {parent_dir}")

        # Move each file from pictures/ to the parent directory
        for item in pics_dir.iterdir():
            if item.is_file():
                dest = parent_dir / item.name
                print(f"    moving {item.name} → {dest}")
                shutil.move(str(item), str(dest))

        # Remove the now-empty pictures directory
        try:
            pics_dir.rmdir()
            print(f"✓ Removed empty folder {pics_dir}")
        except OSError as e:
            print(f"⚠️  Could not remove {pics_dir}: {e}")

if __name__ == "__main__":
    import argparse

    p = argparse.ArgumentParser(
        description="Flatten chordata_<family>/pictures → chordata_<family>/"
    )
    p.add_argument(
        "--root",
        type=Path,
        default=Path.cwd(),
        help="Root directory containing chordata_<family> folders"
    )
    args = p.parse_args()
    flatten_chordata(args.root)


usage: ipykernel_launcher.py [-h] [--root ROOT]
ipykernel_launcher.py: error: unrecognized arguments: --f=/Users/svengoerdes/Library/Jupyter/runtime/kernel-v3c4185bef1aef851e8e4ec3e9c810bf57462f606f.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [36]:
import os
import requests
import pandas as pd
import time # Import the time module
from tqdm.notebook import tqdm # Use 'from tqdm import tqdm' if not in a Jupyter environment

# --- Configuration ---
# !!! IMPORTANT: Replace 'your_url_column_name' with the actual column name containing the image URLs !!!
image_url_column = 'eol_full_size_copy_url' # Assuming this is correct from context
# Assume df_url_family is your DataFrame
# df_url_family = ...

base_output_dir = "additional_data"
images_per_family = 10
request_delay_seconds = 1 # Added: Delay between requests (in seconds)

already_pulled = [
    'Columbidae',
    'Colubridae',
    'Viperidae',
    'Gekkonidae',
    'Phrynosomatidae',
    'Labridae',
    'Spheniscidae',
    'Otariidae',
    'Strigidae',
    'Chaetodontidae',
    'Soricidae'
]
# --- End Configuration ---

# Make sure the DataFrame and necessary columns exist
if 'df_url_family' not in locals():
    print("Error: DataFrame 'df_url_family' not found.")
elif not all(col in df_url_family.columns for col in ['family', 'treeoflife_id', image_url_column]):
    print(f"Error: DataFrame 'df_url_family' must contain 'family', 'treeoflife_id', and '{image_url_column}' columns.")
else:
    os.makedirs(base_output_dir, exist_ok=True)
    unique_families = df_url_family['family'].dropna().unique()
    
    # remvoe out of list that already have been pulled
    unique_families = [family for family in unique_families if family not in already_pulled]
    print(f"Found {len(unique_families)} unique families.")

    for family in tqdm(unique_families, desc="Processing families"):
        family_lower = str(family).lower().replace(' ', '_')
        print(f"\nProcessing family: {family}")

        target_dir = os.path.join(base_output_dir, f"chordata_{family_lower}")
        os.makedirs(target_dir, exist_ok=True)

        family_df = df_url_family[
            (df_url_family['family'] == family) &
            (df_url_family[image_url_column].notna()) &
            (df_url_family[image_url_column] != '')
        ].copy()

        samples_df = family_df.head(images_per_family)

        if samples_df.empty:
            print(f"  No images with valid URLs found for family: {family}")
            continue

        print(f"  Attempting to download {len(samples_df)} images for {family}...")

        downloaded_count = 0
        for index, row in samples_df.iterrows():
            image_url = row[image_url_column]
            tree_id = row['treeoflife_id']

            try:
                file_extension = os.path.splitext(image_url.split('?')[0])[-1]
                if not file_extension or len(file_extension) > 5:
                     file_extension = '.jpg'
            except Exception:
                file_extension = '.jpg'

            filename = f"{tree_id}{file_extension}"
            output_path = os.path.join(target_dir, filename)

            if os.path.exists(output_path):
                downloaded_count += 1
                continue

            try:
                response = requests.get(image_url, stream=True, timeout=20)
                response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)

                with open(output_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                downloaded_count += 1

            except requests.exceptions.HTTPError as e:
                 # Specifically handle 429 errors if needed, or just print general HTTP errors
                 if e.response.status_code == 429:
                     print(f"  Rate limit hit (429) for {image_url}. Consider increasing delay.")
                     # Optional: Implement retry logic here later if needed
                 else:
                     print(f"  HTTP Error downloading {image_url}: {e}")
                 if os.path.exists(output_path):
                     try: os.remove(output_path)
                     except OSError: pass
            except requests.exceptions.Timeout:
                print(f"  Timeout downloading {image_url}")
            except requests.exceptions.RequestException as e:
                print(f"  Error downloading {image_url}: {e}")
                if os.path.exists(output_path):
                    try: os.remove(output_path)
                    except OSError: pass
            except IOError as e:
                print(f"  Error saving file {output_path}: {e}")
            except Exception as e:
                print(f"  An unexpected error occurred for {image_url} ({output_path}): {e}")

            # --- Add delay ---
            # Wait for the specified amount of time before the next request
            time.sleep(request_delay_seconds)
            # --- End delay ---


        print(f"  Finished for {family}. Found/Downloaded {downloaded_count}/{len(samples_df)} requested images.")

    print("\n--- Image download process complete ---")

Found 150 unique families.


Processing families:   0%|          | 0/150 [00:00<?, ?it/s]


Processing family: Scincidae
  Attempting to download 10 images for Scincidae...
  Finished for Scincidae. Found/Downloaded 10/10 requested images.

Processing family: Bovidae
  Attempting to download 10 images for Bovidae...
  Finished for Bovidae. Found/Downloaded 10/10 requested images.

Processing family: Dactyloidae
  Attempting to download 10 images for Dactyloidae...
  Finished for Dactyloidae. Found/Downloaded 10/10 requested images.

Processing family: Anatidae
  Attempting to download 10 images for Anatidae...
  Finished for Anatidae. Found/Downloaded 10/10 requested images.

Processing family: Phyllomedusidae
  Attempting to download 10 images for Phyllomedusidae...
  Finished for Phyllomedusidae. Found/Downloaded 10/10 requested images.

Processing family: Muscicapidae
  Attempting to download 10 images for Muscicapidae...
  Finished for Muscicapidae. Found/Downloaded 10/10 requested images.

Processing family: Thraupidae
  Attempting to download 10 images for Thraupidae..

In [6]:
from datasets import load_dataset

ds = load_dataset(
    "imageomics/TreeOfLife-10M",
    split="train",            # or "train"
    streaming=True,         # returns IterableDataset :contentReference[oaicite:4]{index=4}
    token=True              # uses your cached HF token
)

Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

In [7]:

filtered = ds.filter(lambda ex: ex["treeoflife_id"] in id_list)

In [None]:
# subset['phylum_family'] = subset['phylum'].lower() + '_' + subset['family'].lower()

subset['phylum_family'] = subset.apply(lambda row: row['phylum'].lower() + '_' + row['family'].lower(), axis = 1)

In [22]:
subset['__url__']

KeyError: '__url__'

In [15]:
subset.head(2)

Unnamed: 0,split,treeoflife_id,eol_content_id,eol_page_id,bioscan_part,bioscan_filename,inat21_filename,inat21_cls_name,inat21_cls_num,kingdom,phylum,class,order,family,genus,species,common,phylum_family
13,train_small,a8b544e8-d02c-4b52-a267-8ffdf34e7bd0,30013717.0,65422934.0,,,,,,Animalia,Chordata,Aves,Apodiformes,Trochilidae,Amazilia,franciae,andean emerald,chordata_trochilidae
21,train_small,b7512b6c-c4a7-4f8e-b141-5a62489e2781,28486176.0,59052146.0,,,,,,Animalia,Chordata,Reptilia,Squamata,Diplodactylidae,Bavayia,cyclura,Forest Bavayia,chordata_diplodactylidae


In [21]:
from datasets import load_dataset, IterableDataset
import pathlib, random, itertools, re

FAMILIES = {re.sub(r"^chordata_", "", f).lower()
            for f in targets if not f.startswith(".")}

N   = 10
OUT = pathlib.Path("tol10m_test"); OUT.mkdir(exist_ok=True)

# ---- streaming dataset (no repo_type!) ----
ds: IterableDataset = load_dataset(
        "imageomics/TreeOfLife-10M",
        split="train",          # or "train"
        streaming=True,       # enables HTTP streaming :contentReference[oaicite:7]{index=7}
        token=True            # use cached token (optional)
)

# ---- sample-on-the-fly ----
buffers, finished = {f: [] for f in FAMILIES}, set()
for ex in ds.shuffle(buffer_size=10_000, seed=42):      # random order :contentReference[oaicite:8]{index=8}
    fam = ex["family"].lower()
    if fam not in FAMILIES or fam in finished:
        continue

    buffers[fam].append(ex)
    if len(buffers[fam]) == N:
        finished.add(fam)
        (OUT/fam).mkdir(exist_ok=True)
        for e in buffers[fam]:
            e["jpg"].save(OUT/fam/f"{e['treeoflife_id']}.jpg")  # PIL image field :contentReference[oaicite:9]{index=9}
        print(f"✅  {fam}: {N} images saved")

    if len(finished) == len(FAMILIES):
        break


Resolving data files:   0%|          | 0/72 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [17]:
len(targets)

166