In [1]:
import requests
import csv
import time
import os
import json
import re
from urllib.parse import urlparse
import glob
import random
from PIL import Image
from tqdm import tqdm 
from tensorflow.keras.preprocessing.image import ImageDataGenerator # type: ignore
import numpy as np
from tensorflow.keras.preprocessing import image # type: ignore
import shutil

### Generate Taxon IDs for the plants

In [8]:
output_file = "species_taxon_ids.csv"

if os.path.exists(output_file):
    print("Species Taxon IDs file exists")
else:
    # 1. Define your species list
    species = "species_list.csv"
    species_list = []

    with open(species, mode='r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            species_list.append(tuple(row))

    # 2. Function to get taxon ID
    def get_taxon_id(name):
        url = f"https://api.inaturalist.org/v1/search?q={name}&sources=taxa"
        # params = {"q": scientific_name, "rank": "species"}
        response = requests.get(url)
        data = response.json()

        if data.get("results"):
            taxon = data["results"][0]["record"]
            return taxon["id"], taxon.get("preferred_common_name", "")
        else:
            return None, None

    # 3. Fetch IDs & save to CSV
    with open(output_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["common_name", "scientific_name", "taxon_id", "inat_common_name"])

        for sci_name, common_name in species_list:
            taxon_id, inat_common = get_taxon_id(sci_name)
            writer.writerow([sci_name, common_name, taxon_id, inat_common])
            print(f"{sci_name} → Taxon ID: {taxon_id}, iNat Common: {inat_common}")
            time.sleep(1)  # Avoid rate limiting

    print(f"Taxon IDs saved to {output_file}")

Species Taxon IDs file exists


### Download the images using the Generated Taxon IDs

In [9]:
TAXON_IDS = "species_taxon_ids.csv"
OUTPUT_DIR = "data"
IMAGES_PER_SPECIES = 100
data_folder = "data"

if os.path.exists(data_folder and os.path.isdir(data_folder)):
    subfolders = [x for x in os.listdir(data_folder) if os.path.isdir(os.path.join(data_folder, x))]
    if len(subfolders) == 100:
        print("All images downloaded")
    else:
        def download_images_for_taxon(taxon_id, folder_name, limit=50):
            os.makedirs(folder_name, exist_ok=True)
            page = 1
            downloaded = set(os.listdir(folder_name))
            counter = len(downloaded)

            while len(downloaded) < limit:
                url = "https://api.inaturalist.org/v1/observations"
                params = {
                    "taxon_id": taxon_id,
                    "quality_grade": "research",
                    "per_page": 200,
                    "page": page,
                    "order_by": "votes",
                    "order": "desc",
                    # Public Licenced images only
                    "license": "cc0,cc-by,cc-by-sa",
                    "photo_license": "cc0,cc-by,cc-by-sa"
                }

                response = requests.get(url, params=params)
                data = response.json()

                if not data.get("results"):
                    break

                for obs in data["results"]:
                    for photo in obs.get("photos", []):
                        img_url = photo["url"].replace("square", "medium")
                        if img_url in downloaded:
                            continue
                        downloaded.add(img_url)

                        # Use unique filename: photo_id + counter
                        photo_id = photo["id"]
                        img_ext = os.path.splitext(urlparse(img_url).path)[1]
                        img_name = f"{photo_id}_{counter}{img_ext}"
                        img_path = os.path.join(folder_name, img_name)

                        try:
                            img_data = requests.get(img_url).content
                            with open(img_path, "wb") as f:
                                f.write(img_data)
                            counter += 1
                        except Exception as e:
                            print(f"Error saving {img_url}: {e}")

                        if counter >= limit:
                            return
                page += 1
                time.sleep(0.5)  # avoid API rate limit

        # Main loop
        with open(TAXON_IDS, newline="", encoding="utf-8") as file:
            reader = csv.DictReader(file)
            for row in reader:
                sci_name = row["scientific_name"].replace(" ", "_")
                taxon_id = row["taxon_id"]

                if not taxon_id or taxon_id.lower() == "none":
                    print(f"Skipping {sci_name} (no taxon_id)")
                    continue

                folder_path = os.path.join(OUTPUT_DIR, sci_name)
                if os.path.isdir(folder_path) and len(os.listdir(folder_path)) >= IMAGES_PER_SPECIES:
                    print(f"Skipping {sci_name} (already has enough images)")
                    continue
                download_images_for_taxon(taxon_id, folder_path, IMAGES_PER_SPECIES)
                print(f"Downloaded {IMAGES_PER_SPECIES} images for {sci_name}")

Skipping Rosa_indica (already has enough images)
Skipping Helianthus_annuus (already has enough images)
Skipping Hibiscus_rosa-sinensis (already has enough images)
Skipping Tagetes_erecta (already has enough images)
Skipping Jasminum_sambac (already has enough images)
Skipping Lavandula_angustifolia (already has enough images)
Skipping Orchidaceae_spp. (already has enough images)
Skipping Lilium_spp. (already has enough images)
Skipping Tulipa_spp. (already has enough images)
Skipping Bellis_perennis (already has enough images)
Skipping Bougainvillea_glabra (already has enough images)
Skipping Chrysanthemum_indicum (already has enough images)
Skipping Nelumbo_nucifera (already has enough images)
Skipping Pelargonium_spp. (already has enough images)
Skipping Narcissus_spp. (already has enough images)
Skipping Petunia_spp. (already has enough images)
Skipping Zinnia_elegans (already has enough images)
Skipping Dianthus_caryophyllus (already has enough images)
Skipping Ipomoea_purpurea (a

### Scrape the Descriptions

In [10]:

# SETTINGS
CSV_FILE = "species_taxon_ids.csv"
OUTPUT_JSON = "plant_data.json"
DESCRIPTION_SENTENCES = 2
SLEEP_TIME = 0.5
output_file = "plant_data.json"

if os.path.exists(output_file):
    print("Plant Data JSON file exists")
else:
    # Wikipedia API helper
    def get_wikipedia_page(query):
        url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{query.replace(' ', '_')}"
        response = requests.get(url)
        if response.status_code == 200:
            return response.json()
        return None

    def get_full_wikipedia_text(query):
        url = f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&explaintext&format=json&titles={query.replace(' ', '_')}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            page = next(iter(data["query"]["pages"].values()))
            if "extract" in page:
                return page["extract"]
        return None

    # Main script
    plant_info = {}

    with open(CSV_FILE, newline="", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            sci_name = row["scientific_name"].strip()
            common_name = row["common_name"].strip()
            inat_common_name = row["inat_common_name"].strip()

            print(f"Processing {common_name} ({sci_name})...")
            # Get description
            data = get_wikipedia_page(sci_name) or get_wikipedia_page(common_name)
            description = None
            if data and "extract" in data:
                desc_text = data["extract"]
                description = '.'.join(desc_text.split('.')[:DESCRIPTION_SENTENCES]) + '.'

            if not description:
                description = "Description not available."
            # Get fun fact
            # --------------------
            full_text = get_full_wikipedia_text(sci_name) or get_full_wikipedia_text(common_name)
            fun_fact = None
            if full_text:
                paragraphs = [p.strip() for p in full_text.split("\n") if len(p.strip()) > 50]
                if len(paragraphs) > 1:
                    # Take second paragraph if it seems interesting
                    fact_candidate = paragraphs[1]
                    fact_candidate = re.sub(r'\[\d+\]', '', fact_candidate)  # Remove citations
                    fun_fact = '.'.join(fact_candidate.split('.')[:2]).strip() + '.'

            if not fun_fact:
                fun_fact = "Fun-fact not available"
            # Store data
            plant_info[sci_name] = {
                "common_name": common_name,
                "inat_common_name": inat_common_name,
                "description": description,
                "fun_fact": fun_fact
            }

            print(f"Added {common_name}")
            time.sleep(SLEEP_TIME)

    # Save to JSON
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(plant_info, f, ensure_ascii=False, indent=4)

    print(f"\nAll done! Data saved to {OUTPUT_JSON}")

Plant Data JSON file exists


In [None]:
# Path to your main data folder
data_dir = "data"

# Allowed image extensions (case-insensitive)
image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}

i = 0
# Walk through each subfolder
for subfolder in sorted(os.listdir(data_dir)):
    subfolder_path = os.path.join(data_dir, subfolder)

    if os.path.isdir(subfolder_path):
        # Count files with valid extensions
        count = sum(
            1 for f in os.listdir(subfolder_path)
            if os.path.splitext(f)[1].lower() in image_exts
        )
        if count < 100:
            print(f"{subfolder}: {count} images")
        elif count == 100:
            i += 1
print(i)

Citrus_limon: 48 images
Litchi_chinensis: 72 images
94


In [7]:
datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.2,
    horizontal_flip=True,
    brightness_range=[0.8, 1.2]
)

folders = [r"data\Citrus_limon", r"data\Litchi_chinensis"]   # subfolder with only 48 or 72 images
target_count = 100

for folder in folders:
    files = os.listdir(folder)
    current_count = len(files)
    i = 0
    while current_count < target_count:
        img_path = os.path.join(folder, files[i % len(files)])
        img = image.load_img(img_path, target_size=(224, 224))
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)

        for batch in datagen.flow(x, batch_size=1, save_to_dir=folder, save_prefix="aug", save_format="jpg"):
            current_count += 1
            break  # one new image per loop

        i += 1

    print(f"{folder} now has {current_count} images")

data\Citrus_limon now has 100 images
data\Litchi_chinensis now has 100 images


In [9]:
# Path to your main data folder
data_dir = "data"

# Allowed image extensions (case-insensitive)
image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff"}

i = 0
# Walk through each subfolder
for subfolder in sorted(os.listdir(data_dir)):
    subfolder_path = os.path.join(data_dir, subfolder)

    if os.path.isdir(subfolder_path):
        # Count files with valid extensions
        count = sum(
            1 for f in os.listdir(subfolder_path)
            if os.path.splitext(f)[1].lower() in image_exts
        )
        if count < 100:
            print(f"{subfolder}: {count} images")
        elif count == 100:
            i += 1
print(i)

96


In [2]:
DATASET_DIR = r"data"  # your root dataset folder
REPLACEMENTS = {
    "×": "x",  # replace multiplication sign with lowercase x
}

def safe_name(name):
    new_name = name
    for bad_char, replacement in REPLACEMENTS.items():
        new_name = new_name.replace(bad_char, replacement)
    return new_name

def rename_folders(root_dir):
    for current_dir, dirs, files in os.walk(root_dir, topdown=False):
        for d in dirs:
            new_d = safe_name(d)
            if new_d != d:
                old_path = os.path.join(current_dir, d)
                new_path = os.path.join(current_dir, new_d)
                os.rename(old_path, new_path)
                print(f"Renamed: {old_path} → {new_path}")

rename_folders(DATASET_DIR)
print("\nFolder renaming complete.")

Renamed: data\Fragaria_×_ananassa → data\Fragaria_x_ananassa
Renamed: data\Mentha_×_piperita → data\Mentha_x_piperita

Folder renaming complete.


### Preprocess the Images for Modeling

In [11]:
# SETTINGS
RAW_DATA_DIR = "data"
OUTPUT_DIR = "processed_data"
IMAGE_SIZE = (224, 224)
TRAIN_SPLIT = 0.8
VAL_SPLIT = 0.1  # Remaining 0.1 will be test
ALLOWED_EXTENSIONS = {".jpg", ".jpeg", ".png"}

# Helper: check valid image
def is_valid_image(path):
    try:
        with Image.open(path) as img:
            img.verify()
        return True
    except:
        return False

# Step 1: Clean and gather all images
all_species = os.listdir(RAW_DATA_DIR)
print(f"Found {len(all_species)} species folders.")

for species in tqdm(all_species, desc="Processing species"):
    species_path = os.path.join(RAW_DATA_DIR, species)
    if not os.path.isdir(species_path):
        continue

    images = []
    for ext in ALLOWED_EXTENSIONS:
        images.extend(glob.glob(os.path.join(species_path, f"*{ext}")))

    # Remove invalid files
    valid_images = [img for img in images if is_valid_image(img)]

    # Shuffle for randomness
    random.shuffle(valid_images)

    # Step 2: Train/Val/Test split
    n_total = len(valid_images)
    n_train = int(n_total * TRAIN_SPLIT)
    n_val = int(n_total * VAL_SPLIT)
    n_test = n_total - n_train - n_val

    splits = {
        "train": valid_images[:n_train],
        "val": valid_images[n_train:n_train + n_val],
        "test": valid_images[n_train + n_val:]
    }

    # Step 3: Save resized images to OUTPUT_DIR
    for split_name, file_list in splits.items():
        split_dir = os.path.join(OUTPUT_DIR, split_name, species)
        os.makedirs(split_dir, exist_ok=True)

        for file_path in file_list:
            try:
                with Image.open(file_path) as img:
                    img = img.convert("RGB")  # Ensure 3 channels
                    img = img.resize(IMAGE_SIZE)
                    file_name = os.path.basename(file_path)
                    img.save(os.path.join(split_dir, file_name), "JPEG")
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

print("\nPreprocessing complete!")
print(f"Processed dataset saved in: {OUTPUT_DIR}")


Found 96 species folders.


Processing species: 100%|██████████| 96/96 [01:53<00:00,  1.19s/it]


Preprocessing complete!
Processed dataset saved in: processed_data





In [12]:
DATASET_DIR = r"processed_data"  # your root dataset folder
REPLACEMENTS = {
    "×": "x",  # replace multiplication sign with lowercase x
}

def safe_name(name):
    new_name = name
    for bad_char, replacement in REPLACEMENTS.items():
        new_name = new_name.replace(bad_char, replacement)
    return new_name

def rename_folders(root_dir):
    for current_dir, dirs, files in os.walk(root_dir, topdown=False):
        for d in dirs:
            new_d = safe_name(d)
            if new_d != d:
                old_path = os.path.join(current_dir, d)
                new_path = os.path.join(current_dir, new_d)
                os.rename(old_path, new_path)
                print(f"Renamed: {old_path} → {new_path}")

rename_folders(DATASET_DIR)
print("\nFolder renaming complete.")

Renamed: processed_data\test\Fragaria_×_ananassa → processed_data\test\Fragaria_x_ananassa
Renamed: processed_data\test\Mentha_×_piperita → processed_data\test\Mentha_x_piperita
Renamed: processed_data\train\Fragaria_×_ananassa → processed_data\train\Fragaria_x_ananassa
Renamed: processed_data\train\Mentha_×_piperita → processed_data\train\Mentha_x_piperita
Renamed: processed_data\val\Fragaria_×_ananassa → processed_data\val\Fragaria_x_ananassa
Renamed: processed_data\val\Mentha_×_piperita → processed_data\val\Mentha_x_piperita

Folder renaming complete.


In [None]:
# Paths
data_folder = "data"
output_folder = "images/representatives"

# Make sure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Loop over each subfolder
for subfolder in os.listdir(data_folder):
    subfolder_path = os.path.join(data_folder, subfolder)
    
    if os.path.isdir(subfolder_path):
        # Get all image files in subfolder
        images = [f for f in os.listdir(subfolder_path) if f.lower().endswith((".jpg", ".jpeg", ".png"))]
        
        if images:
            # Pick one random image
            random_image = random.choice(images)
            random_image_path = os.path.join(subfolder_path, random_image)
            
            # Clean folder name (replace _ with space)
            # clean_name = subfolder.replace("_", " ")
            
            # Keep original extension
            ext = os.path.splitext(random_image)[1]
            
            # Destination path
            dest_path = os.path.join(output_folder, f"{subfolder}{ext}")
            
            # Copy image
            shutil.copy(random_image_path, dest_path)
            
            print(f"Picked {random_image} → {dest_path}")

In [None]:
def fetch_wikipedia_image(query, output_path):
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{query.replace(' ', '_')}"
    response = requests.get(url).json()
    
    if "thumbnail" in response:
        img_url = response["thumbnail"]["source"]
        img_data = requests.get(img_url).content
        with open(output_path, "wb") as f:
            f.write(img_data)
        print(f"Downloaded {query} → {output_path}")
    else:
        print(f"No image found for {query}")

data_folder = "data"
plants = [subfolder.replace("_", " ") for subfolder in os.listdir(data_folder)]
os.makedirs("images/representatives_clean", exist_ok=True)

for plant in plants:
    output_path = f"images/representatives_clean/{plant.replace(' ', '_')}.jpg"
    fetch_wikipedia_image(plant, output_path)


In [6]:
HEADERS = {
    "User-Agent": "PlantClassifierBot/1.0 (https://github.com/themrandroid; contact: rasheedmuhammed002@gmail.com)"
}

def fetch_wikipedia_image(query, output_path):
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{query.replace(' ', '_')}"
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)

        if response.status_code != 200:
            print(f"Failed request for {query} (status {response.status_code})")
            return False

        data = response.json()

        if "thumbnail" in data:
            img_url = data["thumbnail"]["source"]
            img_data = requests.get(img_url, headers=HEADERS).content
            with open(output_path, "wb") as f:
                f.write(img_data)
            print(f"Downloaded {query} → {output_path}")
            return True
        else:
            print(f"No thumbnail for {query}")
            return False

    except requests.exceptions.RequestException as e:
        print(f"Request error for {query}: {e}")
        return False
    except ValueError:
        print(f"JSON parse error for {query}")
        return False

# Main loop
data_folder = "data"
plants = [subfolder.replace("_", " ") for subfolder in os.listdir(data_folder)]
os.makedirs("images/representatives_clean", exist_ok=True)

for plant in plants:
    output_path = f"images/representatives_clean/{plant.replace(' ', '_')}.jpg"
    
    if os.path.exists(output_path):
        print(f"Skipping {plant}, already exists.")
        continue
    
    if not fetch_wikipedia_image(plant, output_path):
        fetch_wikipedia_image(plant.replace(" ", "_"), output_path)
    
    time.sleep(1)

Failed request for Acacia spp (status 404)
Failed request for Acacia_spp (status 404)
Failed request for Acer spp (status 404)
Failed request for Acer_spp (status 404)
Downloaded Achillea millefolium → images/representatives_clean/Achillea_millefolium.jpg
Downloaded Actinidia deliciosa → images/representatives_clean/Actinidia_deliciosa.jpg
Downloaded Adansonia digitata → images/representatives_clean/Adansonia_digitata.jpg
Failed request for Aloe barbadensis miller (status 404)
Failed request for Aloe_barbadensis_miller (status 404)
Downloaded Anacardium occidentale → images/representatives_clean/Anacardium_occidentale.jpg
Downloaded Ananas comosus → images/representatives_clean/Ananas_comosus.jpg
Downloaded Antirrhinum majus → images/representatives_clean/Antirrhinum_majus.jpg
Failed request for Arecaceae spp (status 404)
Failed request for Arecaceae_spp (status 404)
Downloaded Artemisia absinthium → images/representatives_clean/Artemisia_absinthium.jpg
Downloaded Artocarpus altilis → 