# import packages

In [4]:
import os
import pandas as pd
import random
import shutil
import requests
import matplotlib.pyplot as plt
from pyinaturalist import *
from PIL import Image
import numpy as np
from typing_extensions import Counter

Optional: look up an iNaturalist place ID to restrict the search geographically

In [5]:
# Query iNaturalist's place autocomplete for "France" and show the matching
# places as an {id: name} mapping, so a place ID can be picked for the search.
response = get_places_autocomplete(q='France')
place_names_by_id = dict(
    (place['id'], place['name']) for place in response['results']
)
pprint(place_names_by_id)

[1m{[0m
    [1;36m6753[0m: [32m'France'[0m,
    [1;36m10577[0m: [32m'Île-de-France'[0m,
    [1;36m11367[0m: [32m'Fort-de-France'[0m,
    [1;36m104968[0m: [32m'Francesti'[0m,
    [1;36m30178[0m: [32m'Seine-Saint-Denis'[0m,
    [1;36m38738[0m: [32m'Fort-de-France'[0m,
    [1;36m99548[0m: [32m'Hauts-de-Seine'[0m,
    [1;36m99550[0m: [32m"Val-d'Oise"[0m,
    [1;36m99546[0m: [32m'Val-de-Marne'[0m,
    [1;36m30182[0m: [32m'Yvelines'[0m
[1m}[0m


# Define a function to scrape images from iNaturalist

In [None]:
# Scrape Images and Metadata

# Here we define a function that saves photos of a target species locally and records the necessary metadata: latitude, longitude, observation ID, photo IDs, and photo URLs. Metadata is returned as a list of dictionaries (one per observation), and photos are saved in a directory specified by the user.
# Metadata example:
# observation_id   latitude  longitude    photo_ids  \
# 0       259323505  57.736160  10.629406  [465488543]
# 1       259193935  48.037273  11.509971  [465220887]
# 2       258982331  49.385485  19.790977  [464790765]
# 3       258835093  46.517517   9.908752  [464493974]
# 4       258811645  52.674268   6.516881  [464445739]
#
#                                               photos
# 0  [https://inaturalist-open-data.s3.amazonaws.co...
# 1  [https://inaturalist-open-data.s3.amazonaws.co...
# 2  [https://static.inaturalist.org/photos/4647907...
# 3  [https://static.inaturalist.org/photos/4644939...
# 4  [https://static.inaturalist.org/photos/4644457...

# Parameters
# Directory the downloaded photos are written to. scrape_inaturalist_images
# deletes the files already present in it before downloading.
output_dir = "drive/MyDrive/Colab Notebooks/SnakeMorphs/source_images"
# Forwarded to the observation query as `acc_below`, i.e. an upper bound on the
# positional accuracy of returned observations (presumably metres - confirm
# against the iNaturalist API documentation).
max_accuracy = 1000
# Maximum number of observations collected across all result pages.
record_limiter = 5000

def _fetch_observations(taxon_id, place_id, limit, accuracy_limit, per_page=30):
    """Page through iNaturalist search results and return at most ``limit`` observations."""
    observations = []
    page = 1

    while len(observations) < limit:
        response = get_observations(
            taxon_id=taxon_id,
            photos=True,
            geo=True,
            place_id=place_id,
            identified=True,
            geoprivacy='open',
            acc_below=accuracy_limit,
            page=page,
            per_page=per_page,
        )
        results = response.get('results', [])
        observations.extend(results)

        # Detect the last page from the data itself: a short (or empty) page
        # means nothing further is available. Relying on a 'pages' key is not
        # safe - the response is expected to report 'total_results' instead -
        # and without this check a small result set would never end the loop.
        if len(results) < per_page:
            break
        total_results = response.get('total_results')
        if total_results is not None and page * per_page >= total_results:
            break

        page += 1

    return observations[:limit]


def _clear_directory(directory):
    """Create ``directory`` if it is missing and delete the files directly inside it."""
    # exist_ok=True covers the "exists but empty" case, which must not raise.
    os.makedirs(directory, exist_ok=True)
    for entry in os.listdir(directory):
        entry_path = os.path.join(directory, entry)
        # Sub-directories are left alone: os.remove() cannot delete them.
        if os.path.isfile(entry_path) or os.path.islink(entry_path):
            os.remove(entry_path)


def _download_photo(img_url, img_path, timeout=30):
    """Download ``img_url`` to ``img_path``; raise on network, HTTP or file errors."""
    response = requests.get(img_url, timeout=timeout)
    response.raise_for_status()
    # Open the file only once the download succeeded, so that a failed
    # request does not leave an empty or bogus .jpg behind.
    with open(img_path, 'wb') as f:
        f.write(response.content)


def scrape_inaturalist_images(species_name, place_id=6753):
    """Scrape images and metadata for a target taxon from iNaturalist.

    Observations are fetched page by page until ``record_limiter``
    observations are collected or the results are exhausted. The module-level
    ``output_dir`` is then created if necessary and emptied of files, and
    every photo of every observation is saved there as ``<photo_id>.jpg``.

    Args:
        species_name: Value sent to the API as ``taxon_id``. Despite the
            parameter name this is an iNaturalist taxon ID (e.g. "30889").
        place_id: iNaturalist place ID the search is restricted to. The
            default, 6753, is the value the search was previously fixed to.

    Returns:
        A list with one dict per observation, with the keys
        ``observation_id``, ``latitude``, ``longitude``, ``photo_ids`` and
        ``photos`` (the photo URLs). Photos whose download failed are still
        listed; the failure is only reported on stdout.
    """
    # Fetch first, so a failing query does not wipe previously saved photos.
    observations = _fetch_observations(
        species_name, place_id, record_limiter, max_accuracy
    )

    # Ensure the output directory exists and holds no stale files.
    _clear_directory(output_dir)

    metadata = []
    for obs in observations:
        # 'geojson' may be present but null, so fall back explicitly.
        coordinates = (obs.get('geojson') or {}).get('coordinates') or [None, None]
        # GeoJSON order is [longitude, latitude].
        longitude, latitude = coordinates[0], coordinates[1]
        photos = obs.get('photos') or []

        # The API returns thumbnail ("square") URLs; request the original size.
        photo_urls = [
            (photo.get('url') or "").replace("square", "original")
            for photo in photos
        ]
        photo_ids = [photo.get('id', None) for photo in photos]

        # Download and save photos; one failure must not abort the whole run.
        for photo_id, img_url in zip(photo_ids, photo_urls):
            img_path = os.path.join(output_dir, f"{photo_id}.jpg")
            try:
                _download_photo(img_url, img_path)
            except (requests.RequestException, OSError) as e:
                print(f"Error downloading photo {photo_id}: {e}")

        metadata.append({
            "observation_id": obs.get('id', None),
            "latitude": latitude,
            "longitude": longitude,
            "photo_ids": photo_ids,  # List of photo IDs
            "photos": photo_urls,  # List of photo URLs
        })

    return metadata


# Use scraping function

In [66]:
# Usage
# NOTE: despite the variable name, this value is forwarded to the query as
# `taxon_id`, so it must be an iNaturalist taxon ID rather than a species name.
species_name = "30889"
metadata = scrape_inaturalist_images(species_name)

# Convert metadata to a DataFrame for easier visualization
# (one row per observation; the photo_ids and photos columns hold lists)
df = pd.DataFrame(metadata)

# Save metadata to a CSV file
# (written into output_dir, i.e. next to the downloaded photos)
metadata_path = os.path.join(output_dir, "metadata.csv")
df.to_csv(metadata_path, index=False)

# Display the DataFrame
print(df.head())
# Number of observations collected (shown as the cell's output)
len(df)


   observation_id   latitude  longitude    photo_ids  \
0       259414932  61.048887  14.443412  [465674164]   
1       259341747  60.338892  28.577207  [465523509]   
2       259323505  57.736160  10.629406  [465488543]   
3       259194544  43.119729  -1.040918  [465223629]   
4       259193935  48.037273  11.509971  [465220887]   

                                              photos  
0  [https://inaturalist-open-data.s3.amazonaws.co...  
1  [https://inaturalist-open-data.s3.amazonaws.co...  
2  [https://inaturalist-open-data.s3.amazonaws.co...  
3  [https://inaturalist-open-data.s3.amazonaws.co...  
4  [https://inaturalist-open-data.s3.amazonaws.co...  


1000

# Open a photo and check size


In [67]:
import os
import random
from PIL import Image

# Collect only the downloaded photos. output_dir also holds non-image files
# (metadata.csv, and images.zip once the zip step has run) that PIL cannot
# open, so picking at random from every file could fail.
image_files = [
    f for f in os.listdir(output_dir)
    if f.endswith('.jpg') and os.path.isfile(os.path.join(output_dir, f))
]

# Fail with a clear message instead of random.choice's generic IndexError.
if not image_files:
    raise FileNotFoundError(f"No .jpg images found in '{output_dir}'")

# Select a random image file
random_image_file = random.choice(image_files)

# Construct the full path to the random image
random_image_path = os.path.join(output_dir, random_image_file)

# Open the random image
image = Image.open(random_image_path)

# Get dimensions
width, height = image.size
print(f"Width: {width}, Height: {height}")

# Display the image (optional)
display(image)

print(image_files)

Output hidden; open in https://colab.research.google.com to view.

# Zip photos (.jpg)

In [71]:
import os
import zipfile

def zip_jpg_files(output_directory, zip_file_name="images.zip"):
    """
    Bundle every .jpg file found under a directory into a single zip archive.

    The directory is walked recursively; each matching file is stored in the
    archive under its bare file name (no folder component).

    Args:
        output_directory: Directory that is searched for .jpg files and that
            also receives the archive.
        zip_file_name: File name of the archive to create (default: "images.zip").

    Returns:
        None. The archive is written to disk and a confirmation is printed.
    """
    archive_path = os.path.join(output_directory, zip_file_name)

    with zipfile.ZipFile(archive_path, 'w') as archive:
        for folder, _subdirs, names in os.walk(output_directory):
            for name in names:
                # Anything that is not a .jpg is skipped
                if not name.endswith('.jpg'):
                    continue
                # Store the file flat, under its own name
                archive.write(os.path.join(folder, name), arcname=name)

    # Confirm the archive creation
    print(f"All .jpg files in '{output_directory}' have been zipped to '{zip_file_name}'")


# Call the zip function: archive the .jpg photos found in output_dir into the
# default "images.zip", which is created inside output_dir itself.
zip_jpg_files(output_dir)

<generator object _walk at 0x79304ec91a20>
