In [95]:
# testing retrieval of iNaturalist API bird sightings
import requests
import time
import pandas as pd

def get_bird_observations(place_id, start_date, end_date, per_page=200,
                          timeout=30, request_delay=1.0):
    """Download bird (taxon 3, "Aves") observations from the iNaturalist API.

    Pages through ``/v1/observations`` for the given place and date window,
    additionally restricted to a fixed bounding box
    (43.60 to 43.85 latitude, -79.60 to -79.20 longitude).

    Parameters
    ----------
    place_id : int
        iNaturalist place ID sent as ``place_id``.
    start_date, end_date : str
        Inclusive date bounds sent as ``d1`` / ``d2`` (e.g. "2022-01-01").
    per_page : int, default 200
        Requested page size.
    timeout : float, default 30
        Seconds to wait for each HTTP response before ``requests`` raises.
    request_delay : float, default 1.0
        Seconds to pause between consecutive page requests; 0 disables it.

    Returns
    -------
    pandas.DataFrame
        One row per observation with the columns ``species_name``,
        ``observed_on``, ``latitude``, ``longitude``, ``time`` and
        ``place_guess``. The columns are present even when nothing was
        retrieved, so downstream column access does not fail.

    Notes
    -----
    A non-200 response prints the status code and stops paging; whatever was
    collected so far is returned.
    NOTE(review): iNaturalist documents a 10,000-result ceiling for
    page-based pagination; larger pulls need ``id_above`` paging - confirm
    against the API docs if the date window grows.
    """
    url = "https://api.inaturalist.org/v1/observations"
    columns = ["species_name", "observed_on", "latitude", "longitude",
               "time", "place_guess"]
    all_observations = []
    page = 1

    while True:
        params = {
            "taxon_id": 3,            # Taxon ID for "Aves" (birds)
            "place_id": place_id,     # Geographic place ID
            "nelat": 43.85,           # Northeast latitude
            "nelng": -79.2,           # Northeast longitude
            "swlat": 43.6,            # Southwest latitude
            "swlng": -79.6,           # Southwest longitude
            "d1": start_date,         # Start date
            "d2": end_date,           # End date
            "per_page": per_page,     # Number of observations per page
            "page": page,             # Current page
        }

        response = requests.get(url, params=params, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            break

        data = response.json()
        results = data.get("results", [])

        if not results:
            break  # An empty page means there is nothing left to read

        for obs in results:
            # These keys can be present with a null value, in which case
            # obs.get(key, {}) would hand back None; `or` covers both cases.
            taxon = obs.get("taxon") or {}
            geojson = obs.get("geojson") or {}
            coordinates = geojson.get("coordinates") or [None, None]
            all_observations.append({
                "species_name": taxon.get("name"),
                "observed_on": obs.get("observed_on"),
                "latitude": coordinates[1],   # GeoJSON order is [lng, lat]
                "longitude": coordinates[0],
                "time": obs.get("time_observed_at"),
                "place_guess": obs.get("place_guess"),
            })

        # Prefer the page size the server reports in case it capped ours.
        page_size = data.get("per_page") or per_page
        total_results = data.get("total_results")
        # Without a reported total we simply keep going until an empty page.
        if total_results is not None and page * page_size >= total_results:
            break

        page += 1
        if request_delay:
            time.sleep(request_delay)  # Stay polite towards the public API

    return pd.DataFrame(all_observations, columns=columns)

# Date window and iNaturalist place used for the observation pull.
end_date = "2025-02-24"
start_date = "2022-01-01"
place_id = 204988

bird_df = get_bird_observations(place_id, start_date, end_date)

# Keep only the trailing 14 characters of each timestamp string;
# entries that are None are passed through unchanged.
bird_df['time'] = bird_df['time'].apply(
    lambda stamp: None if stamp is None else stamp[-14:]
)

In [97]:
# scientific to common name for iNaturalist data

def get_common_name(species_name, delay=1, timeout=30):
    """Look up the preferred common name for a scientific name on iNaturalist.

    Parameters
    ----------
    species_name : str
        Scientific name to search for. Anything that is not a non-blank
        string (None, NaN, "") returns None without contacting the API:
        ``requests`` omits parameters whose value is None, so such a query
        would otherwise run unfiltered and return an unrelated taxon.
    delay : float, default 1
        Seconds to sleep after the request (also after a failed one) to
        throttle repeated calls.
    timeout : float, default 30
        Seconds to wait for the HTTP response.

    Returns
    -------
    str or None
        The ``preferred_common_name`` of the result whose ``name`` equals
        ``species_name`` (case-insensitive), falling back to the first result
        when no exact match exists; "No common name found" when the chosen
        taxon has no such field; None on empty results, a non-200 response or
        any exception (which is printed, keeping the lookup best-effort).
    """
    if not isinstance(species_name, str) or not species_name.strip():
        return None

    url = "https://api.inaturalist.org/v1/taxa"
    query = species_name.strip()
    try:
        response = requests.get(
            url, params={"q": query, "rank": "species"}, timeout=timeout
        )

        if response.status_code != 200:
            print(f"Error: {response.status_code} for species: {species_name}")
            return None

        results = response.json().get("results", [])
        if not results:
            return None

        # `q` is a text search, so the top hit is not guaranteed to be the
        # taxon we asked for; prefer an exact scientific-name match.
        wanted = query.lower()
        taxon = next(
            (t for t in results if str(t.get("name", "")).lower() == wanted),
            results[0],
        )
        return taxon.get("preferred_common_name", "No common name found")
    except Exception as e:
        # Deliberately broad: one bad lookup must not abort a long batch.
        print(f"Exception occurred: {e}")
        return None
    finally:
        # Throttle on every path that actually attempted a request.
        time.sleep(delay)
    
# Resolve each distinct scientific name once. Missing names are skipped so
# they are never sent to the taxa search.
unique_species = bird_df["species_name"].dropna().unique()

mapped_names = [
    {"species_name": species, "common_name": get_common_name(species, delay=1)}
    for species in unique_species
]

# Explicit columns keep the merge valid even when there was nothing to look up.
common_name_df = pd.DataFrame(mapped_names, columns=["species_name", "common_name"])

# Drop any common_name left by a previous run of this cell first; otherwise a
# re-run would yield common_name_x / common_name_y instead of one column.
bird_df = (
    bird_df
    .drop(columns="common_name", errors="ignore")
    .merge(common_name_df, on="species_name", how="left")
)

In [22]:
#eBird observations retrieval (2020-2025)

import requests
import pandas as pd
from datetime import datetime, timedelta

import os

# The eBird token is read from the environment so it never lands in the
# notebook or in version control. Any token that was previously committed in
# this cell should be treated as leaked and regenerated.
api_key = os.environ.get("EBIRD_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the EBIRD_API_KEY environment variable to your eBird API token."
    )
base_url = "https://api.ebird.org/v2/data/obs/CA-ON-TO/historic/{year}/{month}/{day}"
headers = {"X-eBirdApiToken": api_key}

# Define date range (both ends inclusive)
start_date = datetime(2020, 1, 1)
end_date = datetime(2025, 2, 24)

all_data = []

# One request per calendar day; a shared session reuses the connection.
with requests.Session() as session:
    session.headers.update(headers)

    current_date = start_date
    while current_date <= end_date:
        year, month, day = current_date.year, current_date.month, current_date.day
        url = base_url.format(year=year, month=month, day=day)

        try:
            response = session.get(url, timeout=30)
        except requests.RequestException as exc:
            # A single failed day is reported and skipped so it does not
            # throw away everything already downloaded.
            print(f"Error retrieving data for {current_date.strftime('%Y-%m-%d')}: {exc}")
        else:
            if response.status_code == 200:
                daily_data = response.json()
                if daily_data:
                    all_data.extend(daily_data)
            else:
                print(f"Error retrieving data for {current_date.strftime('%Y-%m-%d')}: {response.status_code}")

        current_date += timedelta(days=1)  # Move to the next day

# Convert list to DataFrame
if all_data:
    data = pd.DataFrame(all_data)

    # .copy() makes the column subset an independent frame, so the assignment
    # below does not trigger pandas' chained-assignment warning.
    data = data[['speciesCode', 'comName', 'obsDt', 'lat', 'lng', 'howMany', 'locName', 'obsValid']].copy()
    data['obsValid'] = data['obsValid'].astype(bool)

    # Sum of `howMany` per common name, computed once for both statistics.
    sightings_per_species = data.groupby('comName')['howMany'].sum()

    summary_stats = {
        "Number of observations": [len(data)],
        "Number of unique species": [data['comName'].nunique()],
        "Number of unique locations": [data['locName'].nunique()],
        "Percentage of validated observations": [data['obsValid'].mean() * 100],
        "Average sightings per species": [sightings_per_species.mean()],
        "Std dev of sightings per species": [sightings_per_species.std()]
    }

    summary_table = pd.DataFrame(summary_stats)

    print(summary_table)

    # Written only when something was retrieved; outside this branch `data`
    # would be undefined.
    data.to_csv("ebird_20_25.csv", index=False)

else:
    print("No data retrieved.")

   Number of observations  Number of unique species  \
0                  187479                       351   

   Number of unique locations  Percentage of validated observations  \
0                        6954                                 100.0   

   Average sightings per species  Std dev of sightings per species  
0                    2803.521368                       8369.794123  


In [26]:
# Keep only locations that appear in at least 100 rows of `data`.
location_counts = data["locName"].value_counts()
has_enough_rows = location_counts >= 100
valid_locations = location_counts.loc[has_enough_rows].index

filtered_df = data.loc[data["locName"].isin(valid_locations)]

# Per-species sum of `howMany` over the retained rows, shared by the mean and
# standard deviation below.
filtered_species_totals = filtered_df.groupby('comName')['howMany'].sum()

filtered_summary_stats = dict([
    ("Number of unique species", [filtered_df['comName'].nunique()]),
    ("Number of unique birding locations", [filtered_df['locName'].nunique()]),
    ("Average sightings per species", [filtered_species_totals.mean()]),
    ("Std dev of sightings per species", [filtered_species_totals.std()]),
])

filtered_summary_table = pd.DataFrame(filtered_summary_stats)

In [41]:
# Keep only species whose summed `howMany` is at least 10.
# NOTE(review): this cell's earlier comment said "less than 20 sightings",
# but the threshold in the code is 10 - confirm which cutoff is intended.
species_sightings = filtered_df.groupby('comName')['howMany'].sum()
meets_cutoff = species_sightings >= 10
valid_species = species_sightings.loc[meets_cutoff].index

species_filtered_df = filtered_df.loc[filtered_df['comName'].isin(valid_species)]

# Per-species totals recomputed on the retained rows, shared by both statistics.
retained_species_totals = species_filtered_df.groupby('comName')['howMany'].sum()

species_summary_stats = dict([
    ("Number of unique species", [species_filtered_df['comName'].nunique()]),
    ("Number of unique birding locations", [species_filtered_df['locName'].nunique()]),
    ("Average sightings per species", [retained_species_totals.mean()]),
    ("Std dev of sightings per species", [retained_species_totals.std()]),
])

species_summary_table = pd.DataFrame(species_summary_stats)

In [None]:
from meteostat import Daily, Stations
import pandas as pd
from datetime import datetime
from tqdm import tqdm  # Progress bar

# Load your eBird dataset (update file path if needed)
# NOTE(review): the retrieval cell in this notebook writes "ebird_20_25.csv"
# with columns obsDt / lat / lng, whereas this cell reads
# "ebird_sightings.csv" and uses date / latitude / longitude below - confirm
# this file is produced elsewhere with those column names.
ebird_data = pd.read_csv("ebird_sightings.csv")

# Ensure 'date' column is in datetime format
# (the per-row loop below reads .year/.month/.day from each value)
ebird_data["date"] = pd.to_datetime(ebird_data["date"])

# Initialize an empty list to store weather data
weather_results = []

# Iterate over each observation
for _, row in tqdm(ebird_data.iterrows(), total=len(ebird_data), desc="Processing observations"):
    lat, lon, sighting_date = row["latitude"], row["longitude"], row["date"]
    
    # Convert sighting date to datetime format
    sighting_date = datetime(sighting_date.year, sighting_date.month, sighting_date.day)

    # Find the nearest weather station
    stations = Stations().nearby(lat, lon).fetch(1)
    
    if not stations.empty:
        station_id = stations.iloc[0]['id']  # Get closest 

