In [12]:
## environment
# conda create -n base2023geonat python numpy pandas scipy matplotlib jupyterlab nodejs plotly dill; conda install -c conda-forge pyinaturalist; 
# pip install ipyplot
# conda activate base2023geonat; conda update --all; conda install -c conda-forge 'pyrate-limiter<3' ## for now required pending fix

## imports
import pandas as pd
import datetime as dt
import ipyplot
import pyinaturalist as inat
import sys

sys.path.insert(0, r"C:\Users\drsvs\Desktop\code")
from pynat import helpers

pd.options.display.max_rows=250

In [13]:
%load_ext autoreload
%autoreload 2

In [None]:
assert False, "run once to set up your iNat access key"

## save API_KEY

API_KEY = ""

with open('pyinaturalistkey.pkd', 'wb') as f:
    dill.dump(API_KEY, f)

# ## load api key
# with open('pyinaturalistkey.pkd', 'rb') as f:
#     API_KEY = dill.load(f)

# import logging
# logging.basicConfig()
# logging.getLogger('pyinaturalist').setLevel('INFO')

### learn about identifications by location of identified not location of identifier

In [None]:
PLACES = {1491:'Powhatan County',
          2920:'Goochland County',
          3032:'Louisa County'}
local_ids = inat.v1.identifications.get_identifications(place_id=list(PLACES.keys()))
local_ids['results'][0].keys()

### check accuracy of identifications

# tri-county stats

In [None]:
PLACES = {1491:'Powhatan County',
          2920:'Goochland County',
          3032:'Louisa County'}

total_observations = inat.get_observations(
    place_id=list(PLACES.keys()),
    verifiable=True,
    per_page=0,
)['total_results']
print(f'Total observations: {total_observations}')

total_taxa = inat.get_observation_species_counts(
    place_id=list(PLACES.keys()),
    verifiable=True,
    per_page=0,
)['total_results']
print(f'Total taxa observed: {total_taxa}')

total_identifiers = inat.get_observation_identifiers(place_id=list(PLACES.keys()), per_page=0)['total_results']
print(f'Total identifiers: {total_identifiers}')

total_observers = inat.get_observation_observers(place_id=list(PLACES.keys()), per_page=0)['total_results']
print(f'Total observers: {total_observers}')

In [None]:
## helper functions from pyinaturalist example

THROTTLING_DELAY = 1.0  # Time to wait in between subsequent requests
TAXON_IMAGE_URL = 'https://raw.githubusercontent.com/inaturalist/inaturalist/main/app/assets/images/iconic_taxa/{taxon}-75px.png'
iconic_taxa = list(inat.ICONIC_TAXA.values())
iconic_taxa.remove('Unknown')

# Run one search for each iconic taxon
def get_iconic_taxa_counts(function):
    iconic_taxa_counts = {}
    for taxon_name in iconic_taxa:
        total_taxon_observations = function(
            place_id=list(PLACES.keys()),
            iconic_taxa=taxon_name,
            verifiable=True,
            per_page=0,
        )['total_results']

        iconic_taxa_counts[taxon_name] = total_taxon_observations
        print(f'Total results for {taxon_name}: {total_taxon_observations}')
        if taxon_name != iconic_taxa[-1]:
            sleep(THROTTLING_DELAY)
    return iconic_taxa_counts

def get_iconic_icon(taxon_name):
    return TAXON_IMAGE_URL.format(taxon=taxon_name.lower())

In [None]:
total_observations_by_iconic_taxon = get_iconic_taxa_counts(inat.get_observations)

# Create a chart, sorted by number of observations, using the appropriate iNaturalist icons
observations_df = pd.DataFrame(
    [
        {'iconic taxon': k, 'observations': v, 'img': get_iconic_icon(k)}
        for k, v in total_observations_by_iconic_taxon.items()
    ]
)

# ## display with altair
# alt.Chart(
#     observations_df,
#     title=f'Verifiable observations in {PLACE_NAME} by iconic taxon',
#     width=750,
#     height=500,
# ).mark_image().encode(x=alt.X('iconic taxon:N', sort='-y'), y='observations:Q', url='img')

observations_df.sort_values('observations', ascending=False)

In [None]:
observations_by_year = inat.get_observation_histogram(
    place_id=list(PLACES.keys()),
    interval='year',
    d1='2008-01-01',
    d2=dt.date.today(),
    verifiable=True,
)
observations_by_year = pd.DataFrame(
    [{'date': k, 'observations': v} for k, v in observations_by_year.items()]
)
observations_by_year.set_index('date').plot();

In [None]:
observations_by_month = inat.get_observation_histogram(
    place_id=list(PLACES.keys()),
    interval='month',
    d1='2016-01-02',
    d2=dt.date.today(),
    verifiable=True,
)
observations_by_month = pd.DataFrame(
    [{'metric': 'Observations', 'date': k, 'count': v} for k, v in observations_by_month.items()]
)
observations_by_month.set_index('date').plot();

In [None]:
monthly_observations = observations_by_month.groupby(observations_by_month['date'].dt.month)['count'].sum().to_frame()
monthly_observations['metric'] = 'Observations'
monthly_observations['count'].plot()

In [None]:
## helper functions from pyinaturalist example
def count_date_range_results(function, start_date, end_date):
    """Get the count of results for the given date range and search function"""
    # Running this search with per_page=0 will (quickly) return only a count of results, not complete results
    response = function(
        place_id=list(PLACES.keys()),
        d1=start_date,
        d2=end_date,
        verifiable=True,
        per_page=0,
    )
    print(f'Total results for {start_date.strftime("%b")}: {response["total_results"]}')
    return response['total_results']


def get_monthly_counts(function, label):
    """Get the count of results per month for the given search function"""
    month_ranges = inat.get_interval_ranges(dt.datetime(2016, 1, 1), dt.datetime.today(), 'month')
    counts_by_month = {
        start_date: count_date_range_results(function, start_date, end_date)
        for (start_date, end_date) in month_ranges
    }
    return pd.DataFrame(
        [{'metric': label, 'date': k, 'count': v} for k, v in counts_by_month.items()]
    )

### this part didn't quite work...

In [None]:
taxa_by_month = get_monthly_counts(inat.get_observation_species_counts, 'Taxa')
observers_by_month = get_monthly_counts(inat.get_observation_observers, 'Observers')
identifiers_by_month = get_monthly_counts(inat.get_observation_identifiers, 'Identifiers')

In [None]:
# taxa_by_month
combined_results = pd.concat([taxa_by_month, observations_by_month, observers_by_month, identifiers_by_month]).pivot(index='date', columns='metric', values='count')
combined_results.plot()

### back on track after simplifying the scope...

In [None]:
observations = []     
taxa = []
observers = []
identifiers = []

for month in range(1,13):
    observations.append(inat.get_observations(place_id=list(PLACES.keys()), month=month, verifiable=True, per_page=0)['total_results'])
    taxa.append(inat.get_observation_species_counts(place_id=list(PLACES.keys()), month=month, verifiable=True, per_page=0)['total_results'])
    observers.append(inat.get_observation_observers(place_id=list(PLACES.keys()), month=month, verifiable=True, per_page=0)['total_results'])
    identifiers.append(inat.get_observation_identifiers(place_id=list(PLACES.keys()), month=month, verifiable=True, per_page=0)['total_results'])
    
counts = pd.DataFrame({'observations':observations, 'taxa':taxa, 'observers':observers, 'identifiers':identifiers})
counts.plot()

### maybe want to separately count observers vs species IDs? two ways of removing bias.

In [None]:
## Question: What if I wanted to contrast observation rate versus identification rate? I'd want to index that on the time of identification not the time of observation. 

# convenience: my recent observations, formatted for photo names

In [None]:
## missing captive/cultivated??

# helpers.get_mine(uname='schizoform', lookback_to=dt.datetime(2024,1,29))
helpers.get_mine(uname='schizoform', 
                 STRT=dt.datetime(2025,8,4,0,0,0),
                 #FNSH=dt.datetime(2024,8,12,0,0,0),
                )

## lookback_to -> lookback_at 


# bloomers
The goal here is to see what plants will be blooming when in my local region. But along the way...

### Coming soon!

what plants might be blooming nest? seeding next? what mushrooms might I see? what (esp. migratory) birds? when are mammals active? 

* DONE alternate entrypoint of specifying lat/long/dist instead of places
* try to narrow time frame to consistently 21-day period?
* normalizations for sort order:
   1. count of all observations at time/place
   2. count of all observations in time and at place separately
   3. count of all observations at time/place by taxa
   4. count of all observations in time and at place separately by taxa
   5. (similar but by phenology?)
   6. (separate totals for 'research grade' and 'informal' counts)
* photographs should match requested phenotype where possible
* add support for caterpillars/butterflies (and similar for benthic macroinverts?)
* split animals by clade and/or generalize interface?

In [None]:
PLACES = {1491:'Powhatan County',
          2920:'Goochland County',
          3032:'Louisa County'}

total_observations = inat.get_observations(
    place_id=list(PLACES.keys()),
    month=list(set( [(dt.date.today()+dt.timedelta(days=-7)).month, (dt.date.today()+dt.timedelta(days=7)).month] )),
    taxon_name='plants',
    verifiable=True,
    per_page=0,
)['total_results']
print(f'Total plant observations: {total_observations}')

inat.get_observations(
    place_id=list(PLACES.keys()),
    month=list(set( [(dt.date.today()+dt.timedelta(days=-7)).month, (dt.date.today()+dt.timedelta(days=7)).month] )),
    taxon_name='plants',
    verifiable=True,
    per_page=0,
)

In [None]:
# target_loc = (37.679849,-77.442868,10) # upper_chick
target_loc = (47.9,-91.6,250) # BWCA Farm Lake
limit = 7
norm = 'overall' # 'time'

In [None]:
# helpers.coming_soon('flowers', places=list(PLACES.keys()))
res = helpers.coming_soon('flowers', loc=target_loc, norm=norm, limit=limit)

In [None]:
res = helpers.coming_soon('fruits', loc=target_loc, norm=norm, limit=limit)

In [None]:
res = helpers.coming_soon('mushrooms', loc=target_loc, norm=norm, limit=limit)

In [None]:
res = helpers.coming_soon('birds', loc=target_loc, norm=norm, limit=limit)

In [None]:
res = helpers.coming_soon('herps', loc=target_loc, norm=norm, limit=limit)

In [None]:
res = helpers.coming_soon('mammals', loc=target_loc, norm=norm, limit=limit)

In [None]:
### NOTE: should exclude amphibia
res = helpers.coming_soon('wugs', loc=target_loc, norm=norm, limit=limit)

In [None]:
## filter images to show desired life stage
## note: graceful handling of null return values

res = helpers.coming_soon('caterpillars', loc=target_loc, norm='overall', limit=3)

In [None]:
## filter images to show desired life stage

res = helpers.coming_soon('butterflies', loc=target_loc, norm='overall', limit=3)

# Experiments w/ ChatGPT 5

## confidence_manimal

general idea is to look at the rate at which I ID vs withdraw ID, slash the rate at which I nevermind-withdrawn ID vs the community taxon. how full of shit am I?

In [5]:
#!/usr/bin/env python3
# iNat ID outcomes for a user on a given taxon
# Requires: Python 3.9+, requests, pandas (optional but nice)

def check_accuracy(target_user, taxon_query):

    import requests, time, math, pandas as pd
    from collections import Counter, defaultdict
    
    BASE = "https://api.inaturalist.org/v1"
    
    TARGET_USER  = target_user #"schizoform"
    TAXON_QUERY  = taxon_query #"partridge pea"   # you can also try "Chamaecrista fasciculata"
    PER_PAGE     = 200               # API max is 200
    SLEEP        = 0.2               # be gentle to the API
    
    def get_taxon_id(q: str) -> int:
        r = requests.get(f"{BASE}/taxa", params={"q": q, "per_page": 1})
        r.raise_for_status()
        results = r.json().get("results", [])
        if not results:
            raise ValueError(f"No taxon found for query: {q}")
        return results[0]["id"]  # take the top match
    
    def paged(endpoint, params):
        page = 1
        while True:
            p = dict(params)
            p["page"] = page
            p["per_page"] = PER_PAGE
            r = requests.get(f"{BASE}/{endpoint}", params=p)
            r.raise_for_status()
            j = r.json()
            results = j.get("results", [])
            if not results:
                break
            for item in results:
                return_obj = item
                yield return_obj
            total_results = j.get("total_results", 0)
            pages = math.ceil(total_results / PER_PAGE) if total_results else page
            page += 1
            if total_results and page > pages:
                break
            time.sleep(SLEEP)
    
    def fetch_identifications(user, taxon_id):
        # Only IDs made BY this user, for observations at/under this taxon.
        # Note: includes withdrawn IDs; includes category (leading/improving/supporting/maverick)
        ids = []
        for it in paged(
            "identifications",
            {
                "user_id": user,
                "taxon_id": taxon_id,
                "current": "any",        # include withdrawn
                "order_by": "created_at",
                "order": "asc",
                "per_page": PER_PAGE,
            },
        ):
            ids.append(it)
        return ids
    
    def chunked(iterable, size):
        chunk = []
        for x in iterable:
            chunk.append(x)
            if len(chunk) == size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk
    
    def fetch_observations(obs_ids):
        obs = {}
        for chunk in chunked(list(obs_ids), 200):
            r = requests.get(f"{BASE}/observations", params={"id": ",".join(map(str, chunk)), "per_page": 200})
            r.raise_for_status()
            for o in r.json().get("results", []):
                obs[o["id"]] = o
            time.sleep(SLEEP)
        return obs
    
    def taxon_lineage_ids(taxon):
        """Return set of ancestor ids + self id if present (from iNat taxon object)."""
        if not taxon:
            return set()
        ids = set(taxon.get("ancestor_ids") or [])
        self_id = taxon.get("id")
        if self_id:
            ids.add(self_id)
        return ids
    
    def classify_identification(id_obj, obs_obj):
        """
        Returns one of: 'withdrawn', 'validated', 'overruled', plus some detail.
        Heuristics:
        - withdrawn: id.current == False
        - validated: id.category in {'leading','improving','supporting'}  OR
                     id.taxon is the same as (or an ancestor/descendant of) the community taxon
        - overruled: id.category == 'maverick'  OR
                     id.taxon not in lineage of community taxon (and not withdrawn)
        """
        # withdrawn?
        if not id_obj.get("current", True):
            return "withdrawn", {"reason": "not current"}
    
        cat = (id_obj.get("category") or "").lower()
        if cat in {"leading", "improving", "supporting"}:
            return "validated", {"reason": f"category={cat}"}
    
        # If category missing/ambiguous, compare to community taxon
        id_taxon = id_obj.get("taxon") or {}
        obs_comm = (obs_obj or {}).get("community_taxon") or {}
    
        id_lineage = taxon_lineage_ids(id_taxon)
        comm_lineage = taxon_lineage_ids(obs_comm)
    
        if id_lineage & comm_lineage:
            return "validated", {"reason": "lineage overlap with community taxon"}
    
        if cat == "maverick":
            return "overruled", {"reason": "category=maverick"}
    
        # fall back: if there is a community taxon and no overlap, call it overruled
        if comm_lineage:
            return "overruled", {"reason": "no lineage overlap with community taxon"}
    
        # If no community taxon yet, treat as 'pending/unclear' (counts separate)
        return "unclear", {"reason": "no community taxon"}
    
    def main():
        taxon_id = get_taxon_id(TAXON_QUERY)
        print(f"Taxon for '{TAXON_QUERY}': {taxon_id}")
    
        id_list = fetch_identifications(TARGET_USER, taxon_id)
        if not id_list:
            print("No identifications found.")
            return
    
        obs_ids = {i["observation"]["id"] for i in id_list if i.get("observation")}
        obs_map = fetch_observations(obs_ids)
    
        rows = []
        for i in id_list:
            obs_id = (i.get("observation") or {}).get("id")
            classification, meta = classify_identification(i, obs_map.get(obs_id))
            rows.append({
                "identification_id": i["id"],
                "obs_id": obs_id,
                "created_at": i.get("created_at"),
                "current": i.get("current"),
                "category": i.get("category"),
                "maverick": i.get("maverick"),
                "id_taxon_id": (i.get("taxon") or {}).get("id"),
                "id_taxon_name": (i.get("taxon") or {}).get("name"),
                "community_taxon_id": ((obs_map.get(obs_id) or {}).get("community_taxon") or {}).get("id"),
                "community_taxon_name": ((obs_map.get(obs_id) or {}).get("community_taxon") or {}).get("name"),
                "classification": classification,
                "detail": meta.get("reason"),
            })
    
        df = pd.DataFrame(rows).sort_values("created_at")
        counts = df["classification"].value_counts().to_dict()
        total = len(df)
    
        print("\n=== Summary for user @{} on taxon '{}': ===".format(TARGET_USER, TAXON_QUERY))
        for k in ["validated", "overruled", "withdrawn", "unclear"]:
            n = counts.get(k, 0)
            pct = (n/total*100) if total else 0
            print(f"{k:10s}: {n:4d}  ({pct:5.1f}%)")
        print(f"total IDs : {total}")
    
        # out_csv = f"inaturalist_{TARGET_USER}_{TAXON_QUERY.replace(' ','_')}_id_outcomes.csv"
        # df.to_csv(out_csv, index=False)
        # print(f"\nWrote detailed rows to: {out_csv}")
    
    if __name__ == "__main__":
        main()

In [3]:
check_accuracy(taxon_user='schizoform', target_taxon='Chamaecrista')

Taxon for 'partridge pea': 53262

=== Summary for user @schizoform on taxon 'partridge pea': ===
validated :   59  ( 90.8%)
overruled :    0  (  0.0%)
withdrawn :    6  (  9.2%)
unclear   :    0  (  0.0%)
total IDs : 65

Wrote detailed rows to: inaturalist_schizoform_partridge_pea_id_outcomes.csv


In [7]:
check_accuracy(target_user='schizoform', taxon_query='Chamaecrista') ### this one should be different bc should include sensitive pea

Taxon for 'Chamaecrista': 53262

=== Summary for user @schizoform on taxon 'Chamaecrista': ===
validated :   59  ( 90.8%)
overruled :    0  (  0.0%)
withdrawn :    6  (  9.2%)
unclear   :    0  (  0.0%)
total IDs : 65


In [8]:
check_accuracy(target_user='schizoform', taxon_query='american sweetgum')

Taxon for 'american sweetgum': 49658

=== Summary for user @schizoform on taxon 'american sweetgum': ===
validated :  189  ( 98.4%)
overruled :    0  (  0.0%)
withdrawn :    3  (  1.6%)
unclear   :    0  (  0.0%)
total IDs : 192


In [9]:
check_accuracy(target_user='schizoform', taxon_query='russula')

Taxon for 'russula': 48341

=== Summary for user @schizoform on taxon 'russula': ===
validated :  310  ( 88.1%)
overruled :    0  (  0.0%)
withdrawn :   42  ( 11.9%)
unclear   :    0  (  0.0%)
total IDs : 352


In [10]:
check_accuracy(target_user='schizoform', taxon_query='mammalia') 
### this taxa id seems oddly high, even if the number of observations seems plausible... 
### though i wonder if the unclear percent is too low?

Taxon for 'mammalia': 40151

=== Summary for user @schizoform on taxon 'mammalia': ===
validated :  888  ( 95.1%)
overruled :    0  (  0.0%)
withdrawn :   44  (  4.7%)
unclear   :    2  (  0.2%)
total IDs : 934


In [11]:
check_accuracy(target_user='schizoform', taxon_query='aves')

Taxon for 'aves': 3

=== Summary for user @schizoform on taxon 'aves': ===
validated : 2509  ( 91.7%)
overruled :    1  (  0.0%)
withdrawn :  223  (  8.2%)
unclear   :    3  (  0.1%)
total IDs : 2736


In [None]:
## get a list of all the clades I provided an ID
## idea: give it a user name and an ID -- what percent were subsequently confirmed accurate? confirmed inaccurate?

In [None]:
# ## step one: get all a usr's observations (e.g. mine)


# def get_species_preview(api_key: str,
#                         user: Tuple[float, float], 
#                        ) -> pd.DataFrame:
#     """
#     Fetches observations of flowering plants from the iNaturalist API within a specified distance 
#     of a given location over the next two weeks for the past 10 years.

#     Args:
#         location (Tuple[float, float]): The latitude and longitude of the location of interest.
#         distance (Union[int, float]): The radius around the location of interest, in kilometers.
#         api_key (str): The iNaturalist API key.

#     Returns:
#         pd.DataFrame: A DataFrame containing the observations of flowering plants.

#     Note:
#         The taxon_id parameter is set to 47126, which is the ID for the kingdom Plantae. This will 
#         return observations of all plants. If you're interested in a specific group of plants, 
#         you'll need to find the appropriate taxon ID.

#         The term_id parameter is set to 12, which is the ID for "flowering". This will return 
#         observations where the plant was reported to be flowering.
#     """
#     # Define the base URL for the iNaturalist API
#     base_url = "https://api.inaturalist.org/v1/observations"

#     # Fetch observations
    
#     # Define the parameters for the API request
#     params: Dict[str, Union[str, int, float]] = {
        
#         "order": "desc",
#         "order_by": "observed_on",
#         #"term_id": 13,  # Term ID for flowering
#     }

#     # Make the API request
#     response = requests.get(base_url, params=params, headers={"Authorization": "Bearer " + api_key})

#     # Convert the response to a pandas DataFrame and append it to the results DataFrame
#     df.append(pd.json_normalize(response.json()["results"]))

