### Batch processing of iMap records using iNaturalist VisionAPI

#### Import necessary libraries

In [118]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests
%matplotlib inline
from PIL import Image
from io import BytesIO
import math
import json
from IPython.display import display, HTML

### Set working directory

In [121]:
os.getcwd() # Display the kernel's current working directory (shown as the cell output)

'C:\\Users\\tmollick\\Documents\\iMapML\\Outputs'

In [123]:
os.chdir(r"C:\Users\tmollick\Documents\iMapML\Outputs") # Switch to the output folder; the relative file names used in the final report cell resolve against it
# os.listdir() # Uncomment to list the files in the new working directory

### Read the selected iMap presence IDs for identification by VisionAPI

In [126]:
# Read one presence ID per line from the text file.
# Blank lines (for example a trailing empty line at the end of the file) are
# skipped so that int() is never called on an empty string.
with open(r"C:\Users\tmollick\Documents\iMapML\Datasets\presence_ids_with_photos.txt", "r") as file:  # Change the directory to read the presence ids from the text file
    presence_ids_with_photos = [int(line.strip()) for line in file if line.strip()]

# Print the list to verify
print(presence_ids_with_photos)


[1332475, 1332476, 1393477, 1338545, 1248259, 1340080, 1352795, 1352795, 1019292, 1183051, 1183051, 1045374, 1323985, 1324561, 1332475, 1342737, 1344279, 1355565, 1346222, 1348527, 1351159, 1351160, 1351161, 1351361, 1352774, 1355297, 1355661, 1356143, 1391918, 512894, 1249974, 512896, 1352795, 1365813, 1410194, 1410778, 1390011, 1182390, 1391610, 1393455, 1069269, 1165069, 1165071, 1248256, 1271707, 1271710, 1272390, 1272676, 1272681, 1273444, 1283776, 1393478, 1037387, 1045228, 1061881, 1064408, 1078275, 1393542, 1394580, 1395140, 1395141, 1395697, 1407697, 1411932, 1411934, 1412011, 1412340, 1412327, 1413908, 1413910, 1414413, 1413907, 1438087, 1438089, 1438090, 1416694, 1441578, 1443662, 1435543, 1437877, 1439152, 1439282, 1440283, 1440287, 1441455, 1441456, 1442429, 1443498, 1443499, 1443499, 1443499, 1443499, 1444043, 1443662, 1443662, 1443662, 1443662, 1443662, 1443662]


In [128]:
# Alias the loaded ID list under the name that is passed to process_multiple_species_ids below
presence_ids = presence_ids_with_photos

### Accessing iMapInvasives API 
Create an account at https://www.imapinvasives.org/ and use that account's username and password to access the iMap API.
#### Use your iMap username and password in the code below to get access to the iMap API

In [131]:
import os
import requests

imap_site = "imapinvasives"
# Credentials are taken from the IMAP_USERNAME / IMAP_PASSWORD environment
# variables when they are set, so real secrets do not have to be saved in the
# notebook. If they are not set, the placeholder strings below are used and
# must be replaced by hand, exactly as before.
iMap_username = os.environ.get("IMAP_USERNAME", "your iMap_username")   # Replace with your iMap username
iMap_password = os.environ.get("IMAP_PASSWORD", "your iMap_password")   # Replace with your iMap password

# Seconds to wait for the iMap server before giving up instead of hanging forever
imap_request_timeout = 60

login_url = r"https://{0}.natureserve.org/imap/j_spring_security_check".format(imap_site)

# Attempt to log in
print("\nAttempting to authenticate with: {0}".format(login_url))
iMapSession = requests.Session()  # This is a global variable accessed in later functions
login_response = iMapSession.post(
    login_url,
    {'j_username': iMap_username, 'j_password': iMap_password},
    timeout=imap_request_timeout,
)
login_response_message = "\nlogin response: {0}".format(login_response.status_code)
print(login_response_message)
login_response.raise_for_status()

# Attempt to access a record to check if log-in was successful.
# The status code of the login POST alone is not treated as proof of a valid login;
# this follow-up request is what confirms access.
test_aoi_url = r"https://{0}.natureserve.org/imap/services/aoi/new".format(imap_site)
test_aoi_record = iMapSession.get(test_aoi_url, timeout=imap_request_timeout)
test_aoi_record_message = "\nTest record access response: {0}".format(test_aoi_record.status_code)
print(test_aoi_record_message)

if test_aoi_record.status_code == 403:
    print("\nResponse Code 403 is most likely the result of an incorrectly entered iMap username or password.  It may also be caused by logging in as a user with insufficient permissions.")

test_aoi_record.raise_for_status()


Attempting to authenticate with: https://imapinvasives.natureserve.org/imap/j_spring_security_check

login response: 200

Test record access response: 200


### Species Identification using VisionAPI
#### This code saves progress incrementally by writing to the Excel file after processing each presence ID. If the script encounters an error or disconnection, it can be resumed without reprocessing the completed records.

#### Create a RapidAPI account at https://rapidapi.com and subscribe to the iNaturalist VisionAPI. If the VisionAPI does not appear in the RapidAPI search results, contact the iNaturalist team; they will create an account for you.

### Use your RapidAPI key for the iNaturalist VisionAPI in the code below

In [134]:
import requests
import json
from PIL import Image
from io import BytesIO
import pandas as pd
import os
import time

# Path of the Excel results file. process_multiple_species_ids reads it (if it exists)
# to find already-processed presence IDs and rewrites it after every processed record.
output_file_path = r"C:\Users\tmollick\Documents\iMapML\Outputs\Unconfirmed_100_mixed_species.xlsx" # Change your output path

# Function to extract latitude and longitude from the imap_dictionary
def get_lat_lon_from_imap(imap_dictionary):
    """Return the (latitude, longitude) of the first populated presence geometry.

    The keys "presencePoint", "presenceLine" and "presencePolygon" are checked
    in that order. The first one holding a non-empty value supplies the result
    through its 'latitude' and 'longitude' entries (either may be None if the
    entry is missing).

    Returns:
        tuple: (latitude, longitude), or (None, None) when no geometry is present.
    """
    for geometry_key in ("presencePoint", "presenceLine", "presencePolygon"):
        geometry = imap_dictionary.get(geometry_key)
        if not geometry:
            continue
        return geometry.get('latitude'), geometry.get('longitude')

    return None, None

# Function to reduce the image size in memory
def reduce_image_size(image, max_size_kb=700):
    """Re-encode an image as JPEG in memory until it fits under a size limit.

    Starting at quality 95, the image is saved to a fresh buffer and the
    quality is lowered in steps of 5 until the encoded size is at most
    ``max_size_kb`` kilobytes or the quality floor of 10 is reached.

    Args:
        image: A PIL image (anything exposing ``info``, ``mode``, ``convert``
            and ``save``).
        max_size_kb: Target upper bound for the encoded size, in kilobytes.

    Returns:
        BytesIO: Buffer holding the JPEG bytes. The stream position is left at
        the end of the data, so callers must ``seek(0)`` before reading.
    """
    # Capture EXIF before any mode conversion so the metadata is still embedded.
    exif_data = image.info.get('exif')

    # The JPEG writer cannot encode alpha or palette modes (RGBA, LA, P, ...),
    # which are common for PNG uploads; saving them raises OSError.
    # Convert those to RGB and leave already-supported modes untouched.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if image.mode not in jpeg_writable_modes:
        image = image.convert("RGB")

    save_options = {"exif": exif_data} if exif_data else {}
    max_bytes = max_size_kb * 1024
    quality = 95
    while True:
        buffer = BytesIO()
        image.save(buffer, "JPEG", quality=quality, **save_options)
        # Stop when small enough, or when quality cannot sensibly go lower.
        if buffer.tell() <= max_bytes or quality <= 10:
            return buffer
        quality -= 5

# Function to identify species using iNaturalist VisionAPI
def identify_species(image_bytes, lat=None, lon=None):
    """Send an image to the iNaturalist VisionAPI (via RapidAPI) for scoring.

    Args:
        image_bytes: File-like object or bytes of the image, uploaded as the
            multipart field 'image'.
        lat: Optional latitude; sent only when ``lon`` is also given.
        lon: Optional longitude; sent as the form field 'lng'.

    Returns:
        The decoded JSON response on HTTP 200, otherwise None.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the request exceeds the timeout.
    """
    url = "https://visionapi.p.rapidapi.com/v1/rapidapi/score_image"
    # Prefer the RAPIDAPI_KEY environment variable so the key does not have to be
    # saved in the notebook; otherwise fall back to the placeholder to be edited here.
    api_key = os.environ.get(
        "RAPIDAPI_KEY",
        "Replace with your RapidAPI key for the iNaturalist VisionAPI",   # Replace with your VisionAPI RapidAPI key
    )
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "visionapi.p.rapidapi.com"
    }

    files = {'image': image_bytes}
    data = {}
    if lat is not None and lon is not None:
        data['lat'] = lat
        data['lng'] = lon

    # Without a timeout a stalled connection would block the whole batch forever.
    response = requests.post(url, headers=headers, files=files, data=data, timeout=60)

    if response.status_code == 200:
        return response.json()

    # Report the failure so an invalid key or exhausted quota is visible in the
    # log instead of silently producing "Unknown" rows.
    print(f"VisionAPI request failed. Status code: {response.status_code}")
    return None

# Function to get the image from a URL, handling redirects and setting headers
def get_image_from_url(url):
    """Download an image and return it as a fully decoded PIL image.

    Redirects are followed and a browser-like User-Agent header is sent.

    Args:
        url: Address of the photo to download.

    Returns:
        PIL.Image.Image on success; None when the server does not answer with
        HTTP 200 or the payload cannot be decoded as an image.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the request exceeds the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Without a timeout a stalled download would block the whole batch forever.
    response = requests.get(url, headers=headers, allow_redirects=True, timeout=60)

    if response.status_code != 200:
        print(f"Failed to retrieve image. Status code: {response.status_code}")
        return None

    try:
        image = Image.open(BytesIO(response.content))
        # Image.open only reads the header. Force the pixel data to be decoded
        # here so a truncated or corrupt file is reported now, rather than
        # raising later while the image is being re-encoded.
        image.load()
        return image
    except Exception as e:
        print(f"Failed to open image. Error: {e}")
        return None

# Function to process and identify species for all presentSpeciesId
def process_species_images(imap_dictionary, base_url):
    """Build one output row per species entry of an iMap presence record.

    For each entry in ``imap_dictionary["speciesList"]`` the first photo (if
    any) is downloaded, shrunk, and sent to the VisionAPI together with the
    record's coordinates. The highest 'original_combined_score' result is
    compared with the taxon ID stored on the iMap record.

    Args:
        imap_dictionary: Decoded JSON of one iMap presence record.
        base_url: Unused; kept so existing callers keep working.

    Returns:
        list[list]: Rows of 17 values in the column order of the output sheet.
        Element 0 (S.L.) is None and is filled in by the caller. Fields that
        could not be determined hold the string "Unknown"; a missing or
        undownloadable photo is reported as "No Photo".
    """
    data_rows = []
    species_list = imap_dictionary.get("speciesList") or []

    for species in species_list:
        present_species_id = species.get("presentSpeciesId")
        presence_id = species.get("presenceId")

        # "or {}" also covers a key that is present but null in the JSON.
        national_species = species.get("nationalSpeciesList") or {}
        imap_sci = national_species.get("scientificName", "Unknown")
        imap_com = national_species.get("commonName", "Unknown")
        imap_record_taxon = national_species.get("inaturalistTaxonId", "Unknown")

        # Defaults used whenever no identification is obtained.
        inat_sci = "Unknown"
        inat_com = "Unknown"
        inat_taxon = "Unknown"
        geo_score = "Unknown"
        com_score = "Unknown"
        inatlink_html = "Unknown"

        # Select only the first photo URL
        photos = species.get("photos") or []
        photo_url = photos[0].get("photoUrl", "") if photos else None

        img = get_image_from_url(photo_url) if photo_url else None
        if img is None:
            # Either no photo is attached or it could not be downloaded/decoded.
            photo_url = "No Photo"
        else:
            resized_image_buffer = reduce_image_size(img, max_size_kb=700)
            lat, lon = get_lat_lon_from_imap(imap_dictionary)
            resized_image_buffer.seek(0)  # rewind: the buffer position is at the end after saving
            result = identify_species(resized_image_buffer, lat, lon)

            if result and result.get("results"):
                # Select the best result based on combined score
                top_result = max(result["results"], key=lambda x: x["original_combined_score"])
                taxon = top_result['taxon']
                inat_sci = taxon['name']
                # "or" also covers a common name that is present but null.
                inat_com = (taxon.get('preferred_common_name') or 'Unknown').capitalize()
                inat_taxon = taxon['id']
                geo_score = round(top_result['original_geo_score'], 2)
                com_score = round(top_result['original_combined_score'], 2)
                inatlink_html = f'=HYPERLINK("https://www.inaturalist.org/taxa/{inat_taxon}-{inat_sci.replace(" ", "-")}", "View")'

        # A match requires a real identification. Two "Unknown" placeholders
        # (no taxon ID on the record and no VisionAPI result) must not count as a match.
        is_match = inat_taxon != "Unknown" and str(imap_record_taxon) == str(inat_taxon)
        species_label = 1 if is_match else 0

        imaplink = f"https://imapinvasives.natureserve.org/imap/services/page/Presence/{presence_id}.html"
        imaplink_html = f'=HYPERLINK("{imaplink}", "View")'
        iMapPhoto_html = f'=HYPERLINK("{photo_url}", "View")' if photo_url != "No Photo" else "No Photo"

        # The two empty strings are the visual_model and com_status columns, left blank here.
        row = [None, imaplink_html, presence_id, present_species_id, iMapPhoto_html, imap_sci, imap_com, imap_record_taxon, inatlink_html, inat_sci, inat_com, inat_taxon, geo_score, com_score, "", species_label, ""]
        data_rows.append(row)

    return data_rows

# Process all the presence IDs
def process_multiple_species_ids(presence_ids, max_retries=5, retry_delay=5):
    """Fetch, identify, and save every presence ID, resuming from earlier runs.

    Each presence record is requested through the global ``iMapSession``,
    turned into rows by ``process_species_images``, appended to the results
    table, and the whole table is written to ``output_file_path`` right away.
    IDs already present in that file are skipped.

    Args:
        presence_ids: Iterable of iMap presence IDs (ints).
        max_retries: How many times a failing request for one ID is retried
            before that ID is skipped.
        retry_delay: Seconds to wait between retries.

    Returns:
        None. Results are written to the Excel file.
    """
    # Imported locally so the function does not rely on another cell having imported it.
    from openpyxl import load_workbook

    base_url = "https://imapinvasives.natureserve.org/imap/services/presence/"  # Keeping the base URL unchanged for the API
    output_file = output_file_path
    columns = [
        "S.L.", "imaplink", "presenceId", "presentSpeciesId", "iMapPhoto", "imap_sci",
        "imap_com", "imap_record_taxon", "inatlink", "inat_sci", "inat_com", "inat_taxon",
        "geo_score", "com_score", "visual_model", "species_label", "com_status"
    ]

    # Load existing data if the file exists
    if os.path.exists(output_file):
        # The link columns are stored as =HYPERLINK(...) formulas without cached
        # values. pd.read_excel returns NaN for such cells, which would erase the
        # links of earlier rows on the next save. Reading with data_only=False
        # keeps the formula text.
        workbook = load_workbook(output_file, data_only=False)
        sheet_rows = list(workbook.active.iter_rows(values_only=True))
        workbook.close()
        if sheet_rows:
            df_existing = pd.DataFrame(sheet_rows[1:], columns=list(sheet_rows[0]))
        else:
            df_existing = pd.DataFrame(columns=columns)
        processed_ids = set(df_existing["presenceId"].dropna().astype(int))
    else:
        df_existing = pd.DataFrame(columns=columns)
        processed_ids = set()

    for idx, presence_id in enumerate(presence_ids, start=1):
        if presence_id in processed_ids:
            print(f"Skipping already processed presence ID: {presence_id}")
            continue

        print(f"\nProcessing presence ID: {presence_id}")
        full_url = f"{base_url}{presence_id}"

        failed_attempts = 0
        while True:
            try:
                # Using iMapSession to access the API; the timeout prevents an endless hang.
                imap_record = iMapSession.get(full_url, timeout=60)

                if imap_record.status_code == 200:
                    imap_dictionary = imap_record.json()
                    data_rows = process_species_images(imap_dictionary, base_url)
                    for row in data_rows:
                        row[0] = idx  # Insert S.L. as the first column
                        df_existing.loc[len(df_existing)] = row
                    # Save progress after each record
                    df_existing.to_excel(output_file, index=False)
                    processed_ids.add(presence_id)
                else:
                    print(f"Failed to retrieve data for presence ID {presence_id}. Status code: {imap_record.status_code}")
                break
            except requests.exceptions.RequestException as e:
                # Bounded retry: a persistent failure must not loop forever.
                failed_attempts += 1
                if failed_attempts > max_retries:
                    print(f"Giving up on presence ID {presence_id} after {max_retries} retries: {e}")
                    break
                print(f"Error processing presence ID {presence_id}: {e}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            except Exception as e:
                print(f"Unexpected error: {e}. Saving progress and exiting.")
                df_existing.to_excel(output_file, index=False)
                return

# Run the batch for the IDs loaded above; replace presence_ids with your own list of IDs if needed.
# Re-running this cell skips IDs that are already present in the output file.
process_multiple_species_ids(presence_ids)


Processing presence ID: 1332475

Processing presence ID: 1332476

Processing presence ID: 1393477

Processing presence ID: 1338545

Processing presence ID: 1248259

Processing presence ID: 1340080

Processing presence ID: 1352795
Skipping already processed presence ID: 1352795

Processing presence ID: 1019292

Processing presence ID: 1183051
Skipping already processed presence ID: 1183051

Processing presence ID: 1045374

Processing presence ID: 1323985

Processing presence ID: 1324561
Skipping already processed presence ID: 1332475

Processing presence ID: 1342737

Processing presence ID: 1344279

Processing presence ID: 1355565

Processing presence ID: 1346222

Processing presence ID: 1348527

Processing presence ID: 1351159

Processing presence ID: 1351160

Processing presence ID: 1351161

Processing presence ID: 1351361

Processing presence ID: 1352774

Processing presence ID: 1355297

Processing presence ID: 1355661

Processing presence ID: 1356143

Processing presence ID: 139191

### Apply threshold values to the output file from iNaturalist

### The invasive species Tier can be accessed through the url https://www.nynhp.org/invasives/species-tiers-table/

In [136]:
# conda install lxml # Run this command if it creates a problem opening the table

# URL of the table
url = "https://www.nynhp.org/invasives/species-tiers-table/"

# Read all tables on the page
tables = pd.read_html(url)

# Check how many tables were found
print(f"Total tables found: {len(tables)}")

# View the first table (adjust index if needed)
df2 = tables[0]

# Replace MultiIndex with only the lower-level column names.
# Guarded so the cell still works if the page is ever served with a single header row.
if isinstance(df2.columns, pd.MultiIndex):
    df2.columns = df2.columns.get_level_values(-1)

# Rename 'NYS' column to 'NYS_Tier' (returns a new frame, so re-running this cell is safe)
df2 = df2.rename(columns={'NYS': 'NYS_Tier'})

# Fail here with a clear message rather than later inside the merge
missing_columns = {'Scientific Name', 'NYS_Tier'} - set(df2.columns)
if missing_columns:
    raise KeyError(f"Species tier table is missing expected column(s): {sorted(missing_columns)}")

df2.head()

Total tables found: 1


Unnamed: 0,Common Name,Scientific Name,Type,Ecological,Socio-Economic,NYS Part 575,NYS_Tier,APIPP,Capital Region,CRISP,Finger Lakes,Lower Hudson,LIISMA,SLELO,WNY
0,African clawed frog,Xenopus laevis,AA,Moderate,Insignificant Positive,Regulated,,,,,,M,,,
1,African elodea,Lagarosiphon major,AP,High,Low Negative,,1b,,,,,1c,,,
2,Africanized honey bee,"Apis mellifera scutellata x A. m. ligustica, A...",TA,Not assessed,Not assessed,Prohibited,,,,,,1,,,
3,Agrilus sp. 9895,Agrilus sp. 9895,TA,Not assessed,Not assessed,,M,,,,,,,,
4,Alewife,Alosa pseudoharengus,AA,Moderate,Insignificant Negative,,,,2.0,,4.0,,,,4.0


### Add threshold and mean Geo Score values

In [139]:
# Per-species thresholds for com_score, keyed by scientific name.
# Both dictionaries are looked up with the record's imap_sci value, so every key
# must be a scientific name. Japanese knotweed is therefore keyed as
# "Reynoutria japonica", matching Geo_score_mean; under its common name the
# 76.73 threshold was never applied.
threshold_values = {
    "Litylenchus crenatae mccannii": 86.69,
    "Rhamnus cathartica": 81.5,
    "Alliaria petiolata": 88.91,
    "Adelges tsugae": 82.85,
    "Reynoutria japonica": 76.73,
    "Berberis thunbergii": 90.06,
    "Rosa multiflora": 74.05,
    "Celastrus orbiculatus": 65.45,
    "Lythrum salicaria": 89.08,
    "Lycorma delicatula": 92.79,
    "Ailanthus altissima": 87.09
}
# Threshold applied to any species not listed above
default_threshold = 77.62

# Per-species mean geo_score, keyed by scientific name
Geo_score_mean = {
    "Litylenchus crenatae mccannii": 21.54,
    "Rhamnus cathartica": 31.47,
    "Alliaria petiolata": 39.89,
    "Adelges tsugae": 21.26,
    "Reynoutria japonica": 31.94,
    "Berberis thunbergii": 36.55,
    "Rosa multiflora": 41.31,
    "Celastrus orbiculatus": 33.65,
    "Lythrum salicaria": 38.21,
    "Lycorma delicatula": 72.02,
    "Ailanthus altissima": 32.68
}
# Mean geo_score applied to any species not listed above
default_geo_score = 36.5

### Generate the final output by applying threshold combined score and mean geo score as well as the NYS invasive species Tier

In [143]:
# Workbook / load_workbook are not imported by any earlier cell, so import them
# here; otherwise this cell raises NameError on a fresh kernel.
from openpyxl import Workbook, load_workbook

# Define file paths
input_file = "Unconfirmed_100_mixed_species.xlsx"         # Replace with your desired input file here
output_file = "Unconfirmed_100_mixes_reports_final.xlsx"  # Rename the output file to save to your location

# Load the file twice: openpyxl (data_only=False) keeps the raw cell contents,
# including the =HYPERLINK formulas, while pandas is used for the calculations.
wb = load_workbook(input_file, data_only=False)
ws = wb.active
df = pd.read_excel(input_file, engine="openpyxl")

# Convert columns to numeric where necessary ("Unknown" becomes NaN)
df["com_score"] = pd.to_numeric(df["com_score"], errors="coerce")
df["geo_score"] = pd.to_numeric(df["geo_score"], errors="coerce")

# Merge df with df2 to get the NYS_Tier info.
# Keep one tier row per scientific name (the first one): a duplicated name would
# multiply rows in the merge and shift every later row against the worksheet.
tier_lookup = df2[['Scientific Name', 'NYS_Tier']].drop_duplicates(subset='Scientific Name')
df = df.merge(tier_lookup, how='left', left_on='imap_sci', right_on='Scientific Name')

# Assign thresholds
df["Species_selected_threshold"] = df["imap_sci"].apply(lambda x: threshold_values.get(x, default_threshold))
df["Geo_score_mean"] = df["imap_sci"].apply(lambda x: Geo_score_mean.get(x, default_geo_score))

# Check com_score status (applies to everyone); a NaN score compares False and is reported as "Below"
above_below = {True: "Above", False: "Below"}
df["Com_score_above_or_below"] = (df["com_score"] >= df["Species_selected_threshold"]).map(above_below)

# Check geo_score status
df["Geo_score_above_or_below"] = (df["geo_score"] >= df["Geo_score_mean"]).map(above_below)

# Apply exception rule based on 'species_label'
df["com_status"] = df["species_label"].eq(1).map({True: "Match", False: "Unmatch"})

# Default value for Recommended_status
df["Recommended_status"] = "Manual review"

# Clean up NYS_Tier values (no conversion to numeric because there are non-numeric tiers like 1a, 1b, etc.).
# Missing tiers become an empty string instead of the literal text "nan".
df["NYS_Tier"] = df["NYS_Tier"].fillna("").astype(str).str.strip()

# Define the tiers where geo_score applies
tiers_with_geo_score = ['1a', '1b', '1c', '1', '2', '3']

# Condition for Tiers 1a, 1b, 1c, 1, 2, 3 -> apply com_score + geo_score + match
tier_1_to_3_condition = (
    (df["NYS_Tier"].isin(tiers_with_geo_score)) &
    (df["com_score"] >= df["Species_selected_threshold"]) &
    (df["geo_score"] >= df["Geo_score_mean"]) &
    (df["com_status"] == "Match")
)

# Condition for Tier 4 and blanks -> apply com_score + match only
tier_4_or_blank_condition = (
    (~df["NYS_Tier"].isin(tiers_with_geo_score)) &
    (df["com_score"] >= df["Species_selected_threshold"]) &
    (df["com_status"] == "Match")
)

# Apply conditions
df.loc[tier_1_to_3_condition | tier_4_or_blank_condition, "Recommended_status"] = "Automatically confirmed"

# Drop 'Scientific Name' and any other unnecessary columns before export
dropped_columns = ["Scientific Name", "visual_model"]
df = df.drop(columns=dropped_columns, errors='ignore')

# Columns whose values are taken from the DataFrame; every other column is
# copied from the worksheet so formulas and raw values (e.g. "Unknown") survive.
computed_columns = [
    "Species_selected_threshold", "Com_score_above_or_below", "Geo_score_mean",
    "Geo_score_above_or_below", "Recommended_status", "com_status", "NYS_Tier"
]

# Create a new workbook for writing
wb_new = Workbook()
ws_new = wb_new.active
ws_new.title = "Processed Data"

# Copy headers (Include both original and new columns)
headers = list(df.columns)
ws_new.append(headers)

# Header names of the source worksheet, in column order
source_columns = [cell.value for cell in ws[1]]
source_rows = list(ws.iter_rows(min_row=2, values_only=False))

# Worksheet rows and DataFrame rows are paired by position, so they must line up.
if len(source_rows) != len(df):
    raise ValueError(
        f"Row count mismatch: worksheet has {len(source_rows)} data rows but the DataFrame has {len(df)}."
    )

# Loop through rows and copy data while preserving hyperlinks
for df_idx, row in enumerate(source_rows):
    row_data = {}
    row_links = {}

    for column_name, cell in zip(source_columns, row):
        # Skip columns we dropped earlier
        if column_name in dropped_columns:
            continue

        row_data[column_name] = cell.value
        if cell.hyperlink:
            # Real cell hyperlinks (as opposed to =HYPERLINK formulas) are re-attached after the row is written
            row_links[column_name] = cell.hyperlink.target

    # Add the data from Pandas DataFrame
    for col_name in computed_columns:
        row_data[col_name] = df.at[df_idx, col_name]

    # Ensure the new row is in the correct order (excluding dropped columns)
    ws_new.append([row_data.get(col, None) for col in headers])

    # Row 1 is the header, so DataFrame row k was just written to sheet row k + 2.
    out_row = df_idx + 2
    for column_name, target in row_links.items():
        if column_name in headers:
            link_cell = ws_new.cell(row=out_row, column=headers.index(column_name) + 1)
            link_cell.hyperlink = target
            link_cell.style = "Hyperlink"

# Save the final workbook
wb_new.save(output_file)

print(f"Processing complete. File saved at: {output_file} with hyperlinks preserved and tier-specific geo_score applied.")

Processing complete. File saved at: Unconfirmed_100_mixes_reports_final.xlsx with hyperlinks preserved and tier-specific geo_score applied.
