In [25]:

import pandas as pd
import re
from pathlib import Path
import os

In [26]:
SRC_File = Path('Data/Obs_df.csv')

# Fail fast with a readable message when the source CSV is absent.
if not SRC_File.exists():
    raise FileNotFoundError(f"The file {SRC_File} does not exist. Please check the path.")

df_whole = pd.read_csv(SRC_File)

In [27]:
import pandas as pd
import numpy as np

# Working copy of the source CSV (the same file was already read into
# `df_whole` above). low_memory=False tells pandas to parse the file in one
# pass instead of in chunks, so each column gets a single inferred dtype.
df = pd.read_csv(SRC_File, low_memory=False)

def coalesce(a, b):
    """Prefer a; if blank/NA use b.

    Both inputs are normalised first: values are cast to ``str``, surrounding
    whitespace is stripped, and empty strings / missing values become NaN.

    Parameters
    ----------
    a, b : pandas.Series or None
        Primary and fallback values. Either may be ``None`` (which is what
        ``DataFrame.get`` returns for a column that does not exist); the
        other side is then returned on its own after normalisation.

    Returns
    -------
    pandas.Series
        ``a`` with its blank/NA entries filled from ``b``.

    Raises
    ------
    ValueError
        If both ``a`` and ``b`` are ``None``.
    """
    def _blank_to_na(values):
        # NA -> "" first so astype(str) cannot produce the literal "nan".
        return values.fillna("").astype(str).str.strip().replace({"": np.nan})

    if a is None and b is None:
        raise ValueError(
            "coalesce() received None for both inputs; "
            "neither source column is available."
        )
    if a is None:
        return _blank_to_na(b)
    if b is None:
        return _blank_to_na(a)
    return _blank_to_na(a).combine_first(_blank_to_na(b))

def clean_text(s):
    """Strip surrounding whitespace and map placeholder strings to NaN.

    The values are cast to ``str`` and stripped; the exact strings ``''``,
    ``'nan'``, ``'None'`` and ``'NONE'`` are then replaced by NaN. Letter
    case of every other value is left untouched.
    """
    placeholders = {"": np.nan, "nan": np.nan, "None": np.nan, "NONE": np.nan}
    stripped = s.astype(str).str.strip()
    return stripped.replace(placeholders)

In [28]:
# One scientific name per side: take the *_scientific_name column and fall
# back to *_taxon_species_name wherever the former is blank or missing.
pred_sci = coalesce(
    a=df.get("predator_scientific_name"),
    b=df.get("predator_taxon_species_name"),
)
prey_sci = coalesce(
    a=df.get("prey_scientific_name"),
    b=df.get("prey_taxon_species_name"),
)

In [29]:
# Taxonomic ranks copied from the observation table. The list order fixes the
# column order of the taxa tables. The previous dict literals listed
# "taxon_species" twice; a repeated key keeps its first position and its last
# value, so a single entry here yields the same columns.
TAXON_RANKS = [
    "kingdom",
    "species",
    "phylum",
    "subphylum",
    "superclass",
    "class",
    "subclass",
    "superorder",
    "order",
    "suborder",
    "superfamily",
    "family",
    "subfamily",
    "supertribe",
    "tribe",
    "subtribe",
    "genus",
    "genushybrid",
    "hybrid",
    "subspecies",
    "variety",
    "form",
]


def build_side_taxa(frame, side, sci_names):
    """Assemble the candidate taxa rows for one side of the interaction.

    Parameters
    ----------
    frame : pandas.DataFrame
        Observation table holding ``<side>_common_name``,
        ``<side>_iconic_taxon_name`` and ``<side>_taxon_<rank>_name`` columns.
    side : str
        Column prefix, ``"predator"`` or ``"prey"``.
    sci_names : pandas.Series
        Scientific name to use for each row of ``frame``.

    Returns
    -------
    pandas.DataFrame
        ``scientific_name``, ``common_name``, ``iconic_taxon_name`` and one
        ``taxon_<rank>`` column per entry of ``TAXON_RANKS``. A source column
        that is absent from ``frame`` comes back from ``frame.get`` as None
        and therefore yields an all-missing column.
    """
    columns = {
        "scientific_name": sci_names,
        "common_name": frame.get(f"{side}_common_name"),
        "iconic_taxon_name": frame.get(f"{side}_iconic_taxon_name"),
    }
    for rank in TAXON_RANKS:
        columns[f"taxon_{rank}"] = frame.get(f"{side}_taxon_{rank}_name")
    return pd.DataFrame(columns)


# Candidate taxa rows from the predator side...
pred_taxa = build_side_taxa(df, "predator", pred_sci)

# ...and from the prey side.
prey_taxa = build_side_taxa(df, "prey", prey_sci)

In [30]:
# Stack the predator and prey rows into one candidate-taxa table and
# normalise the name column. Rows whose scientific_name is missing are
# deliberately kept at this point (no dropna here).
Species = (
    pd.concat([pred_taxa, prey_taxa], ignore_index=True)
    .assign(scientific_name=lambda taxa: clean_text(taxa["scientific_name"]))
)


In [31]:
# Non-null value count per column of the stacked predator + prey table.
Species.count()

scientific_name      7937
common_name          7251
iconic_taxon_name    7998
taxon_kingdom        7998
taxon_species        7806
taxon_phylum         7998
taxon_subphylum      7894
taxon_superclass      118
taxon_class          7993
taxon_subclass       3017
taxon_superorder     1210
taxon_order          7997
taxon_suborder       2781
taxon_superfamily    2477
taxon_family         7992
taxon_subfamily      4678
taxon_supertribe       26
taxon_tribe          2986
taxon_subtribe        826
taxon_genus          7981
taxon_genushybrid       0
taxon_hybrid           70
taxon_subspecies      617
taxon_variety          49
taxon_form              0
dtype: int64

In [32]:
# Missing-value count per column of the stacked predator + prey table.
Species.isna().sum()

scientific_name        61
common_name           747
iconic_taxon_name       0
taxon_kingdom           0
taxon_species         192
taxon_phylum            0
taxon_subphylum       104
taxon_superclass     7880
taxon_class             5
taxon_subclass       4981
taxon_superorder     6788
taxon_order             1
taxon_suborder       5217
taxon_superfamily    5521
taxon_family            6
taxon_subfamily      3320
taxon_supertribe     7972
taxon_tribe          5012
taxon_subtribe       7172
taxon_genus            17
taxon_genushybrid    7998
taxon_hybrid         7928
taxon_subspecies     7381
taxon_variety        7949
taxon_form           7998
dtype: int64

In [33]:

def most_common_nonnull(series):
    """Return the most frequent non-missing value of ``series``.

    Missing values are ignored. When several values tie, the first entry of
    ``Series.mode()`` is returned. If no non-missing value exists the result
    is NaN.
    """
    modes = series.dropna().mode()
    return modes.iloc[0] if len(modes) else np.nan


# Collapse the stacked table to one row per scientific name, keeping the most
# common non-null value of every other column.
# The aggregation spec is derived from the table's own columns, so no column
# can be listed twice (the hand-written dict repeated "taxon_species") and the
# output keeps the table's column order.
# groupby excludes rows whose key is NaN by default, so rows without a
# scientific_name do not appear in the result.
agg_columns = [col for col in Species.columns if col != "scientific_name"]

Species = (
    Species
    .sort_values("scientific_name")
    .groupby("scientific_name", as_index=False)
    .agg({col: most_common_nonnull for col in agg_columns})
)

Species.head(), Species.shape

(         scientific_name                 common_name iconic_taxon_name  \
 0      Abantis paradisea            Paradise Skipper           Insecta   
 1  Abisares viridipennis  Notched Shield Grasshopper           Insecta   
 2          Abramis brama                Common Bream    Actinopterygii   
 3       Abrus laevigatus                  Lucky Bean           Plantae   
 4        Acalitus mallyi       Mispel Leaf Gall Mite         Arachnida   
 
   taxon_kingdom          taxon_species  taxon_phylum taxon_subphylum  \
 0      Animalia      Abantis paradisea    Arthropoda        Hexapoda   
 1      Animalia  Abisares viridipennis    Arthropoda        Hexapoda   
 2      Animalia          Abramis brama      Chordata      Vertebrata   
 3       Plantae       Abrus laevigatus  Tracheophyta    Angiospermae   
 4      Animalia        Acalitus mallyi    Arthropoda     Chelicerata   
 
   taxon_superclass     taxon_class taxon_subclass  ... taxon_subfamily  \
 0              NaN         Insec

In [34]:
# Per-observation fields carried alongside the two endpoint names:
# output column -> source column in df.
rel_sources = {
    "observed_on": "observed_on",
    "time_zone": "time_zone",
    "image_url": "image_url",
    "sound_url": "sound_url",
    "positional_accuracy": "positional_accuracy",
    "place": "place_guess",
    "place_town": "place_town_name",
    "place_county": "place_county_name",
    "place_state": "place_state_name",
    "place_country": "place_country_name",
    "latitude": "latitude",
    "longitude": "longitude",
    "type_of_feeding": "special_type_of_feeding",
    "url": "url",
    "captive_cultivated": "captive_cultivated",
    "description": "description",
    "prey_agreements": "prey_agreements",
    "predator_agreements": "predator_agreements",
}

# One relationship row per observation: predator name, prey name, then the
# observation fields in the order listed above.
rels = pd.DataFrame({
    "predator_scientific_name": pred_sci,
    "prey_scientific_name": prey_sci,
    **{out_col: df.get(src_col) for out_col, src_col in rel_sources.items()},
})

In [35]:
# Normalise both endpoint name columns.
endpoint_cols = ["predator_scientific_name", "prey_scientific_name"]
for endpoint in endpoint_cols:
    rels[endpoint] = clean_text(rels[endpoint])

# Set aside the rows that lack a predator or a prey name, then keep only
# complete pairs in rels.
lacks_endpoint = rels[endpoint_cols].isna().any(axis=1)
missing_rels = rels[lacks_endpoint]
rels = rels.dropna(subset=endpoint_cols)

# Write the incomplete rows to Null_Partner.csv: append without a header when
# the file already exists, otherwise create it with a header.
# NOTE(review): because of the append, re-running this cell adds the same rows
# again - confirm that accumulating across runs is intended.
null_partner_path = "Data/Null_Partner.csv"
file_present = os.path.exists(null_partner_path)
missing_rels.to_csv(
    null_partner_path,
    mode="a" if file_present else "w",
    header=not file_present,
    index=False,
)

In [36]:
# Coordinates must be numeric; values that cannot be parsed become NaN.
for coord in ("latitude", "longitude"):
    rels[coord] = pd.to_numeric(rels[coord], errors="coerce")

In [37]:
# 1) Every relationship endpoint should have a row in the taxa table:
#    collect the names that do not.
known_names = set(Species["scientific_name"])
pred_missing = set(rels["predator_scientific_name"]).difference(known_names)
prey_missing = set(rels["prey_scientific_name"]).difference(known_names)

In [38]:

# Endpoints without a taxa row first, then 2) basic sanity counts of the
# two output tables.
sanity_report = [
    ("Predators missing in taxa:", len(pred_missing)),
    ("Prey missing in taxa:", len(prey_missing)),
    ("Taxa rows:", len(Species)),
    ("Relationships rows:", len(rels)),
]
for label, value in sanity_report:
    print(label, value)

Predators missing in taxa: 0
Prey missing in taxa: 0
Taxa rows: 1876
Relationships rows: 3938


In [39]:
# 3) After normalisation each scientific name should occur once in the taxa
#    table; keep only the names that occur more than once.
dup_names = Species["scientific_name"].value_counts().loc[lambda counts: counts > 1]
print("Duplicate scientific names:", len(dup_names))

Duplicate scientific names: 0


In [40]:
# Persist the two output tables without the pandas index column.
for table, target in ((Species, "Data/Species.csv"), (rels, "Data/interactions.csv")):
    table.to_csv(target, index=False)