In [3]:
import os
from pathlib import Path
from dotenv import load_dotenv

cwd = Path.cwd()
# Load dotenv into sys PATH
dotenv_file = cwd / "../../example/.env"
assert dotenv_file.is_file()
load_dotenv(dotenv_file)

True

In [5]:
import pandas as pd
csv_file = r"../../example/gbif_example\0026013-240906103802322\0026013-240906103802322.csv"
coff_dl_df = pd.read_csv(csv_file, sep='\t')

In [6]:
df = coff_dl_df.loc[coff_dl_df["taxonRank"] == "SPECIES"]
df = df.loc[:, ["verbatimScientificName", "decimalLongitude", "decimalLatitude", "year"]]
df = df.reset_index().drop(columns="index")
df["specimen_id"] = df.apply(lambda row: f"{row['verbatimScientificName']}_{row.name}", axis=1)
df["specimen_id"] = df["specimen_id"].str.strip("Coffea ")
df.columns = ["genus_species", "longitude", "latitude", "year", "specimen_id"]
df.head()

Unnamed: 0,genus_species,longitude,latitude,year,specimen_id
0,Psilanthus brassii,0.0,0.0,1967.0,Psilanthus brassii_0
1,Psilanthus brassii,147.333333,-9.333333,1967.0,Psilanthus brassii_1
2,Psilanthus brassii,146.5,-8.783333,1933.0,Psilanthus brassii_2
3,Coffea arabica,0.0,0.0,1928.0,rabica_3
4,Coffea arabica Linnaeus,167.425,-22.658333,1981.0,rabica Linnaeus_4


In [7]:
# Last 6 years
df_2018_p = df.loc[df["year"] > 2018]
print(len(df_2018_p))

3927


In [8]:
import sys
import os
import re
import pandas as pd

# Assuming df_2018_p is already defined
df_gbif = df_2018_p[['specimen_id', 'longitude', 'latitude']].copy()  # Use .copy() to avoid warnings
df_node = pd.read_csv(r"..\input\node_names.csv")

# Function to extract part of the specimen_id for matching
def extract_name(specimen_id):
    # Use regex to extract pattern (e.g., everything before the underscore)
    return re.sub(r'_\d+', '', specimen_id)

# Apply extraction function to both DataFrames
df_gbif.loc[:, 'key'] = df_gbif['specimen_id'].apply(extract_name)  # Use .loc to avoid SettingWithCopyWarning
df_node.loc[:, 'key'] = df_node['Node Name'].apply(lambda x: re.sub(r'^C_|_[\dA-Za-z]+$', '', x))

# Create a dictionary for mapping key to Node Name
mapping = df_node.set_index('key')['Node Name'].to_dict()

# Map the Node Name into a new column in df_gbif
df_gbif.loc[:, 'Node Name'] = df_gbif['key'].map(mapping)

# Drop the key column (optional)
df_gbif.drop(columns='key', inplace=True)

# Drop rows where 'Node Name' is NaN
df_gbif.dropna(subset=['Node Name'], inplace=True)

# Select the final columns
df_new = df_gbif[['Node Name', 'longitude', 'latitude']].copy()

# Renaming columns
df_new.rename(columns={'Node Name': 'specimen_id'}, inplace=True)

# Save the DataFrame to a new CSV
base_name, extension = os.path.splitext(csv_file)
formatted_csv_file = base_name + '_formatted' + extension
df_new.to_csv(formatted_csv_file, index=False)

print(f"Data saved to {formatted_csv_file}")


Data saved to ../../example/gbif_example\0026013-240906103802322\0026013-240906103802322_formatted.csv


In [11]:
import pandas as pd

input_file = r'..\input\0026013-240906103802322_formatted.csv'
caffeine_file = r'..\input\no_caffeine_nodes_w_specimen.csv'

def add_caffeine(input_file, caffeine_file):
    
    gbif_df = pd.read_csv(input_file)
    node_names_df = pd.read_csv(caffeine_file)

    # Merge the two DataFrames based on 'specimen_id' in gbif_df and 'Species_name' in node_names_df
    merged_df = pd.merge(gbif_df, node_names_df[['Species_name', 'caffeine_percent']], 
                         left_on='specimen_id', right_on='Species_name', how='left')

    # Drop the 'Species_name' column as it's no longer needed
    merged_df = merged_df.drop(columns=['Species_name'])
    
    # Retain only the required columns
    merged_df = merged_df[['specimen_id', 'longitude', 'latitude', 'caffeine_percent']]
    
    # Drop rows where 'caffeine_percent' is NaN
    merged_df = merged_df.dropna(subset=['caffeine_percent'])

    # Create output filename by replacing 'formatted' with 'w_caffeine'
    output_file = input_file.replace('formatted', 'w_caffeine')

    # Save the merged DataFrame to a new CSV file
    merged_df.to_csv(output_file, index=False)

    # Print the output file name
    print(f"Data saved to: {output_file}")
    
    

add_caffeine(input_file, caffeine_file)



Data saved to: ..\input\0026013-240906103802322_w_caffeine.csv
