In [1]:

import pandas as pd
import os
import re

csv_file = '../data/gbif_coffea_5years.csv'
nodes_file = '../data/node_names.csv'


df_gbif = pd.read_csv(csv_file, usecols=['specimen_id', 'longitude', 'latitude'])
df_node = pd.read_csv(nodes_file)

#print(df_node)

# Function to extract part of the specimen_id for matching
def extract_name(specimen_id):
    # Use regex to extract pattern (e.g., everything before the underscore)
    return re.sub(r'_\d+', '', specimen_id)

# Apply extraction function to both DataFrames
df_gbif['key'] = df_gbif['specimen_id'].apply(extract_name)
df_node['key'] = df_node['Node Name'].apply(lambda x: re.sub(r'^C_|_[\dA-Za-z]+$', '', x))

# Create a dictionary for mapping key to Node Name
mapping = df_node.set_index('key')['Node Name'].to_dict()

# Map the Node Name into a new column in df_gbif
df_gbif['Node Name'] = df_gbif['key'].map(mapping)



# Drop the key column (optional)
df_gbif.drop(columns='key', inplace=True)
df_gbif = df_gbif.dropna(subset=['Node Name'])

#print(df_gbif)


df_new = df_gbif[['Node Name','longitude', 'latitude']]

# Renaming columns
df_new = df_new.rename(columns={'Node Name': 'specimen_id'})

#print(df_new)
    
base_name, extension = os.path.splitext(csv_file)

formatted_csv_file = base_name + '_formatted' + extension

df_new.to_csv(formatted_csv_file, index=False)

print(f"Data saved to {formatted_csv_file}")

Data saved to ../data/gbif_coffea_5years_formatted.csv


In [None]:
from Bio import Phylo
from ete3 import Tree, TreeStyle, NodeStyle, TextFace
import pandas as pd
import matplotlib.pyplot as plt

nwk_file = "../tree/new_phylo_tree.nwk"
offsets_file = "../data/offsets.csv"

# Use BioPython to read the Newick file
try:
    phylo_tree = Phylo.read(nwk_file, "newick")
    print("Tree read successfully with BioPython.")
    # Convert the BioPython tree to Newick string
    newick_string = phylo_tree.format('newick')
except Exception as e:
    print(f"Failed to read tree with BioPython: {e}")
    raise

# Load the tree from the Newick string
try:
    tree = Tree(newick_string)
except Exception as e:
    print(f"Failed to load tree with ETE3: {e}")
    raise

# Load offsets from CSV using pandas
offsets_df = pd.read_csv(offsets_file)
offsets_dict = pd.Series(offsets_df.XOffset.values, index=offsets_df.NodeName).to_dict()

# Define the nodes you want to highlight
highlight_nodes = ['Tricalysia', 'C_booviniana_A980']

# Create a TreeStyle object for customizing the appearance
ts = TreeStyle()
ts.show_leaf_name = False  # Disable default leaf names to add custom ones

def highlight_node(node):
    """Customize node style based on whether it is in highlight_nodes"""
    nstyle = NodeStyle()
    if node.name in highlight_nodes:
        nstyle['fgcolor'] = 'red'  # Font color
        nstyle['size'] = 20  # Font size
        nstyle['font'] = 'Helvetica-Bold'  # Font weight
    else:
        nstyle['fgcolor'] = 'black'  # Font color
        nstyle['size'] = 12  # Font size
        nstyle['font'] = 'Helvetica'  # Font weight
    
    # Apply custom offsets if available
    x_offset = offsets_dict.get(node.name, 0) if node.name else 0
    # Add text face for the node label with custom styling and position
    node.add_face(TextFace(node.name, fsize=nstyle['size'], fgcolor=nstyle['fgcolor'], font=nstyle['font']), column=0, position="aligned", x_offset=x_offset)
    
    return nstyle

# Apply the style to each node
for node in tree.traverse():
    node.set_style(highlight_node(node))

# Render the tree
ts.layout_fn = lambda: None  # Disable automatic layout updates
tree.show(tree_style=ts, show_leaf_name=False)  # Draw tree with custom labels


Tree read successfully with BioPython.
Failed to load tree with ETE3: Unexpected newick format 'Inner19:0.00844' 
You may want to check other newick loading flags like 'format' or 'quoted_node_names'.


  """
  """Prunes the topology of a node to conserve only the selected list of leaf
  """
  _ILEGAL_NEWICK_CHARS = ":;(),\[\]\t\n\r="
  _NHX_RE = "\[&&NHX:[^\]]*\]"
  _FLOAT_RE = "\s*[+-]?\d+\.?\d*(?:[eE][-+]?\d+)?\s*"
  matcher_str= '^\s*%s\s*%s\s*(%s)?\s*$' % (FIRST_MATCH, SECOND_MATCH, _NHX_RE)


NewickError: Unexpected newick format 'Inner19:0.00844' 
You may want to check other newick loading flags like 'format' or 'quoted_node_names'.

In [4]:
import pandas as pd



                              specimen_id  longitude   latitude      Node Name
0                          mangoroensis_0  48.380066 -19.189873            NaN
1                 coursiana J.-F. Leroy_1  48.553056 -19.164722            NaN
2   perrieri Drake ex Jum. & H. Perrier_2  44.272222 -22.853611            NaN
3                             bissetiae_3  47.590008 -15.407416    C_bissetiae
4                             bissetiae_6  47.504282 -15.403732    C_bissetiae
..                                    ...        ...        ...            ...
69                             heimii_254  49.275700 -12.398991  C_heimii_A516
70                          buxifolia_255  46.557159 -20.616193            NaN
71                             rabica_256  47.526778 -18.930902            NaN
72                             heimii_257  49.549809 -12.580463  C_heimii_A516
73              richardii J.-F. Leroy_259  49.459444 -17.705556            NaN

[74 rows x 4 columns]
