In [2]:
import os
#cd to project folder
#guarded on the config file read later in this notebook, so that re-running this
#cell does not climb one more directory up and break every relative path
if not os.path.exists('config/config.yaml'):
    os.chdir('..')

import pandas as pd
from pathlib import Path
from Bio import Phylo
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from src.utils import read_config, generate_query, run_query


In [27]:

#setup logging
log_file = f"logs/nps_in_genera_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
#FileHandler cannot open the log file if the logs/ folder does not exist yet
Path(log_file).parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",  # Custom date format without milliseconds
    handlers=[
        logging.FileHandler(log_file, mode='w'),  #logs to file
        logging.StreamHandler()])  #logs also to console

#load config
logging.info("Reading config file...")
config = read_config(config_path="config/config.yaml")

#import Angiosperms tree
logging.info("Importing Angiosperms phylogenetic tree...")
tree_file_path = Path(config["input_files"]["phylogenetic_tree"])
if not tree_file_path.exists():
    #stop here with a clear error: everything below needs `tree`, and carrying on
    #would only fail later with an unrelated NameError
    logging.error(f"No tree file found at {tree_file_path}.")
    raise FileNotFoundError(f"No tree file found at {tree_file_path}.")
tree = Phylo.read(tree_file_path, "newick")
logging.info("Angiosperms phylogenetic tree successfully imported!")


#extract leaf names
logging.info("Parsing phylogenetic tree...")
tree_leaves = [leaf.name for leaf in tree.get_terminals()]
tree_leaves = pd.Series(tree_leaves, name='leaf_name')
logging.info(f'Tree contains {len(tree_leaves)} leaves')
logging.info(f"{tree_leaves.str.endswith('sp.').sum()} species names are not defined (e.g., 'Lessertia_sp.')")

#create df with order, family, genus, species
#leaf names are split on '_' and the first four fields are kept
logging.info('Converting tree into pandas dataframe...')
tree_df = tree_leaves.str.split('_', expand=True)
tree_df = tree_df.iloc[:, :4] #keep first 4 columns
tree_df.columns = ['Order', 'Family', 'Genus', 'Species'] #rename columns
tree_df = pd.concat([tree_leaves, tree_df], axis=1).rename(columns={0: 'leaf_name'}).set_index('leaf_name')

#list of genera in the tree to be queried in Wikidata
logging.info('Extracting list of genera...')
genera_list = tree_df['Genus'].unique()
####
#TESTING ONLY: keep a 10-genus slice (positions 10 to 19) instead of the full list
#remove this line to query every genus in the tree
genera_list = genera_list[10:20]
####

2024-11-18 12:32:52 - INFO: Reading config file...
2024-11-18 12:32:52 - INFO: Importing Angiosperms phylogenetic tree...
2024-11-18 12:32:52 - INFO: Angiosperms phylogenetic tree successfully imported!
2024-11-18 12:32:52 - INFO: Parsing phylogenetic tree...
2024-11-18 12:32:52 - INFO: Tree contains 7922 leaves
2024-11-18 12:32:52 - INFO: 293 species names are not defined (e.g., 'Lessertia_sp.')
2024-11-18 12:32:52 - INFO: Converting tree into pandas dataframe...
2024-11-18 12:32:52 - INFO: Extracting list of genera...


In [28]:
#sanity check: number of genera that will be queried
print(len(genera_list))

10


In [None]:
def process_genera_parallel(genera_list, threads=20, max_attempts=5):
    """Run `run_query` for every genus in a thread pool and merge the outputs.

    Parameters
    ----------
    genera_list : sized iterable
        Genus names; each one is passed to `run_query` as its first argument.
    threads : int, default 20
        Maximum number of worker threads.
    max_attempts : int, default 5
        Forwarded to `run_query` as its second positional argument.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The row-wise concatenation of every DataFrame returned by `run_query`
        (an empty DataFrame when nothing was returned), and a dict mapping each
        genus whose query raised an exception to the exception message.
    """
    result_frames = []
    failed_tasks = {}

    with ThreadPoolExecutor(max_workers=threads) as executor:
        tasks = {executor.submit(run_query, genus, max_attempts): genus for genus in genera_list}

        for completed_task in as_completed(tasks):
            genus_name = tasks[completed_task]
            # Only the query itself is guarded, so that a problem while handling
            # its output is never misreported as a failed query.
            try:
                output = completed_task.result()
            except Exception as e:
                failed_tasks[genus_name] = str(e)
                continue

            # A None output means "nothing found", not a failure; it is skipped
            # because pd.concat raises when every object passed to it is None.
            if output is None:
                logging.info(f"Query for '{genus_name}' returned no results")
                continue
            if output.empty:
                logging.info(f"Query for '{genus_name}' returned no results")
            # Empty frames are kept so that their columns survive the concat.
            result_frames.append(output)

    logging.info(f"Processing completed: {len(genera_list)} queries run.")
    logging.info(f"{len(genera_list) - len(failed_tasks)} queries succeeded.")
    if failed_tasks:
        logging.warning(f"{len(failed_tasks)} queries failed:")
        for genus, reason in failed_tasks.items():
            logging.error(f"'{genus}': {reason}")

    merged = pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
    return merged, failed_tasks

#run SPARQL queries (parallelized)
#results_df holds the concatenated query outputs; failed_tasks maps each genus
#whose query raised an exception to the error message
results_df, failed_tasks = process_genera_parallel(genera_list, threads=20)

2024-11-18 12:32:56 - INFO: Query for 'Trichoneura' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Lophacme' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Dignathia' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Hubbardochloa' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Coleanthus' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Tetrachaete' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Bewsia' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Perotis' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Query for 'Mosdenia' genus completed in 1 attempts.


Query for 'Trichoneura' returned no output.
Query for 'Lophacme' returned no output.
Query for 'Dignathia' returned no output.
Query for 'Hubbardochloa' returned no output.
Query for 'Coleanthus' returned no output.
Query for 'Tetrachaete' returned no output.
Query for 'Bewsia' returned no output.
Query for 'Perotis' returned no output.
Query for 'Mosdenia' returned no output.


2024-11-18 12:32:56 - INFO: Query for 'Lepturidium' genus completed in 1 attempts.
2024-11-18 12:32:56 - INFO: Processing completed: 10 queries run.
2024-11-18 12:32:56 - INFO: 10 queries succeeded.


Query for 'Lepturidium' returned no output.


In [14]:
#run a single query on Afroligusticum to inspect the raw output of run_query
#(run_query is already imported in the first cell, so it is not re-imported here)
genus = 'Afroligusticum'
results = run_query(genus)
results

2024-11-18 12:30:18 - INFO: Query for 'Afroligusticum' genus completed in 1 attempts.


In [None]:
#is the single-genus query result empty?
#also True when the query returned nothing at all (None)
results is None or results.empty



True