#Creating the species list for query

In [4]:
import re
import pandas as pd

# Load the Newick tree file
tre_file_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\global_tree_brlen_pruned_renamed.tre'

# Read the Newick tree as plain text
with open(tre_file_path, 'r') as file:
    tree_data = file.read()

# Define a regular expression to capture Order, Family, Genus, and Species
pattern = r"([A-Za-z]+)_([A-Za-z]+)_([A-Za-z]+)_([A-Za-z]+)"

# Find all matches of the taxonomic strings in the Newick tree
matches = re.findall(pattern, tree_data)

# Convert the matches into a DataFrame with separate columns for each taxonomic rank
df_taxonomy = pd.DataFrame(matches, columns=["Order", "Family", "Genus", "Species"])

# Save the resulting DataFrame as a CSV file
output_csv_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Extracted_species_tree.csv'
df_taxonomy.to_csv(output_csv_path, index=False)

print(f"Taxonomic ranks extracted and saved to: {output_csv_path}")


Taxonomic ranks extracted and saved to: C:\Users\HebraT\Desktop\08_Data_Of_Plants\Extracted_species_tree.csv


#Retrieving ALL plants compounds

In [14]:
import requests
import pandas as pd
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# SPARQL query template
sparql_query_template = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX p: <http://www.wikidata.org/prop/>
PREFIX ps: <http://www.wikidata.org/prop/statement/>
PREFIX pr: <http://www.wikidata.org/prop/reference/>
PREFIX prov: <http://www.w3.org/ns/prov#>

#title: Available referenced structure-organism pairs on Wikidata for a specified genus (using genus name)
SELECT DISTINCT ?structure ?structure_inchikey ?structure_smiles ?taxon ?taxon_name ?reference ?reference_doi WHERE {{
    # Find the genus QID by its scientific name
    ?genus wdt:P225 "{genus_name}".

    # Query for all descendant taxa (species) within the genus
    ?taxon wdt:P171* ?genus.                      # Include all descendant taxa (species)
    ?structure wdt:P235 ?structure_inchikey;      # Get the InChIKey of the structure
               wdt:P233 ?structure_smiles;        # Get the SMILES of the structure
               p:P703 [                           # Statement found in taxon
               ps:P703 ?taxon;                    # Link to taxon
               (prov:wasDerivedFrom/pr:P248) ?reference ].  # Get the reference
    ?taxon wdt:P225 ?taxon_name.                  # Get the taxon scientific name
    ?reference wdt:P356 ?reference_doi.           # Get the reference DOI
}}

"""

def run_sparql_query(genus_name, max_retries=50, backoff_factor=1):
    endpoint_url = "https://query.wikidata.org/sparql"
    query = sparql_query_template.format(genus_name=genus_name)
    
    retries = 0
    wait_time = 1  # Start with 1 second wait time

    while retries < max_retries:
        response = requests.get(endpoint_url, params={'query': query, 'format': 'json'})
        
        if response.status_code == 200:
            data = response.json()
            return data["results"]["bindings"]
        elif response.status_code == 429:
            # Handle rate limiting (429 Too Many Requests)
            print(f"Rate limit exceeded for genus {genus_name}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            retries += 1
            wait_time *= backoff_factor  # Exponential backoff
        else:
            print(f"Error querying genus {genus_name}: {response.status_code}")
            break
    
    print(f"Failed to retrieve data for genus {genus_name} after {max_retries} retries.")
    return None

def save_results_to_csv(genus_name, results, output_folder):
    # Create a DataFrame to store the results
    rows = []
    for result in results:
        row = {
            "structure": result.get("structure", {}).get("value", ""),
            "structure_inchikey": result.get("structure_inchikey", {}).get("value", ""),
            "structure_smiles": result.get("structure_smiles", {}).get("value", ""),
            "taxon": result.get("taxon", {}).get("value", ""),
            "taxon_name": result.get("taxon_name", {}).get("value", ""),
            "reference": result.get("reference", {}).get("value", ""),
            "reference_doi": result.get("reference_doi", {}).get("value", "")
        }
        rows.append(row)
    
    # Create a DataFrame from the rows
    df = pd.DataFrame(rows)
    
    # Save the DataFrame to a CSV file named after the genus
    file_path = os.path.join(output_folder, f"{genus_name}.csv")
    df.to_csv(file_path, index=False)
    print(f"Saved results for genus {genus_name} to {file_path}")

def process_genus(genus_name, output_folder):
    print(f"Processing genus {genus_name}...")
    results = run_sparql_query(genus_name)
    if results:
        save_results_to_csv(genus_name, results, output_folder)
    else:
        print(f"No results for genus {genus_name}")

def process_genus_csv_multithreaded(input_csv, output_folder, max_workers=5):
    # Read the genus names from the CSV file
    df = pd.read_csv(input_csv)
    if 'Genus' not in df.columns:
        print("Error: 'genus' column not found in the CSV file.")
        return
    
    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    # Process each genus using multithreading
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for idx, row in df.iterrows():
            genus_name = row['Genus']
            futures.append(executor.submit(process_genus, genus_name, output_folder))
        
        # Track progress
        for future in as_completed(futures):
            future.result()  # Ensures exceptions are raised if any

# Usage example:
process_genus_csv_multithreaded(r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Extracted_species_tree.csv', r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants', max_workers=2)



Processing genus Neostapfia...
Processing genus Orcuttia...
No results for genus OrcuttiaNo results for genus Neostapfia
Processing genus Orinus...

Processing genus Tridentopsis...
No results for genus Orinus
Processing genus Gouinia...
No results for genus Tridentopsis
Processing genus Sphenopholis...
No results for genus Gouinia
Processing genus Triplasiella...
No results for genus Sphenopholis
Processing genus Vaseyochloa...
No results for genus Triplasiella
Processing genus Craspedorhachis...
No results for genus Vaseyochloa
Processing genus Farrago...
No results for genus Farrago
Processing genus Perotis...
No results for genus Craspedorhachis
Processing genus Mosdenia...
Rate limit exceeded for genus Perotis. Retrying in 1 seconds...
Rate limit exceeded for genus Mosdenia. Retrying in 1 seconds...
Rate limit exceeded for genus Mosdenia. Retrying in 1 seconds...
No results for genus Perotis
Processing genus Trichoneura...
Rate limit exceeded for genus Trichoneura. Retrying in 1 s

  return compile(source, filename, mode, flags,


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\HebraT\\Desktop\\08_Data_Of_Plants\\Isolated_Cpd_Plants\\Abeliophyllum.csv'

Cleaning non plant metabolites

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Draw
from IPython.display import display, HTML
from io import BytesIO
import base64

# Function to convert molecule to an image and then to base64
def mol_to_img(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid SMILES"
        img = Draw.MolToImage(mol)
        
        # Convert the image to a PNG and then to base64
        buffered = BytesIO()
        img.save(buffered, format="PNG")
        img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
        
        # Return HTML image tag with base64-encoded PNG
        return f'<img src="data:image/png;base64,{img_str}" />'
    except:
        return "Error generating image"

# Load your CSV data
csv_file_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants\Euphorbia.csv'  # Update with your local file path
df = pd.read_csv(csv_file_path)

# Apply the image generation to the DataFrame
df['Molecule Image'] = df['structure_smiles'].apply(mol_to_img)

# Display the DataFrame as an HTML table
def display_table(df):
    return HTML(df.to_html(escape=False))

# Display the table in your notebook
display(display_table(df[['taxon_name', 'Molecule Image']]))


In [15]:
import os
import pandas as pd
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

# Folder path containing the CSV files
folder_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants'

# File to save non-plant entries
non_plant_file = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\non_plants3.csv'

# Cache to store results of taxon queries
taxon_cache = {}

# GBIF API base URL for name backbone searches
GBIF_API_URL = "https://api.gbif.org/v1/species/match?name="

# Function to check if a taxon is a plant using requests
def fetch_taxon(taxon_name):
    if taxon_name in taxon_cache:
        return taxon_name, taxon_cache[taxon_name]
    
    try:
        response = requests.get(GBIF_API_URL + taxon_name)
        if response.status_code == 200:
            result = response.json()
            is_plant_taxon = 'kingdom' in result and result['kingdom'].lower() == 'plantae'
            taxon_cache[taxon_name] = is_plant_taxon
            return taxon_name, is_plant_taxon
    except requests.RequestException:
        pass
    
    # If the request fails, mark it as not a plant
    taxon_cache[taxon_name] = False
    return taxon_name, False

# Function to query taxon names in parallel using threading
def query_taxon_names_parallel(taxon_name_list, batch_size=100):
    all_results = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        # Process taxon names in batches
        taxon_batches = [taxon_name_list[i:i + batch_size] for i in range(0, len(taxon_name_list), batch_size)]
        
        for batch in tqdm(taxon_batches, desc="Querying taxon names in batches"):
            results = list(executor.map(fetch_taxon, batch))
            all_results.extend(results)
    
    # Store results in cache
    for taxon_name, is_plant in all_results:
        taxon_cache[taxon_name] = is_plant

# Step 1: Collect all unique taxon names from all files
def collect_all_taxon_names():
    all_taxon_names = set()
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    for file in tqdm(csv_files, desc="Collecting taxon names"):
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        all_taxon_names.update(df['taxon_name'].unique())
    
    return list(all_taxon_names)

# Step 3: Update each file by removing non-plant entries
def update_files():
    all_non_plants = []
    csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
    
    for file in tqdm(csv_files, desc="Updating files"):
        file_path = os.path.join(folder_path, file)
        df = pd.read_csv(file_path)
        
        # Apply the cached results
        df['is_plant'] = df['taxon_name'].map(taxon_cache)
        
        # Collect non-plant entries
        non_plants = df[~df['is_plant']]
        for _, row in non_plants.iterrows():
            all_non_plants.append({'Original_File': file, 'Taxon': row['taxon_name']})
        
        # Remove non-plant entries from the original file
        df_plants_only = df[df['is_plant']]
        df_plants_only.drop(columns=['is_plant'], inplace=True)
        df_plants_only.to_csv(file_path, index=False)
    
    # Save all non-plant entries to a new CSV file
    non_plant_df = pd.DataFrame(all_non_plants)
    non_plant_df.to_csv(non_plant_file, index=False)

    print(f"Processing completed. Non-plant entries saved to {non_plant_file}")

if __name__ == '__main__':
    # Step 1: Collect all unique taxon names
    taxon_name_list = collect_all_taxon_names()

    # Step 2: Query all taxon names using parallel processing
    query_taxon_names_parallel(taxon_name_list)

    # Step 3: Update all files and remove non-plant entries
    update_files()

Collecting taxon names: 100%|██████████| 3456/3456 [00:09<00:00, 353.25it/s]
Querying taxon names in batches: 100%|██████████| 233/233 [07:56<00:00,  2.05s/it]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plants_only.drop(columns=['is_plant'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plants_only.drop(columns=['is_plant'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_plants_only.drop(columns=['is_plant'], inplace=True)
A value is trying to be set on a copy of a slice

Processing completed. Non-plant entries saved to C:\Users\HebraT\Desktop\08_Data_Of_Plants\non_plants3.csv





In [16]:
import os
import shutil
import pandas as pd

# Load the CSV file
file_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\non_plants3.csv'
data = pd.read_csv(file_path)

# Directories to search and copy files
source_folder = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\04_Isolated_Cpd_Plants'
destination_folder = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants'

# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Filter rows where the 'Plant' column is marked as 'Plant'
plant_rows = data[data['Plant'] == 'Plant']

# Loop through the filtered rows
for index, row in plant_rows.iterrows():
    original_file = row['Original_File']
    
    # Construct the full file path in the source directory
    source_file = os.path.join(source_folder, original_file)
    
    # Check if the file exists in the source folder
    if os.path.isfile(source_file):
        # Construct the destination file path
        destination_file = os.path.join(destination_folder, original_file)
        
        # Copy the file to the destination folder
        shutil.copy(source_file, destination_file)
        print(f"Copied {original_file} to {destination_folder}")
    else:
        print(f"File {original_file} not found in {source_folder}")


Copied Abrus.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Acanthus.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Acourtia.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Actaea.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Adenophora.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Aglaia.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Aglaia.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Aglaia.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Alpinia.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Amaranthus.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Amorpha.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants
Copied Angelica.csv to C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_

In [23]:
import os
import pandas as pd

# Specify the folder path where your .csv files are located
folder_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Isolated_Cpd_Plants'

# Load the Extracted_species_tree.csv file
extracted_species_tree_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\Extracted_species_tree.csv'
extracted_species_tree = pd.read_csv(extracted_species_tree_path)

# Ensure the Genus column is in lowercase for case-insensitive matching
extracted_species_tree['Genus'] = extracted_species_tree['Genus'].str.lower()

# Create a new column in extracted_species_tree to store the reported count
extracted_species_tree['Reported_Cpd'] = 0

# Iterate over each .csv file in the folder
for file_name in os.listdir(folder_path):
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)
        
        # Count the number of lines excluding the header
        with open(file_path, 'r') as f:
            line_count = sum(1 for line in f) - 1  # Subtract 1 to exclude the header
        
        # Get the file name without extension
        genus_name = os.path.splitext(file_name)[0].lower()

        # Check if the file name (as genus) matches any in the Genus column
        if genus_name in extracted_species_tree['Genus'].values:
            # Update the Reported_Cpd column with the line count for the matching genus
            extracted_species_tree.loc[extracted_species_tree['Genus'] == genus_name, 'Reported_Cpd'] = line_count

# Save the updated extracted_species_tree.csv file
updated_csv_path = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\00_Species_extraction\Extracted_species_tree_isolated_wikidata.csv'
extracted_species_tree.to_csv(updated_csv_path, index=False)

print("Updated Extracted_species_tree.csv saved successfully!")


Updated Extracted_species_tree.csv saved successfully!


In [24]:
import csv

# Path to the CSV file
csv_file = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\00_Species_extraction\Extracted_species_tree_isolated_wikidata.csv'

# Set to store all unique IDs
all_ids = set()

# Read the CSV file and extract the 'Full_string' and 'Reported_Cpd' columns
with open(csv_file, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter=',')
    for row in reader:
        full_string = row.get('Full_string', '').strip()
        reported_cpd = row.get('Reported_Cpd', '').strip()
        
        # Ensure both fields are not empty
        if full_string != '' and reported_cpd != '':
            all_ids.add((full_string, reported_cpd))

# Path to output annotation file
output_file = r'C:\Users\HebraT\Desktop\08_Data_Of_Plants\00_Species_extraction\Isolated_cpd_annotation.txt'

# Write headers and data to the annotation file
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    f.write('DATASET_SIMPLEBAR\n')
    f.write('SEPARATOR COMMA\n')
    f.write('DATASET_LABEL,Isolated_cpds\n')
    f.write('COLOR,#000000\n')
    f.write('DATA\n')
    
    # Write each unique pair of Full_string and Reported_Cpd
    for full_string, reported_cpd in all_ids:
        f.write(f'{full_string},{reported_cpd}\n')

print("Annotation file has been successfully created.")


Annotation file has been successfully created.


In [None]:
import re
from collections import defaultdict

# File paths
input_file_path = 'path_to_your_tree_file.tre'
output_file_path = 'path_to_output_file.txt'

# Regular expression to capture strings in the format Order_Family_Genus_Specie
pattern = re.compile(r'([A-Za-z]+_[A-Za-z]+_[A-Za-z]+_[A-Za-z]+)')

# Dictionary to store the first and last occurrences of each order
order_occurrences = defaultdict(list)

# Read the tree file
with open(input_file_path, 'r') as file:
    tree_data = file.readlines()

# Search for the taxonomic strings and group them by order
for line in tree_data:
    matches = pattern.findall(line)
    for match in matches:
        order = match.split("_")[0]  # Extract the order part of the string
        order_occurrences[order].append(match)

# Prepare the output with alternating colors and range format
output_data = []
color1 = "#f4cccc"
color2 = "#cfe2f3"

for i, (order, taxa_list) in enumerate(order_occurrences.items()):
    first_occurrence = taxa_list[0]  # First occurrence of the order
    last_occurrence = taxa_list[-1]  # Last occurrence of the order
    color = color1 if i % 2 == 0 else color2  # Alternate between color1 and color2
    
    # Format the data with comma separators
    formatted_line = f"{first_occurrence},{last_occurrence},range,{color},{order}"
    output_data.append(formatted_line)

# Prepare the header and dataset information
header = [
    "TREE_COLORS",
    "SEPARATOR COMMA",
    "DATA"
]

# Combine header and data
final_output = "\n".join(header + output_data)

# Write the output to a file
with open(output_file_path, 'w') as output_file:
    output_file.write(final_output)

print(f"Output file written to {output_file_path}")


Replacing wrong ID buy correct ones

In [26]:
# Step 1: Read the incomplete-to-complete ID mappings
mapping_file = r"C:\Users\HebraT\Desktop\08_Data_Of_Plants\Problematic_ID.txt"
incomplete_to_complete = {}

with open(mapping_file, "r") as f:
    for line in f:
        incomplete_id, complete_id = line.strip().split(',')
        incomplete_to_complete[incomplete_id] = complete_id

# Step 2: Read the file containing strings with incomplete IDs
input_file = r"C:\Users\HebraT\Desktop\08_Data_Of_Plants\05_Tree_annotation\01_Isolated_cpd_annotation.txt"
output_file = r"C:\Users\HebraT\Desktop\08_Data_Of_Plants\05_Tree_annotation\01_Isolated_cpd_annotation.txt"

with open(input_file, "r") as f:
    content = f.readlines()

# Step 3: Replace incomplete IDs with complete IDs
updated_content = []
for line in content:
    for incomplete_id, complete_id in incomplete_to_complete.items():
        if incomplete_id in line:
            line = line.replace(incomplete_id, complete_id)
    updated_content.append(line)

# Step 4: Write the updated content to a new file (or overwrite the original file)
with open(output_file, "w") as f:
    f.writelines(updated_content)

print(f"Updated file saved as {output_file}")


Updated file saved as C:\Users\HebraT\Desktop\08_Data_Of_Plants\05_Tree_annotation\01_Isolated_cpd_annotation.txt


In [5]:
from pygbif import species

def is_plant(genus_name):
    # Query GBIF for genus information
    search_result = species.name_backbone(name=genus_name)
    
    if 'kingdom' in search_result and search_result['kingdom'].lower() == 'plantae':
        return True  # It's a plant genus
    return False  # Not a plant genus or no data found

# Example usage
genus = "Abroma augustum"
if is_plant(genus):
    print(f"{genus} is a plant genus.")
else:
    print(f"{genus} is not a plant genus.")


Abroma augustum is a plant genus.
