In [1]:
import os
os.chdir('..')  #cd to project folder

import pandas as pd
from pathlib import Path
from Bio import Phylo
from src.utils import read_config, parse_tree

In [66]:
#import config
config = read_config(config_path="config/config.yaml")

#import tree file
tree_file_path = Path(config["input_files"]["phylogenetic_tree"])
tree = Phylo.read(tree_file_path, "newick")
tree_df = parse_tree(tree)

#import nps_in_genera output
file_path = Path(config["output_files"]["nps_in_genera"])
nps_in_genera = pd.read_csv(file_path)

In [67]:
#count unique nps in each genus
nps_in_genera_count = nps_in_genera.groupby('genus_name')['structure_inchikey'].nunique()
nps_in_genera_count.rename('Count', inplace=True)
tree_df = tree_df.merge(nps_in_genera_count,  how='left', left_on='Genus', right_index=True)

#summary
print(f"Tree has {tree_df['Count'].shape[0]} genera")
print(f"At least 1 natural product report in Wikidata for {tree_df['Count'].count()} genera")
print(f"No natural products reported in Wikidata for {tree_df['Count'].isna().sum()} genera")
tree_df.head(n=3)

Tree has 7922 genera
At least 1 natural product report in Wikidata for 3457 genera
No natural products reported in Wikidata for 4465 genera


Unnamed: 0_level_0,Order,Family,Genus,Species,Count
leaf_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Poales_Poaceae_Neostapfia_colusana,Poales,Poaceae,Neostapfia,colusana,
Poales_Poaceae_Orcuttia_tenuis,Poales,Poaceae,Orcuttia,tenuis,
Poales_Poaceae_Orinus_thoroldii,Poales,Poaceae,Orinus,thoroldii,


In [68]:
#export NPs in genera count
tree_df['Count'].fillna(0, inplace=True)
tree_df['Count'].astype(int).to_csv(Path(config["output_files"]["nps_in_genera_count"]), header=False)

In [70]:
#NPs in Family
# tree_df = tree_df.groupby('Family')['Count'].sum()
nps_in_family = nps_in_genera.merge(tree_df[['Genus', 'Family']], left_on='genus_name', right_on='Genus').drop(columns=['genus', 'genus_name'])
nps_in_family.groupby('Family')['structure_inchikey'].nunique().to_csv(Path(config["output_files"]["nps_in_family_count"]), header=False)

In [None]:
#count unique nps in each Family
np_in_family = nps_in_genera.groupby('genus_name')['structure_inchikey'].nunique()
np_in_family.rename('Count', inplace=True)
np_in_family = pd.DataFrame(tree_df['Genus']).merge(np_in_family,  how='left', left_on='Genus', right_index=True) #merge NPs count to tree_df

#export
np_in_family['Count'].fillna(0, inplace=True)
np_in_family['Count'].astype(int).to_csv(Path(config["output_files"]["nps_in_genera_count"]), header=False)

#summary
print(f"Tree has {np_in_family['Count'].shape[0]} genera")
print(f"At least 1 natural product report in Wikidata for {np_in_family['Count'].count()} genera")
print(f"No natural products reported in Wikidata for {np_in_family['Count'].isna().sum()} genera")
np_in_family.head(n=3)

In [26]:
#export NPs count
tree_df['NPs_count'] = tree_df['NPs_count'].fillna(0)
tree_df.head(n=5)

tree_df['NPs_count'].astype(int).to_csv(Path(config["output_files"]["nps_in_genera_count"]), header=False)
# output_file_path = Path(config["output_files"]["nps_count_in_genera"])
# tree_df.to_csv(output_file_path, index=False)

In [None]:
tree_df[~tree_df['NPs_count'].isna()]['NPs_count'].astype(int)


count    3457.000000
mean       70.358982
std       151.513886
min         1.000000
25%         6.000000
50%        21.000000
75%        64.000000
max      2127.000000
Name: NPs_count, dtype: float64