In [1]:
import os
# cd to project folder. Guarded so the cell is safe to re-run (and safe when the
# kernel is already started in the project root): the move only happens while
# the config file is not reachable from the current working directory.
if not os.path.isfile('config/config.yaml'):
    os.chdir('..')

import pandas as pd
from Bio import Phylo
from pathlib import Path
from src.utils import read_config

# Read the project configuration; the path is relative to the project folder.
CONFIG_PATH = "config/config.yaml"
config = read_config(config_path=CONFIG_PATH)

In [2]:
# Load the Troja plant list: a header-less CSV whose first column holds the names.
path = Path(config['input_files']['troja_plant_list'])
plant_list = pd.read_csv(path, header=None)
species_troja = plant_list[0]

# The first space-separated token of each name is taken as the genus;
# each genus is kept only once.
name_parts = species_troja.str.split(' ', expand=True)
genera_troja = name_parts[0].drop_duplicates()

In [3]:
# Read the Angiosperms tree (Newick format) from the location given in the config.
tree_path = Path(config["tree_files"]["zuntini_genus"])
tree = Phylo.read(tree_path, "newick")

# Collect the name of every terminal node of the tree.
tree_leaves = pd.Series(
    [terminal.name for terminal in tree.get_terminals()],
    name='leaf_name',
)

# Leaf names are split on '_' and the first four tokens are kept, interpreted
# as Order / Family / Genus / Species. The table is indexed by the full leaf name.
leaf_parts = tree_leaves.str.split('_', expand=True).iloc[:, :4]
leaf_parts.columns = ['Order', 'Family', 'Genus', 'Species']
tree_df = leaf_parts.set_index(tree_leaves)

# Distinct genus names present in the tree.
genera_tree = pd.Series(tree_df['Genus'].unique())

In [29]:
# Find overlap: tree leaves whose genus also occurs in the Troja plant list.
# The result keeps tree_df's leaf_name index, which is what gets exported below.
overlap = tree_df['Genus'][tree_df['Genus'].isin(genera_troja)]

#write iTOL binary dataset marking every overlapping leaf
output_path = Path('data/trees/iTOL_genera_in_troja.txt')
# open() does not create folders; make sure the output folder exists first
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w') as f:
    #write headers
    f.write('DATASET_BINARY\n')
    f.write('SEPARATOR COMMA\n')
    f.write('DATASET_LABEL,Troja\n') #dataset name
    f.write('COLOR,#178513\n') #dataset color
    f.write('FIELD_SHAPES,1\n') #shape code of the single field (1 = square per the original note; confirm against the iTOL template)
    f.write('FIELD_LABELS,Troja\n') #label of the single field
    f.write('DATA\n')

    #write dataset: one "<leaf_name>,1" row per overlapping leaf
    for leaf_name in overlap.index:
        f.write(f'{leaf_name},1\n')