In [93]:
import ete3
import pandas as pd
import numpy as np
import os

In [94]:
# import the data needed to map our "species" key to the names used in NCBI's newick tree

In [95]:
# Tab-separated file with no header row, so pandas labels the columns 0, 1, ...
taxids = pd.read_csv(
    'allspecies.taxid',
    sep='\t',
    header=None,
)

In [96]:
# Headerless comma-separated file; columns are labelled 0, 1, ... by pandas.
name2species = pd.read_csv(
    'name2species.csv',
    header=None,
)

In [97]:
# Inner-join column 0 of `taxids` against column 1 of `name2species`.
# Cells further down read the result through the columns 'key_0', '1_x' and '0_y'.
merged = pd.merge(
    taxids,
    name2species,
    left_on=0,
    right_on=1,
)

In [98]:
# Load the metadata table; the first row of the CSV supplies the column names.
metadata = pd.read_csv(filepath_or_buffer='metaall.csv')

In [99]:
# Attach the metadata: match column '0_y' of the name table to the metadata's 'species' column.
# NOTE: this rebinds `merged`, so re-running this cell alone joins the metadata a second time.
merged = pd.merge(
    merged,
    metadata,
    left_on='0_y',
    right_on='species',
)

In [100]:
# add naming that matches that produced by NCBI:
# the 'key_0' and '1_x' values of each row, stringified and joined with ' - '
merged.loc[:, '#Names'] = merged.loc[:, ['key_0', '1_x']].apply(
    lambda row: ' - '.join(str(w) for w in row), axis=1)

# metadata feature engineering for quality proxies
# higher numbers indicate better quality genomes (hopefully)

# normalized N50: N50 as a fraction of the total assembly length
merged.loc[:, "N50_frac"] = merged.loc[:, 'N50'] / merged.loc[:, 'total_len']
# not N: complement of the 'N' column
# (assumes 'N' is stored as a fraction in [0, 1] -- TODO confirm)
merged.loc[:, "not_N"] = 1 - merged.loc[:, 'N']
# not losing BUSCOs in the proteome compared to the genome
merged.loc[:, "busco_retention"] = merged.loc[:, 'busco_C_prot'] - merged.loc[:, 'busco_C_geno']
# alternative splicing called
merged.loc[:, "busco_altsplice"] = merged.loc[:, 'busco_D_prot'] - merged.loc[:, 'busco_D_geno']
# UTRs explicitly labelled: mean of the 5' and 3' UTR counts, per mRNA
merged.loc[:, "UTRs_explicit"] = (
    (merged.loc[:, 'five_prime_UTR'] + merged.loc[:, 'three_prime_UTR'])
    / merged.loc[:, 'mRNA'] / 2
)

# finally training or not: 1 if an entry named like the species exists in ./test, else 0.
# The directory is listed once up front instead of once per row.
test_species = set(os.listdir('test'))
merged.loc[:, "is_test"] = merged.loc[:, 'species'].isin(test_species).astype(int)

In [101]:
# Engineered quality-proxy columns that get standardised before plotting.
to_scale = [
    "N50_frac",
    "not_N",
    "busco_retention",
    "busco_altsplice",
    "UTRs_explicit",
]

In [102]:
# Standardise the engineered proxies column by column:
# subtract each column's mean, then divide by its standard deviation (z-score).
x = merged.loc[:, to_scale]
x = x.sub(x.mean()).div(x.std())
# write the scaled values back over the unscaled columns
merged.loc[:, to_scale] = x

In [103]:
# Write only the columns to be plotted, in the layout ete3 expects:
# tab-separated, no index column, with '#Names' matching the leaf names in the newick tree.
export_columns = ['#Names', "is_test"] + to_scale
merged.loc[:, export_columns].to_csv(
    'meta_plottable.csv',
    sep='\t',
    index=False,
)

In [104]:
# finally plot meta info on tree
# The newick tree is loaded together with the tab-separated table exported above,
# which is passed as the tree's `text_array`.
t = ete3.ClusterTree('ncbi.mod.tre', text_array='meta_plottable.csv')
# `ts` is kept as a notebook-level name because the later prediction plot reuses it
ts = ete3.TreeStyle()
ts.show_leaf_name = True  # label each leaf with its name
# NOTE(review): presumably opens ete3's interactive viewer, so a display is needed -- confirm
t.show("heatmap", tree_style=ts)

In [105]:
# Next: plots with prediction quality (F1 scores).
# The scores loaded here are currently the old binary-encoding ones.
# NOTE: both paths are absolute and machine-specific; adjust them before running elsewhere.
aug_results_path = '/home/ali/Ankylosaurus/Core_projects/Puma/geenuff_helixer_data/metafun/Augustus_results.csv'
nn_results_path = '/home/ali/Ankylosaurus/Core_projects/Puma/geenuff_helixer_data/metafun/single_genome_results_10k_LSTM_O647Aaic_CDS.csv'
aug = pd.read_csv(aug_results_path)
nn = pd.read_csv(nn_results_path)

In [106]:
# Outer join keeps every species and every Augustus result row, matched or not.
withpred = pd.merge(
    merged,
    aug,
    how="outer",
    left_on="species",
    right_on="genome",
)

In [107]:
# Add the neural-network results the same way (outer join on species == genome).
# NOTE: this rebinds `withpred`, so re-running this cell alone joins `nn` a second time.
withpred = pd.merge(
    withpred,
    nn,
    how="outer",
    left_on="species",
    right_on="genome",
)

In [108]:
# Write the leaf names plus the three F1 columns as a tab-separated table
# without an index column, which is the layout ete3 expects.
pred_columns = ['#Names', 'f1_cds_1_RT', 'f1_total_1', 'f1_cds_1']
withpred.loc[:, pred_columns].to_csv(
    'preds_table.csv',
    sep='\t',
    index=False,
)

In [109]:
# plot with pred quality
# Same newick tree as before, this time paired with the prediction-score table.
# This rebinds `t`; the tree style `ts` comes from the earlier plotting cell,
# so that cell must have been run first.
t = ete3.ClusterTree('ncbi.mod.tre', text_array='preds_table.csv')
t.show('heatmap', tree_style=ts)