# Compile results

The code in this notebook compiles the output of each inference program into a common format for further analysis.

In [1]:
# ete3 is not yet up to date with Python 3.12 and emits SyntaxWarnings;
# silence that category so it does not clutter the notebook output.
import warnings
# NOTE: no module is given, so SyntaxWarnings from every module are ignored, not only ete3's.
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [2]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# Turn off pandas' SettingWithCopyWarning for the whole session.
# NOTE(review): this is a global switch, so it also hides the warning for any genuine
# chained-assignment mistake in code run from this notebook — confirm this is intended.
pd.options.mode.chained_assignment = None

In [3]:
# Star import of the project's compilation helpers (the compile_* / process_* / combine_*
# functions called in the cells below).
# NOTE(review): `ete3` is used further down without an explicit import in this notebook, so it
# presumably arrives through this star import as well — confirm, and consider explicit imports.
from lib.output_compilation_functions import *

In [4]:
# Central path configuration used by every cell below.

# Directory that receives all compiled results. exist_ok=True makes the cell safe to
# re-run and avoids the separate check-then-create step.
res_dir = '../data/compiled_results/'
os.makedirs(res_dir, exist_ok=True)

# Inputs: the data directory, the raw program runs, and the taxon being analysed.
data_dir = '../data'
program_runs_dir = '../data/program_runs/'
taxonomic_id = '1236'

# Species tree with labelled internal nodes. The file name is derived from taxonomic_id
# instead of repeating the literal, and the f-string is kept on one line because a
# replacement field that spans lines is only valid syntax on Python >= 3.12.
input_tree_filepath = f"{data_dir}/{taxonomic_id}_wol_tree_pruned_with_internal_labels.nwk"

In [5]:
%%bash -s "$res_dir"
# $1 is the results directory passed in from Python. It is quoted so that a path
# containing spaces is not split into several arguments. One sub-directory per method.
mkdir -p "$1"/angst "$1"/ale "$1"/ranger "$1"/gloome_ml "$1"/gloome_mp "$1"/count_ml "$1"/count_mp "$1"/wn

## AnGST

In [7]:
# Compile the AnGST results; the helper returns the branchwise table first, then the per-NOG table.
angst_output_dir = f"{program_runs_dir}/AnGST/Results/"
nogwise_branchwise_angst_df, nogwise_angst_df = compile_angst_results(
    angst_output_dir,
    input_tree_filepath,
)

# Show both tables in the notebook.
print("AnGST results compiled.\nNOGwise DF:")
display(nogwise_angst_df)
print("NOGwise branchwise DF:")
display(nogwise_branchwise_angst_df)

# Write both tables as TSV (per-NOG first, then branchwise).
angst_tsv_targets = {
    f"{res_dir}/angst/compiled_transfers.nogwise.angst.tsv": nogwise_angst_df,
    f"{res_dir}/angst/compiled_transfers.nogwise.branchwise.angst.tsv": nogwise_branchwise_angst_df,
}
for angst_tsv_path, angst_df in angst_tsv_targets.items():
    angst_df.to_csv(angst_tsv_path, sep='\t', header=True, index=False)

Original transfers DF looks like:


Unnamed: 0,nog_id,source_branch,recipient_branch
0,ER9VY,1461581,216142-243924-1028989-223283-205918-157783-384...
1,ER9VY,157783-384676-160488,1548547
2,ER9VY,1548547,1300345-913325-1199154-1122185-1736549-1385517...
3,ER9VY,1548547,697282-1116472-1091494-857087-1538553
4,ER9VY,1548547,167879-58049
...,...,...,...
47479,ERT8F,207954-619304,1897630
47480,ERT8F,207954-619304,1543721
47481,ERT8F,207954-619304,62101
47482,ERT8F,1897630,1859457


Processing transfer thresholds:   0%|          | 0/1 [00:00<?, ?it/s]

AnGST results compiled.
NOGwise DF:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,33,1
1,EQRDS,25,1
2,EQRFZ,47,1
3,EQRG2,28,1
4,EQRGC,39,1
...,...,...,...
1295,ETCI3,53,1
1296,ETCI9,30,1
1297,ETCIZ,46,1
1298,ETCJF,50,1


NOGwise branchwise DF:


Unnamed: 0,nog_id,source_branch,recipient_branch,transfers
0,ER9VY,1461581,N166,1
1,ER9VY,N165,1548547,1
2,ER9VY,1548547,N28,1
3,ER9VY,1548547,N72,1
4,ER9VY,1548547,N229,1
...,...,...,...,...
47479,ERT8F,N203,1897630,1
47480,ERT8F,N203,1543721,1
47481,ERT8F,N203,62101,1
47482,ERT8F,1897630,1859457,1


## ALE

In [8]:
# The third line of each uml_rec file holds ALE's own tree as a newick string. The internal
# node names of that tree have to be mapped onto the input species tree, so the species tree
# is loaded here and handed to the compilation helper.
input_tree = ete3.Tree(input_tree_filepath, format=1)

# Every entry of program_runs_dir whose name ends with 'ALE' is treated as one ALE run.
ale_dirs = []
for entry in os.listdir(program_runs_dir):
    if entry.endswith('ALE'):
        ale_dirs.append(entry)
print(ale_dirs)

for ale_dir in ale_dirs:
    print(f"Processing {ale_dir}...")
    ale_results_dir = os.path.join(program_runs_dir, ale_dir, 'Results')
    nogwise_ale_df, nogwise_branchwise_ale_df = compile_ale_outputs(ale_results_dir, input_tree)

    # Write the branchwise table first, then the per-NOG table; the file name carries
    # the lower-cased run directory name.
    ale_tsv_targets = {
        f"{res_dir}/ale/compiled_transfers.nogwise.branchwise.{ale_dir.lower()}.tsv": nogwise_branchwise_ale_df,
        f"{res_dir}/ale/compiled_transfers.nogwise.{ale_dir.lower()}.tsv": nogwise_ale_df,
    }
    for ale_tsv_path, ale_df in ale_tsv_targets.items():
        ale_df.to_csv(ale_tsv_path, sep="\t", header=True, index=False)

    # Show what was written.
    print("Nogwise, branchwise transfers:")
    display(nogwise_branchwise_ale_df)
    print("Nogwise transfers:")
    display(nogwise_ale_df)

    print("-----------------------------------")

['ALE']
Processing ALE...
Files to be written with ale:


Processing transfer thresholds:   0%|          | 0/100 [00:00<?, ?it/s]

Nogwise, branchwise transfers:


Unnamed: 0,nog_id,source_branch,recipient_branch,transfers
0,ERC6U,1005057,N52,0.01
1,ERC6U,1006000,349521,0.02
2,ERC6U,1009858,247634,0.01
3,ERC6U,1009858,743720,0.01
4,ERC6U,1009858,N168,0.01
...,...,...,...,...
2351035,ERZXU,N348,N207,0.01
2351036,ERZXU,N348,N332,0.05
2351037,ERZXU,N348,N344,0.02
2351038,ERZXU,N349,N59,0.01


Nogwise transfers:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,33.18,0.010000
1,EQRDS,28.04,0.010000
2,EQRFZ,50.38,0.010000
3,EQRG2,29.78,0.010000
4,EQRGC,43.20,0.010000
...,...,...,...
0,EREPP,2.27,2.178687
0,EREPP,2.27,2.201515
0,EREPP,2.27,2.224343
0,EREPP,2.27,2.247172


-----------------------------------


## Ranger

In [9]:
# One run per entry of program_runs_dir whose name starts with 'RANGER'. The run name is
# kept next to its Results path so it does not have to be parsed back out of the path.
ranger_run_names = [d for d in os.listdir(program_runs_dir) if d.startswith('RANGER')]
ranger_output_dirs = [f"{program_runs_dir}/{d}/Results/" for d in ranger_run_names]

for ranger_dir, ranger_output_dir in zip(ranger_run_names, ranger_output_dirs):
    print(f"Processing {ranger_output_dir}...")
    print(f"Ranger dir: {ranger_dir}")

    # The helper returns the branchwise table first, then the per-NOG table.
    nogwise_branchwise_ranger_df, nogwise_ranger_df = compile_ranger_results(
        ranger_output_dir,
        input_tree_filepath,
    )

    print("NOGwise DF:")
    display(nogwise_ranger_df)
    print("NOGwise branchwise DF:")
    display(nogwise_branchwise_ranger_df)

    # File names carry the lower-cased run name (e.g. 'ranger', 'ranger-fast').
    ranger_tsv_targets = {
        f"{res_dir}/ranger/compiled_transfers.nogwise.{os.path.basename(ranger_dir).lower()}.tsv": nogwise_ranger_df,
        f"{res_dir}/ranger/compiled_transfers.nogwise.branchwise.{os.path.basename(ranger_dir).lower()}.tsv": nogwise_branchwise_ranger_df,
    }
    for ranger_tsv_path, ranger_df in ranger_tsv_targets.items():
        ranger_df.to_csv(ranger_tsv_path, sep='\t', header=True, index=False)

Processing ../data/program_runs//RANGER/Results/...
Ranger dir: RANGER


Processing NOGs:   0%|          | 0/1300 [00:00<?, ?it/s]

Processing transfer thresholds:   0%|          | 0/100 [00:00<?, ?it/s]

NOGwise DF:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,33.21,0.010000
1,EQRDS,26.65,0.010000
2,EQRFZ,50.00,0.010000
3,EQRG2,28.96,0.010000
4,EQRGC,41.41,0.010000
...,...,...,...
0,ET4RU,3.71,3.560505
0,ET4RU,3.71,3.597879
0,ET4RU,3.71,3.635253
0,ET4RU,3.71,3.672626


NOGwise branchwise DF:


Unnamed: 0,nog_id,source_branch,recipient_branch,transfers
0,ERCE7,1187848,N334,0.32
1,ERCE7,N322,N309,0.51
2,ERCE7,326297,211586,0.43
3,ERCE7,698738,N231,0.05
4,ERCE7,N109,247634,0.03
...,...,...,...,...
184761,ERVXT,768671,1300345,0.04
184762,ERVXT,743721,N13,0.10
184763,ERVXT,N63,1300345,0.16
184764,ERVXT,N14,765910,0.13


Processing ../data/program_runs//RANGER-Fast/Results/...
Ranger dir: RANGER-Fast


Processing NOGs:   0%|          | 0/1300 [00:00<?, ?it/s]

Processing transfer thresholds:   0%|          | 0/100 [00:00<?, ?it/s]

NOGwise DF:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,32.0,0.010000
1,EQRDS,26.0,0.010000
2,EQRFZ,50.0,0.010000
3,EQRG2,27.0,0.010000
4,EQRGC,41.0,0.010000
...,...,...,...
0,ET4FF,4.0,3.838788
0,ET4FF,4.0,3.879091
0,ET4FF,4.0,3.919394
0,ET4FF,4.0,3.959697


NOGwise branchwise DF:


Unnamed: 0,nog_id,source_branch,recipient_branch,transfers
0,ERDFX,N278,667129,1.00
1,ERDFX,N94,1797696,1.00
2,ERDFX,N293,N326,1.00
3,ERDFX,406818,406817,1.00
4,ERDFX,1681196,N317,1.00
...,...,...,...,...
49266,ERVXT,N13,216778,1.00
49267,ERVXT,1392540,N128,0.54
49268,ERVXT,225848,314608,1.00
49269,ERVXT,N14,N230,1.00


## GLOOME

GLOOME infers only gains and losses on branches; it does not infer where a gained gene came from. The compiled tables therefore carry no source information (`source_branch` is set to `unknown`), and only the recipient branch is known.

In [10]:
def _export_gloome_results(results, method_subdir, filename_for, should_display=lambda key: True):
    """Display and write every DataFrame of one GLOOME result mapping.

    Parameters
    ----------
    results : mapping of str -> DataFrame
        Result of read_and_compile_gloome_results; iterated with .items().
    method_subdir : str
        Sub-directory of res_dir that receives the files ('gloome_ml' or 'gloome_mp').
    filename_for : callable(str) -> str
        Turns a result key into the TSV file name.
    should_display : callable(str) -> bool, optional
        Frames whose key satisfies this predicate are printed and displayed;
        every frame is written regardless. Defaults to showing all frames.
    """
    for key, df in results.items():
        if should_display(key):
            print(f"{key} df:")
            display(df)
        df.to_csv(f"{res_dir}/{method_subdir}/{filename_for(key)}", index=False, header=True, sep='\t')


def _is_gain_penalty_ratio_8(key):
    """Display filter for the MP results: intended to keep only gain penalty ratio 8.

    NOTE(review): in the recorded run of this notebook no MP frame was displayed, so this
    condition matched no key. The ML keys printed in that run end with '.', and the MP
    keys are likewise written as f"{key}tsv" — confirm the MP key format returned by
    read_and_compile_gloome_results and adjust the test if needed.
    """
    return key.endswith('8')


# Presence/absence matrix; the file name is derived from taxonomic_id rather than a literal.
pa_matrix_tsv_filepath = f"{data_dir}/{taxonomic_id}_pa_matrix.tsv"

# ---- runs WITH the species tree ----
gloome_output_dir_wt = f"{program_runs_dir}/GLOOME_with_tree/Results_GLOOME_ML_with_tree/"

# ml: keys end with '.', so appending 'tsv' yields '<key>tsv' == '....ml.tsv'
gloome_ml_results_dict = read_and_compile_gloome_results(
    gloome_output_dir_wt, pa_matrix_tsv_filepath, "ml", input_tree_filepath
)
_export_gloome_results(gloome_ml_results_dict, "gloome_ml", lambda key: f"{key}tsv")

# mp: one result directory per entry starting with 'Results_GLOOME_MP_'
gloome_mp_output_dir_wt = [f"{program_runs_dir}/GLOOME_with_tree/{fi}" for fi in os.listdir(f"{program_runs_dir}/GLOOME_with_tree/") if fi.startswith('Results_GLOOME_MP_')]
gloome_mp_results_dict = read_and_compile_gloome_results(
    gloome_mp_output_dir_wt, pa_matrix_tsv_filepath, "mp", input_tree_filepath
)
_export_gloome_results(
    gloome_mp_results_dict, "gloome_mp", lambda key: f"{key}tsv",
    should_display=_is_gain_penalty_ratio_8,
)

# ---- runs WITHOUT the species tree (no tree path is passed to the helper) ----
gloome_output_dir_wot = f"{program_runs_dir}/GLOOME_without_tree/Results_GLOOME_ML_without_tree/"

# ml: same keys as above, so the file name gets a 'without_tree.' marker before the extension
gloome_ml_results_dict = read_and_compile_gloome_results(
    gloome_output_dir_wot, pa_matrix_tsv_filepath, "ml", None
)
_export_gloome_results(gloome_ml_results_dict, "gloome_ml", lambda key: f"{key}without_tree.tsv")

# mp: the 'without_tree.' marker is inserted right after '.gloome.mp.' in the key
gloome_mp_output_dir = [f"{program_runs_dir}/GLOOME_without_tree/{fi}" for fi in os.listdir(f"{program_runs_dir}/GLOOME_without_tree/") if fi.startswith('Results_GLOOME_MP_')]
gloome_mp_results_dict = read_and_compile_gloome_results(
    gloome_mp_output_dir, pa_matrix_tsv_filepath, "mp", None
)
_export_gloome_results(
    gloome_mp_results_dict, "gloome_mp",
    lambda key: key.replace('.gloome.mp.', '.gloome.mp.without_tree.') + "tsv",
    should_display=_is_gain_penalty_ratio_8,
)

Processing transfer thresholds:   0%|          | 0/100 [00:00<?, ?it/s]

compiled_transfers.nogwise.branchwise.gloome.ml. df:


Unnamed: 0,nog_id,source_branch,recipient_branch,gloome_branch_name,transfers,transfer_threshold
0,ERJME,unknown,1896966,1896966,0.10960,0.10550
2,ERJME,unknown,N357,N2,0.12460,0.12360
15,ERJME,unknown,216778,216778,0.94510,0.94460
19,ERJME,unknown,870187,870187,0.73870,0.73370
24,ERJME,unknown,519989,519989,0.72170,0.72100
...,...,...,...,...,...,...
230858,ERVXT,unknown,N287,N314,0.07843,0.07701
230862,ERVXT,unknown,515618,515618,0.16350,0.15470
230864,ERVXT,unknown,N343,N348,0.24960,0.24870
230871,ERVXT,unknown,762983,762983,0.12800,0.12200


compiled_transfers.nogwise.gloome.ml. df:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,16.85095,0.050000
1,EQRDS,15.71871,0.050000
2,EQRFZ,11.55448,0.050000
3,EQRG2,1.43731,0.050000
4,EQRGC,38.81869,0.050000
...,...,...,...
213,ETC37,0.99770,0.990107
214,ETCDP,1.99540,0.990107
0,EQYJP,1.00000,0.999700
1,ES59Z,1.00000,0.999700


Processing transfer thresholds:   0%|          | 0/100 [00:00<?, ?it/s]

compiled_transfers.nogwise.branchwise.gloome.ml. df:


Unnamed: 0,nog_id,source_branch,recipient_branch,gloome_branch_name,transfers,transfer_threshold
0,ERJME,unknown,N19,N19,0.05641,0.05574
2,ERJME,unknown,N22,N22,0.08738,0.08582
5,ERJME,unknown,N27,N27,0.08184,0.08038
7,ERJME,unknown,N28,N28,0.20670,0.19580
9,ERJME,unknown,N35,N35,0.05857,0.05849
...,...,...,...,...,...,...
278741,ERVXT,unknown,N346,N346,0.07211,0.07206
278743,ERVXT,unknown,1859457,1859457,0.05085,0.05009
278745,ERVXT,unknown,1632859,1632859,0.05082,0.05006
278747,ERVXT,unknown,N356,N356,0.06755,0.06615


compiled_transfers.nogwise.gloome.ml. df:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,20.35391,0.05
1,EQRDS,35.32471,0.05
2,EQRFZ,27.24602,0.05
3,EQRG2,10.47929,0.05
4,EQRGC,60.32069,0.05
...,...,...,...
931,ETCEW,1.01600,1.00
932,ETCI3,1.00400,1.00
933,ETCI9,2.01800,1.00
934,ETCIZ,2.03000,1.00


## Count

### Asymmetric Wagner Parsimony

In [11]:
# Compile the per-NOG transfer counts from the Count (Wagner parsimony) runs.
count_MP_output_dir = f"{program_runs_dir}/Count/Count_MP/"
count_MP_nogwise_transfers_df = compile_count_mp_nogwise_transfers(
    count_MP_output_dir,
    taxonomic_id,
    res_dir,
)

print("Count MP nogwise transfers:")
display(count_MP_nogwise_transfers_df)

# Write the table next to the other Count MP outputs.
count_mp_nogwise_tsv_path = f"{res_dir}/count_mp/compiled_transfers.nogwise.count.mp.tsv"
count_MP_nogwise_transfers_df.to_csv(count_mp_nogwise_tsv_path, sep='\t', header=True, index=False)

Processing transfer thresholds:   0%|          | 0/10 [00:00<?, ?it/s]

Count MP nogwise transfers:


Unnamed: 0,nog_id,transfers,transfer_threshold
11,EQRZ4,1,8
14,EQSA4,1,8
15,EQSAX,1,8
23,EQSTQ,2,8
24,EQSTU,1,8
...,...,...,...
1295,ETCI3,126,0.33
1296,ETCI9,138,0.33
1297,ETCIZ,157,0.33
1298,ETCJF,180,0.33


In [12]:
# Run the helper shell script that compiles the Count MP branchwise transfers; its log shows it
# reading from Count/Count_MP/ under program_runs_dir and using count_mp/ under res_dir as output dir.
# The interpolated Python values are double-quoted so that a path containing spaces reaches
# the script as a single argument.
!bash lib/compile_count_mp_nw_bw_parallel.sh "$program_runs_dir" "$taxonomic_id" "$input_tree_filepath" "$res_dir"

Input dir is ../data/program_runs//Count/Count_MP/
Tree filepath is ../data/1236_wol_tree_pruned_with_internal_labels.nwk and output dir is ../data/compiled_results//count_mp/
Files are ../data/program_runs//Count/Count_MP//1236_Count_output_gain_0.33_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_0.5_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_1_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_2_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_3_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_4_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_5_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_6_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_7_families.tsv ../data/program_runs//Count/Count_MP//1236_Count_output_gain_8_families.tsv
Number of files is 10
Running: python3 lib/compil

In [13]:
# Collect the branchwise files that sit in count_mp/. The combined table produced below is
# written directly under res_dir (not inside count_mp/), so a combined file left over from an
# earlier run is never picked up as an input here.
count_mp_files = []
for fname in os.listdir(f"{res_dir}/count_mp/"):
    if fname.startswith("compiled_transfers.nogwise.branchwise.count.mp"):
        count_mp_files.append(f"{res_dir}/count_mp/{fname}")

# Merge the collected files into a single table.
count_mp_nogwise_branchwise_df = combine_count_mp_nw_bw_transfers(count_mp_files)

print("Count MP nogwise branchwise transfers:")
display(count_mp_nogwise_branchwise_df)

count_mp_combined_tsv_path = f"{res_dir}/compiled_transfers.nogwise.branchwise.count.mp.tsv"
count_mp_nogwise_branchwise_df.to_csv(count_mp_combined_tsv_path, sep='\t', header=True, index=False)

Gain penalty ratios:


['0.5', '8', '1', '4', '2', '3', '7', '5', '6', '0.33']

Count MP nogwise branchwise transfers:


Unnamed: 0,nog_id,recipient_branch,transfers,source_branch
0,EQRBG,N146,0.33,unknown
1,EQRBG,N236,0.33,unknown
2,EQRBG,1515746,0.33,unknown
3,EQRBG,N100,0.33,unknown
4,EQRBG,58049,0.33,unknown
...,...,...,...,...
712639,ETCUH,N311,8,unknown
712640,ETCUH,1926881,8,unknown
712641,ETCUH,N304,8,unknown
712642,ETCUH,69222,8,unknown


### Maximum Likelihood

In [14]:
# Parse the single Count ML output file into three tables:
# branchwise gains, per-NOG gains, and branchwise losses.
count_ML_output_dir = f"{program_runs_dir}/Count/Count_ML/"
count_ML_output_file = os.path.join(count_ML_output_dir, "Count_output.tsv")
(
    count_ml_nw_bw_gains_df,
    count_ml_nogwise_gains_df,
    count_ml_nw_bw_losses_df,
) = process_count_ml_output(count_ML_output_file)

# For each table: print its heading, display it, then write it out as TSV.
count_ml_outputs = [
    (
        "Count_ML NOGwise branchwise transfers:",
        count_ml_nw_bw_gains_df,
        f"{res_dir}/count_ml/compiled_transfers.nogwise.branchwise.count.ml.tsv",
    ),
    (
        "Count ML NOGwise gains:",
        count_ml_nogwise_gains_df,
        f"{res_dir}/count_ml/compiled_transfers.nogwise.count.ml.tsv",
    ),
    (
        "Count_ML NOGwise branchwise losses:",
        count_ml_nw_bw_losses_df,
        f"{res_dir}/count_ml/compiled_losses.nogwise.branchwise.count.ml.tsv",
    ),
]
for count_ml_heading, count_ml_df, count_ml_tsv_path in count_ml_outputs:
    print(count_ml_heading)
    display(count_ml_df)
    count_ml_df.to_csv(count_ml_tsv_path, sep='\t', header=True, index=False)

Processing transfer thresholds:   0%|          | 0/100 [00:00<?, ?it/s]

Count_ML NOGwise branchwise transfers:


Unnamed: 0,nog_id,recipient_branch,transfers,source_branch
33,ERTPC,1896966,0.996428,unknown
57,ET8I8,1896966,0.996019,unknown
148,ERUVU,1896966,0.996502,unknown
158,ERJ56,1896966,0.996487,unknown
176,ERENP,1896966,0.996191,unknown
...,...,...,...,...
930795,ER3QB,N357,0.000007,unknown
930796,ER3QZ,N357,0.000005,unknown
930797,ESNNU,N357,0.000017,unknown
930798,ER6B2,N357,0.000005,unknown


Count ML NOGwise gains:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQRBG,15.700713,1.107008e-15
1,EQRDS,17.821400,1.107008e-15
2,EQRFZ,19.090728,1.107008e-15
3,EQRG2,6.146018,1.107008e-15
4,EQRGC,32.652265,1.107008e-15
...,...,...,...
996,ETCI9,3.989736,9.898990e-01
997,ETCIZ,0.999188,9.898990e-01
998,ETCJF,1.982222,9.898990e-01
999,ETCUH,1.992892,9.898990e-01


Count_ML NOGwise branchwise losses:


Unnamed: 0,nog_id,branch,losses
0,ERJME,1896966,0.000053
1,ESZHK,1896966,0.000053
2,ERZ77,1896966,0.000053
3,ESRQ3,1896966,0.000053
4,ESD2G,1896966,0.068212
...,...,...,...
930795,ER3QB,N357,0.000053
930796,ER3QZ,N357,0.000053
930797,ESNNU,N357,0.000053
930798,ER6B2,N357,0.000053


## Wn

In [15]:
# Inputs for Wn: the list of HGT genes it reported and the NOG membership table for this taxon.
wn_hgt_genes_filepath = f"{program_runs_dir}/Wn/Results/HGT_genes.tsv"
members_filepath = f"{data_dir}/{taxonomic_id}_nog_members.tsv"

# The helper returns the per-NOG table first, then the branchwise table.
wn_nogwise_hgt_df, wn_nogwise_branchwise_hgt_df = compile_wn_results(
    wn_hgt_genes_filepath,
    members_filepath,
)

# Show both tables before anything is written.
for wn_heading, wn_df in (
    ("Wn NOGwise HGTs:", wn_nogwise_hgt_df),
    ("Wn NOGwise branchwise HGTs:", wn_nogwise_branchwise_hgt_df),
):
    print(wn_heading)
    display(wn_df)

# Write both tables as TSV.
wn_tsv_targets = {
    f"{res_dir}/wn/compiled_transfers.nogwise.wn.tsv": wn_nogwise_hgt_df,
    f"{res_dir}/wn/compiled_transfers.nogwise.branchwise.wn.tsv": wn_nogwise_branchwise_hgt_df,
}
for wn_tsv_path, wn_df in wn_tsv_targets.items():
    wn_df.to_csv(wn_tsv_path, sep='\t', header=True, index=False)

Gene to NOG map for some genes:
1051646.IX91_06790: ERJME
1051646.IX91_17690: ERJME
1076588.TBH_C2789: ERJME
1127673.GLIP_2986: ERJME
1177154.Y5S_01274: ERJME


Processing transfer thresholds:   0%|          | 0/10 [00:00<?, ?it/s]

Wn NOGwise HGTs:


Unnamed: 0,nog_id,transfers,transfer_threshold
0,EQSTU,1,13.0
1,EQTSP,1,13.0
2,EQUN1,1,13.0
3,EQUY3,2,13.0
4,EQUY4,1,13.0
...,...,...,...
965,ETCDP,32,4.0
966,ETCEN,41,4.0
967,ETCI9,5,4.0
968,ETCIZ,27,4.0


Wn NOGwise branchwise HGTs:


Unnamed: 0,nog_id,source_branch,recipient_branch,transfers
0,EQRDS,unknown,585056,4.0
1,EQRFZ,unknown,966,6.0
2,EQRFZ,unknown,28173,5.0
3,EQRFZ,unknown,43263,5.0
4,EQRFZ,unknown,69222,5.0
...,...,...,...,...
16695,ETCIZ,unknown,1134474,4.0
16696,ETCIZ,unknown,1245471,4.0
16697,ETCIZ,unknown,1354303,4.0
16698,ETCIZ,unknown,1392540,5.0
