## Estimate the genome tree using ASTRAL

In [3]:
import os

# to suppress warning from ete3 because it's not up to date with py3.12
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [4]:
# All data live in a `data/` directory that is a sibling of the directory
# this notebook is executed from.
data_dir = os.path.join(os.path.dirname(os.getcwd()), 'data')

# Input directory: must already exist (per the error message, it is produced
# by the previous notebook). `isdir` also rejects a regular file that happens
# to sit at this path, which `exists` would have accepted.
filtered_dir = os.path.join(data_dir, 'filtered')
if not os.path.isdir(filtered_dir):
    raise FileNotFoundError(f"Directory {filtered_dir} not found. Please run the previous notebook first.")

# Output directory for the genome tree. `exist_ok=True` makes the cell safe to
# re-run and avoids the check-then-create race of a separate existence test.
genome_tree_dir = os.path.join(data_dir, 'genome_tree')
os.makedirs(genome_tree_dir, exist_ok=True)

In [None]:
# ---- ASTRAL-Pro ------------------------------------------------------------
# executable
astral_bin_path = os.path.expanduser('~/bin/ASTER-Linux/bin/astral-pro')

# inputs, both read from the filtered data directory
astral_input_filepath = os.path.join(filtered_dir, 'gene_trees.broad_distribution.pruned.nwk')
astral_mapping_filepath = os.path.join(filtered_dir, 'map.gene_taxa.broad_distribution.txt')

# outputs: the tree goes into the genome tree directory, the log into data/
astral_output_filepath = os.path.join(genome_tree_dir, 'genome_tree.astral.nwk')
astral_log_path = os.path.join(data_dir, 'run.astral.log')

# ---- IQ-TREE ---------------------------------------------------------------
# executable and the concatenated alignment it is given
iqtree_bin_path = os.path.expanduser('~/bin/iqtree-2.2.2.6-Linux/bin/iqtree2')
iqtree_input_algs = os.path.join(filtered_dir, 'algs.filtered.concatenated.nex')

In [None]:
%%bash -s "$astral_bin_path" "$astral_input_filepath" "$astral_mapping_filepath" "$astral_output_filepath" "$astral_log_path"
# Positional arguments, as passed on the magic line above:
#   $1  ASTRAL-Pro executable
#   $2  input gene trees            (-i)
#   $3  gene-to-taxon mapping file  (-a)
#   $4  output tree                 (-o)
#   $5  log file; receives stderr of the run
# -t 100 is the thread count passed to ASTRAL-Pro; lower it on smaller machines.
# Every expansion is quoted so that paths containing spaces or glob characters
# are passed through as single arguments instead of being word-split.
"$1" -i "$2" -a "$3" -o "$4" -t 100 2>"$5"

## Estimate branch lengths of genome tree using IQTree

<span style="color:red">While ASTRAL-Pro should run very quickly (on the order of ~1 min), IQ-TREE might take several hours to complete. You might want to run it in a `screen` session instead of in the notebook, or run it with `nohup` and `disown`.</span>

E.g. 

```bash
# with current working directory being data/
~/bin/iqtree-2.2.2.6-Linux/bin/iqtree2 -te genome_tree/genome_tree.astral.nwk -s filtered/algs.filtered.concatenated.nex -T 8 -m Q.pfam+I+R8 --prefix genome_tree/genome_tree.iqtree
```

...or with `nohup` and `disown`

```bash
nohup ~/bin/iqtree-2.2.2.6-Linux/bin/iqtree2 -te genome_tree/genome_tree.astral.nwk -s filtered/algs.filtered.concatenated.nex -T 8 -m Q.pfam+I+R8 --prefix genome_tree/genome_tree.iqtree & disown
```

## Root the genome tree using MAD

MAD stands for Minimal Ancestor Deviation, a method for rooting phylogenetic trees.

In [1]:
%%bash
# Root the IQ-TREE tree with MAD. MAD writes the rooted tree next to its
# input, using the input file name with `.rooted` appended.
tree_file=../data/genome_tree/genome_tree.iqtree.treefile
~/bin/mad/mad "$tree_file"


MAD phylogenetic rooting

Analyzing file '../data/genome_tree/genome_tree.iqtree.treefile'...
>> [MAD=0.086_AI=0.994_CCV=9.77%_N=1/1]

Minimal ancestor deviation, MAD = 0.086
           Ambiguity index,  AI = 0.994
                  Clock CV, CCV = 9.77%
Rooted tree written to '../data/genome_tree/genome_tree.iqtree.treefile.rooted'


    - Please cite DOI:10.1038/s41559-017-0193



In [6]:
# Read the MAD-rooted tree, give every internal node a unique name, and write
# the labelled tree next to the input as `<input>.labeled`.
import ete3
import os

treefilepath = os.path.join(genome_tree_dir, 'genome_tree.iqtree.treefile.rooted')
# Fail early with an explicit message when the rooting step has not been run,
# in the same way the missing `filtered` directory is reported earlier on.
if not os.path.isfile(treefilepath):
    raise FileNotFoundError(f"Rooted tree {treefilepath} not found. Please run the MAD rooting step first.")
labeled_treefilepath = f'{treefilepath}.labeled'

print('labelling ', treefilepath)
tree = ete3.Tree(treefilepath, format=1)
# The counter runs over every visited node, leaves included, so internal node
# names are unique but not consecutive. With ete3's default level-order
# traversal the root is visited first and is therefore named 'N1'.
for i, node in enumerate(tree.traverse()):
    if not node.is_leaf():
        node.name = f'N{i+1}'
# format_root_node=True so that the root's name is written out as well.
tree.write(outfile=labeled_treefilepath, format=1, format_root_node=True)
print('done.')

labelling  /root/work/projects/hgt_ecosystem/data/genome_tree/genome_tree.iqtree.treefile.rooted
done.
