# Prepare data for dN/dS analysis

## Download genome data

We already have the GFF files for the genomes of all the taxa in the NOGs of the dataset. We will now download the genome sequences of these species from NCBI, using a script that runs `datasets` (from the NCBI command-line tools) in parallel.


In [1]:
# Suppress SyntaxWarning messages triggered by ete3, which is not yet up to date
# with Python 3.12. This runs in its own cell ahead of the main import cell so
# that the filter is already in place when ete3 is imported.
import warnings
warnings.filterwarnings("ignore", category=SyntaxWarning)

In [2]:
import os
import glob
import shutil
import gzip
import base64
from io import StringIO
import multiprocessing as mp
import subprocess
import traceback


import pandas as pd
from Bio import AlignIO
from Bio.Nexus import Nexus
import tqdm
import ete3

In [3]:
# Directory layout used throughout the notebook; everything hangs off ../data
data_dir = os.path.abspath("../data/")

# --- inputs ---
genome_data_dir = os.path.join(data_dir, "genome_data/")
filtered_dir = os.path.join(data_dir, "filtered/")
algs_dir = os.path.join(filtered_dir, "single_copy_nog_algs/")
genome_tree_filepath = os.path.join(
    data_dir, "genome_tree", "genome_tree.iqtree.treefile.rooted.labeled"
)

# --- outputs of the dN/dS analysis ---
dnds_dir = os.path.join(data_dir, "inferences", "dn_ds")
dnds_input_dir = os.path.join(dnds_dir, "input")

# create the input directory for the dN/dS runs on first use (and say so)
dnds_input_dir_is_missing = not os.path.exists(dnds_input_dir)
if dnds_input_dir_is_missing:
    print(f"Creating directory {dnds_input_dir}")
    os.makedirs(dnds_input_dir, exist_ok=True)

In [4]:

# From genome_data/, read every .gff file and pull the genome accession ID out of
# its '#!genome-build-accession' header line.
genome_accession_ids = []
# sorted(): os.listdir returns entries in arbitrary order; sorting makes the
# accession list (and the file written below) reproducible between runs.
for file in sorted(os.listdir(genome_data_dir)):
    if not file.endswith(".gff"):
        continue
    # os.path.join instead of string concatenation, so the path is correct
    # whether or not genome_data_dir ends with a path separator.
    with open(os.path.join(genome_data_dir, file)) as f:
        for line in f:
            if line.startswith("#!genome-build-accession"):
                # the accession is taken as the text between the first and second ':'
                genome_accession_ids.append(line.split(':')[1].strip())
                break
        else:
            # for/else: reached only when no header line matched in this file
            raise ValueError("No genome accession ID found in " + file)

# write the genome accession IDs to a file, one per line
with open(os.path.join(genome_data_dir, "genome_accession_ids.txt"), "w") as f:
    f.write("\n".join(genome_accession_ids) + "\n")

Then we download the genome assemblies for this list of accessions from NCBI in parallel.

```bash
python code/download_ncbi_genome_sequences.py -m acc -i data/genome_data/genome_accession_ids.txt -o data/genome_data/ -k ~/ncbi_credentials.txt 2> data/nohup_genome_fna_dload.log
```

# Prepare NOG nucleotide sequence files

Extract the nucleotide sequences of the NOGs from the genomes of the species in the dataset.

```bash
python code/prepare_nog_nucl_seqs.py -m data/filtered/map.nog_members_single_copy.tsv  -j data/filtered/gene_features.json -g data/genome_data -o data/filtered/single_copy_nog_algs/ 2>&1 | tee data/prepare_nog_nucl_seqs.log
```

In [5]:
# Extract the compressed alignments from the NOG -> alignment mapping file.
# All input paths are built from filtered_dir (an absolute path), so they point at
# the same files as the rest of the notebook regardless of the current working directory.
nog_to_alg_filepath = os.path.join(filtered_dir, "map.nog_alg.filtered.tsv")
single_copy_nog_members_map_filepath = os.path.join(filtered_dir, "map.nog_members_single_copy.tsv")
filtered_taxa_w_available_gff_filepath = os.path.join(
    filtered_dir, 'taxa.overlap_filtered.gff_filtered.txt')

# exist_ok=True already makes this a no-op when the directory is present
os.makedirs(algs_dir, exist_ok=True)

# read in the taxa with available GFF files, as a set of string IDs
filtered_taxa_w_available_gff_set = set(pd.read_csv(
    filtered_taxa_w_available_gff_filepath, header=None, names=['taxon_id'], dtype=str)['taxon_id'].tolist())

# read in the single copy nog members map (headerless TSV: nog, members)
single_copy_nog_members_map = pd.read_csv(single_copy_nog_members_map_filepath, sep="\t", header=None,
                                          names=["nog", "members"], dtype=str)
single_copy_nogs_set = set(single_copy_nog_members_map["nog"].tolist())

# read in the nog to alg mapping (headerless TSV: nog, algs)
nog_to_alg = pd.read_csv(nog_to_alg_filepath, sep="\t", header=None,
                         names=["nog", "algs"], dtype=str)

# keep only the rows whose NOG is in the single-copy set
single_copy_nog_to_alg_df = nog_to_alg[nog_to_alg["nog"].isin(
    single_copy_nogs_set)]
def process_alignment(args):
    """Decode one NOG alignment and write it to ``<algs_dir>/<nog>.faa``.

    Only records whose taxon ID is in the supplied set (the taxa with available
    GFF files, as passed by the caller) are written.

    Parameters
    ----------
    args : tuple
        ``(row, taxa_set)``. ``row`` must provide ``row['nog']`` (used as the
        output file stem) and ``row['algs']`` (a base64-encoded, gzip-compressed
        FASTA alignment). ``taxa_set`` is the collection of taxon IDs to keep.

    Notes
    -----
    The taxon ID of a record is the part of its ID before the first '.'; that
    truncated ID is also what is written as the FASTA header. The output file is
    created even when no record is kept. Returns ``None``.
    """
    row, taxa_set = args
    out_filepath = os.path.join(algs_dir, f'{row["nog"]}.faa')

    # base64 -> gzip -> text, then parse as a FASTA alignment
    fasta_text = gzip.decompress(base64.b64decode(row['algs'])).decode()
    kept_records = []
    for record in AlignIO.read(StringIO(fasta_text), format='fasta'):
        taxon_id = record.id.split('.')[0]
        if taxon_id in taxa_set:
            kept_records.append((taxon_id, record.seq))

    # one output file per OG
    with open(out_filepath, 'w') as out_f:
        out_f.writelines(f'>{taxon_id}\n{seq}\n' for taxon_id, seq in kept_records)

# process the alignments in parallel
algs_args = [(row, filtered_taxa_w_available_gff_set) for _, row in single_copy_nog_to_alg_df.iterrows()]
# Leave two cores free, but never request fewer than one worker:
# mp.Pool raises ValueError when the number of processes is < 1 (machines with <= 2 CPUs).
n_workers = max(1, mp.cpu_count() - 2)
with mp.Pool(n_workers) as pool:
    # list(...) drains the lazy iterator so that every task runs and tqdm can report progress
    list(tqdm.tqdm(pool.imap_unordered(process_alignment, algs_args), total=len(algs_args), desc='Writing AA alignments'))

Writing AA alignments: 100%|██████████| 3718/3718 [00:04<00:00, 800.23it/s] 


## Prepare input files for HyPhy BUSTED


In [6]:
# Read the branch-wise output of `count` and split the branches into those with
# and those without ecotype transfers.
count_compiled_branchwise_tsv = os.path.join(data_dir, 'compiled_results', 'compiled_transfers.branchwise.ecotype.count.tsv')
count_compiled_branchwise = pd.read_csv(count_compiled_branchwise_tsv, sep='\t', header=0, dtype=str)

# every column is read as str, so the transfer count is compared against the literal '0'
has_transfers = count_compiled_branchwise['transfers'] != '0'
has_no_transfers = count_compiled_branchwise['transfers'] == '0'
branches_w_transfers = count_compiled_branchwise.loc[has_transfers, 'branch'].dropna().tolist()
branches_wo_transfers = count_compiled_branchwise.loc[has_no_transfers, 'branch'].dropna().tolist()
print(f'Branches with ecotype transfers: {len(branches_w_transfers)}, branches without ecotype transfers: {len(branches_wo_transfers)}')

# File paths intended for the two branch lists.
# NOTE(review): only the paths are defined in this cell; nothing is written to them here.
branches_w_transfers_filepath = os.path.join(data_dir, 'compiled_results', 'branches_w_ecotype_transfers.count.txt')
branches_wo_transfers_filepath = os.path.join(data_dir, 'compiled_results', 'branches_wo_ecotype_transfers.count.txt')


Branches with ecotype transfers: 227, branches without ecotype transfers: 89


In [7]:
def convert_pal2nal(args):
    """Run pal2nal for one NOG and store its FASTA output in ``algs_dir``.

    Parameters
    ----------
    args : tuple
        ``(protein_aln_filepath, nog_name)``. The nucleotide sequences are read
        from ``<algs_dir>/<nog_name>.fna`` and pal2nal's standard output is
        written to ``<algs_dir>/<nog_name>.aln.fna``.

    Notes
    -----
    A non-zero exit status is reported by printing pal2nal's stderr; no exception
    is raised and the output file is left as written. Returns ``None``.
    """
    protein_aln_filepath, nog_name = args
    nucl_seqs_filepath = os.path.join(algs_dir, f'{nog_name}.fna')
    nucl_aln_filepath = os.path.join(algs_dir, f'{nog_name}.aln.fna')

    pal2nal_cmd = ['perl', '/root/bin/pal2nal.v14/pal2nal.pl']
    pal2nal_cmd += [protein_aln_filepath, nucl_seqs_filepath]
    # '-nogap' (do not allow gaps in the output) is deliberately not passed
    pal2nal_cmd += ['-output', 'fasta']

    with open(nucl_aln_filepath, 'w') as out_f:
        completed = subprocess.run(pal2nal_cmd, stdout=out_f, stderr=subprocess.PIPE, text=True)

    if completed.returncode != 0:
        print(f"Error processing {nog_name}: {completed.stderr}")

# Prepare the arguments for parallel processing: (protein alignment path, NOG name)
pal2nal_args = [(i, os.path.basename(i).replace('.faa', '')) for i in glob.glob(os.path.join(algs_dir, '*.faa'))]

# Convert the protein alignments to nucleotide alignments using pal2nal in parallel.
# Leave two cores free, but never request fewer than one worker:
# mp.Pool raises ValueError when the number of processes is < 1 (machines with <= 2 CPUs).
n_workers = max(1, mp.cpu_count() - 2)
with mp.Pool(n_workers) as pool:
    # list(...) drains the lazy iterator so that every task runs and tqdm can report progress
    list(tqdm.tqdm(pool.imap_unordered(convert_pal2nal, pal2nal_args), total=len(pal2nal_args), desc='Converting protein alignments to nucleotide alignments'))

# Prepare hyphy tree files for each alignment and rewrite taxa IDs
gtree = ete3.Tree(genome_tree_filepath, format=1)
empty_aln_files = []

aln_treefile_filepaths = []
aln_treefile_fg_filepaths = []
aln_treefile_bg_filepaths = []
skip_tree_ogs = []
branch_count_dict = {}

# Suffixes appended to node names: '{Test}' marks the foreground set,
# the other set is left unlabelled.
fg_suffix = '{Test}'
bg_suffix = ''

# Bring the branch names from `count` in line with the renamed leaves: names that
# already start with 'L' or 'N' are kept, every other name gets an 'L' prefix.
# This is loop-invariant, so it is done once here instead of once per alignment.
# Slice assignment updates the existing lists in place, so code that runs after
# this cell still sees the prefixed names.
for branch_names in (branches_w_transfers, branches_wo_transfers):
    branch_names[:] = [
        name if name.startswith(('L', 'N')) else 'L' + name
        for name in branch_names
    ]
# sets give constant-time membership tests for the per-node checks below
branches_w_transfers_set = set(branches_w_transfers)
branches_wo_transfers_set = set(branches_wo_transfers)

for aln_filepath in tqdm.tqdm(glob.glob(os.path.join(algs_dir, '*.aln.fna')), desc='Preparing hyphy tree files and rewriting taxa IDs'):
    with open(aln_filepath) as f:
        aln_taxa_list = [line.strip().strip('>') for line in f if line.startswith('>')]
    if not aln_taxa_list:
        # no FASTA header in the file: record it and move on
        empty_aln_files.append(aln_filepath)
        continue

    # restrict the genome tree to the taxa present in this alignment
    pruned_tree = gtree.copy()
    pruned_tree.prune(aln_taxa_list)

    # Rewrite taxa IDs in the alignments and treefiles
    nog_id = os.path.basename(aln_filepath).replace('.aln.fna', '')
    try:
        aln = AlignIO.read(aln_filepath, format='fasta')
    except ValueError as e:
        print(f"Error reading {aln_filepath}: {e}")
        continue
    for record in aln:
        if not record.id.startswith('L'):
            record.id = f'L{record.id}'
            # clear the description so that only the new ID ends up in the header
            record.description = ''
    for leaf in pruned_tree:
        if not leaf.name.startswith('L'):
            leaf.name = 'L' + leaf.name

    # write out this tree to a file
    # NOTE(review): this happens before the skip check below, so a skipped NOG
    # still gets a plain .treefile (but no alignment) in dnds_input_dir.
    tree_out_filepath = os.path.join(dnds_input_dir, f'{nog_id}.treefile')
    tree_out_filepath_fg = os.path.join(dnds_input_dir, f'{nog_id}.fg.treefile')
    tree_out_filepath_bg = os.path.join(dnds_input_dir, f'{nog_id}.bg.treefile')
    pruned_tree.write(outfile=tree_out_filepath, format=1)

    # first check if each set of branches has more than one branch in the tree
    node_names = [node.name for node in pruned_tree.traverse()]
    n_w_transfers = sum(name in branches_w_transfers_set for name in node_names)
    n_wo_transfers = sum(name in branches_wo_transfers_set for name in node_names)
    if n_w_transfers < 2 or n_wo_transfers < 2:
        skip_tree_ogs.append(nog_id)
        continue

    # Note: we are making two copies of the pruned tree,
    # one where the branches with transfers are labeled as foreground branches,
    # and the other where the branches without transfers are labeled as foreground branches.
    # BUSTED results should be symmetric wrt which tree is used,
    # and we can check this by comparing the results from the two runs.
    # (The copies are made only after the skip check, so skipped NOGs cost no extra copies.)
    pruned_tree_fg = pruned_tree.copy()
    pruned_tree_bg = pruned_tree.copy()

    for node in pruned_tree_fg.traverse():
        if node.name in branches_w_transfers_set:
            node.name += fg_suffix
        elif node.name in branches_wo_transfers_set:
            node.name += bg_suffix

    for node in pruned_tree_bg.traverse():
        if node.name in branches_w_transfers_set:
            node.name += bg_suffix
        elif node.name in branches_wo_transfers_set:
            node.name += fg_suffix

    # Write out the alignment file
    aln_out_filepath = os.path.join(dnds_input_dir, f'{nog_id}.aln.fna')
    AlignIO.write(aln, aln_out_filepath, format='fasta')

    # Write out the tree files
    pruned_tree_fg.write(outfile=tree_out_filepath_fg, format=1)
    pruned_tree_bg.write(outfile=tree_out_filepath_bg, format=1)

    # Append pairs of filepaths to the list
    aln_treefile_fg_filepaths.append((aln_out_filepath, tree_out_filepath_fg))
    aln_treefile_bg_filepaths.append((aln_out_filepath, tree_out_filepath_bg))
    aln_treefile_filepaths.append((aln_out_filepath, tree_out_filepath))

    # in a different file, we write out for each NOG, the number of branches with and without transfers, and total
    branch_count_dict[nog_id] = {
        'with_transfers': n_w_transfers,
        'without_transfers': n_wo_transfers,
        'total': n_w_transfers + n_wo_transfers
    }
    

def _write_filepath_pairs(tsv_filepath, filepath_pairs):
    """Write (alignment path, tree path) pairs as a headerless two-column TSV."""
    with open(tsv_filepath, 'w') as out_f:
        out_f.writelines(
            f"{aln_path}\t{tree_path}\n" for aln_path, tree_path in filepath_pairs
        )


# Report what was left out of the analysis
print(f"Empty alignment files: {len(empty_aln_files)} files: {empty_aln_files}")
print(f"Skipped {len(skip_tree_ogs)} trees because they had less than 2 branches with/without transfers: {skip_tree_ogs}")

# keep a record of the skipped trees, one NOG per line
skip_tree_ogs_filepath = os.path.join(dnds_dir, 'skipped_tree_ogs.txt')
with open(skip_tree_ogs_filepath, 'w') as f:
    f.write('\n'.join(skip_tree_ogs) + '\n')

# alignment/tree input pairs, one TSV per tree labelling (fg, bg, unlabelled)
busted_input_filepaths_fg_tsv = os.path.join(dnds_dir, 'map.aln_treefile_input_filepaths.fg.tsv')
_write_filepath_pairs(busted_input_filepaths_fg_tsv, aln_treefile_fg_filepaths)

busted_input_filepaths_bg_tsv = os.path.join(dnds_dir, 'map.aln_treefile_input_filepaths.bg.tsv')
_write_filepath_pairs(busted_input_filepaths_bg_tsv, aln_treefile_bg_filepaths)

busted_input_filepaths_tsv = os.path.join(dnds_dir, 'map.aln_treefile_input_filepaths.tsv')
_write_filepath_pairs(busted_input_filepaths_tsv, aln_treefile_filepaths)

# per-NOG branch counts: one row per NOG, with the NOG ID in a 'nog_id' column
branch_count_dict_filepath = os.path.join(dnds_dir, 'branch_count.tsv')
branch_count_df = pd.DataFrame.from_dict(branch_count_dict, orient='index')
branch_count_df = branch_count_df.reset_index()
branch_count_df = branch_count_df.rename(columns={'index': 'nog_id'})
branch_count_df.to_csv(branch_count_dict_filepath, sep='\t', index=False)


Converting protein alignments to nucleotide alignments: 100%|██████████| 3718/3718 [00:09<00:00, 401.56it/s]
Preparing hyphy tree files and rewriting taxa IDs: 100%|██████████| 3718/3718 [00:22<00:00, 166.67it/s]

Empty alignment files: 38 files: ['/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/D3Q46.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/D5CU4.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/COG4822.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/D133R.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/COG3080.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/COG1939.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/COG4161.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/D4X8R.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/D50IS.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/COG3417.aln.fna', '/root/work/projects/hgt_ecosystem/data/filtered/single_copy_nog_algs/COG5567.aln.fna', '/root/w




In [8]:
from Bio import Phylo

gtree = ete3.Tree(genome_tree_filepath, format=1)
# first we label the leaves of the genome tree with 'L' to match the taxa IDs in the concatenated alignment
for leaf in gtree:
    leaf.name = 'L' + leaf.name

# Apply the same 'L' prefix rule to the branch names here (names starting with 'L'
# or 'N' are kept as they are), so that this cell does not depend on the lists having
# already been rewritten in place by an earlier cell. The rule is idempotent, so
# already-prefixed names are left untouched. Sets make the per-node lookups cheap.
test_branch_names = {
    name if name.startswith(('L', 'N')) else 'L' + name
    for name in branches_w_transfers
}
reference_branch_names = {
    name if name.startswith(('L', 'N')) else 'L' + name
    for name in branches_wo_transfers
}

# then we add {Test} to the branches with transfers and {Reference} to the branches without transfers
for node in gtree.traverse():
    if node.name in test_branch_names:
        node.name += '{Test}'
    elif node.name in reference_branch_names:
        node.name += '{Reference}'

# write this tree to a newick file
output_genome_tree_filepath = os.path.join(dnds_dir, 'genome_tree.iqtree.treefile.rooted.labeled.reference_test.newick')
gtree.write(outfile=output_genome_tree_filepath, format=1)