## Imports

In [1]:
import os
import sys

# Custom functions
python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from subset_pr2_database import transform_taxonomy, verify_id_names, compare_length

## Variables

In [2]:
# Root folder of the raw inputs, one level above the notebook's directory
raw_data = os.path.join(os.pardir, 'raw_data')

# Folder holding the PR2 subset reference alignments and their companion files
_subset_parts = ('reference_alignments', 'pr2_subset')
subset_align_dir = os.path.join(raw_data, *_subset_parts)

# Creating taxon file

This file is used to assign taxonomic paths to taxa (tips) of the reference tree. The format is as follows. Each line assigns a taxonomic path to one taxon, and contains two columns: the taxon label (ID) as it appears in the tree, followed by the semicolon-separated taxonomic path. The two columns are separated by a tab character.
(More here: https://github.com/lczech/gappa/wiki/Subcommand:-assign)

We will need this file for the downstream GAPPA analyses of the phylogenetic placement results.

In [None]:
Example:
Seal    Eukaryota;Animalia;Chordata;Mammalia;Carnivora;Phocidae
Whale   Eukaryota;Animalia;Chordata;Mammalia;Cetartiodactyla;
Mouse   Eukaryota;Animalia;Chordata;Mammalia;Rodentia;Muridae
Human   Eukaryota;Animalia;Chordata;Mammalia;Primates;Hominidae
Chicken Eukaryota;Animalia;Chordata;Amphibia;Galliformes;Phasianidae
Frog    Eukaryota;Animalia;Chordata;Amphibia;Anura;Dendrobatidae
Loach   Eukaryota;Animalia;Chordata;Amphibia;Anura;Rhacophoridae
Cow     Eukaryota;Animalia;Chordata;Mammalia;Artiodactyla;Bovidae

We have the extracted sequence IDs in the 'sequence_full_names.txt' file. These IDs contain the taxon names and their taxopaths, so we just need to reformat them into the correct format (based on the example above).

## Compare taxon IDs in the reference alignment and 'sequence_full_names.txt' file

In [None]:
# Peek at the first line of the strict Gblocks reference alignment
refernce_alignment_path = os.path.join(subset_align_dir, 'reference_alignment_gblocksStrict.fasta')

with open(refernce_alignment_path, 'r') as alignment_handle:
    first_line = alignment_handle.readline()

# Lead with an empty line so the value stands out in the cell output
print(f"\n{first_line}")

In [3]:
# Reference alignment: display only its first line to see how sequence IDs look
ref_align_path = os.path.join(subset_align_dir, 'reference_alignment_gblocksStrict.fasta')

with open(ref_align_path, 'r') as alignment_handle:
    first_line = alignment_handle.readline()

# Lead with an empty line so the value stands out in the cell output
print(f"\n{first_line}")


>AB353770.1.1740_U



In [4]:
# sequence_full_names.txt file: display only its first line for comparison
full_names_path = os.path.join(subset_align_dir, 'sequence_full_names.txt')

with open(full_names_path, 'r') as names_handle:
    first_record = names_handle.readline()

# Lead with an empty line so the value stands out in the cell output
print(f"\n{first_record}")


AB353770.1.1740_U;tax=k:Eukaryota,d:TSAR,p:Alveolata-Dinoflagellata,c:Dinophyceae,o:Peridiniales,f:Kryptoperidiniaceae,g:Unruhdinium,s:Unruhdinium_kevei



So, we can see that the taxon names in the reference alignment correspond to the beginning of the names in the 'sequence_full_names.txt' file:

>AB353770.1.1740_U

>AB353770.1.1740_U;tax=...

Based on this observation, we can split each line of the 'sequence_full_names.txt' file at *;tax=* into the taxon ID and the taxopath, and then write the two parts to the taxon file separated by a tab character.
After this, we also need to adjust the taxopaths so they will match the pattern/format of the taxon file (see example above). For that, we need to do the following:
1. strip *;tax=*
2. change *,* to *;*
3. delete the *k:*, *d:*, *p:*, and so on

## Create taxon file

In [5]:
# Read from the "full names" file and write to a new "taxon" file.
# Each input record looks like '<taxon ID>;tax=<taxonomy>'; each output line is
# '<taxon ID><TAB><transformed taxonomy>'.
full_names_path = os.path.join(subset_align_dir, 'sequence_full_names.txt')
taxon_file_path = os.path.join(subset_align_dir, 'taxon_file.txt')

# Marker that separates the taxon ID from its taxonomy annotation
tax_separator = ';tax='

with open(full_names_path, 'r') as infile, open(taxon_file_path, 'w') as outfile:
    for line_number, line in enumerate(infile, start=1):
        record = line.strip()
        if not record:
            # Blank lines (e.g. a trailing empty line) carry no record; skip them
            # instead of failing on the split below.
            continue

        # Split only at the first marker so the taxonomy part is kept intact
        taxon_id, found_separator, taxonomy = record.partition(tax_separator)
        if not found_separator:
            # Same exception type as a failed unpacking, but it tells where the problem is
            raise ValueError(
                f"Line {line_number} of '{full_names_path}' does not contain "
                f"'{tax_separator}': {record!r}"
            )

        # transform_taxonomy is the project helper imported from subset_pr2_database
        transformed_taxonomy = transform_taxonomy(taxonomy)
        outfile.write(f"{taxon_id}\t{transformed_taxonomy}\n")

## Verification of the taxon file

### Verifying taxon depth

I want to ensure consistency in the taxonomic depth across all entries in the taxon file. We can achieve this by counting the number of taxa in each taxopath and then checking if this number is consistent across all lines in the taxon file.

In [6]:
taxon_file_path = os.path.join(subset_align_dir, 'taxon_file.txt')

# Distinct taxopath depths seen in the taxon file (one depth value per line)
taxa_counts = set()

with open(taxon_file_path, 'r') as taxon_handle:
    for record in taxon_handle:
        # Every line must consist of exactly two tab-separated fields: ID and taxopath
        _, lineage = record.strip().split("\t")
        taxa_counts.add(len(lineage.split(";")))

# A single distinct depth means every entry has the same number of ranks
if len(taxa_counts) != 1:
    found_counts = ', '.join(map(str, sorted(taxa_counts)))
    print(f"Inconsistent number of taxa in taxopath. Found counts: {found_counts}.")
else:
    print(f"All sequences have the same number of taxa in taxopath: {taxa_counts.pop()}.")


All sequences have the same number of taxa in taxopath: 8.


### Verifying the ID names

In [7]:
# Collect every Gblocks reference alignment in the subset directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the report order stable between runs and machines.
ref_align_names = sorted(
    file_ for file_ in os.listdir(subset_align_dir)
    if 'reference_alignment_gblocks' in file_
)
taxon_file_path = os.path.join(subset_align_dir, 'taxon_file.txt')

# Check the taxon file against each alignment in turn
for ref_align in ref_align_names:
    ref_align_path = os.path.join(subset_align_dir, ref_align)
    print(f'{ref_align}:')
    # verify_id_names is the project helper imported from subset_pr2_database;
    # presumably it prints the verdict itself (see the cell output) - confirm in the module
    verify_id_names(taxon_file_path = taxon_file_path,
                    ref_alignment_path = ref_align_path)
    print('\n')

reference_alignment_gblocksStrict.fasta:
The ID names in the taxon file match the ID names in the reference alignment.


reference_alignment_gblocksLoose.fasta:
The ID names in the taxon file match the ID names in the reference alignment.


reference_alignment_gblocksModerate.fasta:
The ID names in the taxon file match the ID names in the reference alignment.




### Verifying the length of the sequences/names in the reference alignment and taxon file

In [8]:
# Collect every Gblocks reference alignment in the subset directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the report order stable between runs and machines.
ref_align_names = sorted(
    file_ for file_ in os.listdir(subset_align_dir)
    if 'reference_alignment_gblocks' in file_
)
taxon_file_path = os.path.join(subset_align_dir, 'taxon_file.txt')

# Compare the number of entries in the taxon file with each alignment in turn
for ref_align in ref_align_names:
    ref_align_path = os.path.join(subset_align_dir, ref_align)
    # compare_length is the project helper imported from subset_pr2_database;
    # it returns a pair that is unpacked here as (taxon file count, alignment count)
    num_taxon_ids, num_ref_seqs = compare_length(taxon_file_path = taxon_file_path,
                                                 ref_alignment_path = ref_align_path)
    print(f'{ref_align}:')
    if num_taxon_ids == num_ref_seqs:
        print(f"Both the taxon file and the reference alignment contain {num_taxon_ids} sequences/ID names.")
    else:
        print(f"The taxon file contains {num_taxon_ids} sequences/ID names, while the reference alignment contains {num_ref_seqs} sequences/ID names.")
    print('\n')

reference_alignment_gblocksStrict.fasta:
Both the taxon file and the reference alignment contain 1371 sequences/ID names.


reference_alignment_gblocksLoose.fasta:
Both the taxon file and the reference alignment contain 1371 sequences/ID names.


reference_alignment_gblocksModerate.fasta:
Both the taxon file and the reference alignment contain 1371 sequences/ID names.


