## Imports

In [1]:
import os

## Variables

In [4]:
project = 'Suthaus_2022'
marker = 'Full18S'
raw_data = os.path.join('..', 'raw_data')
ref_euk_align_dir = os.path.join(raw_data, 'reference_alignments', 'vamp_phylo_placement', 'eukaryotes', 'reference_alignment_2022')




# sim = 'sim90'
# denoise_method = 'RAD'
# raw_data = os.path.join('..', 'raw_data')
# original_pr2 = os.path.join(raw_data, 'reference_alignments', 'pr2_v5', 'pr2_version_5.0.0_SSU_UTAX.fasta')
# vamp_alignment = os.path.join(raw_data, 'reference_alignments', 'vamp', '2023_VAMPYRELLIDA_SSU_annotated_PR2_AM_AS_not_aligned_version.fasta')
# adjusted_vamp_alignment = os.path.join(raw_data, 'reference_alignments', 'pr2_v5', 'vamp_alignment.fasta')
# modified_pr2 = os.path.join(raw_data, 'reference_alignments', 'pr2_v5', 'pr2_version_5.0.0_SSU_UTAX_plus_vamp_2023.fasta')

# Create taxon file for eukaryotic reference tree/alignment

In [5]:
# List of Vampyrellida sequence identifiers
vampyrellida_ids = [
    "HE609038_LV03", "AY941200", "EU567294_Arachnula_A155_Baikal_Fw",
    "MF621963_VE01", "MF621964C", "HE609040_VL01", "HE609035_VP01",
    "KC779515_MVa1x_B5", "KibAr_NRRA_isolate_Cumbria_BrMa", "En42C_NRRA_isolate_Cumbria_BrMa",
    "CAraX_NRRA_isolate_California_Ma", "MH235264_MAC_1", "MW969735",
    "NVam1_NRRA_isolate_North_Carolina_Br", "OK381882_B3"
]

ref_align = os.path.join(ref_euk_align_dir, 'reference_alignment.phy')
output_tax_file = os.path.join(ref_euk_align_dir, 'taxon_file_new.txt')

# Open the phylip file and the output taxon file
with open(ref_align, 'r') as phylip_file, open(output_tax_file, 'w') as taxon_file:
    # Skip the first line of the phylip file as it contains the dimensions of the matrix
    next(phylip_file)

    # Iterate over the lines in the phylip file
    for line in phylip_file:
        # Extract the sequence ID (strip whitespace and take the first column)
        seq_id = line.split()[0].strip()

        # Determine the taxon category based on presence in vampyrellida_ids list
        taxon_category = "Vampyrellida" if seq_id in vampyrellida_ids else "Reference"

        # Write the seq_id and taxon_category to the taxon file, separated by a tab
        taxon_file.write(f"{seq_id}\t{taxon_category}\n")

print("Taxon file created successfully.")


Taxon file created successfully.


In [6]:
# Path to your taxon file
taxon_file_path = os.path.join(ref_euk_align_dir, 'taxon_file_new.txt')

# Read the taxon file and store assignments
taxon_assignments = {}
with open(taxon_file_path, 'r') as file:
    for line in file:
        identifier, clade = line.strip().split('\t')
        if identifier in taxon_assignments:
            print(f"Duplicate found: {identifier} is assigned to both {taxon_assignments[identifier]} and {clade}.")
        taxon_assignments[identifier] = clade

# Check for any identifiers assigned to both clades
for identifier, clade in taxon_assignments.items():
    if clade == 'Reference':
        continue
    if identifier in taxon_assignments and taxon_assignments[identifier] == 'Vampyrellida':
        print(f"Collision found: {identifier} is in both clades.")


Collision found: HE609038_LV03 is in both clades.
Collision found: AY941200 is in both clades.
Collision found: EU567294_Arachnula_A155_Baikal_Fw is in both clades.
Collision found: MF621963_VE01 is in both clades.
Collision found: MF621964C is in both clades.
Collision found: HE609040_VL01 is in both clades.
Collision found: HE609035_VP01 is in both clades.
Collision found: KC779515_MVa1x_B5 is in both clades.
Collision found: KibAr_NRRA_isolate_Cumbria_BrMa is in both clades.
Collision found: En42C_NRRA_isolate_Cumbria_BrMa is in both clades.
Collision found: CAraX_NRRA_isolate_California_Ma is in both clades.
Collision found: MH235264_MAC_1 is in both clades.
Collision found: MW969735 is in both clades.
Collision found: NVam1_NRRA_isolate_North_Carolina_Br is in both clades.
Collision found: OK381882_B3 is in both clades.


In [7]:
# Specify the path to the original taxon file and the output file
original_taxon_file_path = os.path.join(ref_euk_align_dir, 'taxon_file_new.txt')
corrected_taxon_file_path = os.path.join(ref_euk_align_dir, 'corrected_taxon_file.txt')

# List the identifiers that should only be labeled as 'Vampyrellida'
vampyrellida_ids = [
    "HE609038_LV03", "AY941200", "EU567294_Arachnula_A155_Baikal_Fw",
    "MF621963_VE01", "MF621964C", "HE609040_VL01", "HE609035_VP01",
    "KC779515_MVa1x_B5", "KibAr_NRRA_isolate_Cumbria_BrMa",
    "En42C_NRRA_isolate_Cumbria_BrMa", "CAraX_NRRA_isolate_California_Ma",
    "MH235264_MAC_1", "MW969735", "NVam1_NRRA_isolate_North_Carolina_Br",
    "OK381882_B3"
]

# Read the original taxon file and write the corrected entries to the new file
with open(original_taxon_file_path, 'r') as original, open(corrected_taxon_file_path, 'w') as corrected:
    for line in original:
        identifier, clade = line.strip().split('\t')
        # Write only the correct clade assignment for the listed vampyrellida_ids
        if identifier in vampyrellida_ids and clade == 'Reference':
            continue
        else:
            corrected.write(line)

print(f"Corrected taxon file written to {corrected_taxon_file_path}.")


Corrected taxon file written to ../raw_data/reference_alignments/vamp_phylo_placement/eukaryotes/reference_alignment_2022/corrected_taxon_file.txt.
