## Imports

In [3]:
from Bio import SeqIO
import os

## Variables

In [16]:
# Labels for this analysis run.
project = 'Suthaus_2022'
marker = 'Full18S'
sim = 'sim90'
denoise_method = 'RAD'

# Directory layout, relative to the notebook location.
raw_data = os.path.join('..', 'raw_data')
_reference_dir = os.path.join(raw_data, 'reference_alignments')
_pr2_dir = os.path.join(_reference_dir, 'pr2_v5')

# Input files: the PR2 reference database and the vampyrellid sequences.
original_pr2 = os.path.join(_pr2_dir, 'pr2_version_5.0.0_SSU_UTAX.fasta')
vamp_alignment = os.path.join(
    _reference_dir,
    'vamp',
    '2023_VAMPYRELLIDA_SSU_annotated_PR2_AM_AS_not_aligned_version.fasta',
)

# Output files, both written next to the original PR2 database.
adjusted_vamp_alignment = os.path.join(_pr2_dir, 'vamp_alignment.fasta')
modified_pr2 = os.path.join(_pr2_dir, 'pr2_version_5.0.0_SSU_UTAX_plus_vamp_2023.fasta')

# Removing vampyrellids from PR2 database

Before making the taxonomic assignment with VSEARCH, we adjust the PR2 reference database by enriching it with our own set of sequences belonging to the order Vampyrellida. To do so, we first remove all sequences assigned to Vampyrellida from the PR2 database and then append our Vampyrellida-specific sequences to it.

In [17]:
# Copy the PR2 database to a new FASTA file, leaving out every record whose
# header mentions "vampyrellida" (matched case-insensitively).
with open(original_pr2) as pr2_in, open(modified_pr2, 'w') as pr2_out:
    # Lazily select the records to keep so the database is streamed
    # rather than loaded into memory.
    kept_records = (
        entry
        for entry in SeqIO.parse(pr2_in, 'fasta')
        if 'vampyrellida' not in entry.description.lower()
    )
    SeqIO.write(kept_records, pr2_out, 'fasta')

# Adjusting the sequence IDs of our vampyrellid alignment to match the format used in the PR2 database

In [18]:
# Rewrite the headers of the vampyrellid sequences so that they carry a
# ';tax=...' annotation, then write the records to a new FASTA file.
def _pr2_style_id(header):
    """Build the new sequence ID from a pipe-delimited FASTA header.

    The header is split on '|'. The first field is kept as the leading part
    of the ID, the second field is dropped, and the third, fourth and fifth
    fields fill the ``f:``, ``g:`` and ``s:`` ranks of the taxonomy string.
    Any fields beyond the fifth are ignored.

    Raises:
        ValueError: if the header has fewer than five '|'-separated fields.
    """
    parts = header.split('|')
    if len(parts) < 5:
        # Fail with the offending header instead of an anonymous IndexError,
        # so a malformed record can be located in the input file.
        raise ValueError(
            f"Expected at least 5 '|'-separated fields in FASTA header, "
            f"got {len(parts)}: {header!r}"
        )
    taxonomic_hierarchy = (
        ';tax=k:Eukaryota,d:TSAR,p:Rhizaria-Cercozoa,c:Endomyxa,o:Vampyrellida,'
        f'f:{parts[2]},g:{parts[3]},s:{parts[4]}'
    )
    return parts[0] + taxonomic_hierarchy


with open(vamp_alignment, 'r') as infile, open(adjusted_vamp_alignment, 'w') as outfile:
    for record in SeqIO.parse(infile, 'fasta'):
        # The full header line (record.description) holds the pipe-delimited
        # fields; the new ID replaces it entirely.
        record.id = _pr2_style_id(record.description)
        record.description = ''  # Clear it so only the new ID is written

        # Write the adjusted record to the new file
        SeqIO.write(record, outfile, 'fasta')

# Adding vampyrellid-specific alignment to PR2 database

In [19]:
# Append the reformatted vampyrellid records to the end of the modified PR2 file.
# NOTE: the target is opened in append mode, so running this cell again
# adds the same records a second time.
with open(adjusted_vamp_alignment) as vamp_in, open(modified_pr2, 'a') as pr2_out:
    # SeqIO.write accepts an iterator of records, so the parsed stream is
    # handed over directly instead of looping record by record.
    SeqIO.write(SeqIO.parse(vamp_in, 'fasta'), pr2_out, 'fasta')