## Imports

In [1]:
import os
import sys

python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from subset_query_seqs import find_bacteria_archaea, filter_fasta_file
from toolbox import num_seqs

## Variables

In [2]:
# Run identifiers: which project and which processing parameters select the data.
project = 'Suthaus_2022'
marker = 'Full18S'
sim = 'sim90'
denoise_method = 'RAD'

# All data directories sit under raw_data and share the same
# <project>/<marker>/<sim>/<denoise_method> tail, so build that tail once.
raw_data = os.path.join('..', 'raw_data')
run_subpath = os.path.join(project, marker, sim, denoise_method)

# Taxonomy assignment results, the original OTUs, and the destination for filtered OTUs.
tax_assign_dir = os.path.join(raw_data, 'tax_assign_results', run_subpath)
otu_original_dir = os.path.join(raw_data, 'OTU_original', run_subpath)
otu_eukaryotes_dir = os.path.join(raw_data, 'OTU_eukaryotes', run_subpath)

# Subset Query Sequences

## Filter out all the bacterial and archaeal sequences from the OTUs

In [3]:
# Sample identifiers to process; edit this list to change which samples get filtered.
samples = [
    'A3',
    'NH1',
    'NH4',
    'Sim17',
    'Sim22',
    'Th16',
    'Th38',
    'Th40',
    'X17007',
]

In [4]:
# Make sure the destination directory exists before any output file is written.
# exist_ok=True makes this a no-op when it is already there, so the cell stays re-runnable.
os.makedirs(otu_eukaryotes_dir, exist_ok=True)

# Process each sample independently: collect the IDs to exclude, then filter its FASTA file.
for sample in samples:

    # Step 1: Extract sequence IDs with Bacteria or Archaea from the sample's
    # taxonomy assignment file (<sample>_blast6.tab inside tax_assign_dir).
    bacterial_and_archaeal_ids = find_bacteria_archaea(tax_assign_dir, f'{sample}_blast6.tab')

    # Define input and output paths for fasta files
    input_fasta_file = os.path.join(otu_original_dir, f'{sample}_18S_otu.fasta')
    output_fasta_file = os.path.join(otu_eukaryotes_dir, f'{sample}_18S_otu_filtered.fasta')

    # Step 2: Filter the fasta file and save the result to the designated directory.
    # NOTE(review): presumably filter_fasta_file drops the records whose IDs are in
    # bacterial_and_archaeal_ids — confirm against subset_query_seqs.
    filter_fasta_file(input_fasta_file, output_fasta_file, bacterial_and_archaeal_ids)