## Imports

## Variables

# Subset Query Sequences

## Filter out all the bacterial and archaeal sequences from the OTUs

In [None]:
# Functions

# Extract sequence IDs with Bacteria or Archaea in their taxonomic assignment
def find_bacteria_archaea(tax_assign_dir, file):
    '''
    Extract sequence IDs with Bacteria or Archaea in their taxonomic assignment.

    The file is read line by line as tab-separated text. The first column is
    taken as the sequence ID and the second column as the taxonomy string. An
    ID is selected when its taxonomy string contains 'k:Bacteria' or
    'k:Archaea'. Lines with fewer than two columns (e.g. blank lines) are
    skipped, and an ID that appears on several lines is reported once.

    Parameters:
    - tax_assign_dir (str): Directory path containing taxonomic assignment files.
    - file (str): Name of the taxonomic assignment file inside tax_assign_dir.

    Returns:
    - bacterial_and_archaeal_ids (set): Set of sequence IDs with either Bacteria or Archaea in their taxonomic assignment.
    '''
    kingdom_tags = ('k:Bacteria', 'k:Archaea')
    bacterial_and_archaeal_ids = set()
    with open(os.path.join(tax_assign_dir, file), "r") as f:
        for line in f:
            parts = line.strip().split("\t")
            # A blank or single-column line has no taxonomy field to inspect.
            if len(parts) < 2:
                continue
            seq_id, taxonomy = parts[0], parts[1]
            if any(tag in taxonomy for tag in kingdom_tags):
                bacterial_and_archaeal_ids.add(seq_id)
    return bacterial_and_archaeal_ids

# Filter fasta files
def filter_fasta_file(input_file, output_file, ids_to_exclude):
    '''
    Write a copy of a fasta file that omits the records whose ID is excluded.

    Parameters:
    - input_file (str): Path to the input fasta file.
    - output_file (str): Path where the filtered fasta file should be saved.
    - ids_to_exclude (iterable of str): Sequence IDs to exclude. Any iterable
      (set, list, tuple, ...) is accepted; it is converted to a set internally.
    '''
    # Build the lookup once so each record costs a single hash probe, even
    # when the caller hands in a list.
    excluded = frozenset(ids_to_exclude)
    # The kept records are collected in a list, so the input is read in full
    # before SeqIO.write is called on the output path.
    sequences = [
        record
        for record in SeqIO.parse(input_file, "fasta")
        if record.id not in excluded
    ]
    SeqIO.write(sequences, output_file, "fasta")

In [None]:
# Sample identifiers to process; each one is substituted into the per-sample
# file names used below.
samples = [
    'A3',
    'NH1',
    'NH4',
    'Sim17',
    'Sim22',
    'Th16',
    'Th38',
    'Th40',
    'X17007',
]

In [None]:
# Usage: for every sample, drop the OTU sequences assigned to Bacteria or Archaea

for sample in samples:

    # 1) Collect the IDs flagged as Bacteria/Archaea in this sample's taxonomy table
    file_name = f"blast6_{sample}_18S.tab"
    bacterial_and_archaeal_ids = find_bacteria_archaea(
        tax_assign_dir=tax_assign_dir,
        file=file_name,
    )

    # 2) Resolve where the OTU fasta is read from and where the filtered copy goes
    input_fasta_file = os.path.join(otu_nonchimeric_dir, f"{sample}_18S_otu.fasta")
    output_fasta_file = os.path.join(otu_eukaryotes_dir, f"{sample}_18S_otu_filtered.fasta")

    # 3) Write the fasta without the flagged sequences
    filter_fasta_file(
        input_file=input_fasta_file,
        output_file=output_fasta_file,
        ids_to_exclude=bacterial_and_archaeal_ids,
    )