## Imports

In [25]:
import os
from collections import Counter

## Variables

In [2]:
project = 'Suthaus_2022'
marker = 'Full18S'
sim = 'sim90'
denoise_method = 'RAD'
raw_data = os.path.join('..', 'raw_data')
otu_eukaryotes_dir = os.path.join(raw_data, 'OTU_eukaryotes', project, marker, sim, denoise_method)

# Rename and merge query sequences

In the next step, I want to merge fasta files and adjust their names. As I have fasta files per sample, I want to merge all the fasta files into one, so I can analyze all the samples at once. Moreover, to identify the OTU sample origin, I want to add the sample name to each OTU ID.

In [3]:
os.listdir(otu_eukaryotes_dir)

['A3_18S_otu_filtered.fasta',
 'Th40_18S_otu_filtered.fasta',
 'Th16_18S_otu_filtered.fasta',
 'X17007_18S_otu_filtered.fasta',
 'NH4_18S_otu_filtered.fasta',
 'NH1_18S_otu_filtered.fasta',
 'Sim22_18S_otu_filtered.fasta',
 'Th38_18S_otu_filtered.fasta',
 'Sim17_18S_otu_filtered.fasta',
 'merged_samples.fasta']

In [10]:
# Path to save the merged file
merged_fasta_path = os.path.join(otu_eukaryotes_dir, 'merged_samples.fasta')

# List of FASTA files
fasta_files = os.listdir(otu_eukaryotes_dir)

with open(merged_fasta_path, 'w') as merged_fasta:
    for file in fasta_files:
        # Extract sample name from the filename
        sample_name = file.split('_')[0]

        with open(os.path.join(otu_eukaryotes_dir, file), 'r') as infile:
            for line in infile:
                if line.startswith('>'):
                    # Modify the header line to include the sample name
                    merged_fasta.write(f'{line.strip()}_{sample_name}\n')
                else:
                    merged_fasta.write(line)

## Verification

I want to verify that the merging process went smoothly and we didn't miss any sequences or accidentally duplicate them.

### Merged file

Tally up the occurrences of the individual samples in the merged file. 

In [38]:
# Initialize dictionary with zero counts
merged_count = {sample.split('_')[0]:0 for sample in os.listdir(otu_eukaryotes_dir) if sample.endswith('.fasta') and 'merged_samples.fasta' not in sample}

# Use a Counter to tally up occurrences in the merged file
merged_counter = Counter()

with open(os.path.join(otu_eukaryotes_dir, 'merged_samples.fasta'), 'r') as infile:
    for line in infile:
        if line.startswith('>'):
            # Extract the sample name from the header
            sample_name = line.split('_')[-1].strip()
            merged_counter[sample_name] += 1

## Individual sample files

Now, to tally up occurrences in the individual FASTA files.

In [35]:
individual_counts = {sample.split('_')[0]:0 for sample in os.listdir(otu_eukaryotes_dir) if sample.endswith('.fasta') and 'merged_samples.fasta' not in sample}

for sample_file in os.listdir(otu_eukaryotes_dir):
    if sample_file.endswith('.fasta') and 'merged_samples.fasta' not in sample_file:
        sample_name = sample_file.split('_')[0]
        with open(os.path.join(otu_eukaryotes_dir, sample_file), 'r') as f:
            # Count the number of header lines (i.e., sequences)
            count = sum(1 for line in f if line.startswith('>'))
        individual_counts[sample_name] = count

{'A3': 140, 'Th40': 62, 'Th16': 115, 'X17007': 103, 'NH4': 44, 'NH1': 45, 'Sim22': 89, 'Th38': 81, 'Sim17': 124}


## Final comparison

In [49]:
for sample_name in individual_counts.keys():
    if merged_counter[sample_name] == individual_counts[sample_name]:
        print(f'The OTU numbers match between the merged and {sample_name} FASTA file!')
    else:
        print(f"""
        The OTU numbers DOESN'T match between the merged and {sample_name} FASTA file!
        Merge file: {merged_counter[sample_name]}
        Sample file: {individual_counts[sample_name]}
        """)

The OTU numbers match between the merged and A3 FASTA file!
The OTU numbers match between the merged and Th40 FASTA file!
The OTU numbers match between the merged and Th16 FASTA file!
The OTU numbers match between the merged and X17007 FASTA file!
The OTU numbers match between the merged and NH4 FASTA file!
The OTU numbers match between the merged and NH1 FASTA file!
The OTU numbers match between the merged and Sim22 FASTA file!
The OTU numbers match between the merged and Th38 FASTA file!
The OTU numbers match between the merged and Sim17 FASTA file!
