In [2]:
# imports
import os
import re

# Merge and filter sequences from the OTU_nonchimeric files

In [3]:
# Variables
# Root directory from which the data paths below are built. The value is a
# relative path, so it is resolved against the kernel's working directory.
raw_data = '../raw_data'

In [4]:
def tester(seq_path):
    """
    Count the number of headers and sequences in a sequence file.

    Args:
        seq_path (str): The path to the sequence file.

    Returns:
        str: A string containing the count of headers and sequences in the format:
             "number of headers: <count>\nnumber of sequences: <count>"
    """
    with open(seq_path, 'rt') as f:
        lines = f.readlines()
        count_headers = 0
        count_seqs = 0
        for line in lines:
            if line.startswith('>'):
                count_headers += 1
                if not line.rstrip('\n')[-1].isalnum():
                    print(f'something wrong with the sequence:\n{line}')
            elif line.rstrip('\n').isupper():
                    count_seqs += 1
            else:
                print(f'something wrong with the sequence:\n{line}')
        return f'number of headers: {count_headers}\nnumber of sequences: {count_seqs}'

## Create a single FASTA file for OTU_nonchimeric sequences per project (so all the samples within a single project and marker)

In [None]:
# Merge every per-sample OTU FASTA file of one project/marker into a single
# FASTA file. Headers are rewritten to
#     >{id}_{sample_name}_{project}_abund={abundance}
# and sequence lines are upper-cased.
project = 'Suthaus_2022'
cell = 'cellCombined'
marker = 'Full18S'
sim = 'sim97'
otu_chim_filt = f'{raw_data}/OTU_nonchimeric/{project}/{marker}/{cell}/{sim}'

# Exact affixes removed from a file name to obtain the sample name.
file_prefix = 'otu_'
file_suffix = '.fasta'
abundance_prefix = 'seqs='

# Sorted so the merged file has the same record order on every run
# (os.listdir returns entries in arbitrary order).
file_names = sorted(os.listdir(otu_chim_filt))
file_paths = [otu_chim_filt + '/' + file_name for file_name in file_names if file_name.startswith('otu')]

adjusted_seqs = []

for file in file_paths:
    # Sample name taken from the file name: "otu_<sample>.fasta" -> "<sample>".
    # The affixes are removed as whole strings; str.lstrip/str.rstrip treat
    # their argument as a *set of characters* and would also eat leading or
    # trailing letters of the sample name itself (e.g. "otu_out1" -> "1").
    file_sample_name = os.path.basename(file)
    if file_sample_name.startswith(file_prefix):
        file_sample_name = file_sample_name[len(file_prefix):]
    if file_sample_name.endswith(file_suffix):
        file_sample_name = file_sample_name[:-len(file_suffix)]

    with open(file, 'rt') as f:
        for line in f:
            if not line.startswith('>'):
                # Sequence line: normalise to upper case, exactly one newline.
                adjusted_seqs.append(line.upper().rstrip('\n') + '\n')
                continue

            # Drop everything up to and including the second '=' of the header.
            header = re.sub('^.*?=.*?=', '', line)

            # The abundance is the last ';'-separated field, e.g. "seqs=12".
            abundance_field = header.split(';')[-1].strip()
            if abundance_field.startswith(abundance_prefix):
                abundance_field = abundance_field[len(abundance_prefix):]
            abundance = f'abund={abundance_field}'

            if project.startswith('Jamy_'):
                # First field looks like "<sample_name>.<id>".
                sample_name_and_id = header.split(';')[0]
                sample_name = sample_name_and_id.split('.')[0]
                id_ = sample_name_and_id.split('.')[1]
            elif project.startswith('Suthaus_'):
                # The id is the second '/'-separated part of the header.
                id_ = header.split('/')[1]
                sample_name = file_sample_name
            else:
                # Fail loudly instead of reusing id_/sample_name left over
                # from a previous header or a previous cell run.
                raise ValueError(f'No header parsing rule for project: {project}')

            adjusted_seqs.append(f'>{id_}_{sample_name}_{project}_{abundance}\n')

with open(f'{otu_chim_filt}/{project}_{marker}_{sim}_all_seqs.fasta', 'w') as fp:
    fp.writelines(adjusted_seqs)

In [None]:
# Sanity check: print the header and sequence line counts of the merged file.
print(tester(f'{otu_chim_filt}/{project}_{marker}_{sim}_all_seqs.fasta'))

## Filter singletons from the single FASTA file for OTU_nonchimeric sequences

In [None]:
# Keep only the records of the merged FASTA file whose abundance (the integer
# after the last '=' in the header) is greater than `threshold`.
project = 'Suthaus_2022'
cell = 'cellCombined'
marker = 'Full18S'
sim = 'sim97'
otu_chim_filt = f'{raw_data}/OTU_nonchimeric/{project}/{marker}/{cell}/{sim}'
all_seqs_file = f'{otu_chim_filt}/{project}_{marker}_{sim}_all_seqs.fasta'
threshold = 1
seqs_nonsingl = []


# filter singletons
# The file is read front to back and each header decides whether the lines
# that follow it are kept. This preserves the input record order, keeps every
# line of a sequence that is wrapped over several lines, and never reuses a
# sequence left over from another record.
keep_record = False
with open(all_seqs_file, 'rt') as f:
    for line in f:
        if line.startswith('>'):
            abundance = int(line.rsplit('=', 1)[-1].strip())
            keep_record = abundance > threshold
        if keep_record:
            seqs_nonsingl.append(line)

# write the file into fasta
with open(f'{otu_chim_filt}/{project}_{marker}_{sim}_all_seqs_filtered.fasta', 'w') as fp:
    fp.writelines(seqs_nonsingl)

In [None]:
# Sanity check: print the header and sequence line counts of the filtered file.
print(tester(f'{otu_chim_filt}/{project}_{marker}_{sim}_all_seqs_filtered.fasta'))

# Merge and filter sequences from the extracted_18S files

## Create a single FASTA file for extracted sequences per project (so all the samples within a single project and marker)

In [None]:
# Merge every per-sample extracted-18S FASTA file of one project/marker into a
# single FASTA file. Headers are rewritten to
#     >{id}_{sample_name}_{project}_abund={abundance}
# and sequence lines are upper-cased.
project = 'Jamy_2022'
cell = 'cell'
marker = 'rDNA'
sim = 'sim97'
extracted_18S = f'{raw_data}/extracted_18S/{project}/{marker}/{cell}/{sim}'

# Exact affixes removed from a file name to obtain the sample name.
file_prefix = 'extracted_18S_'
file_suffix = '.fasta'
abundance_prefix = 'seqs='

# Sorted so the merged file has the same record order on every run
# (os.listdir returns entries in arbitrary order).
file_names = sorted(os.listdir(extracted_18S))
file_paths = [extracted_18S + '/' + file_name for file_name in file_names if file_name.startswith(file_prefix)]


adjusted_seqs = []

for file in file_paths:
    # Sample name taken from the file name:
    # "extracted_18S_<sample>.fasta" -> "<sample>".
    # The affixes are removed as whole strings; str.lstrip/str.rstrip treat
    # their argument as a *set of characters* and would also eat leading or
    # trailing letters of the sample name itself.
    file_sample_name = os.path.basename(file)
    if file_sample_name.startswith(file_prefix):
        file_sample_name = file_sample_name[len(file_prefix):]
    if file_sample_name.endswith(file_suffix):
        file_sample_name = file_sample_name[:-len(file_suffix)]

    with open(file, 'rt') as f:
        for line in f:
            if not line.startswith('>'):
                # Sequence line: normalise to upper case, exactly one newline.
                adjusted_seqs.append(line.upper().rstrip('\n') + '\n')
                continue

            # Drop everything up to and including the second '=' of the header.
            header = re.sub('^.*?=.*?=', '', line)

            # The abundance is the last ';'-separated field, e.g. "seqs=12".
            abundance_field = header.split(';')[-1].strip()
            if abundance_field.startswith(abundance_prefix):
                abundance_field = abundance_field[len(abundance_prefix):]
            abundance = f'abund={abundance_field}'

            if project.startswith('Jamy_'):
                # First field looks like "<sample_name>.<id>".
                sample_name_and_id = header.split(';')[0]
                sample_name = sample_name_and_id.split('.')[0]
                id_ = sample_name_and_id.split('.')[1]
            elif project.startswith('Suthaus_'):
                # The id is the second '/'-separated part of the header.
                id_ = header.split('/')[1]
                sample_name = file_sample_name
            else:
                # Fail loudly instead of reusing id_/sample_name left over
                # from a previous header or a previous cell run.
                raise ValueError(f'No header parsing rule for project: {project}')

            adjusted_seqs.append(f'>{id_}_{sample_name}_{project}_{abundance}\n')

with open(f'{extracted_18S}/{project}_{marker}_{sim}_all_seqs.fasta', 'w') as fp:
    fp.writelines(adjusted_seqs)

In [None]:
# Sanity check: print the header and sequence line counts of the merged file.
print(tester(f'{extracted_18S}/{project}_{marker}_{sim}_all_seqs.fasta'))

## Filter singletons from the single FASTA file for extracted 18S sequences

In [None]:
# Keep only the records of the merged FASTA file whose abundance (the integer
# after the last '=' in the header) is greater than `threshold`.
project = 'Jamy_2022'
cell = 'cell'
marker = 'rDNA'
sim = 'sim97'
extracted_18S = f'{raw_data}/extracted_18S/{project}/{marker}/{cell}/{sim}'
all_seqs_file = f'{extracted_18S}/{project}_{marker}_{sim}_all_seqs.fasta'
threshold = 1
seqs_nonsingl = []


# filter singletons
# The file is read front to back and each header decides whether the lines
# that follow it are kept. This preserves the input record order, keeps every
# line of a sequence that is wrapped over several lines, and never reuses a
# sequence left over from another record.
keep_record = False
with open(all_seqs_file, 'rt') as f:
    for line in f:
        if line.startswith('>'):
            abundance = int(line.rsplit('=', 1)[-1].strip())
            keep_record = abundance > threshold
        if keep_record:
            seqs_nonsingl.append(line)

# write the file into fasta
with open(f'{extracted_18S}/{project}_{marker}_{sim}_all_seqs_filtered.fasta', 'w') as fp:
    fp.writelines(seqs_nonsingl)

In [None]:
# Sanity check of the singleton-filtered file written by the filtering cell
# (the unfiltered file is checked right after the merge step).
print(tester(f'{extracted_18S}/{project}_{marker}_{sim}_all_seqs_filtered.fasta'))

# Merging the all_seqs fasta files for individual projects into a single fasta file

## Unfiltered sequences (singletons retained)

In [5]:
# Concatenate the per-project FASTA files into one FASTA file.

# List of input files
input_files = [
    f'{raw_data}/extracted_18S/Jamy_2019/rDNA/cell/sim97/Jamy_2019_rDNA_sim97_all_seqs.fasta',
    f'{raw_data}/extracted_18S/Jamy_2022/rDNA/cell/sim97/Jamy_2022_rDNA_sim97_all_seqs.fasta',
    f'{raw_data}/OTU_filtered/Suthaus_2022/Full18S/cellCombined/sim97/qiime/otu_seqs_filtered.fasta',
    # NOTE(review): the Suthaus input above comes from OTU_filtered, while the
    # alternative below (kept disabled, as in the original) is the merged
    # OTU_nonchimeric file - confirm which one this section is meant to use.
    # f'{raw_data}/OTU_nonchimeric/Suthaus_2022/Full18S/cellCombined/sim97/Suthaus_2022_Full18S_sim97_all_seqs.fasta',
]


# Output file
output_file = f'{raw_data}/OTU_results/all_projects/all_seqs.fasta'

# 'w' mode truncates an existing output file, so a rerun always starts from an
# empty file and no separate remove step is needed.
with open(output_file, 'w') as outfile:
    for file in input_files:
        with open(file, 'r') as infile:
            content = infile.read()
        outfile.write(content)
        # An input that does not end with a newline would otherwise have its
        # last sequence line fused with the first header of the next file.
        if content and not content.endswith('\n'):
            outfile.write('\n')

In [None]:
# Sanity check: print the header and sequence line counts of the combined file.
print(tester(f'{raw_data}/OTU_results/all_projects/all_seqs.fasta'))

In [None]:
# NOTE(review): no cell visible in this notebook writes all_seqs_test.fasta -
# confirm the file exists, otherwise this check fails on a fresh run.
print(tester(f'{raw_data}/OTU_results/all_projects/all_seqs_test.fasta'))

## Filtered sequences (singletons filtered out)

In [None]:
# Concatenate the singleton-filtered per-project FASTA files into one FASTA file.

# List of input files
input_files = [
    f'{raw_data}/extracted_18S/Jamy_2019/rDNA/cell/sim97/Jamy_2019_rDNA_sim97_all_seqs_filtered.fasta',
    f'{raw_data}/extracted_18S/Jamy_2022/rDNA/cell/sim97/Jamy_2022_rDNA_sim97_all_seqs_filtered.fasta',
    f'{raw_data}/OTU_nonchimeric/Suthaus_2022/Full18S/cellCombined/sim97/Suthaus_2022_Full18S_sim97_all_seqs_filtered.fasta',
]


# Output file
output_file = f'{raw_data}/OTU_results/all_projects/all_seqs_filtered.fasta'

# 'w' mode truncates an existing output file, so a rerun always starts from an
# empty file and no separate remove step is needed.
with open(output_file, 'w') as outfile:
    for file in input_files:
        with open(file, 'r') as infile:
            content = infile.read()
        outfile.write(content)
        # An input that does not end with a newline would otherwise have its
        # last sequence line fused with the first header of the next file.
        if content and not content.endswith('\n'):
            outfile.write('\n')

In [None]:
# Sanity check: print the header and sequence line counts of the combined filtered file.
print(tester(f'{raw_data}/OTU_results/all_projects/all_seqs_filtered.fasta'))