In [1]:
# imports
from Bio import SeqIO
from Bio import Entrez
from dotenv import load_dotenv
import pandas as pd
import os
import re
import sys
import openpyxl


# Custom functions
python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from low_similarity_otus import extract_data_from_filename, read_and_augment_csv, get_taxonomy_from_genbank, add_taxopath_column

In [2]:
# variables
project = 'Suthaus_2022'
cell = 'cellCombined'
sim = 'sim90'
marker = 'Full18S'
denoise_method = 'RAD'
raw_data = os.path.join('..', 'raw_data')
otu_results = os.path.join(raw_data, 'OTU_results', project)
load_dotenv(os.path.join('..', 'secrets.env'))
fasta_dir = os.path.join(raw_data, 'OTU_nonchimeric', project, marker, cell, sim, denoise_method)
low_similarity_dir = os.path.join(raw_data, 'low_similarity_OTUs', project, marker, denoise_method, sim)
blast_results_dir = os.path.join(low_similarity_dir, 'BLASTn')

# Create FASTA file with the low similarity OTUs

## Load data into dataframe

In [None]:
tax_assign_df = pd.read_csv(os.path.join(otu_results, 'final_tax_table_Full18S_sim90.csv'))

## Subset the dataframe based on the similarity threshold

In [None]:
similarity_threshold = 80

low_assign_df = tax_assign_df[tax_assign_df['Pident'] < similarity_threshold]

## Create FASTA file for the OTUs in the subset dataframe

In [None]:
# Create a copy of the dataframe
low_assign_df = low_assign_df.copy()

# Format the OTU ID
low_assign_df['OTU_FASTA_id'] = 'centroid=' + low_assign_df['OTU'] + ';seqs=' + low_assign_df['OTU_Num'].astype(str)

# List of OTUs to extract
otus_to_extract = low_assign_df['OTU_FASTA_id'].tolist()  # replace 'OTU_id_column' with the name of the column with OTU IDs

# Maintain a set of sequence IDs that have been added to the list
added_ids = set()

# Initialize an empty list to store sequences
sequences = []

# Loop through each file in the directory
for fasta_file in os.listdir(fasta_dir):
    # Ensure you only process .fasta files
    if fasta_file.endswith('.fasta') and fasta_file != 'Mock_18S_otu.fasta':
        file_path = os.path.join(fasta_dir, fasta_file)
        
        # Parse the FASTA file and extract sequences as before
        for record in SeqIO.parse(file_path, 'fasta'):
            if record.id in otus_to_extract and record.id not in added_ids:
                sequences.append(record)
                added_ids.add(record.id)

# Writing sequences to a new FASTA file
SeqIO.write(sequences, os.path.join(low_similarity_dir, 
                                    'FASTA', 
                                    f'low_similarity_OTUs_TH{str(similarity_threshold)}.fasta'), 
                                    'fasta')

# Creating a single table with all the BLASTn search

Checking the OTUs with the low percentage similarity to the reference sequences. In other words, checking OTUs that are not similar or close to the known sequences in the database used.

## Defining variables

In [5]:
# Files
blast_tables = [blast_table for blast_table in os.listdir(blast_results_dir) if blast_table.endswith('.tsv')]

# List of dataframes
dfs = []

# Regex patterns for extracting data from the CSV table names:
# Sequence ID:
reg_sequence_id = 'seq\d+_\d+'
# Taxon name:
reg_taxon_name = '[^_]+(?:_[^_]+)*'
# Percentage coverage from taxonomical assignement:
reg_score_tax_assign = '\d+\.\d+'
# Sample name:
reg_sample_name = '\w+'
# Final regex pattern

pattern = re.compile(fr'^({reg_sequence_id})_({reg_taxon_name})_({reg_score_tax_assign})_({reg_sample_name})\.tsv$')

## Creating the table

In [6]:
# 2. Iterate over each file
for blast_table in blast_tables:
    # Extracting details from filename
    details = extract_data_from_filename(blast_table, pattern)
    if not details:
        continue
    sequence_id, taxon_name, score_tax_assign, sample_name = details
    
    # Reading and augmenting dataframe
    df = read_and_augment_csv(os.path.join(blast_results_dir, blast_table), *details)
    dfs.append(df)

# Merge all the dataframes together
blastn_df = pd.concat(dfs, ignore_index=True)

In [7]:
blastn_df

Unnamed: 0,Sequence_id,Sample,Tax_assignment,Score_tax_assign,BLASTn,Max_Ident,Score,E_value,Hit_accession
0,seq223_11,X17007,Olkasia_polycarbonata,72.2,Gaulosia striata,89.22%,2675.680,0.000000e+00,OQ331014
1,seq223_11,X17007,Olkasia_polycarbonata,72.2,Ploeotid sp.,91.36%,1096.830,0.000000e+00,MK239307
2,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.58%,1075.190,0.000000e+00,KY947150
3,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.56%,1074.290,0.000000e+00,KY947155
4,seq223_11,X17007,Olkasia_polycarbonata,72.2,Uncultured marine,73.58%,1066.180,0.000000e+00,KX189158
...,...,...,...,...,...,...,...,...,...
495,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,74.74%,384.502,5.959290e-101,AF372801
496,seq125_12,Th40,Caliculium_glossobalani,77.4,Apicomplexa sp.,71.47%,380.895,7.259900e-100,KC890798
497,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured Gregarina,74.44%,379.994,7.259900e-100,MG766260
498,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,79.29%,375.485,3.086990e-98,AF372709


## Adding the taxopath for each sequence using Entrez APIs

In [8]:
Entrez.email = os.getenv('ENTREZ_EMAIL')

# Add the taxopath column (can take some time)
blastn_df = add_taxopath_column(blastn_df)

Processing GenBank ID: OQ331014
Processing GenBank ID: MK239307
Processing GenBank ID: KY947150
Processing GenBank ID: KY947155
Processing GenBank ID: KX189158
Processing GenBank ID: KY947157
Processing GenBank ID: KX189173
Processing GenBank ID: KJ757308
Processing GenBank ID: AY665087
Processing GenBank ID: KX189160
Processing GenBank ID: KX189167
Processing GenBank ID: KX189170
Processing GenBank ID: KJ757736
Processing GenBank ID: KX189157
Processing GenBank ID: KJ760279
Processing GenBank ID: KJ757377
Processing GenBank ID: KX189137
Processing GenBank ID: KX189164
Processing GenBank ID: KJ762734
Processing GenBank ID: KX189134
Processing GenBank ID: OQ331014
Processing GenBank ID: MK239307
Processing GenBank ID: AY425009
Processing GenBank ID: KX189125
Processing GenBank ID: KX189169
Processing GenBank ID: KX189123
Processing GenBank ID: KJ757646
Processing GenBank ID: KX189126
Processing GenBank ID: AF380996
Processing GenBank ID: MF422192
Processing GenBank ID: AY425011
Processi

Processing GenBank ID: GQ184295
Processing GenBank ID: GU290092
Processing GenBank ID: LC669580
Processing GenBank ID: AY224692
Processing GenBank ID: AF274257
Processing GenBank ID: KC787453
Processing GenBank ID: KC787375
Processing GenBank ID: KF378556
Processing GenBank ID: KC787491
Processing GenBank ID: FJ222082
Processing GenBank ID: KJ764339
Processing GenBank ID: KJ763428
Processing GenBank ID: KJ763405
Processing GenBank ID: KJ763293
Processing GenBank ID: KJ763046
Processing GenBank ID: KJ762935
Processing GenBank ID: KJ762860
Processing GenBank ID: KJ759734
Processing GenBank ID: KJ568711
Processing GenBank ID: KC787515
Processing GenBank ID: KC787457
Processing GenBank ID: JX841996
Processing GenBank ID: JX841974
Processing GenBank ID: JX841849
Processing GenBank ID: OQ331009
Processing GenBank ID: OQ331014
Processing GenBank ID: KP306753
Processing GenBank ID: KY963138
Processing GenBank ID: MK239290
Processing GenBank ID: MK239288
Processing GenBank ID: AF380996
Processi

In [9]:
blastn_df

Unnamed: 0,Sequence_id,Sample,Tax_assignment,Score_tax_assign,BLASTn,Max_Ident,Score,E_value,Hit_accession,Taxopath
0,seq223_11,X17007,Olkasia_polycarbonata,72.2,Gaulosia striata,89.22%,2675.680,0.000000e+00,OQ331014,Eukaryota; Discoba; Euglenozoa; Euglenida; Eug...
1,seq223_11,X17007,Olkasia_polycarbonata,72.2,Ploeotid sp.,91.36%,1096.830,0.000000e+00,MK239307,Eukaryota; Discoba; Euglenozoa; Euglenida; Plo...
2,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.58%,1075.190,0.000000e+00,KY947150,Eukaryota; Discoba; Euglenozoa; Diplonemea
3,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.56%,1074.290,0.000000e+00,KY947155,Eukaryota; Discoba; Euglenozoa; Diplonemea
4,seq223_11,X17007,Olkasia_polycarbonata,72.2,Uncultured marine,73.58%,1066.180,0.000000e+00,KX189158,Eukaryota; Discoba; Euglenozoa; Diplonemea; en...
...,...,...,...,...,...,...,...,...,...,...
495,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,74.74%,384.502,5.959290e-101,AF372801,Eukaryota; environmental samples
496,seq125_12,Th40,Caliculium_glossobalani,77.4,Apicomplexa sp.,71.47%,380.895,7.259900e-100,KC890798,Eukaryota; Sar; Alveolata; Apicomplexa
497,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured Gregarina,74.44%,379.994,7.259900e-100,MG766260,Eukaryota; Sar; Alveolata; Apicomplexa; Conoid...
498,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,79.29%,375.485,3.086990e-98,AF372709,Eukaryota; environmental samples


## Save the table as Excel and TSV files

In [10]:
# Save to Excel
filename_excel = "blastn_results_table.xlsx"
blastn_df.to_excel(os.path.join(low_similarity_dir, 'tables', filename_excel), index=False)

In [11]:
# Save to TSV
filename_tsv = "blastn_results_table.tsv"
blastn_df.to_csv(os.path.join(low_similarity_dir, 'tables', filename_tsv), sep='\t', index=False)