In [1]:
# imports
from Bio import SeqIO
from Bio import Entrez
from dotenv import load_dotenv
import pandas as pd
import os
import re
import sys
import openpyxl


# Custom functions
python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from low_similarity_otus import extract_data_from_filename, read_and_augment_csv, get_taxonomy_from_genbank, add_taxopath_column

In [3]:
# variables
project = 'Suthaus_2022'
cell = 'cellCombined'
sim = 'sim90'
marker = 'Full18S'
denoise_method = 'RAD'
raw_data = os.path.join('..', 'raw_data')
otu_results = os.path.join(raw_data, 'OTU_results', project)
load_dotenv(os.path.join('..', 'secrets.env'))
fasta_dir = os.path.join(raw_data, 'OTU_nonchimeric', project, marker, cell, sim, denoise_method)
low_similarity_dir = os.path.join(raw_data, 'low_similarity_OTUs', project, marker, denoise_method, sim)
blast_results_dir = os.path.join(low_similarity_dir, 'BLASTn')

# Create FASTA file with the low similarity OTUs

## Load data into dataframe

In [None]:
tax_assign_df = pd.read_csv(os.path.join(otu_results, 'final_tax_table_Full18S_sim90.csv'))

## Subset the dataframe based on the similarity threshold

In [None]:
similarity_threshold = 80

low_assign_df = tax_assign_df[tax_assign_df['Pident'] < similarity_threshold]

## Create FASTA file for the OTUs in the subset dataframe

In [None]:
# Create a copy of the dataframe
low_assign_df = low_assign_df.copy()

# Format the OTU ID
low_assign_df['OTU_FASTA_id'] = 'centroid=' + low_assign_df['OTU'] + ';seqs=' + low_assign_df['OTU_Num'].astype(str)

# List of OTUs to extract
otus_to_extract = low_assign_df['OTU_FASTA_id'].tolist()  # replace 'OTU_id_column' with the name of the column with OTU IDs

# Maintain a set of sequence IDs that have been added to the list
added_ids = set()

# Initialize an empty list to store sequences
sequences = []

# Loop through each file in the directory
for fasta_file in os.listdir(fasta_dir):
    # Ensure you only process .fasta files
    if fasta_file.endswith('.fasta') and fasta_file != 'Mock_18S_otu.fasta':
        file_path = os.path.join(fasta_dir, fasta_file)
        
        # Parse the FASTA file and extract sequences as before
        for record in SeqIO.parse(file_path, 'fasta'):
            if record.id in otus_to_extract and record.id not in added_ids:
                sequences.append(record)
                added_ids.add(record.id)

# Writing sequences to a new FASTA file
SeqIO.write(sequences, os.path.join(low_similarity_dir, 
                                    'FASTA', 
                                    f'low_similarity_OTUs_TH{str(similarity_threshold)}.fasta'), 
                                    'fasta')

# Creating a single table with all the BLASTn search

Checking the OTUs with the low percentage similarity to the reference sequences. In other words, checking OTUs that are not similar or close to the known sequences in the database used.

## Defining variables

In [7]:
# Files
blast_tables = [blast_table for blast_table in os.listdir(blast_results_dir) if blast_table.endswith('.tsv')]

# List of dataframes
dfs = []

# Regex patterns for extracting data from the CSV table names:
# Sequence ID:
reg_sequence_id = 'seq\d+_\d+'
# Taxon name:
reg_taxon_name = '[^_]+(?:_[^_]+)*'
# Percentage coverage from taxonomical assignement:
reg_score_tax_assign = '\d+\.\d+'
# Sample name:
reg_sample_name = '\w+'
# Final regex pattern
pattern = re.compile(fr'^(centroid={reg_sequence_id})_({reg_taxon_name})_({reg_score_tax_assign})_({reg_sample_name})\.tsv$')

## Creating the table

In [8]:
# 2. Iterate over each file
for blast_table in blast_tables:
    # Extracting details from filename
    details = extract_data_from_filename(blast_table, pattern)
    if not details:
        continue
    sequence_id, taxon_name, score_tax_assign, sample_name = details
    
    # Reading and augmenting dataframe
    df = read_and_augment_csv(os.path.join(blast_results_dir, blast_table), *details)
    dfs.append(df)

# Merge all the dataframes together
blastn_df = pd.concat(dfs, ignore_index=True)

Filename centroid=seq149_11;seqs=1.tsv did not match the pattern!
Filename centroid=seq302_14;seqs=1.tsv did not match the pattern!
Filename centroid=seq203_53;seqs=1.tsv did not match the pattern!
Filename centroid=seq29_39;seqs=4.tsv did not match the pattern!
Filename centroid=seq168_20;seqs=1.tsv did not match the pattern!
Filename centroid=seq395_6;seqs=1.tsv did not match the pattern!
Filename centroid=seq102_34;seqs=1.tsv did not match the pattern!
Filename centroid=seq437_6;seqs=1.tsv did not match the pattern!
Filename centroid=seq247_15;seqs=1.tsv did not match the pattern!
Filename centroid=seq173_41;seqs=17.tsv did not match the pattern!
Filename centroid=seq121_35;seqs=1.tsv did not match the pattern!
Filename centroid=seq64_18;seqs=1.tsv did not match the pattern!
Filename centroid=seq303_9;seqs=1.tsv did not match the pattern!
Filename centroid=seq112_23;seqs=1.tsv did not match the pattern!
Filename centroid=seq354_6;seqs=1.tsv did not match the pattern!
Filename centro

ValueError: No objects to concatenate

In [None]:
blastn_df

## Adding the taxopath for each sequence using Entrez APIs

In [None]:
Entrez.email = os.getenv('ENTREZ_EMAIL')

# Add the taxopath column (can take some time)
blastn_df = add_taxopath_column(blastn_df)

In [None]:
blastn_df

## Save the table as Excel and TSV files

In [None]:
# Save to Excel
filename_excel = "blastn_results_table.xlsx"
blastn_df.to_excel(os.path.join(low_similarity_dir, 'tables', filename_excel), index=False)

In [None]:
# Save to TSV
filename_tsv = "blastn_results_table.tsv"
blastn_df.to_csv(os.path.join(low_similarity_dir, 'tables', filename_tsv), sep='\t', index=False)