In [1]:
# imports
import pandas as pd
import os
import re
import sys
from Bio import Entrez
from dotenv import load_dotenv
import openpyxl

# Custom functions
python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from low_similarity_otus import extract_data_from_filename, read_and_augment_csv, get_taxonomy_from_genbank, add_taxopath_column

In [2]:
# variables
project = 'Suthaus_2022'
cell = 'cellCombined'
sim = 'sim90'
marker = 'Full18S'
denoise_method = 'RAD'
raw_data = os.path.join('..', 'raw_data')
blast_results_dir = os.path.join(raw_data, 'blast_results')
load_dotenv(os.path.join('..', 'secrets.env'))

True

# Creating a single table with all the BLASTn search

Checking the OTUs with the low percentage similarity to the reference sequences. In other words, checking OTUs that are not similar or close to the known sequences in the database used.

## Defining variables

In [3]:
# Files
blast_tables = [blast_table for blast_table in os.listdir(blast_results_dir) if blast_table.endswith('.tsv')]

# List of dataframes
dfs = []

# Regex patterns for extracting data from the CSV table names:
# Sequence ID:
reg_sequence_id = 'seq\d+_\d+'
# Taxon name:
reg_taxon_name = '[^_]+(?:_[^_]+)*'
# Percentage coverage from taxonomical assignement:
reg_score_tax_assign = '\d+\.\d+'
# Sample name:
reg_sample_name = '\w+'
# Final regex pattern
pattern = re.compile(fr'^({reg_sequence_id})_({reg_taxon_name})_({reg_score_tax_assign})_({reg_sample_name})\.tsv$')

## Creating the table

In [4]:
# 2. Iterate over each file
for blast_table in blast_tables:
    # Extracting details from filename
    details = extract_data_from_filename(blast_table, pattern)
    if not details:
        continue
    sequence_id, taxon_name, score_tax_assign, sample_name = details
    
    # Reading and augmenting dataframe
    df = read_and_augment_csv(os.path.join(blast_results_dir, blast_table), *details)
    dfs.append(df)

# Merge all the dataframes together
blastn_df = pd.concat(dfs, ignore_index=True)

In [None]:
blastn_df

## Adding the taxopath for each sequence using Entrez APIs

In [7]:
Entrez.email = os.getenv('ENTREZ_EMAIL')

# Add the taxopath column (can take some time)
blastn_df = add_taxopath_column(blastn_df)

Processing GenBank ID: OQ331014
Processing GenBank ID: MK239307
Processing GenBank ID: KY947150
Processing GenBank ID: KY947155
Processing GenBank ID: KX189158
Processing GenBank ID: KY947157
Processing GenBank ID: KX189173
Processing GenBank ID: KJ757308
Processing GenBank ID: AY665087
Processing GenBank ID: KX189160
Processing GenBank ID: KX189167
Processing GenBank ID: KX189170
Processing GenBank ID: KJ757736
Processing GenBank ID: KX189157
Processing GenBank ID: KJ760279
Processing GenBank ID: KJ757377
Processing GenBank ID: KX189137
Processing GenBank ID: KX189164
Processing GenBank ID: KJ762734
Processing GenBank ID: KX189134
Processing GenBank ID: OQ331014
Processing GenBank ID: MK239307
Processing GenBank ID: AY425009
Processing GenBank ID: KX189125
Processing GenBank ID: KX189169
Processing GenBank ID: KX189123
Processing GenBank ID: KJ757646
Processing GenBank ID: KX189126
Processing GenBank ID: AF380996
Processing GenBank ID: MF422192
Processing GenBank ID: AY425011
Processi

Processing GenBank ID: KX360147
Processing GenBank ID: KX360151
Processing GenBank ID: XR_008288893
Processing GenBank ID: XR_001214623
Processing GenBank ID: AF372825
Processing GenBank ID: JQ480022
Processing GenBank ID: JQ480021
Processing GenBank ID: KJ736741
Processing GenBank ID: FJ459740
Processing GenBank ID: AF372805
Processing GenBank ID: AF372802
Processing GenBank ID: AF372800
Processing GenBank ID: AF372799
Processing GenBank ID: AF372798
Processing GenBank ID: AF372797
Processing GenBank ID: AF372710
Processing GenBank ID: GQ330638
Processing GenBank ID: LN575250
Processing GenBank ID: KP178168
Processing GenBank ID: AF372801
Processing GenBank ID: AF372806
Processing GenBank ID: GU825152
Processing GenBank ID: AY919803
Processing GenBank ID: AF372709
Processing GenBank ID: MK239284
Processing GenBank ID: MK239283
Processing GenBank ID: MK239285
Processing GenBank ID: MK239282
Processing GenBank ID: MK239281
Processing GenBank ID: LC683679
Processing GenBank ID: OQ331014


In [8]:
blastn_df

Unnamed: 0,Sequence_id,Sample,Tax_assignment,Score_tax_assign,BLASTn,Max_Ident,Score,E_value,Hit_accession,Taxopath
0,seq223_11,X17007,Olkasia_polycarbonata,72.2,Gaulosia striata,89.22%,2675.680,0.000000e+00,OQ331014,Eukaryota; Discoba; Euglenozoa; Euglenida; Eug...
1,seq223_11,X17007,Olkasia_polycarbonata,72.2,Ploeotid sp.,91.36%,1096.830,0.000000e+00,MK239307,Eukaryota; Discoba; Euglenozoa; Euglenida; Plo...
2,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.58%,1075.190,0.000000e+00,KY947150,Eukaryota; Discoba; Euglenozoa; Diplonemea
3,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.56%,1074.290,0.000000e+00,KY947155,Eukaryota; Discoba; Euglenozoa; Diplonemea
4,seq223_11,X17007,Olkasia_polycarbonata,72.2,Uncultured marine,73.58%,1066.180,0.000000e+00,KX189158,Eukaryota; Discoba; Euglenozoa; Diplonemea; en...
...,...,...,...,...,...,...,...,...,...,...
375,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,74.74%,384.502,5.892750e-101,AF372801,Eukaryota; environmental samples
376,seq125_12,Th40,Caliculium_glossobalani,77.4,Apicomplexa sp.,71.47%,380.895,7.178840e-100,KC890798,Eukaryota; Sar; Alveolata; Apicomplexa
377,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured Gregarina,74.44%,379.994,7.178840e-100,MG766260,Eukaryota; Sar; Alveolata; Apicomplexa; Conoid...
378,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,79.29%,375.485,3.052520e-98,AF372709,Eukaryota; environmental samples


## Save the table as Excel and TSV files

In [9]:
# Save to Excel
filename_excel = "blastn_results_table.xlsx"
blastn_df.to_excel(os.path.join(blast_results_dir, filename_excel), index=False)

In [None]:
# Save to TSV
filename_tsv = "blastn_results_table.tsv"
blastn_df.to_csv(os.path.join(blast_results_dir, filename_excel), sep='\t', index=False)