In [1]:
# imports
import pandas as pd
import os
import re
import sys

# Custom functions
python_dir_path = os.path.join('..', 'scripts', 'python')
sys.path.append(python_dir_path)
from low_similarity_otus import extract_data_from_filename, read_and_augment_csv

In [2]:
# variables
project = 'Suthaus_2022'
cell = 'cellCombined'
sim = 'sim90'
marker = 'Full18S'
denoise_method = 'RAD'
raw_data = os.path.join('..', 'raw_data')
blast_results_dir = os.path.join(raw_data, 'blast_results')

# Loading tables from the BLASTn search

Checking the OTUs with the low percentage similarity to the reference sequences. In other words, checking OTUs that are not similar or close to the known sequences in the database used.

## Defining variables

In [3]:
# Files
blast_tables = [blast_table for blast_table in os.listdir(blast_results_dir) if blast_table.endswith('.tsv')]

# List of dataframes
dfs = []

# Regex patterns for extracting data from the CSV table names:
# Sequence ID:
reg_sequence_id = 'seq\d+_\d+'
# Taxon name:
reg_taxon_name = '[^_]+(?:_[^_]+)*'
# Percentage coverage from taxonomical assignement:
reg_score_tax_assign = '\d+\.\d+'
# Sample name:
reg_sample_name = '\w+'
# Final regex pattern
pattern = re.compile(fr'^({reg_sequence_id})_({reg_taxon_name})_({reg_score_tax_assign})_({reg_sample_name})\.tsv$')

In [4]:
# 2. Iterate over each file
for blast_table in blast_tables:
    # Extracting details from filename
    details = extract_data_from_filename(blast_table, pattern)
    if not details:
        continue
    sequence_id, taxon_name, score_tax_assign, sample_name = details
    
    # Reading and augmenting dataframe
    df = read_and_augment_csv(os.path.join(blast_results_dir, blast_table), *details)
    dfs.append(df)

# Merge all the dataframes together
blastn_df = pd.concat(dfs, ignore_index=True)

In [5]:
blastn_df

Unnamed: 0,Sequence_id,Sample,Tax_assignment,Score_tax_assign,BLASTn,Max_Ident,Score,E_value,Hit_accession
0,seq223_11,X17007,Olkasia_polycarbonata,72.2,Gaulosia striata,89.22%,2675.680,0.000000e+00,OQ331014
1,seq223_11,X17007,Olkasia_polycarbonata,72.2,Ploeotid sp.,91.36%,1096.830,0.000000e+00,MK239307
2,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.58%,1075.190,0.000000e+00,KY947150
3,seq223_11,X17007,Olkasia_polycarbonata,72.2,Diplonemida sp.,73.56%,1074.290,0.000000e+00,KY947155
4,seq223_11,X17007,Olkasia_polycarbonata,72.2,Uncultured marine,73.58%,1066.180,0.000000e+00,KX189158
...,...,...,...,...,...,...,...,...,...
375,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,74.74%,384.502,5.892750e-101,AF372801
376,seq125_12,Th40,Caliculium_glossobalani,77.4,Apicomplexa sp.,71.47%,380.895,7.178840e-100,KC890798
377,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured Gregarina,74.44%,379.994,7.178840e-100,MG766260
378,seq125_12,Th40,Caliculium_glossobalani,77.4,Uncultured eukaryote,79.29%,375.485,3.052520e-98,AF372709
