## Imports

In [18]:
import os
import subprocess
import sys

# Custom functions
# The helper scripts live outside the notebook directory, so their folder has
# to be put on the import path. The path is relative and therefore resolved
# against the current working directory of the kernel.
python_dir_path = os.path.join('..', 'scripts', 'python')
if python_dir_path not in sys.path:
    # Guarded so that re-running this cell does not add duplicate entries
    sys.path.append(python_dir_path)
from toolbox import check_if_exist

## Variables

In [22]:
# Identifiers of the analysis run; they are also used as nested directory names
project = 'Suthaus_2022'
marker = 'Full18S'
sim = 'sim90'
denoise_method = 'RAD'

# Sub-path shared by every per-run data directory
_run_subdirs = (project, marker, sim, denoise_method)

# Data locations, all rooted in the raw-data directory
raw_data = os.path.join('..', 'raw_data')
tax_assign_results = os.path.join(raw_data, 'tax_assign_results', *_run_subdirs)
otu_original = os.path.join(raw_data, 'OTU_original', *_run_subdirs)
pr2 = os.path.join(
    raw_data,
    'reference_alignments',
    'pr2',
    'pr2_version_5.0.0_SSU_UTAX.fasta',
)

# Taxonomic assignment using VSEARCH

In [6]:
# Samples selected for the taxonomic assignment
samples = [
    'A3',
    'NH1', 'NH4',
    'Sim17', 'Sim22',
    'Th16', 'Th38', 'Th40',
    'X17007',
]

In [37]:
# Identity threshold handed to VSEARCH via --id (kept as a string because it
# is passed straight through as a command-line argument)
percent_ident_limit = '0.6'

for sample in samples:
    print(f'\n\n###### Working on sample: {sample} ######\n\n')

    # Per-sample input, result and log paths (each built exactly once)
    query_seq = os.path.join(otu_original, f'{sample}_18S_otu.fasta')
    output_seq = os.path.join(tax_assign_results, f'{sample}_blast6.tab')
    log_path = os.path.join(tax_assign_results, f'{sample}_vsearch_terminal_output.log')

    # The search only runs when check_if_exist returns a truthy value.
    # NOTE(review): check_if_exist is defined in the project's toolbox module;
    # presumably it returns True when the result file may be (over)written --
    # confirm against its implementation.
    if not check_if_exist(output_seq):
        continue

    # Make sure the result directory exists; otherwise neither VSEARCH nor the
    # log writer below can create their files.
    os.makedirs(tax_assign_results, exist_ok=True)

    # VSEARCH command via Python subprocess module (argument list, no shell)
    cmd = ['vsearch',
           '--usearch_global', query_seq,
           '--dbmask', 'none',
           '--qmask', 'none',
           '--db', pr2,
           '--id', percent_ident_limit,
           '--iddef', '3',
           '--blast6out', output_seq]

    results = subprocess.run(cmd, capture_output=True, text=True)

    # Format stdout and stderr once so the notebook output and the log file
    # show exactly the same text (and no stray indentation).
    log_text = (
        '===== Standard Output =====\n'
        f'{results.stdout}'
        '\n===== Standard Error =====\n'
        f'{results.stderr}'
    )
    print(log_text)

    # Surface failures instead of silently moving on to the next sample
    if results.returncode != 0:
        print(f'WARNING: vsearch exited with return code {results.returncode} '
              f'for sample {sample}')

    # Save both stdout and stderr to the same file
    with open(log_path, 'w') as f:
        f.write(log_text)



###### Working on sample: A3 ######



        ===== Standard Output =====
        
        ===== Standard Error =====
        vsearch v2.22.1_linux_x86_64, 11.9GB RAM, 16 cores
https://github.com/torognes/vsearch

Reading file ../raw_data/reference_alignments/pr2/pr2_version_5.0.0_SSU_UTAX.fasta 100%
312096825 nt in 221085 seqs, min 374, max 10223, avg 1412
Counting k-mers 100%
Creating k-mer index 100%
Searching 100%
Matching unique query sequences: 140 of 140 (100.00%)

        


###### Working on sample: NH1 ######



        ===== Standard Output =====
        
        ===== Standard Error =====
        vsearch v2.22.1_linux_x86_64, 11.9GB RAM, 16 cores
https://github.com/torognes/vsearch

Reading file ../raw_data/reference_alignments/pr2/pr2_version_5.0.0_SSU_UTAX.fasta 100%
312096825 nt in 221085 seqs, min 374, max 10223, avg 1412
Counting k-mers 100%
Creating k-mer index 100%
Searching 100%
Matching unique query sequences: 46 of 46 (100.00%)

        


###### Working on sam