# Dependencies

In [1]:
import os
import argparse
import pathlib
import sys
import subprocess
import math
import warnings
from collections import defaultdict
from multiprocessing import Pool, Manager
from itertools import repeat
warnings.filterwarnings("ignore")

import pandas as pd
from ete3 import NCBITaxa
from Bio import SeqIO

In [2]:
from utils.reference_finder import prepare_reference_genomes
from utils.alignment import run_minimap2, run_bwa, sort_samfile, samtools_calculate_coverage
from utils.summary import alignment_summary, merge_reference_fasta, call_present_absent
from utils.ani import samtools_merged_consensus, ani_summary
from utils.input_parsing import parsing_input_f, filter_input_df, get_seq2assembly_dict

# Results

In [14]:
# Load a previously written magnet_results.csv table from disk.
# NOTE(review): absolute, user-specific path - it only resolves on the original author's machine.
magnet_results_csv = '/home/Users/yl181/magnet/zymo_bowtie2_a_k2_0.002/magnet_results.csv'
result_df = pd.read_csv(magnet_results_csv)

In [15]:
# Recompute the presence/absence call on the loaded table with a 0.7 coverage-score threshold.
# NOTE(review): presumably this (re)writes the 'Presence/Absence' column shown in the output below - confirm in utils.summary.
# The cell rebinds result_df to its own output, so re-running it feeds the previous result back in.
result_df = call_present_absent(result_df, min_coverage_score=0.7)

In [16]:
# Display the full results table, including the 'Presence/Absence' column.
result_df

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species,...,Secondary Expected,Secondary Score,Secondary Depth,Primary Breadth,Primary Expected,Primary Score,Primary Depth,Consensus ANI,Combined PS and ANI (Sqrt(ANI)xPSx100),Presence/Absence
0,96241,GCF_006094475.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Bacillus spizizenii ATCC 6633 = JCM 2499,ATCC 6633,4045538.0,True,Bacillus spizizenii,...,1.0,1.0,132.511,0.999999,1.0,0.999999,68.478268,0.999977,100.0,Present
1,287,GCF_000006765.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Pseudomonas aeruginosa PAO1,PAO1,6264404.0,True,Pseudomonas aeruginosa,...,1.0,0.96768,65.565285,0.967013,1.0,0.967013,56.030159,0.994526,96.44,Present
2,28901,GCF_000006945.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Salmonella enterica subsp. enterica serovar Ty...,LT2,4951383.0,True,Salmonella enterica,...,1.0,0.935623,99.628887,0.92855,1.0,0.92855,64.813931,0.989556,92.37,Present
3,1280,GCF_000013425.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,NCTC 8325,2821361.0,True,Staphylococcus aureus,...,1.0,0.921382,160.315715,0.917263,1.0,0.917263,83.129377,0.993009,91.41,Present
4,1639,GCF_000196035.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Listeria monocytogenes EGD-e,EGD-e,2944528.0,True,Listeria monocytogenes,...,1.0,0.931354,129.529645,0.911952,1.0,0.911952,77.818246,0.947604,88.77,Present
5,1351,GCF_001598635.1,SOURCE_DATABASE_REFSEQ,False,Complete Genome,Enterococcus faecalis,LD33,2803429.0,True,Enterococcus faecalis,...,1.0,0.890446,132.768305,0.889125,1.0,0.889125,101.940818,0.991167,88.52,Present
6,1613,GCF_029961225.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Limosilactobacillus fermentum,EFEL6800,2103331.0,True,Limosilactobacillus fermentum,...,1.0,0.892914,212.556811,0.885748,1.0,0.885748,136.140257,0.993539,88.29,Present
7,623,GCF_000006925.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Shigella flexneri 2a str. 301,301,4828820.0,True,Shigella flexneri,...,1.0,0.889687,114.269365,0.78634,1.0,0.78634,32.865802,0.98088,77.88,Present
8,562,GCF_000008865.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,5594605.0,True,Escherichia coli,...,1.0,0.80836,97.632873,0.71223,1.0,0.71223,33.563861,0.977177,70.41,Present
9,1642,GCF_009648575.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Listeria innocua,CFSAN044836,2922148.0,True,Listeria innocua,...,1.0,0.862307,111.90501,0.536577,0.999999,0.536578,25.403433,0.908522,51.14,Absent


In [17]:
# Expected composition of the sample, keyed by species name.
# NOTE(review): the values look like percent abundances (they sum to 100) - confirm the unit.
ground_truth_abd = {
    **dict.fromkeys(
        (
            "Listeria monocytogenes",
            "Pseudomonas aeruginosa",
            "Bacillus spizizenii",
            "Escherichia coli",
            "Salmonella enterica",
            "Limosilactobacillus fermentum",
            "Enterococcus faecalis",
            "Staphylococcus aureus",
        ),
        12.0,
    ),
    **dict.fromkeys(
        (
            "Saccharomyces cerevisiae",
            "Cryptococcus neoformans",
        ),
        2.0,
    ),
}

In [18]:
# Species names expected in the sample (iterating a dict yields its keys).
ground_truth_set = set(ground_truth_abd)

In [19]:
# Species of every row whose call is 'Present'.
called_present = result_df['Presence/Absence'] == 'Present'
positive_calls = set(result_df.loc[called_present, 'Species'].values)

In [20]:
# Number of expected species that were called Present.
len(ground_truth_set & positive_calls)

8

In [21]:
# Expected species that were not called Present.
ground_truth_set.difference(positive_calls)

{'Cryptococcus neoformans', 'Saccharomyces cerevisiae'}

In [22]:
# Rows for species called Present that are not part of the ground truth.
unexpected_species = list(positive_calls.difference(ground_truth_set))
result_df[result_df['Species'].isin(unexpected_species)]

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species,...,Secondary Expected,Secondary Score,Secondary Depth,Primary Breadth,Primary Expected,Primary Score,Primary Depth,Consensus ANI,Combined PS and ANI (Sqrt(ANI)xPSx100),Presence/Absence
7,623,GCF_000006925.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Shigella flexneri 2a str. 301,301,4828820.0,True,Shigella flexneri,...,1.0,0.889687,114.269365,0.78634,1.0,0.78634,32.865802,0.98088,77.88,Present


In [23]:
# Rows for ground-truth species that were not called Present.
uncalled_species = list(ground_truth_set.difference(positive_calls))
result_df[result_df['Species'].isin(uncalled_species)]

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species,...,Secondary Expected,Secondary Score,Secondary Depth,Primary Breadth,Primary Expected,Primary Score,Primary Depth,Consensus ANI,Combined PS and ANI (Sqrt(ANI)xPSx100),Presence/Absence


# Argument Parser

In [3]:
# Command-line interface; the parsed values are unpacked into plain variables further down.
# `required=False` and a False default for store_true flags are argparse defaults, so they are not spelled out.
parser = argparse.ArgumentParser(
    description="Universal Taxonomic Classification Verifier."
)

# Input / output locations.
parser.add_argument(
    "-c", "--classification",
    type=pathlib.Path,
    required=True,
    help="Path to the Taxonomic Classification Report. Accepting csv/tsv file format, other text formats are treated as tsv.",
)
parser.add_argument(
    "-i", "--fastq",
    type=pathlib.Path,
    required=True,
    help="Path to the first fastq file.",
)
parser.add_argument(
    "-I", "--fastq2",
    type=pathlib.Path,
    help="Path to the second fastq file for paired-end reads.",
)
parser.add_argument(
    "-m", "--mode",
    type=str,
    choices=['ont', 'illumina'],
    default='ont',
    help="Modes for different sequencing platforms [ont, illumina]. Default:[ont]",
)
parser.add_argument(
    "-o", "--output",
    type=pathlib.Path,
    required=True,
    help="Path to the output directory.",
)

# Layout of the classification report.
parser.add_argument(
    "-t", "--taxid-idx",
    type=int,
    default=0,
    help="The column index (0-based) of the taxids. Default:[0]",
)
parser.add_argument(
    "-a", "--abundance-idx",
    type=int,
    help="The column index (0-based) of the abundance. Default:[None]",
)

# Thresholds and resources.
parser.add_argument(
    "--min-abundance",
    type=float,
    default=0,
    help="Minimum abundance (0-1) for pre-filtering, exclude taxa below the threshold.",
)
parser.add_argument(
    "--min-mapq",
    type=int,
    default=20,
    help="Minimum MAPQ for primary alignments. Default:[20]",
)
parser.add_argument(
    "--min-covscore",
    type=float,
    default=0.7,
    help="Minimum Coverage Score for supplementary alignments. Default:[0.7]",
)
parser.add_argument(
    "--threads",
    type=int,
    default=1,
    help="Number of threads for Multi-threading. Default:[1]",
)

# Boolean switches (False unless given).
parser.add_argument(
    "--include-mag",
    action='store_true',
    help="Include metagenomic assemble genomes. Default:[False]",
)
parser.add_argument(
    "--subspecies",
    action='store_true',
    help="Verify taxonomic classification at subspecies rank. Default:[False]",
)

In [4]:
# Stand-in for sys.argv: parse a fixed argument list so the notebook can run without a shell.
# NOTE(review): the input paths are absolute and user-specific.
notebook_argv = [
    '-c', '/home/Users/ns58/emu40/Sim-Output/Large-uniform-8-0/Mob-combined-async/Mob-combined-async_rel-abundance.tsv',
    '-i', '/home/Users/ns58/emu40/Sim-Data/Samples/Large_30species_rs3_startx10._ONT_8_0.fastq',
    '-o', '../test',
    '-t', '0',
    '--threads', '40',
    '--subspecies',
]
args = parser.parse_args(notebook_argv)

In [76]:
# Print the equivalent command-line argument string (tokens separated by single spaces).
argv_tokens = (
    '-c', '/home/Users/ns58/emu40/Sim-Output/Large-uniform-8-0/Mob-combined-async/Mob-combined-async_rel-abundance.tsv',
    '-i', '/home/Users/ns58/emu40/Sim-Data/Samples/Large_30species_rs3_startx10._ONT_8_0.fastq',
    '-o', '../test',
    '-t', '0',
    '--threads', '40',
    '--subspecies',
)
print(*argv_tokens)

-c /home/Users/ns58/emu40/Sim-Output/Large-uniform-8-0/Mob-combined-async/Mob-combined-async_rel-abundance.tsv -i /home/Users/ns58/emu40/Sim-Data/Samples/Large_30species_rs3_startx10._ONT_8_0.fastq -o ../test -t 0 --threads 40 --subspecies


In [9]:
# Unpack the parsed command-line arguments into the plain variables used by the cells below.
input_tsv = args.classification      # taxonomic classification report (csv/tsv)
input_fastq = args.fastq
input_fastq2 = args.fastq2           # None unless -I/--fastq2 was supplied
mode = args.mode
working_directory = args.output

taxid_col_idx = args.taxid_idx
abundance_col_idx = args.abundance_idx
min_abundance = args.min_abundance
min_mapq = args.min_mapq
min_coverage_score = args.min_covscore
threads = args.threads

# Value handed to prepare_reference_genomes as `mag_flag`.
mag_flag = 'all' if args.include_mag else 'exclude'

call_subspecies = args.subspecies

# Column separator for the classification report: comma when the file name ends in
# 'csv', tab otherwise. The check uses `input_tsv`; the previous code read `input_file`,
# which is never assigned in this notebook and raised NameError on a fresh kernel.
sep = ',' if str(input_tsv).endswith('csv') else '\t'

# Run

In [10]:
# ete3 NCBITaxa handle; passed to filter_input_df and prepare_reference_genomes below.
ncbi_taxa_db = NCBITaxa()

In [11]:
# Create the output directory, including any missing parent directories.
# exist_ok=True makes the cell safe to re-run and removes the check-then-create race;
# a FileExistsError is still raised if the path exists but is not a directory.
os.makedirs(working_directory, exist_ok=True)

In [12]:
# Read the classification report, then derive the taxids to verify.
# NOTE(review): `input_file` is not assigned in any cell visible in this notebook (only
# `input_tsv` is), so this call raises NameError on a fresh kernel - confirm the intended
# first argument against parsing_input_f's signature in utils.input_parsing.
input_df, min_abundance = parsing_input_f(input_file, sep, input_tsv, taxid_col_idx, abundance_col_idx, min_abundance)
# make valid_kingdom a variable?
# NOTE(review): 2, 4751, 2157 and 10239 are presumably the NCBI taxids of Bacteria, Fungi,
# Archaea and Viruses - confirm.
valid_taxids = filter_input_df(input_df, min_abundance, ncbi_taxa_db, valid_kingdom={2, 4751, 2157, 10239}, ret_subspecies=call_subspecies)

unassigned is not a valid taxid.


In [13]:
# can be parallelized
# can be parallelized
# Prepare one reference genome per valid taxid under working_directory; the printed log lists
# the assembly picked for each taxid (or "Genome Not Found.").
# The returned table carries a 'Downloaded' column that the next cell uses as a row filter.
reference_metadata = prepare_reference_genomes(valid_taxids, working_directory, ncbi_taxa_db, mag_flag=mag_flag)

1246       	 GCF_007954605.1 	 Leuconostoc lactis             	 CBA3625    	 Complete Genome
173262     	 Genome Not Found.
927083     	 GCF_000737325.1 	 Sandaracinus amylolyticus      	 DSM 5366.. 	 Complete Genome
63363      	 GCF_000008625.1 	 Aquifex aeolicus VF5           	 VF5        	 Complete Genome
658062     	 GCF_000014005.1 	 Candidatus Koribacter versat.. 	 Ellin345   	 Complete Genome
1293412    	 GCF_006542355.1 	 Swingsia samuiensis            	 AH83       	 Complete Genome
2203724    	 GCF_004102045.2 	 Oenococcus sicerae             	 UCMA1522.. 	 Complete Genome
664643     	 GCF_024347035.1 	 Vibrio plantisponsor           	 CECT 758.. 	 Complete Genome
623        	 GCF_000006925.2 	 Shigella flexneri 2a str. 30.. 	 301        	 Complete Genome
154981     	 GCF_025137635.1 	 Aliiroseovarius crassostreae   	 S044       	 Complete Genome
2305508    	 GCF_003576455.2 	 Mucilaginibacter celer         	 HYN0043    	 Complete Genome
436        	 GCF_014843995.1 	 Novaceti

In [14]:
# Keep only the references flagged in the 'Downloaded' column.
was_downloaded = reference_metadata['Downloaded']
downloaded_assemblies = reference_metadata[was_downloaded]

In [20]:
# Lookup later handed to alignment_summary.
# NOTE(review): presumably maps each reference sequence ID to its assembly accession - confirm in utils.input_parsing.
seq2assembly_dict = get_seq2assembly_dict(working_directory, downloaded_assemblies)

In [20]:
# Merge the downloaded assemblies into the single reference used by the aligner below.
downloaded_accessions = list(downloaded_assemblies['Assembly Accession ID'])
reference_fasta = merge_reference_fasta(downloaded_accessions, working_directory)

In [21]:
%%time
# Align the reads in input_fastq against the merged reference with run_minimap2, then pass the
# aligner output to sort_samfile under the 'merged' prefix.
# NOTE(review): in the cells visible here run_minimap2 is called regardless of `mode`, and
# `input_fastq2` is never used - confirm how paired-end / illumina input is meant to be handled.
# NOTE(review): min_mapq is hard-coded to 0 here rather than using the parsed `min_mapq` - confirm this is intended.
aligner_output = run_minimap2(input_fastq, reference_fasta, 'merged', working_directory, threads=threads)
sort_samfile('merged', aligner_output, working_directory, min_mapq=0, threads=threads)

In [21]:
%%time
pool = Pool(processes=threads)
pool.starmap(samtools_calculate_coverage, zip(repeat(working_directory), [True, False]))
pool.close()
pool.join()

CPU times: user 26.2 ms, sys: 254 ms, total: 280 ms
Wall time: 2min 18s


In [33]:
# Summarise the alignments per assembly with include_supp=True.
# NOTE(review): presumably this adds the 'Secondary ...' breadth/expected/score/depth columns seen
# in the results table - confirm in utils.summary.
# The cell rebinds downloaded_assemblies to its own output, so it is not idempotent on re-run.
downloaded_assemblies = alignment_summary(downloaded_assemblies,
                                          working_directory,
                                          seq2assembly_dict,
                                          include_supp=True)

In [34]:
# Summarise the alignments per assembly again, this time with include_supp=False.
# NOTE(review): presumably this adds the 'Primary ...' breadth/expected/score/depth columns seen
# in the results table - confirm in utils.summary.
downloaded_assemblies = alignment_summary(downloaded_assemblies,
                                          working_directory,
                                          seq2assembly_dict,
                                          include_supp=False)

In [24]:
%%time
# Build the consensus records consumed by ani_summary in the next step.
# NOTE(review): presumably derived from the merged alignment under working_directory - confirm in utils.ani.
consensus_record_dict = samtools_merged_consensus(working_directory, threads)

CPU times: user 1.03 s, sys: 392 ms, total: 1.43 s
Wall time: 3min 31s


In [37]:
%%time
# Extend the assembly table using the consensus records.
# NOTE(review): presumably adds the 'Consensus ANI' and combined-score columns - confirm in utils.ani.
downloaded_assemblies = ani_summary(downloaded_assemblies, consensus_record_dict, working_directory, threads)

CPU times: user 5.29 s, sys: 14.9 s, total: 20.2 s
Wall time: 37.5 s


In [63]:
# Make the presence/absence call using the --min-covscore threshold parsed above.
# The 'Presence/Absence' column it produces is filtered on in the Results section.
downloaded_assemblies = call_present_absent(downloaded_assemblies, min_coverage_score)

In [64]:
# Write the table, highest 'Primary Score' first, to <working_directory>/magnet_results.csv
# without the DataFrame index.
results_csv_path = os.path.join(working_directory, 'magnet_results.csv')
ranked_assemblies = downloaded_assemblies.sort_values(['Primary Score'], ascending=False)
ranked_assemblies.to_csv(results_csv_path, index=False)

# Results

In [65]:
# Show only the assemblies called Present.
is_present = downloaded_assemblies['Presence/Absence'] == 'Present'
downloaded_assemblies[is_present]

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species,...,Secondary Breadth,Secondary Expected,Secondary Score,Secondary Depth,Primary Breadth,Primary Expected,Primary Score,Primary Depth,Combined PS and ANI (Sqrt(ANI)xPSx100),Presence/Absence
0,1246,GCF_007954605.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Leuconostoc lactis,CBA3625,1791608.0,True,Leuconostoc lactis,...,0.999951,1.0,0.999951,99.860177,0.999951,1.0,0.999951,98.808125,99.99,Present
2,927083,GCF_000737325.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Sandaracinus amylolyticus,DSM 53668,10327335.0,True,Sandaracinus amylolyticus,...,1.0,1.0,1.0,49.606,0.999992,1.0,0.999992,49.413092,100.0,Present
3,63363,GCF_000008625.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Aquifex aeolicus VF5,VF5,1590791.0,True,Aquifex aeolicus,...,0.999932,1.0,0.999932,49.586588,0.999932,1.0,0.999932,49.447579,99.99,Present
4,658062,GCF_000014005.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Candidatus Koribacter versatilis Ellin345,Ellin345,5650368.0,True,Candidatus Koribacter versatilis,...,0.999977,1.0,0.999977,98.98126,0.999977,1.0,0.999977,98.947459,100.0,Present
5,1293412,GCF_006542355.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Swingsia samuiensis,AH83,2193407.0,True,Swingsia samuiensis,...,0.999929,1.0,0.999929,100.964187,0.999929,1.0,0.999929,99.078553,99.99,Present
6,2203724,GCF_004102045.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Oenococcus sicerae,UCMA15228,1664394.0,True,Oenococcus sicerae,...,0.999871,1.0,0.999871,49.581504,0.999871,1.0,0.999871,49.436986,99.99,Present
7,664643,GCF_024347035.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Vibrio plantisponsor,CECT 7581,4509247.0,True,Vibrio plantisponsor,...,0.99996,1.0,0.99996,50.684451,0.99996,1.0,0.99996,49.405227,100.0,Present
8,623,GCF_000006925.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Shigella flexneri 2a str. 301,301,4828820.0,True,Shigella flexneri,...,0.999904,1.0,0.999904,22.716979,0.999904,1.0,0.999904,19.765296,99.95,Present
9,154981,GCF_025137635.1,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Aliiroseovarius crassostreae,S044,3367659.0,True,Aliiroseovarius crassostreae,...,0.999794,1.0,0.999794,19.906297,0.999794,1.0,0.999794,19.789472,99.98,Present
10,2305508,GCF_003576455.2,SOURCE_DATABASE_REFSEQ,True,Complete Genome,Mucilaginibacter celer,HYN0043,7149106.0,True,Mucilaginibacter celer,...,0.99976,1.0,0.99976,19.861267,0.99976,1.0,0.99976,19.78955,99.98,Present


In [66]:
# Expected composition of the 30-species sample, keyed by species name and grouped by value.
# NOTE(review): the values look like relative abundance weights - confirm the unit.
ground_truth_abd = {
    **dict.fromkeys(
        (
            "Leuconostoc lactis",
            "Cloacibacillus porcorum",
            "Limihaloglobus sulfuriphilus",
            "Sinorhizobium fredii",
            "Massilia oculi",
            "Raoultella terrigena",
            "Agromyces marinus",
            "Thermoanaerobacter wiegelii",
            "Candidatus Koribacter versatilis",
            "Swingsia samuiensis",
        ),
        10.0,
    ),
    **dict.fromkeys(
        (
            "Sphingobium fuliginis",
            "Aquifex aeolicus",
            "Corynebacterium flavescens",
            "Oenococcus sicerae",
            "Vibrio plantisponsor",
            "Bradyrhizobium paxllaeri",
            "Sandaracinus amylolyticus",
            "Halomonas sulfidoxydans",
            "Neisseria musculi",
            "Streptomyces cavourensis",
        ),
        5.0,
    ),
    **dict.fromkeys(
        (
            "Desulfomonile tiedjei",
            "Mycoplasma haemocanis",
            "Paraburkholderia caffeinilytica",
            "Shigella flexneri",
            "Vibrio rarus",
            "Pandoraea thiooxydans",
            "Aliiroseovarius crassostreae",
            "Mucilaginibacter celer",
        ),
        2.0,
    ),
    **dict.fromkeys(
        (
            "Novacetimonas hansenii",
            "Companilactobacillus crustorum",
        ),
        1.0,
    ),
}

In [67]:
# Species names expected in the sample (iterating a dict yields its keys).
ground_truth_set = set(ground_truth_abd)

In [68]:
# Species of every assembly whose call is 'Present'.
called_present = downloaded_assemblies['Presence/Absence'] == 'Present'
positive_calls = set(downloaded_assemblies.loc[called_present, 'Species'].values)

In [69]:
# Disabled alternative: call presence from a combined-score cutoff (>= 97) instead of the
# 'Presence/Absence' column.
# NOTE(review): the column name below does not match the results table shown above, where the
# combined score column is 'Combined PS and ANI (Sqrt(ANI)xPSx100)' - update before re-enabling.
#positive_calls = set(downloaded_assemblies[downloaded_assemblies['Combined CS2 and ANI (Sqrt(ANI)xCS2x100)'] >= 97]['Species'].values)

In [70]:
# Number of distinct species called Present.
len(positive_calls)

30

In [71]:
# Number of expected species that were called Present.
len(ground_truth_set & positive_calls)

30

In [72]:
# Number of expected species that were not called Present.
len(ground_truth_set.difference(positive_calls))

0

In [73]:
# Number of species called Present that are not in the ground truth.
len(positive_calls.difference(ground_truth_set))

0

In [74]:
# Rows for species called Present that are not part of the ground truth.
unexpected_species = list(positive_calls.difference(ground_truth_set))
downloaded_assemblies[downloaded_assemblies['Species'].isin(unexpected_species)]

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Total Length,Downloaded,Species,...,Secondary Breadth,Secondary Expected,Secondary Score,Secondary Depth,Primary Breadth,Primary Expected,Primary Score,Primary Depth,Combined PS and ANI (Sqrt(ANI)xPSx100),Presence/Absence
