In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import argparse
import sys
import subprocess
import math
import pickle
from collections import defaultdict
import json
import pandas as pd
import math
from ete3 import NCBITaxa
from Bio import SeqIO

In [2]:
from utils.reference_finder import prepare_reference_genomes
from utils.alignment import run_minimap2, sort_samfile, run_bwa, samtools_calculate_depth
from utils.summary import alignment_1_summary, alignment_2_summary, merge_reference_fasta, call_present_absent
from utils.ani import samtools_merged_consensus, ani_summary

In [3]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom):
    """Resolve ``taxid`` to the species-rank taxid found in its lineage.

    Parameters
    ----------
    taxid
        Taxonomy identifier to resolve.
    ncbi_taxa_db
        Taxonomy handle exposing ``get_lineage(taxid)`` and
        ``get_rank(taxids)``; this notebook passes an ete3 ``NCBITaxa``.
    valid_kingdom
        Iterable of accepted taxids. The lineage must contain at least one
        of them for the taxid to be resolved.

    Returns
    -------
    The first taxid in the rank mapping whose rank is ``'species'``, or
    ``None`` when the lineage is missing/empty, contains none of the
    ``valid_kingdom`` taxids, or has no species-rank entry.

    Any exception raised by ``ncbi_taxa_db`` itself propagates unchanged.
    """
    lineage = ncbi_taxa_db.get_lineage(taxid)
    # A lookup may hand back None instead of a list; without this guard the
    # membership test below would fail with a TypeError.
    if not lineage:
        return None

    # set(...) lets callers pass any iterable of accepted taxids.
    if set(valid_kingdom).isdisjoint(lineage):
        return None

    taxid2rank_dict = ncbi_taxa_db.get_rank(lineage)
    for lineage_taxid, rank in taxid2rank_dict.items():
        if rank == 'species':
            return lineage_taxid
    return None

In [4]:
def filter_kraken2_taxonomy(kraken2_report, min_frac, ncbi_taxa_db, valid_kingdom):
    """Select the species-level taxids of a Kraken2 report worth querying.

    Parameters
    ----------
    kraken2_report
        Path (or file-like object) of a headerless, tab-separated report whose
        six columns are read as Abundance, Cumulative Count, Count, Rank,
        Taxid and Taxname.
    min_frac
        Minimum abundance as a fraction; it is compared against the
        ``Abundance`` column after multiplying by 100, i.e. that column is
        treated as a percentage.
    ncbi_taxa_db
        Taxonomy handle forwarded to ``get_species_taxid``.
    valid_kingdom
        Accepted taxids forwarded to ``get_species_taxid``.

    Returns
    -------
    list
        Species taxids resolved by ``get_species_taxid`` for every row with
        rank code ``'S'`` that meets the abundance threshold, in report
        order. Each taxid appears once even when several report rows resolve
        to the same species.
    """
    report_columns = ['Abundance', 'Cumulative Count', 'Count', 'Rank', 'Taxid', 'Taxname']
    classification_result_df = pd.read_csv(kraken2_report, sep='\t', names=report_columns)

    is_species = classification_result_df['Rank'] == 'S'
    is_abundant = classification_result_df['Abundance'] >= min_frac * 100
    taxids = classification_result_df.loc[is_species & is_abundant, 'Taxid'].tolist()

    valid_taxids = []
    seen_taxids = set()
    for taxid in taxids:
        species_taxid = get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom)
        # Skip unresolved rows and species already collected, so downstream
        # steps never process the same taxid twice.
        if species_taxid is None or species_taxid in seen_taxids:
            continue
        seen_taxids.add(species_taxid)
        valid_taxids.append(species_taxid)
    return valid_taxids

In [5]:
# Taxonomy handle (ete3 NCBITaxa) shared by the Kraken2 filtering and the
# reference-genome preparation steps below.
# NOTE(review): presumably loads or builds a local taxonomy database on first use — confirm.
ncbi_taxa_db = NCBITaxa()

In [40]:
# --- Input files: paired-end reads and the Kraken2 report of one sample ---
_sample_prefix = '../test_data/Zymo_illumina/SRR11207265'
input_fastq_1 = f'{_sample_prefix}_1.downsampled.fastq'
input_fastq_2 = f'{_sample_prefix}_2.downsampled.fastq'
kraken2_report = f'{_sample_prefix}.report'

# --- Tunable parameters used by the cells below ---
threads = 40              # passed as `threads=` to the alignment/consensus helpers
min_mapq = 20             # passed as `min_mapq=` when sorting the per-reference alignments
min_coverage_score = 0.7  # minimum 'Coverage Score' for a reference to enter the merged pass
min_frac = 0.0005         # minimum abundance (as a fraction) for a Kraken2 species row

In [41]:
# Output directory handed to every pipeline step below; the final
# 'reference_alignment.csv' is also written here.
working_directory = '../test_output_downsampled_0002'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
# check-then-create race and also creates missing parent directories.
os.makedirs(working_directory, exist_ok=True)

In [42]:
# A species is kept only when its lineage contains one of these taxids.
# NOTE(review): presumably the NCBI taxids for Bacteria (2), Fungi (4751),
# Archaea (2157) and Viruses (10239) — confirm.
accepted_kingdom_taxids = {2, 4751, 2157, 10239}

taxid_queries = filter_kraken2_taxonomy(
    kraken2_report,
    min_frac=min_frac,
    ncbi_taxa_db=ncbi_taxa_db,
    valid_kingdom=accepted_kingdom_taxids,
)

In [43]:
# Sanity check: how many species taxids passed the abundance and kingdom filters.
len(taxid_queries)

16

In [9]:
# Prepare reference genomes for the selected taxids inside working_directory.
# The returned table is indexed below by its 'Downloaded' and
# 'Assembly Accession ID' columns.
# NOTE(review): presumably fetches one assembly per taxid from NCBI — confirm in utils.reference_finder.
reference_metadata = prepare_reference_genomes(taxid_queries, working_directory, ncbi_taxa_db)

28901      	 GCF_000006945.2 	 Salmonella enterica subsp. e.. 	 LT2        	 Complete Genome
54736      	 GCF_000439255.1 	 Salmonella bongori N268-08     	 N268-08    	 Complete Genome
562        	 GCF_000008865.2 	 Escherichia coli O157:H7 str.. 	 Sakai su.. 	 Complete Genome
208962     	 GCF_016904755.1 	 Escherichia albertii           	 Sample 1.. 	 Complete Genome
1499973    	 GCF_002900365.1 	 Escherichia marmotae           	 HT073016   	 Complete Genome
564        	 GCF_013892435.1 	 Escherichia fergusonii         	 RHB19-C0.. 	 Complete Genome
2044467    	 GCF_005843885.1 	 Escherichia sp. E4742          	 E4742      	 Complete Genome
546        	 GCF_003812345.1 	 Citrobacter freundii           	 FDAARGOS.. 	 Complete Genome
623        	 GCF_000006925.2 	 Shigella flexneri 2a str. 30.. 	 301        	 Complete Genome
621        	 GCF_001027225.1 	 Shigella boydii                	 ATCC 921.. 	 Complete Genome
622        	 GCF_002741615.1 	 Shigella dysenteriae           	 BU53M1


31 archives were successfully processed.


In [10]:
# Keep only the assemblies flagged as downloaded. The explicit copy gives the
# later summary steps an independent frame to extend, instead of a filtered
# subset of reference_metadata (chained-assignment warnings are silenced in
# this notebook, so the ambiguity would otherwise go unnoticed).
downloaded_assemblies = reference_metadata.loc[reference_metadata['Downloaded']].copy()

In [11]:
# First pass: process the paired reads against every downloaded reference on
# its own (align, sort with the min_mapq threshold, compute depth).
reference_genome_dir = os.path.join(working_directory, 'reference_genomes')

for assembly_id in downloaded_assemblies['Assembly Accession ID']:
    reference_fasta = os.path.join(reference_genome_dir, f'{assembly_id}.fasta')
    run_bwa(
        input_fastq_1,
        input_fastq_2,
        reference_fasta,
        assembly_id,
        working_directory,
        threads=threads,
    )
    sort_samfile(assembly_id, working_directory, min_mapq=min_mapq, threads=threads)
    samtools_calculate_depth(assembly_id, working_directory)

In [12]:
# Summarise the first-pass alignments; the result replaces downloaded_assemblies.
# The 'Coverage Score' column read in the next step is expected to come from here.
# NOTE(review): exact columns added are defined in utils.summary — confirm.
downloaded_assemblies = alignment_1_summary(downloaded_assemblies, working_directory)

In [13]:
# Accessions whose first-pass 'Coverage Score' reaches min_coverage_score;
# only these references go into the merged second-pass alignment.
passes_coverage = downloaded_assemblies['Coverage Score'] >= min_coverage_score
filtered_assemblies = list(
    downloaded_assemblies.loc[passes_coverage, 'Assembly Accession ID']
)

In [14]:
# Second pass: process the reads against the filtered references merged into
# a single FASTA, under the shared output label 'merged'.
merged_label = 'merged'

reference_fasta = merge_reference_fasta(filtered_assemblies, working_directory)
run_bwa(
    input_fastq_1,
    input_fastq_2,
    reference_fasta,
    merged_label,
    working_directory,
    threads=threads,
)
# min_mapq is 0 here, unlike the first pass.
# NOTE(review): presumably so reads mapping equally well to several similar
# references are not discarded — confirm.
sort_samfile(merged_label, working_directory, min_mapq=0, threads=threads)
samtools_calculate_depth(merged_label, working_directory)

In [15]:
# Summarise the merged (second-pass) alignment; the result replaces downloaded_assemblies.
# The 'CS2' column used for sorting before export is expected to come from here.
# NOTE(review): exact columns added are defined in utils.summary — confirm.
downloaded_assemblies = alignment_2_summary(downloaded_assemblies, working_directory)

In [16]:
# Build consensus records from the merged alignment, then feed them to the ANI summary.
# NOTE(review): presumably adds the 'Consensus ANI' column seen in the final table — confirm in utils.ani.
consensus_record_dict = samtools_merged_consensus(working_directory, threads)
downloaded_assemblies = ani_summary(downloaded_assemblies, consensus_record_dict, working_directory)

In [17]:
# Final classification step; the result replaces downloaded_assemblies.
# NOTE(review): presumably adds the 'Presence/Absence' column seen in the final table — confirm in utils.summary.
downloaded_assemblies = call_present_absent(downloaded_assemblies)

In [18]:
# Write the per-reference results, highest CS2 first, without the index column.
ranked_assemblies = downloaded_assemblies.sort_values(['CS2'], ascending=False)
reference_alignment_csv = os.path.join(working_directory, 'reference_alignment.csv')
ranked_assemblies.to_csv(reference_alignment_csv, index=False)

In [19]:
# Display the final per-reference table (unsorted; the CSV written above is sorted by CS2).
downloaded_assemblies

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species,Breadth Coverage,Expected Coverage,Coverage Score,Depth Coverage,BC2,EC2,CS2,DC2,Consensus ANI,Combined CS2 and ANI (Sqrt(ANI)xCS2x100),Presence/Absence
0,28901,GCF_000006945.2,Washington University Genome Sequencing Center,True,Complete Genome,Salmonella enterica subsp. enterica serovar Ty...,LT2,True,Salmonella enterica,0.90764,0.99943,0.908158,8.230061,0.912043,0.997296,0.914515,6.483257,0.989812,90.98,Present
1,54736,GCF_000439255.1,NCBI RefSeq,True,Complete Genome,Salmonella bongori N268-08,N268-08,True,Salmonella bongori,0.743137,0.997109,0.745292,7.866824,0.022666,0.077416,0.292787,3.554857,0.967281,28.8,Absent
2,562,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,True,Escherichia coli,0.751993,0.99813,0.753402,8.35334,0.372702,0.659305,0.565296,2.889042,0.988459,56.2,Present
3,208962,GCF_016904755.1,NCBI RefSeq,True,Complete Genome,Escherichia albertii,Sample 167,True,Escherichia albertii,0.733252,0.997331,0.735214,8.08178,0.012326,0.029806,0.413548,2.454837,0.966724,40.66,Absent
4,1499973,GCF_002900365.1,NCBI RefSeq,False,Complete Genome,Escherichia marmotae,HT073016,True,Escherichia marmotae,0.730958,0.997523,0.732773,8.209433,0.019633,0.059327,0.330929,3.115106,0.965896,32.52,Absent
5,564,GCF_013892435.1,NCBI RefSeq,True,Complete Genome,Escherichia fergusonii,RHB19-C05,True,Escherichia fergusonii,0.614433,0.993791,0.618272,8.270563,0.0,0.0,0.0,0.0,0.0,0.0,Absent
6,2044467,GCF_005843885.1,NCBI RefSeq,False,Complete Genome,Escherichia sp. E4742,E4742,True,Escherichia sp. E4742,0.734095,0.997641,0.73583,8.240902,0.038282,0.086245,0.443874,2.355988,0.971706,43.75,Present
7,546,GCF_003812345.1,NCBI RefSeq,True,Complete Genome,Citrobacter freundii,FDAARGOS_549,True,Citrobacter freundii,0.478365,0.96891,0.493714,7.255633,0.0,0.0,0.0,0.0,0.0,0.0,Absent
8,623,GCF_000006925.2,Microbial Genome Center of ChMPH,True,Complete Genome,Shigella flexneri 2a str. 301,301,True,Shigella flexneri,0.794622,0.998802,0.795576,8.465387,0.387796,0.633569,0.612083,2.588805,0.990129,60.91,Present
9,621,GCF_001027225.1,NCBI RefSeq,False,Complete Genome,Shigella boydii,ATCC 9210,True,Shigella boydii,0.82104,0.999083,0.821793,8.519167,0.459718,0.715478,0.642533,2.734124,0.991279,63.97,Present
