In [1]:
import os
import subprocess
import sys
import math
from collections import defaultdict
from Bio import Entrez
import time

import pandas as pd
import json
from ete3 import NCBITaxa
from Bio import SeqIO

In [13]:
# Put ../utils (relative to the notebook's working directory) at the front of the import
# path; the three modules imported below are presumably the project's helpers living there.
sys.path.insert(0, '../utils')
from reference_finder import download_reference_genome, unpack, cat_reference_genome
from alignment import run_minimap2, sort_samfile, samtools_calculate_depth
from summary import alignment_1_summary, alignment_2_summary

In [3]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and pandas chained-assignment
# warnings raised by later cells — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [4]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom={2, 4751, 2157, 10239}):
    """Resolve a taxid to the species-rank taxid found in its lineage.

    Parameters
    ----------
    taxid : int
        Taxonomy ID to resolve.
    ncbi_taxa_db : object
        Taxonomy handle exposing ``get_lineage(taxid)`` and ``get_rank(taxids)``
        (an ``ete3.NCBITaxa`` instance in this notebook).
    valid_kingdom : iterable of int, optional
        Taxids of which at least one must appear in the lineage for the taxid
        to be accepted. The default set is only read, never mutated.

    Returns
    -------
    int or None
        The lineage member whose rank is ``'species'``; ``None`` when the
        lineage is missing/empty, shares no taxid with ``valid_kingdom``, or
        contains no species-rank entry.

    Notes
    -----
    Exceptions raised by ``ncbi_taxa_db`` itself (e.g. for an unknown taxid)
    are propagated unchanged so callers can keep handling them.
    """
    lineage = ncbi_taxa_db.get_lineage(taxid)
    # NOTE(review): ete3's get_lineage appears to return None (rather than raise) for a
    # falsy taxid such as 0; guard so that case yields None instead of a TypeError.
    if not lineage:
        return None

    # isdisjoint accepts any iterable, so valid_kingdom need not be a set.
    if set(lineage).isdisjoint(valid_kingdom):
        return None

    rank_by_taxid = ncbi_taxa_db.get_rank(lineage)
    for lineage_taxid, rank in rank_by_taxid.items():
        if rank == 'species':
            return lineage_taxid
    return None

In [5]:
def filter_seqscreen_taxonomy(seqscreen_output, min_frac, ncbi_taxa_db, valid_kingdom):
    """Pick the species whose share of classified rows reaches ``min_frac``.

    Reads ``taxonomic_identification/taxonomic_assignment/taxonomic_results.txt``
    (tab-separated) under ``seqscreen_output``, maps every value of its
    ``taxid`` column to a species taxid via ``get_species_taxid`` and counts
    rows per species.

    Parameters
    ----------
    seqscreen_output : str
        Directory containing the results file described above.
    min_frac : float
        Minimum fraction of all rows in the results file that a species must
        account for to be returned.
    ncbi_taxa_db : object
        Taxonomy handle forwarded to ``get_species_taxid``.
    valid_kingdom : set of int
        Forwarded to ``get_species_taxid``.

    Returns
    -------
    list of int
        Species taxids meeting the threshold, in order of first occurrence.
    """
    results_path = os.path.join(seqscreen_output,
                                'taxonomic_identification',
                                'taxonomic_assignment',
                                'taxonomic_results.txt')
    classification_result_df = pd.read_csv(results_path, sep='\t')
    total_read_count = classification_result_df.shape[0]

    species_read_counts = defaultdict(int)
    species_by_taxid = {}  # memoises taxid -> species taxid (or None)
    skipped_rows = 0       # rows whose taxid could not be parsed or resolved
    for raw_taxid in classification_result_df['taxid']:
        try:
            taxid = int(raw_taxid)
            if taxid not in species_by_taxid:
                species_by_taxid[taxid] = get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom)
            species_taxid = species_by_taxid[taxid]
        except ValueError:
            skipped_rows += 1
            continue

        if species_taxid is not None:
            species_read_counts[species_taxid] += 1

    # The denominator is every row of the file, including skipped/unresolved ones.
    min_reads = min_frac * total_read_count
    return [species_taxid
            for species_taxid, read_count in species_read_counts.items()
            if read_count >= min_reads]

In [6]:
def prepare_reference_genomes(taxid_queries, output_directory, ncbi_taxa_db):
    """Download, unpack and catalogue one reference genome per queried taxid.

    Parameters
    ----------
    taxid_queries : iterable of int
        Taxids passed one by one to ``download_reference_genome``.
    output_directory : str
        Root directory for results. Downloads go to ``<output_directory>/ncbi_downloads``
        and the metadata table is written to ``<output_directory>/reference_metadata.csv``.
        Created (with parents) if it does not exist yet.
    ncbi_taxa_db : object
        Taxonomy handle exposing ``get_taxid_translator``; used to fill the
        ``Species`` column.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``taxid_queries`` with the download metadata columns
        plus ``Species``.
    """
    working_dir = os.path.join(output_directory, 'ncbi_downloads')
    # exist_ok avoids the exists()/mkdir() race and makes re-running the cell safe;
    # makedirs also creates output_directory itself when it is missing.
    os.makedirs(working_dir, exist_ok=True)

    download_result = [download_reference_genome(taxid, working_dir)
                       for taxid in taxid_queries]

    unpack(working_dir, output_directory)
    reference_metadata = pd.DataFrame(download_result,
                                      columns=['Taxonomy ID',
                                               'Assembly Accession ID',
                                               'Source Database',
                                               'Is Representative',
                                               'Assembly Level',
                                               'Organism of Assembly',
                                               'Strain',
                                               'Downloaded'])

    # Translate each taxid to the name stored in the taxonomy database.
    reference_metadata['Species'] = [
        ncbi_taxa_db.get_taxid_translator([taxid])[taxid]
        for taxid in reference_metadata['Taxonomy ID']
    ]

    reference_metadata.to_csv(os.path.join(output_directory, 'reference_metadata.csv'), index=False)
    cat_reference_genome(reference_metadata,
                         output_directory,
                         reference_genome_path=os.path.join(output_directory, 'reference_genomes'))

    return reference_metadata

## Load ETE3 NCBITaxa

In [7]:
# Root of the local SeqScreen database; machine-specific absolute path.
database_path = '/home/dbs/SeqScreenDB_21.4'

# Paths derived from the database root. Only `ete3db` is used by the cells visible in
# this notebook; `sequences_db_f` and `mapping_f` are defined but not referenced below.
ete3db = os.path.join(database_path, "reference_inference", "taxa.sqlite")
sequences_db_f = os.path.join(database_path, "bowtie2", "blacklist.seqs.nt.fna")
mapping_f = os.path.join(database_path, "reference_inference", "taxid2seqid.pickle")

# Taxonomy handle backed by the SQLite file above; passed to the helper functions.
ncbi_taxa_db = NCBITaxa(dbfile=ete3db)

In [8]:
# Directory that receives downloads, reference genomes and CSV summaries
# (machine-specific absolute path).
output_directory = "/home/Users/yl181/seqscreen_nano/test"

In [9]:
# Reads to align and the SeqScreen output directory whose taxonomic results are filtered
# (machine-specific absolute paths).
input_fasta = '/home/Users/yl181/seqscreen_nano/input_datasets/ERR3152364.downsampled.fasta'
seqscreen_output = "/home/Users/yl181/seqscreen_nano/output_datasets/ZymoBIOMICS.STD.Even.ont.seqscreen"
threads = 40               # passed to run_minimap2 and sort_samfile
min_frac = 0.002           # minimum fraction of result rows for a species to be kept
min_mapq = 20              # passed to sort_samfile (presumably a MAPQ cutoff — confirm)
min_coverage_score = 0.7   # threshold applied to the 'Coverage Score' column

In [9]:
# Uncomment to refresh the local NCBI taxonomy database:
# ncbi_taxa_db.update_taxonomy_database()

## Filtering SeqScreen Taxonomic Assignments

In [None]:
# Species taxids that account for at least `min_frac` of the SeqScreen result rows.
# valid_kingdom is presumably the NCBI taxids for Bacteria (2), Fungi (4751),
# Archaea (2157) and Viruses (10239) — TODO confirm.
taxid_queries = filter_seqscreen_taxonomy(seqscreen_output, 
                                          min_frac=min_frac, 
                                          ncbi_taxa_db=ncbi_taxa_db, 
                                          valid_kingdom={2, 4751, 2157, 10239})

## Fetch Reference Genomes

In [None]:
# Download and unpack a reference genome for every selected taxid; also writes
# reference_metadata.csv into output_directory.
reference_metadata = prepare_reference_genomes(taxid_queries, output_directory, ncbi_taxa_db)

In [11]:
# Reload the table written by prepare_reference_genomes so the cells below can run
# without repeating the download step.
reference_metadata = pd.read_csv(os.path.join(output_directory, 'reference_metadata.csv'))

In [12]:
# First pass: for every assembly flagged as downloaded, align the reads against that
# assembly's own FASTA, then sort the alignment and compute depth.
# NOTE(review): downloaded_assemblies is a boolean-mask slice of reference_metadata; if a
# later helper adds columns to it, pandas may emit a SettingWithCopyWarning (currently
# hidden by the global warnings filter) — consider appending .copy().
downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']]
for assembly_id in downloaded_assemblies['Assembly Accession ID']:
    reference_fasta = os.path.join(output_directory, 'reference_genomes', f'{assembly_id}.fasta')
    run_minimap2(input_fasta, reference_fasta, assembly_id, output_directory, threads=threads)
    sort_samfile(assembly_id, output_directory, min_mapq, threads)
    samtools_calculate_depth(assembly_id, output_directory)

In [13]:
# Show the reference genome metadata table.
reference_metadata

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species
0,1280,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,NCTC 8325,True,Staphylococcus aureus
1,1613,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,SCB0035,True,Limosilactobacillus fermentum
2,1351,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,LD33,True,Enterococcus faecalis
3,1423,GCF_000009045.1,BSNR,True,Complete Genome,Bacillus subtilis subsp. subtilis str. 168,168,True,Bacillus subtilis
4,287,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,PAO1,True,Pseudomonas aeruginosa
5,5207,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,VNII,True,Cryptococcus neoformans
6,1639,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,EGD-e,True,Listeria monocytogenes
7,1642,GCF_009648575.1,NCBI RefSeq,True,Complete Genome,Listeria innocua,CFSAN044836,True,Listeria innocua
8,562,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,True,Escherichia coli
9,1638,GCF_000763515.1,NCBI RefSeq,True,Complete Genome,Listeria ivanovii subsp. ivanovii,WSLC 3010,True,Listeria ivanovii


In [23]:
# Summarise the first-pass alignments; the result replaces downloaded_assemblies and is
# expected to carry the 'Coverage Score' column used by the filter below.
downloaded_assemblies = alignment_1_summary(downloaded_assemblies, output_directory)

In [29]:
#downloaded_assemblies.sort_values('Coverage Score', ascending=False)

In [31]:
# Accession IDs of the assemblies whose coverage score reaches min_coverage_score.
filtered_assemblies = (
    downloaded_assemblies
    .loc[downloaded_assemblies['Coverage Score'] >= min_coverage_score, 'Assembly Accession ID']
    .tolist()
)

In [33]:
def merge_reference_fasta(assembly_ids, output_directory):
    """Concatenate per-assembly FASTA files into a single ``merged.fasta``.

    Parameters
    ----------
    assembly_ids : iterable of str
        Assembly identifiers; each one is read from
        ``<output_directory>/reference_genomes/<assembly_id>.fasta``.
    output_directory : str
        Directory containing the ``reference_genomes`` folder.

    Returns
    -------
    str
        Path of the written ``<output_directory>/reference_genomes/merged.fasta``.

    Raises
    ------
    FileNotFoundError
        If any input FASTA is missing; raised before ``merged.fasta`` is created
        or overwritten.
    """
    genome_dir = os.path.join(output_directory, 'reference_genomes')
    merged_fasta = os.path.join(genome_dir, 'merged.fasta')

    reference_fastas = [os.path.join(genome_dir, f'{assembly_id}.fasta')
                        for assembly_id in assembly_ids]

    # Validate inputs first: records are streamed below, so a missing file discovered
    # midway would otherwise leave a truncated merged.fasta behind.
    for reference_fasta in reference_fastas:
        if not os.path.isfile(reference_fasta):
            raise FileNotFoundError(reference_fasta)

    def _iter_records():
        # Yield records file by file so only one record is held in memory at a time.
        for reference_fasta in reference_fastas:
            with open(reference_fasta, "r") as handle:
                yield from SeqIO.parse(handle, "fasta")

    with open(merged_fasta, "w") as output_handle:
        SeqIO.write(_iter_records(), output_handle, "fasta")

    return merged_fasta

In [34]:
# Second pass: merge the FASTA files of the assemblies that passed the coverage filter
# and repeat align / sort / depth against the merged reference, using 'merged' in place
# of an assembly ID.
reference_fasta = merge_reference_fasta(filtered_assemblies, output_directory)
run_minimap2(input_fasta, reference_fasta, 'merged', output_directory, threads=threads)
sort_samfile('merged', output_directory, min_mapq, threads)
samtools_calculate_depth('merged', output_directory)

In [10]:
# Replace downloaded_assemblies with the table stored in alignment.csv.
# NOTE(review): no visible cell writes alignment.csv — presumably alignment_1_summary
# produces it; confirm so the notebook survives Restart & Run All.
downloaded_assemblies = pd.read_csv(os.path.join(output_directory, 'alignment.csv'))

In [14]:
# Summarise the merged-reference alignment; the result replaces downloaded_assemblies
# and is expected to carry the 'CS2' column sorted on below.
downloaded_assemblies = alignment_2_summary(downloaded_assemblies, output_directory)

In [16]:
# Show the assemblies ordered by CS2, highest first (display only; nothing is reassigned).
downloaded_assemblies.sort_values('CS2', ascending=False)

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species,Breadth Coverage,Expected Coverage,Coverage Score,Depth Coverage,BC2,EC2,CS2,DC2
15,96241,GCF_006094475.1,NCBI RefSeq,True,Complete Genome,Bacillus subtilis subsp. spizizenii ATCC 6633 ...,ATCC 6633,True,Bacillus spizizenii,1.0,1.0,1.0,28.817028,1.0,1.0,1.0,28.435225
4,287,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,PAO1,True,Pseudomonas aeruginosa,0.969225,0.999858,0.969363,9.133267,0.969233,0.999831,0.969397,8.957587
5,5207,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,VNII,True,Cryptococcus neoformans,0.423306,0.441691,0.958376,1.376761,0.42334,0.441638,0.958569,1.376418
6,1639,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,EGD-e,True,Listeria monocytogenes,0.943656,1.0,0.943656,29.246736,0.932411,1.0,0.932411,26.44673
10,28901,GCF_000006945.2,Washington University Genome Sequencing Center,True,Complete Genome,Salmonella enterica subsp. enterica serovar Ty...,LT2,True,Salmonella enterica,0.929463,1.0,0.929463,20.590929,0.925799,0.999995,0.925804,13.176773
0,1280,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,NCTC 8325,True,Staphylococcus aureus,0.923386,1.0,0.923386,27.298416,0.923337,1.0,0.923337,26.221774
14,4932,GCF_000146045.2,Saccharomyces Genome Database,True,Complete Genome,Saccharomyces cerevisiae S288C,S288C,True,Saccharomyces cerevisiae,0.619563,0.678052,0.91374,1.829052,0.619487,0.677297,0.914646,1.825484
2,1351,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,LD33,True,Enterococcus faecalis,0.894675,1.0,0.894675,26.467055,0.894216,1.0,0.894216,25.709091
1,1613,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,SCB0035,True,Limosilactobacillus fermentum,0.863003,1.0,0.863003,34.935137,0.863029,1.0,0.863029,34.607012
8,562,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,True,Escherichia coli,0.783966,1.0,0.783966,20.741795,0.651192,0.990895,0.657175,7.212012
