In [1]:
import os
import subprocess
import sys

from collections import defaultdict
from Bio import Entrez
import time

import pandas as pd
import json
from ete3 import NCBITaxa

In [2]:
sys.path.insert(0, '../utils')
from reference_finder import download_reference_genome, unpack, cat_reference_genome
from alignment import run_minimap2, sort_samfile, samtools_calculate_depth

In [3]:
import warnings
# Silence all warnings for the rest of the session: no category, message or
# module filter is given, so every Warning subclass is ignored.
# NOTE(review): this also hides deprecation/pandas warnings — consider narrowing.
warnings.filterwarnings("ignore")

In [4]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom={2, 4751, 2157, 10239}):
    """Resolve a taxid to the species-rank taxid found in its lineage.

    Parameters
    ----------
    taxid : int
        Taxonomy ID to resolve.
    ncbi_taxa_db : object
        Taxonomy handle exposing ``get_lineage(taxid)`` and
        ``get_rank(taxids)`` (the notebook passes an ete3 ``NCBITaxa``).
    valid_kingdom : iterable of int
        Taxids of which at least one must occur in the lineage for the
        taxid to be accepted. The default is presumably Bacteria, Fungi,
        Archaea and Viruses in NCBI taxonomy — verify. It is only read,
        never mutated.

    Returns
    -------
    int or None
        The lineage member whose rank is ``'species'``; ``None`` when the
        lineage is missing/empty, contains none of ``valid_kingdom``, or has
        no species-rank entry.
    """
    lineage = ncbi_taxa_db.get_lineage(taxid)

    # ete3's get_lineage can return None (instead of raising) for a falsy
    # taxid such as 0; set(None) would raise a TypeError that callers in this
    # notebook do not catch, so treat it as "no species".
    if not lineage:
        return None

    # isdisjoint accepts any iterable, so valid_kingdom may be a list or tuple.
    if set(lineage).isdisjoint(valid_kingdom):
        return None

    for lineage_taxid, rank in ncbi_taxa_db.get_rank(lineage).items():
        if rank == 'species':
            return lineage_taxid
    return None

In [5]:
def filter_seqscreen_taxonomy(seqscreen_output, min_frac, ncbi_taxa_db, valid_kingdom):
    """Pick the species that account for at least ``min_frac`` of all reads.

    Reads ``taxonomic_identification/taxonomic_assignment/taxonomic_results.txt``
    (tab-separated) under ``seqscreen_output``, maps every value of its
    ``taxid`` column to a species taxid via ``get_species_taxid`` and counts
    rows per species.

    Parameters
    ----------
    seqscreen_output : str
        Directory containing the results file described above.
    min_frac : float
        A species is kept when its row count is >= ``min_frac`` times the
        total number of rows in the file (skipped rows included).
    ncbi_taxa_db, valid_kingdom
        Forwarded unchanged to ``get_species_taxid``.

    Returns
    -------
    list of int
        Species taxids meeting the threshold, in first-seen order.
    """
    results_path = os.path.join(seqscreen_output,
                                'taxonomic_identification',
                                'taxonomic_assignment',
                                'taxonomic_results.txt')
    assignments = pd.read_csv(results_path, sep='\t')
    total_read_count = assignments.shape[0]

    reads_per_species = defaultdict(int)
    species_cache = {}  # taxid -> species taxid (or None), one lookup per distinct taxid
    skipped_rows = 0    # tallied only; the count is not returned or reported

    for raw_taxid in assignments['taxid']:
        try:
            read_taxid = int(raw_taxid)
            if read_taxid not in species_cache:
                species_cache[read_taxid] = get_species_taxid(read_taxid,
                                                              ncbi_taxa_db,
                                                              valid_kingdom)
            resolved = species_cache[read_taxid]
        except ValueError:
            # Raised by int() on a non-integer value or by the lookup itself;
            # the row is skipped either way.
            skipped_rows += 1
            continue

        if resolved is not None:
            reads_per_species[resolved] += 1

    return [species_id
            for species_id, n_reads in reads_per_species.items()
            if n_reads >= min_frac * total_read_count]

In [6]:
def prepare_reference_genomes(taxid_queries, output_directory, ncbi_taxa_db):
    """Download a reference genome per taxid and write the metadata table.

    Steps, in order: download each taxid into ``<output_directory>/ncbi_downloads``,
    call ``unpack`` on that directory, build the metadata frame, add a
    ``Species`` name column, write ``reference_metadata.csv`` (no index) into
    ``output_directory`` and call ``cat_reference_genome`` with
    ``<output_directory>/reference_genomes``.

    Parameters
    ----------
    taxid_queries : iterable of int
        Taxids to fetch references for.
    output_directory : str
        Destination directory; created if it does not exist yet.
    ncbi_taxa_db : object
        Taxonomy handle exposing ``get_taxid_translator(taxids)``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``taxid_queries`` with the eight download
        columns plus ``Species``.
        NOTE(review): assumes ``download_reference_genome`` returns an
        8-field record in the column order below — confirm in reference_finder.

    Raises
    ------
    KeyError
        If a ``Taxonomy ID`` has no name in ``ncbi_taxa_db``.
    """
    working_dir = os.path.join(output_directory, 'ncbi_downloads')
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also
    # creates a missing output_directory and is safe to re-run.
    os.makedirs(working_dir, exist_ok=True)

    download_result = [download_reference_genome(taxid, working_dir)
                       for taxid in taxid_queries]

    unpack(working_dir, output_directory)
    reference_metadata = pd.DataFrame(download_result,
                                      columns=['Taxonomy ID',
                                               'Assembly Accession ID',
                                               'Source Database',
                                               'Is Representative',
                                               'Assembly Level',
                                               'Organism of Assembly',
                                               'Strain',
                                               'Downloaded'])

    # Translate each taxid to its scientific name for readability.
    reference_metadata['Species'] = [
        ncbi_taxa_db.get_taxid_translator([taxid])[taxid]
        for taxid in reference_metadata['Taxonomy ID']
    ]

    reference_metadata.to_csv(os.path.join(output_directory, 'reference_metadata.csv'), index=False)
    cat_reference_genome(reference_metadata,
                         output_directory,
                         reference_genome_path=os.path.join(output_directory, 'reference_genomes'))

    return reference_metadata

## Load ETE3 NCBITaxa

In [7]:
# Base directory under which the database files below are located.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
database_path = '/home/dbs/SeqScreenDB_21.4'

# SQLite file handed to NCBITaxa below.
ete3db = os.path.join(database_path, "reference_inference", "taxa.sqlite")
# NOTE(review): sequences_db_f and mapping_f are not used by any cell shown
# in this part of the notebook — confirm they are still needed.
sequences_db_f = os.path.join(database_path, "bowtie2", "blacklist.seqs.nt.fna")
mapping_f = os.path.join(database_path, "reference_inference", "taxid2seqid.pickle")

# Taxonomy handle passed to the filtering and reference-preparation helpers.
ncbi_taxa_db = NCBITaxa(dbfile=ete3db)

In [8]:
# Directory passed to the reference download and alignment steps; the
# reference metadata CSV is written to and read back from here.
output_directory = "/home/Users/yl181/seqscreen_nano/test"

In [10]:
# Reads (FASTA) that are aligned against each downloaded reference.
input_fasta = '/home/Users/yl181/seqscreen_nano/input_datasets/ERR3152364.downsampled.fasta'
# Directory whose taxonomic_results.txt is filtered to pick the species.
seqscreen_output = "/home/Users/yl181/seqscreen_nano/output_datasets/ZymoBIOMICS.STD.Even.ont.seqscreen"
# Thread count handed to run_minimap2 and sort_samfile.
threads = 40
# Minimum fraction of all rows in the results file a species must reach to be kept.
min_frac = 0.002

In [9]:
# Optional (left disabled): refresh the taxonomy database behind `ncbi_taxa_db`.
# ncbi_taxa_db.update_taxonomy_database()

## Filtering SeqScreen Taxonomic Assignments

In [None]:
# Species taxids whose read count reaches min_frac of all rows in the results
# file; only lineages containing one of the valid_kingdom taxids are counted.
taxid_queries = filter_seqscreen_taxonomy(seqscreen_output, 
                                          min_frac=min_frac, 
                                          ncbi_taxa_db=ncbi_taxa_db, 
                                          valid_kingdom={2, 4751, 2157, 10239})

## Fetch Reference Genomes

In [None]:
# Downloads one reference per selected taxid and writes reference_metadata.csv
# into output_directory.
reference_metadata = prepare_reference_genomes(taxid_queries, output_directory, ncbi_taxa_db)

In [11]:
# Reload the metadata CSV from output_directory (the same path that
# prepare_reference_genomes writes), replacing any in-memory frame.
reference_metadata = pd.read_csv(os.path.join(output_directory, 'reference_metadata.csv'))

In [24]:
# Keep only rows whose 'Downloaded' flag is true.
downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']]
# For every downloaded assembly: align the input reads to it, sort the
# resulting alignment, then compute depth — in that order.
for assembly_id in downloaded_assemblies['Assembly Accession ID']:
    # Reference FASTA is expected at <output_directory>/reference_genomes/<assembly_id>.fasta.
    reference_fasta = os.path.join(output_directory, 'reference_genomes', f'{assembly_id}.fasta')
    run_minimap2(input_fasta, reference_fasta, assembly_id, output_directory, threads=threads)
    sort_samfile(assembly_id, output_directory, threads)
    samtools_calculate_depth(assembly_id, output_directory)

In [12]:
# Display the reference metadata table.
reference_metadata

Unnamed: 0,Taxonomy ID,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Strain,Downloaded,Species
0,1280,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,NCTC 8325,True,Staphylococcus aureus
1,1613,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,SCB0035,True,Limosilactobacillus fermentum
2,1351,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,LD33,True,Enterococcus faecalis
3,1423,GCF_000009045.1,BSNR,True,Complete Genome,Bacillus subtilis subsp. subtilis str. 168,168,True,Bacillus subtilis
4,287,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,PAO1,True,Pseudomonas aeruginosa
5,5207,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,VNII,True,Cryptococcus neoformans
6,1639,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,EGD-e,True,Listeria monocytogenes
7,1642,GCF_009648575.1,NCBI RefSeq,True,Complete Genome,Listeria innocua,CFSAN044836,True,Listeria innocua
8,562,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,Sakai substr. RIMD 0509952,True,Escherichia coli
9,1638,GCF_000763515.1,NCBI RefSeq,True,Complete Genome,Listeria ivanovii subsp. ivanovii,WSLC 3010,True,Listeria ivanovii
