In [20]:
import os
import subprocess

from collections import defaultdict
from Bio import Entrez
import time

import pandas as pd
import json
from ete3 import NCBITaxa

In [21]:
from utils.reference_finder import download_reference_genome, unpack, cat_reference_genome

In [22]:
from utils.alignment import run_minimap2, sort_samfile, calculate_depth

In [23]:
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides warnings raised by the libraries used below;
# consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

In [24]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom={2, 4751, 2157, 10239}):
    """Return the species-rank taxid found in the lineage of ``taxid``.

    Parameters
    ----------
    taxid : int
        NCBI taxonomy identifier to resolve.
    ncbi_taxa_db : object
        Taxonomy backend exposing ``get_lineage(taxid)`` and
        ``get_rank(taxids)`` (an ``ete3.NCBITaxa`` instance in this notebook).
    valid_kingdom : iterable of int, optional
        Taxids of the top-level groups to keep; any iterable (set, list,
        tuple) is accepted. The default holds the NCBI taxids for
        Bacteria (2), Fungi (4751), Archaea (2157) and Viruses (10239).

    Returns
    -------
    int or None
        The taxid whose rank is ``'species'`` within the lineage, or ``None``
        when the lineage is missing/empty, shares no member with
        ``valid_kingdom``, or contains no species-rank entry.

    Notes
    -----
    Exceptions raised by the backend itself are not intercepted and
    propagate to the caller.
    """
    lineage = ncbi_taxa_db.get_lineage(taxid)
    # The backend may hand back None/empty instead of raising; without this
    # guard ``set(None)`` would raise TypeError.
    if not lineage:
        return None

    # isdisjoint() accepts any iterable, so a list such as
    # [2, 4751, 2157, 10239] works as well as the default set, which is
    # never mutated here.
    if set(lineage).isdisjoint(valid_kingdom):
        return None

    taxid2rank_dict = ncbi_taxa_db.get_rank(lineage)
    for lineage_taxid, rank in taxid2rank_dict.items():
        if rank == 'species':
            return lineage_taxid
    return None

## Load ETE3 NCBITaxa

In [29]:
# Taxonomy handle (ete3 NCBITaxa) used for every lineage, rank and name lookup below.
ncbi_taxa_db = NCBITaxa()

In [30]:
# ncbi_taxa_db.update_taxonomy_database()  # uncomment to refresh the local NCBI taxonomy database

In [31]:
# NCBI taxids, in list order: bacteria, fungi, archaea, viruses.
# NOTE(review): this list is not passed to get_species_taxid below, which falls
# back on its own default set holding the same four taxids.
valid_kingdom = [2, 4751, 2157, 10239]

## Filtering Seqscreen Taxonomic Assignment

In [32]:
# Root directory of the SeqScreen run whose taxonomic results are read below.
# NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
seqscreen_output = "/home/Users/yl181/seqscreen_nano/ZymoBIOMICS.STD.Even.ont.seqscreen"

In [33]:
# Read the tab-separated taxonomic assignment table from the SeqScreen output tree.
classification_result_df = pd.read_csv(
    os.path.join(
        seqscreen_output,
        'taxonomic_identification',
        'taxonomic_assignment',
        'taxonomic_results.txt',
    ),
    sep='\t',
)

In [34]:
# Number of rows in the classification table (presumably one row per read;
# confirm against the SeqScreen output format). len() is used instead of
# unpacking .shape into `_`, because assigning `_` rebinds IPython's
# last-output variable for the rest of the session.
total_read_count = len(classification_result_df)

In [35]:
# Tally rows per species-level taxid. taxid_species_lookup memoises the
# taxid -> species taxid mapping so each distinct taxid is resolved once;
# rows whose taxid raises ValueError (during int() conversion or inside the
# taxonomy lookup) are only counted in error_count.
taxid_count_dict = defaultdict(int)
taxid_species_lookup = {}
error_count = 0
for raw_taxid in classification_result_df['taxid']:
    try:
        taxid = int(raw_taxid)
        if taxid not in taxid_species_lookup:
            taxid_species_lookup[taxid] = get_species_taxid(taxid, ncbi_taxa_db)
        species_taxid = taxid_species_lookup[taxid]
    except ValueError:
        error_count += 1
        continue

    # None means the taxid was filtered out or has no species-rank ancestor.
    if species_taxid is not None:
        taxid_count_dict[species_taxid] += 1

In [36]:
# Minimum fraction of all rows a species must account for to be kept.
min_frac = 0.002

# Species taxids whose tally reaches the threshold, in tally-dict order.
taxid_queries = [
    species_taxid
    for species_taxid, read_count in taxid_count_dict.items()
    if read_count >= min_frac * total_read_count
]

In [37]:
# Absolute count a species must reach to pass the abundance filter above.
min_frac * total_read_count

867.288

In [38]:
# Number of species taxids that passed the abundance filter.
len(taxid_queries)

21

## Fetch Reference Genomes

In [39]:
# Directory under which the download folder, the reference genomes and the metadata CSV are placed.
# NOTE(review): absolute, machine-specific path; adjust before re-running elsewhere.
output_directory = "/home/Users/yl181/seqscreen_nano/ZymoBIOMICS.STD.Even.ont.minimap2"

In [40]:
# Sub-directory passed to the download and unpack helpers below.
working_dir = os.path.join(output_directory, 'ncbi_downloads') 

In [41]:
# Create the download directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# removes the check-then-create race of os.path.exists() + os.mkdir().
os.makedirs(working_dir, exist_ok=True)

In [42]:
# Call the download helper once per retained species taxid, in order, and keep
# each call's return value for the metadata table built later.
download_result = [
    download_reference_genome(species_taxid, working_dir)
    for species_taxid in taxid_queries
]

1280       	 GCF_000013425.1
1613       	 GCF_022819245.1
1351       	 GCF_001598635.1
1423       	 GCF_000009045.1
287        	 GCF_000006765.1
5207       	 GCA_022832995.1
1639       	 GCF_000196035.1
1642       	 GCF_009648575.1
562        	 GCF_000008865.2
1638       	 GCF_000763515.1
28901      	 GCF_000006945.2
1392       	 GCF_000008445.1
623        	 GCF_000006925.2
573        	 GCF_000240185.1
4932       	 GCF_000146045.2
96241      	 GCF_006094475.1
1590       	 GCF_003269405.1
1352       	 GCF_009734005.1
294        	 GCF_000730425.1
1643       	 GCF_900187315.1
176275     	 GCA_014607475.1


In [43]:
# Helper from utils.reference_finder.
# NOTE(review): presumably extracts the archives in working_dir into output_directory; confirm against the helper.
unpack(working_dir, output_directory)


21 archives were successfully processed.


In [45]:
# One row per download_reference_genome() result, labelled with the
# metadata field names.
reference_metadata = pd.DataFrame(
    download_result,
    columns=[
        'Taxonomy ID',
        'Assembly Accession ID',
        'Source Database',
        'Is Representative',
        'Assembly Level',
        'Organism of Assembly',
        'Downloaded',
    ],
)

In [49]:
# Translate every taxonomy ID into its name, one lookup per row, and attach
# the names as the 'Species' column.
taxonomy_name = [
    ncbi_taxa_db.get_taxid_translator([species_taxid])[species_taxid]
    for species_taxid in reference_metadata['Taxonomy ID']
]

reference_metadata['Species'] = taxonomy_name

In [52]:
# Reorder the columns so that 'Species' sits right after 'Taxonomy ID'.
reference_metadata = reference_metadata[
    [
        'Taxonomy ID',
        'Species',
        'Assembly Accession ID',
        'Source Database',
        'Is Representative',
        'Assembly Level',
        'Organism of Assembly',
        'Downloaded',
    ]
]

In [53]:
# Persist the metadata table next to the other outputs (row index omitted),
# then display it as the cell result.
reference_metadata.to_csv(os.path.join(output_directory, 'reference_metadata.csv'), index=False)
reference_metadata

Unnamed: 0,Taxonomy ID,Species,Assembly Accession ID,Source Database,Is Representative,Assembly Level,Organism of Assembly,Downloaded
0,1280,Staphylococcus aureus,GCF_000013425.1,University of Oklahoma Health Sciences Center,True,Complete Genome,Staphylococcus aureus subsp. aureus NCTC 8325,True
1,1613,Limosilactobacillus fermentum,GCF_022819245.1,NCBI RefSeq,True,Complete Genome,Limosilactobacillus fermentum,True
2,1351,Enterococcus faecalis,GCF_001598635.1,NCBI RefSeq,False,Complete Genome,Enterococcus faecalis,True
3,1423,Bacillus subtilis,GCF_000009045.1,BSNR,True,Complete Genome,Bacillus subtilis subsp. subtilis str. 168,True
4,287,Pseudomonas aeruginosa,GCF_000006765.1,PathoGenesis Corporation,True,Complete Genome,Pseudomonas aeruginosa PAO1,True
5,5207,Cryptococcus neoformans,GCA_022832995.1,Broad Institute,False,Complete Genome,Cryptococcus neoformans,True
6,1639,Listeria monocytogenes,GCF_000196035.1,European Consortium,True,Complete Genome,Listeria monocytogenes EGD-e,True
7,1642,Listeria innocua,GCF_009648575.1,NCBI RefSeq,True,Complete Genome,Listeria innocua,True
8,562,Escherichia coli,GCF_000008865.2,GIRC,True,Complete Genome,Escherichia coli O157:H7 str. Sakai,True
9,1638,Listeria ivanovii,GCF_000763515.1,NCBI RefSeq,True,Complete Genome,Listeria ivanovii subsp. ivanovii,True


In [54]:
# Helper from utils.reference_finder, pointed at <output_directory>/reference_genomes.
# NOTE(review): presumably concatenates each assembly's sequences into one FASTA per accession; confirm against the helper.
cat_reference_genome(reference_metadata, output_directory, reference_genome_path=os.path.join(output_directory, 'reference_genomes'))

In [31]:
# reference_genome_path = os.path.join(output_directory, 'reference_genomes')

In [32]:
# input_fastq = '/home/Users/yl181/seqscreen_nano/ZymoBIOMICS.STD.Even.ont.raw_sequences/ERR3152364.downsampled.fastq'

In [33]:
# downloaded_assemblies = reference_metadata[reference_metadata['Downloaded']]

# num_cores = 20

# for assembly_id in downloaded_assemblies['Assembly Accession ID']:
#     reference_fasta = os.path.join(reference_genome_path, f'{assembly_id}.fasta')
#     run_minimap2(input_fastq, reference_fasta, assembly_id, output_directory, threads=num_cores)
#     sort_samfile(assembly_id, output_directory, num_cores)
#     calculate_depth(assembly_id, output_directory)