In [None]:
import os
import subprocess
import sys

from collections import defaultdict
from Bio import Entrez
import time

import pandas as pd
import json
from ete3 import NCBITaxa

In [None]:
# Put ../utils at the front of the import path so the project helper modules
# imported below can be found. The path is relative, so it is resolved against
# the kernel's current working directory.
sys.path.insert(0, '../utils')
from reference_finder import download_reference_genome, unpack, cat_reference_genome
from alignment import run_minimap2, sort_samfile, calculate_depth

In [None]:
import warnings
# Silence every warning for the rest of the session: no category or module
# filter is given, so warnings from all libraries are suppressed.
# NOTE(review): consider narrowing this to the specific warning being avoided.
warnings.filterwarnings("ignore")

In [None]:
def get_species_taxid(taxid, ncbi_taxa_db, valid_kingdom={2, 4751, 2157, 10239}):
    """Return the species-rank taxid found in the lineage of ``taxid``.

    Parameters
    ----------
    taxid : int
        Taxid to resolve.
    ncbi_taxa_db : object
        Taxonomy handle exposing ``get_lineage(taxid)`` (list of ancestor
        taxids) and ``get_rank(taxids)`` (dict of taxid -> rank name), such
        as an ``ete3.NCBITaxa`` instance.
    valid_kingdom : iterable of int, optional
        The lineage must contain at least one of these taxids for a result to
        be returned. The default is 2 (Bacteria), 4751 (Fungi),
        2157 (Archaea) and 10239 (Viruses). Any iterable is accepted; it is
        copied into a set and never mutated.

    Returns
    -------
    int or None
        The taxid whose rank is ``'species'``, or ``None`` when the lineage is
        missing or empty, lies outside ``valid_kingdom``, or has no
        species-rank entry (e.g. ``taxid`` is a genus).

    Notes
    -----
    Exceptions raised by ``ncbi_taxa_db`` itself are not caught here and
    propagate to the caller.
    """
    lineage = ncbi_taxa_db.get_lineage(taxid)

    # get_lineage may hand back None instead of a list (ete3 does so for a
    # falsy taxid such as 0); set(None) would raise TypeError, so treat a
    # missing or empty lineage as "no species".
    if not lineage:
        return None

    # Coerce to a set so callers may pass a list or tuple as well.
    if set(valid_kingdom).isdisjoint(lineage):
        return None

    taxid2rank_dict = ncbi_taxa_db.get_rank(lineage)
    for lineage_taxid, rank in taxid2rank_dict.items():
        if rank == 'species':
            return lineage_taxid
    return None

In [None]:
# Shared taxonomy handle; it is passed to get_species_taxid for the lineage
# and rank lookups.
# NOTE(review): presumably backed by ete3's local NCBI taxonomy database,
# which is built on first use — confirm before running on a fresh machine.
ncbi_taxa_db = NCBITaxa()

In [None]:
# NCBI taxids of the accepted top-level groups, in list order:
# 2 = Bacteria, 4751 = Fungi, 2157 = Archaea, 10239 = Viruses.
# NOTE(review): no visible cell reads this list; get_species_taxid is called
# without it and falls back to its own default set of the same taxids.
valid_kingdom = [2, 4751, 2157, 10239]

In [None]:
# Root directory of the SeqScreen run being analysed; the taxonomic results
# table is read from a subdirectory of it.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
seqscreen_output = "/home/Users/yl181/seqscreen_nano/output_datasets/ZymoBIOMICS.STD.Even.ont.seqscreen"

In [None]:
# Load the tab-separated taxonomic assignment table written under the
# SeqScreen output directory (presumably one row per read — confirm).
classification_result_df = pd.read_csv(
    os.path.join(
        seqscreen_output,
        'taxonomic_identification',
        'taxonomic_assignment',
        'taxonomic_results.txt',
    ),
    sep='\t',
)

In [None]:
# Number of rows in the classification table (presumably one per read — confirm).
# len() is used instead of unpacking .shape into `_`: assigning to `_` in
# IPython rebinds the last-output variable and stops it from being updated.
total_read_count = len(classification_result_df)

In [None]:
# Tally rows per species-level taxid.
#   taxid_count_dict     species taxid -> number of rows resolved to it
#   taxid_species_lookup memo of row taxid -> species taxid (or None), so each
#                        distinct taxid is resolved through the database once
#   error_count          rows skipped because a ValueError was raised, either
#                        by int() on the raw value or during the lookup
taxid_count_dict = defaultdict(int)
taxid_species_lookup = dict()
error_count = 0
for taxid in classification_result_df['taxid']:
    try:
        taxid = int(taxid)
        if taxid not in taxid_species_lookup:
            # First sighting of this taxid: resolve it and remember the answer.
            taxid_species_lookup[taxid] = get_species_taxid(taxid, ncbi_taxa_db)
        species_taxid = taxid_species_lookup[taxid]
    except ValueError:
        error_count += 1
        continue

    # None means the taxid has no species-level ancestor in an accepted group.
    if species_taxid is not None:
        taxid_count_dict[species_taxid] += 1

In [None]:
# Row count tallied for species taxid 176275. .get() is used because indexing
# a defaultdict with a missing key would insert that key with a zero count,
# silently adding a species to the tally.
taxid_count_dict.get(176275, 0)

In [None]:
# Reverse lookup: print every row-level taxid that was resolved to species
# taxid 176275.
for key, mapped_species in taxid_species_lookup.items():
    if mapped_species == 176275:
        print(key)

In [None]:
# Summary statistics of the 'confidence' column for rows assigned taxid 1245745.
# NOTE(review): the comparison is against an int; if 'taxid' was parsed as
# strings this selects no rows — confirm the column dtype.
classification_result_df.loc[
    classification_result_df['taxid'] == 1245745, 'confidence'
].describe()