In [9]:
import polars as pl
import os, logging

In [10]:
def read_gtdbtk(output_directory, taxonomy_only=True, remove_empty_ranks=False):
    """Read GTDB-Tk classifications into a dict of genome name -> taxonomy string.

    Looks for ``gtdbtk.bac120.summary.tsv`` and ``gtdbtk.ar53.summary.tsv``
    inside ``output_directory`` and reads the ``user_genome`` and
    ``classification`` columns of each file that is present. Bacteria are
    read first, so if a genome appears in both files the archaeal entry wins.

    Parameters
    ----------
    output_directory : str
        Directory containing the GTDB-Tk summary TSV files.
    taxonomy_only : bool
        Must be True; anything else is not implemented.
    remove_empty_ranks : bool
        If True, drop rank placeholders that carry no name (e.g. a bare
        ``s__``) from each semicolon-separated taxonomy string.

    Returns
    -------
    dict
        Maps each ``user_genome`` value (always a str) to its taxonomy string.

    Raises
    ------
    NotImplementedError
        If ``taxonomy_only`` is False.
    FileNotFoundError
        If neither summary file exists in ``output_directory``.
    """
    if not taxonomy_only:
        raise NotImplementedError("Only taxonomy is supported for now")

    summary_files = [
        ('Bacteria', os.path.join(output_directory, 'gtdbtk.bac120.summary.tsv')),
        ('Archaea', os.path.join(output_directory, 'gtdbtk.ar53.summary.tsv')),
    ]
    # A single domain's file may legitimately be absent (presumably when no
    # genomes of that domain were classified -- confirm against the GTDB-Tk
    # docs), so only fail when there is nothing to read at all.
    if not any(os.path.isfile(path) for _, path in summary_files):
        raise FileNotFoundError(
            "No GTDB-Tk summary files (%s) found in %s" % (
                ', '.join(os.path.basename(path) for _, path in summary_files),
                output_directory))

    empty_ranks = frozenset(['d__', 'p__', 'c__', 'o__', 'f__', 'g__', 's__'])

    taxonomies = {}
    for domain, taxonomy_file in summary_files:
        if not os.path.isfile(taxonomy_file):
            logging.debug('No %s summary found at %s, skipping', domain, taxonomy_file)
            continue

        logging.debug('Reading taxonomy from %s', taxonomy_file)
        # infer_schema_length=0 reads every column as a string: genome names
        # that look numeric stay strings, and no dtype inference can fail on
        # a later row. Only the two columns used below are loaded.
        d = pl.read_csv(
            taxonomy_file,
            separator='\t',
            columns=['user_genome', 'classification'],
            infer_schema_length=0,
        )

        num_read = 0
        # select() pins the tuple order regardless of column order in the file.
        for genome, tax in d.select(['user_genome', 'classification']).rows():
            # A missing classification is stored as-is rather than crashing on split().
            if remove_empty_ranks and tax is not None:
                tax = ';'.join(x for x in tax.split(';') if x.strip() not in empty_ranks)
            taxonomies[genome] = tax
            num_read += 1
        logging.debug("Read %d taxonomies from %s, so %d total", num_read, domain, len(taxonomies))

    return taxonomies

In [12]:
# Load the GTDB-Tk taxonomy for the random-1000 batch, dropping rank
# placeholders that have no name so each string ends at the deepest
# assigned rank.
tk = read_gtdbtk(
    'gtdbtk_batchfile.random1000.gtdbtk_r207',
    remove_empty_ranks=True,
)
tk

{'GB_GCA_000195205.2': 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__CAILRJ01;s__CAILRJ01 sp010365165',
 'GB_GCA_000292915.1': 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiaceae;g__Burkholderia;s__Burkholderia cepacia_D',
 'GB_GCA_000411415.1': 'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Actinomycetaceae;g__Pauljensenia;s__Pauljensenia sp000411415',
 'GB_GCA_000468055.1': 'd__Bacteria;p__Spirochaetota;c__Spirochaetia;o__Treponematales;f__Treponemataceae;g__Treponema_C;s__Treponema_C lecithinolyticum',
 'GB_GCA_000514715.1': 'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Streptomycetales;f__Streptomycetaceae;g__Streptomyces;s__Streptomyces sp000514715',
 'GB_GCA_000564995.1': 'd__Bacteria;p__Actinobacteriota;c__Coriobacteriia;o__Coriobacteriales;f__Atopobiaceae;g__Lancefieldella;s__Lancefieldella sp000564995',
 'GB_GCA_001403755.1': 'd__Bacteria;p__Bacteroidota;c

In [14]:
# (total genomes, genomes with an 's__' rank, genomes with a 'g__' rank).
# Empty ranks were stripped when `tk` was loaded, so a prefix being present
# means that rank was actually assigned.
(
    len(tk),
    sum('s__' in lineage for lineage in tk.values()),
    sum('g__' in lineage for lineage in tk.values()),
)

(1000, 153, 857)

In [15]:
# Fraction of genomes whose taxonomy string contains a 'g__' rank.
sum('g__' in lineage for lineage in tk.values()) / len(tk)

0.857