In [1]:
import polars as pl

In [2]:
# GTDB r207 metadata (bacteria + archaea). Only the accession and taxonomy
# columns are kept from this release.
# NOTE(review): the bacterial table alone is read with a longer schema
# inference window and ignore_errors=True -- presumably some column fails
# dtype inference otherwise; confirm before changing.
gtdb_bac_metadata207 = pl.read_csv(
    '/work/microbiome/db/gtdb/gtdb_release207/bac120_metadata_r207.tsv',
    separator='\t',
    infer_schema_length=10000,
    ignore_errors=True,
).select(['accession', 'gtdb_taxonomy'])
gtdb_arc_metadata207 = pl.read_csv(
    '/work/microbiome/db/gtdb/gtdb_release207/ar53_metadata_r207.tsv',
    separator='\t',
).select(['accession', 'gtdb_taxonomy'])
r207 = pl.concat([gtdb_bac_metadata207, gtdb_arc_metadata207])

# GTDB r214 metadata (bacteria + archaea). Besides accession and taxonomy,
# keep the representative flag and the CheckM columns used for filtering later.
r214_columns = [
    'accession',
    'gtdb_taxonomy',
    'gtdb_representative',
    'checkm_completeness',
    'checkm_contamination',
]
gtdb_bac_metadata214 = pl.read_csv(
    '/work/microbiome/db/gtdb/gtdb_release214/bac120_metadata_r214.tsv',
    separator='\t',
).select(r214_columns)
gtdb_arc_metadata214 = pl.read_csv(
    '/work/microbiome/db/gtdb/gtdb_release214/ar53_metadata_r214.tsv',
    separator='\t',
).select(r214_columns)
r214 = pl.concat([gtdb_bac_metadata214, gtdb_arc_metadata214])

In [23]:
# Release-independent key: drop everything up to the last underscore (the
# greedy `.*_`), then everything from the first remaining dot onwards, so
# '<prefix>_<GCA|GCF>_<digits>.<version>' becomes '<digits>'.
# The same expression is applied to both releases so the keys stay comparable.
accession_minimal = (
    pl.col('accession')
    .str.replace(r'.*_', '')
    .str.replace(r'\..*', '')
    .alias('accession_minimal')
)
r207 = r207.with_columns(accession_minimal)
r214 = r214.with_columns(accession_minimal)

# Spot check of one genome: this one changed RS to GB or vice versa.
r207.filter(pl.col('accession_minimal') == '000292915')

accession,gtdb_taxonomy,accession_minimal
str,str,str
"""RS_GCF_0002929…","""d__Bacteria;p_…","""000292915"""


In [24]:
# Merge r207 and r214 metadata
merged = r207.join(r214, on='accession_minimal', how='outer').filter(pl.col('gtdb_representative')=='t').filter(pl.col('checkm_completeness')-5*pl.col('checkm_contamination') > 90)
merged.shape, merged[:4]

((35006, 9),
 shape: (4, 9)
 ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
 │ accession ┆ gtdb_taxo ┆ accession ┆ accession ┆ … ┆ gtdb_repr ┆ checkm_co ┆ checkm_co ┆ accessio │
 │ ---       ┆ nomy      ┆ _minimal  ┆ _right    ┆   ┆ esentativ ┆ mpletenes ┆ ntaminati ┆ n_minima │
 │ str       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ e         ┆ s         ┆ on        ┆ l_right  │
 │           ┆ str       ┆ str       ┆ str       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---      │
 │           ┆           ┆           ┆           ┆   ┆ str       ┆ f64       ┆ f64       ┆ str      │
 ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
 │ GB_GCA_00 ┆ d__Bacter ┆ 000007325 ┆ GB_GCA_00 ┆ … ┆ t         ┆ 99.95     ┆ 0.0       ┆ 00000732 │
 │ 0007325.1 ┆ ia;p__Fus ┆           ┆ 0007325.1 ┆   ┆           ┆           ┆           ┆ 5        │
 │           ┆ obacterio ┆           ┆           ┆   ┆

In [25]:
# Get a list of genomes which are new in r214
r214_news = merged.filter(pl.col('gtdb_taxonomy').is_null()).select('accession_right', 'gtdb_taxonomy_right')
r214_news.shape, r214_news[:4]

((7002, 2),
 shape: (4, 2)
 ┌────────────────────┬───────────────────────────────────┐
 │ accession_right    ┆ gtdb_taxonomy_right               │
 │ ---                ┆ ---                               │
 │ str                ┆ str                               │
 ╞════════════════════╪═══════════════════════════════════╡
 │ GB_GCA_000195205.2 ┆ d__Bacteria;p__Pseudomonadota;c_… │
 │ GB_GCA_001672075.1 ┆ d__Bacteria;p__Cyanobacteriota;c… │
 │ GB_GCA_003223065.1 ┆ d__Bacteria;p__Acidobacteriota;c… │
 │ GB_GCA_003934145.1 ┆ d__Bacteria;p__Chloroflexota;c__… │
 └────────────────────┴───────────────────────────────────┘)

In [26]:
# Listing of genome FASTA files for the r214 representatives. The file has no
# header row, so polars names its single column `column_1`.
# NOTE(review): the listing appears to hold one relative path per line
# ('./database/...'); confirm how `out` was generated.
paths = pl.read_csv(
    '~/m/db/gtdb/gtdb_release214/genomic_files_reps/gtdb_genomes_reps_r214/out',
    has_header=False,
)
paths = paths.select(
    # accession = file name without directories and without the FASTA suffix.
    # The dot is escaped so it only matches a literal '.', and an optional
    # '.gz' is accepted so compressed files yield the same accession.
    pl.col('column_1')
    .str.replace(r'.*/', '')
    .str.replace(r'_genomic\.fna(?:\.gz)?$', '')
    .alias('accession'),
    # Keep the untouched listing entry for building absolute paths later.
    pl.col('column_1').alias('path'),
)
paths[:3]

accession,path
str,str
"""GCF_916619195.…","""./database/GCF…"
"""GCF_916618895.…","""./database/GCF…"
"""GCF_916618745.…","""./database/GCF…"


In [27]:
def show_all(df, width=200, max_col_width=True):
    '''
    Print a polars dataframe without truncating its rows or columns.

    The display options are set on a ``pl.Config`` object used as a context
    manager, and the frame is printed inside that ``with`` block.

    Parameters
    ----------
    df : pl.DataFrame
        The dataframe to print.
    width : int, optional
        Table width in characters; also used as the string-length limit
        when cell contents are shown in full. Defaults to 200.
    max_col_width : bool, optional
        If true, raise the string-length limit to ``width`` so cell contents
        are not abbreviated. Single-column dataframes get this treatment
        regardless of the flag. Defaults to True.
    '''
    # A frame with exactly one column is always printed with long strings.
    widen_strings = max_col_width or len(df.columns) == 1

    with pl.Config() as display_cfg:
        display_cfg.set_tbl_cols(-1)   # -1: no limit on displayed columns
        display_cfg.set_tbl_rows(-1)   # -1: no limit on displayed rows
        display_cfg.set_tbl_width_chars(width)
        if widen_strings:
            display_cfg.set_fmt_str_lengths(width)
        print(df)

In [28]:
# Preview both sides of the join on `accession`: the new-in-r214 genomes with
# the leading two-character source prefix ('GB_' / 'RS_') removed, and the
# genome file listing.
show_all(
    r214_news
    .with_columns(pl.col('accession_right').str.replace(r'.._', '').alias('accession'))
    .head(3)
)  # .join(paths, on='accession', how='inner')
show_all(paths.head(3))

shape: (3, 3)
┌────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────┐
│ accession_right    ┆ gtdb_taxonomy_right                                                                                                                 ┆ accession       │
│ ---                ┆ ---                                                                                                                                 ┆ ---             │
│ str                ┆ str                                                                                                                                 ┆ str             │
╞════════════════════╪═════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════════╪═════════════════╡
│ GB_GCA_000195205.2 ┆ d__Bacteria;p__Pseudomonadota;c__Gammaproteobacteria;o__Burkholderiales;f__Burkholderiac

In [29]:
# Attach a genome file path to every new-in-r214 genome. `.._` removes the
# first "two characters + underscore" match, which for these accessions is the
# leading 'GB_' / 'RS_', leaving the 'GCA_...' / 'GCF_...' form used in `paths`.
# The inner join keeps only genomes that have a listed file.
m2 = (
    r214_news
    .with_columns(pl.col('accession_right').str.replace(r'.._', '').alias('accession'))
    .join(paths, on='accession', how='inner')
)
m2.shape, m2.head(3)

((7002, 4),
 shape: (3, 4)
 ┌────────────────────┬─────────────────────────────┬─────────────────┬─────────────────────────────┐
 │ accession_right    ┆ gtdb_taxonomy_right         ┆ accession       ┆ path                        │
 │ ---                ┆ ---                         ┆ ---             ┆ ---                         │
 │ str                ┆ str                         ┆ str             ┆ str                         │
 ╞════════════════════╪═════════════════════════════╪═════════════════╪═════════════════════════════╡
 │ RS_GCF_916619195.1 ┆ d__Bacteria;p__Pseudomonado ┆ GCF_916619195.1 ┆ ./database/GCF/916/619/195/ │
 │                    ┆ ta;c_…                      ┆                 ┆ GCF_9…                      │
 │ RS_GCF_916618895.1 ┆ d__Bacteria;p__Actinomyceto ┆ GCF_916618895.1 ┆ ./database/GCF/916/618/895/ │
 │                    ┆ ta;c_…                      ┆                 ┆ GCF_9…                      │
 │ RS_GCF_916618745.1 ┆ d__Bacteria;p__Pseudomonado ┆ G

In [30]:
# Prefix each listed path with the directory that holds the r214
# representative genomes. The listed paths start with './', so the result
# contains a '/./' segment.
abspath = (
    '/work/microbiome/db/gtdb/gtdb_release214/genomic_files_reps/gtdb_genomes_reps_r214/'
    + pl.col('path')
).alias('abspath')
m2 = m2.with_columns(abspath)
show_all(m2.head(3))

shape: (3, 5)
┌────────────────────┬─────────────────────────────────────────────────────┬─────────────────┬────────────────────────────────────────────────────┬────────────────────────────────────────────────────┐
│ accession_right    ┆ gtdb_taxonomy_right                                 ┆ accession       ┆ path                                               ┆ abspath                                            │
│ ---                ┆ ---                                                 ┆ ---             ┆ ---                                                ┆ ---                                                │
│ str                ┆ str                                                 ┆ str             ┆ str                                                ┆ str                                                │
╞════════════════════╪═════════════════════════════════════════════════════╪═════════════════╪════════════════════════════════════════════════════╪═══════════════════════════════════

In [35]:
# write gtdbtk batchfile of 1000 random choices. Commented out to avoid accidental overwrite.
# NOTE(review): sample() is called without a seed, so re-running this line picks a
# different set of 1000 genomes; pass seed=<int> if the selection must be reproducible.
# The output is tab-separated with no header: abspath, then accession_right.
# m2.sample(1000).select(['abspath','accession_right']).write_csv('gtdbtk_batchfile.random1000.csv', include_header=False, separator='\t')