### Concatenating NCBI tables

In [1]:
import pandas as pd
from os import path, symlink
from shutil import copy
from glob import glob

In [2]:
# Read the two NCBI metadata tables (Synechococcus and GTDB cyanobacteria)
syn = pd.read_csv("../data/ncbi/synechoccus_ncbi_metadata_27-05_dedup.csv")
cya = pd.read_csv("../data/ncbi/cyanos_gtdb_ncbi_metadata_07-05_dedup.csv")

# Remove from cya the assemblies that are already present in syn.
# The explicit .copy() makes cya an independent frame, so the column added
# below is not written into a slice of the frame returned by read_csv
# (which would raise pandas' SettingWithCopyWarning).
cya = cya[~cya['assembly_accession'].isin(syn['assembly_accession'])].copy()

# Adding columns to identify data sets: Synechococcus or GTDB cyanobacteria
syn['is_synecho'] = True
cya['is_synecho'] = False

# Concatenate, format and save.
# ignore_index=True gives the combined table a fresh 0..n-1 index instead of
# repeating the row labels of the two source tables.
df = pd.concat((syn, cya), ignore_index=True)
df['local_fna'] = df['filename'].apply(
    lambda s: f"../data/ncbi/processing/genomic_fna/{s}_genomic.fna")
#df.to_csv("../data/ncbi/concatenated_ncbi_table.csv")

# Add additional columns: paths of the Prodigal protein and gene files,
# built from the same 'filename' stem as 'local_fna'
df['proteins_path'] = ("../data/ncbi/processing/prodigal_proteins/" +
                    df['filename'] + "_genomic_prodigal_proteins.faa")
df['genes_path'] = ("../data/ncbi/processing/prodigal_genes/" +
                       df['filename'] + "_genomic_prodigal_genes.fna")

In [3]:
# GCA_003228185.1 is described as a mixed culture, so it is excluded here
is_mixed_culture = df['assembly_accession'] == 'GCA_003228185.1'
df = df[~is_mixed_culture]

In [4]:
# Copy genome FASTA files into the CheckM input directory
def copier(x, y):
    """Copy every file in ``x`` to the destination ``y``.

    Parameters
    ----------
    x : iterable of path-like
        Source file paths (e.g. a pandas Series of paths).
    y : path-like
        Destination passed unchanged to ``shutil.copy`` for each source;
        when it is an existing directory the files are copied into it.

    Returns
    -------
    list
        The value returned by ``shutil.copy`` for each source, in order.
    """
    return [copy(i, y) for i in x]

# One-off copy of the Synechococcus genomes (left disabled on purpose):
# copier(df[df["is_synecho"]]["local_fna"], f"../data/checkm/input/")

# NOTE: a bare `map(lambda x, y: ..., (sources, destination))` would not work
# as a replacement: map() would call the two-argument function with a single
# argument per element, and it is lazy, so nothing runs until it is consumed.

In [5]:
# Attach completeness and contamination estimates to the genome table

# GTDB genomes: values come from the GTDB metadata table, matched on biosample
gtdb_metadata = "/home/vini/data/db/gtdb/bac120_metadata.tsv"
gtdb = pd.read_csv(gtdb_metadata, sep="\t").rename(
    columns={"ncbi_biosample": "biosample"})
gtdb_quality = gtdb[['biosample', 'checkm_completeness', 'checkm_contamination']]
df = df.merge(gtdb_quality, on='biosample', how='left')

# CheckM data: values come from the CheckM results table, matched on filename
checkm = pd.read_csv("../data/checkm/checkm_tax_wf.tsv", sep="\t").rename(
    columns={"Completeness": "checkm_completeness",
             "Contamination": "checkm_contamination"})
# The filename is whatever precedes "_genomic" in the CheckM bin identifier
bin_id_parts = checkm["Bin Id"].str.split("_genomic", expand=True)
checkm['filename'] = bin_id_parts.iloc[:, 0]
checkm_quality = checkm[['filename', 'checkm_completeness', 'checkm_contamination']]
df = df.merge(checkm_quality, on='filename', how='left')

# After the second merge the "_x" columns hold the GTDB values and the "_y"
# columns hold the CheckM values. Prefer "_y" and fall back to "_x".
completeness = df['checkm_completeness_y'].fillna(df['checkm_completeness_x'])
contamination = df['checkm_contamination_y'].fillna(df['checkm_contamination_x'])
df['checkm_completeness'] = completeness
df['checkm_contamination'] = contamination

# Only the combined columns are kept
df = df.drop(columns=['checkm_completeness_x', 'checkm_completeness_y',
                      'checkm_contamination_x', 'checkm_contamination_y'])

In [6]:
# Flag genomes that pass the quality filter: completeness above 50% and
# contamination below 5%. The comparison is vectorized; rows whose CheckM
# values are missing (NaN) compare False and therefore do not pass.
MIN_COMPLETENESS = 50
MAX_CONTAMINATION = 5
df['quality_filter'] = (
    (df['checkm_completeness'] > MIN_COMPLETENESS) &
    (df['checkm_contamination'] < MAX_CONTAMINATION))

# Create GTDB batchfile for Synechococcus genomes: tab-separated, two columns
# (FASTA path, assembly accession), written without header or index
df.loc[df['is_synecho'], ['local_fna', 'assembly_accession']].to_csv(
    '../data/gtdb/batchfile.tsv', sep='\t', index=False, header=False)

In [None]:
# Join GTDB taxonomy for GTDB cyanobacteria
# Split the ";"-separated taxonomy string into one column per rank
gtdb_ranks = "domain phylum class order family genus species".split()
gtdb_tax = gtdb["gtdb_taxonomy"].str.split(";", expand=True)
gtdb_tax.columns = ["gtdb_" + rank for rank in gtdb_ranks]
# Drop the first three characters of every entry (presumably a rank prefix
# such as "d__" -- confirm against the metadata file). The .str accessor
# leaves missing cells (None/NaN) missing instead of raising, and avoids
# DataFrame.applymap, which is deprecated in recent pandas releases.
gtdb_tax = gtdb_tax.apply(lambda col: col.str[3:])
gtdb[gtdb_tax.columns] = gtdb_tax
df = df.merge(gtdb[['biosample'] + list(gtdb_tax.columns)], on='biosample', how='left')

# Join GTDB taxonomy for NCBI Synechococcus genomes

In [None]:
# Load the GTDB summary table for the Synechococcus genomes
synecho_gtdb = pd.read_csv(
    "../data/gtdb/output/synechococcus.bac120.summary.tsv",
    sep="\t",
)
# Display the classification value of the row at position 120
synecho_gtdb['classification'].iat[120]

### Adding genes and protein filepaths