### Concatenating NCBI tables

In [45]:
import pandas as pd
from os import path
from shutil import copy
from glob import glob
import abacat as ab
# Allow up to 200 rows to be rendered when a DataFrame is displayed
pd.set_option('display.max_rows', 200)

In [46]:
# Reading
syn = pd.read_csv("../data/ncbi/synechoccus_ncbi_metadata_27-05_dedup.csv")
cya = pd.read_csv("../data/ncbi/cyanos_gtdb_ncbi_metadata_07-05_dedup.csv")

# Removing intersection of rows from cya.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
cya = cya[~cya['assembly_accession'].isin(syn['assembly_accession'])].copy()

# Adding columns to identify data sets: Synechococcus or GTDB cyanobacteria
syn['is_synecho'] = True
cya['is_synecho'] = False

# Concatenate, format and save
df = pd.concat((syn, cya))
df['local_fna'] = df['filename'].apply(
    lambda s: f"../data/ncbi/processing/genomic_fna/{s}_genomic.fna")
# Written before the path columns below are added, so this CSV does not
# contain 'proteins_path' / 'genes_path'.
df.to_csv("../data/ncbi/concatenated_ncbi_table.csv")

# Add additional columns
df['proteins_path'] = ("../data/ncbi/processing/prodigal_proteins/" +
                       df['filename'] + "_genomic_prodigal_proteins.faa")
df['genes_path'] = ("../data/ncbi/processing/prodigal_genes/" +
                    df['filename'] + "_genomic_prodigal_genes.fna")

In [47]:
# GCA_003228185.1 is described as a mixed culture, so it is excluded here
df = df.loc[df['assembly_accession'].ne('GCA_003228185.1')]

In [48]:
# Create CheckM input batchfile
def copier(x, y):
    """Copy every file in ``x`` to ``y``.

    Parameters
    ----------
    x : iterable of path-like
        Source files to copy.
    y : path-like
        Destination passed unchanged to ``shutil.copy`` for each source file.

    Returns
    -------
    list
        The value returned by ``shutil.copy`` for each source, in input order.
    """
    return [copy(i, y) for i in x]

# copier(df[df["is_synecho"]]["local_fna"], f"../data/checkm/input/")

# NOTE: a bare map() is lazy and would not copy anything until it is consumed;
# the list comprehension in copier() performs the copies eagerly.

In [49]:
# Attach CheckM completeness / contamination to every genome
quality_cols = ['checkm_completeness', 'checkm_contamination']

# Source 1: GTDB metadata table, keyed by biosample
# NOTE(review): absolute, machine-specific path.
gtdb_metadata = "/home/vini/data/db/gtdb/bac120_metadata.tsv"
gtdb = pd.read_csv(gtdb_metadata, sep="\t").rename(
    columns={"ncbi_biosample": "biosample"})
df = df.merge(gtdb[['biosample'] + quality_cols], on='biosample', how='left')

# Source 2: local CheckM results, keyed by filename
checkm = pd.read_csv("../data/checkm/checkm_tax_wf.tsv", sep="\t").rename(
    columns={"Completeness": "checkm_completeness",
             "Contamination": "checkm_contamination"})
# Everything before "_genomic" in the bin id is used as the filename key
checkm['filename'] = checkm["Bin Id"].str.split("_genomic", expand=True).iloc[:, 0]
df = df.merge(checkm[['filename'] + quality_cols], on='filename', how='left')

# After the second merge, *_x holds the GTDB value and *_y the local CheckM value;
# the CheckM value wins and the GTDB value fills the gaps.
for col in quality_cols:
    from_gtdb, from_checkm = f"{col}_x", f"{col}_y"
    df[col] = df[from_checkm].fillna(df[from_gtdb])
    df = df.drop(columns=[from_gtdb, from_checkm])

In [50]:
# Flag genomes that pass the quality filter: > 50% completeness and < 10% contamination.
# Comparisons against missing values are False, so genomes without CheckM data
# are flagged as not passing.
# NOTE(review): an earlier comment mentioned a 5% contamination cut-off while the
# code uses 10 - confirm which threshold is intended.
df['quality_filter'] = ((df['checkm_completeness'] > 50) &
                        (df['checkm_contamination'] < 10))

# Create GTDB batchfile for Synechococcus genomes
# df[df['is_synecho']][['local_fna', 'assembly_accession']].to_csv(
#     '../data/gtdb/batchfile.tsv', sep='\t', index=False, header=None)

In [51]:
def _split_gtdb_taxonomy(lineages):
    """Split ';'-separated lineage strings into one 'gtdb_<rank>' column per rank.

    The first three characters of every string field are dropped (presumably
    the 'd__'-style rank prefix - confirm against the input tables). Missing
    values (None or NaN) are passed through untouched.
    """
    tax = lineages.str.split(";", expand=True)
    tax.columns = ["gtdb_" + rank for rank in
                   "domain phylum class order family genus species".split()]
    # Element-wise via Series.map so this does not rely on the deprecated
    # DataFrame.applymap.
    return tax.apply(
        lambda col: col.map(lambda s: s[3:] if isinstance(s, str) else s))


# Join GTDB taxonomy for GTDB cyanobacteria
gtdb_tax = _split_gtdb_taxonomy(gtdb["gtdb_taxonomy"])
gtdb[gtdb_tax.columns] = gtdb_tax
df = df.merge(gtdb[['biosample'] + list(gtdb_tax.columns)], on='biosample', how='left')

# Join GTDB taxonomy for NCBI Synechococcus genomes
synecho_gtdb = pd.read_csv("../data/gtdb/output/synechococcus.bac120.summary.tsv", sep="\t")
gtdb_tax = _split_gtdb_taxonomy(synecho_gtdb["classification"])
synecho_gtdb[gtdb_tax.columns] = gtdb_tax

df = df.merge(synecho_gtdb.rename(columns={"user_genome": "assembly_accession"}), on='assembly_accession', how='left')

# *_x comes from the GTDB metadata merge, *_y from the Synechococcus summary;
# the summary value wins and the metadata value fills the gaps.
for i in gtdb_tax.columns:
    df[i] = df[i + '_y'].fillna(df[i + '_x'])
    df = df.drop([i + "_x", i + "_y"], axis=1)

In [52]:
# Use Abacat to compute GC content and genome size (in megabases) per assembly.
# The SequenceFile objects are kept in a temporary Series rather than a column.
sequences = df['local_fna'].apply(lambda fna: ab.SequenceFile(fna, 'contigs'))
df['gc'] = sequences.apply(lambda record: record.gc)
df['megabases'] = sequences.apply(lambda record: record.bps / 1e6)
df.to_csv("../data/metadata.csv", index=False)

### "Checkpoint"

In [347]:
# Attach the curated labels from ../data/misc/labels.tsv to the checkpointed metadata
labels = pd.read_csv("../data/misc/labels.tsv", sep="\t")
labels.columns = ["filename", "label"]
# Everything before "_genomic" is used as the filename key
labels['filename'] = labels['filename'].str.split("_genomic", expand=True).iloc[:, 0]

# NOTE(review): the last column of the checkpoint file is dropped by position;
# which column that is depends on what last wrote metadata.csv - confirm.
df = pd.read_csv("../data/metadata.csv").iloc[:, :-1]
df = df.merge(labels, on='filename', how='left')

In [338]:
# for i in ('label',):
#     df[i] = df[i + '_y'].fillna(df[i + '_x'])
#     df = df.drop([i + "_x", i + "_y"], axis=1)
    
# df.loc[df[df['label'].str.contains("RSCC")].index, "is_synecho"] = False

In [348]:
# Fill missing labels with GTDB tax
def join_gtdb_tax(row, rank_columns=("gtdb_class", "gtdb_order", "gtdb_family",
                                     "gtdb_genus", "gtdb_species")):
    """Build a label from a row's GTDB taxonomy, from class down to species.

    Each rank contributes "<letter>_<value>", where <letter> is the character
    right after the "gtdb_" prefix of the column name; the pieces are joined
    with "_". Missing values are rendered through ``str`` (so NaN becomes "nan").

    Parameters
    ----------
    row : mapping
        Anything indexable by the column names (e.g. a DataFrame row).
    rank_columns : sequence of str, optional
        Columns to include, in order. The default lists the class-to-species
        columns explicitly, so the function does not depend on a ``gtdb_tax``
        frame left over from an earlier cell.

    Returns
    -------
    str
    """
    return "_".join([f"{col[5]}_" + str(row[col]) for col in rank_columns])

# NaN is the only value that does not equal itself, so a failed self-comparison
# marks a missing label; those rows get a label built from the GTDB taxonomy.
df['label'] = df.apply(
    lambda row: row['label'] if row['label'] == row['label'] else join_gtdb_tax(row),
    axis=1)

In [349]:
# Format columns
def fill_rank(row, rank='genus'):
    """Return the genus, species or strain name to use for one metadata row.

    For rows flagged ``is_synecho`` the value is taken from the "_"-separated
    ``label``: field 0 is the genus, field 1 the species and everything from
    field 2 onwards (re-joined with "_") the strain. If that piece is missing
    or at most one character long, or the row is not ``is_synecho``, the value
    falls back to ``gtdb_<rank>`` (genus/species) or ``infraspecific_name``
    (strain), converted with ``str``.

    Parameters
    ----------
    row : mapping
        Must provide 'is_synecho', 'label', 'infraspecific_name' and the
        'gtdb_<rank>' column for the requested rank.
    rank : {'genus', 'species', 'strain'}

    Returns
    -------
    str

    Raises
    ------
    KeyError
        If ``rank`` is not one of the supported names.
    """
    ranks = {
        "genus": 0,
        "species": 1,
        "strain": 2
    }
    position = ranks[rank]
    is_strain = position == 2
    fallback_column = 'infraspecific_name' if is_strain else f'gtdb_{rank}'

    if row['is_synecho']:
        raw_label = row['label']
        # A non-string label (e.g. NaN) has no fields to pick from.
        fields = raw_label.split("_") if isinstance(raw_label, str) else []
        if is_strain:
            label = "_".join(fields[2:])
        else:
            # Labels with too few fields fall back instead of raising IndexError.
            label = fields[position] if position < len(fields) else ""
        if len(label) > 1:
            return label

    return str(row[fallback_column])

# One 'new_<rank>' column per rank, each computed row-wise by fill_rank
for rank_name in ('genus', 'species', 'strain'):
    df[f'new_{rank_name}'] = df.apply(fill_rank, axis=1, rank=rank_name)

In [350]:
# Manual overrides keyed by assembly accession
missing_labels = pd.read_csv("../data/misc/missing_labels.csv")
df = df.merge(missing_labels, on='assembly_accession', how='left')

# Every column after the first exists on both sides of the merge:
# *_y is the override, *_x the existing value. Overrides win where present.
for col in missing_labels.columns[1:]:
    existing, override = col + "_x", col + "_y"
    df[col] = df[override].fillna(df[existing])
    df = df.drop(columns=[existing, override])

In [351]:
# Final edits

# Drop duplicates and other classes.
# .copy() detaches the filtered frame so the assignments below do not trigger
# pandas' SettingWithCopyWarning.
df = df.query("gtdb_class == 'Cyanobacteriia'")
df = df.drop_duplicates("assembly_accession").copy()

# Literal (non-regex) genus renames
df['new_genus'] = df['new_genus'].str.replace('Enugrolinea', 'Limnothrix', regex=False)
df['new_genus'] = df['new_genus'].str.replace('Synechococcus-C', 'Synechococcus_C', regex=False)

# Manual edit to Vulcanococcus limneticus.
# na=False: rows without an organism name simply do not match (instead of
# making the boolean mask invalid).
ix = df[df['organism_name'].str.contains("limneticus", na=False)].index
df.loc[ix, 'new_species'] = 'limneticus'
df.loc[ix, 'new_strain']  = 'LL'
df.loc[ix, 'label'] = df.loc[ix, 'organism_name']

# And to nan labels: a label that is not a string, or that contains the text
# 'nan' (left behind by str() on a missing taxonomy value), is replaced by the
# organism name.
df['label'] = df.apply(
    lambda row: row['organism_name']
    if not isinstance(row['label'], str) or 'nan' in row['label']
    else row['label'],
    axis=1)
df['label'] = df['label'].str.replace("Magnicoccus", "Vulcanococcus", regex=False)
df['new_genus'] = df['new_genus'].str.replace("Magnicoccus", "Vulcanococcus", regex=False)

In [343]:
# Export
# NOTE: this overwrites the same "../data/metadata.csv" that the "Checkpoint"
# section reads back in, so re-running from the checkpoint starts from this output.
df.to_csv("../data/metadata.csv", index=False)

In [344]:
# Create AAI input for Synechococcus - Add Cyanobium and Limnothrix here
# Destination directory for the protein files; it is only referenced by the
# copy step below, which is currently commented out.
dst = "../data/aai/synechococcus_input/"
#df[(df['is_synecho'])]['proteins_path'].apply(lambda file: copy(file, dst))

In [345]:
# Creating GTT input
# Keep genomes that pass the quality filter, one per label. .copy() detaches
# the result so the column assignments below act on an independent frame.
df = df[df['quality_filter']].drop_duplicates("label").copy()
df['protein_filename'] = df['proteins_path'].apply(path.basename)
# regex=False: "../data" is a literal prefix; as a regex its dots would match
# any character.
# NOTE(review): replacing "../data" with "../" leaves a leading "..//" in the
# path - confirm this is intended.
df['proteins_path'] = df['proteins_path'].str.replace("../data", "../", regex=False)
df['gtt_label'] = df['label'] + " # " + df['assembly_accession']

# Drop every genome whose label contains one of these substrings
remove_from_gtt = ["PCC-7407", "Gloeomargarita", "kilaueensis", "CACIAM-69", "RSCC"]
excluded = df['label'].apply(lambda s: any(tag in s for tag in remove_from_gtt))
df = df[~excluded]

# Exporting
df['proteins_path'].to_csv("../data/gtt/1072_input.txt", index=False, header=False)
df[['protein_filename', 'gtt_label']].to_csv("../data/gtt/1072_labels.tsv", sep="\t", index=False, header=False)

In [346]:
df.shape

(1072, 64)