### Concatenating NCBI tables

In [946]:
import pandas as pd
from os import path
from shutil import copy
from glob import glob
import abacat as ab
pd.set_option('display.max_rows', 200)

In [947]:
# Load the two NCBI metadata tables: Synechococcus genomes and GTDB cyanobacteria
syn = pd.read_csv("../data/ncbi/synechoccus_ncbi_metadata_27-05_dedup.csv")
cya = pd.read_csv("../data/ncbi/cyanos_gtdb_ncbi_metadata_07-05_dedup.csv")

# Drop from cya every assembly that is already present in syn. The explicit
# .copy() makes cya an independent frame, so the column assignment below does
# not operate on a slice of the original (SettingWithCopyWarning).
cya = cya[~cya['assembly_accession'].isin(syn['assembly_accession'])].copy()

# Flag the origin of every row: Synechococcus table or GTDB cyanobacteria table
syn['is_synecho'] = True
cya['is_synecho'] = False

# Concatenate, add the path of the local genomic FASTA and save.
# Note: this CSV is written (with its index) before the two path columns
# below are added, so it does not contain them.
df = pd.concat((syn, cya))
df['local_fna'] = df['filename'].apply(
    lambda s: f"../data/ncbi/processing/genomic_fna/{s}_genomic.fna")
df.to_csv("../data/ncbi/concatenated_ncbi_table.csv")

# Add additional columns: paths of the Prodigal protein and gene predictions
df['proteins_path'] = ("../data/ncbi/processing/prodigal_proteins/" +
                    df['filename'] + "_genomic_prodigal_proteins.faa")
df['genes_path'] = ("../data/ncbi/processing/prodigal_genes/" +
                       df['filename'] + "_genomic_prodigal_genes.fna")

In [948]:
# GCA_003228185.1 is described as a mixed culture, so it is excluded
mixed_culture_accession = 'GCA_003228185.1'
df = df.loc[df['assembly_accession'].ne(mixed_culture_accession)]

In [949]:
# Create CheckM input batchfile
def copier(x, y):
    """Copy every file in ``x`` to the destination ``y``.

    Parameters
    ----------
    x : iterable of path-like
        Source files to copy.
    y : path-like
        Destination handed unchanged to ``shutil.copy`` for each source.

    Returns
    -------
    list
        The value returned by ``shutil.copy`` for each source, in input order.
    """
    return [copy(src, y) for src in x]

# copier(df[df["is_synecho"]]["local_fna"], f"../data/checkm/input/")

# NOTE: a map()-based variant was considered here, but map() is lazy: nothing
# would be copied until its result is consumed, so the eager list is kept.

In [950]:
# Attach CheckM completeness / contamination estimates to every genome
quality_cols = ['checkm_completeness', 'checkm_contamination']

# 1) Values listed in the GTDB metadata table, matched on BioSample.
# NOTE(review): absolute, machine-specific path — confirm before running elsewhere.
gtdb_metadata = "/home/vini/data/db/gtdb/bac120_metadata.tsv"
gtdb = pd.read_csv(gtdb_metadata, sep="\t").rename(
    columns={"ncbi_biosample": "biosample"})
df = df.merge(gtdb[['biosample'] + quality_cols], on='biosample', how='left')

# 2) Values from the CheckM results table, matched on the file name prefix
#    (everything in "Bin Id" before "_genomic").
checkm = pd.read_csv("../data/checkm/checkm_tax_wf.tsv", sep="\t").rename(
    columns={"Completeness": "checkm_completeness",
             "Contamination": "checkm_contamination"})
checkm['filename'] = checkm["Bin Id"].str.split("_genomic").str[0]
df = df.merge(checkm[['filename'] + quality_cols], on='filename', how='left')

# The two merges left same-named columns suffixed _x (GTDB) and _y (CheckM
# table). Take the CheckM-table value and fall back to the GTDB one.
for col in quality_cols:
    df[col] = df[col + '_y'].fillna(df[col + '_x'])
df = df.drop(columns=[col + suffix
                      for col in quality_cols
                      for suffix in ('_x', '_y')])

In [951]:
# Flag genomes that pass the quality filter: CheckM completeness above 50 %
# and contamination below 10 %. A missing estimate compares as False, so such
# rows are flagged as not passing.
# NOTE(review): the earlier comment stated a 5 % contamination cut-off while
# the code applies 10 %; the code's behaviour is kept — confirm the threshold.
df['quality_filter'] = ((df['checkm_completeness'] > 50) &
                        (df['checkm_contamination'] < 10))

# Create GTDB batchfile for Synechococcus genomes
# df[df['is_synecho']][['local_fna', 'assembly_accession']].to_csv(
#     '../data/gtdb/batchfile.tsv', sep='\t', index=False, header=None)

In [952]:
def _split_gtdb_lineage(lineage):
    """Split ';'-separated lineage strings into one ``gtdb_<rank>`` column per rank.

    The first three characters of every field are stripped (presumably the
    ``d__``-style rank prefix — confirm against the input files). Cells that
    are not strings (None for short lineages, NaN for missing lineages) are
    passed through unchanged instead of raising.
    """
    tax = lineage.str.split(";", expand=True)
    tax.columns = ["gtdb_" + i for i in
                   "domain phylum class order family genus species".split()]
    return tax.applymap(lambda s: s[3:] if isinstance(s, str) else s)


# Join GTDB taxonomy for GTDB cyanobacteria
gtdb_tax = _split_gtdb_lineage(gtdb["gtdb_taxonomy"])
gtdb[gtdb_tax.columns] = gtdb_tax
df = df.merge(gtdb[['biosample'] + list(gtdb_tax.columns)], on='biosample', how='left')

# Join GTDB taxonomy for NCBI Synechococcus genomes
synecho_gtdb = pd.read_csv("../data/gtdb/output/synechococcus.bac120.summary.tsv", sep="\t")
gtdb_tax = _split_gtdb_lineage(synecho_gtdb["classification"])
synecho_gtdb[gtdb_tax.columns] = gtdb_tax

df = df.merge(synecho_gtdb.rename(columns={"user_genome": "assembly_accession"}), on='assembly_accession', how='left')

# Both merges added the same rank columns (_x = GTDB metadata table,
# _y = Synechococcus summary). Take _y and fall back to _x.
for i in gtdb_tax.columns:
    df[i] = df[i + '_y'].fillna(df[i + '_x'])
    df = df.drop([i + "_x", i + "_y"], axis=1)

In [953]:
# Use Abacat to compute GC content and genome size (in megabases) per assembly
sequence_files = df['local_fna'].map(lambda fna: ab.SequenceFile(fna, 'contigs'))
df['gc'] = sequence_files.map(lambda seq: seq.gc)
df['megabases'] = sequence_files.map(lambda seq: seq.bps / 1e6)
df.to_csv("../data/misc/metadata_v0.csv", index=False)

### "Checkpoint"

In [968]:
# Reload the checkpoint table and attach the labels from the existing labels file
df = pd.read_csv("../data/misc/metadata_v0.csv")
labels = pd.read_csv("../data/misc/labels.tsv", sep="\t")
labels.columns = ["filename", "label"]
# Keep only the part of the file name that precedes "_genomic" as merge key
labels['filename'] = labels['filename'].str.split("_genomic").str[0]
df = df.merge(labels, on='filename', how='left')

In [970]:
# Fill missing labels with GTDB tax
def join_gtdb_tax(row, rank_columns=("gtdb_class", "gtdb_order", "gtdb_family",
                                     "gtdb_genus", "gtdb_species")):
    """Build a label from a row's GTDB ranks, class through species by default.

    Each rank contributes ``<letter>_<value>``, where the letter is the sixth
    character of the column name (the first letter after ``gtdb_``), and the
    pieces are joined with ``_``. Missing values are rendered with ``str()``,
    so a NaN rank shows up as ``nan`` in the label.

    The rank columns are listed explicitly so the function does not depend on
    a ``gtdb_tax`` variable from an earlier cell.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide every name in ``rank_columns``.
    rank_columns : sequence of str, optional
        Column names to include, in output order.

    Returns
    -------
    str
    """
    return "_".join(f"{col[5]}_" + str(row[col]) for col in rank_columns)

def _label_or_gtdb_tax(row):
    """Return the row's label, or the GTDB-derived label when it is missing."""
    current = row['label']
    # A missing label is NaN, the only value that is not equal to itself
    if current != current:
        return join_gtdb_tax(row)
    return current


df['label'] = df.apply(_label_or_gtdb_tax, axis=1)

In [971]:
# Format columns
def fill_rank(row, rank='genus'):
    """Return the genus, species or strain name to use for one genome.

    Rows flagged ``is_synecho`` take the name from the underscore-separated
    ``label``: field 0 is the genus, field 1 the species, and all remaining
    fields (re-joined with ``_``) the strain. When that yields at most one
    character — including when the label has too few fields or is not a
    string — the fallback is used: the ``gtdb_<rank>`` column for genus and
    species; for strain, ``isolate``, or ``infraspecific_name`` when the
    isolate is NaN.

    All other rows use ``gtdb_<rank>`` for genus and species and
    ``infraspecific_name`` for strain.

    Parameters
    ----------
    row : mapping or pandas.Series
        Needs ``is_synecho``, ``label``, ``isolate``, ``infraspecific_name``
        and the relevant ``gtdb_<rank>`` column.
    rank : {'genus', 'species', 'strain'}

    Returns
    -------
    str

    Raises
    ------
    KeyError
        If ``rank`` is not one of the three supported values.
    """
    field_index = {"genus": 0, "species": 1, "strain": 2}[rank]
    wants_strain = field_index == 2

    if not row['is_synecho']:
        if wants_strain:
            return str(row['infraspecific_name'])
        return str(row[f'gtdb_{rank}'])

    raw_label = row['label']
    fields = raw_label.split("_") if isinstance(raw_label, str) else []
    if wants_strain:
        label = "_".join(fields[2:])
    else:
        # A label with too few fields falls through to the fallback below
        # instead of raising IndexError.
        label = str(fields[field_index]) if field_index < len(fields) else ""
    if len(label) > 1:
        return label

    # Fallbacks for an empty or one-character label component
    if not wants_strain:
        return str(row[f'gtdb_{rank}'])
    isolate = row['isolate']
    if isolate != isolate:  # NaN is the only value not equal to itself
        return str(row['infraspecific_name'])
    return str(isolate)

# One new_<rank> column per supported rank, each filled row by row
for rank_name in ("genus", "species", "strain"):
    df[f'new_{rank_name}'] = df.apply(fill_rank, axis=1, rank=rank_name)

In [972]:
# Add Limnothrix and Cyanobium as synechos
# Each list holds substrings; a genome whose label contains any of them is
# assigned to the corresponding genus below.
cyanobium = [
    "GFB01", # not SC
    "NIES-981", # not SC
    "PCC7001", # 2 not SC
    "8F6",
    "UBA5018", # not SC
    "CCALA_15", # not SC
    "PCC_6307", # not SC, Cyanobium gracile (type species)
    "BO_8801",
    "CACIAM_14" # not SC
]

regnicoccus = [
    "WH-5701" # 3 not SC
]

inmanicoccus = [
    "RCC307" # 2 not SC
]

limnothrix = [ 
    "Limnotrichaceae", # 2 not in synecho
    "Enugrolinea",
    "Limnothrix" ]

# Remove Synechos with only one genome
# (only referenced by the disabled loop below)
singletons = [
    "Leptovivax",
    "Coccusdissimilis"
]

# for genus in singletons:
#     df.loc[df[df['label'].apply(lambda s: any([i in s for i in list_]))].index, "is_synecho"] = False

for list_, genus in zip((cyanobium, regnicoccus, inmanicoccus, limnothrix),
                        ("Cyanobium", "Regnicoccus", "Inmanicoccus", "Limnothrix")):
    # The label is not modified inside the loop, so the match is computed once
    # and reused for both assignments.
    is_member = df['label'].apply(lambda s: any(i in s for i in list_))
    df.loc[is_member, "new_genus"] = genus
    df.loc[is_member, "is_synecho"] = True

In [973]:
# Overlay the values supplied in missing_labels.csv, matched on assembly accession
missing_labels = pd.read_csv("../data/misc/missing_labels.csv")

df = df.merge(missing_labels, on='assembly_accession', how='left')
# Assumes every column after the first already exists in df, so the merge
# produced an _x (existing) and a _y (from the file) version of each.
for col in missing_labels.columns[1:]:
    existing, supplied = col + '_x', col + '_y'
    df[col] = df[supplied].fillna(df[existing])
    df = df.drop(columns=[existing, supplied])

In [974]:
# Final edits

# Keep only class Cyanobacteriia and a single row per assembly accession
df = df.query("gtdb_class == 'Cyanobacteriia'")
df = df.drop_duplicates("assembly_accession")

# Genus renames. The patterns are plain text, so regex=False is passed
# explicitly: the default of str.replace differs between pandas versions.
df['new_genus'] = df['new_genus'].str.replace('Enugrolinea', 'Limnothrix', regex=False)
df['new_genus'] = df['new_genus'].str.replace('Synechococcus-C', 'Synechococcus_C', regex=False)

# Manual edit to Vulcanococcus limneticus
# na=False: a row without an organism name counts as "no match" instead of
# producing a NaN in the mask, which cannot be used for indexing.
ix = df.index[df['organism_name'].str.contains("limneticus", regex=False, na=False)]
df.loc[ix, 'new_species'] = 'limneticus'
df.loc[ix, 'new_strain']  = 'LL'
df.loc[ix, 'label'] = df.loc[ix, 'organism_name']
df.loc[ix, 'is_synecho'] = True

# And to nan labels: any label containing the text 'nan' is replaced by the
# organism name.
df['label'] = df.apply(lambda row: row['organism_name'] if 'nan' in row['label'] else row['label'], axis=1)
df['label'] = df['label'].str.replace("Magnicoccus", "Vulcanococcus", regex=False)
df['new_genus'] = df['new_genus'].str.replace("Magnicoccus", "Vulcanococcus", regex=False)

In [975]:
# Write the current metadata table to disk (version 1), without the index column
df.to_csv("../data/misc/metadata_v1.csv", index=False)

In [976]:
# Create AAI input for Synechococcus - added Cyanobium and Limnothrix.
# dst is the directory the input files are copied into.
dst = "../data/comparem/input/"

# Disabled: copies the protein file of every is_synecho genome into dst,
# rewriting a "..//ncbi" path prefix to "../data/ncbi/" first.
#df[(df['is_synecho'])]['proteins_path'].apply(lambda file: copy(file.replace("..//ncbi", "../data/ncbi/"), dst))

In [977]:
df = pd.read_csv("../data/misc/metadata_v1.csv")

# Creating GTT input: genomes that pass the quality filter, one row per label
df = df[df['quality_filter']].drop_duplicates("label")
df['protein_filename'] = df['proteins_path'].apply(path.basename)
# Plain-text replacement: without regex=False, pandas versions that default to
# regex matching treat the dots in "../data" as wildcards. The resulting
# "..//ncbi/..." form is unchanged from before.
df['proteins_path'] = df['proteins_path'].str.replace("../data", "../", regex=False)
df['gtt_label'] = df['label'] + " # " + df['assembly_accession']

# Drop every genome whose GTT label contains one of these substrings
remove_from_gtt = ["PCC-7407", "Gloeomargarita", "kilaueensis", "CACIAM-69", "RSCC", "Gloeocapsa", "Prochloraceae", "Phormidiaceae_A"]
excluded = df['gtt_label'].apply(lambda s: any(i in s for i in remove_from_gtt))
df = df[~excluded]

In [979]:
# Exporting
# 1) one protein file path per line, no header
df['proteins_path'].to_csv("../data/gtt/1085_input.txt", index=False, header=False)
# 2) tab-separated protein file name and GTT label, no header
df[['protein_filename', 'gtt_label']].to_csv("../data/gtt/1085_labels.tsv", sep="\t", index=False, header=False)
# 3) the filtered metadata table itself
df.to_csv("../data/metadata.csv", index=False)

In [898]:
# Building table 1

In [899]:
# (rows, columns) of the genomes flagged as is_synecho
df.loc[df['is_synecho']].shape

(253, 61)

In [893]:
# Rows whose label contains "mexicanus"
df.loc[df['label'].str.contains("mexicanus")]

Unnamed: 0,assembly_accession,bioproject,biosample,wgs_master,excluded_from_refseq,refseq_category,relation_to_type_material,taxid,species_taxid,isolate,...,gtdb_family,gtdb_genus,gtdb_species,gc,organism_name,infraspecific_name,new_genus,new_species,new_strain,label
21,GCA_000155595.1,PRJNA19377,SAMN02436177,ABRV00000000.1,,na,,91464,91464,,...,Phormidesmiaceae,Phormidesmis,Phormidesmis sp000155595,0.4816,Synechococcus sp. PCC 7335,strain=PCC 7335,Coccusdissimilis,mexicanus,PCC_7335,Coccusdissimilis_mexicanus_PCC_7335


In [833]:
# GTDB order counts among the is_synecho genomes
df.loc[df['is_synecho'], 'gtdb_order'].value_counts()

Synechococcales          167
Cyanobacteriales          14
PCC-7336                  11
Thermosynechococcales      7
Neosynechococcales         1
Phormidesmiales            1
Pseudanabaenales           1
Name: gtdb_order, dtype: int64

In [834]:
# GTDB family counts among the is_synecho genomes
df.loc[df['is_synecho'], 'gtdb_family'].value_counts()

Cyanobiaceae              161
Limnotrichaceae            14
JA-3-3Ab                   10
Thermosynechococcaceae      7
Synechococcaceae            6
Phormidesmiaceae            1
Neosynechococcaceae         1
PCC-7336                    1
Pseudanabaenaceae           1
Name: gtdb_family, dtype: int64

In [835]:
# new_genus counts for quality-filtered is_synecho genomes outside Cyanobiaceae
df.loc[
    df['is_synecho'] & (df['gtdb_family'] != 'Cyanobiaceae') & df['quality_filter'],
    'new_genus',
].value_counts()

Limnothrix             14
Leptococcus            10
Thermosynechococcus     6
Synechococcus           6
Brevicoccus             1
Leptovivax              1
Stenotopis              1
Coccusdissimilis        1
Neosynechococcus        1
Name: new_genus, dtype: int64

In [836]:
# Rows assigned to the genus Synechococcus
df.loc[df['new_genus'].eq('Synechococcus')]

Unnamed: 0,assembly_accession,bioproject,biosample,wgs_master,excluded_from_refseq,refseq_category,relation_to_type_material,taxid,species_taxid,isolate,...,gtdb_species,gc,organism_name,infraspecific_name,new_genus,new_species,new_strain,label,protein_filename,gtt_label
6,GCA_000010065.1,PRJNA13282,SAMD00061073,,,representative genome,assembly from type material,269084,32046,,...,Synechococcus elongatus,0.55484,Synechococcus elongatus PCC 6301,strain=PCC 6301,Synechococcus,elongatus,PCC_6301,Synechococcus_elongatus_PCC_6301,GCA_000010065.1_ASM1006v1_genomic_prodigal_pro...,Synechococcus_elongatus_PCC_6301 # GCA_0000100...
7,GCA_000012525.1,PRJNA10645,SAMN02598254,,,na,,1140,32046,,...,Synechococcus elongatus,0.55426,Synechococcus elongatus PCC 7942 = FACHB-805,strain=PCC 7942,Synechococcus,elongatus,PCC_7942,Synechococcus_elongatus_PCC_7942,GCA_000012525.1_ASM1252v1_genomic_prodigal_pro...,Synechococcus_elongatus_PCC_7942 # GCA_0000125...
8,GCA_003846445.1,PRJNA476044,SAMN09425400,,,na,,2219813,32046,,...,,0.549,Synechococcus elongatus PCC 11801,PCC 11801,Synechococcus,elongatus,PCC 11801,Synechococcus_sp._PCC 11801,GCA_003846445.1_ASM384644v1_genomic_prodigal_p...,Synechococcus_sp._PCC 11801 # GCA_003846445.1
9,GCA_003957805.1,PRJNA496333,SAMN09724271,,,na,,2479200,32046,,...,Synechococcus elongatus,0.55123,Synechococcus elongatus UTEX 3055,strain=UTEX 3055,Synechococcus,elongatus,UTEX_3055,Synechococcus_elongatus_UTEX_3055,GCA_003957805.1_ASM395780v1_genomic_prodigal_p...,Synechococcus_elongatus_UTEX_3055 # GCA_003957...
10,GCA_009498715.1,PRJNA510697,SAMN10613228,,,na,,2283154,32046,,...,,0.54793,Synechococcus elongatus PCC 11802,PCC 11802,Synechococcus,elongatus,PCC 11802,Synechococcus_sp._PCC 11802,GCA_009498715.1_ASM949871v1_genomic_prodigal_p...,Synechococcus_sp._PCC 11802 # GCA_009498715.1
88,GCA_000817325.1,PRJNA209528,SAMN03278348,,,na,,1350461,1350461,,...,Synechococcus elongatus,0.55436,Synechococcus sp. UTEX 2973,strain=UTEX 2973,Synechococcus,elongatus,UTEX_2973,Synechococcus_elongatus_UTEX_2973,GCA_000817325.1_ASM81732v1_genomic_prodigal_pr...,Synechococcus_elongatus_UTEX_2973 # GCA_000817...


In [791]:
# new_genus counts for quality-filtered is_synecho genomes within Cyanobiaceae
df.loc[
    df['is_synecho'] & (df['gtdb_family'] == 'Cyanobiaceae') & df['quality_filter'],
    'new_genus',
].value_counts()

Parasynechococcus      53
Pseudosynechococcus    41
Synechospongium        30
Inmanicoccus           12
Cyanobium               9
Regnicoccus             9
Vulcanococcus           4
Lacustricoccus          3
Name: new_genus, dtype: int64

In [837]:
# GTDB order counts among the is_synecho genomes (same query as an earlier cell)
df.loc[df['is_synecho'], 'gtdb_order'].value_counts()

Synechococcales          167
Cyanobacteriales          14
PCC-7336                  11
Thermosynechococcales      7
Neosynechococcales         1
Phormidesmiales            1
Pseudanabaenales           1
Name: gtdb_order, dtype: int64

In [851]:
# GTDB family of the is_synecho genomes placed in order Pseudanabaenales
df.loc[df['is_synecho'] & (df['gtdb_order'] == 'Pseudanabaenales'), 'gtdb_family']

86    Pseudanabaenaceae
Name: gtdb_family, dtype: object

In [859]:
# Synechococcales genomes that are not flagged is_synecho, excluding Prochlorothrix
df.loc[
    ~df['is_synecho']
    & (df['gtdb_order'] == 'Synechococcales')
    & (df["gtdb_genus"] != "Prochlorothrix")
]

Unnamed: 0,assembly_accession,bioproject,biosample,wgs_master,excluded_from_refseq,refseq_category,relation_to_type_material,taxid,species_taxid,isolate,...,gtdb_species,gc,organism_name,infraspecific_name,new_genus,new_species,new_strain,label,protein_filename,gtt_label
300,GCA_000007925.1,PRJNA419,SAMN02603142,,,representative genome,assembly from type material,167539,1219,,...,Prochlorococcus marinus,0.36442,Prochlorococcus marinus subsp. marinus str. CC...,strain=CCMP1375; SS120,Prochlorococcus,Prochlorococcus marinus,strain=CCMP1375; SS120,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_000007925.1_ASM792v1_genomic_prodigal_prot...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
301,GCA_000011465.1,PRJNA213,SAMEA3138209,,,na,,59919,1219,,...,Prochlorococcus_A marinus_S,0.30799,Prochlorococcus marinus subsp. pastoris str. C...,strain=MED4,Prochlorococcus_A,Prochlorococcus_A marinus_S,strain=MED4,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_000011465.1_ASM1146v1_genomic_prodigal_pro...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
302,GCA_000011485.1,PRJNA220,SAMEA3138210,,,na,,74547,1219,,...,Prochlorococcus_C marinus_B,0.50740,Prochlorococcus marinus str. MIT 9313,strain=MIT9313,Prochlorococcus_C,Prochlorococcus_C marinus_B,strain=MIT9313,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_000011485.1_ASM1148v1_genomic_prodigal_pro...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
303,GCA_000012465.1,PRJNA13911,SAMN00623057,,,na,,59920,1219,,...,Prochlorococcus_B marinus_B,0.35123,Prochlorococcus marinus str. NATL2A,strain=NATL2A,Prochlorococcus_B,Prochlorococcus_B marinus_B,strain=NATL2A,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_000012465.1_ASM1246v1_genomic_prodigal_pro...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
304,GCA_000012645.1,PRJNA13910,SAMN02598321,,,na,,74546,1219,,...,Prochlorococcus_A marinus_L,0.31210,Prochlorococcus marinus str. MIT 9312,strain=MIT 9312,Prochlorococcus_A,Prochlorococcus_A marinus_L,strain=MIT 9312,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_000012645.1_ASM1264v1_genomic_prodigal_pro...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,GCA_003211895.1,PRJNA445865,SAMN08886554,QCRJ00000000.1,derived from single cell,na,,2162537,2162537,AG-459-P07,...,Prochlorococcus_B marinus_B,0.35048,Prochlorococcus sp. AG-459-P07,,Prochlorococcus_B,Prochlorococcus_B marinus_B,,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_003211895.1_ASM321189v1_genomic_prodigal_p...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
1131,GCA_003211315.1,PRJNA445865,SAMN08886599,QCTC00000000.1,derived from single cell,na,,2162543,2162543,AG-670-O17,...,Prochlorococcus_A sp003211315,0.30892,Prochlorococcus sp. AG-670-O17,,Prochlorococcus_A,Prochlorococcus_A sp003211315,,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_003211315.1_ASM321131v1_genomic_prodigal_p...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
1132,GCA_003211155.1,PRJNA445865,SAMN08886609,QCTM00000000.1,derived from single cell,na,,2162544,2162544,AG-673-K22,...,Prochlorococcus_B marinus_B,0.35286,Prochlorococcus sp. AG-673-K22,,Prochlorococcus_B,Prochlorococcus_B marinus_B,,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_003211155.1_ASM321115v1_genomic_prodigal_p...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...
1133,GCA_003210915.1,PRJNA445865,SAMN08886623,QCUA00000000.1,derived from single cell,na,,2162546,2162546,AG-676-L21,...,Prochlorococcus_A sp003210915,0.30848,Prochlorococcus sp. AG-676-L21,,Prochlorococcus_A,Prochlorococcus_A sp003210915,,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...,GCA_003210915.1_ASM321091v1_genomic_prodigal_p...,c_Cyanobacteriia_o_Synechococcales_f_Cyanobiac...


In [864]:
# Rows whose GTDB family is Neosynechococcaceae
df.loc[df['gtdb_family'] == "Neosynechococcaceae"]

Unnamed: 0,assembly_accession,bioproject,biosample,wgs_master,excluded_from_refseq,refseq_category,relation_to_type_material,taxid,species_taxid,isolate,...,gtdb_species,gc,organism_name,infraspecific_name,new_genus,new_species,new_strain,label,protein_filename,gtt_label
97,GCA_000775285.1,PRJNA244289,SAMN02725026,JJML00000000.1,,representative genome,assembly from type material,1497020,1501145,,...,Neosynechococcus sphagnicola,0.5159,Neosynechococcus sphagnicola sy1,strain=CAUP A 1101,Neosynechococcus,sphagnicola,CAUP_A_1101,Neosynechococcus_sphagnicola_CAUP_A_1101,GCA_000775285.1_ASM77528v1_genomic_prodigal_pr...,Neosynechococcus_sphagnicola_CAUP_A_1101 # GCA...
734,GCA_001939115.1,PRJNA355315,SAMN06074267,MQTZ00000000.1,,na,,1922337,1922337,,...,GCF-001939115 sp001939115,0.52317,Leptolyngbya sp. 'hensonii',strain=hensonii,GCF-001939115,GCF-001939115 sp001939115,strain=hensonii,c_Cyanobacteriia_o_Neosynechococcales_f_Neosyn...,GCA_001939115.1_ASM193911v1_genomic_prodigal_p...,c_Cyanobacteriia_o_Neosynechococcales_f_Neosyn...


In [867]:
# new_genus counts among the is_synecho genomes
df.loc[df['is_synecho'], 'new_genus'].value_counts()

Parasynechococcus      53
Pseudosynechococcus    41
Synechospongium        30
Limnothrix             14
Inmanicoccus           12
Leptococcus            10
Regnicoccus             9
Cyanobium               9
Thermosynechococcus     6
Synechococcus           6
Vulcanococcus           4
Lacustricoccus          3
Leptovivax              1
Brevicoccus             1
Stenotopis              1
Neosynechococcus        1
Coccusdissimilis        1
Name: new_genus, dtype: int64

In [885]:
# Label and sequence release date of the Limnotrichaceae genomes
df.loc[df['gtdb_family'] == 'Limnotrichaceae', ['label', 'seq_rel_date']]

Unnamed: 0,label,seq_rel_date
11,Limnothrix_euryhalinus_PCC 7002,2008/03/14
53,Limnothrix_euryhalinus_PCC_8807,2016/07/25
56,Limnothrix_euryhalinus_PCC_7117,2016/07/25
72,Limnothrix_sp._PCC_7003,2016/07/25
73,Limnothrix_euryhalinus_PCC_73109,2016/01/21
83,Limnothrix_euryhalinus_NKBG042902,2014/06/27
96,Limnothrix_sp._NKBG15041c,2013/10/22
104,Limnothrix_sp._NIES-970,2017/02/08
107,Limnothrix_euryhalinus_PCC_7002,2017/05/13
108,Limnothrix_euryhalinus_OG1,2017/04/21


### Create AAI inputs at family level

In [882]:
# Create AAI inputs, one subset of genomes per family
dst = "../data/comparem/input/"

# Turn the "..//ncbi/" prefix back into "../data/ncbi/". The pattern is plain
# text (regex=False) and includes the trailing slash, so the result no longer
# contains a doubled separator ("../data/ncbi//...").
df['proteins_path'] = df['proteins_path'].str.replace("..//ncbi/", "../data/ncbi/", regex=False)

subsets = dict()

# Cyanobiaceae: is_synecho genomes of that family, plus any genome whose label
# contains one of the listed substrings.
extra_label_markers = ["SS35", "SS2", "SS52", "CCMP1375", "LG"]
is_cyanobiaceae_synecho = (df['gtdb_family'] == 'Cyanobiaceae') & df['is_synecho']
has_extra_marker = df['label'].apply(
    lambda s: any(i in s for i in extra_label_markers))
subsets['cyanobiaceae'] = df[is_cyanobiaceae_synecho | has_extra_marker]



(166, 63)

(1085, 63)