In [1]:
import sys
import pandas as pd

sys.path.insert(0, "/local/path/to/scripts")

from plotting_utils import palettes, split_on_capitals

## File S1 – Sample metadata

In [2]:
# Load the full sample metadata table and the cluster-label table, then
# annotate every sample with its proposed province code, description and
# category. Rows present in the cluster-label table are flagged as included
# in the final analysis.
df_2132 = pd.read_csv("../data/metadata_2132.csv", index_col=0)
df_1454 = pd.read_csv(
    "../data/metadata_1454_cluster_labels.csv",
    index_col=0,
    # Read as text so the cluster ids match the stringified palette keys below.
    dtype={"sourmash_k_10_1487_25m": str, "IHO_Sea_MRGID": str},
)
df_1454["in_final_analysis"] = True

# DataFrame.join is a left join on the index: samples that are absent from
# df_1454 end up with NaN in both joined columns.
df_2132 = df_2132.join(df_1454[["sourmash_k_10_1487_25m", "in_final_analysis"]])
df_2132["in_final_analysis"] = df_2132["in_final_analysis"].notna()
# Samples without a cluster id get the sentinel False, which the lookups
# below translate to "-".
df_2132["sourmash_k_10_1487_25m"] = df_2132["sourmash_k_10_1487_25m"].apply(
    lambda cluster_id: cluster_id if pd.notna(cluster_id) else False
)

# Palette metadata keyed by cluster id; keys are stringified to match the
# text dtype of the cluster column.
prov_metadata = palettes["k_10"]
prov_metadata = {str(k): v for k, v in prov_metadata.items()}
prov_metadata[False] = {"label": "-", "description": "-"}
prov_labels = {k: v["label"] for k, v in prov_metadata.items()}
prov_description = {k: v["description"] for k, v in prov_metadata.items()}

# The description must be mapped while the column still holds cluster ids;
# the next line overwrites those ids with the province labels.
df_2132["Proposed province description"] = df_2132["sourmash_k_10_1487_25m"].map(prov_description)
df_2132["sourmash_k_10_1487_25m"] = df_2132["sourmash_k_10_1487_25m"].map(prov_labels)

# Province label -> broad category.
# NOTE(review): "BPRL" here vs "BPLR" in the categorical ordering applied to
# "Proposed province code" further down this cell -- one of the two spellings
# is probably a typo; confirm against the labels in palettes["k_10"]. Labels
# that are missing from this dict map to NaN.
prov_category = {
    "BPRL": "Polar",
    "BALT": "-",
    "CTEM": "Temperate",
    "OTEM": "Temperate",
    "STEM": "Temperate",
    "MTEM": "Temperate",
    "TGYR": "Tropical",
    "TRHI": "Tropical",
    "TRLO": "Tropical",
    "APLR": "Polar",
    # Samples without a cluster carry the label "-"; give them the same
    # placeholder category as their code and description.
    "-": "-",
}
# Fix: the category column used to be filled from prov_description (a copy of
# the description column, leaving prov_category unused). The cluster column
# holds province labels at this point, so it is mapped through prov_category.
df_2132["Proposed province category"] = df_2132["sourmash_k_10_1487_25m"].map(prov_category)

# Raw column name -> header used in the exported table.
rename_dict = dict(
    [
        # province / region annotations
        ("Prov", "Longhurst province code"),
        ("ProvDescr", "Longhurst province description"),
        ("ProvCategory", "Longhurst province category"),
        ("sourmash_k_10_1487_25m", "Proposed province code"),
        ("IHO_Sea_fmt", "IHO Sea formatted"),
        # accession identifiers
        ("sample_name", "Sample name"),
        ("sra_run", "SRA run"),
        ("division", "Division"),
        ("subdivision", "Subdivision"),
        ("bioproject", "BioProject"),
        ("biosample", "BioSample"),
        # inclusion flag
        ("in_final_analysis", "Included in final analysis"),
    ]
)

# Columns that are left out of the exported table.
cols_to_remove = ["sra_run_fmt", "temperature", "station_fmt", "station", "coords"]

# Rename first, then drop; drop raises KeyError if any listed column is absent.
df_2132 = df_2132.rename(columns=rename_dict)
df_2132 = df_2132.drop(columns=cols_to_remove)

# Tidy the remaining headers, fix the column order, sort the rows and
# normalise the collection date for export.

# Any header that still contains underscores gets spaces instead.
df_2132.columns = [header.replace("_", " ") for header in df_2132.columns]

env_cols = [
    'DiffuseAttenuationCoefficientPAR',
    'MixedLayerDepth',
    'Salinity',
    'Terrain',
    'Nitrate',
    'OceanTemperature',
    'DissolvedMolecularOxygen',
    'PhotosyntheticallyAvailableRadiation',
    'Silicate',
    'TotalCloudFraction',
    'pH',
    'SeaIceThickness',
    'SeaWaterSpeed',
    'DissolvedIron',
    'Phosphate',
    'AirTemperature',
    'TotalPhytoplankton',
    'SeaIceCover',
    'Chlorophyll',
]

accession_cols = [
    'Sample name',
    'SRA run',
    'Division',
    'Subdivision',
    'BioProject',
    'BioSample',
]

sample_cols = ["latitude", "longitude", "collection date", "depth", "instrument"]

category_cols = [
    'Proposed province code',
    'Proposed province description',
    'Proposed province category',
    'Longhurst province code',
    'Longhurst province description',
    'Longhurst province category',
    'IHO Sea',
    'IHO Sea formatted',
    'IHO Sea MRGID',
    'Included in final analysis',
]

# CamelCase environmental names are passed through split_on_capitals; its
# output for "...PAR" evidently contains "P A R", which is patched back to
# "PAR" here.
rename_env = {
    col: split_on_capitals(col).replace("P A R", "PAR")
    for col in env_cols
}
rename_sample = {col: col.capitalize() for col in sample_cols}

header_map = {**rename_env, **rename_sample}
df_2132 = df_2132.rename(columns=header_map)

# Keep the column lists in sync with the renamed headers.
env_cols = [rename_env.get(col, col) for col in env_cols]
sample_cols = [rename_sample.get(col, col) for col in sample_cols]

column_order = [*accession_cols, *category_cols, *sample_cols, *env_cols]
df_2132 = df_2132[column_order]

# Ordered categorical so that sorting follows this province order rather than
# alphabetical order. Values that are not in the category list become NaN.
# NOTE(review): these codes differ from the keys of prov_category defined
# earlier in this cell -- confirm they match the labels in palettes["k_10"].
province_order = "BPLR BALT CTEM SANT NADR MEDI TGYR PEQD TROP APLR".split()
df_2132["Proposed province code"] = pd.Categorical(
    df_2132["Proposed province code"],
    categories=province_order,
    ordered=True,
)

# Included samples first, then by province order, then north to south.
df_2132 = df_2132.sort_values(
    by=["Included in final analysis", "Proposed province code", "Latitude"],
    ascending=[False, True, False],
)

# Parse heterogeneous date strings as UTC and keep only the calendar date.
df_2132["Collection date"] = (
    pd.to_datetime(df_2132["Collection date"], format="mixed", utc=True)
    .dt.strftime("%Y-%m-%d")
)

In [3]:
# Write File S1 in both spreadsheet and plain-text form; the index is omitted
# from both outputs.
s1_xlsx_path = "/local/path/to/data/File_S1_sample_metadata.xlsx"
s1_csv_path = "/local/path/to/data/File_S1_sample_metadata.csv"

df_2132.to_excel(s1_xlsx_path, index=False)
df_2132.to_csv(s1_csv_path, index=False)

## File S2 – Genome metadata

In [None]:
# Load the genome metadata table, sort it by taxonomy and keep only the
# taxonomy, assembly-metric and CheckM/GTDB columns, in that order.
df = pd.read_csv("/local/path/to/data/genome_metadata.tsv", index_col=0, sep="\t")
df = df.rename_axis("Genome ID")

gtdb_columns = ['checkm_marker',
       'checkm_n_genome', 'checkm_n_marker', 'checkm_n_marker_set', 'checkm_0',
       'checkm_1', 'checkm_2', 'checkm_3', 'checkm_4', 'checkm_5+',
       'checkm_compl', 'checkm_contam', 'checkm_hetero', 'genome_quality_QS',
       'QS_plus_ln(N50)', 'gtdb_novelty', 'gtdb_classification',
       'gtdb_fastani_reference', 'gtdb_fastani_reference_radius',
       'gtdb_fastani_taxonomy', 'gtdb_fastani_ani',
       'gtdb_closest_placement_radius', 'gtdb_fastani_af',
       'gtdb_closest_placement_reference', 'gtdb_closest_placement_taxonomy',
       'gtdb_closest_placement_ani', 'gtdb_closest_placement_af',
       'gtdb_pplacer_taxonomy', 'gtdb_classification_method', 'gtdb_note',
       'gtdb_other_related_references(genome_id,species_name,radius,ANI,AF)',
       'gtdb_aa_percent', 'gtdb_translation_table', 'gtdb_red_value',
       'gtdb_warnings',]

metric_columns = ['num_contig', 'largest_contig_length', 'total_length', 'percent_GC',
       'N50', 'ambiguous_nucleotide_per_100kb']

taxonomy_columns = ['data_source', 'genome_type', 'domain',
       'phylum', 'class', 'order', 'family', 'genus', 'species',
       'sci_names']

# Fix: sort_values returns a new frame, so the result has to be assigned;
# the bare call previously left df unsorted. Sort keys are the taxonomic
# ranks (domain .. sci_names), skipping data_source and genome_type.
df = df.sort_values(taxonomy_columns[2:], ascending=False)

# Report the input columns that are not in any of the three lists, i.e. the
# ones the selection below drops.
print("Columns not present:", [i for i in df.columns if i not in gtdb_columns + metric_columns + taxonomy_columns])
df = df[taxonomy_columns + metric_columns + gtdb_columns]

# Turn the raw snake_case headers into readable ones. str.capitalize()
# upper-cases the first character and lower-cases the rest, so acronyms such
# as GC / GTDB are restored via replace_terms afterwards.
df.columns = [i.replace("_", " ").capitalize() for i in df.columns]

# Substring -> replacement, applied to the capitalized headers.
replace_terms = {
       "Gtdb": "GTDB",
       "Checkm": "CheckM",
       "gc": "GC",
       "quality qs": "quality Qs",
       "n genome": "num genome",
       "red value": "RED value"
}

# Fix: apply the substitutions cumulatively per column. Previously each term
# was substituted into the *original* header and stored under the same key,
# so a column matching two terms (e.g. "Checkm n genome", "Gtdb red value")
# kept only the last substitution and lost the first.
new_cols = {}
for col in df.columns:
    renamed = col
    for term, new_term in replace_terms.items():
        renamed = renamed.replace(term, new_term)
    if renamed != col:
        new_cols[col] = renamed

df = df.rename(columns=new_cols)

In [None]:
# Write File S2 as CSV and as an Excel workbook; the index is kept in both
# outputs (no index=False here).
s2_csv_path = "/local/path/to/data/File_S2_genome_metadata.csv"
s2_xlsx_path = "/local/path/to/data/File_S2_genome_metadata.xlsx"

df.to_csv(s2_csv_path)
df.to_excel(s2_xlsx_path)