In [128]:
import pandas as pd
from os import path
from matplotlib import rc
from matplotlib import pyplot as plt
from scipy.spatial.distance import squareform, pdist
from scipy.cluster.hierarchy import dendrogram, linkage

In [134]:
# Root directory of the local data checkout; the AAI input and the plot output
# are both resolved relative to it.
small_data = "/home/vini/Bio/ProSyn/data"
# Tab-separated AAI summary (one row per genome pair).
aai_file = path.join(small_data, "aai/synechococcus/aai_summary.tsv")
# Metadata table carrying the New_Genus / New_Species / Strain columns.
# NOTE: an earlier read of <small_data>/Synechococcus_metadata.csv was dropped
# here because its result was overwritten by this table before any use.
metadata = pd.read_csv("/home/vini/Bio/SynechococcusGT/data/new_metadata_table.csv")
df = pd.read_csv(aai_file, sep="\t")

In [135]:
# Keep only the part of each genome identifier that precedes "_genomic",
# applying the same trim to both genome columns.
for genome_col in ("#Genome A", "Genome B"):
    df[genome_col] = df[genome_col].apply(lambda name: name.split("_genomic")[0])
# Second name bound to the same frame (an alias, not a copy).
df_ = df

In [136]:
# Mirror every comparison by swapping the two genome columns, then stack the
# mirrored rows under the originals so each pair is listed in both directions.
# The swap is derived from `df` rather than from `df_` itself: renaming `df_`
# in place of itself would swap the columns back on a second run of this cell
# and silently turn `df_t` into two copies of `df`.
df_ = df.rename(columns={"#Genome A": "Genome B", "Genome B": "#Genome A"})
# sort=True orders the columns of the concatenated frame alphabetically.
df_t = pd.concat([df, df_], sort=True)

In [139]:
# Display the column names available in the metadata table.
metadata.columns

Index(['assembly_accession', 'bioproject', 'biosample', 'wgs_master',
       'excluded_from_refseq', 'refseq_category', 'relation_to_type_material',
       'taxid', 'species_taxid', 'organism_name', 'infraspecific_name',
       'isolate', 'version_status', 'assembly_level', 'release_type',
       'genome_rep', 'seq_rel_date', 'asm_name', 'submitter',
       'gbrs_paired_asm', 'paired_asm_comp', 'ftp_path', 'filename',
       'Completeness', 'Contamination', 'directory', 'contigs_file',
       'proteins_file', 'genes_file', 'Size', 'GC', 'New_Genus', 'New_Species',
       'Strain'],
      dtype='object')

In [140]:
# Map each metadata "filename" to a human-readable label:
#   "<New_Genus> <New_Species> <Strain>" when a genus string is present,
#   "<organism_name> <assembly_accession>" otherwise.
renaming_dict = dict()
for filename, genus, species, strain, organism_name, assembly in zip(
    metadata["filename"], metadata["New_Genus"], metadata["New_Species"],
    metadata["Strain"], metadata["organism_name"], metadata["assembly_accession"]
):
    # A non-string genus (e.g. the float NaN pandas uses for empty cells)
    # selects the fallback label.
    if isinstance(genus, str):
        parts = (genus, species, strain)
    else:
        parts = (organism_name, assembly)
    # str.join raises TypeError on NaN or numeric cells, so missing pieces are
    # skipped and the remaining ones are stringified before joining.
    renaming_dict[filename] = " ".join(str(part) for part in parts if pd.notna(part))

In [141]:
# Square genome-by-genome matrix of "Mean AAI"; pivot_table's default
# aggregation averages any repeated (A, B) rows.
table = pd.pivot_table(df_t, values="Mean AAI", index="#Genome A", columns="Genome B")

# Cells with no value in the input are set to 99.99.
table = table.fillna(value=99.99)

# Filenames that pass the completeness/contamination thresholds and also occur
# among the table columns, kept in metadata order.
passing = metadata.query("Completeness >= 50 & Contamination <= 5")["filename"]
qc_filter = [name for name in passing if name in table.columns]

# Restrict both axes to the QC-passing genomes, in the same order.
table = table.loc[qc_filter, qc_filter]

# Swap the file-based identifiers for the labels in renaming_dict on both axes.
table = table.rename(columns=renaming_dict, index=renaming_dict)

In [143]:
# Convert AAI values to dissimilarities: absolute difference from 99.99.
X = (table - 99.99).abs()
# squareform turns the square matrix into the condensed vector form.
X = squareform(X)
# Complete-linkage clustering with optimal leaf ordering.
# NOTE(review): per the SciPy docs `metric` is only used when observation
# vectors are passed; X is condensed here, so it appears to have no effect - confirm.
Z = linkage(X, method="complete", metric="cityblock", optimal_ordering=True)

In [None]:
def augmented_dendrogram(*args, annotate_above=1.5, **kwargs):
    """Draw a SciPy dendrogram and label the height of its higher merges.

    All positional and remaining keyword arguments are forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    *args, **kwargs
        Passed straight through to ``dendrogram``.
    annotate_above : float, keyword-only, default 1.5
        Only merges whose height is strictly greater than this value get a
        red marker and a text label.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    ddata = dendrogram(*args, **kwargs)

    # With no_plot=True nothing was drawn, so there is nothing to annotate.
    if not kwargs.get('no_plot', False):
        # Annotate the axes the dendrogram was drawn on: the caller's `ax`
        # when one was passed, otherwise the current axes.
        ax = kwargs.get('ax')
        if ax is None:
            ax = plt.gca()
        for i, d in zip(ddata['icoord'], ddata['dcoord']):
            # Each link has four points; the middle two form its horizontal
            # bar, so their mean x and shared y locate the merge.
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                ax.plot(x, y, 'ro')
                ax.annotate("%.3g" % y, (x, y), xytext=(0, -8),
                            textcoords='offset points',
                            va='top', ha='center')

    return ddata

# 60 x 15 inch canvas for the dendrogram.
fig = plt.figure(figsize=(60, 15))

dn = augmented_dendrogram(
    Z,
    # Pass a plain list rather than the pandas Index.
    # NOTE(review): some SciPy releases appear to truth-test `labels`, which
    # raises ValueError for an Index/ndarray - confirm for the pinned version.
    labels=list(table.columns),
    leaf_rotation=-90,
    color_threshold=30,
    leaf_font_size=9,
)

# Write the figure next to the AAI data; bbox_inches="tight" trims the margins
# around the drawn content.
plt.savefig(path.join(small_data, "aai/plots/synechococcus_q50.png"), dpi=700, bbox_inches="tight")