In [1]:
import pandas as pd
import seaborn as sns
from os import path
from matplotlib import rc
from matplotlib import pyplot as plt
from scipy.spatial.distance import squareform, pdist
from scipy.cluster.hierarchy import dendrogram, linkage

# Select matplotlib's built-in "default" style sheet for every figure below.
plt.style.use("default")

In [2]:
# Project root directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
wd = "/Users/viniWS/Bio/synecho"

# Pairwise AAI summary (tab-separated) and the per-genome metadata table.
aai_file = path.join(wd, "data/aai/aai_summary.tsv")
df = pd.read_csv(aai_file, sep="\t")
metadata = pd.read_csv(path.join(wd, "data/Synechococcus_metadata_table.csv"))

# Keep only the part of each genome identifier that precedes "_genomic".
for genome_col in ("Genome A", "Genome B"):
    df[genome_col] = df[genome_col].map(lambda name: name.split("_genomic")[0])

# Second handle on the same frame (an alias, not a copy).
df_ = df

In [3]:
# For every row, count how many rows of the metadata table share its New_Genus.
genus_sizes = metadata.groupby("New_Genus")["New_Genus"].transform("count")
# Cast the counts to int, leaving missing values untouched.
metadata["No_genus"] = genus_sizes.map(int, na_action="ignore")

In [4]:
# Mirror every (A, B) comparison as (B, A) so that each pair is present in both
# orientations. The mirrored frame is derived from `df` itself rather than from
# the previous value of `df_`: re-running this cell therefore cannot swap the
# columns back and silently turn `df_t` into two stacked copies of `df`.
df_ = df.rename(columns={"Genome A": "Genome B", "Genome B": "Genome A"})

# sort=True sorts the column axis, because the two frames list the same
# columns in a different order.
df_t = pd.concat([df, df_], sort=True)

In [5]:
# Wrap the placeholder epithet "sp." in \mathrm{...}; every other value is
# passed through unchanged. A raw string is used because "\m" is not a valid
# escape sequence in a regular string literal (it triggers a warning today and
# is slated to become an error); the resulting text is identical.
metadata["New_Species"] = metadata["New_Species"].apply(
    lambda s: r"\mathrm{" + s + "}" if s == "sp." else s
)

In [6]:
def _plain_name(row):
    """Build the plain-text display name for one metadata row.

    Rows whose New_Genus is a float (i.e. a missing value) fall back to the
    Organism field with underscores turned into spaces; all other rows get
    "<New_Genus> <New_Species> <Strain>".
    """
    if type(row["New_Genus"]) is float:
        return row["Organism"].replace("_", " ")
    name_parts = [row["New_Genus"], row["New_Species"], row["Strain"]]
    return " ".join(name_parts)


metadata["New_Name"] = metadata.apply(_plain_name, axis=1)

In [7]:
def raw(str):
    """Return ``str`` enclosed in a pair of dollar signs.

    NOTE: the parameter name shadows the built-in ``str``; it is kept so that
    existing keyword callers keep working.
    """
    return f"${str}$"

In [8]:
def _mathtext_name(row):
    r"""Build the math-markup display name for one metadata row.

    When both New_Genus and TypeSpecies are present (not float / NaN) the name
    is the genus, species, ``\mathrm{<Strain>}`` and a type marker joined by
    ``\ `` (backslash-space). The type marker is the TypeSpecies value with
    "yes" replaced by "_T" and "no" removed. Otherwise the Organism field is
    used, with underscores turned into spaces.

    Raw string literals are used for the backslash sequences: "\ " and "\m"
    are invalid escapes in regular literals, even though they currently
    evaluate to the same text.
    """
    if type(row["New_Genus"]) == float or type(row["TypeSpecies"]) == float:
        return row["Organism"].replace("_", " ")

    type_marker = row["TypeSpecies"].replace("yes", "_T").replace("no", "")
    return r"\ ".join(
        [
            row["New_Genus"],
            row["New_Species"],
            r"\mathrm{" + row["Strain"] + "}",
            type_marker,
        ]
    )


metadata["New_Name_2"] = metadata.apply(_mathtext_name, axis=1)

In [9]:
# Enclose every display name in "$...$" via raw().
# NOTE: this cell is not idempotent — running it twice wraps the names twice.
metadata["New_Name_2"] = metadata["New_Name_2"].map(raw)

In [10]:
# Lookup from genome file name to its display name (New_Name_2).
# If a FileName occurs more than once, the last row wins.
renaming_dict = dict(zip(metadata["FileName"], metadata["New_Name_2"]))

In [11]:
# Square genome-by-genome matrix of "Mean AAI" values:
#   1. pivot the long table (rows = Genome A, columns = Genome B);
#   2. fill every missing cell with 99.99;
#   3. relabel both axes with the display names from renaming_dict.
table = (
    df_t.pivot_table(values="Mean AAI", index="Genome A", columns="Genome B")
    .fillna(value=99.99)
    .rename(columns=renaming_dict, index=renaming_dict)
)

In [12]:
# Genomes kept for the analysis must satisfy all three conditions:
#   - Completeness strictly above 50%
#   - Size strictly above 1 (1 Mbp, per the original note)
#   - No_genus strictly above 1
# The result is the Series of their New_Name_2 display names.
keep_mask = (
    (metadata["Completeness"] > 50.0)
    & (metadata["Size"] > 1)
    & (metadata["No_genus"] > 1)
)
filter_ = metadata.loc[keep_mask, "New_Name_2"]

In [13]:
# Restrict the AAI matrix to the selected genomes on both axes.
table_filtered = table.loc[filter_, filter_]

# Kept as the last expression so the number of selected genomes is actually
# displayed; as a bare statement in the middle of the cell it was discarded.
filter_.shape

In [14]:
# Restrict the AAI matrix to the selected genomes on both axes.
table_filtered = table.loc[filter_, filter_]

# NOTE: a `table_filtered.rename(...)` call that replaced "_" with " " in the
# labels used to follow here. Its return value was never assigned, so it had
# no effect and has been removed. It was deliberately not turned into an
# assignment: that would also strip the underscore of the "_T" marker that is
# embedded in the labels.

# Turn similarity into a distance: |AAI - 99.99|. Cells that hold the 99.99
# fill value therefore become 0.
X = (table_filtered - 99.99).abs()

# Convert the square matrix into the condensed 1-D form expected by linkage.
X = squareform(X)

# Complete-linkage clustering. No `metric` is passed: linkage ignores it when
# it is given a condensed distance vector, so the former metric="cityblock"
# argument had no effect.
Z = linkage(X, method="complete", optimal_ordering=True)

In [None]:
def augmented_dendrogram(*args, annotate_above=1.5, **kwargs):
    """Draw a SciPy dendrogram and label the height of its higher merges.

    Parameters
    ----------
    *args, **kwargs
        Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.
    annotate_above : float, optional
        Keyword-only. Only merges whose height is strictly greater than this
        value get a red dot and a text label. Defaults to 1.5, the threshold
        that was previously hard-coded.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    ddata = dendrogram(*args, **kwargs)

    # Nothing was drawn, so there is nothing to annotate.
    if kwargs.get('no_plot', False):
        return ddata

    # Annotate the axes the dendrogram was drawn on: the explicit ``ax`` when
    # the caller supplied one, otherwise the current pyplot axes.
    ax = kwargs.get('ax')
    if ax is None:
        ax = plt.gca()

    # NOTE(review): icoord/dcoord are used as (x, y), which presumably only
    # matches the vertical orientations — confirm before using
    # orientation='left'/'right'.
    for icoord, dcoord in zip(ddata['icoord'], ddata['dcoord']):
        x = 0.5 * sum(icoord[1:3])  # horizontal midpoint of the link
        y = dcoord[1]               # height at which the two clusters merge
        if y > annotate_above:
            ax.plot(x, y, 'ro')
            ax.annotate("%.3g" % y, (x, y), xytext=(0, -8),
                        textcoords='offset points',
                        va='top', ha='center')

    return ddata

fig = plt.figure(figsize=(40, 15))

# Labels are passed as a plain list: older SciPy releases evaluate the truth
# value of `labels`, which raises for a pandas Index with more than one entry.
# Links merging above color_threshold=30 are drawn in the default colour.
dn = augmented_dendrogram(
    Z,
    labels=table_filtered.columns.tolist(),
    leaf_rotation=-90,
    color_threshold=30,
    leaf_font_size=12,
)

#plt.savefig(path.join(wd, "data/finais/aai_default_species.png"), dpi=700, bbox_inches="tight")