In [None]:
# Master switch for file output: the cells that write cluster_data.csv and
# results.xlsx only do so when this flag is True.
save_file = True

# Create the dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Cluster membership spreadsheet, read with pandas' default settings; it is
# melted into Cluster/SampleID pairs in the following cells.
clusters_xlsx = "../../../data/original_files/class_analysis_16_10/Clusters.xlsx"
clusters = pd.read_excel(clusters_xlsx)

In [None]:
# Wide -> long: one row per spreadsheet cell, pairing the column header with
# the cell's value.
clusters = pd.melt(clusters)
clusters

In [None]:
# Name the two melted columns, then discard rows that contain a missing value.
clusters = clusters.set_axis(["Cluster", "SampleID"], axis="columns").dropna()

In [None]:
# Display the long-format cluster table (columns "Cluster" and "SampleID")
# after rows with missing values were dropped.
clusters

In [None]:
# Pathway abundance table. The first spreadsheet column is used as the index
# and the table is then transposed, so the spreadsheet's column headers become
# the row labels of `abundances`.
abundance_xlsx = (
    "../../../data/original_files/class_analysis_16_10/Subset_pathabundance.xlsx"
)
abundances = pd.read_excel(abundance_xlsx, index_col=0).T

In [None]:
# Show the row labels of the transposed abundance table; these are the values
# matched against the SampleID column in the merge below.
abundances.index

In [None]:
# Inner join: keep only the samples found in both tables, matching the row
# labels of the abundance table against the SampleID column of the clusters.
data = pd.merge(
    abundances,
    clusters,
    how="inner",
    left_index=True,
    right_on="SampleID",
)
data

In [None]:
# Add the last sample, W1351, by hand. Its row in the abundance table is
# labelled "W1351bowtie2aligned", which differs from the SampleID "W1351" given
# to it here (presumably why the merge above did not pick it up - TODO confirm).
# Selecting with a list of labels keeps the row as a one-row DataFrame and
# preserves the column dtypes.
W1351 = abundances.loc[["W1351bowtie2aligned"]].assign(
    Cluster="Cluster 1", SampleID="W1351"
)

# Only append when the sample is not in `data` yet, so that re-running this
# cell neither duplicates the row nor stacks a second index column.
if "W1351" not in set(data["SampleID"]):
    # reset_index() moves the current row labels into a regular "index" column,
    # exactly as on the first run of the original cell.
    data = pd.concat([data, W1351]).reset_index()
data

In [None]:
# Write the merged table to disk unless saving is switched off. The row index
# is written as well (pandas' default for to_csv).
cluster_data_csv = "../../../data/processed_files/cluster_data.csv"
if save_file:
    data.to_csv(cluster_data_csv)

# Run fit_model_scripts/fit_supervised_clusters_Figure_3.R at this point, then continue with the cells below

# Look at the results

In [None]:
import matplotlib.pyplot as plt
from matplotlib import colormaps
import matplotlib.colors as colors
import numpy as np
import seaborn as sns
import pandas as pd

In [None]:
# Load the model results table; the first CSV column becomes the row index.
cluster_results_csv = "../../../data/results_files/heatmap/cluster_significance/2023-10-21/_beg_3_end_85_cluster_results.csv"
full_res = pd.read_csv(cluster_results_csv, index_col=0)
full_res

In [None]:
# Histogram of the "r2" column of the results table
r2_fig, r2_ax = plt.subplots()
r2_ax.hist(full_res["r2"])
r2_ax.set_title("R2")
plt.show()

In [None]:
# Correct the emmeans contrast p-values for multiple testing.
#
# NOTE: the correction applied is Holm's method (method="holm"), even though
# the result variable and the columns derived from it carry a "BH" suffix. The
# names are kept because later cells select columns by that suffix.
from statsmodels.stats.multitest import fdrcorrection, multipletests

# Raw contrast p-value columns only: skip the Model/Intercept terms and any
# already-corrected "... BH" column, so that re-running this cell after the
# corrected columns were added to full_res does not correct values twice.
pval_col = [
    column
    for column in full_res.columns
    if "pval" in column
    and "Model" not in column
    and "Intercept" not in column
    and not column.endswith(" BH")
]

# Flatten column by column into one vector so that all contrasts are corrected
# together; a later cell slices pvalues_BH back apart in this same order.
pvalues = [pvalue for column in pval_col for pvalue in full_res[column]]

pvalues_BH = multipletests(pvalues, alpha=0.05, method="holm")[1]

# log10 of a zero p-value is -inf (and a missing one stays NaN); plt.hist
# cannot bin non-finite values, so only the finite ones are plotted.
with np.errstate(divide="ignore"):
    log_pvalues = np.log10(pvalues_BH)
plt.hist(log_pvalues[np.isfinite(log_pvalues)])
plt.title("Holm corrected log10 p-values for the emmeans estimates")
plt.xlabel("log10(corrected p-value)")
plt.ylabel("Count")
plt.show()

In [None]:
# Write the corrected p-values back into full_res, one new "<column> BH"
# column per raw p-value column.
#
# Keep only the emmeans contrasts and lose the Model estimates directly output
# by the MM. Columns that already end in " BH" are skipped so that re-running
# this cell overwrites the corrected columns instead of deriving
# "... BH BH" columns from them.
raw_pval_cols = [
    col
    for col in full_res.columns
    if "pval" in col
    and "Model" not in col
    and "Intercept" not in col
    and not col.endswith(" BH")
]

# pvalues_BH is expected to hold the corrected values column after column, one
# value per row of full_res; fail loudly if the sizes do not line up rather
# than assigning misaligned slices.
n_rows = len(full_res)
if len(pvalues_BH) != n_rows * len(raw_pval_cols):
    raise ValueError(
        f"Expected {n_rows * len(raw_pval_cols)} corrected p-values "
        f"({len(raw_pval_cols)} columns x {n_rows} rows), got {len(pvalues_BH)}; "
        "re-run the correction cell first."
    )

for position, col in enumerate(raw_pval_cols):
    start = position * n_rows
    full_res[col + " BH"] = pvalues_BH[start : start + n_rows]

full_res

In [None]:
# Keep just the corrected p-value columns (those whose name contains "pval BH")
pvals = full_res.filter(like="pval BH", axis="columns")
pvals

In [None]:
# Plot the estimates, hiding those whose corrected p-value is above 0.05.
# Hidden cells are left undrawn, so they show the light grey axes background
# that is set when the heatmap is created below.
mask = pvals.gt(0.05)

# Every column whose name contains "est"
estimate_cols = [name for name in full_res.columns if "est" in name]
estimates = full_res[estimate_cols]

# Display labels for all pairwise contrasts among clusters 1 to 5, in the order
# (1, 2), (1, 3), ..., (3, 5), (4, 5)
xlabels = [
    f"Cluster {first} - Cluster {second}"
    for first in range(1, 6)
    for second in range(first + 1, 6)
]

# Row order for the heatmap: pathway identifiers listed in the order used in
# figure 3. The estimates and the mask are reindexed to this list further down,
# so rows whose label is not listed here do not appear in the plot.
pathway_order = [
    "PA401",
    "PA57",
    "PA10",
    "PA11",
    "PA592",
    "PA34",
    "PA469",
    "PA205",
    "PA468",
    "PA325",
    "PA245",
    "PA267",
    "PA130",
    "PA244",
    "PA394",
    "PA119",
    "PA291",
    "PA236",
    "PA457",
    "PA248",
    "PA396",
    "PA249",
    "PA518",
    "PA208",
    "PA570",
    "PA1",
    "PA488",
    "PA441",
    "PA577",
    "PA141",
    "PA584",
    "PA5",
    "PA13",
    "PA574",
    "PA72",
    "PA333",
    "PA69",
    "PA388",
    "PA393",
    "PA395",
    "PA233",
    "PA373",
    "PA509",
    "PA278",
    "PA110",
    "PA161",
    "PA266",
    "PA454",
    "PA92",
    "PA588",
    "PA397",
    "PA246",
    "PA247",
    "PA332",
    "PA116",
    "PA117",
    "PA24",
    "PA23",
    "PA479",
    "PA566",
    "PA162",
    "PA16",
    "PA74",
    "PA253",
    "PA6",
    "PA12",
    "PA90",
    "PA255",
    "PA140",
    "PA160",
    "PA487",
    "PA289",
    "PA290",
    "PA303",
    "PA526",
    "PA17",
    "PA55",
    "PA385",
    "PA404",
    "PA70",
    "PA27",
    "PA201",
    "PA529",
]

# Relabel the contrast columns and put the rows in the figure order.
# set_axis returns a relabelled frame instead of assigning `.columns` on a
# frame that was sliced out of full_res.
estimates = estimates.set_axis(xlabels, axis="columns").reindex(pathway_order)
# fill_value=True keeps the mask strictly boolean: a pathway listed in
# pathway_order but absent from the results would otherwise get NaN mask
# entries. Such rows have no estimate to draw, so masking them is correct.
mask = mask.set_axis(xlabels, axis="columns").reindex(pathway_order, fill_value=True)

fig, ax = plt.subplots(figsize=(5, 20))
hm = sns.heatmap(estimates, mask=mask, cmap="seismic", center=0, ax=ax)
# Masked (non-significant) cells are not drawn and show this background colour
hm.set_facecolor("lightgrey")
# The title and axis labels are set after drawing: sns.heatmap assigns the
# axis labels itself from the index/column names, which would replace labels
# set beforehand.
hm.set(title="Estimates", xlabel="Contrast", ylabel="Pathway")
plt.show()

In [None]:
# Table to export: the estimate columns plus the corrected p-value columns
reported_cols = [
    name
    for name in full_res.columns
    if any(tag in name for tag in ("est", "pval BH"))
]
results = full_res[reported_cols]
results

In [None]:
def _label_contrast_column(name):
    """Turn a raw result column name into a readable contrast label.

    The name is split on whitespace: the first token must hold two cluster
    ids joined by "-", and the second token gives the kind of value. The kind
    "est" becomes "estimate"; any other kind becomes "corrected pvalue".
    The result has the form "Cluster <a> - Cluster <b> <kind>".

    Names that do not have this shape (for instance labels produced by an
    earlier run of this cell) are returned unchanged, which makes the
    relabelling safe to re-run.
    """
    tokens = name.split()
    if len(tokens) < 2:
        return name
    cid = tokens[0].split("-")
    if len(cid) < 2:
        return name
    # `kind` rather than `type`, so the builtin type() is not rebound at
    # notebook scope.
    kind = "estimate" if tokens[1] == "est" else "corrected pvalue"
    return f"Cluster {cid[0]} - Cluster {cid[1]} {kind}"


# rename() returns a relabelled frame instead of assigning `.columns` on a
# frame that was sliced out of full_res.
results = results.rename(columns=_label_contrast_column)
results

In [None]:
# Export the relabelled results table unless saving is switched off
results_xlsx = "../../../data/results_files/heatmap/cluster_significance/results.xlsx"
if save_file:
    results.to_excel(results_xlsx)