# Motivation

The goal of this Notebook is to explore the differences between the Clustering analysis and Network approach. To understand the differences between the genes selected through the usual ways and through the network.

# Init

In [39]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio


import multiprocess as mp

# own libraries
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
# needed for older pickler of some older experimt
sys.path.append(os.path.dirname("../../src")) 

from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities.helpers import save_fig
from NetworkAnalysis.utilities import clustering as cs


from NetworkAnalysis.utilities import pre_processing as pre
from NetworkAnalysis.utilities import modelling as md


pio.templates.default = "ggplot2"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"
exp_folder_tumour = "network_I/tum/"
tcga_data = "../../data/tumour/"

figures_path = "network_vs_clustering/"

#### Load the data ####
vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

# prep mut
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

all_tum_tpms = pd.read_csv(f"{data_base}/tumour/TPMs_selected_genes_v3_13k_gc42.tsv", sep="\t", index_col="genes")

## TCGA data
tcga_metadata_df = pd.read_csv(
    f"{tcga_data}/metadata_tcga_v2.csv"
)  # This version contains some small modifications on the spreadsheet such as removing duplicates of 01B as some were 01b and others 01B
healthy_metadata = pd.read_csv(f"{data_base}/metadata/healthy_bladder_metadata.tsv", sep="\t", index_col="Sample", dtype_backend="pyarrow")

infiltration_response = pd.read_csv(f"{tcga_data}/TCGA_infiltrate_response.csv", sep="\t")
consensus_classifier = pd.read_csv(f"{tcga_data}/consensus_classifier_comparisons.tsv", sep="\t")
lund = pd.read_csv(f"{tcga_data}/Lund_TCGA_cohort_subtypes.txt", sep="\t")
estimates_scores = pd.read_csv(f"{tcga_data}/bladder_urothelial_carcinoma_RNAseqV2.tsv", sep="\t").rename(columns={"ID": "Sample"})

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")

In [8]:
# process lund
lund.rename(columns={"TCGA2017.subtype": "TCGA", "ID": "Sample"}, inplace=True)
lund["Sample"] = ["-".join(sample.split("-")[:-1]) for sample in lund["Sample"]]

# process ESTIMATE scores
estimates_scores["Sample"] = ["-".join(sample.split("-")[:-1]) for sample in estimates_scores["Sample"]]
estimates_scores.set_index("Sample", inplace=True)

## Load experiment sets

In [9]:
%autoreload 2

tum = ExperimentSet("tum", base_path, exp_folder_tumour, tcga_mutations_df, sel_sets = ["5K"], rel_path="../")
tum.export_to_gephi(save=False)

# extra filtering out to just keep the 10TF experiments
# selected_exps = [exp for exp in tum.get_exp_labels() if "standard" in exp]

# extra filtering out to just keep the 6TF experiments
new_exps = {}
for key, val in tum.exps.items():
    tf_val = int(key.split("_")[2].split("TF")[0])
    if tf_val in [6] and "standard" in key:
        new_exps[key] = val

tum.exps = new_exps

##### Experiment labels:  dict_keys(['standard_5K_50TF', 'beta_5K_50TF', 'norm2_5K_50TF', 'norm3_5K_50TF', 'standard_5K_6TF', 'standard_5K_5TF', 'standard_5K_4TF', 'standard_5K_3TF', 'norm3_5K_5TF', 'norm3_5K_4TF', 'norm3_5K_6TF', 'beta_5K_3TF', 'beta_5K_4TF', 'beta_5K_6TF', 'norm3_5K_3TF', 'standard_5K_7TF', 'standard_5K_8TF', 'beta_5K_7TF', 'standard_5K_9TF', 'standard_5K_10TF', 'norm3_5K_7TF', 'norm3_5K_8TF', 'beta_5K_8TF', 'norm3_5K_9TF', 'beta_5K_9TF', 'beta_5K_10TF', 'norm3_5K_10TF', 'beta_5K_5TF'])


In [10]:
tf_range = [6]
exps, entropy_df = GtExp.load_sbm_exps(tum, name="standard_5K", exp_type="tum_700", tf_range=tf_range)

entropy_df["Entropy_norm"] = (entropy_df["Entropy"] - entropy_df["Entropy"].min()) / (entropy_df["Entropy"].max() - entropy_df["Entropy"].min())
entropy_df["Entropy_log10"] = np.log10(entropy_df["Entropy"])

### Loaded standard_int_tum_5K_6TF


## Spearman Correlation viz

In [11]:
def high_corr(corr_matrix: pd.DataFrame, mut_df: pd.DataFrame, th=0.4, figures_path=None, label=""):
    high_corr = corr_matrix[(corr_matrix > th) | (corr_matrix < -th)].copy(deep=True)
    high_corr = high_corr[high_corr != 1.0].copy(deep=True)

    high_corr_num = pd.DataFrame((high_corr.shape[0] - high_corr.isnull().sum()).reset_index()).rename(columns={0: "#high_cor"}).set_index("genes")

    # Counting the high correlated genes
    high_corr_num["mut_count"] = mut_df["count"]
    high_corr_num.loc[high_corr_num.index.isin(tf_list), "isTF"] = 1
    high_corr_num.fillna(0, inplace=True)
    high_corr_num["isTF"] = high_corr_num["isTF"].astype(str)

    fig_scatter = px.scatter(
        high_corr_num.reset_index(),
        x="genes",
        y="#high_cor",
        color="isTF",
        size_max=30,
        size="mut_count",
        title="# of genes that have high correlations",
        log_y=True,
        height=700,
    )

    # Looking at the values as well
    high_corr["mut_count"] = mut_df["count"]
    high_corr.loc[high_corr.index.isin(tf_list), "isTF"] = 1
    # high_corr.fillna(0, inplace=True)
    high_corr["isTF"] = high_corr["isTF"].astype(str)

    high_corr["#high_cor"] = high_corr_num["#high_cor"]

    # Un-comment for matrix viz
    if figures_path is not None:
        fig = px.imshow(high_corr.drop(columns=["#high_corr", "#high_cor"]), title="High correlated genes")
        fig.write_html(f"{figures_path}/highCorr_matrix_{label}.html")

    return high_corr, high_corr_num, fig_scatter

In [12]:
sel_exp = tum.exps["standard_5K_6TF"]
corr_matrix = sel_exp.tpm_df.T.corr().round(4)
# corr_matrix.to_csv(
#     figures_path + "spearman_corr.csv",
# )

In [13]:
df, df_num, fig = high_corr(corr_matrix, mut_df=tcga_mutations_df, th=0.4)
fig.show()

## How many high correlated tumours from all Genes are used in the network

In [14]:
corr_matrix_all = all_tum_tpms.T.corr().round(4)
df_all, df_num_all, fig = high_corr(corr_matrix_all, mut_df=tcga_mutations_df, th=0.4)

In [15]:
all_corr_g = set(df_all.sort_values(by="#high_cor", ascending=False).index.values[:5000])
cmn = set(sel_exp.tpm_df.index.values) & all_corr_g
print(f"There are {len(cmn)} common genes between the most corr from all the expressed genes and the highest varied")

There are 1524 common genes between the most corr from all the expressed genes and the highest varied


## ModCon & MEV

In [16]:
def worker(arg):
    obj, methname = arg[:2]
    _ = getattr(obj, methname)()
    return obj


pool = mp.Pool(mp.cpu_count())

# generate modCon
for exp in exps.values():
    exp.sbm_method = "sbm"
results = pool.map(worker, ((exp, "get_ModCon") for exp in exps.values()))
exps_gt = {exp.type.split("_")[-1].split("TF")[0]: exp for exp in results}

In [17]:
gt_exp = exps_gt["6"]
sort_col = "ModCon_{}_gt".format(gt_exp.type)
gt_modCon = gt_exp.get_ModCon()
# generate Mevs
for key, exp in exps_gt.items():
    sort_col = "ModCon_{}_gt".format(exp.type)
    exp.mevsMut, _ = exp.get_mevs(tpms=all_tum_tpms, modCon=exp.gt_modCon, sort_col=sort_col, num_genes=100, verbose=False)

# Clustering Analysis

Apply the standard K-means and the process we applied in the previous stage of the PhD

In [18]:
data_tpm, working_tpm, raw_metadata_t, selected_genes, common_samples = pre.prep_data(
    gt_exp.tpm_df.reset_index(), tcga_metadata_df.copy(deep=True), consensus_classifier, remap_cols=False
)

# adding infiltration
infiltration_response = infiltration_response[infiltration_response["Sample"].isin(list(common_samples))]
raw_metadata_t = (
    pd.concat([raw_metadata_t.rename(columns={"index": "Samples"}).set_index("Samples"), infiltration_response.set_index("Sample")], axis=1)
    .reset_index()
    .rename(columns={"index": "Samples"})
)

For th 0.9 ==> at least non-NAN values 364
####### Gene selection, num genes: 3500 #######
The genes selected by the highest standard deviation/median ration.
3500


In [19]:
selected_clusters = ["RawKMeans"]
n_clusters, n_comp = 5, 5

# Un-comment to use negative samples
data = np.log2(1 + (data_tpm[common_samples] / data_tpm[common_samples].median(axis=0)).transpose()).sort_index(ascending=False)
negative_samples = md.negative_silhouette_samples(data, n_comp, n_clusters)
samples = list(set(common_samples) - set(negative_samples))

data = np.log2(1 + (data_tpm[common_samples] / data_tpm[common_samples].median(axis=0)).transpose()).sort_index(ascending=False)

# run experiments
outputs, cluster_models, all_metrics, _ = md.compare_exp(
    data,
    selected_genes,
    raw_metadata_t,
    rob_comp=None,
    n_clusters=n_clusters,
    selected_clusters=selected_clusters,
    show_figures=False,
    custom_points=None,
    show_consensus=False,
    pca_data=True,
    n_comp=n_comp,
)

# Plot the PCA
outputs, label_name = md.add_labels(outputs, label_name="KMeans_labels_5")

outputs["RawKMeans_CS_5"] = outputs["RawKMeans_CS_5"].astype(str)
fig = px.scatter(outputs, x="PC_1", y="PC_2", color=label_name, height=600, hover_data=["TCGA408_classifier", "consensus"], title="KMeans 5")
fig = fig.update_traces(marker=dict(size=16, line=dict(width=1, color="Black")), selector=dict(mode="markers"))
# fig.show()

Variation per principal component [0.20063214 0.09500002] and the sum 29.56%
Variation per principal component [0.20063214 0.09500002 0.05929994 0.04423806 0.03736743] and the sum 43.65%
PCA score  -5383.486540762696


In [20]:
# add lund df
lund_df = pd.concat([lund.set_index("Sample"), outputs.set_index("Sample")], axis=1).reset_index().dropna().rename(columns={"index": "Sample"})
lund_df = lund_df[["Sample", "TCGA", "TCGA408_classifier", "consensus", "KMeans_labels_5", "RawKMeans_CS_5", "Lund2017.subtype"]]

# reorder_cols = ["TCGA408_classifier", "KMeans_labels_5", "RawKMeans_CS_5", "consensus"]
# sky.main(df=lund_df, reorder_cols=reorder_cols, title="Comparison between " + ", ".join(reorder_cols))

# Compare the two approaches

## Differences in the genes selected

In [21]:
%autoreload 2
gene_stats = gh.extract_gene_sel(exp=gt_exp, ref_ge=all_tum_tpms, num_genes=3000)

For th 0.5 ==> at least non-NAN values 202
####### Gene selection, num genes: 3000 #######
The genes selected by the highest standard deviation/median ration.
CS vs Network (all). There are 1717 different genes. 
CS vs Network (sel). There are 2253 different genes. 
CS vs Network (sel). Common genes: 747
CS vs Network (all). Common genes: 1283
Network selected: 2834
Highest relative/std 3000


## Gene expresion vs Variance vs Mutation

In [22]:
%autoreload 2
ann_add = ["LAMA3", "TNXB", "MYH11", "TNC", "VCAN", "SAMD9", "CDK12", "CDKN2A", "FOXQ1"]
ann_rm = [
    "RP11-87N24.3",
    "RP5-940J5.9",
    "AC132217.4",
    "IL20RB",
    "CTC-425F1.4",
    "AC005301.9",
    "RP11-20D14.6",
    "RP11-54H7.4",
    "PTHLH",
    "GABRP",
    "CD36",
    "SYNM",
    "MYH11",
    "VCAN",
    "UTRN",
]
ann_chg = ["FABP4", "KRT13", "SAMD9", "ELF3", "IGF2", "TGM1", "CCL20", "OBSCN", "SACS"]

prep_df = gh.prep_net_vs_ca(ge_df=all_tum_tpms, mut_df=tcga_mutations_df, gt_genes=gene_stats["net_sel_genes"], cs_genes=gene_stats["cs_genes"])

fig = gh.plot_net_vs_ca(prep_df, log=True, annotations=True, ann_add=ann_add, ann_rm=ann_rm, ann_chg=ann_chg)
fig = fig.update_layout(
    title="",
    legend=dict(
        orientation="h",
        title="Type",
        yanchor="bottom",
        xanchor="center",
        x=0.8,
        y=0.9,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=20, color="#003366"),
    ),
    xaxis=dict(tickfont=dict(size=18)),
    yaxis=dict(tickfont=dict(size=18), title="Mutation burden"),
    font=dict(size=18),
)
save_fig(name="ClusteringAnalysis_vs_Network_3", fig=fig, base_path=figures_path, width=1400, height=700, margin=0.02)

### Distribution of the data

In [23]:
figs, titles = [], []
fig1 = px.histogram(
    prep_df.reset_index(),
    x="mut_count",
    color="type",
    hover_data=["genes", "mut_count", "rel_var", "median_raw"],
    barmode="group",
    title="Mutations count spread",
)
fig1.update_layout(
    xaxis_range=[0, 30],
    xaxis=dict(tickmode="linear", tick0=0, dtick=1),
    legend=dict(
        title="",
        orientation="h",
        yanchor="top",
        y=1.0,
        xanchor="center",
        x=0.5,
        bgcolor="rgba(0,0,0,0)",
    ),
)

fig2 = px.histogram(
    prep_df.reset_index(), x="rel_var", color="type", hover_data=["genes", "mut_count", "rel_var", "median_raw"], barmode="stack", title="Rel var spread"
)
fig2.update_layout(
    legend=dict(
        title="",
        orientation="h",
        yanchor="top",
        y=1.0,
        xanchor="center",
        x=0.5,
        bgcolor="rgba(0,0,0,0)",
    ),
)

fig3 = px.histogram(
    prep_df.reset_index(),
    x="median_raw",
    color="type",
    hover_data=["genes", "mut_count", "rel_var", "median_raw"],
    barmode="stack",
    title="Median spread",
)
fig3 = fig3.update_layout(
    xaxis_range=[0, 100],
    xaxis=dict(tickmode="linear", tick0=0, dtick=10),
    legend=dict(
        title="",
        orientation="h",
        yanchor="top",
        y=1.0,
        xanchor="center",
        x=0.5,
        bgcolor="rgba(0,0,0,0)",
    ),
)

if True:
    display(fig1)
    display(fig2)
    display(fig3)

    save_fig(name="MutCount_distrib", fig=fig1, base_path=figures_path, width=1000, height=500)
    save_fig(name="RelVar_distrib", fig=fig2, base_path=figures_path, width=1000, height=500)
    save_fig(name="Median_distrib", fig=fig3, base_path=figures_path, width=1000, height=500)

# Clustering the genes from the difference genes

We assume that the genes we are interested are the ones from the most varied


In [34]:
def prep_clustering(used_genes: set, tum_df: pd.DataFrame, metadata_df: pd.DataFrame, consensus_classifier: pd.DataFrame):
    """
    Preparing for clustering analysis

    Args:
        used_genes (set): Genes to be used
        tum_df (pd.DataFrame): The tumour dataframe
        metadata_df (pd.DataFrame): TCGA metadata

    """

    # re-do the pre-processing
    _, working_tpm, _, _, _ = pre.prep_data(tum_df.reset_index(), metadata_df.copy(deep=True), consensus_classifier, remap_cols=False)

    data_raw = working_tpm.set_index("genes")
    data_raw = data_raw[data_raw.index.isin(used_genes)]

    # quick check
    if data_raw.shape[0] == len(used_genes):
        print("Check 1 ✅. Data TPM and list of genes are the same.")
    else:
        print("Check 1 ❌. Data TPM and list of genes are not the same.")

    data = np.log2(1 + (data_raw / data_raw.median(axis=0)).transpose()).sort_index(ascending=False)
    print("# PCA transformation ")
    print("### Before log transform: ")
    gh.find_pcs(data)
    print("### After log transform: ")
    gh.find_pcs(data_raw)

    return data, data_raw


def clustering_analysis(config: dict, label=""):
    outputs, _, all_metrics, pca_model = md.compare_exp(
        config["data"],
        config["used_genes"],
        config["metadata"],
        rob_comp=None,
        n_clusters=None,
        selected_clusters=config["selected_clusters"],
        show_figures=False,
        custom_points=None,
        show_consensus=False,
        pca_data=config["pca_data"],
        n_comp=config["n_comp"],
    )
    outputs.set_index("Sample", inplace=True)
    outputs["TCGA408_classifier"] = config["vu_output"]["TCGA408_classifier"]
    outputs["consensus"] = config["vu_output"]["consensus"]
    outputs["Lund"] = config["vu_output"]["Lund2017.subtype"]
    outputs["VU_CA"] = config["vu_output"]["KMeans_labels_6"]

    fig = cs.display_metrics(all_metrics, "Cluster metrics for {}".format(exp.type), show_individual=False, verbose=False)
    fig = fig.update_layout(height=900)
    fig.show()

    top_metrics = gh.rank_cs_metrics(all_metrics, label="Selected by CA")
    print(top_metrics)

    fig, _ = cs.elbow_method(pca_model["data"], min_k=2, max_k=50, label=label)
    fig.show()

    return outputs, top_metrics, pca_model

## Highest variance

In [36]:
used_genes = gene_stats["diff_sel"]
print(f"### Highest varied specific #genes {len(used_genes)}")

data_var, _ = prep_clustering(used_genes, all_tum_tpms, tcga_metadata_df, consensus_classifier)

### Highest varied specific #genes 2253
For th 0.9 ==> at least non-NAN values 364
####### Gene selection, num genes: 3500 #######
The genes selected by the highest standard deviation/median ration.
3500
Check 1 ✅. Data TPM and list of genes are the same.
# PCA transformation 
### Before log transform: 
Sum of 90% variance at PC: 184
Change < 1% at PC: 4
### After log transform: 
Sum of 90% variance at PC: 3
Change < 1% at PC: 5


In [40]:
config = {
    "data": data_var,
    "metadata": raw_metadata_t,
    "vu_output": vu_output,
    "pca_data": True,
    "n_comp": 5,
    "selected_clusters": ["Ward", "Birch", "SpectralClustering", "RawKMeans", "GaussianMixture"],
    "used_genes": used_genes,
}

out_var, top_3, pca_var = clustering_analysis(config, label="Highest varied")

Variation per principal component [0.13507794 0.0577701 ] and the sum 19.28%
Variation per principal component [0.13507794 0.0577701  0.04291404 0.04080834 0.03091352] and the sum 30.75%
PCA score  -3196.850732480956


              Selected by CA
Sil_cos_1               KM_8
Sil_cos_2               KM_5
Sil_cos_3               KM_6
Cal_hab_1               KM_4
Cal_hab_2               KM_5
Cal_hab_3             Spec_4
Dav_bou_1               KM_7
Dav_bou_2  GaussianMixture_8
Dav_bou_3             Spec_7


In [41]:
sel_ca_model = "RawKMeans_CS_5"
out_var[sel_ca_model] = out_var[sel_ca_model].astype(str)
fig = px.scatter(out_var, x=f"PC_1", y="PC_2", color=sel_ca_model, height=600, hover_data=["TCGA408_classifier", "consensus"], title=f"{sel_ca_model}")
fig = fig.update_traces(marker=dict(size=16, line=dict(width=1, color="Black")), selector=dict(mode="markers"))
fig = fig.update_layout(
    xaxis_title=f"Updated X-Axis Title ({pca_var['pca'].explained_variance_ratio_[0].round(4)})",
    yaxis_title=f"PC_2 ({pca_var['pca'].explained_variance_ratio_[1].round(4)})",
)

In [42]:
sel_ca_model = "RawKMeans_CS_7"
reorder_cols = ["TCGA408_classifier", "VU_CA", sel_ca_model, "consensus"]
out_var.dropna(inplace=True)
meta, sky_fig = sky.main(df=out_var.dropna(), reorder_cols=reorder_cols, title="Comparison between " + ", ".join(reorder_cols), retMeta=True)

sky_stats = sky.prep_sankey_description(sky_meta=meta, input_df=out_var, sel_cols=reorder_cols)

config = {
    "shared_x": False,
    "shared_y": True,
    "h_spacing": 0.01,
    "v_spacing": 0.19,
    "main_title": "",
    "height": 700,
    "width": 1200,
    "y_title": "#Samples",
    "x_title": "",
    "specs": None,
}
fig = sky.plot_sankey_description(config, sky_stats)

sky_fig.show()
fig.show()

## Only the Network selected genes

In [43]:
used_genes_net = set(gene_stats["net_sel_genes"]) - set(gene_stats["cs_genes"])
print(f"### Network specific #genes {len(used_genes_net)}")

data_net, _ = prep_clustering(used_genes_net, all_tum_tpms, tcga_metadata_df, consensus_classifier)

### Network specific #genes 2087
For th 0.9 ==> at least non-NAN values 364
####### Gene selection, num genes: 3500 #######
The genes selected by the highest standard deviation/median ration.
3500
Check 1 ✅. Data TPM and list of genes are the same.
# PCA transformation 
### Before log transform: 
Sum of 90% variance at PC: 116
Change < 1% at PC: 6
### After log transform: 
Sum of 90% variance at PC: 6
Change < 1% at PC: 8


In [44]:
config = {
    "data": data_net,
    "metadata": raw_metadata_t,
    "vu_output": vu_output,
    "pca_data": True,
    "n_comp": 7,
    "selected_clusters": ["Ward", "Birch", "SpectralClustering", "RawKMeans", "GaussianMixture"],
    "used_genes": used_genes_net,
}

out_net, top_3, pca_net = clustering_analysis(config, label="Network selected")

Variation per principal component [0.2317124  0.10444722] and the sum 33.62%
Variation per principal component [0.2317124  0.10444722 0.07813941 0.05592184 0.03999257 0.03006956
 0.0177849 ] and the sum 55.81%
PCA score  -1946.6662431708244


          Selected by CA
Sil_cos_1           KM_4
Sil_cos_2           KM_6
Sil_cos_3         Spec_4
Cal_hab_1           KM_4
Cal_hab_2           KM_5
Cal_hab_3         Spec_5
Dav_bou_1         Spec_7
Dav_bou_2         Spec_5
Dav_bou_3         Spec_6


In [45]:
sel_ca_model = "RawKMeans_CS_5"
out_net[sel_ca_model] = out_net[sel_ca_model].astype(str)
fig = px.scatter(out_net, x=f"PC_1", y="PC_2", color=sel_ca_model, height=600, hover_data=["TCGA408_classifier", "consensus"], title=f"{sel_ca_model}")
fig = fig.update_traces(marker=dict(size=16, line=dict(width=1, color="Black")), selector=dict(mode="markers"))
fig = fig.update_layout(
    xaxis_title=f"Updated X-Axis Title ({pca_net['pca'].explained_variance_ratio_[0].round(4)})",
    yaxis_title=f"PC_2 ({pca_net['pca'].explained_variance_ratio_[1].round(4)})",
)

In [46]:
sel_ca_model = "RawKMeans_CS_7"  # "SpectralClustering_CS_7"
reorder_cols = ["TCGA408_classifier", "VU_CA", sel_ca_model, "consensus"]

meta, sky_fig = sky.main(df=out_net.dropna(), reorder_cols=reorder_cols, title="Comparison between " + ", ".join(reorder_cols), retMeta=True)
sky_fig.show()

## Compared the two selection

In [47]:
# Pre-processing
num = 7
sel_net_model, new_net = f"RawKMeans_CS_{num}", f"Net KM - {num}"
sel_var_model, new_var = f"RawKMeans_CS_{num}", f"Var KM - {num}"
comb_df = pd.concat([out_net[sel_net_model].rename(new_net), out_var.rename(columns={sel_var_model: new_var})], axis=1)

# Sankey plot, comparison
reorder_cols = ["TCGA408_classifier", "VU_CA", new_net, new_var, "consensus"]
meta, sky_fig = sky.main(df=comb_df.dropna(), reorder_cols=reorder_cols, title="Comparison between " + ", ".join(reorder_cols), retMeta=True)
sky_stats = sky.prep_sankey_description(sky_meta=meta, input_df=comb_df.dropna(), sel_cols=reorder_cols)

config = {
    "shared_x": False,
    "shared_y": True,
    "h_spacing": 0.01,
    "v_spacing": 0.16,
    "main_title": "",
    "height": 800,
    "width": 1200,
    "y_title": "#Samples",
    "x_title": "",
    "specs": None,
}
fig = sky.plot_sankey_description(config, sky_stats)

sky_fig.show()
fig.show()

## Common between network and CA

In [48]:
used_genes = set(gene_stats["net_sel_genes"]) - set(gene_stats["cs_genes"])

## TF ctrl

In [49]:
tf_ctrl = pd.read_csv(data_base + "tf_ctrl.csv").rename(columns={"Unnamed: 0": "gene"}).set_index("gene")
tf_ctrl_exp = all_tum_tpms.loc[all_tum_tpms.index.isin(tf_ctrl.index)]
# tf_ctrl_exp = tf_ctrl_exp / tf_ctrl_exp.median(axis=1)

scaled_tf = (tf_ctrl_exp - tf_ctrl_exp.min()).div((tf_ctrl_exp.max(axis=1) - tf_ctrl_exp.min(axis=1)), axis=0)

In [50]:
tf_ctrl_exp = pd.concat(
    [
        out_var[["TCGA408_classifier", "consensus", "VU_CA"]],
        tf_ctrl_exp.T,
    ],
    axis=1,
).T

In [51]:
tf_ctrl_exp.to_csv(data_base + "tf_ctrl_gene_exp_tum_2.csv", index=True)