# Motivation

This is a copy of the p0_modifiers notebook that was used for my PhD thesis. The main difference is that the tumour gene expression data used here contains all the genes rather than only the 13k most varied genes.

From brief testing, this simple switch gives different results, which can be explained by having more genes in the DataFrame: there is a better representation of the mutated genes, Agglomerative clustering performs better than K-means, and the basal is split much better.

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys 

import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import multiprocess as mp

import igraph as ig

from gseapy.plot import dualplot
import gseapy as gp

# own libraries
# Make the project sources importable from the notebook.
# os.path.abspath resolves the bare file name against the current working directory,
# so SCRIPT_DIR is the working directory and its parent is what gets appended to sys.path.
SCRIPT_DIR = os.path.dirname(os.path.abspath("pcgna_processing.py"))
sys.path.append(os.path.dirname(SCRIPT_DIR))
# NOTE(review): machine-specific absolute path — this entry only resolves on the original author's machine.
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.NetworkComp import NetworkComp
from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig

from NetworkAnalysis.utilities import clustering as cs


# Notebook-wide plotly defaults: renderer used by fig.show() and the figure template.
pio.renderers.default = "plotly_mimetype+notebook"
pio.templates.default = "ggplot2"

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Init

In [2]:
# Base locations, all relative to the notebook's working directory.
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"
exp_folder_tumour = "network_I/tum/"  # "/integration_v2.1/ - path from iNET
exp_folder_p0_gc42 = "network_I/P0_gc42/"

# where to save figures
figures_path = "p0_modifiers/"

# Reference classification table, indexed by sample.
vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

# TCGA mutation counts per gene; genes with a zero count are dropped.
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

# With this there will be different results than the ones in the PhD thesis
all_tum_tpms = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")
else:
    # Later cells use tf_list unconditionally. Without this branch a missing file only
    # surfaces much later as a NameError, so bind an empty list and say so here.
    print(f"TF list not found at {tf_path}; continuing with an empty tf_list (no gene will be flagged as TF)")
    tf_list = np.array([], dtype="str")

## Load experiment sets

In [3]:
%autoreload 2

# Load the tumour and the P0 (gc42) experiment sets from their result folders.
# sel_sets presumably selects which gene-set sizes are loaded ("4K"/"7K") — TODO confirm against ExperimentSet.
tum = ExperimentSet("tum", base_path, exp_folder_tumour, tcga_mutations_df, sel_sets = ["4K", "7K"], rel_path="../")
p0_42 = ExperimentSet("p0_gc42", base_path, exp_folder_p0_gc42, tcga_mutations_df, sel_sets = ["4K"], rel_path="../")

%autoreload 2
# Run the Gephi export step for both sets with save=False
# (presumably prepares the network tables without writing files — TODO confirm).
tum.export_to_gephi(save=False)
p0_42.export_to_gephi(save=False)


❗️ Check this experiment, there might be a problem in loading the data: norm2_int_tum_4K_50TF
##### Experiment labels:  dict_keys(['beta_4K_50TF', 'norm2_4K_50TF', 'standard_4K_50TF', 'beta2_4K_50TF', 'beta3_4K_50TF', 'norm3_4K_50TF', 'standard_7K_50TF', 'beta_7K_50TF', 'norm2_7K_50TF', 'norm3_7K_50TF', 'standard_4K_10TF', 'norm3_4K_10TF', 'beta_4K_10TF', 'standard_4K_6TF', 'beta_4K_6TF', 'norm3_4K_6TF', 'standard_4K_3TF', 'beta_4K_3TF', 'norm3_4K_3TF', 'standard_4K_5TF', 'standard_4K_8TF', 'standard_4K_4TF', 'standard_4K_7TF', 'beta_4K_5TF', 'beta_4K_4TF', 'norm3_4K_7TF', 'norm3_4K_5TF', 'beta_4K_8TF', 'norm3_4K_4TF', 'beta_4K_7TF', 'norm3_4K_8TF', 'standard_4K_9TF', 'norm3_4K_9TF', 'beta_4K_9TF'])
##### Experiment labels:  dict_keys(['standard_4K_4TF', 'standard_4K_3TF', 'standard_4K_6TF', 'norm3_4K_4TF', 'beta_4K_4TF', 'norm3_4K_5TF', 'beta_4K_6TF', 'beta_4K_5TF', 'beta_4K_3TF', 'norm3_4K_6TF', 'norm3_4K_3TF', 'standard_4K_7TF', 'standard_4K_8TF', 'standard_4K_9TF', 'standard_4K_5TF

## Computed ModCon and MEV scores

In [4]:
def worker(arg):
    """Call a zero-argument method on an object and hand the object back.

    Args:
        arg: sequence whose first two items are the target object and the
            name of the method to invoke; any further items are ignored.

    Returns:
        The same object, after the method has been called on it. The
        method's own return value is discarded.
    """
    target, method_name = arg[:2]
    bound_method = getattr(target, method_name)
    bound_method()
    return target

In [5]:
# Compute the ModCon scores of every experiment in parallel, one task per experiment.
# worker() returns the experiment object, so each set's exps dict is rebuilt from the results.
# The context manager shuts the worker processes down once both maps are done;
# previously the pool was never closed, so the processes were left running.
with mp.Pool(mp.cpu_count()) as pool:
    results = pool.map(worker, ((exp, "get_ModCon") for exp in p0_42.exps.values()))
    p0_42.exps = {exp.type: exp for exp in results}

    results = pool.map(worker, ((exp, "get_ModCon") for exp in tum.exps.values()))
    tum.exps = {exp.type: exp for exp in results}

In [6]:
# Derive the MEVs of every P0 experiment from the full tumour TPM table,
# sorting on the experiment's own ModCon column and requesting 100 genes.
for exp in p0_42.exps.values():
    exp.mevsMut, _ = exp.get_mevs(
        tpms=all_tum_tpms,
        modCon=exp.modCons,
        sort_col=f"ModCon_{exp.type}",
        num_genes=100,
        verbose=False,
    )

# TF 50 (report)

Between the standard, norm and reward networks, compare the network metrics: degree, PageRank, closeness, betweenness and IVI.

The network configuration is: 4K and 10TF. and 4K and 3TF.

## Overview (report)

In [7]:
# The three 50TF networks of the P0 set, keyed by modifier.
std_nt = p0_42.exps["standard_4K_50TF"]
rwrd_nt = p0_42.exps["norm3_4K_50TF"]
pen_nt = p0_42.exps["beta_4K_50TF"]

metrics_df_50 = gh.prep_net_metrics(std_nt, rwrd_nt, pen_nt)

In [9]:
# Plot the network metrics of the three 50TF networks (log_y=True, coloured by the "Type" column).
# filename and figs_path are passed so the helper can write the figure under figures_path
# (presumably it saves the file itself — TODO confirm).
fig = gh.plot_net_metrics(
    metrics_df_50, log_y=True, label="P0_42 - 50TF", color="Type", filename="P0_NetworkMetricsComp_{}_2".format("50TF"), figs_path=figures_path
)
fig = fig.update_layout(height=1000)
fig.show()

## Clustering analysis

In [15]:
# `vers` tags every figure file name written by this notebook; later cells reuse it.
vers = "v3"
tf = 50
figs, titles = [], ["Standard", "Reward", "Penalised"]

metrics_figs, sill_figs = [], []
# Short labels used in figure names for each network modifier.
r_mod = {"standard": "std", "norm3": "rwd", "beta": "pen"}

# Ranked clustering metrics, one frame per modifier. They are concatenated once after the
# loop instead of growing a DataFrame (starting from an empty one) on every iteration.
ranked_metrics = []
for modifier in ["standard", "norm3", "beta"]:
    _, fig, metrics = gh.run_clusters(p0_42.exps["{}_4K_{}TF".format(modifier, tf)], label="{}_tf{}".format(r_mod[modifier], tf), show_figs=True)
    fig = gh.update_legend(fig)
    dmy = gh.rank_cs_metrics(metrics, label=r_mod[modifier])
    ranked_metrics.append(dmy)
    metrics_figs.append(fig)

    # Save the clustering-metrics figure of this modifier.
    save_fig(name=f"CA_metrics_{r_mod[modifier]}_tum_4k_{vers}", fig=fig, base_path=figures_path, width=1700, height=900)

    # sill fig
    fig, _ = gh.sill_distrib(metrics, label=f"{r_mod[modifier]}_tum4k_{vers}", figures_path=figures_path)
    sill_figs.append(fig)

# Side-by-side table of the ranked metrics for the three modifiers.
top = pd.concat(ranked_metrics, axis=1)

figs = gh.prcs_top3_metrics(top, label=f"p0_tum4K_50TF_{vers}", figures_path=figures_path)

Variation per principal component [0.63181158 0.21488563] and the sum 84.67%
Variation per principal component [0.62266196 0.2354051 ] and the sum 85.81%
Variation per principal component [0.60670946 0.22549558] and the sum 83.22%


### Elbow method

In [13]:
# One elbow figure per network; k runs from 2 up to the number of columns in the experiment's MEV table.
figs, titles = [], ["Standard", "Reward", "Penalised"]
for net in ("standard", "norm3", "beta"):
    sel_exp = p0_42.exps[f"{net}_4K_50TF"]
    n_mev_cols = sel_exp.mevsMut.shape[1]
    fig, sum_dist = cs.elbow_method(sel_exp.mevsMut, min_k=2, max_k=n_mev_cols, label="test")
    figs.append(fig)

In [14]:
# Layout options handed to gh.helper_multiplots; num_cols is 3, matching the three elbow figures.
subplots_config = {
    "num_cols": 3,
    "shared_x": False,
    "shared_y": False,
    "h_spacing": None,
    "v_spacing": 0.08,
    "main_title": "K-means elbow methods for each network",
    "height": 400,
    "width": None,
    "y_title": None,
    "x_title": None,
    "specs": None,
}

# `figs` and `titles` come from the elbow-method loop in the previous cell.
fig = gh.helper_multiplots(figs, titles, subplots_config)
fig.show()
# `vers` is assigned in the "Clustering analysis" cell, which must have been run before this one.
save_fig(name=f"p0_elbowMethod_4K_{vers}", fig=fig, base_path=figures_path, width=None, height=400)

### K=4 vs K=5

In the previous section we looked at the differences between clustering models and numbers of clusters, and found that K-means and Agglomerative clustering with average linkage are the best models. There is also no clear winner between K=4 and K=5.

In [22]:
# Combine all TFs

tf = 50
# Cluster assignments of the three 50TF networks. PC_1/PC_2 are kept only from the standard
# run so that the concatenation below does not contain those columns three times.
comb_std, _, _ = gh.run_clusters(p0_42.exps["standard_4K_{}TF".format(tf)], label="std_tf{}".format(tf))
comb_norm3, _, _ = gh.run_clusters(p0_42.exps["norm3_4K_{}TF".format(tf)], label="norm3_tf{}".format(tf), show_figs=False)
comb_norm3.drop(columns=["PC_1", "PC_2"], inplace=True)
comb_beta, _, _ = gh.run_clusters(p0_42.exps["beta_4K_{}TF".format(tf)], label="beta_tf{}".format(tf))
comb_beta.drop(columns=["PC_1", "PC_2"], inplace=True)

sel_exp = p0_42.exps["standard_4K_{}TF".format(tf)]
# Cluster labels plus the reference classifications; rows with any missing value are dropped.
comb_tfs = pd.concat([comb_std, comb_norm3, comb_beta, vu_output], axis=1).dropna()


def _std_sankey(comb_df, tf, cluster_model, ks, model_title, fig_name):
    """Draw and save the Sankey diagram of two standard-network clusterings.

    The two cluster columns are placed between the reference classifications
    already present in ``comb_df``. Column labels and the figure title are built
    from ``ks`` so that they always name the K values actually plotted.

    Args:
        comb_df: frame holding the cluster columns and the reference columns.
        tf: TF setting used in the cluster column names.
        cluster_model: model prefix of the cluster columns (e.g. "RawKMeans", "Avg").
        ks: pair of cluster counts to compare.
        model_title: model name shown in the figure title.
        fig_name: file name passed to save_fig.

    Returns:
        The Sankey figure returned by sky.main.
    """
    k_first, k_second = ks
    reorder_cols = [
        "TCGA_2017_AM_remap",
        "KMeans_labels_6",
        "{}_CS_{}_std_tf{}".format(cluster_model, k_first, tf),
        "{}_CS_{}_std_tf{}".format(cluster_model, k_second, tf),
        "2019_consensus_classifier",
    ]
    rename_cols = {
        reorder_cols[0]: "TCGA_2017",
        reorder_cols[1]: "CA + IFNg",
        reorder_cols[2]: "Standard K={}".format(k_first),
        reorder_cols[3]: "Standard K={}".format(k_second),
        reorder_cols[4]: "Consensus_2019",
    }
    _, sankey_fig = sky.main(
        df=comb_df.rename(columns=rename_cols),
        reorder_cols=list(rename_cols.values()),
        title="Standard Network. {} with K={} and K={}".format(model_title, k_first, k_second),
        retMeta=True,
    )
    save_fig(name=fig_name, fig=sankey_fig, base_path=figures_path, width=900, height=500)
    return sankey_fig


sky_fig = _std_sankey(comb_tfs, tf, "RawKMeans", (4, 5), "K-means", f"Sankey_KM_4K_{vers}")

# NOTE(review): the Avg-linkage diagram reads the K=6 and K=7 columns. It used to be labelled
# "K=4"/"K=5"; the labels now follow the columns. If K=4/K=5 was intended, change `ks` here.
sky_fig = _std_sankey(comb_tfs, tf, "Avg", (6, 7), "Agg_Avg", f"Sankey_Avg_4K_{vers}")
sky_fig.show()

Variation per principal component [0.46000661 0.35831906] and the sum 81.83%
Variation per principal component [0.47349261 0.37027456] and the sum 84.38%
Variation per principal component [0.4688147  0.34786232] and the sum 81.67%


## Leiden comparison and Sankey overview

Get an overview of the Leiden scores, number of communities and changes for TF = 10

In [None]:
# Table of Leiden scores combined across the experiments of the P0 set.
leiden_scores = p0_42.comb_leiden_scores()

# the TF = 50 contain experiments with the different modifiers
# leiden_scores = leiden_scores.loc[leiden_scores["TF"] != "50"]

# Specific to the tum dataset
# Relabel the "beta" modifier as "Penalty".
leiden_scores.loc[leiden_scores["Modifier"] == "beta", "Modifier"] = "Penalty"

# Figure for multiple TFs and 3 Leiden scores
# NOTE(review): this figure is never shown or saved — `fig` is reassigned two statements below.
fig = px.scatter(leiden_scores, x="Modifier", color="TF", y="ModularityScore", size="ModuleNum", facet_col="Leiden Rank", facet_col_wrap=4)

# Figure for one TF and 10 Leiden scores
fig = gh.plot_leiden(p0_42, tf="50")
# fig.show()

## Overview Leiden/Sankey (report)

In [23]:
# Settings of the Leiden/Sankey overview: TF setting, number of clusters, gene set and clustering model.
tf, no_K, no_genes, cs_model = 50, 5, "4K", "RawKMeans"

p0_42, sky_fig, cols = gh.prep_sankey_leiden(p0_42, vu_output=vu_output, tf=tf, no_K=no_K, no_genes=no_genes, chosen_cs_model=cs_model)
leiden_sky = gh.plot_sankey_leiden(p0_42, sky_fig=sky_fig, rename_cols=cols, label="TF-{}".format(tf), tf=str(tf))

fig = leiden_sky.update_layout(font=dict(size=16), height=1000)
# The file name is built from the settings above (it used to hard-code "RawKMeans_K5"),
# so changing cs_model or no_K no longer overwrites the figure under a misleading name.
# `vers` comes from the "Clustering analysis" cell.
save_fig(name=f"Ldn_Sky_TF_{tf}_{cs_model}_K{no_K}_{vers}", fig=fig, base_path=figures_path, width=1400, height=1000)
fig.show()

Variation per principal component [0.46000661 0.35831906] and the sum 81.83%
Variation per principal component [0.47349261 0.37027456] and the sum 84.38%
Variation per principal component [0.4688147  0.34786232] and the sum 81.67%


## Gene representation (report)



### ModCon Selection

In [28]:
# Gene representation of the standard 50TF P0 network, measured against the TPM table
# of the tumour standard_4K_10TF experiment.
sel_exp = p0_42.exps["standard_4K_50TF"]
tum_4k_tpms = tum.exps["standard_4K_10TF"].tpm_df
data_df = gh.mev_modcon_genes(sel_exp, ref_tpms=tum_4k_tpms, num_genes=100)

fig = px.bar(
    data_df,
    x="Comm",
    y="Num genes",
    color="Type",
    title=f"Tum 4K. Gene representation in P0 gc42. Exp {sel_exp.type}",
    text_auto=True,
)
save_fig(name=f"4K_p0_modConMev_rep_{sel_exp.type}_{vers}", fig=fig, base_path=figures_path, width=1400, height=600)

fig.show()

In [29]:
# Same gene-representation figure as the previous cell, but measured against the full tumour TPM table.
sel_exp = p0_42.exps["standard_4K_50TF"]
data_df = gh.mev_modcon_genes(sel_exp, ref_tpms=all_tum_tpms, num_genes=100)
fig = px.bar(data_df, x="Comm", y="Num genes", color="Type", title="Tum all. Gene representation in P0 gc42. Exp {}".format(sel_exp.type), text_auto=True)
# NOTE(review): the file name still says "13K" although all_tum_tpms is loaded from the "all genes" file — confirm whether the name should change.
save_fig(name=f"13K_p0_modConMev_rep_{sel_exp.type}_{vers}", fig=fig, base_path=figures_path, width=1400, height=600)

fig.show()

### Mutation

In [41]:
# Disabled cell (`if 0:`): mutation-burden statistics for the genes of the standard 50TF P0 network.
# NOTE(review): uses tf_list, which must be defined by the Init cell before this is re-enabled.
if 0:
    p0_all_stats_df = gh.stats_mut_burden(p0_42.exps["standard_4K_50TF"].tpm_df, tf_list, tcga_mutations_df, type="All")

    p0_all_stats_df["Type"] = "P0"
    fig = gh.plot_mut_rep(p0_all_stats_df, title="P0. The 4K expressed gene used in the network and their mutation burden")
    # fig.show()
    # save_fig(name="MutTF_representation_{}".format("4K-all"), fig=fig, base_path=figures_path, width=1400, height=700)

## Norm3 vs standard (report)

In [40]:
%autoreload 2
# Comparison object for the standard and the norm3 50TF networks of the P0 set.
# Later cells title the "Source" side "Standard" and the "Target" side "Reward".
std_norm3_comp = NetworkComp(p0_42, 4, "standard_4K_50TF", "norm3_4K_50TF")

# Display names for the two experiments in the median gene-expression figures.
map_names = {"standard_4K_50TF": "Standard", "norm3_4K_50TF": "Reward"}
figs = std_norm3_comp.plt_median_ge(map_names=map_names, path=figures_path)

## Community comparison (report)

In [39]:
%autoreload 2
# Always-on guard (`if 1:`); the figures are saved here through save_fig, hence toSave=False on the helpers.
if 1:
    # Community changes
    fig = std_norm3_comp.sankey_plot(toSave=False, path=figures_path)
    fig.update_layout(title="Community comparison. Standard vs Reward")
    save_fig(name=f"Sky_Comm_Comp_4K_{vers}", fig=fig, base_path=figures_path, width=1100, height=700)

    # Mutation distribution for each community
    # ann_chg / ann_rm / ann_add list gene names for the figure annotations
    # (presumably changed / removed / added labels — TODO confirm against NetworkComp.com_mut_distrib).
    # NOTE(review): "FAT2" appears in both ann_chg and ann_add.
    fig = std_norm3_comp.com_mut_distrib(
        include_source=True,
        toSave=False,
        path=figures_path,
        annotations=True,
        ann_chg=[
            "UTRN",
            "CREBBP",
            "FAT2",
        ],
        ann_rm=["DST"],
        ann_add=["MED3", "CDKN2A", "FAT2", "LYST", "STAB1", "MYH10", "COL7A1", "MYO1F", "FGFR3"],
    )
    fig.update_layout(title="Community comparison. Standard vs Reward", xaxis_title="Reward", yaxis_title="Mutations count", legend_title="Standard")
    save_fig(name=f"Mut_Comm_Comp_4K_{vers}", fig=fig, base_path=figures_path, width=1100, height=700)

In [35]:
# Mutation statistics for the "Target" side of the comparison (the Reward network), with end=25.
dmy_df, meta_norm3 = std_norm3_comp.comb_mut_stats(direction="Target", end=25)
fig = std_norm3_comp.plot_mut_evo(dmy_df, direction="Target")
fig.update_layout(title="Reward. Mutation burden across communities.", xaxis_title="Mutation burden", legend_title="Reward")
fig.show()
save_fig(name=f"Mut_evo_Rwd_4k_{vers}", fig=fig, base_path=figures_path, width=900, height=500)
# NetworkComp.plot_corr_matrix_coms(meta_norm3, height=700, title="P0 derived. Corr matrix for Reward", hide_up=True)

In [36]:
# Same figure as the previous cell for the "Source" side of the comparison (the Standard network).
dmy_df, meta_std = std_norm3_comp.comb_mut_stats(direction="Source", end=25)
fig = std_norm3_comp.plot_mut_evo(dmy_df, direction="Source")
fig.update_layout(title="Standard. Mutation burden across communities.", xaxis_title="Mutation burden", legend_title="Standard")
fig.show()
save_fig(name=f"Mut_evo_Std_4k_{vers}", fig=fig, base_path=figures_path, width=900, height=500)
# NetworkComp.plot_corr_matrix_coms(meta_std, height=700, title="P0 derived. Corr matrix for standard", hide_up=True)

In [None]:
# Disabled cell (`if False:`): standard vs penalised comparison.
# NOTE(review): the key "r_beta_4K_50TF" does not match the "beta_4K_50TF" label used elsewhere
# in this notebook — check the key before re-enabling.
if False:
    std_beta_comp = NetworkComp(p0_42, 4, "standard_4K_50TF", "r_beta_4K_50TF")

    comp_dict = std_beta_comp.comp_ge_comm()
    for key, df in comp_dict.items():
        title = "P0. Median values in communities for {}".format(key)
        # NOTE(review): fig_name is built but never used, and each box plot is overwritten without being shown.
        fig_name = "{}_median".format(key)
        fig = px.box(df, x="Comm", y="Median", color="Comm", title=title, points="all")
        # fig.show()

    fig = std_beta_comp.sankey_plot(toSave=False, path=figures_path)
    dmy_df, meta_beta = std_beta_comp.comb_mut_stats(direction="Target", end=40)
    fig = std_beta_comp.plot_mut_evo(dmy_df, direction="Target")
    fig.update_layout(title="Pen. Mutation burden across communities.", xaxis_title="Mutation burden", legend_title="Penalised")
    save_fig(name="Mut_evo_Pen_4k", fig=fig, base_path=figures_path, width=900, height=500)

# Cluster Analysis vs Network All experiments

In [None]:
%%capture
# For every P0 experiment, count the genes in each selection returned by gh.extract_gene_sel.
# The experiment type is "<modifier>_<genes>_<TF>": first token = modifier, last token = TF setting.
exps_gene_stats = []
for exp in p0_42.get_exps():
    gene_stats = gh.extract_gene_sel(exp=exp, ref_ge=all_tum_tpms, num_genes=3500)
    type_parts = exp.type.split("_")
    tf, modifier = type_parts[-1], type_parts[0]
    sel_sizes = tuple(len(gene_stats[sel_name]) for sel_name in ("cmn_sel", "cmn_all", "net_sel_genes", "cs_genes"))
    exps_gene_stats.append((tf, modifier) + sel_sizes)

tst_df = pd.DataFrame(exps_gene_stats, columns=["TF", "Modifier", "cmn_sel", "cmn_all", "all_net", "cs_genes"])

In [None]:
# Number of distinct Leiden communities ("Modularity Class" values) per experiment.
comm_nums = []
for exp in p0_42.get_exps():
    type_parts = exp.type.split("_")
    tf, modifier = type_parts[-1], type_parts[0]
    n_communities = len(exp.leiden_best["Modularity Class"].unique())
    comm_nums.append((tf, modifier, n_communities))


dmy_df = pd.DataFrame(comm_nums, columns=["TF", "Modifier", "Community_size"])
# Numeric TF value ("10TF" -> 10) so the rows sort numerically rather than as strings.
dmy_df["TF_num"] = dmy_df["TF"].str.split("TF", expand=True)[0].astype(int)
dmy_df = dmy_df.sort_values(by="TF_num", ascending=True)
# Community counts multiplied by 10 for plotting.
dmy_df["Community_size_scale"] = dmy_df["Community_size"] * 10

In [None]:
# Grouped bars of the "cmn_sel" gene counts per TF setting, one bar colour per modifier.
# category_orders fixes the x-axis order to 3TF..10TF followed by 50TF.
fig = px.bar(
    tst_df,
    x="TF",
    y="cmn_sel",
    color="Modifier",
    barmode="group",
    title="P0. Network vs Std/Median. Cmn genes",
    category_orders={"TF": ["{}TF".format(i) for i in range(3, 11)] + ["50TF"]},
)
# fig.add_trace(go.Scatter(
#         x=dmy_df['TF'],
#         y=dmy_df['Community_size'],
#         name="Community_size"
#     ))
# fig.show()

In [None]:
# Line figure of the scaled community counts; the unscaled count is used as the marker text.
color_scheme = px.colors.qualitative.G10
fig_comm = px.line(
    dmy_df,
    x="TF",
    y="Community_size_scale",
    color="Modifier",
    # barmode="group",
    markers=True,
    # color_discrete_sequence=px.colors.qualitative.G10,
    text="Community_size",
    color_discrete_sequence=[color_scheme[0], color_scheme[1], color_scheme[3]],
    title="P0. Network vs Std/Median. Community size genes",
    category_orders={"TF": ["{}TF".format(i) for i in range(3, 11)] + ["50TF"]},
)
# NOTE(review): this textposition is overridden by the "top center" set on the next line.
fig_comm.update_traces(textposition="top right")
fig_comm.for_each_trace(lambda t: t.update(textfont_color="white", textposition="top center"))
# fig_comm.show()

# Overlay the line traces on `fig`, the bar figure built in the previous cell.
for trace in range(len(fig_comm["data"])):
    fig.add_trace(fig_comm["data"][trace])

fig = fig.update_layout(height=700)

# Tools

## Explore relationship of TF with modCon score 

**Hypothesis**: When TF is set to a large number, there will be more TF genes in the modCon compared with when TF = 10.

In [None]:
# compute modConRank
# Write a "ModCon_Rank" column into each experiment's nodes_df, community by community.
for exp in p0_42.get_exps():
    # exp.nodes_df["ModCon_Rank"] = 0
    for modCon, value in exp.modCons.items():
        # Top 100 rows of this community by ModCon score, ranked from 1 (highest score).
        dmy = value.sort_values(by=["ModCon_{}".format(exp.type)], ascending=False).reset_index(names="Id").iloc[:100]
        dmy["Rank"] = dmy.index + 1
        dmy.set_index("Id", inplace=True)
        # The assignment aligns on index labels: nodes of this community that are not among the ranked rows receive NaN.
        exp.nodes_df.loc[exp.nodes_df["Modularity Class"] == modCon, "ModCon_Rank"] = dmy["Rank"]
        # Unranked nodes end up with rank 0.
        exp.nodes_df["ModCon_Rank"] = exp.nodes_df["ModCon_Rank"].fillna(0)

In [None]:
# Comparison object for the standard 50TF and standard 10TF networks of the P0 set.
std_TF_comp = NetworkComp(p0_42, 4, "standard_4K_50TF", "standard_4K_10TF")

# Align the modCon community numbers of the two experiments:
# for every source community take the target of its first row in comp_df and map target -> source.
remap_cols = {}
for src_com in std_TF_comp.comp_df["Source"].unique():
    trgt_com = std_TF_comp.comp_df[std_TF_comp.comp_df["Source"] == src_com]["Target"].values[0]
    remap_cols[trgt_com] = src_com

In [None]:
# rename exp 2 modCons
# Re-key the target experiment's modCons by the matching source community.
# NOTE(review): r_modCon is not used again in this cell.
r_modCon = {}
for key, value in std_TF_comp.target_exp.modCons.items():
    if str(key) in remap_cols.keys():
        r_modCon[remap_cols[str(key)]] = value

# Prepare figures
exp_1 = std_TF_comp.source_exp
exp_2 = std_TF_comp.target_exp

# One scatter per matched community pair: source ModCon score on x, target ModCon score on y.
figs, titles = [], []
for key, val in remap_cols.items():
    # key is the target community, val the source community it is mapped to.
    modCon_1, modCon_2 = exp_1.modCons[int(val)], exp_2.modCons[int(key)]
    # NOTE(review): this appears to add an "exp" column to the frames stored on the experiments themselves — confirm that is intended.
    modCon_1["exp"], modCon_2["exp"] = exp_1.type, exp_2.type
    col_1, col_2 = "ModCon_{}".format(exp_1.type), "ModCon_{}".format(exp_2.type)
    # Rows present in only one of the two tables get -100 as a placeholder score.
    merged_df = pd.concat([modCon_1[col_1], modCon_2[col_2]], axis=1).fillna(-100)

    # Flag the rows whose index label is in the TF list.
    merged_df["TF"] = "No TF"
    merged_df.loc[merged_df.index.isin(tf_list), "TF"] = "TF"

    titles.append("{} vs {}.".format(key, val))
    fig = px.scatter(merged_df.reset_index(names="gene"), x=col_1, y=col_2, color="TF", hover_data=["gene"])
    figs.append(fig)

In [None]:
# Arrange the per-community scatter figures of the previous cell in a grid, three per row.
num_cols = 3
num_rows = int(np.ceil(len(remap_cols.keys()) / num_cols))

subplot = make_subplots(rows=num_rows, cols=num_cols, subplot_titles=titles, shared_xaxes=False, horizontal_spacing=0.05, vertical_spacing=0.07)

for i, fig in enumerate(figs):
    # Zero-based grid position of figure i; plotly rows and columns are one-based.
    grid_row, grid_col = divmod(i, num_cols)
    for trace in fig["data"]:
        # add_trace replaces the deprecated append_trace.
        subplot.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

src_type, trgt_type = std_TF_comp.source_exp.type, std_TF_comp.target_exp.type

# The subplot titles read "<target community> vs <source community>", so the main title names
# the target experiment first. The two experiment types were previously filled in the wrong way round.
layout = go.Layout(title_text="ModCon top 100, community comparison. Target {} vs Source {}".format(trgt_type, src_type))
subplot.update_layout(coloraxis_autocolorscale=False, coloraxis_colorscale=px.colors.sequential.Sunsetdark)

subplot = subplot.update_layout(layout, height=700)
# The scatters plot the source experiment's ModCon on x and the target's on y; the axis titles take
# the TF token of each experiment type instead of hard-coded (and previously swapped) labels.
subplot = subplot.update_xaxes(title_text="ModCon {}".format(src_type.split("_")[-1]))
subplot = subplot.update_yaxes(title_text="ModCon {}".format(trgt_type.split("_")[-1]))

# subplot
# save_fig(name="TF_comp", fig=subplot, base_path=figures_path, width=1920, height=1080)

# IVI comparison

In [None]:
# Default pair for the IVI comparison: P0 standard 10TF against tumour standard 50TF.
# The cells below reassign exp_1/exp_2 before calling ivi_comp.
exp_1 = p0_42.exps["standard_4K_10TF"]
# exp_2 = p0_42.exps["norm3_4K"]
exp_2 = tum.exps["standard_4K_50TF"]


def ivi_comp(exp_1, exp_2, tf_list, mutations_df=None):
    """Build a per-gene table comparing the IVI values of two experiments.

    Args:
        exp_1: first experiment; its ``name``, ``type`` and ``nodes_df["IVI"]`` are read.
        exp_2: second experiment, read the same way.
        tf_list: iterable of gene names to flag as transcription factors.
        mutations_df: frame indexed by gene with a "count" column. Defaults to the
            notebook-level ``tcga_mutations_df``, so existing calls keep working.

    Returns:
        Tuple ``(df, ivi_col_1, ivi_col_2)``. ``df`` has one row per gene found in
        either experiment, with columns "gene", the two IVI columns (-10 where the
        gene is missing from that experiment), "mut_count" (0 where the gene has no
        entry in ``mutations_df``) and "TF" ("TF" / "No TF").
    """
    if mutations_df is None:
        # Fall back to the notebook-level mutation table loaded in the Init cell.
        mutations_df = tcga_mutations_df

    # The third-from-last token of the experiment name is used as the tissue label.
    tissue_type_1, tissue_type_2 = exp_1.name.split("_")[-3], exp_2.name.split("_")[-3]
    ivi_col_1 = "IVI_{}_{}".format(tissue_type_1, exp_1.type)
    ivi_col_2 = "IVI_{}_{}".format(tissue_type_2, exp_2.type)

    # Series.rename returns a new object, so the experiments' nodes_df are left untouched
    # without deep-copying the whole frames.
    ivi_1 = exp_1.nodes_df["IVI"].rename(ivi_col_1)
    ivi_2 = exp_2.nodes_df["IVI"].rename(ivi_col_2)

    # Outer join on the gene index: genes present in only one network get NaN in the other column.
    dmy_df = pd.concat([ivi_1, ivi_2], axis=1)
    dmy_df["mut_count"] = mutations_df["count"]
    dmy_df["mut_count"] = dmy_df["mut_count"].fillna(0)
    dmy_df["TF"] = "No TF"
    dmy_df.loc[dmy_df.index.isin(list(tf_list)), "TF"] = "TF"

    # -10 is the placeholder for an IVI value missing from one of the networks.
    dmy_df = dmy_df.fillna(-10)
    dmy_df = dmy_df.reset_index(names="gene")
    return dmy_df, ivi_col_1, ivi_col_2

## 50 TF vs 10 TF

The plot below clearly shows the bias of the algorithm when 50 edges are used for TFs.

In [None]:
# IVI of the P0 standard 10TF network (x) against the P0 standard 50TF network (y).
exp_1, exp_2 = p0_42.exps["standard_4K_10TF"], p0_42.exps["standard_4K_50TF"]
dmy_df, ivi_1, ivi_2 = ivi_comp(exp_1, exp_2, tf_list)

# Marker size follows the mutation count and colour separates TFs from other genes.
# The figure is the last expression of the cell, so the notebook displays it.
px.scatter(
    dmy_df,
    x=ivi_1,
    y=ivi_2,
    hover_data=dmy_df.columns,
    color="TF",
    size="mut_count",
    size_max=40,
    title="IVI comparison {} - {}".format(exp_1.name, exp_2.name),
    height=600,
)

## P0_10TF vs Tum

In [None]:
# IVI table for the P0 standard 10TF network against the tumour standard 50TF network.
exp_1, exp_2 = p0_42.exps["standard_4K_10TF"], tum.exps["standard_4K_50TF"]
dmy_df, ivi_1, ivi_2 = ivi_comp(exp_1, exp_2, tf_list)

# The scatter is disabled (`if 0:`); only the table above is computed.
if 0:
    px.scatter(
        dmy_df,
        x=ivi_1,
        y=ivi_2,
        hover_data=dmy_df.columns,
        color="TF",
        size="mut_count",
        size_max=40,
        title="IVI comparison {} - {}".format(exp_1.name, exp_2.name),
        height=600,
    )

## P0_50TF vs TUM

In [None]:
# IVI table for the P0 standard 50TF network against the tumour standard 50TF network.
exp_1, exp_2 = p0_42.exps["standard_4K_50TF"], tum.exps["standard_4K_50TF"]
dmy_df, ivi_1, ivi_2 = ivi_comp(exp_1, exp_2, tf_list)

# The scatter is disabled (`if 0:`); only the table above is computed.
if 0:
    px.scatter(
        dmy_df,
        x=ivi_1,
        y=ivi_2,
        hover_data=dmy_df.columns,
        color="TF",
        size="mut_count",
        size_max=40,
        title="IVI comparison {} - {}".format(exp_1.name, exp_2.name),
        height=600,
    )