# Motivation 

The purpose of this notebook is to analyse the test output.


# Installation requirements

* Set up a conda environment with Python >=3.11
    ```script
        conda create --name iCoExpMut_env python=3.11
        conda activate iCoExpMut_env
    ```
* Install graph-tool and other packages via conda-forge
    ```script
        conda install conda-forge::graph-tool
        conda install conda-forge::multiprocess
    ```
* Install iCoExpNet via pip
    ```script
        pip install icoexpnet
    ```


In [64]:
%load_ext autoreload
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import numpy as npw
import os
import sys

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import multiprocess as mp


# own libraries
from icoexpnet.analysis.ExperimentSet import ExperimentSet
from icoexpnet.analysis.GraphToolExp import GraphToolExperiment as GtExp
from icoexpnet.analysis import GraphHelper as gh
from icoexpnet.analysis.utilities import clustering as cs
from icoexpnet.analysis.utilities import sankey_consensus_plot as sky
from icoexpnet.analysis.utilities.helpers import save_fig, survival_plot

# Use the "ggplot2" template as the default for every Plotly figure in this notebook
pio.templates.default = "ggplot2"

# Process pool sized to the CPU count; it is used by the pool.map calls in the ModCon cells below.
# NOTE(review): the pool is never closed in the visible cells — consider pool.close()/pool.join() once the parallel work is done.
pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [65]:
# Locations of the inputs/outputs, relative to this notebook
results_path = "../../../results/"
data_base = "../../../data/"
base_path = "../../../"
test_exps_path = "results/test/"
test_cltrs_path = "results/testCtrl/"

figures_path = "test_results/"

# Mutation table, indexed by the "gene" column
mut_df = pd.read_csv(f"{data_base}/test_mutation_data.tsv",
                     sep="\t", index_col="gene")

# tf list
tf_path = f"{data_base}/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t",
                            skip_header=1, dtype="str")
else:
    # Keep `tf_list` defined: without this branch a missing file only shows up
    # much later as a NameError in the cells that intersect genes with the TFs.
    warnings.warn(f"TF list not found at {tf_path}; continuing with an empty TF list")
    tf_list = np.array([], dtype="str")

# Load experiments

In [66]:
# Build the experiment set for the "test" run from the folder given by base_path + test_exps_path
exp_test = ExperimentSet("test", base_path=base_path, exp_path=test_exps_path,
                         mut_df=mut_df, sel_sets=None, exp_type="iNet")

exps = {}
# Compute some useful metadata for the experiments
for idx, exp in enumerate(exp_test.get_exps()):
    # Only experiments whose sbm_method is "hsbm" are converted; all others are skipped
    if exp.sbm_method != "hsbm":
        continue

    print(f"Loading Graph-Tool for {exp.type}")
    # Wrap the experiment in a Graph-Tool experiment object, keyed by its type label
    exps[exp.type] = GtExp.from_pgcna_inet(exp, rel_path="")
    exps[exp.type].export_to_gephi(save=False)

# Replace the set's experiments with the Graph-Tool versions built above
exp_test.exps = exps

##### Experiment labels:  dict_keys(['standard_5K_4TF_hsbm', 'standard_5K_3TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_9TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_8TF_hsbm', 'standard_5K_6TF_hsbm'])
Loading Graph-Tool for standard_5K_4TF_hsbm
Loading Graph-Tool for standard_5K_3TF_hsbm
Loading Graph-Tool for standard_5K_7TF_hsbm
Loading Graph-Tool for standard_5K_9TF_hsbm
Loading Graph-Tool for standard_5K_5TF_hsbm
Loading Graph-Tool for standard_5K_8TF_hsbm
Loading Graph-Tool for standard_5K_6TF_hsbm


In [67]:
# This gets the list of folders of the control experiments.
# The fallback's second item has to be a list: it is what gets iterated below when
# the control directory does not exist (a None there raises TypeError in the loop).
folders = next(os.walk(base_path + test_cltrs_path), (None, [], []))[1]

# Create a dictionary of the control experiments where each key is the index of the control
test_ctrls = {}
for folder in folders:
    suffix = folder.split("tctrl_")[-1]
    if not suffix.isdigit():
        # Not a "tctrl_<n>" folder (e.g. a hidden or checkpoint directory): skip it
        # instead of letting int() abort the whole cell.
        continue
    hCtrl_path = f"{test_cltrs_path}/{folder}/"
    idx = int(suffix)
    test_ctrls[idx] = ExperimentSet(
        "tCtrl", base_path, hCtrl_path, mut_df, sel_sets=None, rel_path="../", exp_type="iNet")
    test_ctrls[idx].export_to_gephi(save=False)

##### Experiment labels:  dict_keys(['standard_5K_3TF_hsbm', 'standard_5K_4TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_6TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_8TF_hsbm', 'standard_5K_9TF_hsbm'])
##### Experiment labels:  dict_keys(['standard_5K_3TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_6TF_hsbm', 'standard_5K_4TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_8TF_hsbm', 'standard_5K_9TF_hsbm'])
##### Experiment labels:  dict_keys(['standard_5K_3TF_hsbm', 'standard_5K_4TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_6TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_8TF_hsbm', 'standard_5K_9TF_hsbm'])
##### Experiment labels:  dict_keys(['standard_5K_3TF_hsbm', 'standard_5K_4TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_9TF_hsbm', 'standard_5K_6TF_hsbm', 'standard_5K_8TF_hsbm'])
##### Experiment labels:  dict_keys(['standard_5K_4TF_hsbm', 'standard_5K_3TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_9TF_hsbm', 'standard_5K_6TF_hsbm', 'standard_5K_7TF_hs

# Selective edge pruning


Note:
- This example only uses the hierarchical SBM; for the non-hierarchical SBM there is a separate method for loading the objects.

In [68]:
# Load the hsbm experiments with the real biological TFs.
# load_hsbm_exps returns two values: the loaded experiments and an entropy table.
h_exps, h_entropy = GtExp.load_hsbm_exps(exp_test)
# Tag the rows so they can be told apart from the control rows once the tables are combined
h_entropy["Type"] = "Experiment"

Loading Graph-Tool for standard_5K_4TF_hsbm
Loading Graph-Tool for standard_5K_3TF_hsbm
Loading Graph-Tool for standard_5K_7TF_hsbm
Loading Graph-Tool for standard_5K_9TF_hsbm
Loading Graph-Tool for standard_5K_5TF_hsbm
Loading Graph-Tool for standard_5K_8TF_hsbm
Loading Graph-Tool for standard_5K_6TF_hsbm


In [69]:
# Load the hCtrl experiments with control TFs
ctrl_exps, ctrl_entropies = {}, []

# Iterate over the control experiments in index order. Walking the actual keys
# (rather than assuming they are exactly 1..N) survives a missing control folder.
for key in sorted(test_ctrls):
    print(f"-->Loading control experiment #{key}")
    exps, entropy = GtExp.load_hsbm_exps(test_ctrls[key])
    entropy["Type"] = "hCtrl{}".format(key)
    ctrl_entropies.append(entropy)
    ctrl_exps[key] = {"entropy": entropy, "exps": exps}

# Concatenate once after the loop instead of re-copying a growing frame per iteration
cmb_df = pd.concat(ctrl_entropies, axis=0) if ctrl_entropies else pd.DataFrame()

-->Loading control experiment #1
Loading Graph-Tool for standard_5K_4TF_hsbm
Loading Graph-Tool for standard_5K_3TF_hsbm
Loading Graph-Tool for standard_5K_5TF_hsbm
Loading Graph-Tool for standard_5K_9TF_hsbm
Loading Graph-Tool for standard_5K_6TF_hsbm
Loading Graph-Tool for standard_5K_7TF_hsbm
Loading Graph-Tool for standard_5K_8TF_hsbm
-->Loading control experiment #2
Loading Graph-Tool for standard_5K_3TF_hsbm
Loading Graph-Tool for standard_5K_4TF_hsbm
Loading Graph-Tool for standard_5K_5TF_hsbm
Loading Graph-Tool for standard_5K_6TF_hsbm
Loading Graph-Tool for standard_5K_7TF_hsbm
Loading Graph-Tool for standard_5K_8TF_hsbm
Loading Graph-Tool for standard_5K_9TF_hsbm
-->Loading control experiment #3
Loading Graph-Tool for standard_5K_3TF_hsbm
Loading Graph-Tool for standard_5K_5TF_hsbm
Loading Graph-Tool for standard_5K_6TF_hsbm
Loading Graph-Tool for standard_5K_4TF_hsbm
Loading Graph-Tool for standard_5K_7TF_hsbm
Loading Graph-Tool for standard_5K_8TF_hsbm
Loading Graph-Tool fo

# Analyse the entropy of the controls and experiments

In [70]:
# Combine the entropy of the control networks and the real experiments.
# Experiment rows left over from a previous run of this cell are dropped first,
# so re-running the cell does not append the experiment entropies twice.
if "Type" in cmb_df.columns:
    cmb_df = cmb_df[cmb_df["Type"] != "Experiment"]
cmb_df = pd.concat([cmb_df, h_entropy], axis=0)

# Min-max scaled and log10-transformed versions of the entropy
cmb_df["Entropy_norm"] = (cmb_df["Entropy"] - cmb_df["Entropy"].min()) / \
    (cmb_df["Entropy"].max() - cmb_df["Entropy"].min())
cmb_df["Entropy_log10"] = np.log10(cmb_df["Entropy"])

## Plot the Entropy Evolution across the networks

We look at how community detection performs across the different networks generated as the minimum degree is increased.

In [71]:
# Find the network with top model
# For every (Type, TF) pair keep the single row that comes first when sorting by "Entropy" in descending order.
# NOTE(review): descending order keeps the HIGHEST entropy per pair — confirm that this is the intended "top" model.
top_exp, color_map = [], {}
for exp in cmb_df["Type"].unique():
    sel_df = cmb_df[cmb_df["Type"] == exp]
    for tf in sel_df["TF"].unique():
        # .values[0] turns the first sorted row into a plain array, so the column names are lost here
        tst_df = sel_df[sel_df["TF"] == tf].sort_values(
            by="Entropy", ascending=False).values[0]
        top_exp.append(tst_df)

    # NOTE(review): this color_map is replaced by a new dict in the next cell before any figure uses it
    color_map[exp] = px.colors.qualitative.Plotly[0]
    if exp == "Experiment":
        color_map[exp] = px.colors.qualitative.Plotly[2]

# Column names are re-applied by position.
# NOTE(review): this assumes cmb_df's columns are ordered Entropy, TF, Type, Entropy_norm, Entropy_log10
# (the last one is labelled "Entropy_log" here) — confirm against load_hsbm_exps.
top_exp = pd.DataFrame(
    top_exp, columns=["Entropy", "TF", "Type", "Entropy_norm", "Entropy_log"])

# Compute stats
# Per TF: median/mean/std of the control entropies, plus the experiment's single value with a std of 0
stats_vals = []
for tf in top_exp["TF"].unique():
    sel_df = top_exp[top_exp["TF"] == tf]
    sel_df_ctrl = sel_df[sel_df["Type"].str.contains("hCtrl")]["Entropy"]
    # First non-control row for this TF; raises IndexError if the TF has no such row
    sel_df_exp = sel_df[~sel_df["Type"].str.contains(
        "hCtrl")]["Entropy"].values[0]
    stats_vals.append((sel_df_ctrl.median(), sel_df_ctrl.mean(),
                      sel_df_ctrl.std(), tf, "Control"))
    stats_vals.append((sel_df_exp, sel_df_exp, 0, tf, "Experiment"))

In [72]:
# Tabulate the per-TF entropy statistics collected in `stats_vals`
top_df = pd.DataFrame(
    stats_vals,
    columns=["Median_Ent", "Mean_Ent", "Std_Ent", "TF", "Type"],
)

# One colour per group, taken from Plotly's G10 palette
color_map = {
    "Experiment": px.colors.qualitative.G10[0],
    "Control": px.colors.qualitative.G10[1],
}

# Mean entropy per TF setting, with the standard deviation as error bars
fig = px.scatter(
    top_df,
    x="TF",
    y="Mean_Ent",
    color="Type",
    error_y="Std_Ent",
    color_discrete_map=color_map,
    height=600,
    title="Entropy evolution TFs 3-15. Standard 5K",
)

# Layout: horizontal legend inside the plot area, blank title, fixed x ticks
fig.update_layout(
    title="",
    font={"size": 16},
    legend={
        "title": "SBM",
        "orientation": "h",
        "xanchor": "center",
        "x": 0.5,
        "yanchor": "bottom",
        "y": 0.87,
        "bgcolor": "rgba(0,0,0,0)",
        "font": {"size": 16, "color": "#003366"},
    },
    xaxis={
        "title": "Minimum degree for selected genes",
        "tickfont": {"size": 16},
        "tickmode": "array",
        "tickvals": list(range(3, 16)),
    },
    yaxis={
        "title": "Entropy",
        "tickfont": {"size": 16},
    },
)
# Styling applied to the trace named "Control" only
fig.update_traces(
    selector={"name": "Control"},
    marker_size=8,
    error_y={"width": 8},
)
fig.show()

# Leiden and SBM comparison

In [73]:
def prep_leiden(exp: "ExperimentSet", label="Experiment"):
    """Return the "Standard" Leiden scores of an experiment set, tagged with a label.

    Args:
        exp: object whose ``comb_leiden_scores()`` returns a DataFrame that
            contains at least the columns selected below.
        label: value written to the ``Type`` column of every returned row.

    Returns:
        A new DataFrame holding only the rows whose ``Modifier`` is "Standard",
        restricted to the columns in ``sel_cols``.
    """
    leid_stats = exp.comb_leiden_scores()
    # .copy() makes the filtered rows an independent frame, so adding "Type"
    # neither triggers SettingWithCopyWarning nor touches the frame owned by `exp`.
    leid_stats = leid_stats.loc[leid_stats["Modifier"] == "Standard"].copy()
    leid_stats["Type"] = label

    sel_cols = ["Leiden Rank", "ModularityScore", "ModuleNum",
                "AvgModSize", "Modifier", "Type", "TF", "AvgModuleNum"]

    return leid_stats[sel_cols]


def tf_stats(ctrls_df: pd.DataFrame, exp_df: pd.DataFrame, max_tf: int = 15):
    """Attach per-TF summary statistics to control and experiment Leiden scores.

    For every TF value found in ``ctrls_df`` the median, mean and standard
    deviation of ``ModularityScore``, ``AvgModuleNum`` and ``AvgModSize`` are
    computed separately for the control rows and the experiment rows, and
    stored in ``Mod_*``, ``ModNum_*`` and ``ModSize_*`` columns.

    Args:
        ctrls_df: control scores; must contain ``TF`` and the three fields above.
        exp_df: experiment scores with the same columns.
        max_tf: rows whose TF (as an integer) exceeds this value are dropped.

    Returns:
        A new DataFrame with the control rows (``Type`` set to "Control") and the
        experiment rows stacked, sorted by numeric TF, with ``TF`` as a string.
        Experiment rows whose TF does not occur in ``ctrls_df`` keep NaN stats.
    """
    # Work on copies: callers pass filtered slices, and writing the new columns
    # (and overwriting "Type") in place would alter or warn about their data.
    ctrls_df = ctrls_df.copy()
    exp_df = exp_df.copy()

    r_field = {"ModularityScore": "Mod",
               "AvgModuleNum": "ModNum", "AvgModSize": "ModSize"}

    for tf in ctrls_df["TF"].unique():
        ctrl_rows = ctrls_df["TF"] == tf
        exp_rows = exp_df["TF"] == tf

        for field, short in r_field.items():
            ctrl_vals = ctrls_df.loc[ctrl_rows, field]
            exp_vals = exp_df.loc[exp_rows, field]
            for stat in ("median", "mean", "std"):
                col = "{}_{}".format(short, stat)
                ctrls_df.loc[ctrl_rows, col] = getattr(ctrl_vals, stat)()
                exp_df.loc[exp_rows, col] = getattr(exp_vals, stat)()

    ctrls_df["Type"] = "Control"
    comb_leiden = pd.concat([ctrls_df, exp_df], axis=0)
    # Sort numerically, then hand TF back as a string
    comb_leiden["TF"] = comb_leiden["TF"].astype(int)
    comb_leiden = comb_leiden[comb_leiden["TF"] <= max_tf].sort_values(
        by="TF", ascending=True)
    comb_leiden["TF"] = comb_leiden["TF"].astype(str)

    return comb_leiden

In [74]:
leid_h = prep_leiden(exp_test, label="Experiment")

# Walk the actual control indices. The previous range(1, len(test_ctrls)) stopped
# one short and silently left the last control out of the Leiden comparison.
leid_ctrls = [
    prep_leiden(test_ctrls[idx], label="hCtrl{}".format(idx))
    for idx in sorted(test_ctrls)
]

all_leid_ctrls = pd.concat(leid_ctrls, axis=0)
# Independent copy of the best-ranked runs, so later column assignments do not
# operate on a slice of all_leid_ctrls.
top_leid_ctrls = all_leid_ctrls[all_leid_ctrls["Leiden Rank"] == 0].copy()

## Compare the metrics between the two community detection methods

In [75]:
# Per-TF statistics for the rank-0 Leiden runs of the controls and the experiment
top_comb_leiden = tf_stats(top_leid_ctrls, leid_h[leid_h["Leiden Rank"] == 0])

# Mean modularity per TF setting, with the standard deviation as error bars
fig = px.line(
    top_comb_leiden,
    x="TF",
    y="Mod_mean",
    color="Type",
    error_y="Mod_std",
    markers=True,
    color_discrete_map=color_map,
    height=600,
    title="Modularity scores for the best top Leiden Run.",
)
fig.update_layout(
    title="",
    font={"size": 16},
    legend={
        "title": "Leiden",
        "orientation": "h",
        "xanchor": "center",
        "x": 0.5,
        "yanchor": "bottom",
        "y": 0.87,
        "bgcolor": "rgba(0,0,0,0)",
        "font": {"size": 16, "color": "#003366"},
    },
    xaxis={
        "title": "Minimum degree for selected genes",
        "tickfont": {"size": 16},
    },
    yaxis={
        "title": "Modularity Score",
        "tickfont": {"size": 16},
    },
)
fig.update_traces(error_y={"width": 5}, marker_size=12, line_width=5)

fig.show()

## Comparing the community sizes and the degree of TFs

### Leiden

In [None]:
# Leiden: mean number of communities per TF setting, with the standard deviation as error bars
leid_fig = px.line(
    top_comb_leiden,
    x="TF",
    y="ModNum_mean",
    color="Type",
    error_y="ModNum_std",
    markers=True,
    color_discrete_map=color_map,
    height=600,
    title="Community sizes for the best top Leiden Run.",
)
# No legend overrides here: the legend keeps Plotly's defaults
leid_fig.update_layout(
    title="",
    font={"size": 16},
    xaxis={
        "title": "Minimum degree for selected genes",
        "tickfont": {"size": 16},
    },
    yaxis={
        "title": "Number of communities",
        "tickfont": {"size": 16},
        "tickmode": "linear",
    },
)
leid_fig.update_traces(error_y={"width": 5}, marker_size=12, line_width=5)
leid_fig.show()

### hSBM

In [77]:
# Count the distinct "max_b" values returned by hsbm_get_gt_df, for every
# experiment and for the matching network of each control
ctrl_sizes, h_sizes = [], []
for key, val in h_exps.items():
    results_df, _ = val.hsbm_get_gt_df()
    tf = val.extract_tf_number(val.name)
    n_exp_blocks = len(results_df["max_b"].unique())
    h_sizes.append((tf, "Experiment", n_exp_blocks))

    for idx in range(1, len(test_ctrls) + 1):
        ctrl_exp = ctrl_exps[idx]["exps"][key]
        control_res, _ = ctrl_exp.hsbm_get_gt_df()
        tf = ctrl_exp.extract_tf_number(key)
        n_ctrl_blocks = len(control_res["max_b"].unique())
        ctrl_sizes.append((tf, "hCtrl{}".format(idx), n_ctrl_blocks))

# Turn both record lists into DataFrames; every control row is relabelled "Control"
h_sizes = pd.DataFrame(h_sizes, columns=["TF", "Type", "Com_size"])
ctrl_sizes = pd.DataFrame(ctrl_sizes, columns=["TF", "Type", "Com_size"])
ctrl_sizes["Type"] = "Control"

In [88]:
# Per TF: median/mean/std of the community counts, written back onto the rows of that TF
for tf in ctrl_sizes["TF"].unique():
    in_ctrl = ctrl_sizes["TF"] == tf
    in_exp = h_sizes["TF"] == tf
    ctrl_com = ctrl_sizes.loc[in_ctrl, "Com_size"]
    exp_com = h_sizes.loc[in_exp, "Com_size"]

    ctrl_sizes.loc[in_ctrl, "Size_median"] = ctrl_com.median()
    ctrl_sizes.loc[in_ctrl, "Size_mean"] = ctrl_com.mean()
    ctrl_sizes.loc[in_ctrl, "Size_std"] = ctrl_com.std()

    h_sizes.loc[in_exp, "Size_median"] = exp_com.median()
    h_sizes.loc[in_exp, "Size_mean"] = exp_com.mean()
    # The experiment's spread is fixed at 0 rather than computed
    h_sizes.loc[in_exp, "Size_std"] = 0

# Controls first, experiments second
comb_df = pd.concat([ctrl_sizes, h_sizes], axis=0)

In [89]:
# SBM: mean number of communities per TF setting, with the standard deviation as error bars
sbm_fig = px.line(
    comb_df,
    x="TF",
    y="Size_mean",
    color="Type",
    error_y="Size_std",
    markers=True,
    color_discrete_map=color_map,
    height=600,
    title="SBM. Community sizes",
)
sbm_fig.update_layout(
    title="",
    font={"size": 16},
    legend={
        "title": "Type",
        "orientation": "h",
        "xanchor": "center",
        "x": 0.15,
        "yanchor": "bottom",
        "y": 0.8,
        "bgcolor": "rgba(0,0,0,0)",
        "font": {"size": 16, "color": "#003366"},
    },
    xaxis={
        "title": "Minimum degree for selected genes",
        "tickfont": {"size": 16},
    },
    yaxis={
        "title": "Number of communities",
        "tickfont": {"size": 16},
        "tickmode": "linear",
    },
)
sbm_fig.update_traces(error_y={"width": 5}, marker_size=12, line_width=5)
sbm_fig.show()

## Combine the community sizes for Leiden and hSBM

In [90]:
# Overlay the Leiden traces on the SBM figure.
# NOTE(review): add_traces appends to the existing sbm_fig, so running this cell a second time
# without re-running the SBM figure cell adds the Leiden traces again — confirm run order or rebuild sbm_fig first.
sbm_fig = sbm_fig.add_traces(leid_fig.data)
sbm_fig = sbm_fig.update_layout(
    legend=dict(
        orientation="h",
        title="Type",
        yanchor="bottom",
        y=0.4,
        xanchor="center",
        x=0.5,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=16, color="#003366"),
    ),
    xaxis=dict(
        tickfont=dict(size=16),
        title="Minimum degree for selected genes",
    ),
    title="",
    yaxis=dict(
        tickmode="linear",
        tick0=10,
        dtick=2,
    ),
    showlegend=True,
    font=dict(size=16),
)
sbm_fig = sbm_fig.update_traces(
    line_width=5, marker_size=12, error_y=dict(width=5))
# Both source figures use the same trace names, so only the first trace of each
# name keeps its legend entry
legend_shown = {}
for trace in sbm_fig.data:
    if trace.name in legend_shown:
        trace.showlegend = False
    else:
        trace.showlegend = True
        legend_shown[trace.name] = True

# Text labels marking which group of lines belongs to which method.
# NOTE(review): the x/y positions are hard-coded — presumably tuned to this data set; verify after re-running.
sbm_fig.add_annotation(x=2.5, y=35, text="SBM",
                       showarrow=False, font=dict(size=16, color="#003366"))
sbm_fig.add_annotation(x=2.5, y=20, text="Leiden",
                       showarrow=False, font=dict(size=16, color="#003366"))

sbm_fig.show()

# Compute ModCon and MEV

This part is needed to extract the relevant genes across the communities

In [91]:
# Define a worker function for parallel processing
def worker(arg):
    """Invoke a no-argument method on an object and hand the object back.

    Args:
        arg: sequence whose first two items are the target object and the
            name of the method to call on it; further items are ignored.

    Returns:
        The target object. The method's own return value is discarded.
    """
    target, method_name = arg[0:2]
    bound_method = getattr(target, method_name)
    bound_method()
    return target

In [92]:

# Call get_ModCon on every experiment through the pool; worker returns the object it was given
results = pool.map(worker, ((exp, "get_ModCon") for exp in h_exps.values()))
# Re-key the experiments by the value extract_tf_number gives for their name.
# NOTE(review): from here on h_exps is no longer keyed as load_hsbm_exps returned it — earlier cells
# that look experiments up by the old keys may not work if re-run after this one; confirm run order.
h_exps = {exp.extract_tf_number(exp.name): exp for exp in results}

# Store the first value returned by get_mevs on each experiment as `mevsMut`
for key, exp in h_exps.items():
    sort_col = "ModCon_{}_gt".format(exp.type)
    # NOTE(review): the mutation table is passed as `tpms` — confirm that get_mevs expects it
    exp.mevsMut, _ = exp.get_mevs(
        tpms=mut_df, modCon=exp.gt_modCon, sort_col=sort_col, num_genes=100, verbose=False)

In [93]:
# Compute the ModCon and MEV for controls
# ModCon: run get_ModCon through the pool, then re-key each control's experiments
# by the value extract_tf_number gives for their name
for key in ctrl_exps.keys():
    print(f"### ModCon for control #{key}")
    results = pool.map(worker, ((exp, "get_ModCon")
                       for exp in ctrl_exps[key]["exps"].values()))
    ctrl_exps[key]["exps"] = {exp.extract_tf_number(
        exp.name): exp for exp in results}

# MEV
for key in ctrl_exps.keys():
    print(f"### MEV for control #{key}")
    # Iterate the values only: the inner loop used to rebind `key`, shadowing the
    # control index of the outer loop.
    for exp in ctrl_exps[key]["exps"].values():
        sort_col = f"ModCon_{exp.type}_gt"
        exp.mevsMut, _ = exp.get_mevs(
            tpms=mut_df, modCon=exp.gt_modCon, sort_col=sort_col, num_genes=100, verbose=False)

### ModCon for control #1
### ModCon for control #2
### ModCon for control #3
### ModCon for control #4
### ModCon for control #5
### MEV for control #1
### MEV for control #2
### MEV for control #3
### MEV for control #4
### MEV for control #5


# Extract the emerging TFs from the controls 

This is the step from which we extracted a subset of 98 TFs in the original selective edge pruning paper.

In [94]:
def gene_sel_by_mev(exp, mut_df: pd.DataFrame, exp_label="", show_figs=True, tf_list=None,
                    gene_steps=range(25, 201, 25)):
    """Summarise the mutated genes and TFs captured by the top-ModCon genes.

    For every cut-off ``n`` in ``gene_steps`` the ``n`` highest-ranked genes of
    each entry of ``exp.gt_modCon`` (ranked by the ``ModCon_<exp.type>_gt``
    column, descending) are pooled and compared with ``mut_df`` and ``tf_list``.

    Args:
        exp: object exposing ``gt_modCon`` (a mapping whose values are
            gene-indexed DataFrames) and ``type``.
        mut_df: gene-indexed table with a ``count`` column.
        exp_label: prefix used in the figure titles.
        show_figs: when True, draw two grouped bar charts (percentages, counts).
        tf_list: iterable of TF gene names; ``None`` means no TFs.
        gene_steps: the cut-offs to evaluate; defaults to 25, 50, ..., 200.

    Returns:
        ``(genes_stats, prct_df, exp_genes)``: the counts per cut-off, the same
        counts as a percentage of ``#Genes``, and the pooled gene list of the
        last cut-off evaluated (empty if ``gene_steps`` is empty).

    Note:
        The ``#Mutated >5`` and ``#Mutated >10`` columns count genes with
        ``count >= 5`` and ``count >= 10`` respectively.
    """
    tf_set = set(tf_list) if tf_list is not None else set()
    sort_col = "ModCon_{}_gt".format(exp.type)

    # Rank each entry once; every cut-off below is just a prefix of this ranking
    ranked_genes = [
        modCon.sort_values(by=[sort_col], ascending=False).index.values
        for modCon in exp.gt_modCon.values()
    ]

    all_exps, exp_genes = [], []
    for num_genes in gene_steps:
        exp_genes = []
        for ranking in ranked_genes:
            exp_genes.extend(ranking[:num_genes])

        # add to the global exps
        sel_df = mut_df[mut_df.index.isin(exp_genes)]
        mutated = sel_df[sel_df["count"] > 0]
        mut_0 = mutated.shape[0]
        mut_5 = sel_df[sel_df["count"] >= 5].shape[0]
        mut_10 = sel_df[sel_df["count"] >= 10].shape[0]

        # tf exploration: TFs among the pooled genes, and how many of those are mutated
        tf_used = set(exp_genes) & tf_set
        tf_used_mut = mutated[mutated.index.isin(list(tf_used))].shape[0]

        all_exps.append((len(exp_genes), mut_0, mut_5, mut_10,
                        len(tf_used), tf_used_mut, str(num_genes)))

    genes_stats = pd.DataFrame(all_exps, columns=[
                               "#Genes", "#Mutated >0", "#Mutated >5", "#Mutated >10", "#TF", "#Mutated TF", "Exp"])

    # Every metric column (all but "Exp") expressed as a percentage of "#Genes"
    metrics_cols = genes_stats.columns[:-1]
    remap_cols = {col: col.replace("#", "%") for col in metrics_cols}
    prct_df = pd.concat(
        [genes_stats["Exp"], genes_stats[metrics_cols].div(genes_stats["#Genes"].values, axis=0).multiply(100).rename(columns=remap_cols)], axis=1
    )

    if show_figs:
        # Proportion
        fig = px.bar(
            prct_df,
            x="Exp",
            y=list(remap_cols.values()),
            barmode="group",
            title="{}. Proportion of genes included when different #num selected by ModCon".format(
                exp_label),
            height=600,
            text_auto=True,
        )
        fig.show()

        # Numbers
        fig = px.bar(
            genes_stats,
            x="Exp",
            y=list(metrics_cols),
            barmode="group",
            title="{}. Number of genes when different #num selected by ModCon".format(
                exp_label),
            height=600,
        )
        fig.show()
    return genes_stats, prct_df, exp_genes


def tf_modCon_exps(exps, mut_df: pd.DataFrame, tf_range=None, tf_list=None):
    """Collect the TF statistics of ``gene_sel_by_mev`` for several experiments.

    Args:
        exps: mapping from a TF key to an experiment object.
        mut_df: gene-indexed mutation table, forwarded to ``gene_sel_by_mev``.
        tf_range: keys of ``exps`` to process, in order. ``None`` processes
            every key of ``exps`` in sorted order.
        tf_list: iterable of TF gene names; ``None`` means no TFs.

    Returns:
        One DataFrame with the ``#TF``, ``#Mutated TF`` and ``Exp`` columns of
        each experiment's statistics plus a ``TF_edges`` column holding the key
        as a string. An empty DataFrame if there is nothing to process.
    """
    if tf_range is None:
        # The default used to be iterated directly, which raised TypeError
        tf_range = sorted(exps)
    tf_names = [] if tf_list is None else tf_list

    frames = []
    for tf in tf_range:
        # First return value is the table of counts per cut-off
        genes_stats, _, _ = gene_sel_by_mev(
            exps[tf], mut_df=mut_df, exp_label=str(tf), show_figs=False, tf_list=tf_names)

        per_tf = genes_stats[["#TF", "#Mutated TF", "Exp"]].copy(deep=True)
        per_tf["TF_edges"] = "{}".format(tf)
        frames.append(per_tf)

    # Concatenate once instead of re-copying a growing frame on every iteration
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [103]:
# NOTE: You may need to update this
# TFs that are present among the nodes of the reference network
used_tf = set(
    exp_test.exps['standard_5K_4TF_hsbm'].nodes_df.index) & set(tf_list)

# Determine how many experiments with TFs were run
tf_range = sorted({exp.extract_tf_number(exp.name)
                   for exp in exp_test.exps.values()})

tf_chgs_all_ctrls = []
for key, val in ctrl_exps.items():
    tf_chgs = tf_modCon_exps(
        val["exps"], mut_df=mut_df, tf_range=tf_range, tf_list=tf_list)
    # Keep the 100-gene cut-off only. .copy() makes the filtered rows an
    # independent frame, so the column assignments below do not write to a slice.
    tf_chgs = tf_chgs[tf_chgs["Exp"] == "100"].copy()
    tf_chgs["Type"] = "hCtrl{}".format(key)
    tf_chgs["Used_prct"] = tf_chgs["#TF"] / len(used_tf) * 100
    tf_chgs_all_ctrls.append(tf_chgs)

tf_chgs_all_ctrls = pd.concat(tf_chgs_all_ctrls, axis=0)

# Generate the Exp Changes too
tf_changes_exp = tf_modCon_exps(
    h_exps, mut_df=mut_df, tf_range=tf_range, tf_list=tf_list)

In [104]:
# Process the information and get some stats

# Control: median/mean/std across the control runs, per TF setting.
# groupby(sort=False) visits the TF settings in order of first appearance.
all_tf_chgs_stats = []
for tf, sel_df in tf_chgs_all_ctrls.groupby("TF_edges", sort=False):
    row = [tf, sel_df["Exp"].values[0]]
    for col in ("Used_prct", "#TF", "#Mutated TF"):
        row.extend([sel_df[col].median(), sel_df[col].mean(), sel_df[col].std()])
    all_tf_chgs_stats.append(row)

# NOTE(review): the last three column names are inconsistent ("#Mutated median" vs
# "Mutated mean"/"Mutated std"); they are kept as they were in case other code reads them.
tf_chgs_stats_df = pd.DataFrame(
    all_tf_chgs_stats,
    columns=["TF_edges", "Exp", "Prct_median", "Prct_mean", "Prct_std", "#TF_median",
             "#TF_mean", "#TF_std", "#Mutated median", "Mutated mean", "Mutated std"],
)
tf_chgs_stats_df["Type"] = "Control"

# Experiment: a single run per TF setting, so median and mean equal the value and std is 0.
# .copy() makes the filtered rows an independent frame, so the column assignments
# below do not write to a slice of the unfiltered table.
tf_changes_exp = tf_changes_exp[tf_changes_exp["Exp"] == "100"].copy()
tf_changes_exp["Used_prct"] = tf_changes_exp["#TF"] / len(used_tf) * 100
tf_changes_exp["Prct_median"] = tf_changes_exp["Used_prct"]
tf_changes_exp["Prct_mean"] = tf_changes_exp["Used_prct"]
tf_changes_exp["Prct_std"] = 0
tf_changes_exp["Type"] = "Experiment"

# Combine the two
comb_chgs = pd.concat([tf_chgs_stats_df, tf_changes_exp], axis=0)

In [105]:
# Percentage of the available TFs that end up among the selected genes, per TF setting
fig = px.line(
    comb_chgs,
    x="TF_edges",
    y="Prct_mean",
    color="Type",
    error_y="Prct_std",
    markers=True,
    color_discrete_map=color_map,
    height=600,
    title="% TF used in calculating the MEV",
)
fig.update_layout(
    title="",
    font={"size": 16},
    legend={
        "orientation": "h",
        "xanchor": "center",
        "x": 0.5,
        "yanchor": "bottom",
        "y": 0.18,
        "bgcolor": "rgba(0,0,0,0)",
        "font": {"size": 16},
    },
    xaxis={
        "title": "Minimum degree for selected genes",
        "title_font": {"size": 16},
        "tickfont": {"size": 16},
    },
    yaxis={
        "title": "% of genes included",
        "title_font": {"size": 16},
        "tickfont": {"size": 16},
    },
)
fig.update_traces(error_y={"width": 5}, marker_size=12, line_width=5)

fig.show()