# Motivation

This notebook complements the reward hSBM work, in particular the analysis of the highly connected genes.

Here we focus on the 122 genes that are the most highly mutated, have the highest degree values, and are also separated into smaller communities. All of them are found in the reward network.

The notebook uses control experiments to test whether the selection of these 122 genes is non-random.

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio
import multiprocess as mp

# Make the project sources importable from the notebook.
# abspath() resolves the file name against the current working directory, so SCRIPT_DIR is
# the working directory and the first entry appended to sys.path is its parent.
SCRIPT_DIR = os.path.dirname(os.path.abspath("pcgna_processing.py"))
sys.path.append(os.path.dirname(SCRIPT_DIR))
# NOTE(review): machine-specific absolute path; on any other machine this entry points
# nowhere. Consider a relative or configurable location.
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities.helpers import save_fig
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
import graph_tool.all as gt

# %matplotlib inline

# Default plotly template for every figure created in this notebook.
pio.templates.default = "ggplot2"
# Worker pool sized to the CPU count reported by mp.cpu_count().
# NOTE(review): `pool` is not referenced (or closed) in the cells visible here; confirm it is still needed.
pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Locations, all relative to the notebook's working directory.
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/network_II/"

# Used further down both to read the control_mutated_*.csv tables and to save figures.
figures_path = "../network_II/reward/"

sbm_exps_path = "sbm/"
# NOTE(review): not referenced in the cells visible here; the control folders are read from
# "mut_control_v3_4/" instead. Confirm which version is intended.
control_exps_path = "mut_control_v3_2/"

# Clustering table, indexed by the "Sample" column.
vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

# Mutation counts per gene; rows with a zero count are dropped before indexing by gene.
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

# TPM table, indexed by the "genes" column.
all_tum_tpms = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")

# Metadata table indexed by "Sample", loaded with pyarrow-backed dtypes.
healthy_metadata = pd.read_csv(f"{data_base}/metadata/healthy_bladder_metadata.tsv", sep="\t", index_col="Sample", dtype_backend="pyarrow")

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
# NOTE: there is no else branch, so `tf_list` stays undefined when the file is missing.
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")

# Load experiments

In [3]:
# Build the experiment set, then keep only the experiments whose name contains "sigmoid",
# converted to Graph-Tool experiment objects and keyed by `exp.type`.
hsbm_v3 = ExperimentSet("v3", base_path=base_path, exp_path=sbm_exps_path, mut_df=tcga_mutations_df, sel_sets=None, exp_type="iNet")

exps = {}
for idx, exp in enumerate(hsbm_v3.get_exps()):
    # Skip every non-sigmoid experiment (the trailing comment is a leftover index filter).
    if "sigmoid" not in exp.name:  # or idx != 2:
        continue

    print(f"Loading Graph-Tool for {exp.type}")
    # `exp` is rebound from the loaded experiment to its Graph-Tool counterpart.
    exp = GtExp.from_pgcna_inet(exp, rel_path="")
    gt.remove_self_loops(exp.gt_g)  # needed for sigmoid

    # partitions from hSBM
    exp.hsbm_add_vp(mut_df=tcga_mutations_df)
    com_df, _ = exp.hsbm_get_gt_df()

    # save=False: nothing is written to disk here; presumably this populates the
    # node/edge tables on `exp` (TODO confirm against export_to_gephi).
    exp.export_to_gephi(save=False, com_df=com_df)

    # Need it for comparison
    exps[exp.type] = exp

# Replace the set's experiments with the filtered, converted ones.
hsbm_v3.exps = exps

##### Experiment labels:  dict_keys(['standard_5K_6TF_sbm', 'standard_5K_6TF_hsbm', 'sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm


In [4]:
%autoreload 2
# Select the sigmoid hSBM experiment (the "reward network") used by the rest of the notebook.
sel_gt: GtExp = hsbm_v3.exps["sigmoid_5K_6TF_hsbm"]
gt_state: gt.NestedBlockState = sel_gt.hstateObj["state"]

# Same calls as in the loading loop, repeated on the selected experiment.
sel_gt.hsbm_add_vp(mut_df=tcga_mutations_df)

gt_g = sel_gt.gt_g
com_df, _ = sel_gt.hsbm_get_gt_df()

# Compute and store the nodes/edges with all the information
sel_gt.export_to_gephi(save=False, com_df = com_df)
graph_stats = sel_gt.compute_graph_stats()

# NOTE(review): unlike the control loop further down, the result is not assigned back to
# `sel_gt.nodes_df`; confirm whether add_stats_to modifies its input in place.
nodes_df: pd.DataFrame = gh.add_stats_to(nodes_df=sel_gt.nodes_df, tpm_df=sel_gt.tpm_df)

# Experiment well-connected

In [5]:
def get_high_deg(exp: GtExp, com_det="max_b"):
    """Compute per-node graph statistics and the mean total degree of every community.

    Args:
        exp: experiment exposing ``nodes_df`` (with ``max_b`` and ``Modularity Class``
            columns) and ``compute_graph_stats()``, whose result has a ``degree_t`` column
            and is assumed to share ``nodes_df``'s index.
        com_det: column with the community labels to group by, ``"max_b"`` or
            ``"Modularity Class"``.

    Returns:
        ``(network_stats, com_stats)`` where

        * ``network_stats`` is the graph-statistics frame with both community columns added
          as strings, plus ``mean_degree``: the rounded mean degree of the node's own community;
        * ``com_stats`` has one row per community with ``com`` (string label, rows ordered
          by the label's integer value) and ``mean_degree`` (unrounded).
    """
    nodes_df = exp.nodes_df
    col_metric = "degree_t"

    #### Compute the network stats
    network_stats: pd.DataFrame = exp.compute_graph_stats()
    network_stats["max_b"] = nodes_df["max_b"].astype(str)
    network_stats["Modularity Class"] = nodes_df["Modularity Class"].astype(str)

    ### Mean degree per community (sort=False keeps the order of first appearance)
    com_means = network_stats.groupby(com_det, sort=False)[col_metric].mean()

    # Map every node to the mean of its own community. Assigning the per-community frame's
    # column directly would align a 0..n-1 index against the node index and not line up.
    network_stats["mean_degree"] = network_stats[com_det].map(com_means).round()

    com_stats = pd.DataFrame(
        {
            "com": [int(com) for com in com_means.index],
            "mean_degree": com_means.to_numpy(),
        }
    )
    # Sort numerically first, then expose the labels as strings.
    com_stats = com_stats.sort_values(by="com")
    com_stats["com"] = com_stats["com"].astype(str)

    return network_stats, com_stats


def process_high_deg(exp: GtExp, com_stats: pd.DataFrame, th: float = 70):
    """Select the genes belonging to the communities with the highest mean degree.

    Args:
        exp: experiment exposing ``nodes_df`` with ``count``, ``TF`` and ``max_b`` columns.
        com_stats: per-community frame with ``com`` (label convertible to int) and
            ``mean_degree`` columns, as returned by ``get_high_deg``.
        th: percentile (0-100) of ``mean_degree`` used as the cut-off; communities strictly
            above it are kept. The default of 70 was chosen so that all communities with a
            mean degree > 10 are included.

    Returns:
        Frame indexed like ``nodes_df`` with columns ``mut_count`` (renamed from ``count``),
        ``TF`` and ``max_b``, restricted to the genes of the selected communities.
    """
    nodes_df = exp.nodes_df

    if com_stats.empty:
        # No communities: nothing can be above the threshold, and the percentile of an
        # empty sample is not meaningful.
        coms = pd.Series([], dtype=int)
    else:
        th_dg = np.percentile(com_stats["mean_degree"], th)
        coms = com_stats.loc[com_stats["mean_degree"] > th_dg, "com"].astype(int)

    print(f"Number of coms {coms.shape[0]} of {len(nodes_df['max_b'].unique())}")
    print(list(coms.values))

    in_selected_com = nodes_df["max_b"].isin(coms)
    sel_com_genes = nodes_df.loc[in_selected_com, ["count", "TF", "max_b"]].rename(columns={"count": "mut_count"})

    print(f"Nummber of genes selected {sel_com_genes.shape[0]}")

    return sel_com_genes


def corr_rank_stats(exp: GtExp, corr_df: pd.DataFrame, sel_com_genes: pd.DataFrame):
    """Summarise how highly each selected gene ranks in its neighbours' correlation lists.

    For every gene in ``sel_com_genes.index`` the rows of ``corr_df`` belonging to the gene's
    network neighbours are ranked in descending order (rank 1 = strongest correlation in
    that neighbour's row), and the gene's rank in each of those rows is collected.

    Args:
        exp: experiment exposing ``get_gene_neigbhors(gene_name=..., verbose=...)``, whose
            second return value is a frame indexed by the neighbour names.
        corr_df: square gene x gene correlation matrix containing the selected genes and
            their neighbours as labels.
        sel_com_genes: frame whose index lists the genes to evaluate.

    Returns:
        Frame indexed by ``gene`` with the ``min``, ``max``, ``mean``, ``median`` and ``std``
        of the gene's rank across its neighbours.
    """
    rank_records = []
    for gene in sel_com_genes.index:
        # Get the neighbors for the gene
        _, n_df = exp.get_gene_neigbhors(gene_name=gene, verbose=False)
        neighbors = list(n_df.index)

        # Rank every neighbour's row of the unmodified corr matrix independently (axis=1)
        unmodified_rank = corr_df.loc[neighbors, :].rank(ascending=False, axis=1)
        # Work on the gene's column as a Series so each statistic is a scalar; this avoids
        # the deprecated integer-position lookup (`...min()[0]`) on a label-indexed Series.
        gene_ranks = unmodified_rank.loc[:, gene]

        rank_records.append(
            (
                gene,
                gene_ranks.min(),
                gene_ranks.max(),
                gene_ranks.mean(),
                gene_ranks.median(),
                gene_ranks.std(),
            )
        )

    rank_stats = pd.DataFrame(rank_records, columns=["gene", "min", "max", "mean", "median", "std"]).set_index("gene")

    return rank_stats


def compute_rank_ctrl(idx: int, ctrl_exps: dict, figures_path: str, exp_sel_genes: pd.DataFrame = None, corr: pd.DataFrame = None):
    """Compute the correlation-rank statistics for the highly connected genes of one control run.

    Args:
        idx: key of the control run in ``ctrl_exps``; also used in the file name of the
            control mutation table.
        ctrl_exps: mapping ``idx -> experiment set`` whose ``exps`` dict holds the
            ``"sigmoid_5K_6TF_hsbm"`` experiment.
        figures_path: folder containing ``control_mutated/control_mutated_{idx}.csv``.
        exp_sel_genes: highly connected genes of the reference experiment, used only for
            the overlap report. Defaults to the notebook-level ``sel_com_genes``.
        corr: gene x gene correlation matrix used for ranking. Defaults to the
            notebook-level ``corr_df``.

    Returns:
        The frame produced by ``corr_rank_stats`` for the control's highly connected genes.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working;
    # pass both explicitly to make the function independent of cell execution order.
    if exp_sel_genes is None:
        exp_sel_genes = sel_com_genes
    if corr is None:
        corr = corr_df

    ctrl_exp: GtExp = ctrl_exps[idx].exps["sigmoid_5K_6TF_hsbm"]
    ctrl_mut: pd.DataFrame = pd.read_csv(f"{figures_path}/control_mutated/control_mutated_{idx}.csv", index_col="gene")
    # Genes labelled "Control High Degree" in the control mutation table
    ctrl_genes = ctrl_mut[ctrl_mut["Type"] == "Control High Degree"].index

    _, ctrl_com_stats = get_high_deg(ctrl_exp)
    ctrl_sel_com_genes = process_high_deg(ctrl_exp, ctrl_com_stats)

    ctrl_high_deg = set(ctrl_sel_com_genes.index)
    cmn_high_deg_genes = ctrl_high_deg & set(exp_sel_genes.index)
    ctrl_genes_high = ctrl_high_deg & set(ctrl_genes)
    print(f"--> Common Exp & Control highly connected genes {len(cmn_high_deg_genes)}")
    print(f"--> Ctrl highly connected genes {len(ctrl_genes_high)}")

    rank_stats = corr_rank_stats(ctrl_exp, corr, sel_com_genes=ctrl_sel_com_genes)

    return rank_stats


# Highly connected genes of the selected experiment: per-community mean degree first,
# then the genes of the communities above the percentile threshold.
network_stats, com_stats = get_high_deg(sel_gt)
sel_com_genes = process_high_deg(sel_gt, com_stats)

Number of coms 14 of 45
[1, 2, 3, 5, 7, 8, 10, 12, 15, 18, 28, 32, 33, 42]
Nummber of genes selected 122


# Control

In [6]:
import random

### Generate the mutations
# Disabled by default (`if 0`): enable the guard to write a new control table in which the
# mutation counts of the highly connected genes are swapped with randomly chosen
# non-mutated genes.
# NOTE(review): the sampling is unseeded, so every run produces a different control.
if 0:
    for i in range(1, 2):
        # keep a copy of the initial dataset
        all_5K_genes = sel_gt.tpm_df.copy(deep=True)
        # Index-aligned assignment: genes missing from the mutation table become NaN -> 0.
        # The filled column is assigned back explicitly; `df[col].fillna(..., inplace=True)`
        # operates on a temporary under pandas Copy-on-Write and would leave the NaNs.
        all_5K_genes["mut_count"] = tcga_mutations_df["count"]
        all_5K_genes["mut_count"] = all_5K_genes["mut_count"].fillna(0)
        all_5K_genes = all_5K_genes.reset_index()[["genes", "mut_count"]]
        not_mutated = all_5K_genes[all_5K_genes["mut_count"] == 0]["genes"]

        ### Randomnly NOT select the highly-connected genes
        rand_sel_genes = random.sample(list(not_mutated), len(sel_com_genes))
        # highly connected gene -> randomly picked non-mutated gene
        rename_genes = dict(zip(sel_com_genes.index, rand_sel_genes))

        is_highly_con = all_5K_genes["genes"].isin(sel_com_genes.index)

        ### Change the highly connected genes to the randomnly picked
        # .copy() so the renaming below modifies an independent frame, not a slice
        highly_con = all_5K_genes.loc[is_highly_con].copy()
        highly_con["genes"] = highly_con["genes"].replace(rename_genes)
        highly_con = highly_con.set_index("genes")

        ### The randomnly picked genes are changed with the highly connected
        reversed_dict = {value: key for key, value in rename_genes.items()}
        not_highly_con = all_5K_genes.loc[~is_highly_con].copy()
        not_highly_con["genes"] = not_highly_con["genes"].replace(reversed_dict)
        not_highly_con = not_highly_con.set_index("genes")

        new_mutated_genes = pd.concat([not_highly_con, highly_con], axis=0)
        new_mutated_genes["Type"] = "Experiment"
        new_mutated_genes.loc[list(rename_genes.keys()), "Type"] = "Experiment High Degree"
        new_mutated_genes.loc[list(rename_genes.values()), "Type"] = "Control High Degree"

        ### Exported
        new_mutated_genes["mut_count"] = new_mutated_genes["mut_count"].astype(int)
        new_mutated_genes = new_mutated_genes.rename(columns={"mut_count": "count"})
        new_mutated_genes.index.names = ["gene"]
        # NOTE(review): written next to the figures with a "_c" suffix, while the cells below
        # read "control_mutated/control_mutated_{idx}.csv". Confirm the intended location.
        new_mutated_genes.to_csv(f"{figures_path}/control_mutated_{i}_c.csv")

In [7]:
# List the immediate sub-folders of the control results directory (one per control run).
# The first item yielded by os.walk is (dirpath, dirnames, filenames); element [1] is the
# list of sub-folder names. The fallback tuple carries an empty list in that slot so a
# missing directory yields [] rather than None.
folders_path = f"{base_path}/mut_control_v3_4/"
folders = next(os.walk(f"{folders_path}"), (None, [], []))[1]
folders

['ctrl_2', 'ctrl_5', 'ctrl_4', 'ctrl_3', 'ctrl_1', 'ctrl_0']

In [8]:
# Load every control run found under the control results directory into `ctrls`,
# keyed by the integer suffix of its folder name ("ctrl_3" -> 3).
folders_path = f"{base_path}/mut_control_v3_4/"
# Fallback has an empty list in the dirnames slot so a missing directory gives no folders
# (the loop below is skipped) instead of None.
folders = next(os.walk(f"{folders_path}"), (None, [], []))[1]

ctrls = {}
for folder in folders:
    print(f"### Control experiment form {folder}")
    ctrl_path = f"{folders_path}/{folder}/"
    idx = int(folder.split("ctrl_")[-1])

    # Load the experiment set
    ctrls[idx] = ExperimentSet("ctrl", base_path, ctrl_path, tcga_mutations_df, sel_sets=None, rel_path="../", exp_type="iNet")
    ctrls[idx].export_to_gephi(save=False)

    # Compute the required information.
    # Control-specific names are used throughout so the reward-network objects created in
    # the earlier cells (`exps`, `com_df`, `gt_g`, `graph_stats`, `nodes_df`) are not overwritten.
    ctrl_set_exps = {}
    for exp in ctrls[idx].get_exps():
        if "sigmoid" not in exp.name:
            continue

        print(f"Loading Graph-Tool for {exp.type}")
        exp = GtExp.from_pgcna_inet(exp, rel_path="")
        # NOTE(review): the reward experiment has its self-loops removed after loading
        # ("needed for sigmoid"); the controls do not. Confirm this difference is intended,
        # since degrees of the two are compared later.

        # partitions from hSBM
        exp.hsbm_add_vp(mut_df=tcga_mutations_df)
        ctrl_com_df, _ = exp.hsbm_get_gt_df()

        exp.export_to_gephi(save=False, com_df=ctrl_com_df)
        exp.gt_modCon_MEV(all_tpms=exp.tpm_df, is_imev=True)

        # Result unused here; the call is kept in place in case it prepares state that
        # add_stats_to relies on (TODO confirm).
        exp.compute_graph_stats()
        exp.nodes_df = gh.add_stats_to(nodes_df=exp.nodes_df, tpm_df=exp.tpm_df)

        # Need it for comparison
        ctrl_set_exps[exp.type] = exp

    # Save the experiments
    ctrls[idx].exps = ctrl_set_exps

### Control experiment form ctrl_2
##### Experiment labels:  dict_keys(['sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm
### Control experiment form ctrl_5
##### Experiment labels:  dict_keys(['sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm
### Control experiment form ctrl_4
##### Experiment labels:  dict_keys(['sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm
### Control experiment form ctrl_3
##### Experiment labels:  dict_keys(['sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm
### Control experiment form ctrl_1
##### Experiment labels:  dict_keys(['sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm
### Control experiment form ctrl_0
##### Experiment labels:  dict_keys(['sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for sigmoid_5K_6TF_hsbm


In [9]:
# Inspect a single control run (key 3): select its highly connected genes and count how
# many of them are also highly connected in the reference experiment.
ctrl_exp: GtExp = ctrls[3].exps["sigmoid_5K_6TF_hsbm"]
ctrl_gt_g = ctrl_exp.gt_g
_, ctrl_com_stats = get_high_deg(ctrl_exp)
ctrl_sel_com_genes = process_high_deg(ctrl_exp, ctrl_com_stats)

# Overlap between the control's and the experiment's highly connected gene sets
cmn_high_deg_genes = set(ctrl_sel_com_genes.index) & set(sel_com_genes.index)
print(f"--> Common highly connected genes {len(cmn_high_deg_genes)}")

Number of coms 13 of 42
[1, 2, 4, 6, 8, 9, 13, 20, 23, 24, 34, 35, 38]
Nummber of genes selected 137
--> Common highly connected genes 0


## Rank stats

In [10]:
# Spearman correlation between the rows of the experiment's TPM table (the table is
# transposed because DataFrame.corr correlates columns). compute_rank_ctrl reads this
# notebook-level `corr_df`, so this cell must run before it is called.
corr_df = sel_gt.tpm_df.T.corr("spearman")

In [11]:
# Control run whose highly connected genes are ranked and plotted below.
ctrl_idx = 3
rank_stats = compute_rank_ctrl(idx=ctrl_idx, ctrl_exps=ctrls, figures_path=figures_path)

metric = "mean"
rank_stats = rank_stats.sort_values(by=metric, ascending=False)
# `rank_stats` is indexed by the control network's highly connected genes, so the community
# label has to come from the control's own node table. Taking it from the experiment's
# `sel_com_genes` aligns on a different gene set and leaves the column empty.
ctrl_nodes_df = ctrls[ctrl_idx].exps["sigmoid_5K_6TF_hsbm"].nodes_df
rank_stats["max_b"] = ctrl_nodes_df["max_b"].astype(str)
fig = px.bar(
    rank_stats.reset_index(),
    x="gene",
    y=metric,
    error_y="std",
    hover_data="gene",
    title=f"Average {metric} of the correlation rank of these genes in their neighbours ranks",
)
# Last expression of the cell: the updated figure is what gets displayed.
fig.update_layout(height=700)

Number of coms 13 of 42
[1, 2, 4, 6, 8, 9, 13, 20, 23, 24, 34, 35, 38]
Nummber of genes selected 137
--> Common Exp & Control highly connected genes 0
--> Ctrl highly connected genes 88


## Ctrl genes in highest connected nodes

In [12]:
def _degree_summary(run: int, label: str, degrees: pd.Series) -> tuple:
    """Return ``(run, label, max, min, mean, var, std, q3)`` for a series of degree values."""
    return (
        run,
        label,
        degrees.max(),
        degrees.min(),
        degrees.mean(),
        degrees.var(),
        degrees.std(),
        degrees.quantile(0.75),
    )


# Collected per run, then turned into frames once at the end (instead of growing a
# DataFrame with concat on every iteration).
ctrl_high_conn, degree_summaries, degree_frames = [], [], []

network_stats, com_stats = get_high_deg(sel_gt)
sel_com_genes = process_high_deg(sel_gt, com_stats)

###### To be first
# Reference experiment: degree of all its highly connected genes (reported as run 6).
exp_degree = network_stats.loc[list(sel_com_genes.index), "degree_t"]
degree_summaries.append(_degree_summary(6, "Reward Network", exp_degree))
degree_frames.append(pd.DataFrame(exp_degree).assign(type="Exp"))

# NOTE(review): only control runs 1-5 are used although a `ctrl_0` folder is also loaded;
# confirm that skipping run 0 is intentional.
for idx in range(1, 6):
    ctrl_exp: GtExp = ctrls[idx].exps["sigmoid_5K_6TF_hsbm"]
    ctrl_mut: pd.DataFrame = pd.read_csv(f"{figures_path}/control_mutated/control_mutated_{idx}.csv", index_col="gene")
    ctrl_genes = ctrl_mut[ctrl_mut["Type"] == "Control High Degree"].index

    _, ctrl_com_stats = get_high_deg(ctrl_exp)
    ctrl_sel_com_genes = process_high_deg(ctrl_exp, ctrl_com_stats)

    # Designated control genes that ended up among the control's highly connected genes
    ctrl_genes_high = list(set(ctrl_sel_com_genes.index) & set(ctrl_genes))
    ctrl_high_conn.append((idx, len(ctrl_genes_high), len(ctrl_genes), ctrl_genes_high))

    # stats
    graph_stats = ctrl_exp.compute_graph_stats()
    ctrl_degree = graph_stats.loc[ctrl_genes_high, "degree_t"]
    degree_summaries.append(_degree_summary(idx, "Control", ctrl_degree))
    degree_frames.append(pd.DataFrame(ctrl_degree).assign(type=f"Ctrl_{idx}"))

# One row per gene with its degree and the run it belongs to ("Exp" or "Ctrl_<idx>")
all_degree = pd.concat(degree_frames)

# Named `degree_stats` rather than `stats`: a later cell binds `stats` to `scipy.stats`.
degree_stats = pd.DataFrame(degree_summaries, columns=["run", "type", "max", "min", "mean", "var", "std", "q3"])
ctrl_high_conn = pd.DataFrame(ctrl_high_conn, columns=["Run", "#genes", "#control", "genes"])
display(ctrl_high_conn)
display(degree_stats)

Number of coms 14 of 45
[1, 2, 3, 5, 7, 8, 10, 12, 15, 18, 28, 32, 33, 42]
Nummber of genes selected 122
Number of coms 14 of 47
[1, 2, 3, 6, 8, 12, 13, 17, 21, 28, 31, 33, 34, 36]
Nummber of genes selected 143
Number of coms 14 of 46
[1, 2, 3, 5, 7, 15, 16, 18, 22, 36, 37, 38, 39, 40]
Nummber of genes selected 112
Number of coms 13 of 42
[1, 2, 4, 6, 8, 9, 13, 20, 23, 24, 34, 35, 38]
Nummber of genes selected 137
Number of coms 13 of 42
[1, 2, 6, 8, 10, 11, 14, 22, 24, 25, 27, 29, 36]
Nummber of genes selected 118
Number of coms 12 of 39
[1, 2, 3, 5, 6, 9, 12, 15, 24, 27, 31, 32]
Nummber of genes selected 119


Unnamed: 0,Run,#genes,#control,genes
0,1,90,122,"[MAILR, WASH6P, ALDH6A1, ENSG00000176593, DLX6..."
1,2,75,122,"[ENSG00000278493, ENSG00000290957, LAMC1-AS1, ..."
2,3,88,122,"[MICB, ENSG00000286584, ENSG00000267274, UCK2,..."
3,4,85,122,"[MAILR, ENSG00000291176, NSUN5P2, ENSG00000288..."
4,5,82,122,"[ENSG00000176593, MIR34AHG, ZNF516-AS1, ACP6, ..."


Unnamed: 0,run,type,max,min,mean,var,std,q3
0,6,Reward Network,985,8,157.106557,36460.09599,190.94527,196.5
1,1,Control,834,8,174.855556,41722.799126,204.261595,184.25
2,2,Control,989,12,208.84,45176.271351,212.547104,301.0
3,3,Control,1085,10,190.681818,44223.6907,210.294295,267.25
4,4,Control,753,11,188.741176,38383.122689,195.916111,338.0
5,5,Control,1108,13,206.646341,52105.070912,228.265352,304.0


In [13]:
# Box plot of the degree of the highly connected genes, one box per run ("Exp" and each
# "Ctrl_<idx>"), with every individual gene drawn as a point (points="all").
fig = px.box(
    all_degree.reset_index(names="gene"),
    y="degree_t",
    x="type",
    color="type",
    points="all",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    # The experiment gets a fixed colour; the other groups take colours from the sequence above
    color_discrete_map={"Exp": px.colors.qualitative.Plotly[1]},
    hover_data="gene",
)
fig = fig.update_layout(
    title="",
    legend=dict(
        title="Type",
        orientation="h",
        yanchor="middle",
        xanchor="center",
        y=1.05,
        x=0.5,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=20, color="#003366"),
    ),
    # The legend is switched off, so the legend styling above is not visible in the output
    showlegend=False,
    yaxis=dict(tickfont=dict(size=22), title="Degree", title_font_size=28),
    xaxis=dict(tickfont=dict(size=22), title="Type", title_font_size=28),
    font=dict(size=24),
    height=700,
)
fig.show()
# Export the figure into the figures folder via the project's save_fig helper.
# NOTE(review): the export height (600) differs from the layout height (700); confirm which one is intended.
save_fig(name=f"ctr_degree", fig=fig, base_path=f"{figures_path}/", width=1700, height=600, margin=0.02)

In [15]:
from scipy import stats

# One degree series per run, in the order the run labels first appear in `all_degree`.
sep_metrics = [
    all_degree.loc[all_degree["type"] == exp_type]["degree_t"]
    for exp_type in all_degree["type"].unique()
]

In [18]:
# Kruskal-Wallis H-test across all degree groups (experiment + every control run).
# Unpacking the list keeps the call correct when the number of runs changes.
stats.kruskal(*sep_metrics)

KruskalResult(statistic=8.66820414354024, pvalue=0.12305298695424723)

In [None]:
# Recompute the experiment's highly connected genes and display the resulting table
# (same two calls as in the earlier cells; this cell has no execution count).
network_stats, com_stats = get_high_deg(sel_gt)
sel_com_genes = process_high_deg(sel_gt, com_stats)
sel_com_genes