# Motivation

This notebook is using the standard network generated from the non-cancerous dataset. It explores the subtypes derived of the non-tumour dataset.

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys
import igraph as ig
import pickle

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

import multiprocess as mp

SCRIPT_DIR = os.path.dirname(os.path.abspath("pcgna_processing.py"))
sys.path.append(os.path.dirname(SCRIPT_DIR))
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.NetworkOutput import NetworkOutput
from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot, survival_comp
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
import NetworkAnalysis.utilities.clustering as cs

from NetworkAnalysis.dea import dea
from NetworkAnalysis.dea import helpers as dea_hp
from NetworkAnalysis.dea import gsea as gsea_hp

# Gsea libraries
import gseapy as gp
import matplotlib.pyplot as plt

import graph_tool.all as gt

# %matplotlib inline

pio.templates.default = "ggplot2"


pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"

figures_path = "../network_II/standard/"

base_sbm_path = "../../../iNet_v2/"
sbm_exps_path = "results/sbm/"

vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

all_tum_tpms = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")


healthy_metadata = pd.read_csv(f"{data_base}/metadata/healthy_bladder_metadata.tsv", sep="\t", index_col="Sample", dtype_backend="pyarrow")

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")

# Load experiments

In [3]:
hsbm_v3 = ExperimentSet("v3", base_path=base_sbm_path, exp_path=sbm_exps_path, mut_df=tcga_mutations_df, sel_sets=None, exp_type="iNet")

exps = {}
for idx, exp in enumerate(hsbm_v3.get_exps()):
    if exp.sbm_method != "hsbm":  # or idx != 2:
        continue

    print(f"Loading Graph-Tool for {exp.type}")
    exps[exp.type] = GtExp.from_pgcna_inet(exp, rel_path="")
    exps[exp.type].export_to_gephi(save=False)

hsbm_v3.exps = exps

##### Experiment labels:  dict_keys(['standard_5K_6TF_sbm', 'standard_5K_6TF_hsbm', 'sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for standard_5K_6TF_hsbm
Loading Graph-Tool for sigmoid_5K_6TF_hsbm


In [4]:
%autoreload 2
sel_gt: GtExp = hsbm_v3.exps["standard_5K_6TF_hsbm"]
gt_state: gt.NestedBlockState = sel_gt.hstateObj["state"]

sel_gt.hsbm_add_vp(mut_df=tcga_mutations_df)

gt_g = sel_gt.gt_g
pv = sel_gt.hstateObj["pv"]
pmode = sel_gt.hstateObj["pmode"]

com_df, _ = sel_gt.hsbm_get_gt_df()
com_df["max_b"] = com_df["P_lvl_0"]

gt.remove_self_loops(gt_g)

######## ModCon and MEVS ########
sel_gt.gt_modCon_MEV(all_tpms=sel_gt.tpm_df)

# Compute and store the nodes/edges with all the information
sel_gt.export_to_gephi(save=False, com_df = com_df)
sel_gt.add_gt_prop_draw(gt_g, com_df=com_df, tf_list=tf_list)
graph_stats = sel_gt.compute_graph_stats()

nodes_df: pd.DataFrame = gh.add_stats_to(nodes_df=sel_gt.nodes_df, tpm_df=sel_gt.tpm_df)

In [6]:
if 0:
    nodes_df[["max_b", "ModCon_Rank", "mean", "median", "std", "var", "count", "ctrl_tf"]].to_csv(figures_path + "nodes_df_hsbm.tsv", sep="\t")
    # sel_gt.export_to_gephi(save=True, com_df=com_df[["max_b", "node_idx"]])

# Non-tum Analysis

## Export to Morpheus

In [5]:
label = "hsbm_std"
sel_gt.gt_modCon_MEV(all_tpms=sel_gt.tpm_df)
cs_exp, fig_std, metrics = gh.run_clusters(sel_gt, label="hsbm_std", show_figs=False)

# Combine clustering with the Mevs
cluster_cols = [f"RawKMeans_CS_{7}_hsbm_std"]
mevs_cs = pd.concat([cs_exp[cluster_cols], sel_gt.mevsMut], axis=1)

healthy_cols = healthy_metadata.columns
mevs_cols = sel_gt.mevsMut.columns

# Add the metadata
comb_df = pd.concat([healthy_metadata.drop("Y2796_P0"), mevs_cs], axis=1).dropna(how="all")

# reorder columns
comb_df[list(healthy_cols) + list(cluster_cols) + list(mevs_cols)].T

# save the file
label = f"{sel_gt.type}"
filename = f"healthy_{label}_v3_1"
path = f"{figures_path}/Morpheus/non_tum/{filename}.tsv"

comb_df.transpose().to_csv(path, sep="\t", index=True)

Variation per principal component [0.47958678 0.32651145] and the sum 80.61%


## Import morpheus

In [6]:
morpheus_path = f"{figures_path}/Morpheus/non_tum/norm_{filename}.gct"

morp_df = pd.read_csv(morpheus_path, sep="\t", skiprows=2)

columns = morp_df["id"]
morp_df = morp_df.transpose()
morp_df.columns = columns
morp_df = morp_df.iloc[2:, :]

morp_df.rename_axis("sample", axis="columns", inplace=True)
# Group 4 has just one sample
morp_df.loc[morp_df["dendrogram_cut"] == "1.00", "dendrogram_cut"] = "2.00"

In [7]:
morp_df[morp_df["subset_name"] == "B-Diff"][["subset_name", "dendrogram_cut", "NHU_differentiation"]].index

Index(['Y815A-Bl', 'Y836B-Bl', 'Y499B-Bl', 'Y929B-Bl', 'Y719B-Bl'], dtype='object')

In [8]:
reorder_cols = [
    "NHU_differentiation",
    "dendrogram_cut",
    "subset_name",
    # "RawKMeans_CS_7_rwd_mut",
    "Gender",
]
meta, sky_fig = sky.main(df=morp_df.fillna("NA"), reorder_cols=reorder_cols, title="Non-tum", retMeta=True)
sky_fig.update_layout(
    font=dict(size=18),
    paper_bgcolor="rgba(0,0,0,0)",
)

### Export for DEA

In [9]:
morp_df.to_csv(f"{figures_path}/Morpheus/non_tum/prcsd_morpheus_cs_7.tsv", sep="\t", index_label="sample")

# Export to show the communities on the Volcano plot
_ = gh.export_top_modCon_genes(nodes_df=sel_gt.nodes_df, path=figures_path, top_n=100, metric="median")
modConRank_genes = gh.export_top_modCon_genes(nodes_df=sel_gt.nodes_df, path=figures_path, top_n=100, metric="ModCon_Rank")

### Export for cluster tree

In [10]:
cluster_labels = sel_gt.nodes_df[["P_lvl_0", "P_lvl_1", "P_lvl_2", "P_lvl_3", "P_lvl_4"]].copy(deep=True)
cluster_labels.to_csv(f"{figures_path}/cluster_tree/cluster_tree_std_v3.csv")

# reverse the columns
cluster_labels.columns = cluster_labels.columns[::-1]
cluster_labels.to_csv(f"{figures_path}/cluster_tree/cluster_tree_std_v3_reverse.csv")

## DEA

### P0 Split

In [11]:
dea_path = "../../data/non_cancerous/dea/norm_splits/"

f_male_diff_small = "sleuth_5.0_6.0_v3_vulcano_labels.tsv"
f_male_diff_large = "sleuth_5.0_7.0_v3_vulcano_labels.tsv"
f_small_large_diff = "sleuth_6.0_7.0_v3_vulcano_labels.tsv"
f_p0_comp = "sleuth_P0_small_P0_large_vulcano_labels.tsv"

markers = modConRank_genes.to_dict("list")

In [12]:
sel_coms = [19, 22, 25, 29]
filter_dict = {f"{key}_ModCon": markers[key] for key in sel_coms}

volcano = dea.volcano(f_p0_comp, base_path=dea_path, known_markers=False, markers=filter_dict)

volcano = dea_hp.toggle_legend(volcano, kept_traces=["0", "Point(s) of interest", "Dataset"])
volcano.add_annotation(text="P0 large", x=-4, y=8, showarrow=False, font=dict(size=20, color="#177c7a"))
volcano.add_annotation(text="P0 small", x=2, y=8, showarrow=False, font=dict(size=20, color="#177c7a"))

volcano = volcano.update_layout(
    title="",
    legend=dict(
        orientation="h",
        yanchor="middle",
        xanchor="center",
        y=0.98,
        x=0.6,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=14, color="#003366"),
    ),
    xaxis=dict(tickfont=dict(size=20)),
    yaxis=dict(
        tickfont=dict(size=20),
    ),
    font=dict(size=16),
    height=1100,
)
# volcano

Finished loading the data in 0.03572797775268555


### Diff split

In [13]:
sel_coms = [19, 1, 29, 25, 22]
filter_dict = {f"{key}_ModCon": markers[key] for key in sel_coms}

volcano = dea.volcano(f_small_large_diff, base_path=dea_path, known_markers=False, markers=filter_dict)

volcano = dea_hp.toggle_legend(volcano, kept_traces=["0", "Point(s) of interest", "Dataset"])
volcano.add_annotation(text="Abs-Ca large", x=-3, y=19, showarrow=False, font=dict(size=20, color="#177c7a"))
volcano.add_annotation(text="Abs-Ca small", x=5, y=19, showarrow=False, font=dict(size=20, color="#177c7a"))

volcano = volcano.update_layout(
    title="",
    legend=dict(
        orientation="h",
        yanchor="middle",
        xanchor="center",
        y=0.98,
        x=0.4,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=14, color="#003366"),
    ),
    xaxis=dict(tickfont=dict(size=20)),
    yaxis=dict(
        tickfont=dict(size=20),
    ),
    font=dict(size=16),
    height=1100,
)
# volcano.show()

Finished loading the data in 0.03459286689758301


## Male specific

In [14]:
config = {"file_1": f_male_diff_large, "file_2": f_male_diff_small, "ref_x": "max", "ref_y": "min"}
pi, pi_df = gsea_hp.rank_pi_vals("maleSpecific", config, dea_path=dea_path, custom_points={}, show_known_markers=False)
# pi.show()

## GSEA

In [15]:
import gseapy as gp

msigdb_hallmarks = "../../data/GSEA/msigDB/h.all.v2023.2.Hs.symbols.gmt"
msigdb_reactome = "../../data/GSEA/msigDB/c2.cp.reactome.v2023.2.Hs.symbols.gmt"
msigdb_tf = "../../data/GSEA/msigDB/c3.tft.v2023.2.Hs.symbols.gmt"
msigdb_cancer = "../../data/GSEA/msigDB/c4.all.v2023.2.Hs.symbols.gmt"
msigdb_onco_sig = "../../data/GSEA/msigDB/c6.all.v2023.2.Hs.symbols.gmt"
msigdb_all = "../../data/GSEA/msigDB/msigdb.v2023.2.Hs.symbols.gmt"

databases = {
    "hallmark": {"path": msigdb_hallmarks},
    # "onco_sig": {"path": msigdb_onco_sig},
    "reactome": {"path": msigdb_reactome},
    ### Un-comment to run the following pathways
    # "tf": { "path": msigdb_tf },
    # "can_path": { "path": msigdb_cancer }, #canonical pathway
    # "all": { "db": msigdb_all },
}

runs_config = {"male_specific": {"file_1": f_male_diff_large, "file_2": f_male_diff_small, "ref_x": "max", "ref_y": "min"}}

In [16]:
if 0:
    new_runs = {}
    version = "v1"
    for subtype, config in runs_config.items():
        print(f"##### {subtype} #####")

        path = f"{figures_path}/GSEA/{version}/{subtype}"

        # Add the path
        config["path"] = path

        # Run the Pi value - the saved Pi plot is a check for selecting the right quadrant
        config["pi"], config["pi_df"] = gsea_hp.rank_pi_vals(subtype, config, dea_path=dea_path, custom_points={})

        # Run the GSEA
        gsea_res = gsea_hp.run_gsea(subtype, config, databases=databases)

        # Merging two dict, requires python >3.5
        new_runs[subtype] = {**config, **gsea_res}

In [17]:
if 0:
    for subtype, config in new_runs.items():

        for key, value in databases.items():
            print(f"{subtype} --> {key}")
            rank_gsea = config[key]
            base_path = config["path"]
            _, _ = gsea_hp.plot_top_gsea(res=rank_gsea, path=base_path, num=10, label=f"{subtype}_{key}", database=key)

            # brea

# MIBC Analysis

## Export to Morpheus

In [128]:
sorted([9, 13, 15, 21])

[9, 13, 15, 21]

In [140]:
######## Tissue type based on Community from healthy ########
comm_meta = {
    "diff": [8, 20, 22, 24, 25, 26, 27, 28, 30, 31, 32, 33],
    "p0": [0, 1, 2, 3, 4, 5, 6, 7, 16, 17],
    "undiff": [9, 10, 11, 12, 13, 14, 15, 21, 34],
    "p0_ud": [18],
    "misfit": [19, 29],
}

comm_meta_df = []
for key, comunities in comm_meta.items():
    for com in comunities:
        comm_meta_df.append((key, f"Com_{com}", com))
diff_type = pd.DataFrame(comm_meta_df, columns=["Diff Type", "Com", "Com_idx"]).sort_values(by=["Com_idx"]).set_index("Com")

In [142]:
diff_type['Com_idx']

Unnamed: 0_level_0,Diff Type,Com_idx
Com,Unnamed: 1_level_1,Unnamed: 2_level_1
Com_0,p0,0
Com_1,p0,1
Com_2,p0,2
Com_3,p0,3
Com_4,p0,4
Com_5,p0,5
Com_6,p0,6
Com_7,p0,7
Com_8,diff,8
Com_9,undiff,9


In [121]:
# # from observations
# Samples which were clustered in their single group
samples_to_remove = ["TCGA-5N-A9KI", "TCGA-DK-A2HX", "TCGA-XF-A8HD", "TCGA-G2-A2EF"]

# Add the samples thare grouped in 2 (cut - 11)
samples_to_remove.extend(["TCGA-XF-AAN0", "TCGA-4Z-AA7Q"])

# Add another sample which was an outlier in 3.2
samples_to_remove.extend(["TCGA-FD-A3SR", "TCGA-CF-A9FH"])

# pre-processing
f_tum = all_tum_tpms.drop(columns=samples_to_remove)

In [141]:
sel_gt.gt_modCon_MEV(all_tpms=f_tum, is_imev=True, com_df=com_df)
cs_exp, cs_figs, metrics_figs = gh.run_clusters(sel_gt, label="iMEV", show_figs=True, norm=False)

settings = {
    "cs_num_1": 5,
    "cs_num_2": 7,
    "sel_mut": pd.DataFrame(),
    "vu_output": vu_output,
    "cs_exp": cs_exp,
    "figures_path": f"{figures_path}/Morpheus/tum/",
    "label_col_cs": "iMEV",
    "filename": f"tum_{sel_gt.type}_iMev_3.3",
    "diff_type": diff_type,
    # 3.0 - with all the samples
    # 3.1 - without the samples that are in a single group
    # 3.2 - without the samples that are in pair
    # 3.3 - extra sample as an outliar
}

tum_morph = sel_gt.export_morpheus(settings)

Variation per principal component [0.51363944 0.29612881] and the sum 80.98%


In [139]:
K = 7
label = f"RawKMeans_CS_{K}_iMEV"
reorder_cols = [
    "TCGA_2017_AM_remap",
    # "KMeans_labels_6",
    # label,
    "2019_consensus_classifier",
]

meta, sky_fig = sky.main(
    df=tum_morph.dropna(), reorder_cols=reorder_cols, title="{}. Comp between {} ".format("SBM derived ", ", ".join(reorder_cols)), retMeta=True
)
sky_fig.show()

In [19]:
if 0:
    morpheus_path = f"{figures_path}/v3/Morpheus/v3.4 (tum)"

    morp_df = pd.read_csv(f"{morpheus_path}/norm_mevsMut_sigmoid_v3_2_5K.gct", sep="\t", skiprows=2)

    columns = morp_df["id"]
    morp_df = morp_df.transpose()
    morp_df.columns = columns
    morp_df = morp_df.iloc[2:, :]

    morp_df.rename_axis("sample", axis="columns", inplace=True)
    reorder_cols = [
        "TCGA_2017_AM_remap",
        # "KMeans_labels_6",
        "dendrogram_cut",
        # "RawKMeans_CS_7_rwd_mut",
        "2019_consensus_classifier",
    ]
    meta, sky_fig = sky.main(df=morp_df, reorder_cols=reorder_cols, title="{}. Comp between {} ".format("SBM derived ", ", ".join(reorder_cols)), retMeta=True)
    sky_fig.show()

## Visualising SBM

In [22]:
sel_gt: GtExp = hsbm_v3.exps["standard_5K_6TF_hsbm"]
gt_g: gt.Graph = sel_gt.gt_g

com_df, _ = sel_gt.hsbm_get_gt_df()
sel_gt.graph_type = "sbm"
sel_gt.nodes_df["node_idx"] = com_df["node_idx"]
sel_gt.nodes_df["max_b"] = com_df["max_b"]

sel_gt.hsbm_add_vp(mut_df=tcga_mutations_df)
sel_gt.add_gt_prop_draw(sel_gt.gt_g, com_df=com_df, tf_list=tf_list)

gt_state: gt.NestedBlockState = sel_gt.hstateObj["state"]
pv = sel_gt.hstateObj["pv"]

vp_dg = gt_g.degree_property_map(deg="total", weight=gt_g.ep["weight"])

In [23]:
sel_gt.gt_g.set_vertex_filter(None)
sel_gt.show_community(communities=[19, 25, 29])
# sel_gt.gt_g.set_vertex_filter(None)

In [53]:

draw_results = gt_state.draw(
    # pos=pos,
    layout="sfdp",
    # vertex_shape="pie",
    # vertex_pie_fractions=pv,
    inline=False,
    vertex_color=gt_g.vp["colors_rank"],
    # vertex_color=gt.prop_to_size(gt_g.vp["modCon_rank"], 0, 100, power=0.1),
    vertex_text=gt_g.vp.prcsd_gene,
    display_props=gt_g.vp.display_prop,
    # subsample_edgesint=int(gt_g.num_edges() * 0.5),
    output_size=(1600, 1600),  # good to visualised in Notebook
    # ######## For mut_count ########
    # vertex_size=gt.prop_to_size((gt_g.vp["sel_com"]), 1, 10, power=0.5),
    # vertex_font_size=gt.prop_to_size(gt_g.vp.is_tf, 15, 30, power=0.5),
    # vertex_font_size=gt.prop_to_size(gt_g.vp.is_tf, 5, 10, power=0.5),
    # edge_pen_width=gt.prop_to_size(gt_g.ep.weight, 2, 10, power=1),
    # vertex_size=gt.prop_to_size(vp_dg, 35, 35, power=1),
    ######## is TF ########
    # vertex_size=gt.prop_to_size(gt_g.vp.is_tf, 10, 30, power=0.5),
    # vertex_font_size=gt.prop_to_size(gt_g.vp.mut_count, 5, 20, power=1),
    ##### edges properties
    # edge_pen_width=gt.prop_to_size(gt_g.ep.weight, 5, 10, power=0.01, log=True),
    output=f"{figures_path}/sel_communities.pdf",
    # hide=0,
    update_layout=False,
    nodesfirst=False,
    # beta=0.7,
    # chord_scale=5,
    # display_props_size=16,
)


Unknown parameter: display_props


Unknown parameter: update_layout



# Network Analysis

## Level membership

In [None]:
if 0:
    fig = sel_gt.hsbm_plot_posterior()
    fig.show()