# Motivation

This notebook uses the standard network generated from the non-cancerous (non-tumour) dataset. It explores the subtypes derived from the non-tumour dataset.

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys
import igraph as ig
import pickle

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

import multiprocess as mp

# Make the project packages importable from this notebook.
# NOTE: abspath() of a bare filename resolves against the current working
# directory, so SCRIPT_DIR is the notebook's working directory; its parent is
# what gets added to the import path.
SCRIPT_DIR = os.path.dirname(os.path.abspath("pcgna_processing.py"))

# Location of the iNet_v2 sources. The original hard-coded path stays as the
# default; set the INET_SRC_DIR environment variable to run on another machine.
INET_SRC_DIR = os.environ.get("INET_SRC_DIR", '/Users/vlad/Documents/Code/York/iNet_v2/src/')

# Append only when missing, so re-running this cell does not keep growing sys.path.
for extra_path in (os.path.dirname(SCRIPT_DIR), INET_SRC_DIR):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.NetworkOutput import NetworkOutput
from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot, survival_comp
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
import NetworkAnalysis.utilities.clustering as cs


# Gsea libraries
import gseapy as gp
import matplotlib.pyplot as plt

import graph_tool.all as gt

# %matplotlib inline

# Default plotly theme for every figure created in this notebook.
pio.templates.default = "ggplot2"

# Re-running this cell used to leave the previous worker pool (and its
# processes) alive. Shut down any earlier pool before creating a new one.
_previous_pool = globals().get("pool")
if _previous_pool is not None:
    _previous_pool.terminate()
    _previous_pool.join()

# One worker per reported CPU.
pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [112]:
# ---- Paths (relative to the notebook's working directory) ----
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"

# Where figures / Morpheus exports of this notebook are written.
figures_path = "../network_II/standard/"

# Location of the SBM experiments inside the iNet_v2 checkout.
base_sbm_path = "../../../iNet_v2/"
sbm_exps_path = "results/sbm/"

# ---- Input tables ----
vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

# Mutation table: keep only rows with a non-zero "count" and index it by gene.
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

all_tum_tpms = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")


healthy_metadata = pd.read_csv(f"{data_base}/metadata/healthy_bladder_metadata.tsv", sep="\t", index_col="Sample", dtype_backend="pyarrow")

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")
else:
    # A later cell passes `tf_list` to sel_gt.add_gt_prop_draw(); without this
    # branch a missing file only surfaced there as a NameError. Keep the name
    # defined and say clearly what happened.
    print(f"WARNING: TF list not found at {tf_path}; continuing with an empty TF list")
    tf_list = np.array([], dtype="str")

# Load experiments

In [3]:
# Build the "v3" experiment set from the SBM results folder.
hsbm_v3 = ExperimentSet(
    "v3",
    base_path=base_sbm_path,
    exp_path=sbm_exps_path,
    mut_df=tcga_mutations_df,
    sel_sets=None,
    exp_type="iNet",
)

# Wrap every hSBM experiment in a graph-tool experiment, keyed by its type.
exps = {}
for idx, exp in enumerate(hsbm_v3.get_exps()):
    if exp.sbm_method == "hsbm":  # and idx == 2:
        print(f"Loading Graph-Tool for {exp.type}")
        exps[exp.type] = GtExp.from_pgcna_inet(exp, rel_path="")
        exps[exp.type].export_to_gephi(save=False)

# From here on the set only exposes the graph-tool wrapped hSBM experiments.
hsbm_v3.exps = exps

##### Experiment labels:  dict_keys(['standard_5K_6TF_sbm', 'standard_5K_6TF_hsbm', 'sigmoid_5K_6TF_hsbm'])
Loading Graph-Tool for standard_5K_6TF_hsbm
Loading Graph-Tool for sigmoid_5K_6TF_hsbm


In [6]:
# Reload edited project modules automatically before running code (IPython autoreload mode 2).
%autoreload 2
# Work on the "standard" hSBM experiment (the loader cell also produced a "sigmoid" one).
sel_gt: GtExp = hsbm_v3.exps["standard_5K_6TF_hsbm"]
# Nested block-model state stored with the experiment.
gt_state: gt.NestedBlockState = sel_gt.hstateObj["state"]

# NOTE(review): presumably attaches vertex properties (incl. mutation info) to the graph; confirm in GtExp.
sel_gt.hsbm_add_vp(mut_df=tcga_mutations_df)

gt_g = sel_gt.gt_g
# NOTE(review): "pv" / "pmode" look like posterior outputs of the hSBM fit; confirm their exact meaning.
pv = sel_gt.hstateObj["pv"]
pmode = sel_gt.hstateObj["pmode"]

# Community table; "max_b" is filled from the "P_lvl_0" column.
# NOTE(review): presumably P_lvl_0 is the lowest-level partition; confirm.
com_df, _ = sel_gt.hsbm_get_gt_df()
com_df["max_b"] = com_df["P_lvl_0"]

# Mutates gt_g in place, so everything below works on a graph without self-loops.
gt.remove_self_loops(gt_g)

######## ModCon and MEVS ########
sel_gt.gt_modCon_MEV(all_tpms=sel_gt.tpm_df)

# Compute and store the nodes/edges with all the information
sel_gt.export_to_gephi(save=False, com_df = com_df)
sel_gt.add_gt_prop_draw(gt_g, com_df=com_df, tf_list=tf_list)
graph_stats = sel_gt.compute_graph_stats()

# Extend the node table using the expression matrix.
# NOTE(review): the export cell below expects mean/median/std/var/count columns; confirm they come from add_stats_to.
nodes_df: pd.DataFrame = gh.add_stats_to(nodes_df=sel_gt.nodes_df, tpm_df=sel_gt.tpm_df)

In [7]:
# Disabled on purpose: flip the guard to write the per-node statistics to disk.
if False:
    node_export_cols = ["max_b", "ModCon_Rank", "mean", "median", "std", "var", "count", "ctrl_tf"]
    nodes_df[node_export_cols].to_csv(figures_path + "nodes_df_hsbm.tsv", sep="\t")
    # sel_gt.export_to_gephi(save=True, com_df=com_df[["max_b", "node_idx"]])

# Non-tum Analysis

## Export to Morpheus

In [72]:
# Cluster the non-tumour samples, then export metadata + cluster assignment +
# MEVs as one TSV for Morpheus.
cluster_label = "hsbm_std"  # label passed to run_clusters; also the suffix of its output column
n_clusters = 7  # which K-means result to export (NOTE(review): confirm run_clusters produces this k)

sel_gt.gt_modCon_MEV(all_tpms=sel_gt.tpm_df)
cs_exp, fig_std, metrics = gh.run_clusters(sel_gt, label=cluster_label, show_figs=False)

# Combine clustering with the Mevs
cluster_cols = [f"RawKMeans_CS_{n_clusters}_{cluster_label}"]
mevs_cs = pd.concat([cs_exp[cluster_cols], sel_gt.mevsMut], axis=1)

healthy_cols = healthy_metadata.columns
mevs_cols = sel_gt.mevsMut.columns

# Add the metadata. Sample "Y2796_P0" is excluded (reason not recorded in this notebook).
comb_df = pd.concat([healthy_metadata.drop("Y2796_P0"), mevs_cs], axis=1).dropna(how="all")

# Reorder columns: metadata, then cluster assignment, then MEVs.
# The result was previously computed and discarded; keep it so the export
# really uses this order.
comb_df = comb_df[list(healthy_cols) + list(cluster_cols) + list(mevs_cols)]

# save the file (samples become columns, as in the original export)
label = f"{sel_gt.type}"
filename = f"healthy_{label}_v3_1"
path = f"{figures_path}/Morpheus/non_tum/{filename}.tsv"

# to_csv does not create missing folders.
os.makedirs(os.path.dirname(path), exist_ok=True)
comb_df.transpose().to_csv(path, sep="\t", index=True)

Variation per principal component [0.47958678 0.32651145] and the sum 80.61%


## Import from Morpheus

In [157]:
# Read back the .gct file saved from Morpheus for the export above.
morpheus_path = f"{figures_path}/Morpheus/non_tum/{filename}.gct"

# The first two lines of the file are skipped
# (NOTE(review): presumably the GCT version and dimension header lines).
morp_df = pd.read_csv(morpheus_path, sep="\t", skiprows=2)

# Flip to one row per sample, using the "id" column as the new column labels.
columns = morp_df["id"]
morp_df = morp_df.T
morp_df.columns = columns

# Drop the first two transposed rows (they are not samples) and name the column axis.
morp_df = morp_df.iloc[2:, :].rename_axis("sample", axis="columns")

# Group 4 has just one sample
keep_rows = morp_df["dendrogram_cut"] != "4.00"
morp_df = morp_df[keep_rows]

In [158]:
# Columns compared in the Sankey plot, left to right.
# ("RawKMeans_CS_7_rwd_mut" is intentionally left out.)
reorder_cols = ["NHU_differentiation", "dendrogram_cut", "subset_name", "Gender"]

# Missing annotations are shown as an explicit "NA" category.
meta, sky_fig = sky.main(
    df=morp_df.fillna("NA"),
    reorder_cols=reorder_cols,
    title="Non-tum",
    retMeta=True,
)
sky_fig.show()

In [167]:
# Export for DEA: write the processed Morpheus table, one row per sample.
dea_export_path = f"{figures_path}/Morpheus/non_tum/prcsd_morpheus_cs_7.tsv"
morp_df.to_csv(dea_export_path, sep="\t", index_label="sample")

# MIBC Analysis

## Export to Morpheus

In [17]:
# Samples flagged "from observations" (currently NOT removed):
# samples_to_remove = ["TCGA-5N-A9KI", "TCGA-GV-A3QH", "TCGA-FD-A3SR", "TCGA-GV-A3QI", "TCGA-UY-A9PA", "TCGA-G2-A2EF", "TCGA-E7-A677", "TCGA-DK-A3IV"]

# pre-processing: work on a copy of the tumour TPMs
# df = all_tum_tpms.drop(columns=samples_to_remove)
df = all_tum_tpms.copy(deep=True)

# Align the mutation table with the network genes and the tumour samples.
mut_df = tcga_mutations_df.copy(deep=True)

# Network genes that have no row in the mutation table are added as all-zero rows.
missing_genes = list(set(sel_gt.tpm_df.index) - set(mut_df.index))
full_gene_index = mut_df.index.union(missing_genes)
mut_df = (
    mut_df.reindex(full_gene_index, fill_value=0)
    .loc[sel_gt.tpm_df.index]
    .drop(columns="count")
)

# Keep only the samples present in the TPM matrix, in its column order.
mut_df = mut_df[all_tum_tpms.columns]

In [None]:
# Folder holding the Morpheus outputs for the tumour (MIBC) cohort.
morpheus_path = f"{figures_path}/v3/Morpheus/v3.1"

# The first two lines of the .gct file are skipped
# (NOTE(review): presumably the GCT version and dimension header lines).
morp_df = pd.read_csv(f"{morpheus_path}/norm_mevsMut_sigmoid_v3_2_5K.gct", sep="\t", skiprows=2)

# Flip to one row per sample, using the "id" column as the new column labels.
columns = morp_df["id"]
morp_df = morp_df.T
morp_df.columns = columns

# Drop the first two transposed rows (they are not samples) and name the column axis.
morp_df = morp_df.iloc[2:, :].rename_axis("sample", axis="columns")

# Classifications compared in the Sankey plot, left to right.
# ("KMeans_labels_6" and "RawKMeans_CS_7_rwd_mut" are intentionally left out.)
reorder_cols = ["TCGA_2017_AM_remap", "dendrogram_cut", "2019_consensus_classifier"]

meta, sky_fig = sky.main(
    df=morp_df,
    reorder_cols=reorder_cols,
    title="{}. Comp between {} ".format("SBM derived ", ", ".join(reorder_cols)),
    retMeta=True,
)
sky_fig.show()

# Network Analysis

## Level membership

In [None]:
# Disabled on purpose: flip the guard to draw the hSBM posterior plot.
if False:
    fig = sel_gt.hsbm_plot_posterior()
    fig.show()