# Motivation

The purpose of this notebook is to test whether the 98 TFs can be found in the tumour networks.

In [9]:
%load_ext autoreload
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import multiprocess as mp


# own libraries
# The source directory can be overridden with the INET_SRC environment
# variable so the notebook is not tied to one machine; the original
# location is kept as the default.
INET_SRC_DIR = os.environ.get("INET_SRC", '/Users/vlad/Developer/York/iNet_v2/src/')
if INET_SRC_DIR not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(INET_SRC_DIR)

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.NetworkOutput import NetworkOutput

from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import clustering as cs
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
from NetworkAnalysis.dea import dea
from NetworkAnalysis.dea import helpers as dea_hp
from NetworkAnalysis.dea import gsea as gsea_hp
# Gsea libraries

# Use the ggplot2 look for every plotly figure created below.
pio.templates.default = "ggplot2"

# Worker pool with one process per CPU reported by multiprocess.
pool = mp.Pool(processes=mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Root folders for inputs and results (relative to this notebook).
data_base, base_path = "../../data/", "../../results/"

# Tumour experiment folder inside base_path.
exp_folder_tumour = "network_I/tum/"  # "/integration_v2.1/tum/" - path from iNET

# Figure output folder.
# figures_path = base_path + exp_folder_tumour + "Figures/"
figures_path = "tum_modifiers/"

# VU clustering metadata table, indexed by the "Sample" column.
vu_output = pd.read_csv(
    f"{data_base}/metadata/VU_clustering_v3.tsv",
    index_col="Sample",
    sep="\t",
)

# prep mut: drop rows whose "count" is zero, then index the table by gene
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df.loc[tcga_mutations_df["count"].ne(0)].set_index("gene")

# Tumour TPM tables (tab-separated), indexed by the "genes" column.
all_tum_tpms = pd.read_csv(
    f"{data_base}/tumour/TPMs_selected_genes_v3_13k_gc42.tsv",
    index_col="genes",
    sep="\t",
)
all_tum_tpms_v4 = pd.read_csv(
    f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv",
    index_col="genes",
    sep="\t",
)

# tf list
# Read the tab-separated TF names file (first row skipped as a header).
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")
else:
    # Bind tf_list explicitly so a missing file cannot surface later as a
    # NameError, or silently reuse a stale value from an earlier kernel run.
    warnings.warn(f"TF list not found at {tf_path}; using an empty TF list.")
    tf_list = np.array([], dtype="str")


# Selected TFs (indexed by gene) with summary statistics of their expression
# across the columns of the v4 tumour TPM table.
sel_tfs = pd.read_csv(f"{data_base}/tf_ctrl.csv", index_col="gene").assign(
    tum_mean_expression=lambda tfs: all_tum_tpms_v4.loc[tfs.index].mean(axis=1),
    tum_median_expression=lambda tfs: all_tum_tpms_v4.loc[tfs.index].median(axis=1),
    tum_std_expression=lambda tfs: all_tum_tpms_v4.loc[tfs.index].std(axis=1),
    tum_var_expression=lambda tfs: all_tum_tpms_v4.loc[tfs.index].var(axis=1),
)

## Load experiment sets

In [3]:
%autoreload 2

tum = ExperimentSet("tum", base_path, exp_folder_tumour, tcga_mutations_df, sel_sets = ["5K",], rel_path="../")
# p0 = ExperimentSet("p0", base_path, exp_folder_p0, tcga_mutations_df, sel_sets = ["4K"], rel_path="../")

%autoreload 2
tum.export_to_gephi(save=False)
# p0.export_to_gephi(save=False)

##### Experiment labels:  dict_keys(['standard_5K_50TF', 'beta_5K_50TF', 'norm2_5K_50TF', 'norm3_5K_50TF', 'standard_5K_6TF', 'standard_5K_5TF', 'standard_5K_4TF', 'standard_5K_3TF', 'norm3_5K_5TF', 'norm3_5K_4TF', 'norm3_5K_6TF', 'beta_5K_3TF', 'beta_5K_4TF', 'beta_5K_6TF', 'norm3_5K_3TF', 'standard_5K_7TF', 'standard_5K_8TF', 'beta_5K_7TF', 'standard_5K_9TF', 'standard_5K_10TF', 'norm3_5K_7TF', 'norm3_5K_8TF', 'beta_5K_8TF', 'norm3_5K_9TF', 'beta_5K_9TF', 'beta_5K_10TF', 'norm3_5K_10TF', 'beta_5K_5TF'])


## Check whether `sel_tfs` are present in `nodes_df`

In [21]:
# Pick one tumour experiment and work on a private deep copy of its node table.
sel_exp: NetworkOutput = tum.exps['norm3_5K_10TF']
nodes_df: pd.DataFrame = sel_exp.nodes_df.copy(deep=True)

# Append the graph statistics as extra columns (presumably one row per node
# -- TODO confirm against NetworkOutput.compute_graph_stats).
graph_stats = sel_exp.compute_graph_stats()
nodes_df = pd.concat([nodes_df, graph_stats], axis=1)

# Label every node: 'TF' when its index is in sel_tfs, 'standard' otherwise.
nodes_df['type'] = np.where(nodes_df.index.isin(sel_tfs.index), 'TF', 'standard')

sig_exp_genes = [
    'KLF5', 'TP63', 'BCL6', 'SPEN',
    'SP1', 'TCF20', 'MAFG', 'ZNF609',
]



In [23]:
# Number of nodes per 'type' label; the recorded output below shows
# 67 'TF' nodes and 4933 'standard' nodes.
nodes_df['type'].value_counts()

type
standard    4933
TF            67
Name: count, dtype: int64

In [22]:
# Box plot of the 'degree' column per node type, drawing every node as a point.
px.box(
    nodes_df,
    x='type',
    y='degree',
    color='type',
    points='all',
)