# Motivation

The purpose of this notebook is to introduce a simpler method for extracting the 98 TFs that yields the same, or very similar, results as the previous approach.

In [2]:
%load_ext autoreload
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import multiprocess as mp


# own libraries
# NOTE(review): hard-coded absolute path to one developer's checkout; the
# NetworkAnalysis imports below presumably resolve through it, so this cell
# will likely fail on any other machine - confirm and make it configurable.
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import clustering as cs
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
from NetworkAnalysis.dea import dea
from NetworkAnalysis.dea import helpers as dea_hp
from NetworkAnalysis.dea import gsea as gsea_hp
# NOTE(review): os.path.dirname("../../src") evaluates to "../..", so this
# appends the parent of src rather than src itself. It also runs after the
# NetworkAnalysis imports above, so it cannot influence them - confirm intent.
sys.path.append(os.path.dirname("../../src"))
# Gsea libraries (nothing is imported under this heading in this cell)

# Use the "ggplot2" template as the default for every plotly figure.
pio.templates.default = "ggplot2"

# Worker pool sized to the CPU count reported by `multiprocess`.
# NOTE(review): no close()/terminate() call is visible in this part of the
# notebook, and re-running this cell creates a new pool each time.
pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Locations used by the cells below. All of them are relative paths, so they
# resolve against the directory the notebook kernel is running in.
# `data_base` and `base_path` end with "/" and are later joined with another
# "/" inside f-strings, which yields doubled separators in the final paths.
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"
exp_folder_tumour = "network_I/tum/"  # "/integration_v2.1/ - path from iNET


# Experiments set
# Folders relative to `base_path`; they are passed to load_experiment_set /
# os.walk further down in the notebook.
exp_folder_h42 = "network_I/healthy42/"
exp_folder_h42_ctrl = "network_I/healthyControls/"


# NOTE(review): `results_path`, `exp_folder_tumour` and `figures_path` are not
# referenced in the visible part of the notebook - presumably used later.
figures_path = "./sel_pruning_v2/"

# VU clustering table from the metadata folder, indexed by sample.
vu_output = pd.read_csv(
    f"{data_base}/metadata/VU_clustering_v3.tsv",
    sep="\t",
    index_col="Sample",
)

# TCGA mutations: keep only rows with a non-zero count, then index by gene.
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
has_mutations = tcga_mutations_df["count"] != 0
tcga_mutations_df = tcga_mutations_df.loc[has_mutations].set_index("gene")

# Tumour TPM tables (two versions of the gene selection), indexed by gene.
tum_tpms = pd.read_csv(
    f"{data_base}/tumour/TPMs_selected_genes_v3_13k_gc42.tsv",
    sep="\t",
    index_col="genes",
)
tum_tpms_v4 = pd.read_csv(
    f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv",
    sep="\t",
    index_col="genes",
)

# Healthy and metadata
# Sample "Y2796_P0" is dropped from the metadata right after loading.
healthy_metadata = pd.read_csv(
    f"{data_base}/non_cancerous/healthy_bladder_metadata.tsv",
    sep="\t",
    index_col="Sample",
)
healthy_metadata = healthy_metadata.drop(index=["Y2796_P0"])
# Sample ids with "-" replaced by "_", the same transformation that is applied
# to the healthy TPM column names below.
healthy_metadata["sample_prcsd"] = healthy_metadata.index.str.replace("-", "_")

healthy_tpm = pd.read_csv(
    f"{data_base}/non_cancerous/healthy_data_all_gc42_v4.tsv",
    sep="\t",
    index_col="gene",
)
# Map every original column name to its "-" -> "_" version and rename in one go.
remap_col = {col: col.replace("-", "_") for col in healthy_tpm.columns}
healthy_tpm = healthy_tpm.rename(columns=remap_col)

# tf list
# Read from a tab-delimited text file; the first line is skipped as a header.
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")
else:
    # Without this branch a missing file left `tf_list` undefined on a fresh
    # kernel (NameError far from the cause) or silently kept a stale value from
    # an earlier run. Keep the best-effort behaviour, but make it visible.
    warnings.warn(f"TF list not found at {tf_path!r}; continuing with an empty TF list.")
    tf_list = np.array([], dtype="str")

In [4]:
# Baseline: these are the results from previous work.
sel_tfs = pd.read_csv(f"{data_base}/tf_ctrl.csv", index_col="gene")

# Tumour TPM rows for the baseline genes, looked up once and then summarised
# per gene (row-wise, across all columns of the v4 tumour table).
sel_tfs_tum_tpms = tum_tpms_v4.loc[sel_tfs.index]
sel_tfs["tum_mean_expression"] = sel_tfs_tum_tpms.mean(axis=1)
sel_tfs["tum_median_expression"] = sel_tfs_tum_tpms.median(axis=1)
sel_tfs["tum_std_expression"] = sel_tfs_tum_tpms.std(axis=1)
sel_tfs["tum_var_expression"] = sel_tfs_tum_tpms.var(axis=1)

## Import the data

## Experiment sets

In [64]:
def load_experiment_set(base_path, exp_folder_path):
    """Load every network-stats table found in an experiment's ``Stats`` folder.

    Args:
        base_path: Root directory that contains the experiment folders.
        exp_folder_path: Experiment folder, relative to ``base_path``.

    Returns:
        A dict mapping experiment name to a DataFrame indexed by ``gene``.
        The experiment name is the file name with ".tsv" and
        "networkStats_standard_int_" removed. The dict is empty when the
        ``Stats`` folder does not exist or holds no matching file.
    """
    stats_dir = f"{base_path}/{exp_folder_path}/Stats/"
    # Only the top level of the folder is listed; if the folder is missing,
    # os.walk yields nothing and the default supplies an empty file list.
    _, _, file_names = next(os.walk(stats_dir), (None, None, []))

    # Keep only the network stats files and read each one as a TSV table.
    return {
        name.replace(".tsv", "").replace("networkStats_standard_int_", ""): pd.read_csv(
            f"{stats_dir}/{name}", index_col="gene", sep="\t"
        )
        for name in file_names
        if "networkStats" in name
    }

In [65]:
# Healthy-control experiments: every immediate sub-folder of the controls
# directory is loaded as its own experiment set, keyed by folder name.
# A missing controls directory yields no folders and an empty `exp_ctrl`.
folders = next(os.walk(base_path + exp_folder_h42_ctrl), (None, None, []))[1]
exp_ctrl = {}
for folder in folders:
    # Path relative to `base_path`; load_experiment_set prepends `base_path`
    # and appends "Stats/". The previous version also walked this path on its
    # own with a leading "/" (i.e. from the filesystem root, ignoring
    # `base_path`) to build `meta_files`/`files`, which were never used -
    # that dead code is removed and the leading slash dropped.
    hCtrl_path = f"{exp_folder_h42_ctrl}/{folder}/"
    exp_ctrl[folder] = load_experiment_set(base_path, hCtrl_path)

# The healthy (h42) experiment set, loaded with the same helper.
exp_set = load_experiment_set(base_path, exp_folder_h42)

## Network stats

Information calculated during selective edge pruning.