# Motivation

The purpose of this notebook is to test if we can find the 98 TFs in the tumour networks.

It also generates the control lists for the TFs.

In [2]:
%load_ext autoreload
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import multiprocess as mp


# own libraries
sys.path.append('/Users/vlad/Developer/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.NetworkOutput import NetworkOutput

from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import clustering as cs
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
from NetworkAnalysis.dea import dea
from NetworkAnalysis.dea import helpers as dea_hp
from NetworkAnalysis.dea import gsea as gsea_hp
# Gsea libraries

pio.templates.default = "ggplot2"

pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
data_base = "../../data/"
base_path = "../../results/"
exp_folder_tumour = "network_I/gc_47/tum/"  # "/integration_v2.1/tum/" - path from iNET

# figures_path = base_path + exp_folder_tumour + "Figures/"
figures_path = "controls/"

vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

# prep mut: keep only the rows with a non-zero "count" and index them by gene
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

all_tum_tpms = pd.read_csv(f"{data_base}/tumour/TPMs_selected_genes_v3_13k_gc42.tsv", sep="\t", index_col="genes")
all_tum_tpms_v4 = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")

# tf list
# The cells below use `tf_list` unconditionally, so a missing file is reported
# here instead of surfacing later as a NameError (or as a stale value left in
# the kernel by an earlier run).
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if not os.path.exists(tf_path):
    raise FileNotFoundError(f"TF list not found: {tf_path}")
tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")


# Per-gene expression statistics of the selected TFs, computed across the
# columns of the v4 tumour TPM table (axis=1).
sel_tfs = pd.read_csv(f"{data_base}/tf_ctrl.csv", index_col="gene")
sel_tfs_tpms = all_tum_tpms_v4.loc[sel_tfs.index]  # slice once, reuse for every statistic
sel_tfs["tum_mean_expression"] = sel_tfs_tpms.mean(axis=1)
sel_tfs["tum_median_expression"] = sel_tfs_tpms.median(axis=1)
sel_tfs["tum_std_expression"] = sel_tfs_tpms.std(axis=1)
sel_tfs["tum_var_expression"] = sel_tfs_tpms.var(axis=1)

## Load experiment sets

In [4]:
# Folder of the tumour network experiment; it is passed to ExperimentSet together with base_path.
# NOTE(review): same value as already assigned in the data-loading cell above - this cell looks redundant.
exp_folder_tumour = "network_I/gc_47/tum/"  # "/integration_v2.1/tum/" - path from iNET


In [6]:
%autoreload 2
base_path = "../../results/"

tum = ExperimentSet("tum", base_path, exp_folder_tumour, tcga_mutations_df, sel_sets = ["5K",], rel_path="../", exp_type='iNet')
# p0 = ExperimentSet("p0", base_path, exp_folder_p0, tcga_mutations_df, sel_sets = ["4K"], rel_path="../")

%autoreload 2
tum.export_to_gephi(save=False)
# p0.export_to_gephi(save=False)

##### Experiment labels:  dict_keys(['standard_5K_13TF_hsbm'])


In [7]:
import random

sel_exp: NetworkOutput = tum.exps['standard_5K_13TF_hsbm']

print("######## Runing TF control (random genes) ########")

# A dedicated, seeded generator makes the control lists reproducible: running
# this cell again regenerates exactly the same files and the global `random`
# state is left untouched.
CONTROL_SEED = 42
N_CONTROL_LISTS = 10
control_rng = random.Random(CONTROL_SEED)

# The gene universe is identical for every draw, so it is computed once
# instead of once per iteration.
used_5K_genes = sel_exp.tpm_df.index.values
network_genes = set(used_5K_genes)
tf_names = set(tf_list)

# not all TFs may be in the dataset
tf_not_found = tf_names - network_genes
tf_found = tf_names & network_genes

# Sorted on purpose: the iteration order of a set of strings can change from
# one interpreter run to the next, which would defeat the fixed seed.
non_tf = sorted(network_genes - tf_names, key=str)

# to_csv does not create missing folders.
os.makedirs(figures_path, exist_ok=True)

for i in range(1, N_CONTROL_LISTS + 1):
    # Randomly select as many non-TF genes as there are TFs present in the
    # network. The size is taken from the intersection, so duplicated entries
    # in tf_list cannot inflate it.
    random_genes = control_rng.sample(non_tf, len(tf_found))

    # override tfList
    random_tf = list(random_genes)
    pd.DataFrame(random_tf, columns=["gene"]).to_csv(figures_path + "TF_rand_{}_v1.tsv".format(i), index=False)

######## Runing TF control (random genes) ########


In [8]:
# Genes of the selected tumour network (index of its TPM table).
net_47 = sel_exp.tpm_df.index
net_47_genes = set(net_47)

# TFs from the TF list that are present in the network.
tf_47 = net_47_genes & set(tf_list)
print(f"TFs in CS 47 {len(tf_47)}")

small_basal = ['KLF5', 'TP63', 'BCL6', 'SPEN', 'SP1', 'TCF20', 'MAFG', 'ZNF609']
# Kept in its own variable so that tf_47 still holds the full TF overlap.
small_basal_47 = net_47_genes & set(small_basal)

# The label used to read "CS 42" although the count is taken against net_47,
# the same network as in the line printed above.
print(f"Small Basal TFs in CS 47 {len(small_basal_47)}/{len(small_basal)}")


TFs in CS 47 324
Small Basal TFs in CS 42 2/8
