# Motivation

The purpose of this notebook is to test if we can find the 98 TFs in the tumour networks.

It also generate the control lists of the TFs

In [1]:
%load_ext autoreload
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import multiprocess as mp


# own libraries
sys.path.append('/Users/vlad/Developer/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.NetworkOutput import NetworkOutput

from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import clustering as cs
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
from NetworkAnalysis.dea import dea
from NetworkAnalysis.dea import helpers as dea_hp
from NetworkAnalysis.dea import gsea as gsea_hp
from NetworkAnalysis.utilities import pre_processing as pre

# Gsea libraries

pio.templates.default = "ggplot2"

pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
data_base = "../../data/"
base_path = "../../results/"
exp_folder_h47_ctrl = '../results/network_I/gc_47/tum_ctrls/'
tcga_data = "../../data/tumour/"

exp_folder_tumour = "network_I/gc_47/tum/"  # "/integration_v2.1/tum/" - path from iNET
exp_folder_tumour_42 = "network_I/gc_42/tum/"  # "/integration_v2.1/tum/" - path from iNET

# figures_path = base_path + exp_folder_tumour + "Figures/"
figures_path = "controls/"

vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

# prep mut
tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

all_tum_tpms = pd.read_csv(f"{data_base}/tumour/TPMs_selected_genes_v3_13k_gc42.tsv", sep="\t", index_col="genes")
all_tum_tpms_v4 = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")

tum_tpms_gc47 = pd.read_csv(f"{data_base}/tumour/gc47_tpms_selected_14k.tsv", sep="\t", index_col="gene")

tcga_metadata_df = pd.read_csv(
    f"{tcga_data}/metadata_tcga_v2.csv"
)  # This version contains some small modifications on the spreadsheet such as removing duplicates of 01B as some were 01b and others 01B
consensus_classifier = pd.read_csv(f"{tcga_data}/consensus_classifier_comparisons.tsv", sep="\t")

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")


sel_tfs = pd.read_csv(f"{data_base}/tf_ctrl.csv", index_col="gene")
sel_tfs["tum_mean_expression"] = all_tum_tpms_v4.loc[sel_tfs.index].mean(axis=1)
sel_tfs["tum_median_expression"] = all_tum_tpms_v4.loc[sel_tfs.index].median(axis=1)
sel_tfs["tum_std_expression"] = all_tum_tpms_v4.loc[sel_tfs.index].std(axis=1)
sel_tfs["tum_var_expression"] = all_tum_tpms_v4.loc[sel_tfs.index].var(axis=1)

## Differences in gencode

In [5]:
# After aggressive gene filtering
cmn_genes_all = set(all_tum_tpms.index) & set(tum_tpms_gc47.index)
print(f"Genes that are cmn in gc42 and gc47 w/ all the ~14k genes: {len(cmn_genes_all)}")

Genes that are cmn in gc42 and gc47 w/ all the ~14k genes: 12033


In [24]:
tpms_gc42_iNet = pd.read_csv('/Users/vlad/Developer/York/iNet_v2/data/TPMs_selected_genes_v3_13k_gc42.tsv', index_col='gene', sep='\t')
print(f'Are the datasets the same? {len(set(tpms_gc42_iNet.index) & set(all_tum_tpms))}')

corr_tpm_42 = pd.read_csv('/Users/vlad/Developer/York/iNet_v2/results/gc_42/tum/Processed/genes_5000/corr_standard.tsv', sep='\t', index_col='gene')

Are the datasets the same? 0


In [40]:
top_5000_iNet = (tpms_gc42_iNet.std(axis=1) / tpms_gc42_iNet.median(axis=1)).sort_values(ascending=False).index.values[:5000]
top_5000_gc42 = (all_tum_tpms.std(axis=1) / all_tum_tpms.median(axis=1)).sort_values(ascending=False).index.values[:5000]
top_5000_gc47 = (tum_tpms_gc47.std(axis=1) / tum_tpms_gc47.median(axis=1)).sort_values(ascending=False).index.values[:5000]

print(f"Genes shared: gc42 vs iNet v2: {len(set(top_5000_iNet) & set(corr_tpm_42))}")

print(f"Genes shared: gc47 vs iNet v2: {len(set(top_5000_gc47) & set(corr_tpm_42))}")


Genes shared: gc42 vs iNet v2: 5000
Genes shared: gc47 vs iNet v2: 3842


In [53]:
shared_genes = set(top_5000_gc47) & set(tf_list)
print(f"Shared genes gc47 & tf: {len(shared_genes)}")

shared_genes = set(top_5000_gc42) & set(tf_list)
print(f"Shared genes gc42 & tf: {len(shared_genes)}")

shared_genes = set(corr_tpm_42.index) & set(tf_list)
print(f"Shared genes Corr gc42 & tf: {len(shared_genes)}")

Shared genes gc47 & tf: 324
Shared genes gc42 & tf: 359
Shared genes Corr gc42 & tf: 359


## Load experiment sets

In [57]:
%autoreload 2
base_path = "../../results/"

tum_47 = ExperimentSet("tum", base_path, exp_folder_tumour, tcga_mutations_df, sel_sets = ["5K",], rel_path="../", exp_type='iNet')
sel_exp_47: NetworkOutput = tum_47.exps['standard_5K_6TF_hsbm']

tum_42 = ExperimentSet("tum", base_path, exp_folder_tumour_42, tcga_mutations_df, sel_sets = ["5K",], rel_path="../", exp_type='iNet')
sel_exp_42: NetworkOutput = tum_42.exps['standard_5K_6TF_hsbm']

%autoreload 2
tum_42.export_to_gephi(save=False)
tum_47.export_to_gephi(save=False)

##### Experiment labels:  dict_keys(['standard_5K_8TF_hsbm', 'standard_5K_3TF_hsbm', 'standard_5K_11TF_hsbm', 'standard_5K_9TF_hsbm', 'standard_5K_6TF_hsbm', 'standard_5K_10TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_4TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_12TF_hsbm'])
##### Experiment labels:  dict_keys(['standard_5K_12TF_hsbm', 'standard_5K_11TF_hsbm', 'standard_5K_4TF_hsbm', 'standard_5K_9TF_hsbm', 'standard_5K_10TF_hsbm', 'standard_5K_5TF_hsbm', 'standard_5K_3TF_hsbm', 'standard_5K_8TF_hsbm', 'standard_5K_7TF_hsbm', 'standard_5K_6TF_hsbm'])


In [66]:
diff_genes = set(sel_exp_47.tpm_df.index) - set(sel_exp_42.tpm_df.index)
print(f"gc42 - gc47 .tpm_df genes diff: {len(diff_genes)}")

tf_42 = set(tf_list) & set(sel_exp_42.tpm_df.index)
print(f"tf_lust & gc42 .tpm_df genes cmn: {len(tf_42)}")

tf_47 = set(tf_list) & set(sel_exp_47.tpm_df.index)
print(f"tf_lust & gc47 .tpm_df genes cmn: {len(tf_47)}")

gc42 - gc47 .tpm_df genes diff: 1158
tf_lust & gc42 .tpm_df genes cmn: 359
tf_lust & gc47 .tpm_df genes cmn: 324


## gc42 vs gc47

## Generate controls

In [54]:
if True:

    print("######## Runing TF control (random genes) ########")
    import random

    for i in range(1, 11, 1):
        used_5K_genes = top_5000_gc42

        # not all tf maybe in the dataset
        tf_not_found = set(tf_list) - set(used_5K_genes)
        non_tf = list(set(used_5K_genes) - set(tf_list))

        # randomnly select n genes from the list to be used as control.
        #  we substract the number of tf that are not presented in order to keep it as the same number for tf
        random_genes = random.sample(non_tf, len(tf_list) - len(tf_not_found))

        # override tfList
        random_tf = list(random_genes)
        pd.DataFrame(random_tf, columns=["gene"]).to_csv(figures_path + "gc_42/TF_rand_{}_v1.tsv".format(i), index=False)

######## Runing TF control (random genes) ########


In [6]:
net_47 = sel_exp.tpm_df.index
tf_47 = set(net_47) & set(tf_list)
print(f"TFs in CS 47 {len(tf_47)}")

small_basal = ['KLF5', 'TP63', 'BCL6', 'SPEN', 'SP1', 'TCF20', 'MAFG', 'ZNF609']
tf_47 = set(net_47) & set(small_basal)

print(f"Small Basal TFs in CS 42 {len(tf_47)}/{len(small_basal)}")


TFs in CS 47 324
Small Basal TFs in CS 42 2/8


## Load controls

In [7]:
def load_experiment_set(base_path, exp_folder_path):
    exp_path = f"{base_path}/{exp_folder_path}/Stats/"
    meta_files = next(os.walk(exp_path), (None, None, []))[2]
    # keep only the network stats
    files = [file for file in meta_files if "networkStats" in file]
    exp_set = {}
    for file in files:
        exp = file.replace(".tsv", "").replace("networkStats_tum_standard_", "")
        exp_set[exp] = pd.read_csv(f"{exp_path}/{file}", index_col="gene", sep="\t")
    return exp_set

In [8]:
folders = next(os.walk(base_path + exp_folder_h47_ctrl), (None, None, []))[1]
exp_ctrls = {}
for folder in folders:
    hCtrl_path = f"/{exp_folder_h47_ctrl}/{folder}/"
    meta_files = next(os.walk(hCtrl_path), (None, None, []))[2]
    # keep only the network stats
    files = [file for file in meta_files if "networkStats" in file]
    exp_ctrls[folder] = load_experiment_set(base_path, hCtrl_path)


exp_set = load_experiment_set(base_path, exp_folder_tumour)