# Motivation

This notebook analyses the impact of choosing different gene-expression filtering thresholds. Two strategies are compared:

1. permissive (standard in literature) - considers a gene to be expressed if it is present in at least 10% of the samples
2. aggressive (used in the project) - considers a gene to be expressed if it is present in at least 90% of the samples

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import dash_bio
import plotly.figure_factory as ff
from scipy.spatial.distance import pdist, squareform

import multiprocess as mp

# own libraries
# NOTE(review): absolute, machine-specific path. It only resolves on the
# original author's machine; on any other machine NetworkAnalysis must be
# importable by some other means (installed package, different sys.path entry).
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
# os.path.dirname("../../src") evaluates to "../..", so the directory that
# CONTAINS src is what gets appended here, not src itself.
# Presumably older pickles reference modules as src.<...> - TODO confirm.
sys.path.append(os.path.dirname("../../src")) # needed for older pickler

from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig

from NetworkAnalysis.utilities import pre_processing as pre
from NetworkAnalysis.utilities import modelling as md


# Default plotly template for every figure created after this point.
pio.templates.default = "ggplot2"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [52]:
# --- Locations, all relative to the notebook's working directory ---
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"
exp_folder_tumour = "network_I/tum/"
tcga_data = "../../data/tumour/"

# Folder handed to save_fig when figures are written.
figures_path = "gene_selection/"

# --- Load the data ---
# Clustering metadata, indexed by the "Sample" column.
vu_output = pd.read_csv(
    f"{data_base}/metadata/VU_clustering_v3.tsv",
    sep="\t",
    index_col="Sample",
)

# Mutation table: drop rows whose "count" is 0, then index by gene.
tcga_mutations_df = (
    pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
    .loc[lambda mutations: mutations["count"] != 0]
    .set_index("gene")
)
consensus_classifier = pd.read_csv(
    f"{tcga_data}/consensus_classifier_comparisons.tsv",
    sep="\t",
)

# Expression table (TPMs), indexed by the "genes" column.
all_tum_tpms = pd.read_csv(
    f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv",
    sep="\t",
    index_col="genes",
)

# Known marker lists; the "Unnamed: 0" column is discarded after reading.
all_markers = pd.read_csv(f"{data_base}/known_markers.tsv", sep="\t")
all_markers = all_markers.drop(columns="Unnamed: 0")

# TCGA sample metadata.
tcga_metadata_df = pd.read_csv(f"{tcga_data}/metadata_tcga_v2.csv")

# IFNG signature: the non-missing entries of the "sb_ifng" marker column.
infg_sig = list(
    all_markers["sb_ifng"].dropna().values
)

# Tumour comparisons

In [33]:
def _prep_and_rank(at_least_good):
    """Run ``pre.prep_data`` at one expression threshold and rank the genes.

    The permissive and aggressive pipelines are identical except for the
    ``at_least_good`` value, so both go through this single helper; that way
    the two runs cannot silently drift apart.

    Parameters
    ----------
    at_least_good : float
        Threshold forwarded to ``pre.prep_data``. According to the notebook
        introduction this is the fraction of samples in which a gene must be
        present (0.1 = permissive, 0.9 = aggressive).

    Returns
    -------
    tuple
        The five values returned by ``pre.prep_data``, in the same order. The
        first one (the filtered expression frame) gains two columns:

        * ``median_std`` - row-wise standard deviation divided by the
          row-wise median (despite the name, this is std / median).
        * ``rank_median_std`` - rank of ``median_std`` in descending order
          (rank 1 = largest ratio); ties are broken by order of appearance.
    """
    expr_df, working, metadata, genes, samples = pre.prep_data(
        all_tum_tpms.reset_index(),
        tcga_metadata_df.copy(deep=True),  # deep copy: keep the loaded metadata untouched
        consensus_classifier,
        remap_cols=False,
        at_least_good=at_least_good,
        num_genes=all_tum_tpms.shape[0],  # keep every gene that passes the filter
    )
    # Both statistics are computed before either column is added, so the new
    # columns never feed into the row-wise std / median.
    expr_df["median_std"] = expr_df.std(axis=1) / expr_df.median(axis=1)
    expr_df["rank_median_std"] = expr_df["median_std"].rank(ascending=False, method="first")
    return expr_df, working, metadata, genes, samples


# Permissive filtering.
data_permisive, working_tpm, raw_metadata_t, selected_genes, common_samples = _prep_and_rank(0.1)

# Aggressive filtering.
# NOTE: working_tpm, raw_metadata_t, selected_genes and common_samples are
# overwritten here, so after this cell they describe the aggressive run only.
data_aggresive, working_tpm, raw_metadata_t, selected_genes, common_samples = _prep_and_rank(0.90)

For th 0.1 ==> at least non-NAN values 40
####### Gene selection, num genes: 32376 #######
The genes selected by the highest standard deviation/median ration.
24453
For th 0.9 ==> at least non-NAN values 364
####### Gene selection, num genes: 32376 #######
The genes selected by the highest standard deviation/median ration.
13373


In [57]:
def _signature_rank_table(expr_df, signature, label):
    """Return the rank columns for the genes of ``signature``, tagged with ``label``.

    Parameters
    ----------
    expr_df : pandas.DataFrame
        Frame indexed by gene that contains the ``median_std`` and
        ``rank_median_std`` columns.
    signature : list
        Gene identifiers to keep; genes absent from ``expr_df`` are ignored.
    label : str
        Value written to the ``type`` column of the result.

    Returns
    -------
    pandas.DataFrame
        Rows of ``expr_df`` whose index is in ``signature``, restricted to the
        two rank columns, sorted by ascending ``rank_median_std``, plus a
        constant ``type`` column.
    """
    in_signature = expr_df.index.isin(signature)
    # Single .loc call (row mask + column list) instead of chained indexing.
    table = expr_df.loc[in_signature, ["median_std", "rank_median_std"]].sort_values(
        "rank_median_std", ascending=True
    )
    table["type"] = label
    return table


ifng_permisive = _signature_rank_table(data_permisive, infg_sig, "Permisive")
ifng_aggresive = _signature_rank_table(data_aggresive, infg_sig, "Aggresive")

# Long-format frame for plotting: aggressive rows first, then permissive;
# reset_index turns the gene index into a regular column.
dmy_df = pd.concat([ifng_aggresive, ifng_permisive], axis=0).reset_index()

In [59]:
# Grouped bars: one group per gene, one bar per filtering strategy.
fig = px.bar(
    dmy_df,
    x="genes",
    y="rank_median_std",
    color="type",
    barmode="group",
)

# Horizontal legend, centred near the top of the plot, transparent background.
legend_layout = {
    "title": "Gene filtering",
    "orientation": "h",
    "yanchor": "middle",
    "xanchor": "center",
    "y": 0.92,
    "x": 0.5,
    "bgcolor": "rgba(0,0,0,0)",
    "font": {"size": 20, "color": "#003366"},
}

fig = fig.update_layout(
    title="",
    legend=legend_layout,
    yaxis={"tickfont": {"size": 20}, "title": "std/median rank"},
    xaxis={"tickfont": {"size": 20}, "title": "gene"},
    font={"size": 20},
    height=600,
)

# Write the figure out via the project's save_fig helper.
save_fig(
    name="ifng_ranks",
    fig=fig,
    base_path=figures_path,
    width=1300,
    height=600,
    margin=0.02,
)