# Motivation

The role of this notebook is to explore the biological significance of the selected tf

In [1]:
%load_ext autoreload
import pandas as pd
import numpy as np
import os
import sys

import plotly.express as px
import plotly.io as pio


import multiprocess as mp


# own libraries
SCRIPT_DIR = os.path.dirname(os.path.abspath("pcgna_processing.py"))
sys.path.append(os.path.dirname(SCRIPT_DIR))
sys.path.append('/Users/vlad/Documents/Code/York/iNet_v2/src/')

from NetworkAnalysis.ExperimentSet import ExperimentSet
from NetworkAnalysis import GraphHelper as gh
from NetworkAnalysis.utilities import clustering as cs
from NetworkAnalysis.utilities import sankey_consensus_plot as sky
from NetworkAnalysis.utilities.helpers import save_fig, survival_plot, survival_comp
from NetworkAnalysis.GraphToolExp import GraphToolExperiment as GtExp
from NetworkAnalysis.dea import dea
from NetworkAnalysis.dea import scatter_plot as sp
sys.path.append(os.path.dirname("../../src"))
# Gsea libraries

pio.templates.default = "ggplot2"


pool = mp.Pool(mp.cpu_count())

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
results_path = "../../results/exp/"
data_base = "../../data/"
base_path = "../../results/"
exp_folder_tumour = "network_I/tum/" # "/integration_v2.1/ - path from iNET
exp_folder_h42_ctrl = "network_I/healthyControls/"
dea_path = f'{data_base}/non_cancerous/dea/'


figures_path = "selective_edge_pruning/sel_tfs/"

vu_output = pd.read_csv(f"{data_base}/metadata/VU_clustering_v3.tsv", sep="\t", index_col="Sample")

tcga_mutations_df = pd.read_csv(f"{data_base}/tumour/mutations_tcga.csv")
tcga_mutations_df = tcga_mutations_df[tcga_mutations_df["count"] != 0].set_index("gene")

tum_tpms = pd.read_csv(f"{data_base}/tumour/TPMs_selected_genes_v3_13k_gc42.tsv", sep="\t", index_col="genes")
tum_tpms_v4 = pd.read_csv(f"{data_base}/tumour/tum_TPMs_selected_genes_gc42_all_v4.tsv", sep="\t", index_col="genes")

# tf list
tf_path = f"{data_base}/metadata/TF_names_v_1.01.txt"
if os.path.exists(tf_path):
    tf_list = np.genfromtxt(fname=tf_path, delimiter="\t", skip_header=1, dtype="str")

# Analysis

In [3]:
def mean_var_bar(df: pd.DataFrame, markers: list):
    df.sort_values(by='tum_mean_expression', ascending=False, inplace=True)
    fig1 = px.bar(df, y='tum_mean_expression', error_y='tum_std_expression', title='Tum expression and variance')
    fig2 = px.bar(df, y='healthy_mean_expression', error_y='healthy_std_expression', title='Healthy expression and variance')

    num_cols=1
    subplots_config = {
    "num_cols": num_cols,
    "shared_x": False,
    "shared_y": False,
    "h_spacing": 0.25,
    "v_spacing": 0.17,
    "main_title": "Mean expression (log) and variance",
    "height": 700,
    "width": None,
    "y_title": None,
    "x_title": None,
    "specs": None,
    }

    # Color based on high variance
    marker_colors_h = []
    marker_colors_t = []
    for gene, row in df.iterrows():
        if row['varied_label'] == 'tum':
            marker_colors_t.append("#F8766D") #reddish
            marker_colors_h.append('#619CFF') #bluesh
        elif row['varied_label'] == 'h':
            marker_colors_h.append("#F8766D") #reddish
            marker_colors_t.append('#619CFF') #bluesh
        elif row['varied_label'] == 'both':
            marker_colors_t.append("#D7B740") #greenish
            marker_colors_h.append("#D7B740") #greenish
        else:
            marker_colors_t.append('#619CFF') #bluesh
            marker_colors_h.append('#619CFF') #bluesh


    fig1.update_traces(marker_color=marker_colors_t)
    fig2.update_traces(marker_color=marker_colors_h)

    figs = [fig1, fig2]
    titles = [None, None]
    fig = gh.helper_multiplots(figs, titles, subplots_config)
    fig.update_yaxes(type='log', title='Tumour', row=1, col=1)
    fig.update_yaxes(type='log', title='Non-cancerous', row=2, col=1)


    for gene in markers:
        data = df.loc[gene]
        if gene in ['KLF4', 'ZBTB7C', 'GRHL3']:
            fig.add_annotation(text=f'{gene}', x=gene, y=np.log10(data['tum_mean_expression']), showarrow=True,
                                arrowhead=1, xref=f"x{1}", yref=f"y{1}", font=dict(color=px.colors.qualitative.Plotly[0]),  ax=-30, ay=-70)
        else:
            fig.add_annotation(text=f'{gene}', x=gene, y=np.log10(data['tum_mean_expression']), showarrow=True,
                                arrowhead=1, xref=f"x{1}", yref=f"y{1}", font=dict(color=px.colors.qualitative.Plotly[0]),  ax=-60, ay=-70)
                                
        fig.add_annotation(text=f'{gene}', x=gene, y=np.log10(data['healthy_mean_expression']), showarrow=True,
                            arrowhead=1, xref=f"x{2}", yref=f"y{2}", font=dict(color=px.colors.qualitative.Plotly[0]),  ax=-20, ay=-120)

    return fig

In [186]:
sel_tfs = pd.read_csv(f'{data_base}/tf_ctrl.csv', index_col='gene')
sel_tfs['tum_mean_expression'] = tum_tpms_v4.loc[sel_tfs.index].mean(axis=1)
sel_tfs['tum_median_expression'] = tum_tpms_v4.loc[sel_tfs.index].median(axis=1)
sel_tfs['tum_std_expression'] = tum_tpms_v4.loc[sel_tfs.index].std(axis=1)
sel_tfs['tum_var_expression'] = tum_tpms_v4.loc[sel_tfs.index].var(axis=1)

sel_tfs.to_csv(f"{figures_path}/selected_tfs.tsv", sep='\t')

In [5]:
# sel_tfs = sel_tfs[~sel_tfs.index.isin(["ELF3", "JUNB"])]
dmy_df = sel_tfs.copy(deep=True)
dmy_df['tum_mean_expression'] = sel_tfs['tum_mean_expression'] +1
dmy_df['healthy_mean_expression'] = sel_tfs['healthy_mean_expression'] +1

log = True
fig = px.scatter(
    dmy_df.reset_index(),
    x="tum_mean_expression",
    y="healthy_mean_expression",
    # text="gene",
    color="mut_count",
    size="mut_count",
    hover_data="gene",
    color_continuous_scale=px.colors.sequential.Sunset_r,
    height=700,
    log_x=log,
    log_y=log,
    title=f'Selected TFs ({dmy_df.shape[0]})',
    # trendline='ols',  trendline_options=dict(log_x=True, log_y=True),
)

# Get the top 10 genes with highest healthy median expression
top_genes_h = dmy_df.nlargest(10, "healthy_median_expression").index.tolist()
top_genes_t = dmy_df.nlargest(10, "tum_median_expression").index.tolist()
top_genes_m = dmy_df.nlargest(10, "mut_count").index.tolist()

top_genes = list(set(top_genes_h + top_genes_t + top_genes_m))
top_genes.extend(["FOSL1", "FOXQ1", "MYCL", "STAT2", "IRF7", "MBD6", "ZNF750", "ZNF513", "BNC1", 'KLF6', 'HOXB6', "ZBTB7C", "KLF4", 'ZBTB10', 'RUNX1'])
top_genes = list(set(top_genes) - set(['ZNF750', 'IRF6', 'IRF7', 'TGIF1', "DOT1L"]))
inverted  = ['SPEN', 'ELF3', 'ZNF513']

# Add text annotation for the top 10 genes
for idx, gene in enumerate(top_genes):
    x = dmy_df.loc[dmy_df.index == gene, "tum_mean_expression"].values[0]
    y = dmy_df.loc[dmy_df.index == gene, "healthy_mean_expression"].values[0]
    ay = -30

    xanchor = 'right' if idx % 2 == 0 else 'left'
    ax = 10 if idx % 2 == 0 else 15

    ay = 30 if gene in inverted else -15

    if log:
        x, y = np.log10(x), np.log10(y)

    fig.add_annotation(x=x, y=y, text=gene, showarrow=True, arrowhead=1, ax=ax, ay=ay, xanchor=xanchor)

fig.update_layout(height=900, yaxis_title='Non-cancerous TPM', xaxis_title='Tum TPM_mean', 
                  paper_bgcolor="rgba(0,0,0,0)",
                  xaxis=dict(tickfont=dict(size=16)),
                  yaxis=dict(tickfont=dict(size=16),),
                  font=dict(size=14),
                  title = ''
                  )
# fig.add_scatter(x=[0.1, 1500], y=[0.1, 1500], mode='lines+markers', name='')
save_fig(name="sel_tfs_mean_tum_healthy", fig=fig, base_path=figures_path, width=1400, height=700)
# fig.show()


In [107]:
# markers = ['BNC1', "AHR", 'OVOL1', "HES2", ]
high_varied_tum = ['EGR1', 'FOXQ1', "ATF3", 'MYCL', 'TP63', 'BHLHE41', 'FOSL1', 'OVOL1', 'MSX2', "ZNF750", "ZNF552", 'JRK', "HOXB6", "HES2", "EBF4", "REL", "BNC1", "NR4A2", 'KLF4', 'ZBTB7C', 'GRHL3']
high_varied_h = ['JUN', 'ETS2', 'KLF6', "ATF3", 'FOSL1', "EGR1", 'MAFF', "OVOL1", 'MAFK', 'ZNF750', 'HES2', "EBF4",'ARID5B', 'NR4A2', 'KLF4', 'BNC1', 'FOXQ1', 'ZBTB7C', 'ZBTB10', 'MYCL']
cmn_varied = ['OVOL1', 'FOSL1', 'KLF4', 'BNC1', 'MYCL', 'NR4A2', 'ZBTB7C',  'FOXQ1', 'ZNF750', 'EGR1', 'HES2', 'ATF3', 'EBF4']
cmn_varied = list(set(high_varied_h) & set(high_varied_tum))
sel_tfs['varied_label'] = ''
sel_tfs.loc[high_varied_tum, 'varied_label'] = "tum"
sel_tfs.loc[high_varied_h, 'varied_label'] = "h"
sel_tfs.loc[cmn_varied, 'varied_label'] = "both"

markers = ["GRHL3", 'OVOL1', 'JRK', 'HES2', 'EBF4', 'NR4A2', 'KLF4', 'ZBTB7C', 'KLF6', 'ETS2']
fig = mean_var_bar(sel_tfs, markers=markers)
fig.update_layout(height=900,
                  paper_bgcolor="rgba(0,0,0,0)",
                  xaxis=dict(tickfont=dict(size=16)),
                  yaxis=dict(tickfont=dict(size=16)),
                  font=dict(size=14),
                  title = ''
                  )
save_fig(name="sel_tfs_var_tum_healthy", fig=fig, base_path=figures_path, width=1600, height=900)
fig.show()
del fig


## Comparing with known markers

In [7]:
luminal_markers = ["KRT20", "PPARG", "FOXA1", "GATA3", "SNX31", "UPK1A", "UPK2", "FGFR3"]
basal_markers = ["CD44", "KRT6A", "KRT5", "KRT14", "COL17A1"]
squamos_markers = ["DSC3", "GSDMC", "TCGM1", "PI3", "TP63"]
immune_markers = ["CD274", "PDCD1LG2", "IDO1", "CXCL11", "L1CAM", "SAA1"]
neural_diff = ["MSI1", "PLEKHG4B", "GNG4", "PEG10", "RND2", "APLP1", "SOX2", "TUBB2B"]

# TCGA markers - main paper
emt_claudin = ["ZEB1", "ZEB2", "SNAI1", "TWIST1", "CDH2", "CLDN3", "CLDN4", "CLDN7"]
ecm_muscle = ["PGM5", "DES", "C7", "SFRP4", "COMP", "SGCD"]

tcga_markers = luminal_markers + basal_markers + squamos_markers + immune_markers + neural_diff + emt_claudin + ecm_muscle

In [8]:
set(sel_tfs.index) & set(tcga_markers)

{'TP63'}

### Urothelium type markers

In [9]:
tf_diff = ["P63", "FOXA1","PPARG", "RARG", "IRF1", "ELF3", "GRHL3", "KLF5", "GATA4", "GATA6", "GATA3"]
krt = ["KRT13", "KRT14", "KRT15", "KRT20"]
upk = ["UPK1B", "UPK1A", "UPK3A", "UPK2"]
cld = ["CLDN3", "CLDN4", "CLDN5" ]

egfr_fam = ["EGFR", "ERBB2", "ERBB3", "ERBB4", "EGF", "AREG", "HBEGF","TGFA","BTC", "EREG"]
fgfr_fam = ["FGFR1", "FGFR2", "FGFR3", "FGF1", "FGF2"]
map_kpathway = ["RAS", "RAF", "MEK1", "MEK2", "MEK3", "MEK4","ERK"]
pi3_kpathway = ["PIK3C3", "PIK3R2", "PIK3C2B", "AKT1", "AKT2"]
others = ["MKI67", "MCM2", "UPK3A", "ZO1", "TJP1", "ZO2", "TJP2", "ZO3", "TJP3"]
hox_ur = ["HOXB2", "HOXB3", "HOXB5", "HOXB6", "HOXB8"]
hox_bla = ["HOXA9", "HOXA10", "HOXA11", "HOXA13"]

diff_markers = tf_diff + cld + krt + upk

uro_markers = diff_markers + egfr_fam + fgfr_fam + map_kpathway + pi3_kpathway + others + hox_ur + hox_bla

In [10]:
set(sel_tfs.index) & set(uro_markers)

{'ELF3', 'GRHL3', 'HOXB6', 'KLF5'}

### Lund type markers

In [11]:
lund_qtc1 = ["FLI1", "FOXP3", "ILKZF1", "IRF4", "IRF8", "RUNX3", "SCML4", "SPI1", "STAT4", "TBX21", "TFEC"]
lund_qtc2 = ["AEBP1", "BNC2", "GLI2", "GLIS1", "HIC1", "MSC", "PPRX1", "PPRX2", "TGFB1I1", "TWIST1"]
lund_qtc3 = ["EBF1", "HEYL", "LEF1", "MEF2C", "TCF4", "ZEB1", "ZEB2"]
lund_qtc8 = ["GATA5", "HAND1", "HAND2", "KLF16"]
lund_qtc17 = ["ARID5A", "BATF3", "VENTX"]
lund_ba_mes = lund_qtc1 + lund_qtc2 + lund_qtc3 + lund_qtc8 + lund_qtc17

lund_ba_sq = ["BRIP1", "E2F7", "FOXM1", "ZNF367", "IRF1", "SP110", "STAT1"]
lund_mes = ["TP53", "RB1", "FGFR3", "ANKHD1", "VIM", "ZEB2"]
ba_sq_inf = ["CDH3", "EGFR"]

lund_sc_ne = ["CHGA", "SYP", "ENO2", "EPCAM"] #Highly expressed

lund_markers = lund_ba_mes + lund_ba_sq + lund_mes + ba_sq_inf + lund_sc_ne

In [12]:
set(sel_tfs.index) & set(lund_markers)

{'KLF16', 'SP110', 'STAT1'}

### Immune markers

In [13]:
b_cells = ["BCL2", "BCL6", "CD19", "CD1D", "CD22", "CD24", "CD27", "CD274","CD34", "CD38", "CD40","CD44","CD5","CD53","CD69","CD72", "CD79A", "CD79B", "CD80", "CD86", "CD93", "CR2", "CXCR4", 'CXCR5',"FAS","FCER2", "FCRL4" "HAVCR1","IL10", 'IL2RA','IL7R','IRF4','ITGAX', 'LILRB1','MME','MS4A1','NT5E','PDCD1LG2','PRDM1','PTPRC','SDC1','SPN','TFRC','TLR9','TNFRSF13B','TNFRSF13C','TNFRSF17','XBP1']
t_cells = ['CD4', 'CD8', 'CCR4', 'CCR5', 'CCR6', 'CCR7', 'CCR10', 'CD127', 'CD27', 'CD28', 'CD38', 'CD58', 'CD69', 'CTLA4', 'CXCR3', 'FAS', 'IL2RA',
        'IL2RB', 'ITGAE', 'ITGAL', 'KLRB1', 'NCAM1', 'PECAM1', 'PTGDR2', 'SELL', 'IFNG', 'IL10', 'IL13', 'IL17A', 'IL2', 'IL21','IL22', 'IL25', 'IL26', 'IL4', 'IL5', 'IL9', 'TGFB1', 'TNF', 'AHR', 'EOMES','FOXO4', 'FOXP1', 'FOXP3', 'GATA3','IRF4', 'LEF1', 'PRDM1', 'RORC','STAT4', 'TBX21','TCF7', 'GZMA']

nk_cells = ['B3GAT1','CCR7','CD16','CD2','CD226','CD244','CD27','CD300A','CD34','CD58','CD59','CD69','CSF2','CX3CR1','CXCR1','CXCR3','CXCR4','EOMES','GZMB','ICAM1','IFNG','IL1R1','IL22','IL2RB','IL7R','ITGA1','Itga2','ITGAL','ITGAM','ITGB2','KIR2DL1','KIR2DL2','KIT','Klrb1c','KLRC1','KLRC2','KLRD1','KLRF1','KLRG1','KLRK1','LILRB1','Klra4','Klra8','NCAM1','NCR1','NCR2','NCR3','PRF1','SELL','SIGLEC7','SLAMF6','SPN','TBX21','TNF']

macrophages_cells = [ 'ADGRE1','CCR2','CD14','CD68','CSF1R','Ly6c1','MARCO','MRC1','NOS2','PPARG','SIGLEC1','TLR2','ARG1','CD163','CD200R1','CD80','CD86','CLEC10A','CLEC7A','CSF2','CX3CR1','FCGR1A','ITGAM','MERTK','PDCD1LG2','Retnla','TNF','CCL22','CD36','CD40','IL10','IL1B','IL6','LGALS3','TLR4','CCL2','CCR5','CD209','CD63','CD86','CSF1','CXCL2','FCGR3A','IFNG','IL4','IRF4','ITGAX','MSR1','PDGFB','PTPRC','STAT6','TIMD4','Chil3','CLEC6A','IL1R1','ITGB2','PDCD1LG2','TLR7']

monocyte_cells = ['CD14','CD16','CSF1R','CX3CR1','ITGAM','ITGAX','LY6C1','CCR2','CXCR4','FCGR1A','SELL','SPN','ADGRE1','CCR7','TNF','CD86','IL10','IL1B','MERTK','TREML4','CD209','NR4A1','Ly6a','PTPRC','IL3RA','CD27','CCR5','CD32','CD1A','MRC1','ITGB3','CD9','CXCR6','CCR1','FLT3','KLF2','CLEC12A','CCR6','CCR8','CD68','CLEC7A','KIT','MAF','MAFB','SPI1','CD1C','PPARG','CEBPB','ITGAE','TEK']


immune_markers = b_cells + t_cells + nk_cells + macrophages_cells + monocyte_cells

In [14]:
set(sel_tfs.index) & set(immune_markers)

{'AHR', 'BCL6'}

In [183]:
sel_tfs

Unnamed: 0_level_0,mut_count,tum_median_expression,healthy_median_expression,tum_mean_expression,healthy_mean_expression,tum_std_expression,healthy_std_expression,tum_var_expression,varied_label
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ELF3,49.0,516.467020,1434.519008,640.581292,1664.910202,567.808783,1533.937058,322406.813879,
JUNB,3.0,248.349863,271.009715,286.124501,437.743241,181.303215,398.875691,32870.855760,
KLF5,21.0,160.663601,403.414207,196.327968,423.927756,158.378786,252.581784,25083.839989,
STAT1,10.0,91.850454,147.586097,142.432516,155.719798,140.764415,99.203676,19814.620595,
TGIF1,4.0,122.278603,133.417372,133.043308,158.564100,68.719895,87.264486,4722.423997,
...,...,...,...,...,...,...,...,...,...
NFAT5,13.0,3.649047,15.113545,4.739326,16.319613,4.322653,8.126043,18.685332,
ZBTB21,11.0,3.435088,10.379879,3.967440,12.239118,2.319988,5.985041,5.382343,
ZBTB10,1.0,2.886254,3.889252,3.804161,7.196493,3.285030,8.219635,10.791424,h
BNC1,7.0,0.037108,13.012142,3.760191,27.664774,9.283576,37.200816,86.184788,both


# Morpheus

## Prepare for Morpheus
Outliers for standard log2 TPMS and norm of log, after applying agglomerative clustering with 1-pearson correlation.

```Python
outliers_log2 = ['TCGA-C4-A0EZ', 'TCGA-DK-AA6W', 'TCGA-G2-A2EL', 'TCGA-BL-A3JM', 'TCGA-XF-A9T2', 'TCGA-XF-AAMH', 'TCGA-XF-A9ST', 'TCGA-GC-A4ZW', 'TCGA-HQ-A2OF', 'TCGA-DK-AA6T', 'TCGA-BT-A2LA', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-CF-A3MF']
```

```Python
norm_outliers = ['TCGA-2F-A9KW', 'TCGA-XF-A9ST', 'TCGA-BL-A3JM', 'TCGA-XF-A9T2', 'TCGA-XF-AAMH', 'TCGA-DK-AA6T', 'TCGA-BT-A2LA', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-C4-A0EZ', 'TCGA-DK-AA6W', 'TCGA-G2-A2EL']
```

<!-- ![alt text](selective_edge_pruning/sel_tf_log2.png)
![alt text](selective_edge_pruning/sel_tf_norm_log2.png) -->



In [15]:
outliers_log2 = ['TCGA-C4-A0EZ', 'TCGA-DK-AA6W', 'TCGA-G2-A2EL', 'TCGA-BL-A3JM', 'TCGA-XF-A9T2', 'TCGA-XF-AAMH', 'TCGA-XF-A9ST', 'TCGA-GC-A4ZW', 'TCGA-HQ-A2OF', 'TCGA-DK-AA6T', 'TCGA-BT-A2LA', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-CF-A3MF']

norm_outliers = ['TCGA-2F-A9KW', 'TCGA-XF-A9ST', 'TCGA-BL-A3JM', 'TCGA-XF-A9T2', 'TCGA-XF-AAMH', 'TCGA-DK-AA6T', 'TCGA-BT-A2LA', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-C4-A0EZ', 'TCGA-DK-AA6W', 'TCGA-G2-A2EL']

cmn_outliars = set(outliers_log2) & set(norm_outliers)

print(f"### Num outliers for standard log2 {len(outliers_log2)}.\n --> {outliers_log2}")
print(f"### Num outliers for norm log2 {len(norm_outliers)}.\n --> {norm_outliers}")
print(f"### Common outliers *{len(cmn_outliars)}*.\n --> {cmn_outliars}")

### Num outliers for standard log2 14.
 --> ['TCGA-C4-A0EZ', 'TCGA-DK-AA6W', 'TCGA-G2-A2EL', 'TCGA-BL-A3JM', 'TCGA-XF-A9T2', 'TCGA-XF-AAMH', 'TCGA-XF-A9ST', 'TCGA-GC-A4ZW', 'TCGA-HQ-A2OF', 'TCGA-DK-AA6T', 'TCGA-BT-A2LA', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-CF-A3MF']
### Num outliers for norm log2 12.
 --> ['TCGA-2F-A9KW', 'TCGA-XF-A9ST', 'TCGA-BL-A3JM', 'TCGA-XF-A9T2', 'TCGA-XF-AAMH', 'TCGA-DK-AA6T', 'TCGA-BT-A2LA', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-C4-A0EZ', 'TCGA-DK-AA6W', 'TCGA-G2-A2EL']
### Common outliers *11*.
 --> {'TCGA-DK-AA6T', 'TCGA-BL-A3JM', 'TCGA-XF-AAMH', 'TCGA-DK-AA6W', 'TCGA-XF-AAN7', 'TCGA-FJ-A871', 'TCGA-XF-A9T2', 'TCGA-BT-A2LA', 'TCGA-C4-A0EZ', 'TCGA-G2-A2EL', 'TCGA-XF-A9ST'}


In [16]:
dmy_df = tum_tpms_v4.loc[sel_tfs.index]
dmy_df = np.log2(dmy_df + 1)

sel_metadata = ['KMeans_labels_6', 'consensus', 'TCGA408_classifier', 'Lund2017.subtype', 'ESTIMATE_score', "Immune_score", "Stromal_score", 'tumor_stage']
dmy_df = pd.concat([vu_output[sel_metadata].T, dmy_df], axis=0).dropna(axis=1)

# Remove outliers
dmy_df = dmy_df[list(set(dmy_df.columns) - set(cmn_outliars))]

# Adding notes on the genes
dmy_df['Notes'] = ''
dmy_df.loc[dmy_df.index.isin(lund_markers), "Notes"] = 'Lund marker'
dmy_df.loc[dmy_df.index.isin(immune_markers), "Notes"] = 'Immune marker'
dmy_df.loc[dmy_df.index.isin(uro_markers), "Notes"] = 'Uro markers'
dmy_df.loc[dmy_df.index.isin(tcga_markers), "Notes"] = 'TCGA markers'

dmy_df = dmy_df[['Notes'] + list(dmy_df.columns[:-1])]
dmy_df.to_csv(f"{figures_path}/log2_sel_tfs_no_outliers.tsv", sep='\t')


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



## Importing Morpheus

In [17]:
morpheus_path = "selective_edge_pruning/morpheus/"

morp_df = pd.read_csv(f"{morpheus_path}/15_CS_norm_log2_sel_tfs_no_outliers.gct", sep="\t", skiprows=2)
columns = morp_df["id"]
morp_df = morp_df.drop(columns=["Notes"]).transpose()
morp_df.columns = columns
morp_df = morp_df.iloc[2:, :]

morp_df['dendrogram_cut'] = morp_df['dendrogram_cut'].astype(float).astype(str)

keep_clusters = []
# Size of 1%
size_th = round(morp_df.shape[0] * 0.01)
for cluster, size in morp_df['dendrogram_cut'].value_counts().items():
    if size > 5:
        keep_clusters.append(cluster)

# Drop clusters that are smaller than 1% of the cohort size
morp_df = morp_df.loc[morp_df['dendrogram_cut'].isin(keep_clusters)]
morp_df.shape

(378, 107)

In [38]:
#Sankey
morp_df.rename_axis("sample", axis="columns", inplace=True)
reorder_cols = [
    "TCGA408_classifier",
    "dendrogram_cut",
    # "KMeans_labels_6",
    'Lund2017.subtype',
    # "consensus",
]
meta, sky_fig = sky.main(df=morp_df, reorder_cols=reorder_cols, title='MIBC stratification based on the TF from selective edge pruning', retMeta=True)
sky_fig.update_layout(
    title="",
    template="ggplot2",  # "ggplot2", "plotly_white"
    font=dict(size=16),
    paper_bgcolor="rgba(0,0,0,0)",
)
sky_fig.show()
save_fig(name="sankey_sel_tfs", fig=sky_fig, base_path=figures_path, width=1200, height=600)
del sky_fig

## Dumbell plots

In [19]:
import plotly.graph_objects as go
import matplotlib.pyplot as plt


# Main function to see the differences
def dumbell_plots(morp_df: pd.DataFrame, tum_df: pd.DataFrame, sel_tfs: pd.DataFrame, cls_1="", cls_2="", markers=[], log=False):

    cluster_1, cluster_2 = int(cls_1.split("_")[-1]), int(cls_2.split("_")[-1])

    samples_1 = morp_df[morp_df['dendrogram_cut'] == cluster_1].index
    samples_2 = morp_df[morp_df['dendrogram_cut'] == cluster_2].index

    # Reconstruct the TPM
    dmy_df = tum_df.loc[tum_df.index.isin(sel_tfs.index)]
    df_1 = dmy_df[samples_1]
    df_1[cls_1] = df_1.mean(axis=1)

    dmy_df = tum_df.loc[tum_df.index.isin(sel_tfs.index)]
    df_2 = dmy_df[samples_2]
    df_2[cls_2] = df_2.mean(axis=1)

    comb_df = pd.concat([df_1[cls_1], df_2[cls_2]], axis=1)

    # Prepare for plotting
    plot_data = {"line_x": [], "line_y": [], cls_1: [], cls_2: []}

    y_axis_title = 'TPM mean'
    if log:
        comb_df[cls_1] = np.log2(comb_df[cls_1] + 1)
        comb_df[cls_2] = np.log2(comb_df[cls_2] + 1)
        y_axis_title = 'Log2(TPM + 1) mean '

    comb_df['diff']= np.abs(comb_df[cls_1] - comb_df[cls_2])
    comb_df.sort_values(by='diff', ascending=False, inplace=True)

    genes = list(comb_df.index)

    # generate the data for plots
    for gene, row in comb_df.iterrows():
        # print(gene)

        val_1, val_2 = row[cls_1], row[cls_2]
        # The data for two scatter points cls_1 and cls_2
        plot_data[cls_1].extend([val_1])
        plot_data[cls_2].extend([val_2])
        # The line between the two clases
        plot_data["line_y"].extend([
                val_1,
                val_2,
                None,
            ]
        )
        plot_data["line_x"].extend([gene, gene, None])

    # plotting
    fig = go.Figure(
    data=[
        go.Scatter(
            x=plot_data["line_x"],
            y=plot_data["line_y"],
            mode="lines",
            showlegend=False,
            marker=dict(
                color="grey"
            )
        ),
        go.Scatter(
            y=plot_data[cls_1],
            x=genes,
            mode="markers",
            name=cls_1,
            marker=dict(
                # color="green",
                size=10
            )
            
        ),
        go.Scatter(
            y=plot_data[cls_2],
            x=genes,
            mode="markers",
            name=cls_2,
            marker=dict(
                # color="blue",
                size=10
            )   
        ),
        ]
    )

    # Add title and change the figure size
    fig.update_layout(
        title=f"Changes between {cls_1} and {cls_2}",
        yaxis_title=y_axis_title,
        xaxis_title="Gene",
        legend_itemclick=False
    )

    fig.update_xaxes()
    # Add markers

    annotations = []
    for i, marker in enumerate(markers):
        sel_df = comb_df.loc[marker][[cls_1, cls_2]]
        x = marker
        # avoid overlapping
        xanchor = 'right' if i % 2 == 0 else 'left'
        ax = -10 if i % 2 == 0 else 10
        y = sel_df.max() if i % 2 == 0 else sel_df.min()
        ay = -10 if i % 2 == 0 else 50

        fig.add_annotation(
                x=x,
                y=y,
                text=marker,
                showarrow=True,
                ax = ax,  # No horizontal offset
                ay=ay,
                xanchor=xanchor
            )
        annotations.append({
                "x": x,
                "y": y,
                "text": marker,
                "showarrow": True,
                "ax": ax,  
                "ay": ay,
                "xanchor": xanchor, 
            })
        
    return fig, comb_df, annotations

In [20]:
cluster_1, label_1 = 3, 'mesLike'
cluster_2, label_2 = 5, 'smallBasal'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = [ 'TP63','HES2', "GRHL3", 'BNC1', 'IRF6', 'ZNF750', 'OVOL1', 'KLF5', "ETS2", "AHR", 'STAT1', "ZBTB7C"]

morp_df['dendrogram_cut'] = morp_df['dendrogram_cut'].astype(float).astype(int)
fig5, df, ann5 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title5= f'{cls_1} vs {cls_2}'
fig5

In [21]:
# Basal large vs Luminal
log = True
cluster_1, label_1 = 13, 'luminal'
cluster_2, label_2 = 4, 'basal'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = ['JRK','HES2', 'BNC1', 'ELF3', 'FOSL1', 'MYCL', 'FOXQ1', "GRHL3", 'STAT1', 'HOXB6', 'TP63']

fig1, df, ann1 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title1 = f'A) {cls_1} vs {cls_2}'

# Lum vs LumInf
cluster_1, label_1 = 13, 'luminal'
cluster_2, label_2 = 12, 'lumInf'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = ['TP63', 'MECOM', "IRF6", 'KLF6', 'IRF7', 'ELF3', 'BNC1']

fig2, df, ann2 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title2= f'B) {cls_1} vs {cls_2}'

# Small vs LumInf
cluster_1, label_1 = 5, 'smallBasal'
cluster_2, label_2 = 12, 'lumInf'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = ['TP63',  'MYCL', 'BNC1', "GRHL3",'HES2',  'MSX2', 'IRF6', 'HOXB6']

fig3, df, ann3 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title3 = f'C) {cls_1} vs {cls_2}'

# Mes-like vs Basal
cluster_1, label_1 = 3, 'mesLike'
cluster_2, label_2 = 4, 'basal'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = ["GRHL3",'BNC1', 'ELF3', 'MYCL','HES2', 'TP63', 'IRF6', 'STAT1', 'ZBTB7C', 'ZNF750', 'JUNB']

fig4, df, ann4 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title4 = f'D) {cls_1} vs {cls_2}'

#Mes-like vs small basal
cluster_1, label_1 = 3, 'mesLike'
cluster_2, label_2 = 5, 'smallBasal'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = [ 'TP63','HES2', "GRHL3", 'BNC1', 'IRF6', 'ZNF750', 'OVOL1', 'KLF5', "ETS2", "AHR", 'STAT1', "ZBTB7C", 'ELF3', "REL", "IRF7"]

fig5, df, ann5 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title5= f'E) {cls_1} vs {cls_2}'


# Small basal vs Basla
cluster_1, label_1 = 4, 'basal'
cluster_2, label_2 = 5, 'smallBasal'
cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
markers = ['ZBTB7C', 'MECOM', "TP63", 'ELF3', 'KLF5', 'MSX2',]

fig6, df, ann6 = dumbell_plots(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)
title6 = f'F) {cls_1} vs {cls_2}'

In [37]:
num_cols=2
subplots_config = {
    "num_cols": num_cols,
    "shared_x": False,
    "shared_y": False,
    "h_spacing": 0.04,
    "v_spacing": 0.1,
    "main_title": "Gene differences",
    "height": 1800,
    "width": None,
    "y_title": None,
    "x_title": None,
    "specs": None,
}

figs, titles = [fig1, fig2, fig3, fig4, fig5, fig6], [title1, title2, title3, title4, title5, title6]
annotations = [ann1, ann2, ann3, ann4, ann5, ann6]
# annotations = []

# Defining the trace colors
traces_names = ["mesLike_3", 'basal_4', "luminal_13", 'lumInf_12', "smallBasal_5"]
trace_colors = {}
for idx, name in enumerate(traces_names):
     trace_colors[name] = px.colors.qualitative.Plotly[idx]

# Making sure that we only display the traces once
displayed_legends = set()
for fig in figs:
    for trace in fig.data:
        if trace.name:
            trace.update(marker=dict(color=trace_colors[trace.name]))

            # Manage legend entries
            if trace.name not in displayed_legends:
                displayed_legends.add(trace.name)
                trace.showlegend = True  # Show legend for this trace
            else:
                trace.showlegend = False  # Hide legend for this trace

fig = gh.helper_multiplots(figs, titles, subplots_config)

# Adding the annotations to the subtplots
idx_row, idx_col = 1, 1

for i, ann in enumerate(annotations):
    for idx, elem in enumerate(ann):
        fig.add_annotation(elem, row=idx_row, col=idx_col)
    if idx_col % num_cols == 0:
            idx_col = 0
            idx_row += 1
    idx_col += 1

fig.update_layout(height=2100, yaxis_title='log2(TPM+1)',
                  showlegend=True,
                  legend=dict(
                       orientation="h",
                       title="Subtypes", 
                       yanchor="bottom",
                       y=1.0,
                       xanchor="center", 
                       x=0.5, 
                       bgcolor="rgba(0,0,0,0)",
                       font=dict(size=16, color="#003366"),
                    ),
                #   paper_bgcolor="rgba(0,0,0,0)",
                  xaxis=dict(tickfont=dict(size=16)),
                  yaxis=dict(tickfont=dict(size=16),),
                  font=dict(size=14),
                  title = '')
# fig.show()
del fig
# save_fig(name="dumbell_sel_tfs", fig=fig, base_path=figures_path, height=1700, width=2700)

### Single plot version

In [23]:
# Single scatter plot in plotly
def plot_cluster_means(morp_df: pd.DataFrame, tum_df: pd.DataFrame, sel_tfs: pd.DataFrame, cls_1="", cls_2="", markers=[], log=False):

    cluster_1, cluster_2 = int(cls_1.split("_")[-1]), int(cls_2.split("_")[-1])

    samples_1 = morp_df[morp_df['dendrogram_cut'] == cluster_1].index
    samples_2 = morp_df[morp_df['dendrogram_cut'] == cluster_2].index

    # Reconstruct the TPM
    dmy_df = tum_df.loc[tum_df.index.isin(sel_tfs.index)]
    df_1 = dmy_df[samples_1]
    df_1[cls_1] = df_1.mean(axis=1)

    dmy_df = tum_df.loc[tum_df.index.isin(sel_tfs.index)]
    df_2 = dmy_df[samples_2]
    df_2[cls_2] = df_2.mean(axis=1)

    comb_df = pd.concat([df_1[cls_1], df_2[cls_2]], axis=1)
    fig = px.scatter(comb_df.reset_index(), x=cls_1, y=cls_2, hover_data='genes', title=f'{cls_1} vs {cls_2}', log_x=log, log_y=log, trendline='ols', trendline_color_override='red', trendline_options=dict(log_x=log, log_y=log))

    for marker in markers:
        dmy = comb_df.loc[marker]
        x, y = dmy.values[0], dmy.values[1]
        if log:
            x, y = np.log10(x), np.log10(y)
            
        fig.add_annotation(
            x=x,
            y=y,
            text=dmy.name,
            showarrow=False,
            xanchor="right",
        )
    return fig, comb_df

# Matplotlib functions - useful for multiplots
def plot_cluster_means_2(ax, morp_df, tum_df, sel_tfs, cls_1="", cls_2="", markers=[], log=False):
    cluster_1 = int(cls_1.split("_")[-1])
    cluster_2 = int(cls_2.split("_")[-1])

    samples_1 = morp_df[morp_df['dendrogram_cut'] == cluster_1].index
    samples_2 = morp_df[morp_df['dendrogram_cut'] == cluster_2].index

    # Reconstruct the TPM
    dmy_df = tum_df.loc[tum_df.index.isin(sel_tfs.index)]
    df_1 = dmy_df.loc[:, samples_1].mean(axis=1).rename(cls_1)
    df_2 = dmy_df.loc[:, samples_2].mean(axis=1).rename(cls_2)

    comb_df = pd.concat([df_1, df_2], axis=1)

    # Plotting
    x_values = comb_df[cls_1]
    y_values = comb_df[cls_2]
    
    if log:
        x_values = np.log10(x_values + 1)  # +1 to handle log(0) cases
        y_values = np.log10(y_values + 1)
        xlabel = f'Log10({cls_1})'
        ylabel = f'Log10({cls_2})'
        title = f'Log10-scaled: {cls_1} vs {cls_2}'
    else:
        xlabel = cls_1
        ylabel = cls_2
        title = f'{cls_1} vs {cls_2}'

    ax.scatter(x_values, y_values)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # Adding trend line
    z = np.polyfit(x_values, y_values, 1)  # Fit a first degree polynomial (linear fit)
    p = np.poly1d(z)  # Create the polynomial object to evaluate
    ax.plot(x_values, p(x_values), "r--")  # Plot the trend line

    # Annotations
    for marker in markers:
        x = x_values.get(marker, None)
        y = y_values.get(marker, None)
        if x is not None and y is not None:
            ax.annotate(marker, (x, y), textcoords="offset points", xytext=(0,10), ha='center')

    # Adding gridlines
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.7)

# save_fig(name="test", fig=fig, base_path=figures_path, width=1400, height=700)

In [24]:
if 0:
    cluster_1, label_1 = 9, 'basal'
    cluster_2, label_2 = 7, 'luminf'
    cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'

    markers = ['TP63', 'HES2', 'MSX2', "MYCL", 'ZSCAN16', 'MAFF', "IRF7", 'IRF6', 'KLF5', "ETS2"]

    dmy_df = sel_tfs.loc[~sel_tfs.index.isin(["BNC1"])]
    fig, df = plot_cluster_means(morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=dmy_df, cls_1=cls_2, cls_2=cls_1, markers=markers, log=True)
    # fig.show()

### Matplotlib version

In [25]:
if 0:
    fig, axs = plt.subplots(1, 2, figsize=(14, 6))  # 1 row, 2 columns

    cluster_1, label_1 = 10, 'basal'
    cluster_2, label_2 = 13, 'basal'
    cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
    markers = ["IRF6", "TP63", "GRHL3", "HES2", 'BNC1', "REL", "ZBTB7C", "STAT1", "ELF3", "JUNB", "ZNF750", "AHR", "MYCL", 'REPIN1']
    fig1, df = plot_cluster_means_2(axs[0], morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)

    # Repeat for the second figure
    cluster_1, cluster_2, label_1, label_2 = 10, 9, 'basal', 'basal'
    cls_1, cls_2 = f'{label_1}_{cluster_1}', f'{label_2}_{cluster_2}'
    markers = ["MSX2", "ZBTB7C", "ELF3", "KLF5", "TP63", "MECOM", 'NR4A2']
    fig2, df = plot_cluster_means_2(axs[1], morp_df=morp_df, tum_df=tum_tpms_v4, sel_tfs=sel_tfs, cls_1=cls_1, cls_2=cls_2, markers=markers, log=True)


# Selected genes and network metrics

In [26]:
import pickle as pickle
import os 

ctrls_folder = f'{base_path}/{exp_folder_h42_ctrl}'
folders = next(os.walk(ctrls_folder), (None, None, []))[1]
ctrl_exps = {}
for folder in folders:
    stats_folder = f'{ctrls_folder}/{folder}/Stats/'
    files = next(os.walk(stats_folder), (None, None, []))[2]
    
    ctrl_idx = int(folder.split('hCtrl')[-1])
    ctrl_exps[ctrl_idx] = {}
    for file in files:
        if ('pickle' in file) or (file == 'stats_master.tsv') or (file == '.DS_Store'):
            continue
        dmy_df = pd.read_csv(f"{stats_folder}/{file}", index_col="gene", sep='\t')

        dmy_df['num_tf'] =  int(file.split("_")[-1].split("TF")[0])
        dmy_df['exp'] = file.replace("standard_int", 'std')
        dmy_df['ctrl'] = ctrl_idx
        key_name = file.split(".tsv")[0].replace('.pickle', '').replace(f'standard_int_{folder}', "std")
        ctrl_exps[ctrl_idx][key_name] = dmy_df


In [27]:
genes = ['RARG', 'PPARG', 'ELF3', 'AHR']
genes = list(diff_markers)
comb_df = pd.DataFrame()
for ctrlExp in ctrl_exps.values():
    for net_stats in ctrlExp.values():
        dmy_df = net_stats.loc[net_stats.index.isin(genes)]
        comb_df = pd.concat([comb_df, dmy_df], axis=0)

# Mark the selected
comb_df['selected_tf'] = "No"
comb_df.loc[comb_df.index.isin(sel_tfs.index), 'selected_tf'] = 'Yes'

In [28]:
px.box(comb_df.reset_index(), x='gene', y='degree_w', color='selected_tf', points='all')

# Survival analysis

In [29]:
from lifelines.statistics import multivariate_logrank_test

def prep_survival(df, cs_model="RawKMeans", label="SBM"):
    
    colors_net, color_map = px.colors.qualitative.G10 + px.colors.qualitative.D3, {}
    for idx, val in enumerate(df[cs_model].unique()):
        color_map[val] = colors_net[idx]

    df[cs_model] = df[cs_model].astype(str)
    fig = survival_plot(df.drop(columns=["days_to_last_follow_up", "days_to_death"]), vu_output, classifier=cs_model, color_map=color_map)
    fig = fig.update_layout(title="{}. Survival analysis for {}".format(label, cs_model))

    return fig, color_map

def survival_sig(df, model):
    df = df.reset_index().rename(columns={"index": "Sample"}).copy(deep=True)
    classifier = model

    dmy = df[["days_to_last_follow_up", "days_to_death", classifier]].replace("--", 0).astype(int)
    dmy["last_contact"] = dmy[["days_to_last_follow_up", "days_to_death"]].max(axis=1).div(30)

    labels = list(df[model].unique())
    dmy = dmy[dmy[classifier].isin(labels)]
    print(labels)

    results = multivariate_logrank_test(dmy["last_contact"], dmy[classifier], dmy["days_to_death"])
    display(results.print_summary())
    print("{0:.6f}".format(results.p_value))

# add the survival metadata
tcga_metadata = pd.read_csv(f"{data_base}/tumour/TCGA_metadata.tsv", sep="\t", index_col="Sample")
morp_df['days_to_last_follow_up'] = tcga_metadata['days_to_last_follow_up']
morp_df['days_to_death'] = tcga_metadata['days_to_death']

In [30]:
cluster_model = 'dendrogram_cut'
fig, dendo_color_map = prep_survival(morp_df, cs_model=cluster_model, label="CS_15")
# save_fig(name="Survival_plot_reward", fig=fig, base_path=figures_path, width=1400, height=600)

fig.update_layout(
    legend=dict(
        orientation="h",
        title="Network subtype",
        yanchor="bottom",
        y=0.9,
        xanchor="center",
        x=0.5,
        bgcolor="rgba(0,0,0,0)",
        font=dict(size=16, color="#003366"),
    ),
    title="",
    template="ggplot2",  # "ggplot2", "plotly_white"
    paper_bgcolor="rgba(0,0,0,0)",
    # plot_bgcolor="rgba(0,0,0,0)",
    xaxis=dict(tickfont=dict(size=16)),
    yaxis=dict(tickfont=dict(size=16)),
    font=dict(size=16),
    height=700
)
fig.add_annotation(text='Significance p<0.005', x=29, y=0.92, showarrow=False, font=dict(size=18, color="#003366"))
fig.show()
save_fig(name="survival_sel_tfs_cs", fig=fig, base_path=figures_path, width=1000, height=600)


In [31]:
morp_df[cluster_model] = morp_df[cluster_model].astype(float).astype(int)
survival_sig(morp_df, model=cluster_model)

[3, 4, 5, 12, 13]


0,1
t_0,-1
null_distribution,chi squared
degrees_of_freedom,4
test_name,multivariate_logrank_test

Unnamed: 0,test_statistic,p,-log2(p)
0,22.78,<0.005,12.8


None

0.000140


### Comparing survival

In [32]:
cluster_model = "dendrogram_cut"
comp_model = 'TCGA408_classifier'

colors_ref = px.colors.qualitative.Pastel2
color_map = {
    "LumP": colors_ref[0],
    "Lum Inf/Ns": colors_ref[1],
    "High IFNG": colors_ref[2],
    "Low IFNG": colors_ref[3],
    "Med IFNG": colors_ref[4],
    "Ne": colors_ref[5],
}

color_map_grey = {label: "grey" for label in morp_df[comp_model].unique()}
color_map = dict(color_map, **color_map_grey)

# choose the subtypes for each to compare if needed
select_labels_1, select_labels_2 = None, None

colors_net = px.colors.qualitative.G10
morp_df[cluster_model] = morp_df[cluster_model].astype(str)
for idx, val in enumerate(morp_df[cluster_model].unique()):
    color_map[val] = colors_net[idx]

fig = survival_comp(
    morp_df.drop(columns=["days_to_last_follow_up", "days_to_death"]),
    vu_output,
    classifier_1=cluster_model,
    classifier_2=comp_model,
    selected_labels_1=select_labels_1,
    selected_labels_2=select_labels_2,
    color_map=color_map,
)
fig = fig.update_layout(title="Survival analysis {}".format("VU + in-situ"))
fig.update_layout(height=900)
# save_fig(name="Survival_plot_reward", fig=fig, base_path=figures_path, width=1400, height=600)

## Apply clustering analysis

In [33]:
plot_data = tum_tpms_v4.loc[sel_tfs.index]
plot_data = np.log2(plot_data + 1)

gh.find_pcs((plot_data))

Sum of 90% variance at PC: 15
Change < 1% at PC: 5


In [34]:
selected_clusters = ["Birch", "RawKMeans", "GaussianMixture", "Ward", "SpectralClustering", "Avg"]

# run experiments
outputs, _, all_metrics, _ = cs.compare_exp(
    plot_data, rob_comp=None, n_clusters=None, selected_clusters=selected_clusters, show_figures=False, show_consensus=True, pca_data=False, n_comp=15,
)
outputs.set_index("Sample", inplace=True)

show_figs=False
if show_figs:
    # Plot the metrics
    fig = cs.display_metrics(all_metrics, f"Cluster metrics for Selected TF", show_individual=False, verbose=True)
    gh.plot_individual_metric(all_metrics, pca=False, offset_db=4)

Variation per principal component [0.63955238 0.09883797] and the sum 73.84%


# Differentially Expressed Analysis

In [52]:
def toggle_legend(fig: dict, kept_traces =[]):
    for trace in fig['data']:
        if trace.name in kept_traces:
            continue
        trace.visible = 'legendonly'
    return fig

In [35]:
absCa_p0 = 'sleuth_ABS-Ca_P0_v4_vulcano_labels.tsv'
ud_p0 = 'sleuth_UD_P0_v4_vulcano_labels.tsv'
absCa_ud = 'sleuth_ABS-Ca_UD_v4_vulcano_labels.tsv'

In [143]:
markers_var = {
    # 'all_tfs': list(sel_tfs.index),
    'tum_varied': high_varied_tum,
    'h_varied': high_varied_h,
    "cmn_varied": cmn_varied,
}

In [112]:
label = 'cmn_varied'
volcano = dea.volcano(absCa_p0, base_path=dea_path, known_markers=False, markers=markers_var)
# scatter = sp.draw_scatter(absCa_p0, base_path=dea_path, selected_genes=[], known_markers=True)

volcano = toggle_legend(volcano, kept_traces=markers_var.keys())

volcano.add_annotation(text='P0', x=-6, y=52, showarrow=False, font=dict(size=18, color="#003366"))
volcano.add_annotation(text='ABS-Ca', x=6, y=52, showarrow=False, font=dict(size=18, color="#003366"))

volcano = volcano.update_layout(height=800)
# volcano.show()

Finished loading the data in 0.06695699691772461


## Volcano plots

## Pi plots

In [192]:
markers_var = {
    '98 TFs': list(sel_tfs.index),
    'tum only': high_varied_tum,
    'non-tum only': high_varied_h,
    "both": cmn_varied,
}

In [194]:
if 1:
    pi = dea.plotPi(absCa_ud, ud_p0, base_path=dea_path, known_markers=False, markers=markers_var)
    pi = toggle_legend(pi, kept_traces=markers_var.keys())
    pi.update_layout(
        legend=dict(
            orientation="h",
            title = '',
            yanchor="bottom",
            xanchor="center",
            y=0.92,
            x=0.6,
            bgcolor="rgba(0,0,0,0)",
            font=dict(size=12, color="#003366"),
        ),
        template="ggplot2",
        paper_bgcolor="rgba(0,0,0,0)",
        xaxis=dict(tickfont=dict(size=16)),
        yaxis=dict(tickfont=dict(size=16)),
        yaxis_title='Pi_P0_UD',
        xaxis_title='Pi_UD_AbsCA',
        height=900
    )

    text_color ="#F8766D"
    # Y axis
    pi.add_annotation(text='UD', x=0, y=100, showarrow=False, font=dict(size=18, color=text_color))
    pi.add_annotation(text='P0', x=0, y=-150, showarrow=False, font=dict(size=18, color=text_color))
    # X-axis
    pi.add_annotation(text='UD', x=-70, y=0, showarrow=False, font=dict(size=18, color=text_color))
    pi.add_annotation(text='Abs-Ca', x=110, y=0, showarrow=False, font=dict(size=18, color=text_color))
    pi.show()

In [182]:
if 0:
    pi = dea.plotPi(absCa_p0, absCa_ud, base_path=dea_path, known_markers=False, markers=markers_var)
    pi = toggle_legend(pi, kept_traces=markers_var.keys())
    pi.update_layout(
        legend=dict(
            orientation="h",
            # title = '',
            yanchor="bottom",
            xanchor="center",
            y=0.92,
            x=0.3,
            bgcolor="rgba(0,0,0,0)",
            font=dict(size=12, color="#003366"),
        ),
        template="ggplot2",
        paper_bgcolor="rgba(0,0,0,0)",
        xaxis=dict(tickfont=dict(size=16)),
        yaxis=dict(tickfont=dict(size=16)),
        xaxis_title='Pi_P0_AbsCa',
        yaxis_title='Pi_AbsCa_UD',
        height=900
    )

    text_color ="#F8766D"
    # Y axis
    pi.add_annotation(text='ABS-Ca', x=0, y=150, showarrow=False, font=dict(size=18, color=text_color))
    pi.add_annotation(text='UD', x=0, y=-70, showarrow=False, font=dict(size=18, color=text_color))
    # X-axis
    pi.add_annotation(text='P0', x=-220, y=0, showarrow=False, font=dict(size=18, color=text_color))
    pi.add_annotation(text='ABS-Ca', x=65, y=0, showarrow=False, font=dict(size=18, color=text_color))
    pi.update_layout(
        xaxis = dict(
            tickmode = 'linear',
            tick0 = 0,
            dtick = 25
        ),
        yaxis = dict(
            tickmode = 'linear',
            tick0 = 0,
            dtick = 25
        )
    )
#   pi.show()