In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import signaturescoring as ssc

In [None]:
import pathlib as pl

In [None]:
from tqdm.notebook import tqdm

In [None]:
import gseapy as gp

In [None]:
import os

In [None]:
# Load the full annotated cohort and the table of refined cell annotations.
adata = sc.read_h5ad("/add/path/here/full_cohort.h5ad")

refined_annotations = pd.read_csv("/add/path/here/refined_annotations.csv", index_col=0)

# Select the "Annotation" value for every cell, in the order of adata.obs_names.
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in recent pandas;
# both hand a plain array to the assignment so no index alignment takes place.
adata.obs["refined_annotation"] = refined_annotations.loc[adata.obs_names, "Annotation"].to_numpy()

In [None]:
# Raw sample identifiers, listed in the order of their patient labels P1..P10.
_raw_sample_ids = [
    "CCG1153_4496262",
    "CCG1153_6640539",
    "CCG1153_4411",
    "Aguirre_EGSFR0074",
    "Aguirre_EGSFR0148",
    "Aguirre_EGSFR1732",
    "Aguirre_EGSFR0128",
    "Aguirre_EGSFR1938",
    "Aguirre_EGSFR1982",
    "Aguirre_EGSFR2218",
]

# Sample identifier -> short patient label ("P1", "P2", ...).
patient_id_mapping = {
    sample_id: f"P{rank}" for rank, sample_id in enumerate(_raw_sample_ids, start=1)
}

# Save per-sample AnnData objects for cNMF

In [None]:
# Gene-symbol prefixes to exclude: AC0..AC9, AL0..AL9, LINC and MT-.
toremove = [f"{prefix}{digit}" for prefix in ("AC", "AL") for digit in range(10)] + ["LINC", "MT-"]

In [None]:
adata = adata[:,~adata.var_names.str.startswith(tuple(toremove))].copy()

In [None]:
pd.Series(adata.var_names).to_csv("/add/path/here/eac_gene_names.csv")

In [None]:
subadata = adata[(adata.obs.refined_annotation=="Carcinoma") & (adata.obs.CNV_celltype_annotation=="Tumor")].copy()

In [None]:
subadata.X = subadata.layers["counts"].copy()

In [None]:
for sample in subadata.obs.sample_id.unique():
    patadata = subadata[subadata.obs.sample_id==sample].copy()
    patadata.write(f"/add/path/here/{sample}_subadata_cNMF.h5ad")

# Per patient

In [None]:
from cnmf import cNMF

In [None]:
# Display the sample identifiers present among the selected cells.
subadata.obs["sample_id"].unique()

## Aguirre_EGSFR1982

In [None]:
sample = "Aguirre_EGSFR1982"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 7
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR2218

In [None]:
sample = "Aguirre_EGSFR2218"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 3
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## CCG1153_4411

In [None]:
sample = "CCG1153_4411"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 7
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR1938

In [None]:
sample = "Aguirre_EGSFR1938"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR0074

In [None]:
sample = "Aguirre_EGSFR0074"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 7
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR0128

In [None]:
sample = "Aguirre_EGSFR0128"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 4
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR1732

In [None]:
sample = "Aguirre_EGSFR1732"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 4
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR0148

In [None]:
sample = "Aguirre_EGSFR0148"

sample_file = f"/add/path/here/{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 4
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

# Aggregate programs

In [None]:
import seaborn as sns 
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances, euclidean_distances

In [None]:
import scipy

In [None]:
import palettable
# Program colours: entries 0, 1, 3, 4 and 6 of the Set1 palette for cNMF_1..cNMF_5,
# plus neutral greys for the "Outlier" and "Mixed" labels.
colorlist = palettable.colorbrewer.qualitative.Set1_7.mpl_colors
colormapping_mal = {
    f"cNMF_{rank}": colorlist[palette_idx]
    for rank, palette_idx in enumerate((0, 1, 3, 4, 6), start=1)
}
colormapping_mal.update({"Outlier": "whitesmoke", "Mixed": "lightgrey"})

# Sample colours: eight samples take the Dark2 palette in order, the remaining
# two take the first two colours of the Paired palette.
colorlist = palettable.colorbrewer.qualitative.Dark2_8.mpl_colors
colorlistbis = palettable.colorbrewer.qualitative.Paired_3.mpl_colors
dark2_samples = [
    'Aguirre_EGSFR1982',
    "Aguirre_EGSFR2218",
    "CCG1153_4411",
    "Aguirre_EGSFR1938",
    "Aguirre_EGSFR0074",
    "Aguirre_EGSFR0128",
    "Aguirre_EGSFR1732",
    "Aguirre_EGSFR0148",
]
paired_samples = ["CCG1153_4496262", "CCG1153_6640539"]
colormapping_pat = dict(zip(dark2_samples, colorlist))
colormapping_pat.update(zip(paired_samples, colorlistbis))

# Same colours, keyed by the short patient label instead of the sample identifier.
colormapping_pat_bis = {
    patient_id_mapping[pat]: color for pat, color in colormapping_pat.items()
}

In [None]:
program_dir = pl.Path("./cNMF_malignant_per_patient/")

In [None]:
program_genes = []
usages = []
for sample in program_dir.iterdir():
    print("_________")
    print(sample.stem)
    print("_________")
    for f in sample.iterdir():
        if "gene_spectra_score" in f.stem:
            df = pd.read_csv(f, index_col=0, sep="\t").T
            df.columns = f"{sample.stem}_" + df.columns.astype(str)
            program_genes.append(df)
        if "usages" in f.stem:
            df = pd.read_csv(f, index_col=0, sep="\t")
            df.columns = f"{sample.stem}_" + df.columns.astype(str)
            df = (df.T/df.sum(axis=1)).T
            usages.append(df)

In [None]:
# Programs ("<sample>_<program number>") excluded from the aggregation below.
programs_to_remove = [
    "Aguirre_EGSFR1982_5",
    "Aguirre_EGSFR1982_6",
    "Aguirre_EGSFR1982_7",
    "CCG1153_4411_6",
    "CCG1153_4411_7",
    "Aguirre_EGSFR1938_5",
    "Aguirre_EGSFR0074_5",
    "Aguirre_EGSFR0074_6",
    "Aguirre_EGSFR0074_7",
    "Aguirre_EGSFR0128_3",
    "Aguirre_EGSFR1732_4",
]

In [None]:
full_programs = pd.concat(program_genes,axis=1).drop(programs_to_remove,axis=1)

In [None]:
cossim = pd.DataFrame(cosine_similarity(full_programs.T.fillna(0)),index=full_programs.columns,columns=full_programs.columns)

In [None]:
linkage = scipy.cluster.hierarchy.linkage(full_programs.T.fillna(0), method='average', metric='cosine', optimal_ordering=False)

In [None]:
ax = sns.clustermap(data=cossim, cmap="vlag", 
               center=0., row_linkage=linkage, col_linkage=linkage )

In [None]:
lnkg = linkage

# Cut the dendrogram into at most 5 flat clusters.
clusters = scipy.cluster.hierarchy.fcluster(lnkg, t=5, criterion="maxclust")

# One "cNMF_<cluster>" label per program, in the column order of full_programs.
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated in recent pandas.
row_programs = ("cNMF_" + pd.Series(clusters, index=full_programs.columns).astype(str)).to_numpy()
# Column names are "<sample>_<program number>"; drop the last field to recover the sample.
pats = list(full_programs.columns.str.split("_").str[:-1])
row_pats = ["_".join(pat) for pat in pats]

# Two colour strips for the clustermap: aggregated program, then sample of origin.
row_colors = [
    [colormapping_mal[prog] for prog in row_programs],
    [colormapping_pat[pat] for pat in row_pats],
]

In [None]:
# Make sure the output folder exists, then draw the annotated clustermap and export it.
os.makedirs("figures/malignant/", exist_ok=True)
clustermap_options = dict(
    cmap="vlag",
    center=0.,
    row_linkage=linkage,
    col_linkage=linkage,
    row_colors=row_colors,
    xticklabels=False,
    yticklabels=False,
)
fig = sns.clustermap(data=cossim, **clustermap_options)
fig.savefig("figures/malignant/clustermap_cnmf.svg", dpi=300, bbox_inches="tight")

In [None]:
cluster_assignment = pd.Series(clusters, index=full_programs.columns)

In [None]:
marker_genes = {}
for cl in cluster_assignment.unique():
    
    sigs = cluster_assignment[cluster_assignment==cl].index
    marker_genes[cl] = full_programs[sigs].median(axis=1).sort_values(ascending=False)
    marker_genes[cl] = marker_genes[cl].loc[~marker_genes[cl].index.str.startswith(("MT-","RPS","RPL"))]

In [None]:
for cl in marker_genes:
    marker_genes[cl].to_csv(f"/add/path/here/cNMF_{cl}.csv")

In [None]:
("cNMF_"+ cluster_assignment.astype(str)).to_csv("/add/path/here/cNMF_program_assignment_cluster.csv")

# Go back to the original data

In [None]:
sc.pp.normalize_total(subadata, target_sum=10000)
sc.pp.log1p(subadata)

In [None]:
sc.tl.pca(subadata)
sc.pp.neighbors(subadata)
sc.tl.umap(subadata)

In [None]:
fig = sc.pl.umap(subadata, color=["pid"], palette=colormapping_pat_bis, frameon=False, return_fig=True)

fig.axes[0].set_title('Patient ID', fontsize=18)
fig.axes[0].legend(fontsize=15, frameon=False, bbox_to_anchor=(1,1,-0.5,0))
fig.savefig("figures/malignant/unintegrated_sampleid_umap.png", dpi=300, bbox_inches="tight")

In [None]:
itay_MPs = pd.read_csv("/add/path/here/ItayTiroshHeterogeneityMPs.csv")

In [None]:
sorted_gm = sorted(list(marker_genes))
mp_similarities = pd.DataFrame(np.zeros((len(sorted_gm),itay_MPs.shape[1])), 
                               columns=itay_MPs.columns, 
                               index=[f"cNMF_{cl}" for cl in sorted_gm])
for cl in sorted_gm:
    for mp in itay_MPs.columns:
        available = len(np.intersect1d(itay_MPs.loc[:,mp].ravel(),marker_genes[cl].index))
        inter = len(np.intersect1d(marker_genes[cl].head(50).index.ravel(),
                                                    itay_MPs[mp].ravel()))/available
        if inter>0:
            print(cl, mp)
            print(np.intersect1d(marker_genes[cl].head(50).index.ravel(),
                                                    itay_MPs[mp].ravel()))
        mp_similarities.loc[f"cNMF_{cl}",mp] = inter

In [None]:
# Overlap heatmap, rows = aggregated programs, columns = reference meta-programs.
fig = sns.clustermap(data=mp_similarities, row_cluster=False, col_cluster=False,
                    cmap="vlag", vmin=0, vmax=0.2, center=0.01,
                    figsize=(12,4))
# Build the tick labels from the actual row names ("cNMF_<id>" -> "cNMF$_{<id>}$")
# so they always match the rows, whatever the number of clusters.
mp_row_labels = [f"cNMF$_{{{name.split('_', 1)[1]}}}$" for name in mp_similarities.index]
fig.ax_heatmap.set_yticks(fig.ax_heatmap.get_yticks(), mp_row_labels)
fig.savefig("figures/malignant/heatmap_itay_program_comparison.png", dpi=200, bbox_inches="tight")

In [None]:
from gseapy import gseaplot

In [None]:
# Enrichment curves to export: cluster id -> (hallmark term, output file).
enrichment_plots = {
    3: ("HALLMARK_EPITHELIAL_MESENCHYMAL_TRANSITION", 'figures/cNMF_3_EMT.pdf'),
    2: ("HALLMARK_G2M_CHECKPOINT", 'figures/cNMF_2_G2M.pdf'),
}

# Pre-ranked GSEA of every aggregated program against the hallmark gene sets,
# using the median gene scores as the ranking metric.
gsea_results_scanpy = {}
for cl in sorted(marker_genes):
    df = marker_genes[cl].to_frame()
    pre_res = gp.prerank(
        rnk=df,
        gene_sets='/add/path/here/h.all.v7.4.symbols.gmt',
        min_size=5,
        max_size=1000,
        permutation_num=1000,  # lower this to speed up test runs
        outdir=None,  # keep results in memory only
        seed=6,
        verbose=True,
    )

    # Results ordered by decreasing NES and tagged with the cluster they belong to.
    ranked_terms = pre_res.res2d.sort_values(by="nes", ascending=False)
    ranked_terms["cluster"] = cl
    gsea_results_scanpy[cl] = ranked_terms

    if cl in enrichment_plots:
        plot_term, plot_file = enrichment_plots[cl]
        gseaplot(rank_metric=pre_res.ranking,
                 term=plot_term,
                 ofname=plot_file, **pre_res.results[plot_term])

In [None]:
# Stack the per-cluster GSEA tables (each row carries its "cluster" id).
gsea_df = pd.concat(list(gsea_results_scanpy.values()))

colorlist = palettable.colorbrewer.qualitative.Dark2_8.mpl_colors

# Manual grouping of hallmark terms (term name without the "HALLMARK_" prefix,
# underscores replaced by spaces) into broad categories.
# NOTE(review): 48 terms are listed; a significant hallmark term absent from this
# table (e.g. APICAL SURFACE or KRAS SIGNALING DN - confirm against the .gmt file)
# would end up without a group and without a row colour.
hallmark_classif = {"APICAL JUNCTION": "Cellular component", "PEROXISOME": "Cellular component", 
                    "ANDROGEN RESPONSE": "Signaling", "ESTROGEN RESPONSE EARLY": "Signaling", "ESTROGEN RESPONSE LATE": "Signaling",
                    "HEDGEHOG SIGNALING": "Signaling", "IL2 STAT5 SIGNALING": "Signaling", "MTORC1 SIGNALING": "Signaling", 
                    "NOTCH SIGNALING": "Signaling", "TGF BETA SIGNALING": "Signaling", "PI3K AKT MTOR SIGNALING": "Signaling",
                    "TNFA SIGNALING VIA NFKB": "Signaling", "WNT BETA CATENIN SIGNALING": "Signaling", 
                    "IL6 JAK STAT3 SIGNALING": "Signaling", "KRAS SIGNALING UP": "Signaling", 
                    "DNA REPAIR": "DNA damage", "UV RESPONSE DN": "DNA damage", "UV RESPONSE UP": "DNA damage", 
                    "E2F TARGETS": "Proliferation", "APOPTOSIS": "Proliferation", 
                    "G2M CHECKPOINT": "Proliferation", "MITOTIC SPINDLE": "Proliferation", "MYC TARGETS V1": "Proliferation", 
                    "MYC TARGETS V2": "Proliferation", "P53 PATHWAY": "Proliferation", "CHOLESTEROL HOMEOSTASIS": "Metabolic", 
                    "FATTY ACID METABOLISM": "Metabolic", "GLYCOLYSIS": "Metabolic", "OXIDATIVE PHOSPHORYLATION": "Metabolic", 
                    "BILE ACID METABOLISM": "Metabolic", "XENOBIOTIC METABOLISM": "Metabolic", "HEME METABOLISM": "Metabolic",
                    "ALLOGRAFT REJECTION": "Immune", "COAGULATION": "Immune", "INTERFERON ALPHA RESPONSE": "Immune", 
                    "COMPLEMENT": "Immune", "INTERFERON GAMMA RESPONSE": "Immune", "INFLAMMATORY RESPONSE": "Immune", 
                    "ADIPOGENESIS": "Development", "ANGIOGENESIS": "Development", 
                    "EPITHELIAL MESENCHYMAL TRANSITION": "Development",
                    "PANCREAS BETA CELLS": "Development", "SPERMATOGENESIS": "Development",
                    "HYPOXIA": "Other", "MYOGENESIS": "Development", 
                    "PROTEIN SECRETION": "Other", "UNFOLDED PROTEIN RESPONSE": "Other", "REACTIVE OXYGEN SPECIES PATHWAY": "Other"}

hallmark_classif = pd.DataFrame(hallmark_classif, index=["Hallmark gr."]).T

# One palette colour per group, in order of first appearance in the table above.
colormapping_hallmark = {gr: colorlist[i] for i,gr in enumerate(hallmark_classif["Hallmark gr."].unique())}

# Keep only the enrichments passing the FDR cut-off.
sign_df = gsea_df[gsea_df["fdr"]<0.05].copy()

# term -> one-column frame of NES indexed by cluster. A term significant in several
# clusters appears several times in the index, so iterate over unique terms only.
nes_per_term = {}
for term in sign_df.index.unique():
    termdf = sign_df.loc[[term]].replace({np.inf: 0})
    nes_per_term[term] = termdf[["nes","cluster"]].set_index("cluster")

# pd.concat(axis=1) does not sort the joined cluster index, so without the reindex the
# heatmap columns come out in order of first appearance and can be mislabelled by the
# fixed cNMF_1..cNMF_5 tick labels. Reindexing on the sorted cluster ids fixes the
# order and keeps a (zero-filled) column for clusters without any significant term.
all_clusters = sorted(gsea_df["cluster"].unique())
gsea_heatmap_df = pd.concat(nes_per_term,axis=1).reindex(all_clusters).fillna(0).T
# Row labels: drop the "nes" level, turn underscores into spaces and strip the
# 9-character "HALLMARK " prefix so the names match the keys of hallmark_classif.
gsea_heatmap_df.index = gsea_heatmap_df.index.droplevel(1).str.replace("_"," ").str[9:]

# Attach the group of every term and order the rows by group.
gsea_heatmap_df = pd.concat([gsea_heatmap_df, hallmark_classif],axis=1).sort_values(by="Hallmark gr.")

In [None]:
# NES heatmap (rows = hallmark terms grouped by category, columns = aggregated programs).
heatmap_values = gsea_heatmap_df.drop("Hallmark gr.",axis=1)
# Direct dictionary lookup of the group colour; replaces the regex-based
# `replace(..., regex=True)` that was being used as a lookup.
hallmark_row_colors = gsea_heatmap_df["Hallmark gr."].map(colormapping_hallmark)
fig = sns.clustermap(data=heatmap_values, cmap="vlag", row_cluster=False,
               col_cluster=False,
               row_colors=hallmark_row_colors,
                     figsize=(2,13))
# Label every column with the cluster id it actually holds, so the labels stay
# correct whatever the order or number of columns in the table.
cluster_labels = [f"cNMF$_{{{cl}}}$" for cl in heatmap_values.columns]
fig.ax_heatmap.set_xticks(fig.ax_heatmap.get_xticks(), cluster_labels, rotation=90, ha="center")
fig.savefig("figures/malignant/cNMF_GSEA_results.svg", dpi=200, bbox_inches="tight")