In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import signaturescoring as ssc

In [None]:
import pathlib as pl

In [None]:
from tqdm.notebook import tqdm

In [None]:
import gseapy as gp

In [None]:
# Load the full cohort and attach the refined per-cell annotation.
adata = sc.read_h5ad("/add/path/here/full_cohort.h5ad")

refined_annotations = pd.read_csv("../refined_annotations.csv", index_col=0)

# `.loc[adata.obs_names]` raises a KeyError if a cell is missing from the CSV, so the
# values come back in AnnData order. `to_numpy()` replaces the deprecated `Series.ravel()`.
adata.obs["refined_annotation"] = refined_annotations.loc[adata.obs_names, "Annotation"].to_numpy()

In [None]:
# Clinical sheet, indexed by its first column (used below as the sample id).
clinical = pd.read_csv("/add/path/here/EAC_clinical_info.csv",index_col=0)
# NOTE(review): `treatment_mapping` is not referenced anywhere else in the visible notebook.
treatment_mapping = {"Neoadjuvant CROSS": "Neoadj. chemo", "Neoadjuvent carboplatin": "Neoadj. chemo"}
# NOTE(review): "Yes " is compared with its trailing space - presumably how the raw sheet
# encodes it; confirm.
metastatic = (clinical["Tumor?"]=="Yes ") & (clinical["Site"].str.contains("metastasis"))
metastatic.name = "Metastatic?"
clinical["Metastatic?"] = metastatic

# Collapse the free-text site into a handful of locations.
clinical["Location"] = clinical["Site"].replace({"GEJ": "Esophagus/GEJ",
                          "Esophagus": "Esophagus/GEJ"})
# Assign through a single `.loc` call: the chained `clinical["Location"][mask] = ...` form
# triggers SettingWithCopyWarning and has no effect under pandas copy-on-write.
# `na=False` keeps the mask strictly boolean when a site is missing.
for pattern, label in (("Liver", "Liver"),
                       ("Adrenal", "Adrenal gland"),
                       ("Peritoneal", "Peritoneum")):
    clinical.loc[clinical["Location"].str.contains(pattern, na=False), "Location"] = label

clinical["Stage"] = clinical["Grade/stage"].replace({"Stage IV ": "IV", "Stage IV": "IV",
                                                     "Moderately differentiated; ypT1aN0": "I",
                                                     "Moderately differentiated; pT1aN0": "I",
                                                     "Poorly differentiated; ypT2N0": "II",
                                                     "Presented with stage III became stage IV during esophagectomy when pleural metastases were identified": "III/IV"})

# Hand-curated treatment labels, matched to the rows by position (file order, before the
# sort below). The assignment raises if the list length differs from the number of rows.
clinical["Treatment"] = ["Neoadj. chemo",
                         "None",
                         "Neoadj. chemo + ICI + RT",
                         "None",
                         "None",
                         "Chemo + HER2 targeted + ICI",
                         "Neoadj. chemo + HER2 targeted",
                         "Neoadj. chemo + ICI",
                         "None",
                         "Neoadj. chemo + VEGFR2i"]

clinical["HER2 status"] = clinical["HER2"].replace({"HER 2 1+": "1+/equivocal"})

clinical = clinical.sort_values(by=["Tumor?","Metastatic?","Location"])

In [None]:
# Lookup tables (clinical index -> value) built from the clinical sheet.
location_map = clinical["Location"].to_dict()
treatment_map = clinical["Treatment"].to_dict()
metastatic_map = clinical["Metastatic?"].to_dict()

# Broadcast each clinical attribute onto the cells through their sample id.
# `replace` leaves sample ids that are absent from a lookup table unchanged.
for obs_column, sample_lookup in (("Location", location_map),
                                  ("Treatment", treatment_map),
                                  ("Metastatic", metastatic_map)):
    adata.obs[obs_column] = adata.obs.sample_id.replace(sample_lookup)


def _naive_flag(treatment):
    """Return "Yes" when the treatment label is the string "None", otherwise "No"."""
    return "Yes" if treatment=="None" else "No"


adata.obs["Treatment Naïve"] = adata.obs.Treatment.apply(_naive_flag)

# Save the adatas for cNMF

In [None]:
# Gene-name prefixes to discard: AC0..AC9, AL0..AL9, LINC and MT-.
toremove = [f"{prefix}{digit}" for prefix in ("AC", "AL") for digit in range(10)]
toremove.extend(["LINC", "MT-"])

In [None]:
# Keep only the genes whose name starts with none of the `toremove` prefixes.
unwanted_gene = adata.var_names.str.startswith(tuple(toremove))
adata = adata[:, ~unwanted_gene].copy()

In [None]:
subadata = adata[adata.obs.refined_annotation=="Carcinoma"].copy()

In [None]:
subadata.X = subadata.layers["counts"].copy()

In [None]:
subadata.write("subadata_cNMF.h5ad")

In [None]:
for sample in subadata.obs.sample_id.unique():
    patadata = subadata[subadata.obs.sample_id==sample].copy()
    patadata.write(f"{sample}_subadata_cNMF.h5ad")

# Per patient

In [None]:
from cnmf import cNMF

In [None]:
# Display the sample ids present among the carcinoma cells.
subadata.obs.sample_id.unique()

## Aguirre_EGSFR1982

In [None]:
sample = "Aguirre_EGSFR1982"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR2218

In [None]:
sample = "Aguirre_EGSFR2218"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 4
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## CCG1153_4411

In [None]:
sample = "CCG1153_4411"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR1938

In [None]:
sample = "Aguirre_EGSFR1938"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR0074

In [None]:
sample = "Aguirre_EGSFR0074"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR0128

In [None]:
sample = "Aguirre_EGSFR0128"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR1732

In [None]:
sample = "Aguirre_EGSFR1732"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 5
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

## Aguirre_EGSFR0148

In [None]:
sample = "Aguirre_EGSFR0148"

sample_file = f"{sample}_subadata_cNMF.h5ad"

cnmf_obj = cNMF(output_dir="./cNMF_malignant_per_patient/", name=sample)

In [None]:
cnmf_obj.prepare(counts_fn=sample_file, components=np.arange(2,11),n_iter=20, seed=14, num_highvar_genes=2000)

In [None]:
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
cnmf_obj.combine()

In [None]:
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
selected_K = 4
density_threshold = 0.1

In [None]:
cnmf_obj.consensus(k=selected_K, density_threshold=density_threshold, show_clustering=True, close_clustergram_fig=False)

In [None]:
hvgs = open(f'./cNMF_malignant_per_patient/{sample}/{sample}.overdispersed_genes.txt').read().split('\n')

In [None]:
patadata = sc.read_h5ad(sample_file)

In [None]:
sc.pp.normalize_total(patadata, target_sum=10000)
sc.pp.log1p(patadata)

In [None]:
patadata = patadata[:,hvgs].copy()

In [None]:
sc.tl.pca(patadata)
sc.pp.neighbors(patadata)
sc.tl.umap(patadata)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
patadata.obs = pd.concat([patadata.obs, usage_norm],axis=1)

In [None]:
sc.pl.umap(patadata, color=usage_norm.columns,
           ncols=3, vmin=0, vmax=1)

In [None]:
topgenes.head(20)

In [None]:
patadata.obs["GEP"] = usage_norm.idxmax(axis=1)

In [None]:
sc.pl.umap(patadata, color="GEP",)

# Aggregate programs

In [None]:
import seaborn as sns 
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances, euclidean_distances

In [None]:
import scipy

In [None]:
# Display the sample ids present in the full cohort (the keys needed in `colormapping_pat`).
adata.obs.sample_id.unique()

In [None]:
import palettable
# Program colours: one Set1 colour per cNMF_1..cNMF_5, grey for the two catch-all labels.
colorlist = palettable.colorbrewer.qualitative.Set1_5.mpl_colors
colormapping_mal = {f"cNMF_{rank}": colour for rank, colour in enumerate(colorlist[:5], start=1)}
colormapping_mal.update({"cNMF_Outlier": "grey", "Carcinoma_undefined": "grey"})

# Sample colours: the first eight samples take Dark2, the last two take Paired.
colorlist = palettable.colorbrewer.qualitative.Dark2_8.mpl_colors
colorlistbis = palettable.colorbrewer.qualitative.Paired_3.mpl_colors
_sample_order = ["Aguirre_EGSFR1982",
                 "Aguirre_EGSFR2218",
                 "CCG1153_4411",
                 "Aguirre_EGSFR1938",
                 "Aguirre_EGSFR0074",
                 "Aguirre_EGSFR0128",
                 "Aguirre_EGSFR1732",
                 "Aguirre_EGSFR0148",
                 "CCG1153_4496262",
                 "CCG1153_6640539"]
colormapping_pat = dict(zip(_sample_order, list(colorlist[:8]) + list(colorlistbis[:2])))

In [None]:
# Folder holding one sub-folder of cNMF results per sample (placeholder path to be filled in).
program_dir = pl.Path("/add/path/here/cNMF_malignant_per_patient/")

In [None]:
# Collect, for every per-sample cNMF folder, the gene spectra score tables (transposed on
# load) and the usage tables (each row rescaled to sum to 1). Columns are prefixed
# with the sample folder name so programs stay distinguishable once concatenated.
# NOTE: `iterdir()` yields entries in filesystem order, which is not guaranteed stable.
program_genes = []
usages = []
for sample in program_dir.iterdir():
    # Skip stray files (e.g. .DS_Store): calling `iterdir()` on them would raise.
    if not sample.is_dir():
        continue
    print("_________")
    print(sample.stem)
    print("_________")
    for f in sample.iterdir():
        # Only regular files can be result tables; sub-folders are ignored.
        if not f.is_file():
            continue
        if "gene_spectra_score" in f.stem:
            df = pd.read_csv(f, index_col=0, sep="\t").T
            df.columns = f"{sample.stem}_" + df.columns.astype(str)
            program_genes.append(df)
        if "usages" in f.stem:
            df = pd.read_csv(f, index_col=0, sep="\t")
            df.columns = f"{sample.stem}_" + df.columns.astype(str)
            # Divide every row by its own sum.
            df = df.div(df.sum(axis=1), axis=0)
            usages.append(df)

In [None]:
# Per-sample program numbers excluded from the cross-sample clustering.
_excluded_program_ids = {
    "Aguirre_EGSFR1982": (3, 4, 5),
    "Aguirre_EGSFR2218": (3, 4),
    "CCG1153_4411": (5,),
    "Aguirre_EGSFR1938": (5,),
    "Aguirre_EGSFR0074": (4, 5),
    "Aguirre_EGSFR0128": (3, 4, 5),
    "Aguirre_EGSFR1732": (4, 5),
    "Aguirre_EGSFR0148": (4,),
}
# Flatten to "<sample>_<program number>", the column naming used for the program tables.
programs_to_remove = [f"{sample_name}_{program_id}"
                      for sample_name, program_ids in _excluded_program_ids.items()
                      for program_id in program_ids]

In [None]:
full_programs = pd.concat(program_genes,axis=1).drop(programs_to_remove,axis=1)

In [None]:
cossim = pd.DataFrame(cosine_similarity(full_programs.T.fillna(0)),index=full_programs.columns,columns=full_programs.columns)

In [None]:
ax = sns.clustermap(data=cossim, cmap="vlag", 
               center=0., method="average", )

In [None]:
# Cut the column dendrogram of the clustermap into at most 5 flat clusters.
lnkg = ax.dendrogram_col.linkage

clusters = scipy.cluster.hierarchy.fcluster(lnkg, t=5, criterion="maxclust")

# `to_numpy()` replaces the deprecated `Series.ravel()`.
row_programs = ("cNMF_" + pd.Series(clusters, index=full_programs.columns).astype(str)).to_numpy()
# Program columns are named "<sample>_<program number>": dropping the last "_" field
# recovers the sample id.
pats = list(full_programs.columns.str.split("_").str[:-1])
row_pats = ["_".join(pat) for pat in pats]

# Two annotation strips per row: cluster colour, then sample colour.
row_colors = [
    [colormapping_mal[prog] for prog in row_programs],
    [colormapping_pat[pat] for pat in row_pats],
]

In [None]:
# Create the output folder if needed; `savefig` does not create missing directories.
pl.Path("figures/malignant").mkdir(parents=True, exist_ok=True)
# Same clustermap as above, now with the cluster/sample colour strips and no tick labels.
fig = sns.clustermap(data=cossim, cmap="vlag",
               center=0., method="average", row_colors=row_colors, xticklabels=False, yticklabels=False)
fig.savefig("figures/malignant/clustermap_cnmf.svg", dpi=300, bbox_inches="tight")

In [None]:
cluster_assignment = pd.Series(clusters, index=full_programs.columns)

In [None]:
cell_assignment = []
for i in range(len(usages)):
    cell_assignment.append(usages[i].idxmax(axis=1))

cell_assignment = pd.concat(cell_assignment)

outliers = cell_assignment[~cell_assignment.isin(cluster_assignment.index)].index

cell_assignment.loc[outliers] = "Outlier"

cell_assignment = cell_assignment.replace(cluster_assignment.to_dict())
cell_assignment = cell_assignment.to_frame()

cell_assignment.columns = ["cNMF_program"]

In [None]:
cell_confidence = []
for i in range(len(usages)):
    cell_confidence.append(usages[i].max(axis=1))

cell_confidence = pd.concat(cell_confidence).to_frame()
cell_confidence.columns = ["cNMF_confidence"]

In [None]:
df = adata.obs.copy()

In [None]:
df.refined_annotation.loc[cell_assignment.index] =  "cNMF_" + cell_assignment.cNMF_program.astype(str).ravel()

In [None]:
df.refined_annotation.to_csv("../refined_wCNM_programs_new.csv")

In [None]:
marker_genes = {}
for cl in cluster_assignment.unique():
    
    sigs = cluster_assignment[cluster_assignment==cl].index
    marker_genes[cl] = full_programs[sigs].median(axis=1).sort_values(ascending=False)
    marker_genes[cl] = marker_genes[cl].loc[~marker_genes[cl].index.str.startswith(("MT-","RPS","RPL"))]

In [None]:
for cl in marker_genes:
    marker_genes[cl].to_csv(f"/add/path/here/cNMF_{cl}.csv")

# Go back to original data

In [None]:
sc.pp.normalize_total(subadata, target_sum=10000)
sc.pp.log1p(subadata)

In [None]:
if "cNMF_program" in subadata.obs.columns:
    subadata.obs = subadata.obs.drop("cNMF_program",axis=1)
if "cNMF_confidence" in subadata.obs.columns:
    subadata.obs = subadata.obs.drop("cNMF_confidence",axis=1)

subadata.obs = pd.concat([subadata.obs, cell_assignment.astype(str).astype('category')],axis=1)
subadata.obs = pd.concat([subadata.obs, cell_confidence],axis=1)

subadata.obs["Low confidence"] = (subadata.obs.cNMF_confidence<0.5).astype(int)

In [None]:
subadata = subadata[~subadata.obs.cNMF_program.isna()].copy()

In [None]:
subadata.obs.cNMF_program = "cNMF_" + subadata.obs.cNMF_program.astype(str)

In [None]:
sc.tl.pca(subadata)
sc.pp.neighbors(subadata)
sc.tl.umap(subadata)

In [None]:
fig = sc.pl.umap(subadata, color=["sample_id"], palette=colormapping_pat, frameon=False, return_fig=True)
fig.savefig("figures/malignant/unintegrated_sampleid_umap.png", dpi=300, bbox_inches="tight")

In [None]:
itay_MPs = pd.read_csv("/add/path/here/ItayTiroshHeterogeneityMPs.csv")

In [None]:
sorted_gm = sorted(list(marker_genes))
mp_similarities = pd.DataFrame(np.zeros((len(sorted_gm),itay_MPs.shape[1])), 
                               columns=itay_MPs.columns, 
                               index=[f"cNMF_{cl}" for cl in sorted_gm])
for cl in sorted_gm:
    for mp in itay_MPs.columns:
        available = len(np.intersect1d(itay_MPs.loc[:,mp].ravel(),marker_genes[cl].index))
        inter = len(np.intersect1d(marker_genes[cl].head(50).index.ravel(),
                                                    itay_MPs[mp].ravel()))/available
        if inter>0:
            print(cl, mp)
            print(np.intersect1d(marker_genes[cl].head(50).index.ravel(),
                                                    itay_MPs[mp].ravel()))
        mp_similarities.loc[f"cNMF_{cl}",mp] = inter

In [None]:
# Overlap heatmap with rows and columns kept in their given order (no clustering).
heatmap_style = dict(cmap="vlag", vmin=0, vmax=0.2, center=0.01)
fig = sns.clustermap(mp_similarities, row_cluster=False, col_cluster=False,
                     figsize=(12,4), **heatmap_style)
fig.savefig("figures/malignant/heatmap_itay_program_comparison.png", dpi=200, bbox_inches="tight")

In [None]:
# Score every cluster signature (its 100 top-ranked genes) in each carcinoma cell;
# the result is stored in obs under "cNMF_<cluster>_score".
for prog, ranked_genes in marker_genes.items():
    signature = ranked_genes.index[:100].tolist()
    ssc.score_signature(adata=subadata,
                        gene_list=signature,
                        method="adjusted_neighborhood_scoring",
                        ctrl_size=150,
                        score_name=f"cNMF_{prog}_score")

In [None]:
import numpy as np
from sklearn.mixture import GaussianMixture

df = subadata.obs[["cNMF_1_score","cNMF_3_score","cNMF_4_score"]].copy()
df = (df - df.mean())/df.std()
gm = GaussianMixture(n_components=3, random_state=0).fit(df)
gm.means_

labels = gm.fit_predict(df)
confidence = gm.predict_proba(df).max(axis=1)

gmm_scores = pd.concat([df,
           pd.DataFrame(labels, index=df.index, columns=["GMM label"]),
           pd.DataFrame(confidence, index=df.index, columns=["GMM confidence"])],axis=1)

gmm_scores["Corrected label"] = gmm_scores["GMM label"]
gmm_scores.loc[gmm_scores["GMM confidence"]<0.8,"Corrected label"] = 3

%matplotlib inline
y = gmm_scores["Corrected label"].ravel()

Xax = df.iloc[:,0]
Yax = df.iloc[:,1]
Zax = df.iloc[:,2]

cdict = {0:'red',1:'green',2:"blue",3:"grey"}
labl =  gmm_scores.groupby(by="GMM label").mean().drop(["GMM confidence","Corrected label"],axis=1).idxmax(axis=1).str.rstrip("_score").to_dict()
labl[3] = "Carcinoma_undefined"
marker = {0:'*',1:'o',2:"x",3:"s"}
alpha = {0:.1, 1:.1, 2:.3,3:.1}

In [None]:
# 3D scatter of the z-scored program scores, one marker style per corrected label,
# viewed from elevation 60 / azimuth 35.
fig = plt.figure(figsize=(7,5))
ax = fig.add_subplot(111, projection='3d')

fig.patch.set_facecolor('white')
for l in np.unique(y):
    # `ix` holds row positions, so index with `.iloc`: the score Series carry the obs
    # index, and plain `Series[int_array]` on a non-integer index is deprecated.
    ix=np.where(y==l)[0]
    ax.scatter(Xax.iloc[ix], Yax.iloc[ix], Zax.iloc[ix], c=cdict[l], s=40,
           label=labl[l], marker=marker[l], alpha=alpha[l])
ax.set_xlabel("cNMF_1", fontsize=14)
ax.set_ylabel("cNMF_3", fontsize=14)
ax.set_zlabel("cNMF_4", fontsize=14)

ax.legend()
ax.view_init(60, 35)
fig.tight_layout()

In [None]:
# Same 3D scatter as the previous cell, with matplotlib's default viewing angle.
fig = plt.figure(figsize=(7,5))
ax = fig.add_subplot(111, projection='3d')

fig.patch.set_facecolor('white')
for l in np.unique(y):
    # `ix` holds row positions, so index with `.iloc`: the score Series carry the obs
    # index, and plain `Series[int_array]` on a non-integer index is deprecated.
    ix=np.where(y==l)[0]
    ax.scatter(Xax.iloc[ix], Yax.iloc[ix], Zax.iloc[ix], c=cdict[l], s=40,
           label=labl[l], marker=marker[l], alpha=alpha[l])
ax.set_xlabel("cNMF_1", fontsize=14)
ax.set_ylabel("cNMF_3", fontsize=14)
ax.set_zlabel("cNMF_4", fontsize=14)

ax.legend()
fig.tight_layout()

In [None]:
gmm_scores[["cNMF_1_score","cNMF_3_score","cNMF_4_score","GMM label"]].groupby(by="GMM label").mean()

In [None]:
subadata.obs = pd.concat([subadata.obs,gmm_scores["Corrected label"].replace(labl)],axis=1)

In [None]:
# Within each sample, labels carried by fewer than 25 cells are folded into
# "Carcinoma_undefined".
cell_idx = []
df = subadata.obs[["sample_id","Corrected label"]]
for sample_name in df.sample_id.unique():
    subdf = df.loc[df.sample_id==sample_name]
    vc = subdf["Corrected label"].value_counts()
    # Named `rare_labels` so the global `toremove` gene-prefix list is not overwritten.
    rare_labels = vc[vc<25].index.to_numpy()
    if len(rare_labels)>0:
        cell_idx.append(subdf[subdf["Corrected label"].isin(rare_labels)].index.to_numpy())

# `np.hstack` raises on an empty list, i.e. when no sample has a rare label.
if cell_idx:
    cell_idx = np.hstack(cell_idx)
    subadata.obs.loc[cell_idx,"Corrected label"] = "Carcinoma_undefined"
else:
    cell_idx = np.array([], dtype=object)

In [None]:
fig = sc.pl.umap(subadata, color=["Corrected label"], palette=colormapping_mal, frameon=False, return_fig=True)
fig.savefig("figures/malignant/unintegrated_cNMF_umap.png", dpi=300, bbox_inches="tight")

In [None]:
fig, axs = plt.subplots(3,2, figsize=(10,10))
flatax = axs.flatten()
for i,ax in enumerate(flatax[:-1]):
    sns.boxplot(data=subadata.obs, x="Corrected label", y=f"cNMF_{i+1}_score", palette=colormapping_mal, ax=flatax[i])
    flatax[i].spines[['right', 'top']].set_visible(False)
    flatax[i].set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
    flatax[i].hlines(y=0, xmin=flatax[i].get_xlim()[0], xmax=flatax[i].get_xlim()[1], linestyles="dashed", color="grey")
    flatax[i].set_xlabel("")
flatax[-1].axis("off")
fig.tight_layout()
fig.savefig("figures/malignant/boxplot_cNMF_score_per_group.png", dpi=250, bbox_inches="tight")

In [None]:
df = subadata.obs[["cNMF_1_score","cNMF_3_score","cNMF_4_score"]]

g = sns.PairGrid(df, diag_sharey=False, corner=True)
g.map_lower(sns.kdeplot)
g.map_diag(sns.histplot)
g.fig.savefig("figures/malignant/cNMF_pairgrid_relbetweenscore.png", dpi=300, bbox_inches="tight")

In [None]:
# Pre-ranked GSEA on the ranked marker list of every cluster.
gsea_results_scanpy = {}
for cl in sorted(subadata.obs.cNMF_program.unique()):
    # Labels were prefixed with "cNMF_" earlier in the notebook, whereas `marker_genes`
    # is keyed by the integer cluster id: strip the prefix before the lookup.
    program_id = cl[len("cNMF_"):] if cl.startswith("cNMF_") else cl
    if program_id=="Outlier":
        continue

    df = marker_genes[int(program_id)].to_frame()
    pre_res = gp.prerank(rnk=df, # or rnk = rnk,
                         # NOTE(review): this argument was truncated to a stray `s` (a SyntaxError).
                         # The plotting helper below talks about "hallmarks", so the Enrichr
                         # hallmark library is assumed here - confirm the intended gene sets.
                         gene_sets="MSigDB_Hallmark_2020",
                         threads=4,
                         min_size=5,
                         max_size=1000,
                         permutation_num=1000, # reduce number to speed up testing
                         outdir=None, # don't write to disk
                         seed=6,
                         verbose=True, # see what's going on behind the scenes
                        )

    gsea_results_scanpy[cl] = pre_res.res2d.sort_values(by="NES",ascending=False)
    gsea_results_scanpy[cl]["cluster"] = cl

In [None]:
# Stack the per-cluster result tables and index them by gene-set term
# (a term appears once per cluster it was tested in).
gsea_df = pd.concat(list(gsea_results_scanpy.values())).set_index("Term")

In [None]:
def gsea_df_plot(gsea_df):
    """Draw a term x cluster heatmap of the significantly enriched gene sets.

    A (term, cluster) pair is shown when its "FWER p-val" is below 0.05 and its
    "NES" is at least 2; all other cells of the heatmap are masked.

    Parameters
    ----------
    gsea_df : pandas.DataFrame
        Indexed by gene-set term, with columns "NES", "FWER p-val" and "cluster".

    Returns
    -------
    matplotlib.figure.Figure
        Figure holding the annotated heatmap.

    Raises
    ------
    ValueError
        If no (term, cluster) pair passes the filter, or if a (term, cluster)
        pair occurs more than once among the significant rows.
    """
    ms_names = gsea_df["cluster"].unique()

    # Work on a positional long table: the term index holds duplicates (one row per
    # cluster), and the statistics are coerced to numeric in case they are stored as objects.
    long_table = pd.DataFrame({
        "term": gsea_df.index.to_numpy(),
        "cluster": gsea_df["cluster"].to_numpy(),
        "NES": pd.to_numeric(gsea_df["NES"], errors="coerce").to_numpy(),
        "fwer": pd.to_numeric(gsea_df["FWER p-val"], errors="coerce").to_numpy(),
    })
    significant = long_table[(long_table["fwer"]<0.05) & (long_table["NES"]>=2)]
    if significant.empty:
        raise ValueError("No gene set passes FWER p-val < 0.05 and NES >= 2; nothing to plot.")

    # Pivot once instead of concatenating one row per term; terms come out sorted, and every
    # cluster keeps its column even when it has no significant term.
    gsea_heatmap_df = (significant.pivot(index="term", columns="cluster", values="NES")
                       .reindex(columns=ms_names)
                       .rename_axis(index=None, columns=None))
    sign_hallmarks = gsea_heatmap_df.index

    fig, ax = plt.subplots(1,1,figsize=(len(ms_names),len(sign_hallmarks)))

    # Annotate with the rounded NES; masked (missing) cells get an empty string.
    annot = gsea_heatmap_df.round(2).astype(str).where(gsea_heatmap_df.notna(), "")

    sns.heatmap(gsea_heatmap_df.fillna(0),cmap="vlag",center=2.,vmax=2.5,mask=gsea_heatmap_df.isnull(),annot=annot,fmt="",ax=ax)
    ax.set_xticklabels(ax.get_xticklabels(),rotation=45,horizontalalignment="right")
    return fig

In [None]:
# Heatmap of the significant gene sets across clusters.
fig = gsea_df_plot(gsea_df)

In [None]:
# Fraction of each corrected label within every sample.
# Only `.obs` is read, so the AnnData object itself is no longer copied.
patlevel_counts = subadata.obs[["sample_id","Corrected label"]]

patlevel_counts = patlevel_counts.groupby(by="sample_id").value_counts(normalize=True)

df = patlevel_counts.unstack(level=-1)
# Order the samples as in the sorted clinical table.
df = df.loc[clinical.index.intersection(df.index)]

fig, ax = plt.subplots(1,1,figsize=(6,3))
df.plot (kind = 'bar', stacked = True, ax=ax, color=colormapping_mal)
ax.legend(bbox_to_anchor=(1.05, 1), frameon=False)
ax.spines[['right', 'top']].set_visible(False)
ax.set_xlabel("")
ax.set_title("Patient distribution")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.savefig("figures/malignant/barplot_pat_vs_cNMF_distribution.svg", dpi=200, bbox_inches="tight")

In [None]:
# Fraction of each sample within every corrected label.
# Only `.obs` is read, so the AnnData object itself is no longer copied.
patlevel_counts = subadata.obs[["sample_id","Corrected label"]]
patlevel_counts = patlevel_counts.groupby(by="Corrected label").value_counts(normalize=True)

df = patlevel_counts.unstack(level=-1)

fig, ax = plt.subplots(1,1,figsize=(6,3))
df.plot (kind = 'bar', stacked = True, ax=ax, color=colormapping_pat)
ax.legend(bbox_to_anchor=(1.05, 1), frameon=False)
ax.spines[['right', 'top']].set_visible(False)
ax.set_xlabel("")
ax.set_title("Patient distribution")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.savefig("figures/malignant/barplot_cNMF_vs_pat_distribution.svg", dpi=200, bbox_inches="tight")

In [None]:
# Fraction of each corrected label within every location.
# Only `.obs` is read, so the AnnData object itself is no longer copied.
patlevel_counts = subadata.obs[["Location","Corrected label"]]
patlevel_counts = patlevel_counts.groupby(by="Location").value_counts(normalize=True)

df = patlevel_counts.unstack(level=-1)

fig, ax = plt.subplots(1,1,figsize=(6,3))
df.plot (kind = 'bar', stacked = True, ax=ax, color=colormapping_mal)
ax.legend(bbox_to_anchor=(1.05, 1), frameon=False)
ax.spines[['right', 'top']].set_visible(False)
ax.set_xlabel("")
ax.set_title("Location")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.savefig("figures/malignant/barplot_cNMF_vs_location_distribution.svg", dpi=200, bbox_inches="tight")

In [None]:
# Fraction of each corrected label by metastatic status.
# Only `.obs` is read, so the AnnData object itself is no longer copied.
patlevel_counts = subadata.obs[["Metastatic","Corrected label"]]
patlevel_counts = patlevel_counts.groupby(by="Metastatic").value_counts(normalize=True)

df = patlevel_counts.unstack(level=-1)

fig, ax = plt.subplots(1,1,figsize=(6,3))
df.plot (kind = 'bar', stacked = True, ax=ax, color=colormapping_mal)
ax.legend(bbox_to_anchor=(1.05, 1), frameon=False)
ax.spines[['right', 'top']].set_visible(False)
ax.set_xlabel("")
ax.set_title("Metastatic site")
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.savefig("figures/malignant/barplot_cNMF_vs_metastatic_distribution.svg", dpi=200, bbox_inches="tight")

In [None]:
# Fraction of each corrected label by treatment-naive status, restricted to the cells whose
# location is not "Esophagus/GEJ".
# Only `.obs` is read, so the AnnData object itself is no longer copied.
patlevel_counts = subadata.obs.loc[subadata.obs.Location!="Esophagus/GEJ",["Treatment Naïve","Corrected label"]]
patlevel_counts = patlevel_counts.groupby(by="Treatment Naïve").value_counts(normalize=True)

df = patlevel_counts.unstack(level=-1)
# Fixed bar order; raises a KeyError if either group is absent.
df = df.loc[["Yes","No"]]

fig, ax = plt.subplots(1,1,figsize=(6,3))
df.plot (kind = 'bar', stacked = True, ax=ax, color=colormapping_mal)
ax.legend(bbox_to_anchor=(1.05, 1), frameon=False)
ax.spines[['right', 'top']].set_visible(False)
ax.set_xlabel("")
ax.set_title("Treatment naïve, Metastatic sites" )
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.savefig("figures/malignant/barplot_cNMF_vs_metastatic_treatmentnaive_distribution.svg", dpi=200, bbox_inches="tight")

In [None]:
# Refined annotations grouped under their high-level category.
_members_by_category = {
    "Endothelial": ("Quiescent EC", "Venous EC", "Angiogenic EC", "Hepatic EC"),
    "Epithelial": ("Hepatocyte",),
    "Stromal/Muscle": ("Smooth muscle", "Skeletal muscle", "Adipocytes"),
    "Myeloid": ("TAM1", "TAM2", "TAM3", "DC", "Kupffer cells", "Myeloid-HighMT", "Mast"),
    "Fibroblast": ("Inflammatory CAF", "HGF-CAF", "Adipose CAF"),
    "Lymphoid": ("TCD4", "TCD8", "B", "Treg", "NK", "T-HighMT"),
}
# Inverted into the lookup actually used: refined annotation -> high-level category.
highlevel_annot = {refined: category
                   for category, members in _members_by_category.items()
                   for refined in members}

malannot = subadata.obs["Corrected label"].astype(str)
df = pd.concat([adata.obs[["refined_annotation","sample_id"]].astype(str),malannot],axis=1)
df["refined_wcancer"] = df["Corrected label"].fillna(df.refined_annotation)

df["highlevel_annotation"] = df["refined_annotation"].replace(highlevel_annot)

In [None]:
df.to_csv("../refined_wCNMF_programs_and_sampleid.csv")

In [None]:
df.refined_wcancer.value_counts()