In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

from statannotations.Annotator import Annotator

In [None]:
import pathlib as pl

In [None]:
from sklearn.preprocessing import StandardScaler
from lifelines import CoxPHFitter

In [None]:
def pretty_ax(ax):
    """Apply the notebook's shared axis styling to ``ax`` in place.

    The top and right spines are hidden, the bottom and left spines are drawn
    with a linewidth of 1.5, and tick marks are kept on the bottom only while
    tick labels remain on both the bottom and the left.
    """
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

    tick_style = {
        "axis": "both",
        "which": "both",
        "bottom": True,
        "top": False,
        "left": False,
        "labelbottom": True,
        "labelleft": True,
    }
    ax.tick_params(**tick_style)

    for side in ("bottom", "left"):
        ax.spines[side].set_linewidth(1.5)

# Load TCGA data

In [None]:
# Placeholder path: directory that is later iterated to find one sub-folder per TCGA cohort.
tcga_dir = pl.Path("/add/path/here")

In [None]:
# Placeholder path: directory holding the ESCA expression, clinical, survival and purity files read below.
eac_tcga_dir = pl.Path("/add/path/here")

In [None]:
# Tab-separated ESCA clinical matrix; the first column becomes the index
# (it is later intersected with the expression sample identifiers).
clinical_full = pd.read_csv(eac_tcga_dir / "TCGA.ESCA.sampleMap_ESCA_clinicalMatrix", sep="\t", index_col=0)

In [None]:
# Second clinical table; the first line of the file is skipped and the first column becomes the index.
clinical_nature = pd.read_csv(eac_tcga_dir / "ESCA_Nature_clinicalinfo.csv", index_col=0, skiprows=1)

In [None]:
# Read the FPKM-UQ table and transpose it so that the file's header entries
# become the row index (they are matched against clinical sample ids later).
fpkm = pd.read_csv(
    eac_tcga_dir / "TCGA-ESCA.htseq_fpkm-uq.tsv.gz",
    sep="\t",
    index_col=0,
).transpose()
# Keep only the first 15 characters of every row identifier.
fpkm.index = fpkm.index.str.slice(0, 15)

In [None]:
# Mapping table indexed by the gene identifiers used in the expression matrix.
gencode = pd.read_csv(eac_tcga_dir / "gencode_mapping_table.csv").set_index("gencode_id_gex")

# Restrict to identifiers present in both tables, then translate them to gene names.
shared_ids = fpkm.columns.intersection(gencode.index)
mapping_gen = gencode.loc[shared_ids, "gene_name"].to_dict()
fpkm = fpkm.loc[:, shared_ids].rename(columns=mapping_gen)
# Several identifiers can map to one gene name; keep the first column of each.
fpkm = fpkm.loc[:, ~fpkm.columns.duplicated()]

In [None]:
# Samples annotated as adenocarcinoma in the clinical matrix that also have expression data.
is_eac = clinical_full["histological_type"] == "Esophagus Adenocarcinoma, NOS"
eac_patients = fpkm.index.intersection(clinical_full.index[is_eac])

In [None]:
# Split samples on the barcode's trailing sample-type code ("-11").
# The identifiers were truncated to 15 characters when the expression matrix
# was loaded, so the code is the suffix. Anchoring with `endswith` avoids
# misclassifying a barcode that merely contains "-11" earlier in the string
# (e.g. a participant code starting with "11").
is_normal = eac_patients.str.endswith("-11")
normal_patients = eac_patients[is_normal]
eac_patients = eac_patients[~is_normal]

In [None]:
# Restrict the expression and clinical tables to the retained samples (same row order in both).
fpkm_eac = fpkm.loc[eac_patients]
clinical_eac = clinical_full.loc[eac_patients]

In [None]:
# Standardise every gene (column) across the retained samples.
ss = StandardScaler().fit(fpkm_eac)
std_fpkm = pd.DataFrame(
    ss.transform(fpkm_eac),
    index=fpkm_eac.index,
    columns=fpkm_eac.columns,
)

In [None]:
signature_dir = pl.Path("/add/path/here")

# Number of leading rows (genes) kept from each signature file.
n_top_genes = 100

# Map signature name (file stem) -> array of its first `n_top_genes` index entries.
full_sigs = {}
# `iterdir()` yields entries in arbitrary order, so sort for a reproducible
# column order downstream; skip directories and hidden files (e.g. .DS_Store)
# that would otherwise be parsed as signatures.
for sig_path in sorted(signature_dir.iterdir()):
    if not sig_path.is_file() or sig_path.name.startswith("."):
        continue
    signature = pd.read_csv(sig_path, index_col=0)
    full_sigs[sig_path.stem] = signature.head(n_top_genes).index.to_numpy()

In [None]:
# Gene symbols that are removed from every signature before scoring and later
# correlated against the signature scores.
mTFs = ['KLF5', 'ELF3', 'SMAD3', 'TCF7L2', 'HMGA2', "BNC2"]

In [None]:
# One score per signature: mean standardised expression of the signature genes
# found in the matrix, with the mTF genes removed first.
per_signature = []
for sig, genes in full_sigs.items():
    selgenes = np.setdiff1d(std_fpkm.columns.intersection(genes), mTFs)
    per_signature.append(std_fpkm[selgenes].mean(axis=1).rename(sig))
state_score = pd.concat(per_signature, axis=1)

In [None]:
# Mean standardised expression of the four listed genes (those present in the matrix).
mtf_genes = std_fpkm.columns.intersection(["KLF5", "ELF3", "SMAD3", "TCF7L2"])
mTF_score = std_fpkm[mtf_genes].mean(axis=1).rename("mTF_score")

In [None]:
# cNMF_3 against cNMF_4 score per sample, with dashed guides at zero on both axes.
ax = sns.scatterplot(data=state_score, x="cNMF_3", y="cNMF_4")
ax.spines[["right", "top"]].set_visible(False)
x_lo, x_hi = ax.get_xlim()
ax.hlines(y=0, xmin=x_lo, xmax=x_hi, linestyles="dashed", color="grey")
y_lo, y_hi = ax.get_ylim()
ax.vlines(x=0, ymin=y_lo, ymax=y_hi, linestyles="dashed", color="grey")

In [None]:
# Classify samples by the sign of their cNMF_3 / cNMF_4 scores.
# All three classes use the same ">= 0" boundary: the previous strict ">" for
# the mixed class left a sample with one score exactly 0 and the other
# positive outside every class.
c3, c4 = state_score["cNMF_3"], state_score["cNMF_4"]

cNMF_3_patients = state_score[(c3 >= 0) & (c4 < 0)].index

cNMF_4_patients = state_score[(c4 >= 0) & (c3 < 0)].index

cNMF_mixed_patients = state_score[(c4 >= 0) & (c3 >= 0)].index

In [None]:
# Sanity check: the two class sizes, then the Index of samples present in both
# classes (the defining conditions are mutually exclusive, so it should be empty).
len(cNMF_3_patients),len(cNMF_4_patients),cNMF_4_patients.intersection(cNMF_3_patients)

In [None]:
# Copy of the score table whose row labels have their last three characters removed.
df = state_score.set_axis(state_score.index.str[:-3], axis=0)

In [None]:
# Row labels shared by the truncated score table and the second clinical table.
common_patients = df.index.intersection(clinical_nature.index)

In [None]:
# Scores and second-clinical-table columns side by side for the shared rows.
pats = pd.concat([df.loc[common_patients],clinical_nature.loc[common_patients]],axis=1)

In [None]:
# Signature scores joined with the clinical annotations (samples present in both).
augclin = pd.concat([state_score, clinical_eac], axis=1, join="inner")
# FIXME: `tmb_eac` is not defined in any visible cell of this notebook, so a
# fresh "Restart & Run All" fails here. It is later read through a "TMB"
# column; the cell that loads it must be restored above this one.
augclin = pd.concat([augclin, tmb_eac], axis=1, join="outer")

# Collapse pathologic sub-stages to a numeric stage; unknown/discrepant entries become NaN.
stage_mapping = {"Stage X": np.nan, "Stage IV": 4, "Stage IVB": 4, "Stage IVA": 4, "Stage IIB": 2, "Stage IA": 1, "Stage IIIA": 3, "Stage IIA": 2, "Stage IIC": 2,
                 "Stage I": 1, "Stage IIIC": 3, "Stage IB": 1, "Stage IIIB": 3, 'Stage III': 3, "Stage II": 2, "I/II NOS": 1,
                 "[Discrepancy]": np.nan}

# Last dash-separated field of the (15-character) row label, recoded to Tumor/Normal.
# Assigned positionally via `to_numpy()`; `Series.ravel()` is deprecated in pandas 2.2.
sample_type = pd.Series(augclin.index.str[:15].str.split("-").str[-1])
augclin["Disease"] = sample_type.replace({"01": "Tumor", "11": "Normal", "06": "Tumor"}).to_numpy()

augclin["Stage"] = augclin.pathologic_stage.replace(stage_mapping).astype("category")

# Partial/complete response and stable disease are pooled into one category.
augclin["Response to treatment"] = augclin["additional_treatment_completion_success_outcome"].replace(
    {"Partial Response": "Response/Stable",
     "Stable Disease": "Response/Stable",
     "Complete Response": "Response/Stable"})

# For M/N/T: use the pathologic value and fall back to the clinical one when it is missing.
augclin["M_status"] = augclin["pathologic_M"].fillna(augclin["clinical_M"]).replace(
    {"MX": np.nan, "M1a": "M1", "M1b": "M1"})

augclin["N_status"] = augclin["pathologic_N"].fillna(augclin["clinical_N"]).replace(
    {"NX": np.nan, "N2": "N1/N2/N3", "N1": "N1/N2/N3", "N3": "N1/N2/N3"})

augclin["T_status"] = augclin["pathologic_T"].fillna(augclin["clinical_T"]).replace(
    {"TX": np.nan, "T0": np.nan,
     "T3": "T3/T4",
     "T4": "T3/T4",
     "T4a": "T3/T4"})

augclin["MSI_status"] = augclin["CDE_ID_3226963"].replace({"MSI-H": "MSI", "MSI-L": "MSI"})
augclin["Histological subtype"] = augclin["histological_type"].replace(
    {"Esophagus Adenocarcinoma, NOS": "EAC",
     "Esophagus Squamous Cell Carcinoma": "ESCC"})

In [None]:
# Label every sample with its class; samples in none of the three groups stay "None".
augclin["PatClass"] = "None"
for label, members in (
    ("cNMF_3", cNMF_3_patients),
    ("cNMF_4", cNMF_4_patients),
    ("Mixed", cNMF_mixed_patients),
):
    augclin.loc[members, "PatClass"] = label

In [None]:
def plot_box(ax, augclin, x, y, pairs, order=None):
    """Draw a boxplot of ``y`` grouped by ``x`` on ``ax`` and annotate group comparisons.

    Parameters
    ----------
    ax : axes to draw on (styled in place with ``pretty_ax``).
    augclin : data frame holding the ``x`` and ``y`` columns.
    x, y : column names for the grouping variable and the plotted value.
    pairs : list of ``(group_a, group_b)`` tuples to compare.
    order : optional ordering of the ``x`` categories, passed to both the
        boxplot and the annotator.

    Each pair is compared with the annotator's 'Mann-Whitney' test and the
    result is written inside the axes.
    """
    shared = dict(data=augclin, x=x, y=y, order=order)
    sns.boxplot(ax=ax, **shared)
    pretty_ax(ax)

    annotator = Annotator(ax, pairs, **shared)
    annotator.configure(
        test='Mann-Whitney',
        text_format='simple',
        show_test_name=False,
        loc='inside',
    )
    annotator.apply_and_annotate()

In [None]:
# Correlation matrix (DataFrame.corr default method) between the cNMF_3 score and TMB.
augclin[["cNMF_3","TMB"]].corr()

In [None]:
# TMB per patient class, restricted to samples with TMB below 40; all six class pairs are tested.
tmb_pairs = [
    ("cNMF_3", "None"),
    ("cNMF_3", "cNMF_4"),
    ("cNMF_3", "Mixed"),
    ("None", "cNMF_4"),
    ("cNMF_4", "Mixed"),
    ("None", "Mixed"),
]
class_order = ["None", "cNMF_3", "Mixed", "cNMF_4"]

fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_box(ax, augclin[augclin.TMB < 40], "PatClass", "TMB", tmb_pairs, order=class_order)

In [None]:
# Contingency table of N status against patient class.
pd.crosstab(augclin["N_status"], augclin["PatClass"])

In [None]:
# Contingency table of T status against patient class.
pd.crosstab(augclin["T_status"], augclin["PatClass"])

In [None]:
# cNMF_3 score in N0 versus node-positive samples.
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_box(ax, augclin, "N_status", "cNMF_3", [("N0", "N1/N2/N3")], order=["N0", "N1/N2/N3"])
# Saved under the same relative figures directory as the other figures in this
# notebook instead of a user-specific absolute cluster path.
fig.savefig("figures/tcga/cNMF_3_N_status.svg", dpi=200, bbox_inches="tight")

In [None]:
# cNMF_4 score in N0 versus node-positive samples.
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_box(ax, augclin, "N_status", "cNMF_4", [("N0", "N1/N2/N3")], order=["N0", "N1/N2/N3"])
# Saved under the same relative figures directory as the other figures in this
# notebook instead of a user-specific absolute cluster path.
fig.savefig("figures/tcga/cNMF_4_N_status.svg", dpi=200, bbox_inches="tight")

In [None]:
# cNMF_3 score across T-status groups, all pairwise comparisons.
t_levels = ["T1", "T2", "T3/T4"]
t_pairs = [("T1", "T2"), ("T2", "T3/T4"), ("T1", "T3/T4")]
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_box(ax, augclin, "T_status", "cNMF_3", t_pairs, order=t_levels)

In [None]:
# cNMF_4 score across T-status groups, all pairwise comparisons.
t_levels = ["T1", "T2", "T3/T4"]
t_pairs = [("T1", "T2"), ("T2", "T3/T4"), ("T1", "T3/T4")]
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_box(ax, augclin, "T_status", "cNMF_4", t_pairs, order=t_levels)

In [None]:
# cNMF_5 score across T-status groups, all pairwise comparisons.
t_levels = ["T1", "T2", "T3/T4"]
t_pairs = [("T1", "T2"), ("T2", "T3/T4"), ("T1", "T3/T4")]
fig, ax = plt.subplots(1, 1, figsize=(3, 3))
plot_box(ax, augclin, "T_status", "cNMF_5", t_pairs, order=t_levels)

In [None]:
import matplotlib
import matplotlib.cm as cm
import matplotlib.colors as mcolors

from sklearn.decomposition import PCA

# Two-component PCA of the standardised expression matrix (NaNs set to 0).
# A fixed `random_state` keeps the projection reproducible when scikit-learn
# selects a randomized SVD solver for this matrix shape.
pca = PCA(n_components=2, random_state=0)
X_pca = pd.DataFrame(
    pca.fit_transform(std_fpkm.fillna(0)),
    index=std_fpkm.index,
    columns=["PC1", "PC2"],
)

# Attach the signature scores and the patient class for colouring the plots below.
X_pca = pd.concat([X_pca, state_score, augclin["PatClass"]], axis=1)

def plot_pcs_color(ax, state):
    """Scatter PC1 against PC2 on ``ax``, coloured by the ``state`` column of ``X_pca``.

    Colours use the diverging 'RdBu_r' map centred on 0 and spanning the
    observed minimum and maximum of the column; a matching colorbar is added
    next to the axes, the axes title is set to ``state`` and the shared axis
    styling is applied.

    Reads the notebook-level ``X_pca`` frame. The colorbar is attached through
    ``ax.figure`` rather than a global ``fig`` so the function works on any
    axes, regardless of which figure variable currently exists.
    """
    values = X_pca[state]
    # TwoSlopeNorm requires vmin < vcenter < vmax, i.e. scores on both sides of 0.
    normalize = mcolors.TwoSlopeNorm(vcenter=0, vmin=values.min(), vmax=values.max())
    colormap = matplotlib.colormaps['RdBu_r']
    sns.scatterplot(
        y=X_pca["PC2"],
        x=X_pca["PC1"],
        c=values,
        norm=normalize,
        cmap=colormap,
        ax=ax
    )
    # Standalone mappable so the colorbar shares the norm and colormap of the scatter.
    scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=colormap)
    scalarmappaple.set_array(values)
    ax.set_title(state)
    ax.figure.colorbar(scalarmappaple, ax=ax)
    pretty_ax(ax)

# First figure: PCs coloured by cNMF_3, cNMF_1 and cNMF_4 (saved to disk).
fig, ax = plt.subplots(1, 3, figsize=(15, 4))
flatax = ax.flatten()
for panel, state in zip(flatax, ["cNMF_3", "cNMF_1", "cNMF_4"]):
    plot_pcs_color(panel, state)
fig.tight_layout()
fig.savefig("figures/tcga/PC_wCNMF_score.svg", dpi=200, bbox_inches="tight")

# Second figure: PCs coloured by cNMF_2 and cNMF_5 (displayed only).
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
flatax = ax.flatten()
for panel, state in zip(flatax, ["cNMF_2", "cNMF_5"]):
    plot_pcs_color(panel, state)

fig.tight_layout()

In [None]:
# PC1 against PC2 coloured by patient class, with the legend placed outside the axes.
fig, ax = plt.subplots(1, 1, figsize=(2, 2))
sns.scatterplot(x=X_pca["PC1"], y=X_pca["PC2"], hue=X_pca["PatClass"], ax=ax)
ax.legend(bbox_to_anchor=(1, 1, 1, 0), frameon=False)
pretty_ax(ax)

In [None]:
# Correlations between the PCs and the signature scores. `X_pca` also holds the
# string column "PatClass", which makes `corr()` raise on pandas >= 2.0 unless
# non-numeric columns are excluded explicitly.
X_pca.corr(numeric_only=True)

# Survival

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

In [None]:
survival = pd.read_csv(eac_tcga_dir / "survival.txt", sep="\t", index_col=0)
purity = pd.read_csv(eac_tcga_dir / "tumor_purity_ESTIMATE.csv", index_col=0)  # for ESCA

# Purity values of the ESCA rows only.
cancer_purity = purity.loc[purity["cancer_type"] == "ESCA", "TumorPurity"]
# Strip trailing "A", then "B", then "C" characters from the sample identifiers.
for suffix in ("A", "B", "C"):
    cancer_purity.index = cancer_purity.index.str.rstrip(suffix)

# Identifiers can collide after stripping; keep the first occurrence.
cancer_purity = cancer_purity[~cancer_purity.index.duplicated(keep="first")]

In [None]:
augclin["age"] = augclin["age_at_initial_pathologic_diagnosis"]
# Survival endpoint: `survstr` is the event column, f"{survstr}.time" the duration column.
survstr = "DSS"
clin = pd.concat(
    [augclin[["age", "Stage", "gender"]], survival[[survstr, f"{survstr}.time"]], cancer_purity],
    axis=1,
    join="inner",
)
clin = clin.rename(columns={"TumorPurity": "Purity"})
# Missing event indicators are treated as "no event". The column is addressed
# through `survstr` (previously hard-coded as `clin.DSS`) so that changing the
# endpoint above updates this step as well.
clin[survstr] = clin[survstr].fillna(0)
# Min-max scale age to [0, 1] and encode gender as 0/1.
clin["age"] = (clin["age"] - clin["age"].min()) / (clin["age"].max() - clin["age"].min())
clin["gender"] = clin["gender"].replace({"MALE": 0, "FEMALE": 1})

# Keep the expression and clinical tables on the same set of samples, in the same order.
common_patients = fpkm_eac.index.intersection(clin.index)
std_fpkm = std_fpkm.loc[common_patients]
clin = clin.loc[common_patients]

In [None]:
# Kaplan-Meier curves per signature: bottom 30% against top 30% of the score.
survstr = "DSS"
for i in range(1, 6):
    metasig = f"cNMF_{i}"
    # Note: unlike the state scores computed earlier, the mTF genes are not removed here.
    scores = std_fpkm[std_fpkm.columns.intersection(full_sigs[metasig])].mean(axis=1)
    scores = scores.loc[clin.index]

    # 0 = low-score group, 1 = high-score group.
    strata = {
        0: scores <= scores.quantile(0.3),
        1: scores >= scores.quantile(0.7),
    }
    duration, event = {}, {}
    for high, stratification in strata.items():
        df = clin[stratification]
        # `to_numpy()` replaces the deprecated `Series.ravel()`.
        duration[high] = df[f"{survstr}.time"].to_numpy()
        event[high] = df[survstr].to_numpy()

    fig, ax = plt.subplots(1, 1)
    kmf = KaplanMeierFitter()
    # The same fitter is refitted for each group and drawn on the shared axes.
    kmf.fit(duration[0], event[0], label='Low score')
    kmf.plot(show_censors=True, c="r", ax=ax, ci_alpha=0.1)
    kmf.fit(duration[1], event[1], label='High score')
    kmf.plot(show_censors=True, c="b", ax=ax)
    pretty_ax(ax)
    # Title each figure so the five otherwise identical-looking plots can be told apart.
    ax.set_title(metasig)
    ax.set_ylabel(survstr)
    ax.set_xlabel("Time to event")
    results = logrank_test(duration[0], duration[1], event_observed_A=event[0], event_observed_B=event[1])
    results.print_summary()
    ax.text(0.75 * ax.get_xlim()[1], 0.8, f"p={results.p_value:.1e}", fontsize=13)

In [None]:
# Kaplan-Meier comparison of cNMF_4 (group 0) against cNMF_3 (group 1) patients.
# `clin` holds a subset of the rows of `augclin`; align the class labels to
# `clin` first so the boolean mask has exactly the index of the frame it
# filters (a mask indexed like `augclin` triggers a reindexing warning or an
# unalignable-indexer error).
pat_class = augclin["PatClass"].reindex(clin.index)

duration, event = {}, {}
for high, label in [(0, "cNMF_4"), (1, "cNMF_3")]:
    df = clin[pat_class == label]
    # `to_numpy()` replaces the deprecated `Series.ravel()`.
    duration[high] = df[f"{survstr}.time"].to_numpy()
    event[high] = df[survstr].to_numpy()

fig, ax = plt.subplots(1, 1)
kmf = KaplanMeierFitter()
# The same fitter is refitted for each group and drawn on the shared axes.
kmf.fit(duration[0], event[0], label='cNMF_4 patients')
kmf.plot(show_censors=True, c="r", ax=ax, ci_alpha=0.1)
kmf.fit(duration[1], event[1], label='cNMF_3 patients')
kmf.plot(show_censors=True, c="b", ax=ax)
pretty_ax(ax)
ax.set_ylabel(survstr)
ax.set_xlabel("Time to event")
results = logrank_test(duration[0], duration[1], event_observed_A=event[0], event_observed_B=event[1])
results.print_summary()
ax.text(0.75 * ax.get_xlim()[1], 0.8, f"p={results.p_value:.1e}", fontsize=13)

In [None]:
# Kaplan-Meier comparison of the three classes: 0 = cNMF_4, 1 = Mixed, 2 = cNMF_3.
# Class labels are aligned to `clin` first so the boolean mask has exactly the
# index of the frame it filters (a mask indexed like `augclin` triggers a
# reindexing warning or an unalignable-indexer error).
pat_class = augclin["PatClass"].reindex(clin.index)

duration, event = {}, {}
for high, label in [(0, "cNMF_4"), (1, "Mixed"), (2, "cNMF_3")]:
    df = clin[pat_class == label]
    # `to_numpy()` replaces the deprecated `Series.ravel()`.
    duration[high] = df[f"{survstr}.time"].to_numpy()
    event[high] = df[survstr].to_numpy()

fig, ax = plt.subplots(1, 1)
kmf = KaplanMeierFitter()
# The same fitter is refitted for each group and drawn on the shared axes.
kmf.fit(duration[0], event[0], label='cNMF_4 patients')
kmf.plot(show_censors=True, c="r", ax=ax, ci_alpha=0.1)
kmf.fit(duration[1], event[1], label='Mixed patients')
kmf.plot(show_censors=True, c="g", ax=ax)
kmf.fit(duration[2], event[2], label='cNMF_3 patients')
kmf.plot(show_censors=True, c="b", ax=ax)
pretty_ax(ax)
ax.set_ylabel(survstr)
ax.set_xlabel("Time to event")

# Pairwise log-rank tests (p-values are not corrected for multiple testing).
res4vsM = logrank_test(duration[0], duration[1], event_observed_A=event[0], event_observed_B=event[1])
res4vsM.print_summary()
p4vsM = res4vsM.p_value
res3vsM = logrank_test(duration[2], duration[1], event_observed_A=event[2], event_observed_B=event[1])
res3vsM.print_summary()
p3vsM = res3vsM.p_value
res3vs4 = logrank_test(duration[2], duration[0], event_observed_A=event[2], event_observed_B=event[0])
res3vs4.print_summary()
p3vs4 = res3vs4.p_value

x_text = 0.65 * ax.get_xlim()[1]
ax.text(x_text, 0.95, f"p(3vs4)={p3vs4:.1e}", fontsize=13)
ax.text(x_text, 0.85, f"p(3vsM)={p3vsM:.1e}", fontsize=13)
ax.text(x_text, 0.75, f"p(4vsM)={p4vsM:.1e}", fontsize=13)
plt.legend(bbox_to_anchor=(1, 1, 0.5, 0), frameon=False, fontsize=13)

In [None]:
# Gene whose standardised expression stratifies patients in the next survival analysis.
gene = "TCF7L2"

In [None]:
# Kaplan-Meier curves for the bottom against the top quartile of `gene` expression.
gene_expr = std_fpkm[gene]
# 0 = low-expression group, 1 = high-expression group.
strata = {
    0: gene_expr <= gene_expr.quantile(0.25),
    1: gene_expr >= gene_expr.quantile(0.75),
}
duration, event = {}, {}
for high, stratification in strata.items():
    df = clin[stratification]
    # `to_numpy()` replaces the deprecated `Series.ravel()`.
    duration[high] = df[f"{survstr}.time"].to_numpy()
    event[high] = df[survstr].to_numpy()

fig, ax = plt.subplots(1, 1)
kmf = KaplanMeierFitter()
# The same fitter is refitted for each group and drawn on the shared axes.
kmf.fit(duration[0], event[0], label='Low score')
kmf.plot(show_censors=True, c="r", ax=ax, ci_alpha=0.1)
kmf.fit(duration[1], event[1], label='High score')
kmf.plot(show_censors=True, c="b", ax=ax)
pretty_ax(ax)
# Record which gene the curves refer to.
ax.set_title(gene)
ax.set_ylabel(survstr)
ax.set_xlabel("Time to event")
results = logrank_test(duration[0], duration[1], event_observed_A=event[0], event_observed_B=event[1])
results.print_summary()
ax.text(0.75 * ax.get_xlim()[1], 0.8, f"p={results.p_value:.1e}", fontsize=13)

# Correlation between scores and TFs

In [None]:
# Pairwise correlation between the three signature scores.
state_score[["cNMF_1","cNMF_3","cNMF_4"]].corr()

In [None]:
# Unstandardised expression of the mTF genes next to the three signature scores.
# NOTE(review): requires every gene in `mTFs` to be a column of `fpkm_eac` (KeyError otherwise).
corr_df = pd.concat([fpkm_eac[mTFs],state_score[["cNMF_1","cNMF_3","cNMF_4"]]],axis=1)

In [None]:
# Keep only the score-versus-gene part of the correlation matrix (rows: scores, columns: genes).
heatmap_df = corr_df.corr().loc[["cNMF_3","cNMF_1","cNMF_4"],mTFs]

In [None]:
# Annotated heatmap of the score-versus-gene correlations, diverging colours centred on 0.
fig, ax = plt.subplots(1,1,figsize=(5,2))
sns.heatmap(data=heatmap_df, annot=heatmap_df, cmap="vlag", center=0, ax=ax)
fig.savefig("figures/tcga/heatmap_cNMF_TF_corr.png", dpi=300, bbox_inches="tight")

In [None]:
# Correlation of the combined mTF score with each of the three signature scores.
score_cols = ["cNMF_3", "cNMF_1", "cNMF_4"]
corr_df = pd.concat([state_score[score_cols], mTF_score], axis=1)

heatmap_df = corr_df.corr().loc[["mTF_score"], score_cols]

In [None]:
# Single-row annotated heatmap of the mTF-score correlations.
fig, ax = plt.subplots(1,1,figsize=(2,0.5))
sns.heatmap(data=heatmap_df, annot=heatmap_df, cmap="vlag", center=0, ax=ax)
# Keep the row label horizontal and tilt the column labels.
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
ax.set_xticklabels(["cNMF_3","cNMF_1","cNMF_4"], rotation=45, ha="right")
fig.savefig("figures/tcga/heatmap_cNMF_mTFscore_corr.png", dpi=300, bbox_inches="tight")

# Load other TCGA cohorts

In [None]:
# One expression table per cohort, read from the cohort's sub-folder of `tcga_dir`.
gi_cohorts = ["STAD", "COAD", "PAAD", "ESCA"]
all_fpkm = {
    ct.stem: pd.read_csv(ct / f"{ct.stem}_gex_fpkm_uq.csv", index_col=0)
    for ct in tcga_dir.iterdir()
    if ct.stem in gi_cohorts
}

In [None]:
# One clinical matrix per cohort (tab-separated, first column as index).
all_clinical = {
    ct: pd.read_csv(
        eac_tcga_dir / f"TCGA.{ct}.sampleMap_{ct}_clinicalMatrix",
        sep="\t",
        index_col=0,
    )
    for ct in ["STAD", "COAD", "PAAD", "ESCA"]
}

In [None]:
# Harmonised "Histological type" annotation per cohort.
# Every cohort is handled the same way: missing values become "NA" first, the
# clinical tables are never modified in place, and string clean-up uses
# vectorised `.str` methods instead of `ravel()` plus manual index loops.
hist_col = "Histological type"
all_histological = {}

coad = all_clinical["COAD"]["histological_type"].fillna("NA")
all_histological["COAD"] = coad.mask(coad.isin(["[Discrepancy]"]), "NA").rename(hist_col)

# PAAD: rare or discrepant subtypes are pooled into "NA"; dashes become spaces.
# The previous version had no `fillna`, so a missing value raised a TypeError
# in the string join, and it wrote "NA" back into `all_clinical["PAAD"]`.
paad = all_clinical["PAAD"]["histological_type"].fillna("NA")
paad = paad.mask(
    paad.isin(["[Discrepancy]",
               "Pancreas-Undifferentiated Carcinoma",
               "Pancreas-Colloid (mucinous non-cystic) Carcinoma"]),
    "NA",
)
all_histological["PAAD"] = paad.str.replace("-", " ", regex=False).to_frame(hist_col)

# STAD: drop the last comma-separated field and concatenate the remaining ones.
# A value without a comma (e.g. "NA") becomes an empty string here.
stad = all_clinical["STAD"]["histological_type"].fillna("NA")
stad = stad.mask(stad.isin(["[Discrepancy]"]), "NA")
all_histological["STAD"] = stad.str.split(",").str[:-1].str.join("").to_frame(hist_col)

all_histological["ESCA"] = (
    all_clinical["ESCA"]["histological_type"]
    .replace({"Esophagus Adenocarcinoma, NOS": "Esophagus Adenocarcinoma"})
    .fillna("NA")
    .rename(hist_col)
)

In [None]:
# Per-cohort expression tables with "Cancer type" and "Histological type"
# appended as the last two columns. New frames are built instead of modifying
# `all_fpkm` in place, so re-running this cell does not append duplicate columns.
full_fpkm = [
    pd.concat([frame.assign(**{"Cancer type": ct}), all_histological[ct]], axis=1)
    for ct, frame in all_fpkm.items()
]

In [None]:
# Stack the per-cohort frames row-wise. This rebinds `full_fpkm` from a list to a
# DataFrame, so the cell only works once after the list-building cell has run.
full_fpkm = pd.concat(full_fpkm)

In [None]:
# Drop every row that has a missing value in any column (expression or annotation).
full_fpkm = full_fpkm.dropna()

In [None]:
# Empty histological-type strings are relabelled "NA".
full_fpkm["Histological type"] = full_fpkm["Histological type"].replace({"":"NA"})

In [None]:
# Standardise all columns except the last two (the annotation columns), then re-attach those.
ss = StandardScaler()
expr_part = full_fpkm.iloc[:, :-2]
meta_part = full_fpkm[["Cancer type", "Histological type"]]

std_full_fpkm = pd.DataFrame(
    ss.fit_transform(expr_part),
    index=full_fpkm.index,
    columns=expr_part.columns,
)
std_full_fpkm = pd.concat([std_full_fpkm, meta_part], axis=1)

In [None]:
# Signature scores across all cohorts: mean standardised expression of the signature genes present.
# Note: this rebinds `state_score`, replacing the ESCA-only table computed earlier.
per_signature = []
for sig, genes in full_sigs.items():
    shared_genes = std_full_fpkm.columns.intersection(genes)
    per_signature.append(std_full_fpkm[shared_genes].mean(axis=1).rename(sig))
state_score = pd.concat(per_signature, axis=1)

In [None]:
# Attach the two annotation columns to the score table.
state_score = pd.concat([state_score,std_full_fpkm[["Cancer type","Histological type"]]],axis=1)

In [None]:
# Compare esophageal adenocarcinoma against every other histological type present.
reference_type = "Esophagus Adenocarcinoma"
other_types = np.setdiff1d(state_score["Histological type"].unique(), [reference_type])
pairs = [(reference_type, other) for other in other_types]

In [None]:
# cNMF_5 score per histological type, each type tested against esophageal adenocarcinoma.
x, y = "Histological type", "cNMF_5"
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Colon Adenocarcinoma",
    "Colon Mucinous Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.boxplot(data=state_score, x=x, y=y, order=order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(ax, pairs, data=state_score, x=x, y=y, order=order)
annotator.configure(test="Mann-Whitney", text_format="star", loc="inside")
annotator.apply_and_annotate()
fig.savefig("figures/tcga/cNMF_5_score_GI_tract_cancers.svg", dpi=200, bbox_inches="tight")

In [None]:
# cNMF_1 score per histological type, each type tested against esophageal adenocarcinoma.
x, y = "Histological type", "cNMF_1"
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Colon Adenocarcinoma",
    "Colon Mucinous Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.boxplot(data=state_score, x=x, y=y, order=order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(ax, pairs, data=state_score, x=x, y=y, order=order)
annotator.configure(test="Mann-Whitney", text_format="star", loc="inside")
annotator.apply_and_annotate()

In [None]:
# cNMF_2 score per histological type, each type tested against esophageal adenocarcinoma.
x, y = "Histological type", "cNMF_2"
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Colon Adenocarcinoma",
    "Colon Mucinous Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.boxplot(data=state_score, x=x, y=y, order=order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(ax, pairs, data=state_score, x=x, y=y, order=order)
annotator.configure(test="Mann-Whitney", text_format="star", loc="inside")
annotator.apply_and_annotate()

In [None]:
# cNMF_3 score per histological type, each type tested against esophageal adenocarcinoma.
x, y = "Histological type", "cNMF_3"
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Colon Adenocarcinoma",
    "Colon Mucinous Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.boxplot(data=state_score, x=x, y=y, order=order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(ax, pairs, data=state_score, x=x, y=y, order=order)
annotator.configure(test="Mann-Whitney", text_format="star", loc="inside")
annotator.apply_and_annotate()
fig.savefig("figures/tcga/cNMF_3_score_GI_tract_cancers.svg", dpi=200, bbox_inches="tight")

In [None]:
# cNMF_4 score per histological type, each type tested against esophageal adenocarcinoma.
x, y = "Histological type", "cNMF_4"
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Colon Adenocarcinoma",
    "Colon Mucinous Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.boxplot(data=state_score, x=x, y=y, order=order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(ax, pairs, data=state_score, x=x, y=y, order=order)
annotator.configure(test="Mann-Whitney", text_format="star", loc="inside")
annotator.apply_and_annotate()

# Link with perineural invasion

In [None]:
path_reports = pd.read_csv("/add/path/here/TCGA_Reports.csv", index_col=0)

# Keep the part of each report identifier before the first ".".
path_reports.index = path_reports.index.str.split(".").str[0]

# Reports of non-COAD samples; score-table row labels lose their last three
# characters to match the report identifiers.
non_coad_ids = state_score[state_score["Cancer type"] != "COAD"].index.str[:-3]
gi_path_reports = path_reports.loc[path_reports.index.intersection(non_coad_ids)]

# Reports that mention perineural invasion in any of three spellings.
# The text is lower-cased once instead of once per pattern.
report_text = gi_path_reports.text.str.lower()
mentions_pni = (
    report_text.str.contains("perineural", regex=False)
    | report_text.str.contains("pni", regex=False)
    | report_text.str.contains("peri-neural", regex=False)
)
# Explicit copy: a column is assigned to `pni_info` right after this, which
# would otherwise write into a slice of `gi_path_reports` (SettingWithCopyWarning).
pni_info = gi_path_reports[mentions_pni].copy()

# Leftover helper for reading a single report text by hand:
#pni_info.text[139]

# Hand-entered PNI calls, one entry per row of `pni_info` in row order:
# 1 / 0 / None are later relabelled "PNI+" / "PNI-" / "Not specified".
# NOTE(review): the list is positional, so it silently depends on the exact
# rows and row order produced by the report filter above - confirm the two
# still match whenever the input reports or the filter change.
PNI_status = [0,1,None,1,None,0,None,None,1,1,
             1,None,0,0,1,1,None,0,1,0,
             1,None,None,1,1,1,1,1,None,0,
             1,0,None,0,1,0,0,1,None,None,
             1,1,0,1,1,1,None,1,1,None,
             1,1,1,None,1,0,1,1,1,1,
             1,None,0,1,None,1,1,None,1,1,
             0,None,1,None,None,0,1,None,None,1,
             None,1,1,1,1,None,None,1,0,0,
             0,1,None,1,1,None,None,0,1,0,
             1,0,None,None,None,None,0,1,1,1,
             1,0,0,1,1,None,0,0,1,1,
              1,0,None,None,1,0,None,1,1,1,
             0,None,None,0,0,1,1,1,None,1,
             0,None,None,1,1,None,0,1,1,1,
              1,None,None,1,1,1,1,1,None,1,
             None,None,1,None,1,0,1,0,1,0,
             1,None,0,None,None,1,None,None,1,0,
             0,1,0,0,None,1,1,None,1,1,
             None,0,None,1,0,1,1,1,1,1,
             1,1,0,1,0,0,None,None,1,0,
             0,0,1,1,1,1,None,1,None,1,
             0,1,None,1,None,None,None,0,1,None,
             None,None,1,1,0,1,1,1,0,1,
             0,None,0,None,1,None,None,1,1,1,
             0,None,None,None,1,0,0,None,1,None,
             0,0,None,0,1,1,None,1,0,None,
             0,None,1,None,None,1,1,1,None,None,
             1,1,1,1,0,1,None,1,None,1,
              1,None,0,1,1,1,None,None,0,1,
             1,1,1,1,None,1,1,None,1,1,
             1,1,0,0,1,1,1,1,1,0,
             1,1,None,0,1,1,0,1,0,None,
              1,1,None,1,None,1,1,None,None,1,
             None,None,0,1,1,None,1,1,1,1,
             1,1,None,None,1,None,1,0,0,1,
             None,0,0,1,1,None,1,None,1,1,
             1,0,None,1,1,None,0,1,]

# Column assignment from a plain list: pandas raises if its length differs from the number of rows.
pni_info["PNI_status"] = PNI_status

# Persist the annotated table; the next cell reloads it from this file.
pni_info.to_csv("/add/path/here/PNI_status_annotated_manually.csv")

In [None]:
# Reload the manual PNI annotation and give the numeric codes readable labels.
pni_info = pd.read_csv("/add/path/here/PNI_status_annotated_manually.csv", index_col=0)
pni_labels = {np.nan: "Not specified", 1: "PNI+", 0: "PNI-"}
pni_info.PNI_status = pni_info.PNI_status.replace(pni_labels)

# Gene list read from the "wonerve" marker file, flattened to a 1-D array.
cNMF_4_specific = pd.read_csv(
    "/cluster/work/boeva/jyates/EAC_singlecell/auxiliary_data/marker_genes_cNMF_4_wonerve.csv",
    index_col=0,
).values.ravel()

# Score = mean standardised expression of the listed genes present in the matrix.
marker_cols = std_full_fpkm.columns.intersection(cNMF_4_specific)
cNMF_4_score = std_full_fpkm.loc[:, marker_cols].mean(axis=1).rename("cNMF_4")
cNMF_4_score = pd.concat(
    [cNMF_4_score, std_full_fpkm[["Cancer type", "Histological type"]]],
    axis=1,
)

# Drop the last dash-separated field of every row label, then keep the first row per label.
idx = ["-".join(parts[:-1]) for parts in cNMF_4_score.index.str.split("-")]
cNMF_4_score.index = idx
cNMF_4_score = cNMF_4_score.loc[~cNMF_4_score.index.duplicated()]

# Scores and PNI annotation side by side (outer join on the row labels).
df = pd.concat([cNMF_4_score, pni_info], axis=1)

In [None]:
# cNMF_4 score in PNI- versus PNI+ samples, all cohorts pooled.
x, y = "PNI_status", "cNMF_4"
order = ["PNI-", "PNI+"]
pairs = [("PNI-", "PNI+")]

fig, ax = plt.subplots(1, 1, figsize=(7, 5))
sns.boxplot(data=df, x=x, y=y, order=order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(ax, pairs, data=df, x=x, y=y, order=order)
annotator.configure(test="Mann-Whitney", text_format="simple", loc="inside")
annotator.apply_and_annotate()

In [None]:
# Keep only rows with no missing value in any column; `df` is rebound to the filtered frame.
df = df.dropna()

In [None]:
# cNMF_4 score by cohort, split by PNI status; PNI- versus PNI+ is tested within each cohort.
x, y, hue = "Cancer type", "cNMF_4", "PNI_status"
order = ["ESCA", "STAD", "PAAD"]
hue_order = ["PNI-", "PNI+"]

sub_pairs = ["PNI-", "PNI+"]
hist_types = df["Cancer type"].dropna().unique()
pairs = [((typ, sub_pairs[0]), (typ, sub_pairs[1])) for typ in hist_types]

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.boxplot(data=df, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(
    ax, pairs, data=df, x=x, y=y, hue=hue, hue_order=hue_order, order=order
)
annotator.configure(test="Mann-Whitney", text_format="simple", loc="inside")
annotator.apply_and_annotate()

In [None]:
# Number of samples per (cohort, PNI status) combination.
df[["Cancer type","PNI_status"]].value_counts()

In [None]:
# Number of samples per histological type (rows) and PNI status (columns).
df[["Histological type","PNI_status"]].value_counts().unstack()

In [None]:
# cNMF_4 score by histological type, split by PNI status; PNI- versus PNI+ is tested within each type.
x, y, hue = "Histological type", "cNMF_4", "PNI_status"
hue_order = ["PNI-", "PNI+"]
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

sub_pairs = ["PNI-", "PNI+"]
hist_types = df["Histological type"].dropna().unique()
pairs = [((typ, sub_pairs[0]), (typ, sub_pairs[1])) for typ in hist_types]

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.boxplot(data=df, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(
    ax, pairs, data=df, x=x, y=y, hue=hue, hue_order=hue_order, order=order
)
annotator.configure(test="Mann-Whitney", text_format="simple", loc="inside")
annotator.apply_and_annotate()

# BayesPrism deconvolved data

In [None]:
# TCGA cohort code -> prefix used in the deconvolution output file names read below.
name_mapping = {"ESCA": "eac", "PAAD": "pdac", "STAD": "gastric"}

In [None]:
# Tumour-component count tables per cohort.
# `regex=False` makes "." a literal dot on every pandas version: before
# pandas 2.0 the pattern defaulted to a regular expression, where "." matches
# any character and every character of the identifier would be replaced.
bp_deconvolved = {}
for cancer in ["ESCA", "PAAD", "STAD"]:
    counts = pd.read_csv(f"/add/path/here/{name_mapping[cancer]}_tumor_component_counts.csv", index_col=0)
    counts.index = counts.index.str.replace(".", "-", regex=False)
    bp_deconvolved[cancer] = counts

# Column of each cohort's purity table that is used as the malignant fraction.
malignant_column = {"ESCA": "tumor", "PAAD": "Malignant", "STAD": "Epithelium"}

bp_purity = {}
for cancer in ["ESCA", "PAAD", "STAD"]:
    fractions = pd.read_csv(f"/add/path/here/{name_mapping[cancer]}_purity.csv", index_col=0)
    # Dots -> dashes, then drop the last character of every identifier.
    fractions.index = fractions.index.str.replace(".", "-", regex=False).str[:-1]
    bp_purity[cancer] = fractions[malignant_column[cancer]].rename("Malignant")

In [None]:
# Purity values per cohort from the ESTIMATE purity table.
# The file is read once instead of once per cohort, and a loop-local name is
# used so the ESCA-only `cancer_purity` series built for the survival analysis
# is not overwritten by the last cohort processed here.
purity = pd.read_csv(eac_tcga_dir / "tumor_purity_ESTIMATE.csv", index_col=0)

true_purity = {}
for cancer in ["ESCA", "PAAD", "STAD"]:
    cohort_purity = purity.loc[purity["cancer_type"] == cancer, "TumorPurity"]
    # Strip trailing "A", then "B", then "C" characters from the sample identifiers.
    for suffix in ("A", "B", "C"):
        cohort_purity.index = cohort_purity.index.str.rstrip(suffix)
    # Identifiers can collide after stripping; keep the first occurrence.
    true_purity[cancer] = cohort_purity.loc[~cohort_purity.index.duplicated(keep="first")]

In [None]:
# Gene annotation indexed by gene name; its "length" column is used by `get_tpm`.
# Note: this rebinds `gencode`, replacing the identifier mapping table loaded earlier.
gencode = pd.read_csv("/add/path/here/gencode_annot_length.csv",index_col=0).set_index("gene_name")

In [None]:
def get_tpm(bp_deconvolved: pd.DataFrame, gencode: pd.DataFrame) -> pd.DataFrame:
    """Convert a samples x genes count table to TPM.

    Parameters
    ----------
    bp_deconvolved : counts with samples as rows and gene names as columns.
    gencode : annotation indexed by gene name with a ``length`` column.

    Returns
    -------
    Frame restricted to genes found in both inputs (first annotation row per
    gene, first row per duplicated sample label). Counts are divided by gene
    length and each row is rescaled to sum to one million.
    """
    # Annotation rows for the genes present in the counts, one row per gene.
    shared = bp_deconvolved.columns.intersection(gencode.index)
    lengths = gencode.loc[shared]
    lengths = lengths.loc[~lengths.index.duplicated()]

    # Counts for those genes, one row per sample label.
    counts = bp_deconvolved.loc[:, bp_deconvolved.columns.intersection(lengths.index)]
    counts = counts.loc[~counts.index.duplicated()]

    # Length-normalised rate, then per-sample scaling to a total of 1e6.
    rate = counts / lengths["length"]
    return rate.div(rate.sum(axis=1), axis=0) * 1000000

In [None]:
# TPM table per cohort; the last character of every sample label is dropped afterwards.
tpm_pc = {}
for cancer in ["ESCA", "PAAD", "STAD"]:
    cohort_tpm = get_tpm(bp_deconvolved=bp_deconvolved[cancer], gencode=gencode)
    cohort_tpm.index = cohort_tpm.index.str[:-1]
    tpm_pc[cancer] = cohort_tpm

In [None]:
# Deconvolved malignant fraction against the ESTIMATE purity, one panel per cohort.
fig, ax = plt.subplots(1, 3, figsize=(10, 2))
for panel, cohort in zip(ax, ["ESCA", "PAAD", "STAD"]):
    both = pd.concat([bp_purity[cohort], true_purity[cohort]], axis=1)
    sns.scatterplot(data=both, x="Malignant", y="TumorPurity", ax=panel)
    pretty_ax(panel)
    panel.set_title(cohort)
fig.tight_layout()

In [None]:
# Signature scores computed on the deconvolved TPM tables, one frame per cohort.
# A loop-local name is used for the per-cohort frame so the notebook-level
# `state_score` table (all cohorts, read by other cells) is not overwritten by
# the last cohort processed here.
state_score_full = []
for cancer in ["ESCA", "PAAD", "STAD"]:
    cohort_tpm = tpm_pc[cancer]
    cohort_scores = pd.concat(
        [
            cohort_tpm[cohort_tpm.columns.intersection(genes)].mean(axis=1).rename(sig)
            for sig, genes in full_sigs.items()
        ],
        axis=1,
    )
    cohort_scores["Cancer type"] = cancer

    # Add the histological type and drop samples lacking a score or an annotation.
    cohort_scores = pd.concat([cohort_scores, all_histological[cancer]], axis=1).dropna()
    state_score_full.append(cohort_scores)

state_score_full = pd.concat(state_score_full)

# Drop the last dash-separated field of every row label, then keep the first row per label.
idx = ["-".join(parts[:-1]) for parts in state_score_full.index.str.split("-")]
state_score_full.index = idx
state_score_full = state_score_full.loc[~state_score_full.index.duplicated()]

# Scores joined with the PNI annotation; rows with any missing value are removed.
df = pd.concat([state_score_full, pni_info], axis=1).dropna()

In [None]:
# Deconvolved cNMF_4 score by histological type, split by PNI status;
# PNI- versus PNI+ is tested within each type.
x, y, hue = "Histological type", "cNMF_4", "PNI_status"
hue_order = ["PNI-", "PNI+"]
order = [
    "Esophagus Adenocarcinoma",
    "Esophagus Squamous Cell Carcinoma",
    "Stomach Intestinal Adenocarcinoma",
    "Stomach Adenocarcinoma",
    "Pancreas Adenocarcinoma Ductal Type",
    "Pancreas Adenocarcinoma Other Subtype",
    "NA",
]

sub_pairs = ["PNI-", "PNI+"]
hist_types = df["Histological type"].dropna().unique()
pairs = [((typ, sub_pairs[0]), (typ, sub_pairs[1])) for typ in hist_types]

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.boxplot(data=df, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(
    ax, pairs, data=df, x=x, y=y, hue=hue, hue_order=hue_order, order=order
)
annotator.configure(test="Mann-Whitney", text_format="simple", loc="inside")
annotator.apply_and_annotate()

In [None]:
# Number of samples per histological type (rows) and PNI status (columns) in the deconvolved table.
df[["Histological type","PNI_status"]].value_counts().unstack()

In [None]:
# Deconvolved cNMF_4 score by cohort, split by PNI status;
# PNI- versus PNI+ is tested within each cohort.
x, y, hue = "Cancer type", "cNMF_4", "PNI_status"
order = ["ESCA", "STAD", "PAAD"]
hue_order = ["PNI-", "PNI+"]

sub_pairs = ["PNI-", "PNI+"]
hist_types = df["Cancer type"].dropna().unique()
pairs = [((typ, sub_pairs[0]), (typ, sub_pairs[1])) for typ in hist_types]

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.boxplot(data=df, x=x, y=y, hue=hue, order=order, hue_order=hue_order, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
pretty_ax(ax)
ax.set_xlabel("")

annotator = Annotator(
    ax, pairs, data=df, x=x, y=y, hue=hue, hue_order=hue_order, order=order
)
annotator.configure(test="Mann-Whitney", text_format="simple", loc="inside")
annotator.apply_and_annotate()