In [None]:
import pandas as pd
import numpy as np
import pathlib as pl

from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [None]:
def pretty_ax(ax):
    """Apply the notebook's minimal axis styling to ``ax`` in place.

    Hides the top and right spines, keeps bottom tick marks, turns off the
    top and left tick marks (tick labels on the bottom and left stay on),
    and draws the bottom and left spines with a line width of 1.5.

    Parameters
    ----------
    ax : object exposing ``spines`` and ``tick_params``
        Typically a matplotlib ``Axes``.

    Returns
    -------
    None
    """
    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    tick_options = {
        "axis": "both",
        "which": "both",
        "bottom": True,
        "top": False,
        "left": False,
        "labelbottom": True,
        "labelleft": True,
    }
    ax.tick_params(**tick_options)

    for emphasised_side in ("bottom", "left"):
        ax.spines[emphasised_side].set_linewidth(1.5)

In [None]:
_LEN_CNA = 3209286105

In [None]:
# Directory containing the TCGA-ESCA input files read by the cells below
# (CNV segments, clinical matrix, FPKM-UQ expression, gencode mapping table).
# The value is a placeholder: set it to a real location before running.
tcga_dir = pl.Path("/add/path/here")

In [None]:
# Tab-separated CNV segment table; the first column becomes the index.
# Later cells read its `value`, `Start` and `End` columns and group by "sample".
masked_cnv = pd.read_csv(
    tcga_dir / "TCGA-ESCA.masked_cnv.tsv.gz",
    sep="\t",
    index_col=0,
)

In [None]:
# Tab-separated clinical matrix indexed by its first column; the
# `histological_type` column is used later to select adenocarcinoma samples.
clinical_full = pd.read_csv(
    tcga_dir / "TCGA.ESCA.sampleMap_ESCA_clinicalMatrix",
    sep="\t",
    index_col=0,
)

In [None]:
# Expression table, transposed after loading so the file's columns become rows.
# Downstream cells treat the rows as sample identifiers and the columns as gene identifiers.
fpkm = pd.read_csv(
    tcga_dir / "TCGA-ESCA.htseq_fpkm-uq.tsv.gz",
    sep="\t",
    index_col=0,
).transpose()

# Keep only the first 15 characters of every row label so they can be
# matched against the clinical table and the CNA burden index.
fpkm.index = fpkm.index.str.slice(stop=15)

In [None]:
# Mapping table indexed by the identifier used in the expression matrix.
gencode = pd.read_csv(tcga_dir / "gencode_mapping_table.csv").set_index("gencode_id_gex")

# Identifiers present both as expression columns and in the mapping table.
shared_ids = fpkm.columns.intersection(gencode.index)
mapping_gen = gencode.loc[shared_ids].gene_name.to_dict()

# Restrict to mappable columns and relabel them with gene names.
# NOTE(review): this cell overwrites `fpkm`; re-running it after the rename
# will presumably find no shared identifiers — reload `fpkm` first.
fpkm = fpkm.loc[:, shared_ids].rename(columns=mapping_gen)

# When several identifiers map to the same gene name, keep the first column only.
is_repeated_name = fpkm.columns.duplicated()
fpkm = fpkm.loc[:, ~is_repeated_name]

In [None]:
# Samples annotated as esophageal adenocarcinoma that also have expression data.
is_eac = clinical_full["histological_type"] == "Esophagus Adenocarcinoma, NOS"
eac_patients = fpkm.index.intersection(clinical_full.loc[is_eac].index)

# `fpkm.index` was truncated to 15 characters when the matrix was loaded, so
# the sample-type code is the trailing "-NN" of each identifier. Testing the
# suffix (instead of searching for "-11" anywhere in the string) avoids
# misclassifying a sample whose participant code happens to start with "11".
# NOTE(review): "-11" is taken to mark normal-tissue samples, as implied by
# the `normal_patients` name — confirm against the TCGA barcode specification.
is_normal = eac_patients.str.endswith("-11")
normal_patients = eac_patients[is_normal]
eac_patients = eac_patients[~is_normal]

In [None]:
# Keep segments whose absolute `value` exceeds 0.2.
# `.copy()` makes `high_cnv` an independent frame, so adding a column below
# neither triggers SettingWithCopyWarning nor risks writing into `masked_cnv`.
high_cnv = masked_cnv[masked_cnv.value.abs() > 0.2].copy()

# Segment length, as the difference between its end and start coordinates.
high_cnv["length"] = high_cnv.End - high_cnv.Start

# Select "length" BEFORE aggregating: summing the whole frame would also try
# to add up non-numeric columns, which is slow and can raise on mixed types.
# NOTE(review): samples with no segment above the threshold are absent from
# this result (not 0) and are later removed by `dropna()` — confirm intended.
total_cna = high_cnv.groupby(by="sample")["length"].sum()

# Percentage of `_LEN_CNA` covered by the retained segments, per sample.
cna_burden = (total_cna / _LEN_CNA) * 100
cna_burden.name = "CNA burden"

# Drop the last character of each sample identifier so the index lines up
# with the truncated identifiers used for the expression matrix.
cna_burden.index = cna_burden.index.str[:-1]

In [None]:
# Restrict expression and clinical tables to the selected tumour samples.
fpkm_eac = fpkm.loc[eac_patients, :]
clinical_eac = clinical_full.loc[eac_patients, :]

# Keep CNA burden only for selected samples that actually have a value.
samples_with_burden = cna_burden.index.intersection(eac_patients)
cna_burden = cna_burden.loc[samples_with_burden]

In [None]:
# Standardise every column of the expression matrix across the selected
# samples, then wrap the resulting array back into a labelled DataFrame.
ss = StandardScaler()
scaled_values = ss.fit_transform(fpkm_eac)
std_fpkm = pd.DataFrame(
    scaled_values,
    index=fpkm_eac.index,
    columns=fpkm_eac.columns,
)

In [None]:
# Directory with one signature table per file (placeholder — set before running).
signature_dir = pl.Path("/add/path/here")

# Maps signature name (file name without extension) to an array holding the
# index labels of the first 100 rows of that file.
full_sigs = {}
# sorted(): `iterdir()` yields entries in arbitrary order; sorting keeps the
# signature (and therefore score-column) order reproducible between runs.
for sig_path in sorted(signature_dir.iterdir()):
    # Skip sub-directories and hidden entries (e.g. .DS_Store,
    # .ipynb_checkpoints) that would otherwise be passed to `read_csv`.
    if not sig_path.is_file() or sig_path.name.startswith("."):
        continue
    signature_table = pd.read_csv(sig_path, index_col=0)
    # `Index.ravel()` is deprecated in recent pandas; `to_numpy()` returns
    # the same 1-D array of labels.
    full_sigs[sig_path.stem] = signature_table.head(100).index.to_numpy()

In [None]:
# Genes removed from every signature before its score is computed.
mTFs = [
    "KLF5",
    "ELF3",
    "SMAD3",
    "TCF7L2",
    "HMGA2",
    "BNC2",
]

In [None]:
# One score per sample and signature: the mean standardised expression of the
# signature genes that are present in `std_fpkm`, after removing the genes
# listed in `mTFs`. Each Series is named after its signature.
per_signature_scores = [
    std_fpkm[np.setdiff1d(std_fpkm.columns.intersection(genes), mTFs)]
    .mean(axis=1)
    .rename(sig)
    for sig, genes in full_sigs.items()
]

# Samples as rows, one column per signature.
state_score = pd.concat(per_signature_scores, axis=1)

In [None]:
# Put signature scores and CNA burden side by side, then keep only the
# samples that have a value in every column.
df = pd.concat(
    [state_score, cna_burden],
    axis="columns",
).dropna(axis="index", how="any")

In [None]:
# Pairwise Pearson correlation between all signature scores and CNA burden.
df.corr(method="pearson")

In [None]:
# Pearson correlation between the cNMF_4 score and CNA burden.
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` yields the
# same 1-D array.
r, p = pearsonr(df["cNMF_4"].to_numpy(), df["CNA burden"].to_numpy())

# Scatter plot with a fitted regression line, styled with `pretty_ax`.
fig, ax = plt.subplots(1, 1, figsize=(3, 2))
sns.regplot(data=df, y="cNMF_4", x="CNA burden", ax=ax)
pretty_ax(ax)
# Annotation is positioned in data coordinates (x=60, y=1).
# The trailing semicolon hides the `Text(...)` repr in the cell output.
ax.text(60, 1, f"R={r:.2f}\np={p:.1e}", fontsize=10);

In [None]:
# Pearson correlation between the cNMF_5 score and CNA burden.
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` yields the
# same 1-D array.
r, p = pearsonr(df["cNMF_5"].to_numpy(), df["CNA burden"].to_numpy())

# Scatter plot with a fitted regression line, styled with `pretty_ax`.
fig, ax = plt.subplots(1, 1, figsize=(3, 2))
sns.regplot(data=df, y="cNMF_5", x="CNA burden", ax=ax)
pretty_ax(ax)
# Annotation is positioned in data coordinates (x=60, y=1).
# The trailing semicolon hides the `Text(...)` repr in the cell output.
ax.text(60, 1, f"R={r:.2f}\np={p:.1e}", fontsize=10);

In [None]:
# Pearson correlation between the cNMF_3 score and CNA burden.
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` yields the
# same 1-D array.
r, p = pearsonr(df["cNMF_3"].to_numpy(), df["CNA burden"].to_numpy())

# Scatter plot with a fitted regression line, styled with `pretty_ax`.
fig, ax = plt.subplots(1, 1, figsize=(3, 2))
sns.regplot(data=df, y="cNMF_3", x="CNA burden", ax=ax)
pretty_ax(ax)
# Annotation is positioned in data coordinates (x=60, y=1).
# The trailing semicolon hides the `Text(...)` repr in the cell output.
ax.text(60, 1, f"R={r:.2f}\np={p:.1e}", fontsize=10);