In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import loompy as lp
import pathlib as pl

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def pretty_ax(ax, linew: float=1.5):
    """Reduce an axes to its bottom/left spines and set their thickness.

    Parameters
    ----------
    ax
        Object exposing ``spines`` and ``tick_params`` (e.g. a matplotlib
        Axes). It is modified in place; nothing is returned.
    linew
        Line width applied to the bottom and left spines.
    """
    # Hide the spines that frame the top and right of the plot.
    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    # Bottom tick marks and both sets of tick labels stay on;
    # top and left tick marks are switched off.
    tick_options = dict(
        axis="both",
        which="both",
        bottom=True,
        top=False,
        left=False,
        labelbottom=True,
        labelleft=True,
    )
    ax.tick_params(**tick_options)

    for kept_side in ("bottom", "left"):
        ax.spines[kept_side].set_linewidth(linew)

In [None]:
# One correlation table per cNMF program (cNMF_1 ... cNMF_5), keyed by program name.
orig_corrs = {}
for state in (f"cNMF_{i}" for i in range(1,6)):
    orig_corrs[state] = pd.read_csv(
        f"/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/correlation_TF_scenicplus/{state}_score_triad_corr.csv",
        index_col=0,
    )

In [None]:
# For every program keep the rows whose three correlation columns all exceed 0.1
# (rows with any missing value are discarded first).
corr_columns = ["TF GEX r", "eReg. Gene r", "eReg. Reg. r"]

mTFs_pp = {}
for state in [f"cNMF_{i}" for i in range(1,6)]:
    df = orig_corrs[state].dropna().copy()
    passes_all = (df[corr_columns] > 0.1).all(axis=1)
    mTFs_pp[state] = df[passes_all].index

# Carroll et al.

In [None]:
adata = sc.read_h5ad("/cluster/work/boeva/jyates/EAC_singlecell/atlas_datasets/Carroll_singlecell/Carroll_EAC_raw.h5ad")

# Gene filter is applied on the full object, i.e. before the patient subset below.
sc.pp.filter_genes(adata, min_cells=500)

# Patients retained for the analysis.
carroll_patients = [
    "EAC-LPPN", "EAC-ACMO", "EAC-HDHI", "EAC-PAHE", "EAC-HENB",
    "EAC-HGFI", "EAC-GDBD", "EAC-JCNP", "EAC-ODHL", "EAC-IKIJ",
    "EAC-JJHD",
]
in_selected_patients = adata.obs["patient"].isin(carroll_patients)
adata = adata[in_selected_patients].copy()

In [None]:
import loompy

f_loom_path_scenic = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/pyscenic-external-results/adata_filtered_scenic.loom"

# Loom stores the matrix with genes as rows and cells as columns, hence the transpose.
genes_by_cells = adata.X.transpose()

# Row attributes describe genes, column attributes describe cells.
row_attrs = {"Gene": np.array(adata.var_names)}
col_attrs = {
    "CellID": np.array(adata.obs_names),
    # number of genes with a non-zero entry in each cell
    "nGene": np.array(np.sum(genes_by_cells > 0, axis=0)).flatten(),
    # sum of the matrix entries of each cell
    "nUMI": np.array(np.sum(genes_by_cells, axis=0)).flatten(),
}
loompy.create(f_loom_path_scenic, adata.X.transpose(), row_attrs, col_attrs)

In [None]:
# collect SCENIC AUCell output
# Collect the SCENIC AUCell output: one row per cell (indexed by CellID), one column
# per field of the RegulonsAUC column attribute.
# The context manager closes the loom file even if building the DataFrame raises.
# NOTE(review): the path is relative to the notebook's working directory, unlike the
# absolute paths used elsewhere; the file is only read here, so mode='r' may suffice — confirm.
with lp.connect("pyscenic-external-results/pyscenic_carroll_output.loom", mode='r+', validate=False) as lf:
    auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)

In [None]:
# Keep a copy of the matrix as it is before normalisation in the "counts" layer.
adata.layers["counts"] = adata.X.copy()

# Normalise every cell to a total of 10,000, then log1p-transform; both modify adata.X in place.
# NOTE(review): re-running this cell normalises an already normalised matrix.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# Remove the .raw attribute from the object.
# NOTE(review): presumably so that later scanpy calls (e.g. score_genes) read adata.X
# rather than .raw — confirm.
del adata.raw

In [None]:
signature_dir = pl.Path("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/cNMF_malignant_genes_new_cosine")

# Build {file stem: gene array} from every file in the signature directory.
full_sigs = {}
for s in signature_dir.iterdir():
    # Sub-directories and hidden entries (e.g. .ipynb_checkpoints) are not signature tables.
    if not s.is_file() or s.name.startswith("."):
        continue
    sig = s.stem
    sig_table = pd.read_csv(s, index_col=0)
    # Drop genes whose name starts with MT-, RPS or RPL, then keep the first 100 remaining rows.
    sig_table = sig_table[~sig_table.index.str.startswith(("MT-","RPS","RPL"))]
    # to_numpy() always yields an ndarray; Index.ravel() changed its return type across pandas versions.
    full_sigs[sig] = sig_table.head(100).index.to_numpy()

In [None]:
# Table with one column of candidate TFs per program; read under its own name so that
# `toptfs` is only ever the dictionary used by the cells below.
toptfs_table = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/correlation_TF_scenicplus/toptfs_top20.csv",index_col=0)
# Alternative TF list:
#toptfs_table = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/correlation_TF_scenicplus/toptfs_top10_zscoreauc.csv",index_col=0)

# Key each column by its first six characters and drop missing entries.
# Series.to_numpy() replaces the deprecated Series.ravel().
toptfs = {state[:6]: toptfs_table[state].dropna().to_numpy() for state in toptfs_table.columns}

# Score every signature after removing its candidate TFs from the gene list, so that the
# score is not computed from the TFs it is later correlated with.
for sig, genes in full_sigs.items():
    sc.tl.score_genes(adata=adata,
                      gene_list=list(np.setdiff1d(genes, toptfs[sig])),
                      score_name=f"{sig}_score")

In [None]:
# Restrict to the cells annotated with celltype "EAC".
is_eac = adata.obs["celltype"].isin(["EAC"])
subadata = adata[is_eac].copy()

In [None]:
all_tfs = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/auxiliary_data/utoronto_human_tfs_v_1.01.txt",header=None).values.ravel()

# Dense cells x TFs expression table, limited to the listed TFs present in the data.
tf_genes = subadata.var_names.intersection(all_tfs)
TF_expr = pd.DataFrame(subadata[:, tf_genes].X.copy().toarray(),
                       index=subadata.obs_names, columns=tf_genes)

program_names = [f"cNMF_{i}" for i in range(1,6)]

# Correlation of each TF's expression with each program score (one column per program).
all_corrs = pd.concat(
    [TF_expr.corrwith(subadata.obs[f"{program}_score"]) for program in program_names],
    axis=1,
)
all_corrs.columns = program_names
all_corrs = all_corrs.dropna()

# Same, using the AUCell values of the selected cells instead of TF expression.
auc_selected = auc_mtx.loc[subadata.obs_names]
reg_corrs = pd.concat(
    [auc_selected.corrwith(subadata.obs[f"{program}_score"]) for program in program_names],
    axis=1,
)
reg_corrs.columns = program_names
# Strip the last three characters of each regulon name
# (presumably a "(+)"-style suffix, so names match the TF symbols — confirm).
reg_corrs.index = reg_corrs.index.str[:-3]
reg_corrs = reg_corrs.dropna()

In [None]:
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
from adjustText import adjust_text

from tqdm.notebook import tqdm

# Candidate TFs per program (overrides any earlier `mTFs_pp`).
mTFs_pp = toptfs.copy()

fig_dir = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/figures/pyscenic-external"
x_col, y_col = "TF GEX r", "Gene-based eReg. r"

# Wrapping the list (rather than the enumerate object) lets tqdm know the total.
for i, state in enumerate(tqdm([f"cNMF_{k}" for k in range(1,6)])):

    # TFs that have both an expression-based and a regulon-based correlation.
    wormdf = pd.concat([all_corrs[state].sort_values(ascending=False),reg_corrs[state]],axis=1).dropna()
    wormdf.columns = [x_col, y_col]

    # Threshold on the joined table: both columns share one index there, so no implicit
    # re-alignment of differently indexed boolean masks is needed.
    TFs_mostcorr = wormdf[(wormdf[x_col] > 0.1) & (wormdf[y_col] > 0.1)].index

    TFs_candidate = wormdf.index.intersection(mTFs_pp[state])

    TFs_common = np.intersect1d(TFs_candidate, TFs_mostcorr)

    fig, ax = plt.subplots(1,1, figsize=(3,3))
    sns.scatterplot(data=wormdf, x=x_col, y=y_col, ax=ax, alpha=0.7, c=["lightgray"]*wormdf.shape[0])

    ax.set_title(f"cNMF$_{i+1}$ top TFs", fontsize=13)

    # Candidates passing both thresholds are labelled in purple, the others in red.
    texts = []
    label_groups = (
        (np.setdiff1d(TFs_candidate, TFs_common), "red"),
        (TFs_common, "purple"),
    )
    for genes, colour in label_groups:
        for g in genes:
            texts.append(ax.text(wormdf.loc[g, x_col], wormdf.loc[g, y_col], g, fontsize=13, c=colour))

    # Nothing to de-overlap when no candidate TF is present in this dataset.
    if texts:
        adjust_text(texts, arrowprops=dict(arrowstyle="-", color='r', lw=1.5), force_text=(0.2,0.3))

    ax.xaxis.set_tick_params(labelsize=13)
    ax.yaxis.set_tick_params(labelsize=13)
    ax.set_ylabel(y_col, fontsize=13)
    ax.set_xlabel(x_col, fontsize=13)
    pretty_ax(ax, linew=3)

    for ext in ("png", "svg"):
        fig.savefig(f"{fig_dir}/{state}_carroll_et_al.{ext}", dpi=200, bbox_inches="tight")

# Croft et al.

In [None]:
# Load the Croft single-cell dataset (file GSE222078_adata.h5ad) as an AnnData object.
adata_croft = sc.read_h5ad("/cluster/work/boeva/jyates/EAC_singlecell/atlas_datasets/Croft_singlecell/GSE222078_adata.h5ad")

In [None]:
# Filter the Croft object: the original call targeted `adata` (the Carroll dataset),
# which left adata_croft unfiltered and filtered the Carroll data a second time.
sc.pp.filter_genes(adata_croft, min_cells=500)

In [None]:
import loompy

f_loom_path_scenic = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/pyscenic-external-results/adata_croft_filtered_scenic.loom"

# Export the Croft object. The original cell wrote `adata` (the Carroll dataset, already
# normalised and log-transformed by then) into the Croft loom file.
# Loom stores genes as rows and cells as columns, hence the transpose.
croft_genes_by_cells = adata_croft.X.transpose()

# create basic row and column attributes for the loom file:
row_attrs = {
    "Gene": np.array(adata_croft.var_names),
}
col_attrs = {
    "CellID": np.array(adata_croft.obs_names),
    # number of genes with a non-zero entry in each cell
    "nGene": np.array(np.sum(croft_genes_by_cells > 0, axis=0)).flatten(),
    # sum of the matrix entries of each cell
    "nUMI": np.array(np.sum(croft_genes_by_cells, axis=0)).flatten(),
}
loompy.create(f_loom_path_scenic, croft_genes_by_cells, row_attrs, col_attrs)

In [None]:
# Keep a copy of the matrix as it is before normalisation in the "counts" layer.
adata_croft.layers["counts"] = adata_croft.X.copy()

# Normalise every cell to a total of 10,000, then log1p-transform; both modify adata_croft.X in place.
# NOTE(review): re-running this cell normalises an already normalised matrix.
sc.pp.normalize_total(adata_croft, target_sum=10000)
sc.pp.log1p(adata_croft)

In [None]:
# Collect the SCENIC AUCell output for the Croft dataset: one row per cell (indexed by
# CellID), one column per field of the RegulonsAUC column attribute.
# The context manager closes the loom file even if building the DataFrame raises.
# NOTE(review): `auc_mtx` is shared with the other dataset sections and is overwritten here;
# the path is relative to the notebook's working directory.
with lp.connect("pyscenic-external-results/pyscenic_croft_output.loom", mode='r+', validate=False) as lf:
    auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)

In [None]:
signature_dir = pl.Path("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/cNMF_malignant_genes_new_cosine")

# Build {file stem: gene array} from every file in the signature directory.
full_sigs = {}
for s in signature_dir.iterdir():
    # Sub-directories and hidden entries (e.g. .ipynb_checkpoints) are not signature tables.
    if not s.is_file() or s.name.startswith("."):
        continue
    sig = s.stem
    sig_table = pd.read_csv(s, index_col=0)
    # Drop genes whose name starts with MT-, RPS or RPL, then keep the first 100 remaining rows.
    sig_table = sig_table[~sig_table.index.str.startswith(("MT-","RPS","RPL"))]
    # to_numpy() always yields an ndarray; Index.ravel() changed its return type across pandas versions.
    full_sigs[sig] = sig_table.head(100).index.to_numpy()

In [None]:
# Table with one column of candidate TFs per program; read under its own name so that
# `toptfs` is only ever the dictionary used by the cells below.
toptfs_table = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/correlation_TF_scenicplus/toptfs_top20.csv",index_col=0)
# Alternative TF list:
#toptfs_table = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/correlation_TF_scenicplus/toptfs_top10_zscoreauc.csv",index_col=0)

# Key each column by its first six characters and drop missing entries.
# Series.to_numpy() replaces the deprecated Series.ravel().
toptfs = {state[:6]: toptfs_table[state].dropna().to_numpy() for state in toptfs_table.columns}

# Score every signature on the Croft data after removing its candidate TFs from the gene list.
for sig, genes in full_sigs.items():
    sc.tl.score_genes(adata=adata_croft,
                      gene_list=list(np.setdiff1d(genes, toptfs[sig])),
                      score_name=f"{sig}_score")

In [None]:
# Restrict to the cells annotated with celltype "Epithelial".
is_epithelial = adata_croft.obs["celltype"].isin(["Epithelial"])
subadata_croft = adata_croft[is_epithelial].copy()

In [None]:
all_tfs = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/auxiliary_data/utoronto_human_tfs_v_1.01.txt",header=None).values.ravel()

# Dense cells x TFs expression table, limited to the listed TFs present in the data.
tf_genes_croft = subadata_croft.var_names.intersection(all_tfs)
TF_expr_croft = pd.DataFrame(subadata_croft[:, tf_genes_croft].X.copy().toarray(),
                             index=subadata_croft.obs_names, columns=tf_genes_croft)

program_names = [f"cNMF_{i}" for i in range(1,6)]

# Correlation of each TF's expression with each program score (one column per program).
all_corrs_croft = pd.concat(
    [TF_expr_croft.corrwith(subadata_croft.obs[f"{program}_score"]) for program in program_names],
    axis=1,
)
all_corrs_croft.columns = program_names
all_corrs_croft = all_corrs_croft.dropna()

# Same, using the AUCell values of the selected cells instead of TF expression.
auc_selected_croft = auc_mtx.loc[subadata_croft.obs_names]
reg_corrs_croft = pd.concat(
    [auc_selected_croft.corrwith(subadata_croft.obs[f"{program}_score"]) for program in program_names],
    axis=1,
)
reg_corrs_croft.columns = program_names
# Strip the last three characters of each regulon name
# (presumably a "(+)"-style suffix, so names match the TF symbols — confirm).
reg_corrs_croft.index = reg_corrs_croft.index.str[:-3]
reg_corrs_croft = reg_corrs_croft.dropna()

In [None]:
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
from adjustText import adjust_text

from tqdm.notebook import tqdm

# Candidate TFs per program (overrides any earlier `mTFs_pp`).
mTFs_pp = toptfs.copy()

fig_dir = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/figures/pyscenic-external"
x_col, y_col = "TF GEX r", "Gene-based eReg. r"

# Wrapping the list (rather than the enumerate object) lets tqdm know the total.
for i, state in enumerate(tqdm([f"cNMF_{k}" for k in range(1,6)])):

    # TFs that have both an expression-based and a regulon-based correlation.
    wormdf = pd.concat([all_corrs_croft[state].sort_values(ascending=False),reg_corrs_croft[state]],axis=1).dropna()
    wormdf.columns = [x_col, y_col]

    # Threshold on the joined table: both columns share one index there, so no implicit
    # re-alignment of differently indexed boolean masks is needed.
    TFs_mostcorr = wormdf[(wormdf[x_col] > 0.1) & (wormdf[y_col] > 0.1)].index

    TFs_candidate = wormdf.index.intersection(mTFs_pp[state])

    TFs_common = np.intersect1d(TFs_candidate, TFs_mostcorr)

    fig, ax = plt.subplots(1,1, figsize=(3,3))
    sns.scatterplot(data=wormdf, x=x_col, y=y_col, ax=ax, alpha=0.7, c=["lightgray"]*wormdf.shape[0])

    ax.set_title(f"cNMF$_{i+1}$ top TFs", fontsize=13)

    # `texts` is rebuilt on every iteration. Previously it was only reset when candidates
    # existed, so a program without candidates either raised NameError or re-adjusted the
    # labels belonging to the previous figure.
    texts = []
    label_groups = (
        (np.setdiff1d(TFs_candidate, TFs_common), "red"),
        (TFs_common, "purple"),
    )
    for genes, colour in label_groups:
        for g in genes:
            texts.append(ax.text(wormdf.loc[g, x_col], wormdf.loc[g, y_col], g, fontsize=13, c=colour))

    if texts:
        adjust_text(texts, arrowprops=dict(arrowstyle="-", color='r', lw=1.5), force_text=(0.2,0.3))

    ax.xaxis.set_tick_params(labelsize=13)
    ax.yaxis.set_tick_params(labelsize=13)
    ax.set_ylabel(y_col, fontsize=13)
    ax.set_xlabel(x_col, fontsize=13)
    pretty_ax(ax, linew=3)

    for ext in ("png", "svg"):
        fig.savefig(f"{fig_dir}/{state}_croft_et_al.{ext}", dpi=200, bbox_inches="tight")

# Luo et al.

In [None]:
# Load the fibroblast count object (file GSE210347_fibroblast_counts.h5ad).
adata_luo = sc.read_h5ad("/cluster/work/boeva/jyates/EAC_singlecell/atlas_datasets/CAF_atlas/CAF/GSE210347_fibroblast_counts.h5ad")

# Keep only genes detected in at least 500 cells (filters adata_luo in place).
sc.pp.filter_genes(adata_luo, min_cells=500)

In [None]:
# Remove the .raw attribute from the object.
# NOTE(review): presumably so that later scanpy calls (e.g. score_genes) read adata_luo.X
# rather than .raw — confirm.
del adata_luo.raw

In [None]:
import loompy

f_loom_path_scenic = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/pyscenic-external-results/adata_luo_filtered_scenic.loom"

# Loom stores the matrix with genes as rows and cells as columns, hence the transpose.
luo_genes_by_cells = adata_luo.X.transpose()

# Row attributes describe genes, column attributes describe cells.
row_attrs = {"Gene": np.array(adata_luo.var_names)}
col_attrs = {
    "CellID": np.array(adata_luo.obs_names),
    # number of genes with a non-zero entry in each cell
    "nGene": np.array(np.sum(luo_genes_by_cells > 0, axis=0)).flatten(),
    # sum of the matrix entries of each cell
    "nUMI": np.array(np.sum(luo_genes_by_cells, axis=0)).flatten(),
}
loompy.create(f_loom_path_scenic, adata_luo.X.transpose(), row_attrs, col_attrs)

In [None]:
# Keep a copy of the matrix as it is before normalisation in the "counts" layer.
adata_luo.layers["counts"] = adata_luo.X.copy()

# Normalise every cell to a total of 10,000, then log1p-transform; both modify adata_luo.X in place.
# NOTE(review): re-running this cell normalises an already normalised matrix.
sc.pp.normalize_total(adata_luo, target_sum=10000)
sc.pp.log1p(adata_luo)

In [None]:
# Collect the SCENIC AUCell output for the Luo dataset: one row per cell (indexed by
# CellID), one column per field of the RegulonsAUC column attribute.
# The context manager closes the loom file even if building the DataFrame raises.
# NOTE(review): `auc_mtx` is shared with the other dataset sections and is overwritten here;
# the path is relative to the notebook's working directory.
with lp.connect("pyscenic-external-results/pyscenic_luo_output.loom", mode='r+', validate=False) as lf:
    auc_mtx = pd.DataFrame(lf.ca.RegulonsAUC, index=lf.ca.CellID)

In [None]:
signature_dir = pl.Path("/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/marker_genes/fibroblast/")

# Build {file stem: gene array} from every marker-gene file in the directory.
full_sigs = {}
for s in signature_dir.iterdir():
    # Sub-directories and hidden entries (e.g. .ipynb_checkpoints) are not marker tables.
    if not s.is_file() or s.name.startswith("."):
        continue
    sig = s.stem
    # Gene symbols live in the "names" column of these tables.
    marker_table = pd.read_csv(s, index_col=0).set_index("names")
    # Drop genes whose name starts with MT-, RPS or RPL, then keep the first 100 remaining rows.
    marker_table = marker_table[~marker_table.index.str.startswith(("MT-","RPS","RPL"))]
    # to_numpy() always yields an ndarray; Index.ravel() changed its return type across pandas versions.
    full_sigs[sig] = marker_table.head(100).index.to_numpy()

In [None]:
sig = "Inflammatory CAF"
# Candidate TFs for this signature; they are removed from the gene list before scoring.
toptfs = {"Inflammatory CAF": ["RUNX1","RUNX2","PRRX1","BNC2"]}

signature_without_tfs = np.setdiff1d(full_sigs[sig], toptfs[sig])
sc.tl.score_genes(
    adata=adata_luo,
    gene_list=list(signature_without_tfs),
    score_name=f"{sig}_score",
)

In [None]:
all_tfs = pd.read_csv("/cluster/work/boeva/jyates/EAC_singlecell/auxiliary_data/utoronto_human_tfs_v_1.01.txt",header=None).values.ravel()

# Dense cells x TFs expression table, limited to the listed TFs present in the data.
tf_genes_luo = adata_luo.var_names.intersection(all_tfs)
TF_expr_luo = pd.DataFrame(adata_luo[:, tf_genes_luo].X.copy().toarray(),
                           index=adata_luo.obs_names, columns=tf_genes_luo)

caf_score = adata_luo.obs["Inflammatory CAF_score"]

# Correlation of each TF's expression with the signature score.
all_corrs_luo = TF_expr_luo.corrwith(caf_score).to_frame()
all_corrs_luo.columns = ["Inflammatory CAF"]
all_corrs_luo = all_corrs_luo.dropna()

# Same, using the AUCell values of the cells instead of TF expression.
reg_corrs_luo = auc_mtx.loc[adata_luo.obs_names].corrwith(caf_score).to_frame()
reg_corrs_luo.columns = ["Inflammatory CAF"]
# Strip the last three characters of each regulon name
# (presumably a "(+)"-style suffix, so names match the TF symbols — confirm).
reg_corrs_luo.index = reg_corrs_luo.index.str[:-3]
reg_corrs_luo = reg_corrs_luo.dropna()

In [None]:
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
from adjustText import adjust_text

from tqdm.notebook import tqdm

# Candidate TFs per signature (overrides any earlier `mTFs_pp`).
mTFs_pp = toptfs.copy()
state = "Inflammatory CAF"

fig_dir = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/figures/pyscenic-external"
x_col, y_col = "TF GEX r", "pySCENIC Gene-based eReg. r"

# TFs that have both an expression-based and a regulon-based correlation.
wormdf = pd.concat([all_corrs_luo[state].sort_values(ascending=False),reg_corrs_luo[state]],axis=1).dropna()
wormdf.columns = [x_col, y_col]

# Threshold on the joined table: both columns share one index there, so no implicit
# re-alignment of differently indexed boolean masks is needed.
TFs_mostcorr = wormdf[(wormdf[x_col] > 0.1) & (wormdf[y_col] > 0.1)].index

TFs_candidate = wormdf.index.intersection(mTFs_pp[state])

TFs_common = np.intersect1d(TFs_candidate, TFs_mostcorr)

fig, ax = plt.subplots(1,1, figsize=(3.5,3.5))
sns.scatterplot(data=wormdf, x=x_col, y=y_col, ax=ax, alpha=0.7, c=["lightgray"]*wormdf.shape[0])
pretty_ax(ax)
ax.set_title(f"{state} top TFs")

# `texts` is always rebuilt here. Previously it was only assigned when candidates existed,
# so with no candidates the call below reused labels left over from an earlier figure
# (or raised NameError on a fresh kernel).
texts = []
label_groups = (
    (np.setdiff1d(TFs_candidate, TFs_common), "red"),
    (TFs_common, "purple"),
)
for genes, colour in label_groups:
    for g in genes:
        texts.append(ax.text(wormdf.loc[g, x_col], wormdf.loc[g, y_col], g, fontsize=10, c=colour))

if texts:
    adjust_text(texts, arrowprops=dict(arrowstyle="-", color='r', lw=1.5), force_text=(0.2,0.3))

for ext in ("png", "svg"):
    fig.savefig(f"{fig_dir}/luo_et_al.{ext}", dpi=200, bbox_inches="tight")

In [None]:
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
from adjustText import adjust_text

from tqdm.notebook import tqdm

# Candidate TFs per signature (overrides any earlier `mTFs_pp`).
mTFs_pp = toptfs.copy()
state = "Inflammatory CAF"

fig_dir = "/cluster/work/boeva/jyates/EAC_singlecell/upd_clean_code/figures/pyscenic-external"

# Expression-only view: each TF's correlation with the score against its rank
# (default ascending rank, so the most correlated TF has the highest rank).
corr_sorted = all_corrs_luo[state].sort_values(ascending=False)
wormdf = pd.concat([corr_sorted, corr_sorted.rank()], axis=1)
wormdf.columns = ["Correlation","Rank"]

TFs_mostcorr = all_corrs_luo[(all_corrs_luo[state]>0.1)].index

TFs_candidate = wormdf.index.intersection(mTFs_pp[state])

TFs_common = np.intersect1d(TFs_candidate, TFs_mostcorr)

fig, ax = plt.subplots(1,1, figsize=(1,3.5))
sns.scatterplot(data=wormdf, x="Rank", y="Correlation",ax=ax)
pretty_ax(ax)
ax.set_title(f"{state} top TFs")

# `texts` is always rebuilt here so labels from an earlier figure are never reused.
texts = []
label_groups = (
    (np.setdiff1d(TFs_candidate, TFs_common), "red"),
    (TFs_common, "purple"),
)
for genes, colour in label_groups:
    for g in genes:
        texts.append(ax.text(wormdf.loc[g, "Rank"], wormdf.loc[g, "Correlation"], g, fontsize=10, c=colour))

# Adjust once (the call was duplicated before) and only when there is something to adjust.
if texts:
    adjust_text(texts, arrowprops=dict(arrowstyle="-", color='r', lw=1.5), force_text=(0.2,0.3))

# Both formats use the "_onlygex" stem; the SVG previously went to luo_et_al.svg and
# overwrote the figure saved by the GEX-vs-regulon cell.
for ext in ("png", "svg"):
    fig.savefig(f"{fig_dir}/luo_et_al_onlygex.{ext}", dpi=200, bbox_inches="tight")

In [None]:
# Top 20 TFs by regulon-based correlation for the Inflammatory CAF signature.
# The table is rebuilt here because the `wormdf` left by the preceding cell only has the
# columns "Correlation" and "Rank", so sorting it by "Gene-based eReg. r" raised a KeyError.
# NOTE(review): assumes the regulon-based correlation is the intended sort key — confirm.
luo_tf_corrs = pd.concat(
    [all_corrs_luo["Inflammatory CAF"], reg_corrs_luo["Inflammatory CAF"]],
    axis=1,
).dropna()
luo_tf_corrs.columns = ["TF GEX r", "pySCENIC Gene-based eReg. r"]
luo_tf_corrs.sort_values("pySCENIC Gene-based eReg. r", ascending=False).head(20)