# Fibroblast subcluster annotation & functional programs
Related to Extended Fig. 1.

In [None]:
# CONFIG (edit here only)
# Central place for every path, adata.obs column name and tunable used by the
# cells below.
import os

# --- Input / output locations ---
DATA_DIR = "./data"
OUT_DIR  = "./outputs"
FIG_DIR  = os.path.join(OUT_DIR, "figures")
TAB_DIR  = os.path.join(OUT_DIR, "tables")
H5AD_DIR = os.path.join(OUT_DIR, "h5ad")
INPUT_H5AD = os.path.join(DATA_DIR, "HCC_integrated_harmony.h5ad")

# --- adata.obs columns ---
# The *_FALLBACKS lists are tried in order when the preferred column is absent.
BATCH_KEY    = "sample"
DATASET_KEY  = "dataset"
BATCH_KEY_FALLBACKS = ["S_ID", "orig.ident", "patient", "donor", "sample_id", "sample"]
DATASET_KEY_FALLBACKS = ["dataset", "datasets", "batch", "orig.ident", "study", "source"]
GROUP_KEY_IN = None

# --- Analysis parameters ---
SEED = 0
N_HVG = 1500
N_PCS = 20
LEIDEN_RES = 0.90
# Derived from LEIDEN_RES so the obs column name can never disagree with the
# resolution that is actually passed to the clustering step.
CLUSTER_KEY  = f"leiden_{LEIDEN_RES:.2f}"
AUCELL_LAYER = "lognorm"

# --- Local gene-set files (GMT) used for over-representation analysis ---
GENESET_DIR = os.path.join(DATA_DIR, "gene_sets")
HALLMARK_GMT = os.path.join(GENESET_DIR, "hallmark.gmt")
KEGG_GMT     = os.path.join(GENESET_DIR, "kegg.gmt")
GOBP_GMT     = os.path.join(GENESET_DIR, "gobp.gmt")

# Create the output tree up front so later cells can write without checks.
for d in (OUT_DIR, FIG_DIR, TAB_DIR, H5AD_DIR):
    os.makedirs(d, exist_ok=True)
print("INPUT_H5AD:", INPUT_H5AD)
print("OUT_DIR:", OUT_DIR)


In [None]:
# Environment & reproducibility
# Seeds every available random number generator with SEED, configures figure
# export, points scanpy's figure output at FIG_DIR and prints package versions.
import random
import numpy as np

# torch and scvi are optional: if importing either one fails for any reason the
# name is bound to None and the matching seeding / version steps are skipped.
try:
    import torch
except Exception:
    torch = None

try:
    import scvi
except Exception:
    scvi = None

import scanpy as sc
import anndata as ad
import matplotlib as mpl

# --- Seeding ---
random.seed(SEED)
np.random.seed(SEED)
if torch is not None:
    for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all):
        _seed_fn(SEED)
if scvi is not None:
    scvi.settings.seed = SEED

# --- Figure export: font handling for PDF / PS / SVG output ---
mpl.rcParams.update(
    {
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
        "svg.fonttype": "none",
    }
)

# --- scanpy behaviour: log level, save directory, no automatic display ---
sc.settings.verbosity = 2
sc.settings.figdir = FIG_DIR
sc.settings.autoshow = False

# --- Version report (optional packages are listed only when importable) ---
versions = {"scanpy": sc.__version__, "anndata": ad.__version__}
for _label, _module in (("torch", torch), ("scvi-tools", scvi)):
    if _module is not None:
        versions[_label] = _module.__version__

print("Versions:", versions)

## Data loading



In [None]:
# Load data and perform minimal sanity checks
import scanpy as sc
import pandas as pd
import scipy.sparse as sp
# Read the integrated object and keep only the cells labelled as fibroblasts.
adata = sc.read_h5ad(INPUT_H5AD)
# --- subset to Fibro only ---
MAJOR_KEY = "majortype"
FIBRO_LABEL = "Fibro"
if MAJOR_KEY not in adata.obs.columns:
    raise KeyError(
        f"Missing required adata.obs column: '{MAJOR_KEY}'. "
        f"Available columns (first 30): {list(adata.obs.columns)[:30]}"
    )
n0 = adata.n_obs
# Compare as strings so categorical and plain object columns behave the same.
_is_fibro = adata.obs[MAJOR_KEY].astype(str) == FIBRO_LABEL
if not _is_fibro.any():
    # Fail here with the observed labels instead of letting an empty object
    # reach HVG selection / PCA, where the error would be far less readable.
    raise ValueError(
        f"No cells with {MAJOR_KEY} == '{FIBRO_LABEL}' in {INPUT_H5AD}. "
        f"Observed labels (first 30): {sorted(adata.obs[MAJOR_KEY].astype(str).unique())[:30]}"
    )
adata = adata[_is_fibro].copy()
print(f"Kept {adata.n_obs} of {n0} cells with {MAJOR_KEY} == '{FIBRO_LABEL}'.")
print("adata shape:", adata.shape)
# Resolve batch/dataset keys with fallbacks for robustness
def _resolve_obs_key(adata_in, primary, fallbacks, label):
    """Return the name of an existing ``adata_in.obs`` column.

    ``primary`` is returned when it exists. Otherwise the first entry of
    ``fallbacks`` that is a column of ``adata_in.obs`` is returned and an INFO
    line naming the substitution is printed.

    Parameters
    ----------
    adata_in
        Object exposing ``.obs.columns``.
    primary
        Preferred column name.
    fallbacks
        Alternative column names, checked in the given order.
    label
        Human-readable role of the key; used only in messages.

    Raises
    ------
    KeyError
        If neither ``primary`` nor any fallback is a column of ``adata_in.obs``.
    """
    obs_columns = adata_in.obs.columns
    if primary in obs_columns:
        return primary

    usable = [name for name in fallbacks if name in obs_columns]
    if not usable:
        raise KeyError(
            f"Missing required adata.obs column for {label}: '{primary}'. "
            f"Available columns (first 30): {list(adata_in.obs.columns)[:30]}"
        )

    chosen = usable[0]
    print(f"INFO: {label} key '{primary}' not found; using fallback '{chosen}'.")
    return chosen
# Resolve the obs columns that later cells use for Harmony (BATCH_KEY) and for
# per-dataset HVG selection (DATASET_KEY).
BATCH_KEY = _resolve_obs_key(adata, BATCH_KEY, BATCH_KEY_FALLBACKS, "batch")
DATASET_KEY = _resolve_obs_key(adata, DATASET_KEY, DATASET_KEY_FALLBACKS, "dataset")
required_obs = [BATCH_KEY, DATASET_KEY]
missing = [k for k in required_obs if k not in adata.obs.columns]
if missing:
    raise KeyError(f"Missing required adata.obs columns: {missing}. Available columns (first 30): {list(adata.obs.columns)[:30]}")

if "counts" not in adata.layers:
    # Without an explicit counts layer, X is taken to be the raw count matrix.
    # Both the "seurat_v3" HVG flavour and normalize_total below expect counts,
    # so flag an X that does not look like integers instead of silently
    # building every downstream layer from already-transformed values.
    _probe = adata.X.data if sp.issparse(adata.X) else np.asarray(adata.X).ravel()
    _probe = np.asarray(_probe[:100_000])  # a prefix is enough for a sanity check
    if _probe.size and not np.allclose(_probe, np.round(_probe)):
        print(
            "WARNING: adata.layers['counts'] is missing and adata.X contains "
            "non-integer values; X is being used as counts but looks normalised."
        )
    adata.layers["counts"] = adata.X.copy()

# Build the log-normalised layer on a scratch copy so adata.X stays untouched:
# library-size normalisation to 1e4 followed by log1p.
_tmp = adata.copy()
_tmp.X = _tmp.layers["counts"]
sc.pp.normalize_total(_tmp, target_sum=1e4)
sc.pp.log1p(_tmp)
adata.layers["lognorm"] = _tmp.X.copy()
del _tmp

if "subtype" in adata.obs.columns:
    print("subtype n:", adata.obs["subtype"].nunique())


## Core analysis

We perform four core steps:

1) **Subcluster annotation**: HVGs → PCA → Harmony → UMAP → Leiden → marker genes → map Leiden clusters to fibroblast programs (`subtype`) and remove obvious doublets.  
2) **AUCell scoring**: score curated fibroblast gene programs (ImmuneReg/ECM/Antigen presentation/Quiescent).  
3) **PROGENy scoring**: infer pathway activity (ULM) and summarize by `subtype`.  
4) **Pathway enrichment (ORA)**: run ORA on subtype marker genes using **local GMT gene-set files** under `./data/gene_sets/` (offline-reproducible).



In [None]:
# Core analysis: HVGs, Harmony, Leiden, and marker identification
import scanpy as sc
import scanpy.external as sce
import pandas as pd

# Work on a copy whose X holds the log-normalised values.
fib = adata.copy()
fib.X = fib.layers["lognorm"]

# HVGs are selected from the raw counts (seurat_v3 flavour), per dataset.
sc.pp.highly_variable_genes(
    fib,
    flavor="seurat_v3",
    n_top_genes=N_HVG,
    layer="counts",
    batch_key=DATASET_KEY,
)
# NOTE: from here on `fib` only contains the N_HVG selected genes; its layers
# are subset with it.
fib = fib[:, fib.var["highly_variable"]].copy()

# After this call fib.X is z-scored and clipped; fib.layers["lognorm"] still
# holds the log-normalised values for the same genes.
sc.pp.scale(fib, max_value=10)
# SEED is passed explicitly so changing it in CONFIG actually affects the
# stochastic steps (previously they always ran with the library defaults).
sc.tl.pca(fib, svd_solver="arpack", random_state=SEED)

sce.pp.harmony_integrate(fib, key=BATCH_KEY, max_iter_harmony=50)
sc.pp.neighbors(fib, n_pcs=N_PCS, use_rep="X_pca_harmony", random_state=SEED)
sc.tl.umap(fib, random_state=SEED)

sc.tl.leiden(
    fib,
    resolution=LEIDEN_RES,
    key_added=CLUSTER_KEY,
    flavor="igraph",
    random_state=SEED,
)

# Differential expression must not run on the scaled X: log fold changes are
# only meaningful on log-normalised values. use_raw=False is required whenever
# a layer is given and the object carries a .raw attribute.
sc.tl.rank_genes_groups(
    fib,
    groupby=CLUSTER_KEY,
    method="wilcoxon",
    layer="lognorm",
    use_raw=False,
)

# rank_genes_groups_df drops the "group" column when a single group is
# requested, so the cluster id is added back explicitly; without it the rows of
# the exported table cannot be attributed to a cluster.
markers = pd.concat(
    [
        sc.get.rank_genes_groups_df(fib, group=str(g)).assign(group=str(g))
        for g in fib.obs[CLUSTER_KEY].cat.categories
    ],
    ignore_index=True,
)
markers.to_csv(os.path.join(TAB_DIR, f"markers_{CLUSTER_KEY}.csv"), index=False)

# Manual annotation: Leiden cluster id -> fibroblast program label.
# Several clusters may share a label; "doublets" marks clusters to discard.
merge_dict = {
    '0':  'apFibro_CD74',
    '1':  'Fibro_S100A11',
    '2':  'myFibro_TAGLN',
    '3':  'Pericyte',
    '4':  'myFibro_TAGLN',
    '5':  'Fibro_RPLRPS',
    '6':  'iFibro_CXCL12',
    '7':  'ecmFibro_ASPN',
    '8':  'cycFibro_STMN1',
    '9':  'Pericyte',
    '10': 'doublets',
    '11': 'ecmFibro_FAP',
    '12': 'doublets',
    '13': 'myFibro_MYH11',
    '14': 'doublets'
}

_cluster_ids = fib.obs[CLUSTER_KEY].astype(str)

# The mapping is tied to one particular clustering result. If the clustering
# yields ids that are not listed above, say so: such cells keep their numeric
# id as "subtype" (same as before), which is easy to miss in later plots.
_unmapped = sorted(set(_cluster_ids) - set(merge_dict))
if _unmapped:
    print(
        f"WARNING: Leiden clusters without an entry in merge_dict keep their "
        f"cluster id as subtype: {_unmapped}"
    )

fib.obs["subtype"] = _cluster_ids.map(lambda c: merge_dict.get(c, c)).astype("category")

# Drop the clusters flagged as doublets and forget the now-empty category.
_is_doublet = fib.obs["subtype"] == "doublets"
if _is_doublet.any():
    print(f"Removing {int(_is_doublet.sum())} cells annotated as doublets.")
    fib = fib[~_is_doublet].copy()
    fib.obs["subtype"] = fib.obs["subtype"].cat.remove_unused_categories()

# UMAP coloured by annotated subtype and by dataset of origin.
sc.pl.umap(fib, color=["subtype", DATASET_KEY], wspace=0.4, save="_fib_subtype_umap.pdf")

# Representative marker genes per annotated subtype (for the dot plot).
marker_genes_dict = {
    "apFibro_CD74": ["HLA-DRB1", "CD74", "CXCL12"],
    "iFibro_CXCL12": ["CCL19", "CCL21", "IL7", "CCL2"],
    "ecmFibro_FAP": ["CTHRC1", "STAT1", "FAP", "MMP14", "VCAM1"],
    "ecmFibro_ASPN": ["ASPN", "COL1A1", "COL3A1"],
    "myFibro_TAGLN": ["TAGLN", "ACTA2"],
    "myFibro_MYH11": ["MYH11", "GPR4", "PTP4A3"],
    "Pericyte": ["RGS5", "GJA4", "NDUFA4L2"],
    "cycFibro_STMN1": ["STMN1", "UBE2C", "BIRC5"],
    "Fibro_S100A11": ["S100A11", "TMSB10", "LGALS1"],
    "Fibro_RPLRPS": ["ADIRF", "RPLP2"],
}

# `fib` was restricted to the highly variable genes, so some markers may be
# absent; plotting a missing gene raises KeyError. Plot what is available and
# report the rest.
_present = set(fib.var_names)
_plot_markers = {}
_absent = []
for _label, _genes in marker_genes_dict.items():
    _kept = [g for g in _genes if g in _present]
    _absent.extend(g for g in _genes if g not in _present)
    if _kept:
        _plot_markers[_label] = _kept
if _absent:
    print(f"WARNING: marker genes not in fib.var_names, skipped in dot plot: {_absent}")

if _plot_markers:
    sc.tl.dendrogram(fib, groupby="subtype")
    # return_fig=True is required: with show=False alone, sc.pl.dotplot returns
    # a dict of axes, which has no savefig(). The lognorm layer is used because
    # fib.X is z-scored at this point, which makes both the mean expression and
    # the "fraction of cells expressing" (> 0) meaningless.
    dp = sc.pl.dotplot(
        fib,
        _plot_markers,
        groupby="subtype",
        dendrogram=True,
        standard_scale="var",
        layer="lognorm",
        use_raw=False,
        return_fig=True,
        show=False,
    )
    dp.savefig(os.path.join(FIG_DIR, "fib_subtype_marker_dotplot.pdf"))
else:
    print("WARNING: none of the marker genes are in fib.var_names; dot plot skipped.")

print("Annotated fibro object:", fib.shape)

In [None]:
# AUCell scoring for fibroblast gene programs
import inspect
import pandas as pd
import numpy as np
import scanpy as sc
import decoupler as dc

# Curated fibroblast gene programs (gene symbols), one list per program.
ImmuneReg = [
    "IL6", "IL11", "LIF", "CXCL12", "CXCL14",
    "CXCL1", "CXCL2", "CXCL8", "CCL2", "CCL7",
    "ICAM1", "PTGS2", "TNFAIP6", "SERPINE1", "HAS1",
    "HAS2", "PDGFRA", "IGF1", "OSM", "PRG4",
]
ECM = [
    "ACTA2", "TAGLN", "MYL9", "TPM2",
    "COL1A1", "COL1A2", "COL3A1", "COL5A1", "COL5A2",
    "COL6A1", "COL6A2", "COL6A3", "COL11A1", "COL12A1",
    "FN1", "POSTN", "SPARC", "THBS1", "THBS2", "LOX", "LOXL2",
    "PLOD1", "PLOD2", "PLOD3", "SERPINH1",
    "MMP2", "MMP11", "MMP14", "ITGA11", "DDR2", "PDGFRB",
    "FBLN1", "FBLN2", "TNC", "ASPN", "LUM", "DCN", "FAP",
]
Antigen_presentation = [
    "HLA-DRA", "HLA-DRB1", "HLA-DRB5",
    "HLA-DQA1", "HLA-DQB1", "HLA-DQA2", "HLA-DQB2",
    "HLA-DPA1", "HLA-DPB1", "HLA-DMA", "HLA-DMB", "HLA-DOA", "HLA-DOB",
    "CD74", "CIITA",
    "CTSS", "CTSL", "CTSB", "LGMN", "IFI30", "LAMP1", "LAMP2",
    "RFX5", "RFXAP", "RFXANK",
]
Quiescent = [
    "LRAT", "RBP1", "RELN", "LHX2", "NGFR", "PPARG", "GFAP", "SYNM", "SYP",
    "ALDH1A1", "ALDH1A2", "RDH10", "RARB",
]

# Program name -> gene list; insertion order is the order used in the plots.
SIGNATURES = dict(
    ImmuneReg=ImmuneReg,
    ECM=ECM,
    Antigen_presentation=Antigen_presentation,
    Quiescent=Quiescent,
)

# Long-format network: one (program, gene, weight) row per membership, every
# gene weighted 1.0.
rows = [
    (program, gene, 1.0)
    for program in SIGNATURES
    for gene in SIGNATURES[program]
]
net = pd.DataFrame.from_records(rows, columns=["source", "target", "weight"])

def run_aucell_robust(adata_in, net, layer=AUCELL_LAYER, n_top=None):
    """Run decoupler's AUCell on ``adata_in`` and return the scores as AnnData.

    The call is assembled from the keywords that ``dc.mt.aucell`` exposes in
    the installed version (checked with ``inspect.signature``). The scores are
    then looked up under several possible ``obsm`` keys, with a fallback that
    scans ``obsm`` for a DataFrame whose columns match the signature names.

    Parameters
    ----------
    adata_in
        AnnData to score.
    net
        DataFrame with a ``source`` column holding the signature names.
    layer
        Layer name forwarded as ``layer`` when supported and not ``None``.
    n_top
        Number of top-ranked genes for the AUC. Forwarded under the first
        matching keyword name; a warning is printed if none matches.

    Returns
    -------
    AnnData-like object with one variable per signature. ``X_umap`` is carried
    over from ``adata_in`` when present.

    Raises
    ------
    RuntimeError
        If no AUCell output can be located.
    """
    fn = dc.mt.aucell
    sig = inspect.signature(fn).parameters
    kwargs = {}
    if "data" in sig:
        kwargs["data"] = adata_in
    if "net" in sig:
        kwargs["net"] = net
    if "layer" in sig and layer is not None:
        kwargs["layer"] = layer
    if n_top is not None:
        # "n_up" is the keyword name in decoupler's AUCell documentation; it
        # was missing from the probed names, so n_top could be dropped without
        # any notice. The remaining names are kept for other versions.
        rank_kw = next(
            (k for k in ("n_up", "max_rank", "auc_max_rank", "maxRank", "n_top") if k in sig),
            None,
        )
        if rank_kw is not None:
            kwargs[rank_kw] = n_top
        else:
            print(
                f"WARNING: dc.mt.aucell exposes no known rank-cutoff keyword; "
                f"n_top={n_top} is ignored and the library default is used."
            )
    res = fn(**kwargs)

    # Probe the obsm keys under which the scores may have been stored.
    score = None
    for key_try in ("score_aucell", "aucell", "aucell_acts", "acts_aucell", "estimate_aucell"):
        try:
            score = dc.pp.get_obsm(adata=adata_in, key=key_try)
            break
        except Exception:
            # Deliberate best-effort probing: a failing key just means
            # "try the next one".
            pass

    if score is None:
        # Fallback 1: any obsm DataFrame sharing at least half (minimum two)
        # of its column names with the signature names.
        sig_names = set(net["source"].astype(str).unique())
        candidate = None
        for k in adata_in.obsm_keys():
            val = adata_in.obsm[k]
            if isinstance(val, pd.DataFrame):
                cols = set(map(str, val.columns))
                if len(sig_names & cols) >= max(2, int(0.5 * len(sig_names))):
                    candidate = val
                    break
        if candidate is None:
            # Fallback 2: the call itself returned the score table.
            if isinstance(res, pd.DataFrame):
                candidate = res
            else:
                raise RuntimeError(f"AUCell output not found. Available obsm keys: {list(adata_in.obsm_keys())}")
        score = sc.AnnData(X=candidate.values, obs=adata_in.obs.copy(), var=pd.DataFrame(index=list(map(str, candidate.columns))))

    # Keep the embedding so the scores can be drawn on the UMAP.
    if "X_umap" in adata_in.obsm and "X_umap" not in score.obsm:
        score.obsm["X_umap"] = adata_in.obsm["X_umap"]
    return score

# Rank cutoff for the AUC: 5% of the genes in `fib`, but at least 10.
n_top = max(10, int(0.05 * fib.n_vars))
au = run_aucell_robust(fib, net, layer=AUCELL_LAYER, n_top=n_top)

# Per-cell score table.
au_df = pd.DataFrame(au.X, index=au.obs_names, columns=au.var_names)
au_df.to_csv(os.path.join(TAB_DIR, "aucell_scores_per_cell.csv"))

# A signature can be missing from the result (for instance when too few of its
# genes are present in `fib`). Asking the plots for a missing variable raises
# KeyError, so only the signatures that were actually scored are plotted.
_scored = [name for name in SIGNATURES if name in au.var_names]
_not_scored = [name for name in SIGNATURES if name not in au.var_names]
if _not_scored:
    print(f"WARNING: signatures without AUCell scores, skipped in plots: {_not_scored}")
if not _scored:
    raise RuntimeError(
        f"None of the signatures {list(SIGNATURES)} were scored by AUCell; "
        f"available score columns: {list(au.var_names)}"
    )

sc.tl.dendrogram(au, groupby="subtype")
mp = sc.pl.matrixplot(
    au,
    var_names=_scored,
    groupby="subtype",
    dendrogram=True,
    standard_scale="var",
    return_fig=True,
    show=False,
)
mp.savefig(os.path.join(FIG_DIR, "aucell_matrixplot.pdf"))

sc.pl.violin(
    au,
    keys=_scored,
    groupby="subtype",
    rotation=90,
    stripplot=False,
    save="_aucell_violin.pdf",
)
if "X_umap" in au.obsm:
    sc.pl.umap(au, color=_scored, ncols=2, save="_aucell_umap.pdf")

print("AUCell score object:", au.shape)

In [None]:
# PROGENy pathway activity inference
import pandas as pd
import scanpy as sc
import decoupler as dc

# PROGENy pathway model (human) used as the prior network.
progeny = dc.op.progeny(organism="human")

# fib.X was z-scored and clipped by sc.pp.scale in the clustering cell. Score
# on the log-normalised layer instead, the same input the AUCell step uses.
dc.mt.ulm(data=fib, net=progeny, layer="lognorm")

# Pull the ULM scores out of fib.obsm as a separate object (cells x pathways).
pro = dc.pp.get_obsm(adata=fib, key="score_ulm")

pro_df = pd.DataFrame(pro.X, index=pro.obs_names, columns=pro.var_names)
pro_df.to_csv(os.path.join(TAB_DIR, "progeny_scores_per_cell.csv"))

# Per-subtype summary of every pathway, each column scaled to [0, 1].
sc.tl.dendrogram(pro, groupby="subtype")
mp = sc.pl.matrixplot(
    pro,
    var_names=pro.var_names,
    groupby="subtype",
    dendrogram=True,
    standard_scale="var",
    return_fig=True,
    show=False,
)
mp.savefig(os.path.join(FIG_DIR, "progeny_matrixplot.pdf"))

if "Trail" in pro.var_names:
    sc.pl.violin(pro, keys=["Trail"], groupby="subtype", rotation=90, save="_progeny_trail_violin.pdf")
if "X_umap" in pro.obsm:
    # Only the first six pathways are drawn on the embedding.
    sc.pl.umap(pro, color=list(pro.var_names)[:6], ncols=3, save="_progeny_umap.pdf")

print("PROGENy score object:", pro.shape)

In [None]:
# Pathway enrichment (ORA) using local GMT files
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import gseapy as gp
import inspect

# Make sure fib.uns["rank_genes_groups"] holds a per-subtype test computed on
# log-normalised values. fib.X is z-scored at this point, and log fold changes
# derived from scaled values are not meaningful, so the lognorm layer is used
# and the cached result is reused only if it was computed the same way.
_rgg_params = fib.uns["rank_genes_groups"]["params"] if "rank_genes_groups" in fib.uns else {}
if _rgg_params.get("groupby") != "subtype" or _rgg_params.get("layer") != "lognorm":
    sc.tl.rank_genes_groups(
        fib,
        groupby="subtype",
        method="wilcoxon",
        layer="lognorm",
        use_raw=False,  # required whenever a layer is given and .raw exists
    )

def top_markers_for_group(adata_in, group, n=200, padj_max=0.05, logfc_min=0.25):
    """Return up to ``n`` marker gene names for ``group``.

    Reads the stored ``rank_genes_groups`` result of ``adata_in``, keeps genes
    with adjusted p-value <= ``padj_max`` and log fold change >= ``logfc_min``
    (each filter is applied only when its column exists) and returns the gene
    names ordered by decreasing log fold change.

    If the result has no ``logfoldchanges`` column, the genes are ordered by
    decreasing ``scores`` when that column exists, otherwise left in the
    stored order. Previously the sort always used ``logfoldchanges`` and raised
    KeyError in exactly the case the filter guard was written for.

    Returns
    -------
    list of str
        Possibly empty.
    """
    df = sc.get.rank_genes_groups_df(adata_in, group=group)
    df = df.dropna(subset=["names"])
    if "pvals_adj" in df.columns:
        df = df[df["pvals_adj"] <= padj_max]
    if "logfoldchanges" in df.columns:
        df = df[df["logfoldchanges"] >= logfc_min]
        df = df.sort_values("logfoldchanges", ascending=False)
    elif "scores" in df.columns:
        df = df.sort_values("scores", ascending=False)
    return df["names"].head(n).astype(str).tolist()

# Gene-set collections used for ORA: display name -> local GMT file.
gmt_files = {"Hallmark": HALLMARK_GMT, "KEGG": KEGG_GMT, "GO_BP": GOBP_GMT}

# Stop before any enrichment is attempted if a GMT file is not on disk.
for collection, gmt_path in gmt_files.items():
    if os.path.exists(gmt_path):
        continue
    raise FileNotFoundError(
        f"Missing gene-set GMT file for {collection}: {gmt_path}. "
        f"Provide GMT files under {GENESET_DIR} (authors provide for offline reproducibility)."
    )

# Pick the enrichment entry point offered by the installed gseapy, preferring
# `enrich` over `enrichr`.
for _entry_point in ("enrich", "enrichr"):
    if hasattr(gp, _entry_point):
        enrich_fn = getattr(gp, _entry_point)
        break
else:
    raise AttributeError("gseapy does not expose enrich/enrichr in this environment.")

def run_ora(gene_list, gmt_path, background=None):
    """Run over-representation analysis for ``gene_list`` against one GMT file.

    Only keywords that ``enrich_fn`` exposes are passed, so the same code works
    with either gseapy entry point.

    Parameters
    ----------
    gene_list
        Gene symbols to test.
    gmt_path
        Path of the local GMT file.
    background
        Optional gene universe. Forwarded as ``background`` when the entry
        point supports it; ``None`` keeps the library default.

    Returns
    -------
    pandas.DataFrame
        Copy of the enrichment table.

    Raises
    ------
    RuntimeError
        If the call returns neither an object with ``.results`` nor a DataFrame.
    """
    sig = inspect.signature(enrich_fn).parameters
    kwargs = {}
    if "gene_list" in sig:
        kwargs["gene_list"] = gene_list
    if "gene_sets" in sig:
        kwargs["gene_sets"] = gmt_path
    if background is not None and "background" in sig:
        kwargs["background"] = background
    if "organism" in sig:
        kwargs["organism"] = "Human"
    if "outdir" in sig:
        kwargs["outdir"] = None
    if "no_plot" in sig:
        kwargs["no_plot"] = True
    if "verbose" in sig:
        kwargs["verbose"] = False
    res = enrich_fn(**kwargs)
    if hasattr(res, "results"):
        return res.results.copy()
    if isinstance(res, pd.DataFrame):
        return res.copy()
    raise RuntimeError("Unexpected gseapy result type.")

subtypes = list(fib.obs["subtype"].cat.categories)
all_res = []

# Marker genes can only come from the genes present in `fib`, so those genes
# are the universe against which over-representation is tested. Without an
# explicit background the test is run against a universe the markers were never
# drawn from.
gene_universe = [str(g) for g in fib.var_names]

for st in subtypes:
    genes = top_markers_for_group(fib, st, n=200)
    if not genes:
        # Nothing passed the marker filters; there is nothing to test.
        print(f"INFO: no marker genes for subtype '{st}'; ORA skipped.")
        continue
    for gs_name, gmt in gmt_files.items():
        r = run_ora(genes, gmt, background=gene_universe)
        r["subtype"] = st
        r["gene_set"] = gs_name
        all_res.append(r)

if not all_res:
    # pd.concat([]) would fail with an unrelated-looking message.
    raise RuntimeError("ORA produced no results: no subtype had marker genes passing the filters.")

enr_df = pd.concat(all_res, ignore_index=True)
enr_df.to_csv(os.path.join(TAB_DIR, "ora_enrichment_all.csv"), index=False)

# Hallmark summary figure: the six strongest terms per subtype on a
# subtype x term grid, dot size growing with -log10 of the significance value.
hall = enr_df.loc[enr_df["gene_set"] == "Hallmark"].copy()

# Column names differ between result formats: adjusted p-value names are tried
# first, raw p-value names afterwards.
_sig_candidates = (
    "Adjusted P-value",
    "FDR q-value",
    "FDR",
    "Adjusted P-value (Benjamini-Hochberg)",
    "P-value",
    "P-value (Fisher exact test)",
)
sig_col = next((name for name in _sig_candidates if name in hall.columns), None)
if sig_col is None:
    raise KeyError(f"Cannot find a p-value/FDR column in enrichment results. Columns: {list(hall.columns)}")

term_col = next((name for name in ("Term", "Gene_set") if name in hall.columns), None)
if term_col is None:
    raise KeyError(f"Cannot find term column in enrichment results. Columns: {list(hall.columns)}")

# Values are floored at 1e-300 before the log so exact zeros stay finite.
hall[sig_col] = pd.to_numeric(hall[sig_col], errors="coerce")
hall["neglog10"] = -np.log10(hall[sig_col].clip(lower=1e-300))
top = (
    hall.sort_values(["subtype", "neglog10"], ascending=[True, False])
    .groupby("subtype")
    .head(6)
)

# Figure height grows with the number of distinct terms (minimum 3).
n_terms = top[term_col].nunique()
plt.figure(figsize=(10, max(3, 0.4 * n_terms)))

# Integer grid positions: subtypes on x, terms on y in order of first appearance.
x_map = dict(zip(subtypes, range(len(subtypes))))
y_terms = list(dict.fromkeys(top[term_col].tolist()))
y_map = dict(zip(y_terms, range(len(y_terms))))

x_positions = [x_map[name] for name in top["subtype"]]
y_positions = [y_map[term] for term in top[term_col]]
plt.scatter(x_positions, y_positions, s=top["neglog10"]*12 + 10)

plt.xticks(range(len(subtypes)), subtypes, rotation=45, ha="right")
plt.yticks(range(len(y_terms)), y_terms)
plt.xlabel("subtype")
plt.ylabel("Hallmark terms (top6 per subtype)")
plt.tight_layout()
plt.savefig(os.path.join(FIG_DIR, "ora_hallmark_dotplot.pdf"))
plt.close()

print("ORA enrichment exported:", os.path.join(TAB_DIR, "ora_enrichment_all.csv"))

In [None]:
# Save annotated object
# The output path is built once and reused for the write and the log line.
_fib_h5ad_path = os.path.join(H5AD_DIR, "1_Fibro_count_new.h5ad")
fib.write(_fib_h5ad_path)
print("Saved:", _fib_h5ad_path)

# Gene programs for AUCell scoring of myeloid cells, plus ECM programs.
# NOTE: this section only defines gene lists; SIGNATURES is rebound to a new
# dict here, replacing any dict of the same name defined earlier.
MAC_SYSTEMIC_INFLAMMATORY = [
    "IL1B","IL1A","CCL2","TNF","OSM","CXCL8",
    "IL6","PTGS2","NFKB1","NFKBIA","STAT3","SOCS3","TNFAIP3",
    "CXCL1","CXCL2","CXCL3","CXCL5","CXCL9","CXCL10",
    "CCL3","CCL4","ICAM1","ICAM2","CCR2"
]

MAC_MICROGLIAL_INFLAMMATORY = [
    "CXCR4","CXCL12","CX3CR1","CCL3","CCL4",
    "RHOB","JUN","JUNB","FOS","EGR1","EGR2","KLF2","KLF6","BTG2","ZFP36","DDIT4",
    "PDK4","P2RY13","NR4A1","AIF1","P2RY12","S100A6"
]

MAC_COMPLEMENT_IMMUNOSUPPRESSIVE = [
    "C1QA","C1QB","C1QC","C3","C2","C1R","C1S","C4A","C4B",
    "VSIG4","CD163","C3AR1","C5AR1","ITGAM","ITGB2","CFB","CFH","LILRB1","LILRB2"
]

# NOTE(review): "SEPP1" may be listed as "SELENOP" depending on the gene
# annotation of the dataset -- confirm against adata.var_names.
MAC_SCAVENGER_IMMUNOSUPPRESSIVE = [
    "MRC1","MSR1","CD163","LYVE1","COLEC12","STAB1",
    "NRP1","RNASE1","CTSB","MERTK","MARCO","CD36","CLEC10A","DAB2","MRC2",
    "CSF1R","SEPP1","F13A1","TGFBI","SPARC"
]

SIGNATURES = {
    "MAC_SystemicInflammatory": MAC_SYSTEMIC_INFLAMMATORY,
    "MAC_MicroglialInflammatory": MAC_MICROGLIAL_INFLAMMATORY,
    "MAC_ComplementImmunosuppressive": MAC_COMPLEMENT_IMMUNOSUPPRESSIVE,
    "MAC_ScavengerImmunosuppressive": MAC_SCAVENGER_IMMUNOSUPPRESSIVE,
}

MAC_M1_LIKE = [
    "IL1B","TNF","CXCL9","CXCL10","CXCL11",
    "STAT1","IRF1","GBP1","ISG15","IFI6","IFIT1","IFIT2","IFIT3",
    "NFKB1","NFKBIA","SOCS3","TNFAIP3","ICAM1"
]

MAC_M2_LIKE = [
    "MRC1","CD163","MSR1","STAB1","MERTK","DAB2","VSIG4",
    "LYVE1","RNASE1","MARCO","CD36","SEPP1","TGFBI","IL10","CCL18","CCL22"
]

SIGNATURES.update({
    "MAC_M1_like": MAC_M1_LIKE,
    "MAC_M2_like": MAC_M2_LIKE
})


# ---------- ECM signatures ----------
ECM_CORE = [
    "COL1A1","COL1A2","COL3A1","COL5A1","COL5A2","COL6A1","COL6A2","COL6A3","COL12A1","COL14A1",
    "FN1","LAMA2","LAMB1","LAMB2","LAMC1",
    "TNC","SPARC","POSTN","VCAN","BGN","DCN","LUM","THBS1","THBS2","FBLN1","FBLN2","FBN1","ELN"
]

# The closing bracket of this list was missing, which made everything from
# here on a SyntaxError.
# NOTE(review): "CTGF" may be listed as "CCN2" depending on the gene
# annotation of the dataset -- confirm against adata.var_names.
ECM_REMODEL = [
    "MMP1","MMP2","MMP3","MMP7","MMP9","MMP11","MMP14",
    "TIMP1","TIMP2","TIMP3",
    "LOX","LOXL1","LOXL2",
    "PLOD1","PLOD2","P4HA1","P4HB","SERPINH1","CTGF",
    "ITGA11","ITGA5","ITGB1",
]

# ECM_All is the sorted, de-duplicated union of the two ECM lists.
SIGNATURES.update({
    "ECM_Core": ECM_CORE,
    "ECM_Remodeling": ECM_REMODEL,
    "ECM_All": sorted(set(ECM_CORE + ECM_REMODEL)),
})


# Gene programs for AUCell scoring of hepatocytes / HCC tumour cells.
# NOTE: SIGNATURES is rebound to a new dict at the end of this section,
# replacing any dict of the same name defined earlier.
#
# Four programs (EMT, Invasion, Proliferation, Differentiation) used to be
# assigned twice: a long list immediately overwritten by a short one, so the
# long list was dead code. The short lists remain the values in effect; the
# long lists are kept under *_FULL names so the shadowing is explicit.
# NOTE(review): confirm which version of each program is the intended one.
HCC_ANGIOGENESIS_SIGNATURE = [
    "VEGFA", "VEGFC", "KDR", "ANGPT2", "HIF1A", "MMP9", "PDGFB",
    "PGF", "FLT4", "ANGPT1", "TEK", "PDGFA", "NRP2", "DLL4", "EFNB2", "ESM1"
]

HCC_DNA_REPAIR_SIGNATURE = [
    "BRCA1", "BRCA2", "ATM", "ATR", "PARP1", "RAD51", "XRCC1",
    "FANCD2", "MSH2", "MLH1", "CHEK1", "CHEK2", "MRE11", "RAD50", "NBN"
]

HCC_EMT_SIGNATURE_FULL = [
    "VIM","FN1","SPARC","THBS1","SPP1","CD44",
    "ITGA5","ITGAV","ITGB1","ITGB5","ITGA6","PXN","TLN1","ZYX",
    "LAMC2","LAMB3","COL17A1","TNC","TGFBI",
    "MMP2","MMP14","LOX","PLOD2","P4HA1","SERPINH1",
    "RHOC","RHOA",
    "S100A10","S100A4","ANXA2","CEMIP",
    "COL1A1","COL1A2","COL3A1"
]
# NOTE(review): 'SOX2' is not a member of HCC_EMT_SIGNATURE_FULL -- confirm
# that it belongs in the EMT program.
HCC_EMT_SIGNATURE = ['THBS1', 'ITGA5', 'ZYX', 'CD44', 'SERPINH1', 'SOX2', 'ANXA2', 'RHOC', 'LOX', 'LAMB3']

HCC_INFLAMMATION_SIGNATURE = [
    "IL6", "IL1B", "TNF", "NFKB1", "STAT3", "CXCL10", "TLR4", "NLRP3"
]

HCC_INVASION_SIGNATURE_FULL = [
    "AXL","CXCL12","CXCR4","ITGA6","JAG1","LOX",
    "MMP14","MMP9","NOTCH1","PLAU","PLAUR",
    "VEGFA","VEGFC","MET",
    "ADAM10","ADAM17","ADAM9","ITGAV","LAMC2","TIMP2",
    "COL17A1","CDHR1","EPCAM","NFE2L2","SOX2","TP63","RUNX1","TP73"
]
HCC_INVASION_SIGNATURE = ['CXCL12', 'ITGAV', 'JAG1', 'COL17A1', 'VEGFC', 'PLAUR', 'EPCAM', 'AXL', 'MET', 'ITGA6']

HCC_PROLIFERATION_SIGNATURE_FULL = [
    "MKI67", "TOP2A", "PCNA", "CDK1", "CCNB1", "CCNB2", "CDC20",
    "AURKB", "TYMS", "UBE2C", "RRM2", "AURKA", "PLK1", "BIRC5", "MCM2", "MCM6"
]
HCC_PROLIFERATION_SIGNATURE = ['TOP2A', 'RRM2', 'CDC20', 'BIRC5', 'CCNB1', 'AURKB', 'PLK1', 'UBE2C', 'AURKA', 'CDK1', 'CCNB2']

HCC_CSC_GENES = [
    "TACSTD2","EPCAM","PROM1","KRT19","KRT7","KRT8","KRT18","SOX9","AFP",
    "ABCG2","ALDH1A1","ALDH1A3","POU5F1","SOX2","NANOG","KRT23","SPP1","MUC1","CLDN4","CLDN6",
    "CTNNB1","AXIN2","LGR5","FZD7","FZD2","DKK1","RSPO2","RSPO3","TCF7L1","WNT3A","WNT7B",
    "NOTCH1","JAG1","DLL1","DLL4","HES1","HES5",
    "GLI1","PTCH1","SMO","SHH","IHH",
    "TGFB1","TGFBR1","TGFBR2","SMAD2","SMAD3","SMAD7",
    "ITGA6","ITGB4",
    "MYC","BMI1","EZH2","KLF4","KLF5","HNF1B","YAP1","WWTR1","TEAD4","SOX4","SOX11","DLK1"
]

HEPATOCYTE_DIFF_GENES_FULL = [
    "ALB","TTR","TF","HP","APOA1","APOA2","APOB","APOC1","APOC2","APOC3","APOH",
    "SERPINA1","SERPINA3","SERPINC1","SERPIND1","SERPINF2",
    "FGA","FGB","FGG","F2","F7","F9","F10","F11","F12",
    "CP","HPX","APCS","PLG","AFM","AHSG",
    "PCK1","FBP1","G6PC","GLS2","GLUL","CPS1","ASS1","ASL","ARG1","BHMT","TAT","TDO2","ALDH2",
    "CYP3A4","CYP2E1","CYP2C9","CYP2C8","CYP2D6","CYP1A2","CYP2B6","CYP7A1","CYP8B1","FMO3","SULT2A1",
    "UGT1A1","UGT2B7","ABCB11","ABCC2","SLC10A1","SLC22A1","SLC22A7","SLC2A2","SLC27A5","MTTP",
    "ACOX1","CPT1A","PC",
    "HNF4A","HNF1A","ONECUT1","ONECUT2","KLF15","CFH","C3","C4A","C4B","C9","CFHR1","CPB2"
]
HEPATOCYTE_DIFF_GENES = ['SERPINA3', 'AHSG', 'CP', 'GLS2', 'ALDH2', 'CFH', 'PC', 'CYP2D6', 'SERPINF2', 'CYP1A2', 'CFHR1', 'C4A']


# Program name -> gene list handed to the scoring step.
SIGNATURES = {
    "Angiogenesis": HCC_ANGIOGENESIS_SIGNATURE,
    "DNA_Repair":   HCC_DNA_REPAIR_SIGNATURE,
    "EMT":          HCC_EMT_SIGNATURE,
    "Inflammation": HCC_INFLAMMATION_SIGNATURE,
    "Invasion":     HCC_INVASION_SIGNATURE,
    "Proliferation": HCC_PROLIFERATION_SIGNATURE,
    "CSC": HCC_CSC_GENES,
    "Differentiation": HEPATOCYTE_DIFF_GENES,
}
