# Annotate cell types

In [None]:
# Root folder of the VisiumHD-CRC data; every sample below is read from and
# written to its own sub-directory of this path.
# NOTE(review): `pl` is presumably pathlib imported under an alias outside this view.
base_dir = pl.Path("../../../Broad_SpatialFoundation/VisiumHD-CRC/")

## P1CRC

In [None]:
# Load the P1CRC sample.
adata = sc.read_h5ad(base_dir / 'P1CRC' / 'adata.h5ad')

In [None]:
# Keep a copy of X in the 'counts' layer, because X is overwritten below.
adata.layers['counts'] = adata.X.copy()
# Scale every observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA on the log-normalized matrix (scanpy defaults).
sc.tl.pca(adata)

In [None]:
# Neighborhood graph (scanpy defaults), then the UMAP embedding computed from it.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Leiden clustering with default settings; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes per Leiden cluster with the Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Differential-expression table of every Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the gene names in the first 50 rows of each cluster's ranking table.
for cluster, ranking in de_genes.items():
    print(cluster)
    # to_numpy() returns the same 1-D array as Series.ravel(), which is
    # deprecated since pandas 2.2.
    print(ranking['names'].head(50).to_numpy())

---

| **Cluster** | **Likely Cell Type**                              | **Key Markers / Rationale**                | **Malignant?**                  |
| ----------- | ------------------------------------------------- | ------------------------------------------ | ------------------------------- |
| **0**       | Tumor epithelium                                  | OLFM4, CEACAM5/6, EPCAM, KRT8, MYC         | ✅ Yes                           |
| **1**       | Tumor epithelium                                  | EPCAM, CEACAM5/6, OLFM4, KRT8, S100A6      | ✅ Yes                           |
| **2**       | Cancer-associated fibroblasts (CAFs)              | COL1/3/6, SPARC, FN1, MMP2/11, THBS2       | ❌ No                            |
| **3**       | T cells                                           | TRBC1/2, TRAC, IL7R, CCL19/21, IKZF1       | ❌ No                            |
| **4**       | Goblet-like epithelium                            | MUC2, MUC12, PIGR, TFF3, AGR2, CEACAM5/6   | ⚠ Possibly (depends on context) |
| **5**       | Myofibroblasts / endothelial/pericytes            | ACTA2, TAGLN, VIM, PECAM1, COL4A1/2        | ❌ No                            |
| **6**       | B cells / plasma cells                            | IGKC, IGHA1, IGHM, CD79A, MS4A1            | ❌ No                            |
| **7**       | Tumor cells (mitochondrial + germline antigen)    | MT-genes, MAGE, PRAME, CTAG2               | ✅ Yes                           |
| **8**       | Plasma cells / inflammatory cells                 | IGHA1, IGHM, IGKC, JCHAIN, CXCL8, S100A8/9 | ❌ No                            |
| **9**       | TAMs (tumor-associated macrophages) + fibroblasts | CD68, APOE, C1QA/B/C, COL1/3/6, LYZ        | ❌ No                            |
| **10**      | Activated T / NK-like cells                       | CXCL8, CCL5, IL7R, IKZF1, TRBC2            | ❌ No                            |
| **11**      | Plasma cells (activated)                          | IGHA1, IGHM, JCHAIN, MZB1, CD79A           | ❌ No                            |
| **12**      | B cells / plasma cells                            | IGKC, IGHM, CD19, CD79B, PAX5              | ❌ No                            |
| **13**      | Absorptive colon epithelium                       | CEACAM5/6/7, EPCAM, KRT20, FABP1           | ⚠ Possibly malignant            |
| **14**      | B cells                                           | IGKC, IGHA1, CD79A, PAX5, MS4A1            | ❌ No                            |
| **15**      | Smooth muscle / pericytes                         | ACTA2, TAGLN, MYH11, MYL9, DES             | ❌ No                            |
| **16**      | Secretory / Paneth-like epithelium                | DEFA5/6, REG1A/B, REG3A, OLFM4             | ✅ Yes                           |
| **17**      | Neuroendocrine-like tumor cells                   | CALB2, NEUROD6, GFAP, POU3F3               | ✅ Yes                           |
| **18**      | Tumor cells with cancer–testis antigen expression | MAGE, PRAME, XAGE, CTAG2                   | ✅ Yes                           |

---

✅ **In short:**

* **Malignant epithelial/tumor clusters:** 0, 1, 7, 16, 17, 18 (cluster 13 is only *possibly* malignant and is listed under borderline below)
* **Non-malignant immune clusters:** 3, 6, 8, 10, 11, 12, 14
* **Non-malignant stromal clusters:** 2, 5, 9, 15
* **Borderline (epithelium that could be normal vs tumor):** 4, 13

---


In [None]:
# Leiden cluster id -> fine-grained label for P1CRC (see the marker table above).
subtype_by_cluster = {
    '0': 'Malignant epithelial',
    '1': 'Malignant epithelial',
    '2': 'CAF',
    '3': 'T',
    '4': 'Goblet cells',
    '5': 'Myofibroblast',
    '6': 'B',
    '7': 'Malignant MT + germline',
    '8': 'Plasma',
    '9': 'TAM',
    '10': 'Activated T/NK',
    '11': 'Activated plasma',
    '12': 'B',
    '13': 'Absorptive colon epithelium',
    '14': 'B',
    '15': 'Smooth muscle/pericytes',
    '16': 'Malignant secretory/paneth',
    '17': 'Malignant neuroendocrine-like',
    '18': 'Malignant CTA',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['cellsubtypes'] = (
    leiden_labels.map(subtype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# Leiden cluster id -> coarse compartment for P1CRC. These labels are later
# used as reference_key by InferCNV.
celltype_by_cluster = {
    '0': 'Malignant',
    '1': 'Malignant',
    '2': 'Stromal',
    '3': 'Lymphoid',
    '4': 'Epithelial',
    '5': 'Stromal',
    '6': 'Lymphoid',
    '7': 'Malignant',
    '8': 'Lymphoid',
    '9': 'Myeloid',
    '10': 'Lymphoid',
    '11': 'Lymphoid',
    '12': 'Lymphoid',
    '13': 'Epithelial',
    '14': 'Lymphoid',
    '15': 'Stromal',
    '16': 'Malignant',
    '17': 'Malignant',
    '18': 'Malignant',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['celltypes'] = (
    leiden_labels.map(celltype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# UMAP panels for the coarse and the fine annotation; wspace adds room between the panels.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation levels together with object_id to the sample folder.
adata.obs[['celltypes','cellsubtypes','object_id']].to_csv(base_dir / 'P1CRC' / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute column (last field) of a GTF line into a dict.

    Parameters
    ----------
    attr_str : str
        Semicolon-separated ``key "value"`` pairs, e.g.
        ``gene_id "ENSG..."; gene_name "TP53"; level 2;``.

    Returns
    -------
    dict
        Maps every attribute key to its value, with surrounding whitespace
        and double quotes removed. When a key occurs more than once the last
        occurrence wins; a key without a value maps to ``""``.
    """
    attributes = {}
    for field in attr_str.split(";"):
        field = field.strip()
        if not field:
            continue
        # partition() never raises, so a value-less attribute no longer aborts
        # the whole parse with "not enough values to unpack".
        key, _, value = field.partition(" ")
        # Strip padding before the quotes so that `key  "value"` (repeated
        # separator) does not keep a stray leading quote.
        attributes[key] = value.strip().strip('"')
    return attributes

# Collect one coordinate record per "gene" row of the GENCODE v48 basic annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        # Header lines start with '#'.
        if raw_line.startswith("#"):
            continue
        # Exactly nine tab-separated columns are expected; anything else
        # fails loudly in the unpacking.
        (seqname, _source, feature_type, start_pos, end_pos,
         _score, strand_sign, _frame, attribute_text) = raw_line.strip().split("\t")
        if feature_type == "gene":
            gene_attrs = parse_gtf_attributes(attribute_text)
            records.append({
                "gene_id": gene_attrs.get("gene_id"),
                "gene_name": gene_attrs.get("gene_name"),
                # stored without the "chr" prefix
                "chromosome": seqname.replace("chr", ""),
                "start": int(start_pos),
                "end": int(end_pos),
                "strand": strand_sign,
            })

# Index by gene symbol, keep only genes present in the current adata, and keep
# the first row of any symbol that occurs more than once.
gene_table = pd.DataFrame(records).set_index('gene_name')
shared_genes = gene_table.index.intersection(adata.var_names)
gene_table = gene_table.loc[shared_genes]
gtf_df = gene_table.loc[~gene_table.index.duplicated()]

In [None]:
# Attach the gene coordinates (gene_id, chromosome, start, end, strand) to adata.var.
# reindex() aligns the annotation to adata.var_names, so adata.var keeps its own
# rows and order and genes without an annotation get NaN. A column-wise concat
# would add rows for genes that exist only in gtf_df, and would duplicate the
# columns when this cell is re-run or when the saved file already carries them.
gene_coords = gtf_df.reindex(adata.var_names)
for column in gene_coords.columns:
    adata.var[column] = gene_coords[column].to_numpy()

# Put back the "chr" prefix that was removed while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Display the coarse labels stored in adata.obs['celltypes'], which InferCNV
# reads below as reference_key. They are shown from the stored column rather
# than re-derived from a second hard-coded copy of the cluster mapping.
adata.obs['celltypes']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- InferCNV on the sample currently held in `adata` ----
# Uses the chromosome/start/end columns attached to adata.var above.

# Populations used as the CNV baseline; the names must match the labels in
# adata.obs['celltypes'].
reference_groups = ["Lymphoid", "Stromal", "Myeloid"]

cnv.tl.infercnv(
    adata,
    reference_cat=reference_groups,
    reference_key="celltypes",
    n_jobs=1,
    window_size=250,
)


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y), grouped by fine-grained subtype
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# Put the stored counts back into X and drop the log1p bookkeeping, which no
# longer describes X. pop() with a default keeps the cell re-runnable; `del`
# raised KeyError on a second run.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
# NOTE: this overwrites the same file the sample was loaded from.
adata.write_h5ad(base_dir / 'P1CRC' / 'adata.h5ad')

## P2CRC

In [None]:
# Load the P2CRC sample (replaces the previous sample held in `adata`).
adata = sc.read_h5ad(base_dir / 'P2CRC' / 'adata.h5ad')

In [None]:
# Keep a copy of X in the 'counts' layer, because X is overwritten below.
adata.layers['counts'] = adata.X.copy()
# Scale every observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA on the log-normalized matrix (scanpy defaults).
sc.tl.pca(adata)

In [None]:
# Neighborhood graph (scanpy defaults), then the UMAP embedding computed from it.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Leiden clustering with default settings; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes per Leiden cluster with the Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Differential-expression table of every Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the gene names in the first 50 rows of each cluster's ranking table.
for cluster, ranking in de_genes.items():
    print(cluster)
    # to_numpy() returns the same 1-D array as Series.ravel(), which is
    # deprecated since pandas 2.2.
    print(ranking['names'].head(50).to_numpy())

---

| **Cluster** | **Likely Cell Type**                                         | **Key Markers / Rationale**                                     | **Malignant?** |
| ----------- | ------------------------------------------------------------ | --------------------------------------------------------------- | -------------- |
| **0**       | Stem/crypt-like tumor epithelium                             | CEACAM5/6, EPCAM, LGR5, NOTUM, NKD1, WNT6, MYC                  | ✅ Yes          |
| **1**       | Plasma cells                                                 | IGKC, JCHAIN, IGHG/IGHA, XBP1, MZB1, PRDM1                      | ❌ No           |
| **2**       | Proliferative tumor epithelium (EGFR-responsive)             | CEACAM5/6, EPCAM, CLDN4, MYC, KLF5, EREG/AREG, GPX2, ST14, TFRC | ✅ Yes          |
| **3**       | Stem-like tumor epithelium (oxidative/mt-high)               | CEACAM5/6, EPCAM, LGR5, ST14, MYC, KLF5, PPDPF                  | ✅ Yes          |
| **4**       | Tumor vasculature (endothelium ± pericytes)                  | PECAM1, VWF, PLVAP, RGS5, NOTCH3, TAGLN/ACTA2, SPARCL1          | ❌ No           |
| **5**       | Goblet-like tumor epithelium                                 | PIGR, MUC2, TFF3, REG4, EPCAM, CLDN7, TSPAN8                    | ✅ Likely       |
| **6**       | Cancer-associated fibroblasts (ECM-remodeling CAFs)          | COL1/3/5/6, SPARC, THBS2, CTHRC1, SFRP2/4, ANTXR1               | ❌ No           |
| **7**       | Stress-response tumor epithelium (pseudohypoxia/antioxidant) | NQO1, GSTP1, HSP90AA1/AB1, GPX2, MYC, CEACAM6, KLF5             | ✅ Yes          |
| **8**       | Partial-EMT tumor epithelium (epithelial–stromal hybrid)     | EPCAM, ST14, VIM, COL genes, SPARC, CTSB; EMT/pEMT signature    | ✅ Yes          |
| **9**       | Smooth muscle / pericytes                                    | TAGLN, ACTA2, MYH11, MYL9, CNN1, SPARCL1                        | ❌ No           |
| **10**      | Tumor-associated macrophages / phagocytes                    | CD68, C1QA/B/C, CTSD/CTSB, LYZ, FCER1G, APOE                    | ❌ No           |
| **11**      | Neutrophils / inflammatory myeloid                           | CXCL8, S100A8/A9, CSF3R, FCGR3B, FPR1/2, OSM                    | ❌ No           |
| **12**      | Hypoxia/invasive tumor epithelium                            | CEACAM5/6, EPCAM, VEGFA, TM4SF1, NDRG1, PLOD2, CCL20            | ✅ Yes          |

---

✅ **Quick takeaways**

* **Clearly malignant epithelial clusters:** **0, 2, 3, 7, 8, 12** (and **5** is goblet-like epithelium that’s **likely** malignant in CRC tissue context).
* **Stromal/vascular support:** **4, 6, 9**.
* **Immune infiltrates:** **1 (plasma)**, **10 (macrophages/TAMs)**, **11 (neutrophils)**.


In [None]:
# Leiden cluster id -> fine-grained label for P2CRC (see the marker table above).
subtype_by_cluster = {
    '0': 'Malignant stem/crypt-like',
    '1': 'Plasma',
    '2': 'Malignant proliferative',
    '3': 'Malignant stem-like/oxidative',
    '4': 'Endothelial/pericytes',
    '5': 'Goblet cells',
    '6': 'CAF',
    '7': 'Malignant stressed',
    '8': 'Malignant pEMT',
    '9': 'Smooth muscle',
    '10': 'TAM',
    '11': 'Neutrophil',
    '12': 'Malignant hypoxic/invasive',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['cellsubtypes'] = (
    leiden_labels.map(subtype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# Leiden cluster id -> coarse compartment for P2CRC. These labels are later
# used as reference_key by InferCNV.
celltype_by_cluster = {
    '0': 'Malignant',
    '1': 'Lymphoid',
    '2': 'Malignant',
    '3': 'Malignant',
    '4': 'Vascular',
    '5': 'Epithelial',
    '6': 'Stromal',
    '7': 'Malignant',
    '8': 'Malignant',
    '9': 'Stromal',
    '10': 'Myeloid',
    '11': 'Myeloid',
    '12': 'Malignant',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['celltypes'] = (
    leiden_labels.map(celltype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# UMAP panels for the coarse and the fine annotation; wspace adds room between the panels.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation levels together with object_id to the sample folder.
adata.obs[['celltypes','cellsubtypes','object_id']].to_csv(base_dir / 'P2CRC' / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute column (last field) of a GTF line into a dict.

    Parameters
    ----------
    attr_str : str
        Semicolon-separated ``key "value"`` pairs, e.g.
        ``gene_id "ENSG..."; gene_name "TP53"; level 2;``.

    Returns
    -------
    dict
        Maps every attribute key to its value, with surrounding whitespace
        and double quotes removed. When a key occurs more than once the last
        occurrence wins; a key without a value maps to ``""``.
    """
    attributes = {}
    for field in attr_str.split(";"):
        field = field.strip()
        if not field:
            continue
        # partition() never raises, so a value-less attribute no longer aborts
        # the whole parse with "not enough values to unpack".
        key, _, value = field.partition(" ")
        # Strip padding before the quotes so that `key  "value"` (repeated
        # separator) does not keep a stray leading quote.
        attributes[key] = value.strip().strip('"')
    return attributes

# Collect one coordinate record per "gene" row of the GENCODE v48 basic annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        # Header lines start with '#'.
        if raw_line.startswith("#"):
            continue
        # Exactly nine tab-separated columns are expected; anything else
        # fails loudly in the unpacking.
        (seqname, _source, feature_type, start_pos, end_pos,
         _score, strand_sign, _frame, attribute_text) = raw_line.strip().split("\t")
        if feature_type == "gene":
            gene_attrs = parse_gtf_attributes(attribute_text)
            records.append({
                "gene_id": gene_attrs.get("gene_id"),
                "gene_name": gene_attrs.get("gene_name"),
                # stored without the "chr" prefix
                "chromosome": seqname.replace("chr", ""),
                "start": int(start_pos),
                "end": int(end_pos),
                "strand": strand_sign,
            })

# Index by gene symbol, keep only genes present in the current adata, and keep
# the first row of any symbol that occurs more than once.
gene_table = pd.DataFrame(records).set_index('gene_name')
shared_genes = gene_table.index.intersection(adata.var_names)
gene_table = gene_table.loc[shared_genes]
gtf_df = gene_table.loc[~gene_table.index.duplicated()]

In [None]:
# Attach the gene coordinates (gene_id, chromosome, start, end, strand) to adata.var.
# reindex() aligns the annotation to adata.var_names, so adata.var keeps its own
# rows and order and genes without an annotation get NaN. A column-wise concat
# would add rows for genes that exist only in gtf_df, and would duplicate the
# columns when this cell is re-run or when the saved file already carries them.
gene_coords = gtf_df.reindex(adata.var_names)
for column in gene_coords.columns:
    adata.var[column] = gene_coords[column].to_numpy()

# Put back the "chr" prefix that was removed while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Display the coarse labels stored in adata.obs['celltypes'], which InferCNV
# reads below as reference_key. Cluster ids mean something different in every
# sample, so the labels are shown from the stored column instead of being
# re-derived from a hard-coded mapping that belongs to another sample.
adata.obs['celltypes']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- InferCNV on the sample currently held in `adata` ----
# Uses the chromosome/start/end columns attached to adata.var above.

# Populations used as the CNV baseline; the names must match the labels in
# adata.obs['celltypes'].
reference_groups = ["Lymphoid", "Stromal", "Myeloid"]

cnv.tl.infercnv(
    adata,
    reference_cat=reference_groups,
    reference_key="celltypes",
    n_jobs=1,
    window_size=250,
)


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y), grouped by fine-grained subtype
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# Put the stored counts back into X and drop the log1p bookkeeping, which no
# longer describes X. pop() with a default keeps the cell re-runnable; `del`
# raised KeyError on a second run.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
# NOTE: this overwrites the same file the sample was loaded from.
adata.write_h5ad(base_dir / 'P2CRC' / 'adata.h5ad')

## P5CRC

In [None]:
# Load the P5CRC sample (replaces the previous sample held in `adata`).
adata = sc.read_h5ad(base_dir / 'P5CRC' / 'adata.h5ad')

In [None]:
# Keep a copy of X in the 'counts' layer, because X is overwritten below.
adata.layers['counts'] = adata.X.copy()
# Scale every observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA on the log-normalized matrix (scanpy defaults).
sc.tl.pca(adata)

In [None]:
# Neighborhood graph (scanpy defaults), then the UMAP embedding computed from it.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Leiden clustering with default settings; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes per Leiden cluster with the Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Differential-expression table of every Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the gene names in the first 50 rows of each cluster's ranking table.
for cluster, ranking in de_genes.items():
    print(cluster)
    # to_numpy() returns the same 1-D array as Series.ravel(), which is
    # deprecated since pandas 2.2.
    print(ranking['names'].head(50).to_numpy())


---

| **Cluster** | **Likely Cell Type**                                           | **Key Markers / Rationale**                                                          | **Malignant?**                                                |
| ----------- | -------------------------------------------------------------- | ------------------------------------------------------------------------------------ | ------------------------------------------------------------- |
| **0**       | Goblet/secretory epithelium (goblet-like)                      | PIGR, MUC2, TFF3, FCGBP, REG4, EPCAM, CLDN7                                          | ⚠ Possibly (goblet-like tumor epithelium vs. residual normal) |
| **1**       | Proliferative tumor epithelium (EGFR-responsive)               | CEACAM5/6, EPCAM, S100P, GPX2, IFI27, HOXB9, TM9SF2                                  | ✅ Yes                                                         |
| **2**       | Cancer-associated fibroblasts (ECM-remodeling CAFs)            | COL1/3/5/6, DCN, LUM, THBS2, SFRP2, GREM1                                            | ❌ No                                                          |
| **3**       | Antigen-presenting myeloid cells (macrophage/B-cell interface) | CD74, C1QA/B/C, CD163, MS4A6A, APOE; some B/T markers (MS4A1, TRAC)                  | ❌ No                                                          |
| **4**       | Tumor epithelium (hypoxia/stress; mt-high)                     | CEACAM5/6, EPCAM, S100P, GPX2, NQO1, OLFM4, HOXB9                                    | ✅ Yes                                                         |
| **5**       | Smooth muscle / pericytes                                      | TAGLN, ACTA2, MYH11, MYL9, CNN1                                                      | ❌ No                                                          |
| **6**       | Plasma cells                                                   | JCHAIN, IGKC, IGHA1/IGHM, XBP1, MZB1, PRDM1                                          | ❌ No                                                          |
| **7**       | Endothelium ± perivascular (tumor vasculature)                 | PECAM1, VWF, PLVAP, RGS5, NOTCH3, EGFL7                                              | ❌ No                                                          |
| **8**       | Partial-EMT / stromal-interacting tumor epithelium             | VIM, COL genes, SPARC with EPCAM/ST14 present in dataset → epithelial–stromal hybrid | ✅ Yes                                                         |
| **9**       | Stress-response/proliferative tumor epithelium                 | CEACAM6/5, EPCAM, GPX2, S100P, NQO1, IFITM3, CDC25B, KLF5                            | ✅ Yes                                                         |
| **10**      | Tumor-associated macrophages / phagocytes                      | CD68, C1QA/B/C, CTSD/CTSB, LYZ, FCER1G, APOE                                         | ❌ No                                                          |
| **11**      | Neutrophils (inflammatory myeloid)                             | CXCL8, S100A8/A9, CSF3R, FCGR3B, FPR1/2, OSM                                         | ❌ No                                                          |
| **12**      | Absorptive colonocytes (normal-like epithelium)                | SLC26A3, GUCA2A, KRT20, CEACAM7, TMPRSS2, EPCAM                                      | ❌ No                                                          |
| **13**      | Epithelial–stromal mixed (likely doublets at interface)        | PIGR/MUC12/PHGR1 with COL/POSTN/PDGFRA and plasma markers (IGKC/JCHAIN)              | ⚠ Unclear (likely non-malignant doublets)                     |
| **14**      | Low-quality / off-target (artifact)                            | RBC/neuronal/olfactory/testis genes (HBA2/HBB, KRT24, ORs) not CRC-specific          | ❌ No (artifact)                                               |

---

**Notes:**

* Malignant epithelial programs are strongest in **1, 4, 8, 9** (and often show EGFR/ERBB response: **AREG/EREG**, **GPX2**, **S100P**, **HOXB9**, **IFITM3**, **NQO1**).
* **0** shows goblet features common in CRC’s secretory-like tumor subclones; label as **possibly malignant** depending on spatial context (tumor core vs. margin).
* **12** retains a **normal-like absorptive** signature—use CNV or spatial positioning to confirm non-malignancy.


In [None]:
# Leiden cluster id -> fine-grained label for P5CRC (see the marker table above).
# NOTE(review): the table describes cluster 8 as partial-EMT tumor epithelium
# (malignant), while it is labelled 'CAF' here - confirm this override is intended.
subtype_by_cluster = {
    '0': 'Goblet',
    '1': 'Malignant proliferative',
    '2': 'CAF',
    '3': 'Macrophage',
    '4': 'Malignant hypoxic',
    '5': 'Smooth muscle/pericyte',
    '6': 'Plasma',
    '7': 'Endothelial/perivascular',
    '8': 'CAF',
    '9': 'Malignant stressed',
    '10': 'TAM',
    '11': 'Neutrophil',
    '12': 'Absorptive colon epithelium',
    '13': 'Noise',
    '14': 'Noise',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['cellsubtypes'] = (
    leiden_labels.map(subtype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# Leiden cluster id -> coarse compartment for P5CRC. These labels are later
# used as reference_key by InferCNV.
# NOTE(review): cluster 8 is 'Stromal' here, so it becomes part of the InferCNV
# reference, although the table above calls it partial-EMT tumor epithelium -
# confirm this is intended.
celltype_by_cluster = {
    '0': 'Epithelial',
    '1': 'Malignant',
    '2': 'Stromal',
    '3': 'Myeloid',
    '4': 'Malignant',
    '5': 'Stromal',
    '6': 'Lymphoid',
    '7': 'Vascular',
    '8': 'Stromal',
    '9': 'Malignant',
    '10': 'Myeloid',
    '11': 'Myeloid',
    '12': 'Epithelial',
    '13': 'Noise',
    '14': 'Noise',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['celltypes'] = (
    leiden_labels.map(celltype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# UMAP panels for the coarse and the fine annotation; wspace adds room between the panels.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation levels together with object_id to the sample folder.
adata.obs[['celltypes','cellsubtypes','object_id']].to_csv(base_dir / 'P5CRC' / 'celltypes.csv')

### InferCNV

In [None]:
# Attach the gene coordinates (gene_id, chromosome, start, end, strand) to adata.var.
# gtf_df is reused from an earlier InferCNV section, where it was restricted to
# the genes of the sample loaded at that time. reindex() aligns it to this
# sample's var_names, so adata.var keeps its own rows and order and genes
# without an annotation get NaN. A column-wise concat would add rows whenever
# the two gene sets differ, and would duplicate the columns on a re-run.
gene_coords = gtf_df.reindex(adata.var_names)
for column in gene_coords.columns:
    adata.var[column] = gene_coords[column].to_numpy()

# Put back the "chr" prefix that was removed while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- InferCNV on the sample currently held in `adata` ----
# Uses the chromosome/start/end columns attached to adata.var above.

# Populations used as the CNV baseline; the names must match the labels in
# adata.obs['celltypes'].
reference_groups = ["Lymphoid", "Stromal", "Myeloid"]

cnv.tl.infercnv(
    adata,
    reference_cat=reference_groups,
    reference_key="celltypes",
    n_jobs=1,
    window_size=250,
)


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y), grouped by fine-grained subtype
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# Put the stored counts back into X and drop the log1p bookkeeping, which no
# longer describes X. pop() with a default keeps the cell re-runnable; `del`
# raised KeyError on a second run.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
# NOTE: this overwrites the same file the sample was loaded from.
adata.write_h5ad(base_dir / 'P5CRC' / 'adata.h5ad')

## P3NAT

In [None]:
# Load the P3NAT sample (replaces the previous sample held in `adata`).
adata = sc.read_h5ad(base_dir / 'P3NAT' / 'adata.h5ad')

In [None]:
# Keep a copy of X in the 'counts' layer, because X is overwritten below.
adata.layers['counts'] = adata.X.copy()
# Scale every observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA on the log-normalized matrix (scanpy defaults).
sc.tl.pca(adata)

In [None]:
# Neighborhood graph (scanpy defaults), then the UMAP embedding computed from it.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Leiden clustering with default settings; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes per Leiden cluster with the Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Differential-expression table of every Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the gene names in the first 50 rows of each cluster's ranking table.
for cluster, ranking in de_genes.items():
    print(cluster)
    # to_numpy() returns the same 1-D array as Series.ravel(), which is
    # deprecated since pandas 2.2.
    print(ranking['names'].head(50).to_numpy())


---

| **Cluster** | **Likely Cell Type**                                                 | **Key Markers / Rationale**                                                                   | **Malignant?** |
| ----------- | -------------------------------------------------------------------- | --------------------------------------------------------------------------------------------- | -------------- |
| **0**       | Smooth muscle cells                                                  | TAGLN, ACTA2, MYH11, MYL9, CNN1, LMOD1                                                        | ❌ No           |
| **1**       | Plasma cells                                                         | JCHAIN, IGKC, IGHA1/2, XBP1, MZB1, PRDM1                                                      | ❌ No           |
| **2**       | Endothelial + perivascular cells                                     | PECAM1, VWF, CLDN5, RGS5, ENG, ADGRL4, PLVAP                                                  | ❌ No           |
| **3**       | Goblet/secretory epithelium                                          | PIGR, TFF3, AGR2, LCN2, MUC2, EPCAM                                                           | ❌ No           |
| **4**       | Goblet / secretory epithelium                                        | PIGR, MUC2, TFF3, AGR2, CLCA1, EPCAM, LGALS4                                                  | ❌ No           |
| **5**       | Fibroblasts (lamina propria)                                         | COL1A1/2/3, DCN, LUM, CXCL12, THBS1/2, TIMP2/3                                                | ❌ No           |
| **6**       | Absorptive colonocytes                                               | SLC26A3, CA1/2, KRT20, CEACAM7, GUCA2A, EPCAM                                                 | ❌ No           |
| **7**       | Vascular / intravascular cells (endothelium + circulating admixture) | CLDN5, ERG, FLT4, ECSCR (endothelium) with HBA2/HBB, PPBP, S100A8/9 (RBC/platelet/neutrophil) | ❌ No           |
| **8**       | Mature colonocytes (enterocytes)                                     | GUCA2A/B, AQP8, CLCA4, CEACAM7, KRT20, SLC9A3                                                 | ❌ No           |
| **9**       | T cells (naïve/central memory)                                       | TRAC/TRBC1/2, IL7R, CCL19/21, TCF7, CXCR4                                                     | ❌ No           |
| **10**      | Tissue macrophages                                                   | CD68, C1QA/B/C, APOE, MS4A7, FCER1G axis                                                      | ❌ No           |
| **11**      | Colon epithelium (absorptive + secretory mix)                        | PHGR1, FABP1, SLC26A3, PIGR, KRT20, EPCAM (Ig reads likely luminal-binding)                   | ❌ No           |
| **12**      | Smooth muscle / myofibroblasts                                       | ACTA2, TAGLN, MYH11, MYL9, CNN1, CALD1                                                        | ❌ No           |
| **13**      | Plasma cells (activated)                                             | JCHAIN, IGKC, IGHA1/2, MZB1, POU2AF1, TNFRSF17                                                | ❌ No           |
| **14**      | Resident macrophages (LYVE1/FOLR2⁺)                                  | LYVE1, STAB1, MRC1, FOLR2, CD163, VSIG4                                                       | ❌ No           |
| **15**      | Enteric glia / peripheral glia ± neurons                             | PLP1, MPZ, S100B, PRNP, NCAM1, L1CAM, VIP, SCGN                                               | ❌ No           |

---

✅ **Summary:**
All clusters from normal-adjacent colon tissue represent **non-malignant** epithelial, immune, stromal, endothelial, or neural lineages, consistent with normal tissue context.



In [None]:
# Leiden cluster id -> fine-grained label for P3NAT (see the marker table above).
subtype_by_cluster = {
    '0': 'Smooth muscle',
    '1': 'Plasma',
    '2': 'Endothelial/perivascular',
    '3': 'Goblet cells',
    '4': 'Goblet cells',
    '5': 'Fibroblasts',
    '6': 'Absorptive colon epithelium',
    '7': 'Vascular / intravascular',
    '8': 'Enterocyte',
    '9': 'T',
    '10': 'Tissue macrophage',
    '11': 'Absorptive + secretory colon epithelium',
    '12': 'Smooth muscle',
    '13': 'Activated plasma',
    '14': 'Resident macrophage',
    '15': 'Enteric/peripheral glia',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['cellsubtypes'] = (
    leiden_labels.map(subtype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# Leiden cluster id -> coarse compartment for P3NAT. These labels are later
# used as reference_key by InferCNV.
celltype_by_cluster = {
    '0': 'Stromal',
    '1': 'Lymphoid',
    '2': 'Vascular',
    '3': 'Epithelial',
    '4': 'Epithelial',
    '5': 'Stromal',
    '6': 'Epithelial',
    '7': 'Vascular',
    '8': 'Epithelial',
    '9': 'Lymphoid',
    '10': 'Myeloid',
    '11': 'Epithelial',
    '12': 'Stromal',
    '13': 'Lymphoid',
    '14': 'Myeloid',
    '15': 'Glia',
}
# map() instead of replace(): relabelling a categorical column through
# Series.replace is deprecated since pandas 2.2. Cluster ids missing from the
# mapping keep their Leiden id, which is what replace() did with them.
leiden_labels = adata.obs['leiden'].astype(str)
adata.obs['celltypes'] = (
    leiden_labels.map(celltype_by_cluster).fillna(leiden_labels).astype('category')
)

In [None]:
# UMAP panels for the coarse and the fine annotation; wspace adds room between the panels.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation levels together with object_id to the sample folder.
adata.obs[['celltypes','cellsubtypes','object_id']].to_csv(base_dir / 'P3NAT' / 'celltypes.csv')

### InferCNV

In [None]:
# Attach the gene coordinates (gene_id, chromosome, start, end, strand) to adata.var.
# gtf_df is reused from an earlier InferCNV section, where it was restricted to
# the genes of the sample loaded at that time. reindex() aligns it to this
# sample's var_names, so adata.var keeps its own rows and order and genes
# without an annotation get NaN. A column-wise concat would add rows whenever
# the two gene sets differ, and would duplicate the columns on a re-run.
gene_coords = gtf_df.reindex(adata.var_names)
for column in gene_coords.columns:
    adata.var[column] = gene_coords[column].to_numpy()

# Put back the "chr" prefix that was removed while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- InferCNV on the sample currently held in `adata` ----
# Uses the chromosome/start/end columns attached to adata.var above.

# Populations used as the CNV baseline; the names must match the labels in
# adata.obs['celltypes'].
reference_groups = ["Lymphoid", "Stromal", "Myeloid"]

cnv.tl.infercnv(
    adata,
    reference_cat=reference_groups,
    reference_key="celltypes",
    n_jobs=1,
    window_size=250,
)


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y), grouped by fine-grained subtype
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# Put the stored counts back into X and drop the log1p bookkeeping, which no
# longer describes X. pop() with a default keeps the cell re-runnable; `del`
# raised KeyError on a second run.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
# NOTE: this overwrites the same file the sample was loaded from.
adata.write_h5ad(base_dir / 'P3NAT' / 'adata.h5ad')

## P5NAT

In [None]:
# Load the P5NAT sample (replaces the previous sample held in `adata`).
adata = sc.read_h5ad(base_dir / 'P5NAT' / 'adata.h5ad')

In [None]:
# Keep a copy of X in the 'counts' layer, because X is overwritten below.
adata.layers['counts'] = adata.X.copy()
# Scale every observation to 10,000 total counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA on the log-normalized matrix (scanpy defaults).
sc.tl.pca(adata)

In [None]:
# Neighborhood graph (scanpy defaults), then the UMAP embedding computed from it.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Leiden clustering with default settings; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata)

In [None]:
# UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes per Leiden cluster with the Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Differential-expression table of every Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the gene names in the first 50 rows of each cluster's ranking table.
for cluster, ranking in de_genes.items():
    print(cluster)
    # to_numpy() returns the same 1-D array as Series.ravel(), which is
    # deprecated since pandas 2.2.
    print(ranking['names'].head(50).to_numpy())


---

| **Cluster** | **Likely Cell Type**                           | **Key Markers / Rationale**                                                                                      | **Malignant?**                 |
| ----------- | ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | ------------------------------ |
| **0**       | Plasma cells                                   | JCHAIN, IGKC, IGHA1/2, XBP1, MZB1, PRDM1                                                                         | ❌ No                           |
| **1**       | Goblet/secretory epithelium                    | PIGR, AGR2, MUC2/12, EPCAM, LGALS4, WFDC2                                                                        | ❌ No                           |
| **2**       | Smooth muscle cells                            | TAGLN, ACTA2, MYH11, MYL9, CNN1, LMOD1                                                                           | ❌ No                           |
| **3**       | Colonocytes (absorptive epithelium)            | SLC26A3, CA1/2, FABP1, EPCAM, KRT20, LGALS4                                                                      | ❌ No                           |
| **4**       | Fibroblasts (stromal)                          | COL1A1/2, DCN, LUM, FBLN1/2, TIMP2/3, SERPINF1                                                                   | ❌ No                           |
| **5**       | T cells (mixed CD4/CD8, some B-cell admixture) | TRAC/TRBC2, IL7R, MS4A1, IKZF1, CD79A, BTG1                                                                      | ❌ No                           |
| **6**       | Enteroendocrine / secretory epithelium         | PIGR, MUC2, CHGA, TPH1, PYY, GCG, LGR5, EPHB3                                                                    | ❌ No                           |
| **7**       | Absorptive colonocytes                         | SLC26A3, GUCA2A/B, KRT20, CEACAM7, EPCAM, AQP8                                                                   | ❌ No                           |
| **8**       | Mature colonocytes (absorptive)                | GUCA2A/B, CEACAM5/6/7, FABP1, KRT20, EPCAM                                                                       | ❌ No                           |
| **9**       | Mixed plasma / epithelial (possible doublets)  | IGKC, JCHAIN, IGHA1, PIGR, PHGR1, COL3A1                                                                         | ❌ No (likely technical mix)    |
| **10**      | Endothelial cells                              | PECAM1, VWF, CLDN5, PLVAP, RGS5, ENG, SPARCL1                                                                    | ❌ No                           |
| **11**      | Macrophages (tissue-resident)                  | CD68, APOE, C1QA/B/C, LGMN, CD163, STAB1                                                                         | ❌ No                           |
| **12**      | Low-quality / noise cluster                    | Many uncharacteristic genes (testis/spermatid, pseudogenes, olfactory receptors), lacks coherent lineage markers | ⚠️ Unclear (likely artifact)   |
| **13**      | Low-quality / noise cluster                    | Mixed pseudogenes, ORs, embryonic/testis genes (HBG1, NANOS3), not colon-specific                                | ⚠️ Unclear (artifact/doublets) |
| **14**      | Enteric glia / peripheral glia ± neurons       | CRYAB, S100B, PRNP, NCAM1, PMP22, VIP, L1CAM                                                                     | ❌ No                           |
| **15**      | Low-quality / developmental-like               | OR genes, SNAI1, NEUROD4, IFNA1; not colon lineage                                                               | ⚠️ Likely artifact             |
| **16**      | Crypt-base stem/progenitor / transit-amplifying epithelium                | EPHB3, NOTUM, NKD1, EPCAM, KRT8, TFF3, SLC12A2                                                          | ❌ No                         |
| **17**      | Low-quality / noise cluster                    | Pseudogenes, ORs, non-colon TFs; lacks coherent epithelial/immune markers                                        | ⚠️ Artifact                    |
| **18**      | Low-quality / noise cluster                    | Olfactory receptor / testis-specific mix (POU2F3, PRAMEF, ACTA1, COL2A1), not colon                              | ⚠️ Artifact                    |
| **19**      | Low-quality / noise cluster                    | Pseudogenes, developmental genes (HAND1, FOXC1, SRARP), ORs                                                      | ⚠️ Artifact                    |
| **20**      | Neural-like / noise                            | SNCA, CHRNA2, CRMP1, RELN; mixture with HBB (possible doublets)                                                  | ⚠️ Likely artifact/doublet     |
| **21**      | Low-quality / noise cluster                    | Testis-specific/OR-rich pseudogenes, PRAMEF, VCX2, NMS                                                           | ⚠️ Artifact                    |
| **22**      | Low-quality / noise cluster                    | Similar to 21/23/24; dominated by testis/OR genes, not colon                                                     | ⚠️ Artifact                    |
| **23**      | Low-quality / noise cluster                    | Same OR/testis pseudogene cluster as 21/22/24                                                                    | ⚠️ Artifact                    |
| **24**      | Low-quality / noise cluster                    | Testis-specific/OR-rich genes, repeats with 21–23                                                                | ⚠️ Artifact                    |

---

✅ **Summary:**

* **Normal epithelial lineages**: clusters 1, 3, 6, 7, 8, 16.
* **Stromal/vascular**: clusters 2, 4, 10.
* **Immune**: clusters 0 (plasma), 5 (T cells), 11 (macrophages).
* **Neural/glial**: cluster 14.
* **Likely artifacts/doublets**: clusters 9, 12, 13, 15, 17–24 (mostly dominated by olfactory receptor/testis/repetitive genes, not relevant to colon biology; cluster 9 is a plasma/epithelial mix).

---


In [None]:
# Fine-grained label for every Leiden cluster id (ids are the string values
# stored in adata.obs.leiden); several clusters share the label 'Noise'.
subtype_by_cluster = {
    '0': 'Plasma',
    '1': 'Goblet cells',
    '2': 'Smooth muscle',
    '3': 'Absorptive colon epithelium',
    '4': 'Fibroblast',
    '5': 'T',
    '6': 'Enteroendocrine/secretory epithelium',
    '7': 'Absorptive colon epithelium',
    '8': 'Mature absorptive colon epithelium',
    '9': 'Noise',
    '10': 'Endothelial',
    '11': 'Tissue resident macrophage',
    '12': 'Noise',
    '13': 'Noise',
    '14': 'Enteric/peripheral glia',
    '15': 'Noise',
    '16': 'Crypt-base stem/progenitor',
    '17': 'Noise',
    '18': 'Noise',
    '19': 'Noise',
    '20': 'Noise',
    '21': 'Noise',
    '22': 'Noise',
    '23': 'Noise',
    '24': 'Noise',
}
adata.obs['cellsubtypes'] = adata.obs.leiden.replace(subtype_by_cluster)

In [None]:
# Broad compartment for every Leiden cluster id. The labels 'Lymphoid',
# 'Stromal' and 'Myeloid' are the ones later passed to infercnv as reference.
celltype_by_cluster = {
    '0': 'Lymphoid',
    '1': 'Epithelial',
    '2': 'Stromal',
    '3': 'Epithelial',
    '4': 'Stromal',
    '5': 'Lymphoid',
    '6': 'Epithelial',
    '7': 'Epithelial',
    '8': 'Epithelial',
    '9': 'Noise',
    '10': 'Vascular',
    '11': 'Myeloid',
    '12': 'Noise',
    '13': 'Noise',
    '14': 'Glia',
    '15': 'Noise',
    '16': 'Epithelial',
    '17': 'Noise',
    '18': 'Noise',
    '19': 'Noise',
    '20': 'Noise',
    '21': 'Noise',
    '22': 'Noise',
    '23': 'Noise',
    '24': 'Noise',
}
adata.obs['celltypes'] = adata.obs.leiden.replace(celltype_by_cluster)

In [None]:
# Two UMAP panels: one coloured by broad cell type, one by subtype.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export the per-cell annotations plus the 'object_id' column to P5NAT/celltypes.csv.
adata.obs[['celltypes','cellsubtypes','object_id']].to_csv(base_dir / 'P5NAT' / 'celltypes.csv')

### InferCNV

In [None]:
# Append the columns of gtf_df (gene coordinates) to the gene table.
# gtf_df is not built in this section, so it may have been filtered against a
# different sample's genes. Reindexing it to this object's var_names keeps the
# result row-aligned with adata.var; a plain outer concat would add rows for
# genes unknown to this object and the assignment to adata.var would then fail.
adata.var = pd.concat([adata.var, gtf_df.reindex(adata.var_names)], axis=1)

# Prefix the chromosome names with 'chr'.
# NOTE(review): presumably gtf_df stores them without the prefix — confirm.
adata.var['chromosome'] = 'chr'+adata.var['chromosome']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# Copy-number inference on the AnnData currently bound to `adata`.

# ---- 1. Reference populations ----
# Values of the `celltypes` column that infercnvpy should treat as the
# reference populations (here the immune and stromal compartments).
reference_groups = ["Lymphoid", "Stromal", "Myeloid",]

# ---- 2. Run infercnv ----
# NOTE(review): infercnvpy reads gene positions from adata.var; presumably these are the columns merged in from gtf_df — confirm.
cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
)


In [None]:
# ---- 3. Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y), with cells grouped by subtype label.
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# Put the object back into its raw-count state before it is saved.
# pop() with a default replaces `del`, so re-running this cell after the
# 'log1p' entry is already gone no longer raises KeyError.
adata.uns.pop('log1p', None)
# X is pointed back at layers['counts'].
# NOTE(review): presumably the copy of X taken before normalisation, as in the other sample sections — confirm.
adata.X = adata.layers['counts']

In [None]:
# Save the annotated object to P5NAT/adata.h5ad.
adata.write_h5ad(base_dir / 'P5NAT' / 'adata.h5ad')

## P5CRC

In [None]:
# Load the P5CRC sample.
adata = sc.read_h5ad(base_dir / 'P5CRC' / 'adata.h5ad')

In [None]:
# Keep an untouched copy of X in layers['counts'] (restored into X before saving).
adata.layers['counts'] = adata.X.copy()
# Scale every cell to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA of the normalised, log-transformed matrix (scanpy defaults).
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy defaults, followed by the UMAP embedding.
sc.pp.neighbors(adata)
sc.tl.umap(adata)

In [None]:
# Leiden clustering with default settings; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata)

In [None]:
# UMAP coloured by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes for each Leiden cluster using the Wilcoxon test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One ranked-genes table per Leiden cluster, keyed by the cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print each cluster label followed by the gene names in the first 50 rows of
# its ranking table. Series.ravel() is deprecated in recent pandas releases;
# to_numpy() returns the same 1-D array, so the printed output is unchanged.
for gr, ranked_df in de_genes.items():
    print(gr)
    print(ranked_df.head(50).names.to_numpy())


---

| **Cluster** | **Likely Cell Type**                                           | **Key Markers / Rationale**                                                          | **Malignant?**                                                |
| ----------- | -------------------------------------------------------------- | ------------------------------------------------------------------------------------ | ------------------------------------------------------------- |
| **0**       | Goblet/secretory epithelium (goblet-like)                      | PIGR, MUC2, TFF3, FCGBP, REG4, EPCAM, CLDN7                                          | ⚠ Possibly (goblet-like tumor epithelium vs. residual normal) |
| **1**       | Proliferative tumor epithelium (EGFR-responsive)               | CEACAM5/6, EPCAM, S100P, GPX2, IFI27, HOXB9, TM9SF2                                  | ✅ Yes                                                         |
| **2**       | Cancer-associated fibroblasts (ECM-remodeling CAFs)            | COL1/3/5/6, DCN, LUM, THBS2, SFRP2, GREM1                                            | ❌ No                                                          |
| **3**       | Antigen-presenting myeloid cells (macrophage/B-cell interface) | CD74, C1QA/B/C, CD163, MS4A6A, APOE; some B/T markers (MS4A1, TRAC)                  | ❌ No                                                          |
| **4**       | Tumor epithelium (hypoxia/stress; mt-high)                     | CEACAM5/6, EPCAM, S100P, GPX2, NQO1, OLFM4, HOXB9                                    | ✅ Yes                                                         |
| **5**       | Smooth muscle / pericytes                                      | TAGLN, ACTA2, MYH11, MYL9, CNN1                                                      | ❌ No                                                          |
| **6**       | Plasma cells                                                   | JCHAIN, IGKC, IGHA1/IGHM, XBP1, MZB1, PRDM1                                          | ❌ No                                                          |
| **7**       | Endothelium ± perivascular (tumor vasculature)                 | PECAM1, VWF, PLVAP, RGS5, NOTCH3, EGFL7                                              | ❌ No                                                          |
| **8**       | Partial-EMT / stromal-interacting tumor epithelium             | VIM, COL genes, SPARC with EPCAM/ST14 present in dataset → epithelial–stromal hybrid | ✅ Yes                                                         |
| **9**       | Stress-response/proliferative tumor epithelium                 | CEACAM6/5, EPCAM, GPX2, S100P, NQO1, IFITM3, CDC25B, KLF5                            | ✅ Yes                                                         |
| **10**      | Tumor-associated macrophages / phagocytes                      | CD68, C1QA/B/C, CTSD/CTSB, LYZ, FCER1G, APOE                                         | ❌ No                                                          |
| **11**      | Neutrophils (inflammatory myeloid)                             | CXCL8, S100A8/A9, CSF3R, FCGR3B, FPR1/2, OSM                                         | ❌ No                                                          |
| **12**      | Absorptive colonocytes (normal-like epithelium)                | SLC26A3, GUCA2A, KRT20, CEACAM7, TMPRSS2, EPCAM                                      | ❌ No                                                          |
| **13**      | Epithelial–stromal mixed (likely doublets at interface)        | PIGR/MUC12/PHGR1 with COL/POSTN/PDGFRA and plasma markers (IGKC/JCHAIN)              | ⚠ Unclear (likely non-malignant doublets)                     |
| **14**      | Low-quality / off-target (artifact)                            | RBC/neuronal/olfactory/testis genes (HBA2/HBB, KRT24, ORs) not CRC-specific          | ❌ No (artifact)                                               |

---

**Notes:**

* Malignant epithelial programs are strongest in **1, 4, 8, 9** (and often show EGFR/ERBB response: **AREG/EREG**, **GPX2**, **S100P**, **HOXB9**, **IFITM3**, **NQO1**).
* **0** shows goblet features common in CRC’s secretory-like tumor subclones; label as **possibly malignant** depending on spatial context (tumor core vs. margin).
* **12** retains a **normal-like absorptive** signature—use CNV or spatial positioning to confirm non-malignancy.


In [None]:
# Fine-grained label for every Leiden cluster id of this sample.
# NOTE(review): the annotation table calls cluster 8 partial-EMT tumour
# epithelium, but it is labelled 'CAF' here — confirm this is intentional.
subtype_by_cluster = {
    '0': 'Goblet',
    '1': 'Malignant proliferative',
    '2': 'CAF',
    '3': 'Macrophage',
    '4': 'Malignant hypoxic',
    '5': 'Smooth muscle/pericyte',
    '6': 'Plasma',
    '7': 'Endothelial/perivascular',
    '8': 'CAF',
    '9': 'Malignant stressed',
    '10': 'TAM',
    '11': 'Neutrophil',
    '12': 'Absorptive colon epithelium',
    '13': 'Noise',
    '14': 'Noise',
}
adata.obs['cellsubtypes'] = adata.obs.leiden.replace(subtype_by_cluster)

In [None]:
# Broad compartment for every Leiden cluster id of this sample.
# NOTE(review): cluster 8 is marked malignant in the annotation table but is
# assigned to 'Stromal' here — confirm this is intentional.
celltype_by_cluster = {
    '0': 'Epithelial',
    '1': 'Malignant',
    '2': 'Stromal',
    '3': 'Myeloid',
    '4': 'Malignant',
    '5': 'Stromal',
    '6': 'Lymphoid',
    '7': 'Vascular',
    '8': 'Stromal',
    '9': 'Malignant',
    '10': 'Myeloid',
    '11': 'Myeloid',
    '12': 'Epithelial',
    '13': 'Noise',
    '14': 'Noise',
}
adata.obs['celltypes'] = adata.obs.leiden.replace(celltype_by_cluster)

In [None]:
# Two UMAP panels: one coloured by broad cell type, one by subtype.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export the per-cell annotations plus the 'object_id' column for this sample.
# This section loads and saves P5CRC, so the CSV belongs in the P5CRC folder;
# the previous 'P1CRC' target would clobber the annotation file of another sample.
adata.obs[['celltypes','cellsubtypes','object_id']].to_csv(base_dir / 'P5CRC' / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Turn a GTF attribute column into a ``{key: value}`` dict.

    The column is read as a ``;``-separated list of ``key value`` entries.
    Each entry is split at its first space and double quotes at either end of
    the value are removed. Empty entries (such as the one after a trailing
    ``;``) are skipped. If a key occurs several times, the last one wins.

    Raises:
        ValueError: if a non-empty entry contains no space.
    """
    entries = (chunk.strip() for chunk in attr_str.strip().split(";"))
    pairs = (entry.split(" ", 1) for entry in entries if entry != "")
    return {key: val.strip('"') for key, val in pairs}

# Collect one record per 'gene' feature of the GENCODE annotation file.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):  # header / comment line
            continue
        # Every data row must split into exactly nine tab-separated columns;
        # the columns that are not used further get underscore-prefixed names.
        chrom, _source, feature, start, end, _score, strand, _frame, attrs = raw_line.strip().split("\t")
        if feature == "gene":
            gene_attrs = parse_gtf_attributes(attrs)
            records.append({
                "gene_id": gene_attrs.get("gene_id"),
                "gene_name": gene_attrs.get("gene_name"),
                "chromosome": chrom.replace("chr",""),  # 'chr' is added back when merged into adata.var
                "start": int(start),
                "end": int(end),
                "strand": strand
            })

# Index by gene symbol, keep the symbols that are also in adata.var_names,
# and keep only the first row for any symbol that occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
shared_genes = gtf_df.index.intersection(adata.var_names)
gtf_df = gtf_df.loc[shared_genes]
is_repeat = gtf_df.index.duplicated()
gtf_df = gtf_df.loc[~is_repeat]

In [None]:
# Append the gene-coordinate columns of gtf_df to adata.var, aligned on the gene-name index.
adata.var = pd.concat([adata.var, gtf_df],axis=1)

# Put the 'chr' prefix, which was stripped while parsing the GTF, back on the chromosome names.
adata.var['chromosome'] = 'chr'+adata.var['chromosome']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# Copy-number inference on the AnnData currently bound to `adata`.

# ---- 1. Reference populations ----
# Values of the `celltypes` column that infercnvpy should treat as the
# reference populations (here the immune and stromal compartments).
reference_groups = ["Lymphoid", "Stromal", "Myeloid",]

# ---- 2. Run infercnv ----
# NOTE(review): infercnvpy reads gene positions from adata.var; presumably these are the columns merged in from gtf_df — confirm.
cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
)


In [None]:
# ---- 3. Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y), with cells grouped by subtype label.
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# Put the object back into its raw-count state before it is saved.
# pop() with a default replaces `del`, so re-running this cell after the
# 'log1p' entry is already gone no longer raises KeyError.
adata.uns.pop('log1p', None)
# X is pointed back at the copy of the matrix stored in layers['counts']
# before normalisation.
adata.X = adata.layers['counts']

In [None]:
# Save the annotated object to P5CRC/adata.h5ad (the same path it was loaded from).
adata.write_h5ad(base_dir / 'P5CRC' / 'adata.h5ad')

# Clean cell types

In [None]:
# Load every sample, attach its NicheFinder embedding and pool all samples
# into a single AnnData.
adatas = []
for sample in tqdm(sample_list):
    adata = sc.read_h5ad(base_dir / sample / 'adata.h5ad')
    # Embedding table of this sample, indexed by cell id so it can be aligned with adata.
    embeddings_df = all_embeddings[sample].copy()
    embeddings_df = embeddings_df.set_index('cell_id')

    # Restrict both objects to the cells they share, in the same order.
    common_idx = adata.obs_names.intersection(embeddings_df.index)
    adata = adata[common_idx].copy()
    embeddings_df = embeddings_df.loc[common_idx]
    # The embedding is stored from the ten columns named '0' .. '9'.
    adata.obsm['NicheFinder'] = embeddings_df.loc[:,['0','1','2','3','4','5','6','7','8','9']]

    # Tag cell names with the sample so they stay unique after pooling.
    adata.obs_names = adata.obs_names + '::' + sample
    adata.obs['sample_id'] = sample
    adatas.append(adata)
# NOTE(review): AnnData.concatenate is deprecated upstream in favour of
# anndata.concat; migrating would also make the suffix stripping below unnecessary.
adata = adatas[0].concatenate(*adatas[1:])
# concatenate() appends '-<batch index>' to every cell name. Strip only that
# trailing suffix: cutting at the FIRST '-' would truncate any cell id that
# itself contains a hyphen and silently drop the '::<sample>' tag added above.
adata.obs_names = adata.obs_names.str.rsplit('-', n=1).str[0]

## Malignant

In [None]:
# Independent copy of the cells whose broad label is 'Malignant'.
maladata = adata[adata.obs.celltypes=='Malignant'].copy()

In [None]:
# Condition label = last three characters of the sample id.
maladata.obs['condition'] = maladata.obs.sample_id.str[-3:]

# Scale every cell to a total of 10,000, then log1p-transform.
# NOTE(review): assumes X holds raw counts here (the per-sample files are saved with X reset to layers['counts']) — confirm.
sc.pp.normalize_total(maladata, target_sum=10000)
sc.pp.log1p(maladata)

sc.tl.pca(maladata)

# Batch-balanced neighbour graph with sample_id as the batch key.
sc.external.pp.bbknn(maladata, batch_key='sample_id')

sc.tl.umap(maladata)

# UMAP panels coloured by condition, per-sample subtype label and sample.
sc.pl.umap(maladata, color=['condition','cellsubtypes','sample_id',], ncols=2, wspace=0.5)

In [None]:
# Re-cluster the subset; labels are stored in obs['leiden_gex'].
sc.tl.leiden(maladata, key_added='leiden_gex')

In [None]:
# UMAP coloured by the new cluster labels.
sc.pl.umap(maladata, color=['leiden_gex'], ncols=2)

In [None]:
# Rank genes for each leiden_gex cluster using the Wilcoxon test.
sc.tl.rank_genes_groups(maladata, groupby='leiden_gex', method='wilcoxon')

In [None]:
# One ranked-genes table per leiden_gex cluster, keyed by the cluster label.
dgex_mal = {
    cluster: sc.get.rank_genes_groups_df(maladata, group=cluster)
    for cluster in maladata.obs.leiden_gex.unique()
}

In [None]:
# Print each cluster label followed by the gene names in the first 100 rows of
# its ranking table. Series.ravel() is deprecated in recent pandas releases;
# to_numpy() returns the same 1-D array, so the printed output is unchanged.
for gr, diffg in dgex_mal.items():
    print(gr)
    print(diffg.head(100).names.to_numpy())

In [None]:
# Number of cells in each leiden_gex cluster.
maladata.obs.leiden_gex.value_counts()

Here’s a quick “why” for each cluster label—focused on the most discriminative marker sets you shared and the spatial context (tumor nests).

* **0 — Malignant TA-like (absorptive/crypt)**
  OLFM4⁺ EPCAM⁺ TSPAN8⁺ SLC26A3/ANPEP/PCK1: TA/early-absorptive crypt program retained within tumor; epithelial adhesion/transport genes argue tumor epithelium rather than immune/stromal.

* **1 — Malignant (LCN2+/injury)**
  LCN2⁺ MUC13/1⁺ CEACAM6/5⁺ IFI27⁺ TMPRSS4⁺ PI3⁺: classic injury/regenerative and EGFR-adjacent stress signature frequently seen in CRC tumor epithelium; co-localizes with malignant nests.

* **2 — Malignant TA-like (secretory/REG⁺)**
  OLFM4⁺ REG1A/B⁺ DEFA5/6⁺ ATOH1-axis, MUC17, LYPD8: secretory-crypt differentiation overlaid on TA program—consistent with dysplastic/early neoplastic secretory cells in tumors.

* **3 — Malignant (stress/proliferative/translation)**
  EEF1G/EEF1B2/HSP90/HSPA/ribosome-high, NQO1/GSTP1: global protein synthesis/chaperone and oxidative-stress up—typical of proliferating/stressed tumor epithelium.

* **4 — Malignant (EGFR/ERBB; epithelial program)**
  CEACAM5/6⁺ EPCAM⁺ EGFR-pathway targets (EREG, AREG), CTNND1/VAV3/PTPRK, SOX4/MYC: epithelial, EGFR/ERBB signaling, invasion-associated scaffolders—hallmarks of CRC tumor cells.

* **5 — TAM (inflammatory; TREM1⁺/CXCL8⁺)**
  CXCL8⁺ IL1B⁺ TREM1⁺ FCER1G⁺ ITGAX⁺ SPP1⁺ AQP9⁺ PTGS2⁺: inflammatory macrophage/TAM program; collagenases (MMP12/2) and complement (C1Q) support myeloid identity.

* **6 — Malignant (EGFR/invasive; CEACAM6⁺)**
  CEACAM6⁺ PTP4A3⁺ LCN2⁺ RRBP1⁺ TFRC⁺ PRSS23⁺ KIF5B⁺: EGFR/adhesion/cargo-trafficking and invasion/stress modules; epithelial scaffold (CLDN4/7) intact.

* **7 — Malignant (stress/chaperone-high)**
  NQO1/GSTP1/HSP90AA1/AB1/HSPD1, MCM4/7: oxidative and proteostasis stress with S-phase/DNA-rep cues—tumor stress/metabolic pressure rather than a lineage program.

* **8 — Macrophages (tissue-resident C1Q⁺)**
  C1QA/B/C⁺ CD74⁺ FCER1G⁺ APOE⁺ TYROBP⁺: canonical tissue-resident macrophage/C1Q module; extracellular matrix (COL/THY1) contamination likely from vicinity.

* **9 — B/Plasma (activated B/GC-like)**
  JCHAIN⁺ MS4A1⁺ CD79A⁺ POU2AF1⁺ CXCR4⁺ CXCL12-axis, TNFRSF13B: germinal-center/activated B features; some CPA3/MS4A2 spill-in but core B program dominates.

* **10 — Secretory crypt (REG⁺/DEFA⁺ dysplastic)**
  REG1A/B⁺ DEFA5/6⁺ ITLN1⁺ TMIGD1⁺ ATOH1⁺ CLCA1⁺: Paneth/goblet-like secretory crypt program in tumor context—consistent with dysplastic secretory epithelium.

* **11 — Secretory crypt (PYY⁺/GUCA2B⁺ dysplastic)**
  PYY⁺ GUCA2B⁺ REG1A/B⁺ CLCA1⁺ ITLN1⁺: enteroendocrine/secretory-crypt blend within tumor nests; fits dysplastic secretory epithelium.

* **12 — Malignant (hypoxia/metabolic; VEGFA⁺)**
  VEGFA⁺ NDRG1⁺ PLOD2⁺ HK2/PFKFB3/4, TM4SF1⁺: HIF/Warburg-like metabolic rewiring and pro-angiogenic signaling—classic hypoxic tumor state.

* **13 — Malignant (stress/metabolic; FABP1⁺/PRSS23⁺)**
  FABP1⁺ PRSS23⁺ PYGB⁺ KLF5/MYC response, chaperones and translation factors: lipid/glucose stress and growth signaling—tumor metabolic adaptation.

* **14 — Noise / low-quality (germline/testis-biased)**
  CTAG/PRAMEF, KRTAPs, germline/testis and olfactory/keratin-associated loci: non-specific/ambient or doublet artifacts; not a coherent CRC/lineage program.

* **15 — Noise / low-quality**
  Same germline/low-complexity panel with scattered neuronal/testis genes; likely ambient/doublets/low UMI.

* **16 — Noise / low-quality**
  Similar to 14/15 (germline/testis/keratin-associated); no consistent epithelial/myeloid/stromal signature.

* **17 — Noise / low-quality**
  Housekeeping/transport/nuclear scaffold with germline markers; lacks cohesive lineage signal—flag as noise.

If you later decide to collapse labels (e.g., merge **10** and **11** into “Secretory crypt (dysplastic)”, or **12** and **13** into “Malignant (hypoxia/stress)” for figures), I can provide a compact mapping too.


In [None]:
# Label for every `leiden_gex` cluster of the malignant compartment, keyed by
# the cluster id as a string. Trailing comments name the marker genes behind
# each call; clusters 10/11 and 12/13 deliberately share a label.
malignant_annotation = {
    "0": "Malignant TA-like (absorptive/crypt)",  # TSPAN8, OLFM4, EPCAM, SLC26A3, ANPEP
    "1": "Malignant (LCN2+/injury)",  # LCN2, MUC13/1, CEACAM6/5, IFI27, TMPRSS4
    "2": "Malignant TA-like (secretory/REG+)",  # OLFM4, REG1A/B, DEFA5/6, ATOH1 axis
    "3": "Malignant (stress/proliferative/translation)",  # EEF1G/B2, HSPs, ribosome/translation-high
    "4": "Malignant (EGFR/ERBB)",  # CEACAM5/6, EGFR-pathway targets, CTNND1, VAV3
    "5": "TAM (TREM1+/SPP1+)",  # CXCL8, IL1B, TREM1, FCER1G, ITGAX, SPP1
    "6": "Malignant (EGFR/invasive)",  # CEACAM6, PTP4A3, LCN2, RRBP1, stress/proliferation
    "7": "Malignant (stress/chaperone-high)",  # NQO1, GSTP1, HSP90AA1/AB1, MCMs
    "8": "Macrophages (tissue-resident C1Q+)",  # C1QA/B/C, CD74, FCER1G, APOE
    "9": "B/Plasma (activated B/GC-like)",  # JCHAIN, MS4A1, CD79A, CXCR4, POU2AF1
    "10": "Secretory crypt (dysplastic)",  # REG1A/B, DEFA5/6, ITLN1, TMIGD1, ATOH1
    "11": "Secretory crypt (dysplastic)",  # REG1A/B, PYY, GUCA2B, CLCA1
    "12": "Malignant (hypoxia/stress/metabolic)",  # VEGFA, PLOD2, NDRG1, HK2, PFKFB3/4
    "13": "Malignant (hypoxia/stress/metabolic)",  # FABP1, PRSS23, MYC/KLF5 response, injury/stress
    # 14-17: CTAG/PRAMEF, KRTAPs and similar low-complexity/germline panels
    **{str(cluster): "Noise" for cluster in range(14, 18)},
}


In [None]:
# Translate each cell's leiden_gex id through malignant_annotation and save the resulting per-cell labels as CSV.
maladata.obs.leiden_gex.replace(malignant_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/malignant_compt_annotation.csv')

## Lymphoid

In [None]:
# Independent copy of the cells whose broad label is 'Lymphoid'.
lymphadata = adata[adata.obs.celltypes=='Lymphoid'].copy()

In [None]:
# Condition label = last three characters of the sample id.
lymphadata.obs['condition'] = lymphadata.obs.sample_id.str[-3:]

# Scale every cell to a total of 10,000, then log1p-transform.
# NOTE(review): assumes X holds raw counts here (the per-sample files are saved with X reset to layers['counts']) — confirm.
sc.pp.normalize_total(lymphadata, target_sum=10000)
sc.pp.log1p(lymphadata)

sc.tl.pca(lymphadata)

# Batch-balanced neighbour graph with sample_id as the batch key.
sc.external.pp.bbknn(lymphadata, batch_key='sample_id')

sc.tl.umap(lymphadata)


In [None]:
# UMAP panels coloured by condition and by sample.
sc.pl.umap(lymphadata, color=['condition','sample_id',], ncols=2, wspace=0.5)

In [None]:
# Re-cluster the subset; labels are stored in obs['leiden_gex'].
sc.tl.leiden(lymphadata, key_added='leiden_gex')

In [None]:
# UMAP coloured by the new cluster labels.
sc.pl.umap(lymphadata, color=['leiden_gex'], ncols=2)

In [None]:
# Rank genes for each leiden_gex cluster using the Wilcoxon test.
sc.tl.rank_genes_groups(lymphadata, groupby='leiden_gex', method='wilcoxon')

In [None]:
# One ranked-genes table per leiden_gex cluster, keyed by the cluster label.
dgex_lymphoid = {
    cluster: sc.get.rank_genes_groups_df(lymphadata, group=cluster)
    for cluster in lymphadata.obs.leiden_gex.unique()
}

In [None]:
# Print each cluster label followed by the gene names in the first 50 rows of
# its ranking table. Series.ravel() is deprecated in recent pandas releases;
# to_numpy() returns the same 1-D array, so the printed output is unchanged.
for gr, diffg in dgex_lymphoid.items():
    print(gr)
    print(diffg.head(50).names.to_numpy())


---

| **Clust.** | **Call**                                                       | **Why (key markers)**                                                                              | **Contamination / Notes**                                                                 |
| ---------- | -------------------------------------------------------------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| **0**      | **T cells (naïve/central memory → Tfh-like)**                  | TRAC/TRBC1/2, **IL7R**, **CCL19/CCL21**, CXCR4, IKZF1, BTG1                                        | Minor B-cell bleed (**MS4A1**) and stromal (**VIM**). Overall a bona-fide T-cell cluster. |
| **1**      | **Plasma cells**                                               | **JCHAIN, IGKC/IGLC1, IGHA1**, XBP1, MZB1, PRDM1                                                   | Some epithelial proximity (**PIGR, ITM2C**) and stress (TXNDC5/HERPUD1) — still plasma.   |
| **2**      | **Plasma cells**                                               | **JCHAIN, IGKC/IGLC1, IGHM/IGHA1, TNFRSF17 (BCMA)**, XBP1, MZB1                                    | Clear **epithelial admixture** (**PIGR, MUC12, FCGBP**) → likely spatial bleed/doublets.  |
| **3**      | **CAF / stromal (not lymphoid)**                               | **COL1A1/1A2/3A1/6A3**, DCN, LUM, **SPARC**, SFRP2/4, THBS2                                        | Misplaced in lymphoid compartment; strong ECM-remodeling CAF signature.                   |
| **4**      | **Mixed non-lymphoid (epithelial + vascular + myeloid)**       | CXCL8/OSM (myeloid chemotaxis), **OLFM4/LGR5/DEFA5/6** (crypt epith), **SELE/STAB2** (endothelial) | Predominantly **contamination/misclustering**; not a lymphoid population.                 |
| **5**      | **Plasma cells near stroma (doublets likely)**                 | **JCHAIN/IGKC/IGHA1**, ADAMDEC1; plus **COL1/3/6**, DCN, SPARCL1                                   | Plasma program with **CAF ECM** → probable plasma–stromal doublets or edge pixels.        |
| **6**      | **Epithelial secretory (goblet/Reg4⁺) — not lymphoid**         | **MUC2, FCGBP, SPINK4, REG4, ITLN1, PIGR**, WFDC2; plus **RGS5/CLDN5**                             | Strong **epithelial** with pericyte/vascular genes → contamination in lymphoid bin.       |
| **7**      | **Plasma cells (activated IgG/IgM)**                           | **IGHG1/IGHG3/IGHM**, XBP1, MZB1, PRDM1, CD38                                                      | Minor endothelial (**PECAM1**) bleed; still plasma.                                       |
| **8**      | **Macrophages / mononuclear phagocytes**                       | **CD68, C1QA/B/C, MS4A6A/7, CD163, CSF1R**, LGMN, TYROBP                                           | Not lymphoid; TAM-like myeloid infiltrate.                                                |
| **9**      | **Neutrophils / inflammatory myeloid**                         | **CXCL8, S100A8/A9, CSF3R, FCGR3B, FPR1/2, TREM1**, AQP9                                           | Endothelial activation (**SELE/ACKR1**) present → vascular admixture.                     |
| **10**     | **Myeloid (TAM-leaning) with stromal proximity**               | **COL3A1/1A1**, SPARC, IGFBP7, CXCL14, LYZ, CTSB/IFI30                                             | Hybrid myeloid–stromal profile; not lymphoid.                                             |
| **11**     | **Crypt/secretory epithelial (mt-high) — not lymphoid**        | **OLFM4, REG1A, DMBT1, LGR5, MUC17, NOTUM**, PRAP1                                                 | Likely epithelial spot leakage into lymphoid set.                                         |
| **12**     | **Crypt/secretory epithelial (injury/mt-high) — not lymphoid** | **OLFM4, REG1A/B, DMBT1, SPINK4**, CXCL8/G0S2 (stress)                                             | Epithelial contamination; some myeloid chemokines.                                        |
| **13**     | **Low-quality / ambient mix (Ig + granulocytic)**              | **IGHG1**, FCGR3B, FPR2, TREM1, G0S2, odd ORs                                                      | Likely **ambient Ig** + **neutrophil** RNA; treat as **artifact**.                        |
| **14**     | **Endothelial / HEV-like (not lymphoid)**                      | **ECSCR/AGER**, TMEM47, USHBP1, P2RX4, KSR1                                                        | Vascular endothelium; misassigned to lymphoid.                                            |

---

TL;DR

* **True lymphoid:**

  * **T cells:** cluster **0** (naïve/central memory ± Tfh-like).
  * **Plasma cells:** clusters **1, 2, 5, 7** (note stromal/epithelial doublet risk in **2, 5**).
* **Myeloid (not lymphoid):** **8, 9, 10** (macrophages / neutrophils).
* **Non-lymphoid contamination:** epithelial/crypt/secretory (**4, 6, 11, 12**), endothelial (**4, 9, 14**), stromal CAF (**3, 5, 10**).
* **Artifacts/ambient:** **13**.


In [None]:
# Label for every `leiden_gex` cluster of the lymphoid compartment, keyed by
# the cluster id as a string (used with lymphadata.obs.leiden_gex.replace).
# Labels are listed in cluster order, so position i belongs to cluster str(i).
lymphoid_annotation = {
    str(cluster): label
    for cluster, label in enumerate([
        "T (naive/Tfh)",                 # 0
        "Plasma",                        # 1
        "Plasma",                        # 2
        "CAF",                           # 3
        "Noise",                         # 4
        "Noise",                         # 5
        "Goblet",                        # 6
        "Plasma (activated)",            # 7
        "TAM",                           # 8
        "Neutrophil",                    # 9
        "TAM",                           # 10
        "Epithelial (crypt/secretory)",  # 11
        "Epithelial (crypt/secretory)",  # 12
        "Noise",                         # 13
        "Endothelial",                   # 14
    ])
}

In [None]:
# Translate each cell's leiden_gex id through lymphoid_annotation and save the resulting per-cell labels as CSV.
lymphadata.obs.leiden_gex.replace(lymphoid_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/lymphoid_compt_annotation.csv')

## Myeloid

In [None]:
# Independent copy of the cells whose broad label is 'Myeloid'.
myeladata = adata[adata.obs.celltypes=='Myeloid'].copy()

In [None]:
# Condition label = last three characters of the sample id.
myeladata.obs['condition'] = myeladata.obs.sample_id.str[-3:]

# Scale every cell to a total of 10,000, then log1p-transform.
# NOTE(review): assumes X holds raw counts here (the per-sample files are saved with X reset to layers['counts']) — confirm.
sc.pp.normalize_total(myeladata, target_sum=10000)
sc.pp.log1p(myeladata)

sc.tl.pca(myeladata)

# Batch-balanced neighbour graph with sample_id as the batch key.
sc.external.pp.bbknn(myeladata, batch_key='sample_id')

sc.tl.umap(myeladata)

# UMAP panels coloured by condition, per-sample subtype label and sample.
sc.pl.umap(myeladata, color=['condition','cellsubtypes','sample_id',], ncols=2)

In [None]:
# Re-cluster the subset; labels are stored in obs['leiden_gex'].
sc.tl.leiden(myeladata, key_added='leiden_gex')

In [None]:
# UMAP coloured by the new cluster labels.
sc.pl.umap(myeladata, color=['leiden_gex'], ncols=2)

In [None]:
# Rank genes for each leiden_gex cluster using the Wilcoxon test.
sc.tl.rank_genes_groups(myeladata, groupby='leiden_gex', method='wilcoxon')

In [None]:
# One ranked-genes table per leiden_gex cluster, keyed by the cluster label.
dgex_myeloid = {
    cluster: sc.get.rank_genes_groups_df(myeladata, group=cluster)
    for cluster in myeladata.obs.leiden_gex.unique()
}

In [None]:
# Print each cluster label followed by the gene names in the first 50 rows of
# its ranking table. Series.ravel() is deprecated in recent pandas releases;
# to_numpy() returns the same 1-D array, so the printed output is unchanged.
for gr, diffg in dgex_myeloid.items():
    print(gr)
    print(diffg.head(50).names.to_numpy())

---

✅ True Myeloid Populations

**Cluster 1 – Monocytes/macrophages (classical-like)**

* **Markers:** LYZ, CD68, APOE, CST3, TYROBP
* **Interpretation:** Canonical phagocytic/antigen-presenting monocytes and macrophages, enriched for lysosomal genes.

**Cluster 7 – Macrophages (M2-like / immunoregulatory)**

* **Markers:** CD163, SELENOP, APOE, C1QC
* **Interpretation:** Tissue-resident macrophages with anti-inflammatory/M2-polarized features, often enriched in tumor-associated macrophages (TAMs).

**Cluster 8 – Tissue-resident macrophages (MRC1+, VSIG4+)**

* **Markers:** CD163, MRC1, VSIG4, FOLR2
* **Interpretation:** Classical tissue-resident macrophage program, immunoregulatory, likely stromal niche associated.

**Cluster 9 – TAM-like / C1QC+ macrophages**

* **Markers:** C1QA/B/C, APOE, LGMN, CST3, MS4A6A
* **Interpretation:** Canonical C1QC+ TAM program described in CRC and other cancers. High phagocytic/lysosomal activity.

**Cluster 11 – Neutrophils (inflammatory)**

* **Markers:** S100A8, S100A9, CXCL8, IL1B, OSM, FCAR
* **Interpretation:** Strong granulocytic inflammatory program, consistent with neutrophils or granulocytic-MDSCs.

---

🚫 Contamination / Non-myeloid

**Cluster 0 – B cell/T cell contamination**

* **Markers:** MS4A1, CD79A, IGHM, CXCL13, TCL1A
* **Interpretation:** Strong B cell signature; not myeloid.

**Cluster 2 – T cell contamination**

* **Markers:** TRAC, TRBC2, CD2, IL7R, CORO1A
* **Interpretation:** T cell transcriptome mixed into myeloid compartment.

**Cluster 3 – Epithelial/stromal contamination**

* **Markers:** EPCAM, CEACAM5/6, KRT8, COL1A1, COL3A1
* **Interpretation:** Epithelial crypt-like plus stromal matrix markers; doublets.

**Cluster 4 – CAF/stromal contamination**

* **Markers:** COL1A1/2, COL6A1/2/3, SPARC, VCAN
* **Interpretation:** Classic fibroblast/CAF signature, not immune.

**Cluster 6 – Epithelial secretory goblet-like contamination**

* **Markers:** MUC2, REG4, ITLN1, SPINK4, ZG16
* **Interpretation:** Goblet/secretory epithelium leakage.

**Cluster 10 – Endothelial contamination**

* **Markers:** VWF, ACKR1, CLDN5, EGFL7
* **Interpretation:** Vascular endothelial cell signature, not myeloid.

**Cluster 12 – CAF/stromal-like contamination**

* **Markers:** Collagens, POSTN, PDGFRA, TIMP1
* **Interpretation:** Fibroblast/CAF cluster.

**Clusters 5, 13, 14 – Erythroid contamination**

* **Markers:** HBA2, HBB, HBQ1
* **Interpretation:** Strong erythroid/hemoglobin signature.

**Cluster 15 – Epithelial secretory contamination (mucinous)**

* **Markers:** MUC2, REG4, CLCA1, SPINK4, BEST2
* **Interpretation:** Goblet/mucinous epithelial cell program, not myeloid.

---

✅ **Summary:**

* **True myeloid populations:** Clusters 1, 7, 8, 9, 11
* **Likely contamination/doublets:** All others (epithelial, stromal, lymphoid, erythroid, endothelial)


In [None]:
myeloid_annotation = {
    "0":  "B",   # CD79A, MS4A1, IGH; clearly lymphoid
    "1":  "Monocytes/macrophages",   # LYZ, CD68, APOE, CST3
    "2":  "T",              # TRAC, TRBC2, CD2, IL7R
    "3":  "Epithelial",  # EPCAM, CEACAM5/6, KRT8, COL genes
    "4":  "CAF",         # High collagen, SPARC, VCAN
    "5":  "Erythroid",           # HBA2, HBB
    "6":  "Goblet", # MUC2, REG4, ITLN1
    "7":  "Macrophages (M2-like)",               # CD163, SELENOP, APOE
    "8":  "Macrophages (tissue resident MRC1+)", # CD163, MRC1, VSIG4, FOLR2
    "9":  "Macrophages (TAM inflammatory)",      # C1QC+, LYZ, APOE
    "10": "Endothelial",         # ACKR1, VWF, CLDN5
    "11": "Neutrophils",          # S100A8/9, CXCL8, IL1B
    "12": "CAF",    # Collagen, POSTN, PDGFRA
    "13": "Erythroid",           # HBB, HBA2
    "14": "Erythroid",           # HBB, HBA2
    "15": "Goblet" # MUC2, REG4, CLCA1, goblet-like
}


In [None]:
myeladata.obs.leiden_gex.replace(myeloid_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/myeloid_compt_annotation.csv')

## Stromal

In [None]:
stromadata = adata[adata.obs.celltypes=='Stromal'].copy()

In [None]:
stromadata.obs['condition'] = stromadata.obs.sample_id.str[-3:]

sc.pp.normalize_total(stromadata, target_sum=10000)
sc.pp.log1p(stromadata)

sc.tl.pca(stromadata)

sc.external.pp.bbknn(stromadata, batch_key='sample_id')

sc.tl.umap(stromadata)

sc.pl.umap(stromadata, color=['condition','cellsubtypes','sample_id',], ncols=2, wspace=0.5)

In [None]:
sc.tl.leiden(stromadata, key_added='leiden_gex')

In [None]:
sc.pl.umap(stromadata, color=['leiden_gex'], ncols=2)

In [None]:
sc.tl.rank_genes_groups(stromadata, groupby='leiden_gex', method='wilcoxon')

In [None]:
dgex_stromal = {}
for gr in stromadata.obs.leiden_gex.unique():
    dgex_stromal[gr] = sc.get.rank_genes_groups_df(stromadata, group=gr)

In [None]:
for gr, diffg in dgex_stromal.items():
    print(gr)
    print(diffg.head(50).names.ravel())

Why these calls (quick rationale)

0 — Smooth_muscle_vSMC: classic contractile set (TAGLN, MYH11, ACTA2, CNN1, MYLK).

1 — iCAF (inflammatory CAF): ECM genes (COL1/3) plus cytokines/chemokines (IL6, CXCL8/2, CCL3/4/17/22, OSM), TNFAIP3 → iCAF profile.

2 — Pericytes/vSMC-like: contractile core (TAGLN, MYH11, MYL9) with pericyte markers (RGS5, PRIMA1, PLP1).

3 — myCAF/ECM-remodeling CAF: strong matrix remodeling (COL1/3/5/6, SPARC, THBS2, POSTN, IGFBP7, TIMP1/2/3).

4 — Matrix fibroblasts (resting CAF/adventitial): DCN, LUM, MGP, FBLN1, SFRP1/2, CXCL12 — less inflammatory, ECM-homeostatic.

5 — Plasma-cell/perivascular doublets: JCHAIN, IGKC, XBP1, MZB1 (plasma) together with PECAM1/VWF/ENG/AQP1 (endothelium) → doublets near vessels.

6 — Activated SMC/myofibroblast: contractile SMC genes with immediate-early/stress (EGR1, JUN, NR4A1) and CCN1/2.

7 — Plasma cells in CAF background: strong Ig/plasma program layered on COL/SPARC — likely plasma infiltrate or doublets within CAF-rich regions.

8 — Epithelial absorptive contamination: SLC26A3, CEACAM7, ALPI, MUC4/12, SCNN1A, PIGR → colonocyte/enterocyte.

9 — Epithelial absorptive/goblet contamination: PIGR, MUC12, ZG16, FCGBP, FABP1 — secretory/absorptive epithelium.

10 — Epithelial proliferative contamination: ESRP1, TOP2A, TMPRSS4, AGR2 with absorptive markers → cycling epithelium.

11 — Epithelial proliferative (ERBB2-high) contamination: ERBB2, WEE1, ANO9, MUC12 — tumor-like cycling epithelium.

12 — Epithelial-like low-grade contamination: epithelial adhesion/processing (LGALS4, NECTIN1, PGRMC2), mild cycling.

13 — Ambient/low-quality epithelial contamination: CEACAM5, PIGR, TM9SF3 with scattered germline/olfactory repertoire — typical ambient RNA mix.

In [None]:
stromal_annotation = {
    "0":  "vSMC",                          # TAGLN, MYH11, ACTA2, CNN1, MYLK
    "1":  "iCAF",                       # COL1/3, IL6, CXCL8, CXCL2, TNFAIP3, OSM, CCL3/4/17/22
    "2":  "Pericytes/vSMC",                         # TAGLN, MYH11, MYL9, RGS5, PRIMA1, PLP1
    "3":  "myCAF",                    # COL1/3/5/6, SPARC, THBS2, POSTN, IGFBP7, TIMP1/2/3
    "4":  "Matrix fibroblasts resting",              # DCN, LUM, MGP, FBLN1, SFRP1/2, CXCL12
    "5":  "Perivascular",          # JCHAIN, IGKC/IGLC, XBP1, MZB1 + PECAM1/VWF/ENG
    "6":  "Activated SMC/myofibroblast",                 # TAGLN, ACTA2, MYH11 + IEGs (EGR1, JUN, NR4A1), CCN1/2
    "7":  "Plasma",              # Ig genes (IGHG1/IGHA1/JCHAIN) + COL1/3/6, SPARC
    "8":  "Absorptive colon epithelium",         # SLC26A3, CEACAM7, ALPI, MUC4/12, SCNN1A, PIGR
    "9":  "Goblet",  # PIGR, MUC12, ZG16, FCGBP, FABP1, SLC26A3
    "10": "Epithelial proliferative",      # CA1, ESRP1, TOP2A, TMPRSS4, AGR2, FABP1
    "11": "Epithelial proliferative",  # ERBB2, WEE1, ANO9, MUC12; cycling/mt-high epithelial
    "12": "Epithelial low-grade",      # LGALS4, NECTIN1, PGRMC2; epithelial/cycling skew
    "13": "Noise"  # CEACAM5, PIGR, TM9SF3 + many germline/ORs (ambient)
}


In [None]:
stromadata.obs.leiden_gex.replace(stromal_annotation).value_counts()

In [None]:
stromadata.obs.leiden_gex.replace(stromal_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/stromal_compt_annotation.csv')

## Epithelial

In [None]:
epadata = adata[adata.obs.celltypes=='Epithelial'].copy()

In [None]:
epadata.obs['condition'] = epadata.obs.sample_id.str[-3:]

sc.pp.normalize_total(epadata, target_sum=10000)
sc.pp.log1p(epadata)

sc.tl.pca(epadata)

sc.external.pp.bbknn(epadata, batch_key='sample_id')

sc.tl.umap(epadata)

sc.pl.umap(epadata, color=['condition','cellsubtypes','sample_id',], ncols=2, wspace=0.5)

In [None]:
sc.tl.leiden(epadata, key_added='leiden_gex')

In [None]:
sc.pl.umap(epadata, color=['leiden_gex'], ncols=2)

In [None]:
sc.tl.rank_genes_groups(epadata, groupby='leiden_gex', method='wilcoxon')

In [None]:
dgex_epithelial = {}
for gr in epadata.obs.leiden_gex.unique():
    dgex_epithelial[gr] = sc.get.rank_genes_groups_df(epadata, group=gr)

In [None]:
for gr, diffg in dgex_epithelial.items():
    print(gr)
    print(diffg.head(50).names.ravel())

Quick rationale per cluster

* **0 – Crypt stem/TA early secretory:** OLFM4 (stem/TA), plus **REG4, MUC2, ITLN1, PIGR** → early secretory-primed crypt cells.
* **1 – Non-epithelial mix:** HOXD10–13, **CTSG/OSCAR (mast/granulocyte)**, RELN/MMRN2 (neuronal/endothelial) → ambient/mixed contamination.
* **2 – Goblet cells:** **MUC2, ZG16, FCGBP, PIGR, SPINK4, TFF3** — canonical goblet/secretory program.
* **3 – TA absorptive:** **SLC12A2**, KIAA1324, WFDC2, AGR2, translation-high → dividing pre-absorptive epithelium.
* **4 – Mature colonocytes I:** **SLC26A3, GUCA2A/B, CEACAM7, KRT20, CLCA4** — fully differentiated colonocytes.
* **5 – Plasma/fibro-SMC contamination:** Ig genes + **COL1/3/6, TAGLN/ACTA2** → not pure epithelium.
* **6 – Mature colonocytes II:** **SLC26A3, CEACAM7, GUCA2A, MS4A12, TMPRSS2, TMIGD1** — enterocyte subset.
* **7 – Absorptive colonocytes (CA+):** **CA1/CA2, FABP1, PHGR1, SELENBP1** — metabolic/absorptive profile.
* **8 – Plasma cell contamination:** **IGKC/JCHAIN/IGHA1/XBP1/MZB1** dominate.
* **9 – Injury-response epithelium:** **LCN2**-high with **MUC1/5B, AGR2, STARD10** — regenerative/stress response.
* **10 – Immune/granulocyte contamination:** **CXCL8, S100A8, TRDC, GZMA, FCAR, CSF3R** — non-epithelial.

In [None]:
epithelial_annotation = {
    "0": "Stem/TA",        # OLFM4+, REG4+, MUC2+, ITLN1+, PIGR+ (stem/TA with secretory bias)
    "1": "Noise",         # HOXD10/11/12/13, CTSG, OSCAR, RELN, CCL21 (mixed immune/neuronal/stromal; not epithelial)
    "2": "Goblet",                          # MUC2, ZG16, FCGBP, PIGR, SPINK4, TFF3 (classic goblet/secretory)
    "3": "TA absorptive epithelium",              # SLC12A2, KIAA1324, WFDC2, AGR2, OLFM4low (transit-amplifying, pre-absorptive)
    "4": "Mature colonocytes I",                  # SLC26A3, GUCA2A/B, CEACAM7, KRT20, CLCA4 (differentiated colonocytes)
    "5": "Noise",         # IGKC/JCHAIN/XBP1 (plasma) + COL1/3/6, TAGLN/ACTA2 (stromal/SMC)
    "6": "Mature colonocytes II",                 # SLC26A3, CEACAM7, GUCA2A, MS4A12, TMPRSS2, TMIGD1 (enterocyte subset)
    "7": "Absorptive colonocytes CA+",          # CA1/CA2, FABP1, PHGR1, SELENBP1, KRT20 (absorptive program)
    "8": "Plasma",             # IG genes (IGKC/IGHA1/JCHAIN/XBP1/MZB1) with stress genes
    "9": "Epithelial injury response (LCN2+)",     # LCN2, MUC1/5B, AGR2, STARD10 (regenerative/injury-responsive epithelium)
    "10":"Neutrophil",      # CXCL8, S100A8, GZMA, TRDC, FCAR, CSF3R (not epithelial)
}

In [None]:
epadata.obs.leiden_gex.replace(epithelial_annotation).value_counts()

In [None]:
epadata.obs.leiden_gex.replace(epithelial_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/epithelial_compt_annotation.csv')

In [None]:
adata.obs.celltypes.value_counts()

## Vascular

In [None]:
vasadata = adata[adata.obs.celltypes=='Vascular'].copy()

In [None]:
vasadata.obs['condition'] = vasadata.obs.sample_id.str[-3:]

sc.pp.normalize_total(vasadata, target_sum=10000)
sc.pp.log1p(vasadata)

sc.tl.pca(vasadata)

sc.external.pp.bbknn(vasadata, batch_key='sample_id')

sc.tl.umap(vasadata)

sc.pl.umap(vasadata, color=['condition','cellsubtypes','sample_id',], ncols=2, wspace=0.5)

In [None]:
sc.tl.leiden(vasadata, key_added='leiden_gex')

In [None]:
sc.pl.umap(vasadata, color=['leiden_gex'], ncols=2)

In [None]:
sc.tl.rank_genes_groups(vasadata, groupby='leiden_gex', method='wilcoxon')

In [None]:
dgex_vascular = {}
for gr in vasadata.obs.leiden_gex.unique():
    dgex_vascular[gr] = sc.get.rank_genes_groups_df(vasadata, group=gr)

In [None]:
for gr, diffg in dgex_vascular.items():
    print(gr)
    print(diffg.head(50).names.ravel())


🧩 Classification of vascular compartment clusters

* **Cluster 0 – Perivascular fibroblasts / vascular smooth muscle cells (vSMC)**
  COL1A1/2, COL3A1, COL6A1/2/3, ACTA2, TAGLN, PDGFRB, NOTCH3, CALD1 → classic mural / pericyte/vSMC identity.

* **Cluster 1 – Myeloid/macrophage contamination**
  CD14, CD68, CSF1R, C1QA/B/C, FCER1G, LYZ, TYROBP → macrophage/monocyte genes, not vascular.

* **Cluster 2 – Endothelial cells (venous/activated)**
  PECAM1, VWF, PLVAP, ENG, CLDN5, KLF2, EPAS1, ACKR1 → vascular endothelium.

* **Cluster 3 – Endothelial cells (capillary/angiogenic)**
  PECAM1, VWF, PLVAP, ENG, EGFL7, SPARCL1, AQP1, CALCRL → strong vascular endothelial profile.
  Ig genes (IGKC, IGHA1, etc.) = likely **plasma contamination** within endothelial neighborhood.

* **Cluster 4 – Granulocyte contamination**
  HBA1/2, HBB, S100A8/9/12, CSF3R, PPBP, CXCR1/2 → neutrophil signature (HBA/HBB and PPBP indicate erythrocyte/platelet carry-over).

* **Cluster 5 – Vascular smooth muscle cells (arterial)**
  ACTA2, TAGLN, MYH11, MYL9, CNN1, LMOD1, PDGFRB → contractile vSMC.

* **Cluster 6 – Endothelial with plasma contamination**
  PECAM1, VWF, PLVAP, ENG + strong Ig genes (IGHM, IGKC, JCHAIN, IGHA1) → **mixed endothelial–plasma cluster**.

* **Cluster 7 – Endothelial cells (arteriovenous signaling)**
  PECAM1, EPAS1, KLF2, CXCL12, JAG1, CLDN5, EFNB2, PODXL → endothelial with arterial/angiocrine features.

* **Cluster 8 – Mast cell / basophil contamination**
  TPSD1, SIGLEC6, HPGDS, CMA1, KIT → classic mast/basophil program. Not vascular.

* **Cluster 9 – Mast cells (active)**
  KIT, CMA1, MS4A2, SIGLEC8, HPGDS, RGS13 → canonical mast cell signature.

* **Cluster 10 – Lymphatic endothelium**
  PROX1, LYVE1, MMRN1, FLT4, TFPI, PDPN, ACKR2 → lymphatic EC.

* **Cluster 11 – Mixed contamination (erythroid + immune + epithelial)**
  HBA/HBB, MS4A12, CEACAM7, MARCO, TRDC, MUC4 → not a pure vascular cluster. Likely **noise/doublets**.


In [None]:
vascular_annotation = {
    "0":  "Perivascular fibroblast",
    "1":  "Macrophages (tissue resident C1Q+)",
    "2":  "Endothelial (venous)",
    "3":  "Endothelial (capillary)",
    "4":  "Neutrophil",
    "5":  "vSMC",
    "6":  "Endothelial",
    "7":  "Endothelial (arteriovenous)",
    "8":  "Mast",
    "9":  "Mast",
    "10": "Endothelial (lymphatic)",
    "11": "Noise",
}


In [None]:
vasadata.obs.leiden_gex.replace(vascular_annotation).value_counts()

In [None]:
vasadata.obs.leiden_gex.replace(vascular_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/vascular_compt_annotation.csv')

## Glia

In [None]:
gliadata = adata[adata.obs.celltypes=='Glia'].copy()

In [None]:
gliadata.obs['condition'] = gliadata.obs.sample_id.str[-3:]

sc.pp.normalize_total(gliadata, target_sum=10000)
sc.pp.log1p(gliadata)

sc.tl.pca(gliadata)

sc.external.pp.bbknn(gliadata, batch_key='sample_id')

sc.tl.umap(gliadata)

sc.pl.umap(gliadata, color=['condition','cellsubtypes','sample_id',], ncols=2, wspace=0.5)

In [None]:
sc.tl.leiden(gliadata, key_added='leiden_gex')

In [None]:
sc.pl.umap(gliadata, color=['leiden_gex'], ncols=2)

In [None]:
sc.tl.rank_genes_groups(gliadata, groupby='leiden_gex', method='wilcoxon')

In [None]:
dgex_glia = {}
for gr in gliadata.obs.leiden_gex.unique():
    dgex_glia[gr] = sc.get.rank_genes_groups_df(gliadata, group=gr)

In [None]:
for gr, diffg in dgex_glia.items():
    print(gr)
    print(diffg.head(50).names.ravel())

* **0 – Schwann / enteric glia (non-myelinating)**
  PLP1, S100B, SOX10, MPZ, MAL, PMP22, LGI4, PRIMA1 → classic peripheral glia; glia ECM/adhesion (ITGB8, HSPG2), trophic factors (APOD).

* **1 – Mixed contamination (B/plasma + endothelial/pericyte)**
  IGKC/JCHAIN (plasma/B), IL7R; endothelial/perivascular markers (PECAM1, VWF, RGS5) → not glia.

* **2 – Smooth muscle cells (contamination)**
  TAGLN, MYH11, ACTA2, CNN1, MYLK → visceral SMC, not glia.

* **3 – Smooth muscle / myofibroblast (contamination)**
  TAGLN, MYH11, ACTA2, SYNM, CNN1, ITGA5 → SMC/myofibroblast program.

* **4 – Schwann / enteric glia (myelinating-leaning)**
  PLP1, MPZ, PMP22, S100B, LGI4, PRIMA1, CDH19; ECM/adhesion (ITGB4, HSPG2).

* **5 – Enteric neurons (contamination)**
  VIP/CALB2/SCGN/UCHL1/STMN2, synaptic genes (SYT1, SNAP25), neuronal structural (MAP1B, RTN1/3) → neurons, not glia.

* **6 – Perineurial-/fibroblast-like cells (glia-adjacent) with plasma/epi trace**
  ECM/FB (DCN, COL3A1, CCN1/2, DKK3, FN1, IGFBP7), glia-adjacent (SCN7A, NRXN1), plus IGKC/JCHAIN and a few epithelial hits → stromal/perineurial; not neuronal.

* **7 – Endothelial cells (arterial/activated; contamination)**
  CLDN5, KLF2, EPAS1, EGFL7, ICAM2, EFNB2, PTPRB, S1PR1, CXCL12 → vascular endothelium.

* **8 – Enteric neurons (VIP+/peptidergic; contamination)**
  VIP, SCGN, CALB2, SCG2, ELAVL4, SYT1, PRPH, ENO2 → neurons.

* **9 – Enteric neurons (TAC1/SST+; contamination)**
  SNAP25, SCG2, TAC1, SST, PRPH, SNCG → peptidergic neurons.



In [None]:
glia_annotation = {
    "0": "Schwann enteric glia non-myelinating",
    "1": "Noise",
    "2": "SMC",
    "3": "vSMC/myofibroblast",
    "4": "Schwann enteric glia myelinating",
    "5": "Enteric neuron",
    "6": "Glia-adjacent fibroblast",
    "7": "Endothelial (arterial)",
    "8": "Enteric neuron",
    "9": "Enteric neuron"
}


In [None]:
gliadata.obs.leiden_gex.replace(glia_annotation).value_counts()

In [None]:
gliadata.obs.leiden_gex.replace(glia_annotation).to_csv('../../../Broad_SpatialFoundation/VisiumHD-CRC/glia_compt_annotation.csv')

# Combine cell types

In [None]:
# Read every per-compartment annotation CSV (first column is the index) and
# stack them into a single one-column table.
compartments = ('malignant', 'lymphoid', 'myeloid', 'stromal', 'vascular', 'glia', 'epithelial')
cell_types = pd.concat(
    [
        pd.read_csv(f'../../../Broad_SpatialFoundation/VisiumHD-CRC/{ct}_compt_annotation.csv', index_col=0)
        for ct in compartments
    ]
)

# Give the single label column its final name.
cell_types.columns = ['refined_cellsubtypes']

In [None]:
# Attach the refined subtype labels to obs, matched on the cell index.
# Assigning one named column (instead of concatenating onto obs) keeps this
# cell safe to re-run: a second execution overwrites the column rather than
# appending a duplicate 'refined_cellsubtypes', which would turn later
# `adata.obs.refined_cellsubtypes` accesses into a DataFrame.
# .loc still raises KeyError if any cell in adata has no annotation row.
adata.obs['refined_cellsubtypes'] = cell_types.loc[adata.obs_names, 'refined_cellsubtypes'].to_numpy()

In [None]:
# Harmonisation table: raw per-compartment label -> unified "<family>—<subtype>"
# label. Several raw spellings deliberately collapse onto the same unified label.
unified_annotation = {
    # --- noise / QC ---
    "Noise": "Noise",

    # --- epithelium (non-malignant labels) ---
    "Epithelial": "Epithelial‚Äîunspecified",
    "Epithelial (crypt/secretory)": "Epithelial‚Äîcrypt/secretory",
    "Stem/TA": "Epithelial‚ÄîStem/TA",
    "TA absorptive epithelium": "Epithelial‚ÄîTA (pre-absorptive)",
    "Epithelial proliferative": "Epithelial‚Äîproliferative",
    "Epithelial low-grade": "Epithelial‚Äîlow-grade/dysplastic",
    "Secretory crypt (dysplastic)": "Epithelial‚Äîlow-grade/dysplastic",
    "Goblet": "Epithelial‚ÄîGoblet/secretory",
    "Mature colonocytes I": "Epithelial‚ÄîMature colonocyte I",
    "Mature colonocytes II": "Epithelial‚ÄîMature colonocyte II",
    "Absorptive colon epithelium": "Epithelial‚ÄîMature colonocyte (absorptive)",
    "Absorptive colonocytes CA+": "Epithelial‚ÄîMature colonocyte (CA+)",
    "Epithelial injury response (LCN2+)": "Epithelial‚Äîinjury/regenerative (LCN2+)",

    # --- malignant epithelium ---
    "Malignant (LCN2+/injury)": "Malignant‚Äîinjury/regenerative (LCN2+)",
    "Malignant (EGFR/ERBB)": "Malignant‚ÄîEGFR/ERBB",
    "Malignant (EGFR/invasive)": "Malignant‚ÄîEGFR/invasive",
    "Malignant TA-like (absorptive/crypt)": "Malignant‚ÄîTA-like (absorptive/crypt)",
    "Malignant TA-like (secretory/REG+)": "Malignant‚ÄîTA-like (secretory/REG‚Å∫)",
    "Malignant (stress/proliferative/translation)": "Malignant‚Äîstress/proliferative/translation",
    "Malignant (stress/chaperone-high)": "Malignant‚Äîstress/chaperone-high",
    "Malignant (hypoxia/stress/metabolic)": "Malignant‚Äîhypoxia/stress/metabolic",

    # --- lymphoid: B / plasma / T ---
    "B": "B lineage‚ÄîB cell",
    "B/Plasma (activated B/GC-like)": "B lineage‚Äîactivated B / GC-like",
    "B/Plasma": "B lineage‚Äîmixed B/Plasma",  # retained in case the older label is still in use
    "Plasma": "B lineage‚ÄîPlasma cell",
    "Plasma (activated)": "B lineage‚ÄîPlasma cell (activated)",
    "T (naive/Tfh)": "T cell‚Äînaive/Tfh",
    "T": "T cell‚Äîunspecified",

    # --- myeloid and other blood-derived cells ---
    "Neutrophil": "Neutrophil",
    "Neutrophils": "Neutrophil",  # plural spelling used by the myeloid compartment
    "Monocytes/macrophages": "Macrophage/TAM‚Äîunspecified",
    "Macrophages (tissue resident MRC1+)": "Macrophage‚Äîtissue-resident (MRC1‚Å∫)",
    "Macrophages (tissue-resident C1Q+)": "Macrophage‚Äîtissue-resident (C1Q‚Å∫)",
    "Macrophages (tissue resident C1Q+)": "Macrophage‚Äîtissue-resident (C1Q‚Å∫)",  # un-hyphenated spelling
    "Macrophages (M2-like)": "Macrophage/TAM‚ÄîM2-like",
    "TAM": "Macrophage/TAM‚Äîunspecified",
    "TAM (TREM1+/SPP1+)": "Macrophage/TAM‚ÄîTREM1‚Å∫/SPP1‚Å∫",
    "Macrophages (TAM inflammatory)": "Macrophage/TAM‚Äîinflammatory",
    "Erythroid": "Erythroid",
    "Mast": "Mast cell",

    # --- endothelium ---
    "Endothelial": "Endothelial‚Äîunspecified",
    "Endothelial (capillary)": "Endothelial‚Äîcapillary",
    "Endothelial (arteriovenous)": "Endothelial‚Äîarteriovenous",
    "Endothelial (arterial)": "Endothelial‚Äîarterial",
    "Endothelial (venous)": "Endothelial‚Äîvenous",
    "Endothelial (lymphatic)": "Endothelial‚Äîlymphatic",

    # --- perivascular: pericytes and smooth muscle ---
    "Perivascular": "Perivascular‚Äîpericyte/SMC",
    "Pericytes/vSMC": "Perivascular‚Äîpericyte/SMC",
    "vSMC": "Perivascular‚Äîvascular SMC",
    "SMC": "Perivascular‚Äîvascular SMC",
    "Activated SMC/myofibroblast": "Perivascular‚Äîactivated SMC/myofibroblast",
    "vSMC/myofibroblast": "Perivascular‚ÄîSMC/myofibroblast",

    # --- fibroblasts / CAFs ---
    "CAF": "Fibroblast‚ÄîCAF (unspecified)",
    "myCAF": "Fibroblast‚ÄîmyCAF",
    "iCAF": "Fibroblast‚ÄîiCAF",
    "Matrix fibroblasts resting": "Fibroblast‚Äîmatrix (resting)",
    "Perivascular fibroblast": "Fibroblast‚Äîperivascular-like",
    "Glia-adjacent fibroblast": "Fibroblast‚Äîglia-adjacent",

    # --- neurons and glia ---
    "Enteric neuron": "Enteric neuron",
    "Schwann enteric glia myelinating": "Glia‚ÄîSchwann (myelinating)",
    "Schwann enteric glia non-myelinating": "Glia‚ÄîSchwann (non-myelinating)",
}


In [None]:
# Unified subtype label -> coarse lineage. Each lineage is written as one
# group of subtype labels sharing a single target value.
lineage_map = {
    # Noise / QC
    "Noise": "Noise",

    # Malignant epithelium
    **dict.fromkeys(
        [
            "Malignant‚Äîinjury/regenerative (LCN2+)",
            "Malignant‚ÄîEGFR/ERBB",
            "Malignant‚ÄîEGFR/invasive",
            "Malignant‚ÄîTA-like (absorptive/crypt)",
            "Malignant‚ÄîTA-like (secretory/REG‚Å∫)",
            "Malignant‚Äîstress/proliferative/translation",
            "Malignant‚Äîstress/chaperone-high",
            "Malignant‚Äîhypoxia/stress/metabolic",
        ],
        "Malignant",
    ),

    # Non-malignant epithelium
    **dict.fromkeys(
        [
            "Epithelial‚Äîunspecified",
            "Epithelial‚Äîcrypt/secretory",
            "Epithelial‚ÄîStem/TA",
            "Epithelial‚ÄîTA (pre-absorptive)",
            "Epithelial‚Äîproliferative",
            "Epithelial‚Äîlow-grade/dysplastic",
            "Epithelial‚ÄîGoblet/secretory",
            "Epithelial‚ÄîMature colonocyte I",
            "Epithelial‚ÄîMature colonocyte II",
            "Epithelial‚ÄîMature colonocyte (absorptive)",
            "Epithelial‚ÄîMature colonocyte (CA+)",
            "Epithelial‚Äîinjury/regenerative (LCN2+)",
        ],
        "Epithelial",
    ),

    # Lymphoid
    **dict.fromkeys(
        [
            "B lineage‚ÄîB cell",
            "B lineage‚Äîactivated B / GC-like",
            "B lineage‚Äîmixed B/Plasma",
            "B lineage‚ÄîPlasma cell",
            "B lineage‚ÄîPlasma cell (activated)",
            "T cell‚Äînaive/Tfh",
            "T cell‚Äîunspecified",
        ],
        "Lymphoid",
    ),

    # Myeloid / hematopoietic (erythroid and mast cells are grouped here too)
    **dict.fromkeys(
        [
            "Neutrophil",
            "Macrophage/TAM‚Äîunspecified",
            "Macrophage‚Äîtissue-resident (MRC1‚Å∫)",
            "Macrophage‚Äîtissue-resident (C1Q‚Å∫)",
            "Macrophage/TAM‚ÄîM2-like",
            "Macrophage/TAM‚ÄîTREM1‚Å∫/SPP1‚Å∫",
            "Macrophage/TAM‚Äîinflammatory",
            "Erythroid",
            "Mast cell",
        ],
        "Myeloid",
    ),

    # Endothelial / vascular
    **dict.fromkeys(
        [
            "Endothelial‚Äîunspecified",
            "Endothelial‚Äîcapillary",
            "Endothelial‚Äîarteriovenous",
            "Endothelial‚Äîarterial",
            "Endothelial‚Äîvenous",
            "Endothelial‚Äîlymphatic",
        ],
        "Endothelial",
    ),

    # Perivascular / SMC / pericyte
    **dict.fromkeys(
        [
            "Perivascular‚Äîpericyte/SMC",
            "Perivascular‚Äîvascular SMC",
            "Perivascular‚Äîactivated SMC/myofibroblast",
            "Perivascular‚ÄîSMC/myofibroblast",
        ],
        "Perivascular/SMC",
    ),

    # Fibroblasts / CAF
    **dict.fromkeys(
        [
            "Fibroblast‚ÄîCAF (unspecified)",
            "Fibroblast‚ÄîmyCAF",
            "Fibroblast‚ÄîiCAF",
            "Fibroblast‚Äîmatrix (resting)",
            "Fibroblast‚Äîperivascular-like",
            "Fibroblast‚Äîglia-adjacent",
        ],
        "Fibroblast/CAF",
    ),

    # Neural / Glia
    "Enteric neuron": "Neural",
    **dict.fromkeys(
        [
            "Glia‚ÄîSchwann (myelinating)",
            "Glia‚ÄîSchwann (non-myelinating)",
        ],
        "Glia",
    ),
}


In [None]:
adata.obs.refined_cellsubtypes = adata.obs.refined_cellsubtypes.replace(unified_annotation)

In [None]:
adata.obs['refined_celltypes'] = adata.obs.refined_cellsubtypes.replace(lineage_map)

In [None]:
adata.obs

In [None]:
adata.obs.to_csv('full_CRC_obs.csv')