In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import pathlib as pl

In [None]:
# Root directory of the processed HEST samples; each sample is read from its own
# sub-directory (<base_dir>/<sample_id>/adata.h5ad).
# NOTE(review): relative path, so it resolves against the kernel's working directory.
base_dir = pl.Path('../../../Broad_SpatialFoundation/hest_processed_data/')

In [None]:
def preprocess_adata(base_dir, sample_id, n_cut=10):
    """Load one sample and run the standard scanpy embedding pipeline.

    Reads ``<base_dir>/<sample_id>/adata.h5ad``, keeps observations with at least
    ``n_cut`` total counts, stores a copy of the filtered matrix in
    ``layers['counts']``, then normalises each observation to 10,000 counts,
    applies ``log1p`` and computes PCA, the neighbour graph and UMAP with
    scanpy's default settings.

    Parameters
    ----------
    base_dir : str or pathlib.Path
        Directory containing one sub-directory per sample.
    sample_id : str
        Name of the sample sub-directory.
    n_cut : int, default 10
        Minimum ``obs['total_counts']`` an observation needs to be kept.

    Returns
    -------
    AnnData
        The filtered, normalised object with PCA, neighbours and UMAP computed.
    """
    # Path() makes a plain string base_dir work as well as a Path.
    adata = sc.read_h5ad(pl.Path(base_dir) / sample_id / 'adata.h5ad')

    # Fall back to summing X when the QC column was not precomputed, instead of
    # failing with an AttributeError on the filter below.
    if 'total_counts' not in adata.obs:
        adata.obs['total_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()

    adata = adata[adata.obs['total_counts'] >= n_cut].copy()

    # Keep the un-normalised matrix before X is overwritten in place.
    adata.layers['counts'] = adata.X.copy()

    sc.pp.normalize_total(adata, target_sum=10000)
    sc.pp.log1p(adata)

    sc.tl.pca(adata)

    sc.pp.neighbors(adata)

    sc.tl.umap(adata)
    return adata

# 'TENX122'

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'TENX122'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Display the AnnData summary.
adata

In [None]:
# Print the 25 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(25)['names'].to_numpy())

ChatGPT answer: Here’s a proposed cell type annotation for each cluster based on the overexpressed genes:

---

### **Cluster 0**

**Likely Cell Type:** *Basal epithelial cells*
**Rationale:** Overexpression of **CLCA2**, **GATM**, **EGFR**, **ERBB2**, **EHF**, and **LY6D** suggests epithelial lineage with proliferative potential.

---

### **Cluster 1**

**Likely Cell Type:** *Proliferating epithelial cells / transit amplifying cells*
**Rationale:** Presence of **MKI67**, **PCNA**, **TOP2A**, **CENPF** indicates proliferation; **EGFR**, **APCDD1**, **COL17A1** also support epithelial identity.

---

### **Cluster 2**

**Likely Cell Type:** *Stromal fibroblasts / cancer-associated fibroblasts (CAFs)*
**Rationale:** High expression of **FBN1**, **VCAN**, **PDGFRB**, **THY1**, **SFRP2**, **COL5A2**, and **FBLN1** is typical of stromal/mesenchymal cells.

---

### **Cluster 3**

**Likely Cell Type:** *Endothelial cells (blood vessels)*
**Rationale:** Strong expression of **PECAM1 (CD31)**, **CD34**, **VWF**, **ADGRL4**, **EGFL7**, **RAMP2**, **MMRN1** confirms vascular endothelial identity.

---

### **Cluster 4**

**Likely Cell Type:** *Myeloid cells / macrophages and dendritic cells*
**Rationale:** Expression of **PTPRC (CD45)**, **CD14**, **CD163**, **CD1C**, **SPI1**, **MRC1**, **AIF1**, **LYVE1** and **MPEG1** points to myeloid lineage including macrophages and dendritic cells.

---

### **Cluster 5**

**Likely Cell Type:** *Secretory epithelial cells (likely glandular)*
**Rationale:** Overexpression of **MALL**, **PRDM1**, **AQP9**, **SERPINB2**, **SERPINB3**, **TAC1**, **EGFR**, **ELF5**, and **ERBB2** suggests secretory epithelial cells, potentially luminal progenitors or glandular cells.

---

### **Clusters 6 to 12 (Grouped Together)**

**Likely Cell Type:** *Smooth muscle cells / myofibroblasts*
**Rationale:** Strong and consistent expression across clusters of **MYH11**, **ACTA2**, **CNN1**, **DES**, **ACTG2**, **MYLK**, **ASPN**, **TNC**, **MEF2C**, **INMT**, **KCNMA1**, and **PLIN4** identifies these as smooth muscle or myofibroblastic cells. Minor variations may reflect different activation states or anatomical niches (e.g., pericytes vs. muscularis propria).

---

Let me know if you want dot plots, enrichment scores, or hierarchical classification to further support these assignments.


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample (clusters 6-12 all map to SMC).
subtype_by_cluster = {
    '0': 'Basal epithelial', '1': 'Proliferating epithelial', '2': 'CAF', '3': 'Endothelial',
    '4': 'Macrophage', '5': 'Secretory epithelial', '6': 'SMC', '7': 'SMC', '8': 'SMC',
    '9': 'SMC', '10': 'SMC', '11': 'SMC', '12': 'SMC',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
# Coarse lineage label for each Leiden cluster of this sample.
type_by_cluster = {
    '0': 'Epithelial', '1': 'Epithelial', '2': 'Mesenchymal', '3': 'Endothelial',
    '4': 'Myeloid', '5': 'Epithelial', '6': 'Mesenchymal', '7': 'Mesenchymal', '8': 'Mesenchymal',
    '9': 'Mesenchymal', '10': 'Mesenchymal', '11': 'Mesenchymal', '12': 'Mesenchymal',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['celltypes'] = (
    adata.obs['leiden'].astype(str).replace(type_by_cluster).astype('category')
)

In [None]:
# Write both annotation levels (with the obs index as first column, the pandas default)
# into the sample's directory, alongside adata.h5ad.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX120

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'TENX120'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 25 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(25)['names'].to_numpy())

ChatGPT answer: Here are the likely **cell type assignments** for each cluster based on marker gene enrichment and your note that the sample is from **liver cancer** (likely hepatocellular carcinoma, HCC):

---

### **Cluster 0**

**Likely Cell Type:** *Hepatocytes / Tumor cells*
**Rationale:** High expression of **GPC3**, **HMGCS2**, **UGP2**, **GSTA1**, **HPX**, and **CFHR genes** is typical of hepatocytes or HCC tumor cells. **GPC3** is a well-established HCC marker.

---

### **Cluster 1**

**Likely Cell Type:** *Proliferating hepatocytes / Tumor cells*
**Rationale:** Similar hepatocyte markers as cluster 0, plus **CENPF**, **PCNA**, **PPP1R1A** suggesting proliferation; supports identity as dividing tumor cells.

---

### **Cluster 2**

**Likely Cell Type:** *Cytotoxic T cells*
**Rationale:** Strong T cell markers (**CD3D**, **CD3E**, **CD8A**, **TRAC**, **CD2**, **CD28**) and cytotoxic effectors (**GZMK**, **GZMA**, **CCL5**), along with **PTPRC**, **KLRB1**, and **IL7R**.

---

### **Cluster 3**

**Likely Cell Type:** *Hepatocytes / metabolic hepatocyte subtype*
**Rationale:** Expression of **MAT1A**, **TAT**, **CYP3A4**, **CYP2A7**, **APOA5**, **ADH4**, and **NNMT** suggests metabolic functions characteristic of normal hepatocytes.

---

### **Cluster 4**

**Likely Cell Type:** *Myofibroblasts / CAFs (cancer-associated fibroblasts)*
**Rationale:** Enrichment for **ACTA2**, **PDGFRA**, **FBN1**, **VCAN**, **THBS2**, **LTBP2**, and **ASPN**, all canonical CAF markers in liver fibrosis and HCC.

---

### **Cluster 5**

**Likely Cell Type:** *Endothelial cells*
**Rationale:** Clear expression of vascular markers including **PECAM1 (CD31)**, **CD34**, **VWF**, **EGFL7**, **CLEC14A**, **ADGRL4**, and **MMRN2**.

---

### **Cluster 6**

**Likely Cell Type:** *Liver sinusoidal endothelial cells (LSECs)*
**Rationale:** Shared endothelial markers (**PECAM1**, **EGFL7**, **GNG11**) plus **LYVE1**, **TFPI**, **MMRN1**, **MRC1**, and **EPAS1** suggest LSEC identity, a liver-specific endothelial subtype.

---

### **Cluster 7**

**Likely Cell Type:** *Macrophages / Tumor-associated macrophages (TAMs)*
**Rationale:** Expression of **CD163**, **FCGR3A**, **CD68**, **MS4A6A**, **AIF1**, **VSIG4**, and **LAPTM5** supports macrophage identity. **CXCR4** and **HAVCR2** suggest TAM phenotype.

---

### **Cluster 8**

**Likely Cell Type:** *Cholangiocytes / Biliary epithelial cells*
**Rationale:** Expression of **EPCAM**, **KRT7**, **CFTR**, **CLIC6**, **ANXA3**, and **APCDD1** supports cholangiocyte identity, possibly reactive biliary epithelial cells.

---

### **Cluster 9**

**Likely Cell Type:** *Inflammatory macrophages / Monocytes*
**Rationale:** Similar to cluster 7 but with stronger expression of **CD4**, **CD28**, **CD163**, **MPEG1**, **MS4A4A**, **ALOX5AP**, and **LAPTM5**, suggesting an inflammatory myeloid subset.

---

Let me know if you’d like these converted into a table or overlaid on spatial plots.



In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample.
subtype_by_cluster = {
    '0': 'Malignant hepatocyte', '1': 'Malignant proliferating hepatocyte',
    '2': 'Cytotoxic T', '3': 'Healthy hepatocyte',
    '4': 'CAF', '5': 'Endothelial', '6': 'Liver sinusoidal endothelial',
    '7': 'Macrophage', '8': 'Cholangiocytes',
    '9': 'Inflammatory macrophage',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
# Coarse lineage label for each Leiden cluster of this sample.
type_by_cluster = {
    '0': 'Malignant', '1': 'Malignant',
    '2': 'Lymphoid', '3': 'Epithelial',
    '4': 'Mesenchymal', '5': 'Endothelial', '6': 'Endothelial', '7': 'Myeloid', '8': 'Epithelial',
    '9': 'Myeloid',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['celltypes'] = (
    adata.obs['leiden'].astype(str).replace(type_by_cluster).astype('category')
)

In [None]:
# Write both annotation levels (with the obs index as first column, the pandas default)
# into the sample's directory, alongside adata.h5ad.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI873

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'NCBI873'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 25 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(25)['names'].to_numpy())

ChatGPT answer: Here’s an interpretation of the clusters in the context of **lung fibrosis**, with suggested **cell type annotations** based on canonical markers:

---

### **Cluster 0**

**Likely Cell Type:** *Myofibroblasts / Activated fibroblasts*
**Rationale:** Highly fibrotic gene program including **COL1A1**, **COL1A2**, **COL3A1**, **POSTN**, **FAP**, **PDGFRA/B**, **CTHRC1**, **ACTA2**, and **SNAI2**. This is the classical activated fibroblast population driving fibrosis.

---

### **Cluster 1**

**Likely Cell Type:** *Alveolar Type II (AT2) epithelial cells*
**Rationale:** Expression of **SFTPC**, **SFTPD**, **NAPSA**, **LAMP3**, **NKX2-1**, **EPCAM**, and **SCGB3A2** is consistent with AT2 identity. **ITGB6**, **AGR3**, and **HSPA5** may indicate stress response or transitional state.

---

### **Cluster 2**

**Likely Cell Type:** *Endothelial cells (blood vascular)*
**Rationale:** Vascular markers **PECAM1**, **CD34**, **KDR**, **RAMP2**, **CLDN5**, **RNASE1**, and **APLNR**, along with **EPAS1** and **GNG11**, indicate vascular endothelial identity.

---

### **Cluster 3**

**Likely Cell Type:** *Fibroblast-like macrophages (profibrotic TAMs)*
**Rationale:** Overlap of **HLA-DRA**, **CD68**, **MS4A7**, **MRC1**, **FCER1G**, **CD14**, **AIF1**, **HAVCR2** (TAM marker), and ECM-related genes (**COL1A2**, **COL3A1**, **DCN**) suggests a transitional myeloid–fibrotic state.

---

### **Cluster 4**

**Likely Cell Type:** *Alveolar macrophages (M2-like)*
**Rationale:** Markers **CD68**, **MRC1**, **CCL18**, **MARCO**, **FABP4**, **HMOX1**, **PPARG**, and **TREM2** are typical of tissue-resident or M2-polarized alveolar macrophages in fibrotic lung.

---

### **Cluster 5**

**Likely Cell Type:** *Inflammatory monocytes / early macrophages*
**Rationale:** Strong expression of **S100A8**, **S100A9**, **S100A12**, **FCN1**, **IL1B**, **CXCR4**, and **CD14** indicates pro-inflammatory monocytes, with possible early macrophage differentiation.

---

### **Cluster 6**

**Likely Cell Type:** *Perivascular fibroblasts / Adventitial stromal cells*
**Rationale:** Shared fibrotic markers (**COL1A2**, **COL3A1**, **SPARCL1**, **CCN2**, **SFRP2/4**), plus **MFAP5**, **CD34**, **PLVAP**, and **ACKR1**, which are enriched in perivascular niches.

---

### **Cluster 7**

**Likely Cell Type:** *Cytotoxic T cells (CD8⁺ effector/memory)*
**Rationale:** Canonical T cell and cytotoxic markers including **CD3D**, **CD8A**, **GZMB**, **GZMA**, **GZMK**, **CD28**, **KLRG1**, **NKG7**, and **TRAC**. Also includes **CXCR4**, suggesting tissue infiltration.

---

### **Cluster 8**

**Likely Cell Type:** *Transitional epithelial cells (KRT8⁺/KRT18⁺ / "basaloid" / bronchiolar)*
**Rationale:** Expression of **CEACAM6**, **CEACAM5**, **KRT8**, **KRT18**, **ICAM1**, **DUOX1**, **AGR3**, **ITGA3**, **EGFR**, and **EPCAM** suggest transitional/bronchiolar epithelium. **AGER** and **SFTPD** suggest some AT1/AT2 signature.

---

### **Cluster 9**

**Likely Cell Type:** *Plasma cells / ER-stressed B lineage*
**Rationale:** High expression of ER stress and secretory genes (**XBP1**, **HSPA5**, **PDIA4**, **JCHAIN**, **FKBP11**, **SEC11C**, **DNAJB9**) and **CD27**, **CD79A**, **TNFRSF17 (BCMA)** supports a plasma cell identity.

---

Let me know if you'd like help generating plots, cell type enrichment scores, or validating these assignments with reference atlases.



In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample.
subtype_by_cluster = {
    '0': 'Myofibroblast', '1': 'AT2 epithelial',
    '2': 'Endothelial', '3': 'Fibroblast-like macrophage',
    '4': 'M2 Alveolar macrophage', '5': 'Inflammatory monocyte', '6': 'Perivascular fibroblast',
    '7': 'Cytotoxic T', '8': 'Transitional epithelial',
    '9': 'Plasma/B',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
# Coarse lineage label for each Leiden cluster of this sample.
type_by_cluster = {
    '0': 'Mesenchymal', '1': 'Epithelial',
    '2': 'Endothelial', '3': 'Myeloid',
    '4': 'Myeloid', '5': 'Myeloid', '6': 'Mesenchymal', '7': 'Lymphoid', '8': 'Epithelial',
    '9': 'Lymphoid',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['celltypes'] = (
    adata.obs['leiden'].astype(str).replace(type_by_cluster).astype('category')
)

In [None]:
# Write both annotation levels (with the obs index as first column, the pandas default)
# into the sample's directory, alongside adata.h5ad.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX142

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'TENX142'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=1)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50)['names'].to_numpy())

ChatGPT answer: Here is a **cell type interpretation** of your clusters in the context of **ovarian cancer**, especially high-grade serous ovarian carcinoma (HGSOC), integrating immune, stromal, and epithelial markers:

---

### **Cluster 0**

**Likely Cell Type:** *Epithelial tumor cells (malignant)*
**Rationale:** High expression of:

* **Epithelial & oncogenic markers:** *EPCAM, ERBB2/3, AKT1, CTNNB1, TP53, PTEN, KRAS, PIK3CA, BRAF*
* **Cell cycle/proliferation:** *MKI67, CDK4, CDK2, ARID1A*
* This is a canonical epithelial cancer cell signature in HGSOC.

---

### **Cluster 1**

**Likely Cell Type:** *Proliferating epithelial tumor cells (high-grade subset)*
**Rationale:**

* Similar to Cluster 0 with additional emphasis on proliferation: *MKI67, CDK2, CDK4, CENPF*
* Also includes **SOX9**, a marker of epithelial progenitors and tumor aggressiveness.

---

### **Cluster 2**

**Likely Cell Type:** *Cytotoxic T cells (CD8⁺ effector/memory)*
**Rationale:**

* **T cell markers:** *CD3D/E, CD8A, CD2, CD247, CTLA4, TIGIT, CCR7, LCK, IL2RB*
* **Cytotoxic granules:** *GZMA, GZMK, CTSW, GNLY*
* **Immunoglobulin transcripts (e.g., IGKC, IGHG1/4, IGLC3)** suggest some ambient mRNA or minor B cell contamination.

---

### **Cluster 3**

**Likely Cell Type:** *Tumor-associated macrophages (TAMs)*
**Rationale:**

* **Myeloid/M2 markers:** *CD68, CD163, MS4A6A, FCGR3A, CSF1R, TGFB1, APOE, TREM2, IL10RA, CD14*
* **Immune-regulatory:** *HAVCR2 (TIM-3), STAT1, IRF8*
* Classic suppressive TAM profile seen in ovarian cancer TME.

---

### **Cluster 4**

**Likely Cell Type:** *Cancer-associated fibroblasts (CAFs)*
**Rationale:**

* **ECM and contractile genes:** *LUM, DCN, FN1, SPARC, ACTA2, PDGFRA, COL5A2, CRISPLD2*
* **Chemokines:** *CXCL12, CCL2, CXCL14*
* **Immune interaction:** *IL7R, ICAM1, STAT1, FAS*

---

### **Cluster 5**

**Likely Cell Type:** *Endothelial cells (vascular)*
**Rationale:**

* **Endothelial markers:** *PECAM1, CD34, CD93, EGFL7, PLVAP, ANGPT2, CLEC14A*
* **Fibrotic/stromal overlap:** *SPARC, IGFBP7, ACTA2, VCAN, FN1*
* Suggests vascular remodeling, common in the tumor microenvironment.

---

### **Cluster 6**

**Likely Cell Type:** *Mixed T and B lymphocytes (memory and helper subsets)*
**Rationale:**

* **T cells:** *CD3D/E, CD2, CD8A, LCK, IL7R, CXCL13, FOXP3, GATA3, CD40LG*
* **B cell/plasma genes:** *IGKC, IGHG1/3/4, IGHGP, JCHAIN*
* Indicates lymphoid infiltration, including Tfh and memory B cells.

---

### **Cluster 7**

**Likely Cell Type:** *Inflammatory fibroblasts / EMT-like tumor cells / hybrid stromal-epithelial*
**Rationale:**

* Mixed expression of:

  * **Fibrotic/mesenchymal:** *ACTA2, RGS5, DCN, CXCL14, EGR3*
  * **Tumor-related:** *ERBB2, GPRC5A, MEST, SOS1, IFITM3*
  * **Immune genes:** *VEGFA, CTSD, MPEG1, HIF1A, IFNL1*
* This may represent EMT-like tumor cells or inflammatory fibroblasts.

---

Would you like these in a CSV or annotated on an `AnnData` object (`.obs['celltype']`) for visualization or further analysis?



In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample.
subtype_by_cluster = {
    '0': 'Malignant', '1': 'Malignant proliferating',
    '2': 'Cytotoxic T', '3': 'Macrophage M2',
    '4': 'CAF', '5': 'Endothelial', '6': 'T/B', '7': 'Inflammatory fibroblast',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
# Coarse lineage label for each Leiden cluster of this sample.
type_by_cluster = {
    '0': 'Malignant', '1': 'Malignant',
    '2': 'Lymphoid', '3': 'Myeloid',
    '4': 'Mesenchymal', '5': 'Endothelial', '6': 'Lymphoid', '7': 'Mesenchymal',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['celltypes'] = (
    adata.obs['leiden'].astype(str).replace(type_by_cluster).astype('category')
)

In [None]:
# Visual check of both annotation levels on the UMAP embedding.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation levels (with the obs index as first column, the pandas default)
# into the sample's directory, alongside adata.h5ad.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'TENX124'

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'TENX124'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=1)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50)['names'].to_numpy())

ChatGPT answer: Here is a detailed **cell type annotation** for your clusters in the context of **human tonsil** tissue, a secondary lymphoid organ rich in B cells, T cells, dendritic cells, and vasculature:

---

### **Cluster 0**

**Likely Cell Type:** *Naïve and Memory B cells*
**Rationale:**

* Canonical B-cell markers: **MS4A1 (CD20)**, **CD19**, **CD79A**, **BANK1**, **PLCG2**, **TCL1A**, **PTPRC (CD45)**
* **SPIB**, **IRF8**, **MEF2C**, **TCF4** indicate a mature B-cell transcriptional program.
* **SELL (L-selectin)** and **CD86** indicate memory and activation states.

---

### **Cluster 1**

**Likely Cell Type:** *T helper and cytotoxic T cells (mixed)*
**Rationale:**

* **TRAC, CD3E, CD3D, CD2, CD8A, CD4**, **PTPRC (CD45)** are core T-cell markers.
* **CCR7**, **CD28**, **CD27**, **FOXP3**, **LAG3**, **CTLA4**, **PDCD1** show regulatory, helper, and effector phenotypes.
* **GZMA, GZMK, GNLY, PRF1** indicate cytotoxic potential.

---

### **Cluster 2**

**Likely Cell Type:** *Plasma cells / antibody-secreting B cells*
**Rationale:**

* High expression of **MZB1**, **PRDM1 (BLIMP1)**, **TNFRSF17 (BCMA)**, **FKBP11**, **CD27**, **CD79A**, **SLAMF7** — all consistent with plasma cells.
* Also contains **APCDD1**, **DERL3**, and **CYTIP**, associated with plasma cell maturation.

---

### **Cluster 3**

**Likely Cell Type:** *Fibroblastic reticular cells (FRCs) / Stromal fibroblasts*
**Rationale:**

* ECM and stromal markers: **COL5A2, VCAN, ASPN, FBN1, PDGFRA, CRISPLD2, TFPI, SFRP2/4**
* Also includes **TNC**, **INMT**, **THBS2**, and **LTBP2**, consistent with tonsillar stromal support cells.

---

### **Cluster 4**

**Likely Cell Type:** *Myeloid cells / Monocyte-derived dendritic cells (moDCs)*
**Rationale:**

* **CD14, CD68, MPEG1, AIF1, IRF8, MS4A6A, SPI1, LILRB2, HAVCR2**
* Activation and antigen presentation: **HLA-DQB2, CD86, LAMP3, FCN1, GZMB, CLEC10A**
* **CCL19**, **CCR7** suggest lymph node trafficking.

---

### **Cluster 5**

**Likely Cell Type:** *Basal epithelial cells / squamous epithelium*
**Rationale:**

* **EHF, CLCA2, GPC1, EGFR, FGFBP1, ELF5, KRT7**, **MKI67** (cycling)
* **PDPN**, **SMYD2**, **CXCL6**, **CXCL2** — supportive of squamous basal layer and inflammatory response.

---

### **Cluster 6**

**Likely Cell Type:** *Mast cells / Perivascular fibroblasts (some contamination)*
**Rationale:**

* **CPA3, MS4A2, MCEMP1**, **MARCO**, **CNN1**, **MYH11**, **MFAP5**
* Several unassigned/control probes may indicate ambient background or doublets.

---

### **Cluster 7**

**Likely Cell Type:** *Blood endothelial cells (BECs)*
**Rationale:**

* **PECAM1, CD34, VWF, ADGRL4, EGFL7, CLEC14A, MMRN1**, **ACKR1**
* **PDGFRB, ACTA2, COL5A2, MYLK** suggest some smooth muscle/pericyte overlap.

---

### **Cluster 8**

**Likely Cell Type:** *Effector memory T cells / cytotoxic CD8⁺ T cells*
**Rationale:**

* T cell identity: **TRAC, CD3E, CD3D, CD8A, CD4, KLRB1, CCR7, CD27**
* **Effector genes:** *GZMA, GZMK, PRF1, NKG7*
* **Activation markers:** *CTLA4, FOXP3, SLAMF1, CD69*
* Also includes **PECAM1** and **AQP3**, suggesting tissue-resident phenotypes.

---

### **Cluster 9**

**Likely Cell Type:** *Germinal center B cells (cycling)*
**Rationale:**

* **MS4A1, CD19, CD79A, BANK1, CD83, CD86** — typical of activated B cells
* **MKI67, PCNA, TOP2A, CDK1, CCNB2, CENPF, UBE2C** show high proliferation
* **TCL1A, IRF8, SPIB, TCF4** — consistent with GC transcriptional programs.

---

Let me know if you’d like this converted into `.csv`, `.obs['celltype']`, or visualized using dotplots or UMAP overlays!


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample.
# The label for cluster 0 is spelled with a proper 'ï' so the exported CSV does not carry
# mis-decoded bytes.
subtype_by_cluster = {
    '0': 'Naïve and Memory B', '1': 'T helper and cytotoxic T',
    '2': 'Plasma', '3': 'Fibroblastic reticular cells (FRCs)',
    '4': 'Myeloid', '5': 'Basal epithelial cells', '6': 'Stromal (+ Mast)', '7': 'Blood endothelial',
    '8': 'T memory and cytotoxic T', '9': 'Germinal center B',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
# Coarse lineage label for each Leiden cluster of this sample.
type_by_cluster = {
    '0': 'Lymphoid', '1': 'Lymphoid',
    '2': 'Lymphoid', '3': 'Mesenchymal',
    '4': 'Myeloid', '5': 'Epithelial', '6': 'Mesenchymal', '7': 'Endothelial',
    '8': 'Lymphoid', '9': 'Lymphoid',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['celltypes'] = (
    adata.obs['leiden'].astype(str).replace(type_by_cluster).astype('category')
)

In [None]:
# Visual check of both annotation levels on the UMAP embedding.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation levels (with the obs index as first column, the pandas default)
# into the sample's directory, alongside adata.h5ad.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI856

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'NCBI856'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=1)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50)['names'].to_numpy())

ChatGPT answer: Here is a proposed **cell type annotation** for each cluster based on marker genes in the context of **lung fibrosis**, integrating known markers from epithelial, immune, endothelial, and mesenchymal compartments:

---

### **Cluster 0**

**Likely Cell Type:** *Effector and cytotoxic T cells (CD8⁺ dominant)*
**Rationale:**

* **T cell markers:** `CD3D/E/G`, `CD2`, `TRAC`, `IL7R`, `CD8A`, `CD4`, `CD28`, `CD27`, `FOXP3`, `CTLA4`, `GZMA/B/K`, `GNLY`, `NKG7`
* **Activation & migration:** `CCR7`, `FASLG`, `ISG20`, `KLRG1`, `TGFB1`

---

### **Cluster 1**

**Likely Cell Type:** *Capillary endothelial cells (aerocytes + gCap)*
**Rationale:**

* **Endothelial markers:** `PECAM1`, `CD34`, `CLDN5`, `EPAS1`, `KDR`, `RAMP2`
* **Gas exchange/AT1 overlap:** `AGER`, `SFTPC`, `SFTPD`, `NAPSA`
* **Regulatory & EMT markers:** `ZEB1`, `SNAI1`, `HEY1`, `WWTR1`, `RHOA`
* Likely includes **capillary ECs transitioning during fibrosis**

---

### **Cluster 2**

**Likely Cell Type:** *Secretory epithelial cells (club/goblet, alveolar, and basal subsets)*
**Rationale:**

* **Secretory markers:** `SCGB1A1`, `SCGB3A2`, `EPCAM`, `AGR3`, `KRT8/18`, `FOXJ1`, `DUOX1`, `WFDC2`
* **Development & stemness:** `SOX2/4/9`, `TP73`, `STAT6`, `GSR`, `CREB3L4`
* Suggests **bronchiolar epithelial identity**, possibly basal-to-club transition

---

### **Cluster 3**

**Likely Cell Type:** *Myofibroblasts (fibrotic CAFs)*
**Rationale:**

* **ECM/fibrosis markers:** `COL1A1/A2`, `DCN`, `FN1`, `SPARCL1`, `FAP`, `PDGFRA`, `SFRP2/4`, `ACTA2`, `CTHRC1`, `VIM`, `CCN2`, `ZEB1`
* Classic **activated fibroblast** signature in IPF.

---

### **Cluster 4**

**Likely Cell Type:** *Pro-fibrotic AT2 cells / transitional epithelial cells (KRT8⁺/CLDN⁺)*
**Rationale:**

* **AT2 identity:** `SFTPC`, `SFTPD`, `NAPSA`, `NKX2-1`, `PGC`, `XBP1`
* **Transitional/stress markers:** `HSPA5`, `ATF4`, `DUOX1`, `CEACAM6`, `COL4A3`, `YAP1`, `HIF1A`, `KRT18/8`
* Likely includes **“KRT8⁺ transitional” cells** central to fibrotic progression.

---

### **Cluster 5**

**Likely Cell Type:** *AT2 cells / secretory epithelial cells (normal-like)*
**Rationale:**

* Similar to Cluster 4 but with more **secretory function genes** (`EPCAM`, `SFTA2`, `MGST1`, `LAMP3`, `HSPA5`)
* Less EMT/stress response than Cluster 4 → **homeostatic AT2**

---

### **Cluster 6**

**Likely Cell Type:** *M2-like macrophages / suppressive myeloid cells*
**Rationale:**

* **M2 markers:** `CD163`, `MRC1`, `TREM2`, `C1QC`, `TGFB1`, `PLIN2`, `HMOX1`, `FCER1G`, `MS4A7`
* **Tissue-resident myeloid program:** `CD68`, `LYZ`, `IRF8`, `AIF1`, `VIM`

---

### **Cluster 7**

**Likely Cell Type:** *Basal epithelial cells / airway progenitors*
**Rationale:**

* **Basal and secretory overlap:** `KRT5/14/17`, `TP63`, `MUC5B`, `CEACAM6`, `AKR1C1/2`, `WFDC2`
* **Stemness/stress:** `SOX2`, `SOX9`, `STAT6`, `YAP1`, `WWTR1`

---

### **Cluster 8**

**Likely Cell Type:** *Inflammatory monocytes / neutrophilic myeloid cells*
**Rationale:**

* **Pro-inflammatory signature:** `S100A8/9/12`, `IL1B`, `ITGAX`, `FCN1`, `CXCR4`, `IFIT3`, `ISG20`
* **Overlap with ECs and AT2:** `SFTPC`, `AGER`, `RAMP2`, suggesting **mixed identity or ambient RNA**

---

### **Cluster 9**

**Likely Cell Type:** *Mesenchymal/stromal fibroblasts (vascular/pericyte-like)*
**Rationale:**

* **Vascular/stromal mix:** `PECAM1`, `PDGFRB`, `CSPG4`, `CD34`, `PLVAP`, `COL15A1`, `GNG11`
* **Proliferative & remodeling:** `YAP1`, `TGFB3`, `HEY1`, `ACKR1`, `ACTA2`

---

### **Cluster 10**

**Likely Cell Type:** *Alveolar macrophages (M2-like TAMs)*
**Rationale:**

* **Strong M2/TAM signature:** `CD68`, `MRC1`, `CD14`, `CCL18`, `FABP4`, `TGFB1`, `MARCO`, `PPARG`, `CD52`, `CD4`
* Consistent with **IPF-expanded suppressive macrophages**

---

### **Cluster 11**

**Likely Cell Type:** *Pro-fibrotic AT2 / transitional epithelium (KRT8⁺ subset)*
**Rationale:**

* Overlaps with cluster 4; includes **KRT8**, **SFTPC**, **CEACAM6**, **AXIN2**, **COL4A3**, **YAP1**, **RTKN2**
* Likely a more stressed/activated AT2-like population

---

### **Cluster 12**

**Likely Cell Type:** *Plasma cells / ER-stressed B-lineage cells*
**Rationale:**

* **ER stress & plasma markers:** `XBP1`, `PRDM1`, `JCHAIN`, `TNFRSF17`, `CD27`, `CD79A/B`, `FKBP11`, `SEC11C`, `HSPA5`
* Canonical **antibody-secreting plasma cell** profile

---

### **Cluster 13**

**Likely Cell Type:** *KRT⁺ inflammatory epithelial cells (dysplastic/metaplastic epithelium)*
**Rationale:**

* Mixed markers:

  * **KRT5/15/17**, `SOX2/4/9`, `CEACAM6`, `FOXJ1`, `AGR3`, `WFDC2`
  * **Immune/stress**: `TGFB1`, `CXCR4`, `FCGR3A`, `CD68`, `CD86`, `HAVCR2`, `HLA genes`
* May represent **dysplastic airway epithelial cells under inflammation or EMT**

---

Would you like this exported as a `pandas.DataFrame`, `.obs['celltype']` in an `AnnData`, or turned into a dotplot for cluster annotation validation?



In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample (clusters 4, 5 and 11 all map to AT2).
# The label for cluster 13 is spelled with a proper superscript plus so the exported CSV
# does not carry mis-decoded bytes.
subtype_by_cluster = {
    '0': 'Cytotoxic T', '1': 'Capillary endothelial',
    '2': 'Secretory epithelial', '3': 'Myofibroblast',
    '4': 'AT2', '5': 'AT2', '6': 'Macrophage M2', '7': 'Basal epithelial',
    '8': 'Inflammatory monocyte', '9': 'Fibroblast',
    '10': 'Alveolar macrophages M2', '11': 'AT2', '12': 'Plasma',
    '13': 'KRT⁺ inflammatory epithelial',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
# Coarse lineage label for each Leiden cluster of this sample.
type_by_cluster = {
    '0': 'Lymphoid', '1': 'Endothelial',
    '2': 'Epithelial', '3': 'Mesenchymal',
    '4': 'Epithelial', '5': 'Epithelial', '6': 'Myeloid', '7': 'Epithelial',
    '8': 'Myeloid', '9': 'Mesenchymal',
    '10': 'Myeloid', '11': 'Epithelial', '12': 'Lymphoid',
    '13': 'Epithelial',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['celltypes'] = (
    adata.obs['leiden'].astype(str).replace(type_by_cluster).astype('category')
)

In [None]:
# Visual check of both annotation levels on the UMAP embedding.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation levels (with the obs index as first column, the pandas default)
# into the sample's directory, alongside adata.h5ad.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI882

In [None]:
# Sample processed in this section; names the sub-directory of base_dir that is read and written.
sample_id = 'NCBI882'

In [None]:
# Load the sample and run filtering, normalisation, PCA, neighbours and UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering; the hand-written label mappings further down are keyed on these cluster IDs.
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Show the clusters on the UMAP embedding.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with scanpy's default settings.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster.
# Iterating the categories (rather than .unique()) lists the clusters in cluster order
# instead of order of first appearance in obs.
for cl in adata.obs['leiden'].cat.categories:
    print(cl)
    # to_numpy() replaces Series.ravel(), which is deprecated in pandas >= 2.2.
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50)['names'].to_numpy())

ChatGPT answer: Here is a **brief annotation** of clusters in the context of **lung fibrosis**:

* **Cluster 0:** *Myofibroblasts* — ECM-rich, contractile markers: **COL1A1/2**, **FN1**, **ACTA2**, **PDGFRA**, **FAP**, **SFRP2/4**
* **Cluster 1:** *AT2 cells / transitional epithelium* — **SFTPC**, **NAPSA**, **KRT8**, **EPCAM**, **CEACAM6**, **HSPA5**
* **Cluster 2:** *Endothelial cells* — **PECAM1**, **CD34**, **KDR**, **CLDN5**, **EPAS1**, **RAMP2**, **VIM**
* **Cluster 3:** *Activated fibroblasts* — **ACTA2**, **PDGFRB**, **ZEB1**, **WWTR1**, **CTNNB1**, **COL1A1/2**, **YAP1**
* **Cluster 4:** *Club / secretory epithelial cells* — **SCGB1A1**, **WFDC2**, **SOX2/9**, **KRT5/8/15/17**, **CEACAM6**
* **Cluster 5:** *Cytotoxic and helper T cells* — **CD3D/E**, **CD8A**, **GZMB**, **FOXP3**, **CTLA4**, **CCR7**, **CD4**
* **Cluster 6:** *M2-like macrophages* — **CD68**, **MRC1**, **TREM2**, **CCL18**, **HLA-DRA**, **TGFB1**, **CD86**
* **Cluster 7:** *Pro-fibrotic epithelial cells / EMT-like* — mix of **KRTs**, **SFTPC**, **ZEB1**, **AXL**, **CEACAM5/6**
* **Cluster 8:** *Inflammatory monocytes / neutrophil-like* — **S100A8/9/12**, **FCN1**, **CD14**, **IL1B**, **IFITs**
* **Cluster 9:** *Suppressive alveolar macrophages* — **CD68**, **MARCO**, **PPARG**, **SPP1**, **TGFB1**, **HMOX1**
* **Cluster 10:** *Mast cells / myeloid hybrids* — **CPA3**, **TPSAB1**, **KIT**, **FCER1G**, **HAVCR2**, **CD52**
* **Cluster 11:** *Plasma cells* — **XBP1**, **JCHAIN**, **TNFRSF17**, **CD79A/B**, **HSPA5**, **FKBP11**, **SEC11C**

Let me know if you'd like this as a CSV or `.obs['celltype']`!



In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True: mean() raises on string/categorical obs columns in pandas >= 2.0
# (e.g. once 'celltypes'/'cellsubtypes' exist and this cell is re-run).
# observed=True: explicit choice for the categorical grouping key, avoiding the pandas
# FutureWarning about the changing default.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained label for each Leiden cluster of this sample.
# NOTE(review): 'Activate fibroblast' looks like a typo for 'Activated fibroblast'; left
# unchanged because the label is written to celltypes.csv and may be matched downstream.
subtype_by_cluster = {
    '0': 'Myofibroblast', '1': 'AT2',
    '2': 'Endothelial', '3': 'Activate fibroblast',
    '4': 'Club / secretory epithelial', '5': 'Helper and cytotoxic T', '6': 'Macrophage M2',
    '7': 'Pro-fibrotic epithelial',
    '8': 'Inflammatory monocyte', '9': 'Alveolar macrophage M2',
    '10': 'Mast cells / myeloid', '11': 'Plasma',
}
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so relabel on plain
# strings and convert back; clusters missing from the mapping keep their cluster ID.
adata.obs['cellsubtypes'] = (
    adata.obs['leiden'].astype(str).replace(subtype_by_cluster).astype('category')
)

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Epithelial',
                                                      '2': 'Endothelial', '3': 'Mesenchymal', 
                         '4': 'Epithelial', '5': 'Lymphoid', '6': 'Myeloid',
                                                      '7': 'Epithelial',
                                                      '8': 'Myeloid', '9': 'Myeloid',
                                                      '10': 'Myeloid', '11': 'Lymphoid', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX125

In [None]:
sample_id = 'TENX125'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **brief cell type annotation** for each Leiden cluster in the context of **human tonsil**:

---

### **Cluster 0** – **Effector/memory T cells (CD4⁺ and CD8⁺)**

* Markers: `CD3E`, `TRAC`, `CD2`, `CD8A`, `CD4`, `PRF1`, `FOXP3`, `CXCL9`, `PDCD1`
* Includes cytotoxic, regulatory (Treg), and helper T cell subsets.

---

### **Cluster 1** – **Naïve and memory B cells**

* Markers: `MS4A1 (CD20)`, `CD19`, `CD79A`, `BANK1`, `TCL1A`, `SPIB`, `IRF8`, `SELL`
* Typical of resting or activated B cells.

---

### **Cluster 2** – **Fibroblastic stromal cells / FRCs**

* ECM markers: `COL1A2`, `DCN`, `FN1`, `PDGFRA`, `ACTA2`, `FBLN1`, `VCAN`, `SFRP4`
* Likely fibroblastic reticular cells in the tonsillar stroma.

---

### **Cluster 3** – **Myeloid cells / Monocyte-derived dendritic cells (moDCs)**

* Markers: `CD14`, `CD68`, `MRC1`, `CXCL9`, `HLA-DQB2`, `CD86`, `CLEC10A`, `GZMB`
* Antigen-presenting and immune-regulating phenotype.

---

### **Cluster 4** – **Epithelial cells / basal squamous epithelium**

* Markers: `EHF`, `SOX2`, `ERBB2`, `CLCA2`, `EGFR`, `COL17A1`, `KRTs`, `FGFBP1`, `PDPN`
* Tonsillar epithelial layer.

---

### **Cluster 5** – **Proliferating B cells / Germinal center B cells**

* Markers: `MS4A1`, `CD79A`, `MKI67`, `PCNA`, `TOP2A`, `UBE2C`, `CCNB2`
* Cycling GC B cells.

---

### **Cluster 6** – **Blood endothelial cells (BECs)**

* Markers: `PECAM1`, `CD34`, `VWF`, `MMRN1`, `EGFL7`, `ACKR1`, `ANGPT2`, `PDGFRB`
* Blood vasculature.

---

### **Cluster 7** – **Plasma cells**

* Markers: `PRDM1`, `MZB1`, `SLAMF7`, `TNFRSF17`, `CD27`, `FKBP11`, `CD79A`, `FBLN1`
* Fully differentiated antibody-secreting cells.

---

Let me know if you'd like this exported to `adata.obs["celltype"]` or visualized.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'T helper and cytotoxic T', '1': 'Na√Øve and Memory B',
                                                      '2': 'Fibroblastic reticular cells (FRCs)', '3': 'Myeloid', 
                         '4': 'Basal epithelial cells', '5': 'Germinal center B', '6': 'Blood endothelial',
                                                      '7': 'Plasma',
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Lymphoid',
                                                      '2': 'Mesenchymal', '3': 'Myeloid', 
                         '4': 'Epithelial', '5': 'Lymphoid', '6': 'Endothelial',
                                                      '7': 'Lymphoid',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX96

In [None]:
sample_id = 'TENX96'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a concise **cell type annotation** for your breast cancer clusters based on canonical markers:

---

### **Cluster 0 – Luminal epithelial cells**

* **Key markers**: `MUC1`, `FOXA1`, `GATA3`, `ESR1`, `ERBB2`, `KRT8`, `TPD52`, `MLPH`, `CCND1`, `EPCAM`
* **Notes**: Hormone receptor-positive luminal subtype

---

### **Cluster 1 – Luminal epithelial cells (variant)**

* **Key markers**: Similar to cluster 0 (`MUC1`, `GATA3`, `TPD52`, `KRT7`, `FOXA1`)
* **Notes**: Potentially more differentiated luminal cells with less metabolic gene expression

---

### **Cluster 2 – Luminal/Basal hybrid or partial EMT**

* **Key markers**: `GATA3`, `KRT7`, `FOXA1`, `ERBB2`, `EPCAM`, `IGF1R`, `THBS2`, `INHBA`
* **Notes**: Mixed epithelial identity; could include tumor cells with partial EMT features

---

### **Cluster 3 – Myeloid-derived macrophages**

* **Key markers**: `CD68`, `FCGR3A`, `CD14`, `PTPRC`, `CD86`, `CXCR4`, `ITGAX`, `MRC1`, `CD163`
* **Notes**: TAMs (tumor-associated macrophages), potential immunosuppressive phenotype

---

### **Cluster 4 – Cancer-associated fibroblasts (CAFs)**

* **Key markers**: `ACTA2`, `PDGFRB`, `MMP2`, `FBLN1`, `CXCL12`, `SFRP4`, `EGR1`, `THY1`, `ZEB1`
* **Notes**: ECM remodeling, tumor progression

---

### **Cluster 5 – Endothelial cells (vascular)**

* **Key markers**: `PECAM1`, `VWF`, `RGS5`, `KDR`, `ADGRL4`, `CLEC14A`, `ANGPT2`, `ACKR1`, `EGFL7`
* **Notes**: Tumor vasculature

---

### **Cluster 6 – T cells (CD4⁺/CD8⁺ mixed)**

* **Key markers**: `CD3E`, `CD4`, `CD8A`, `TRAC`, `IL7R`, `CCL5`, `GZMA`, `CTLA4`, `TIGIT`, `PRF1`
* **Notes**: Mixed cytotoxic and helper T cells, possibly exhausted

---

### **Cluster 7 – Basal-like epithelial / myoepithelial**

* **Key markers**: `KRT5`, `KRT14`, `ACTA2`, `MYH11`, `EGFR`, `ID4`, `NNMT`, `GABRP`
* **Notes**: Basal subtype, may also include myoepithelial or progenitor-like cells

---

Let me know if you'd like this added to `.obs['celltype']` or converted to a dictionary for programmatic use.



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant Luminal epithelial', '1': 'Malignant Luminal epithelial',
                                                      '2': 'Malignant Luminal/Basal epithelial', '3': 'Macrophage', 
                         '4': 'CAF', '5': 'Endothelial', '6': 'T',
                                                      '7': 'Malignant Basal-like epithelial',
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Malignant', '3': 'Myeloid', 
                         '4': 'Mesenchymal', '5': 'Endothelial', '6': 'Lymphoid',
                                                      '7': 'Malignant',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX141

In [None]:
sample_id = 'TENX141'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a full **cell type annotation** for your **lung cancer clusters**, based on canonical marker expression:

---

### 🔷 **Cluster 0 – Malignant epithelial cells (proliferative)**

* **Key markers**: `EPCAM`, `ERBB2`, `KRAS`, `MKI67`, `CCND1`, `CDK4`, `TOP2A`, `STAT3`, `MET`
* **Notes**: Highly proliferative tumor cells; adenocarcinoma or other NSCLC subtype.

---

### 🔷 **Cluster 6 – Malignant epithelial cells (EGFR/ERBB⁺)**

* **Key markers**: `EPCAM`, `ERBB2`, `ERBB3`, `EGFR`, `SDC1`, `MET`, `GPRC5A`, `VEGFA`
* **Notes**: Tumor epithelial cells with RTK pathway activation; possible EGFR-mutant subtype.

---

### 🔷 **Cluster 9 – Malignant epithelial cells (RTK pathway)**

* **Key markers**: `GPRC5A`, `EGFR`, `ERBB2/3/4`, `CEACAM6`, `ICAM1`, `EPCAM`
* **Notes**: Tumor cells with inflammatory and epithelial signatures; similar to 6.

---

### 🔷 **Cluster 10 – Mixed T cells and malignant hybrid**

* **Key markers**: Proliferation (`MKI67`, `TOP2A`, `UBE2C`), cytotoxic (`GZMA`, `CD8A`), epithelial (`EPCAM`, `MET`)
* **Notes**: Mixed; may contain tumor-infiltrating T cells and cycling tumor cells.

---

### ✅ **Cluster 1 – T cells (CD4⁺, CD8⁺, Tregs)**

* **Key markers**: `CD3E`, `CD2`, `CD8A`, `CD4`, `GZMA`, `PRF1`, `FOXP3`, `IL2RG`, `CTLA4`
* **Notes**: Cytotoxic, helper, and regulatory T cells.

---

### ✅ **Cluster 2 – B cells**

* **Key markers**: `CD19`, `MS4A1`, `CD79A/B`, `IGHM`, `IGHG1`, `PAX5`
* **Notes**: Naive, memory, and plasma B cells.

---

### ✅ **Cluster 4 – Macrophages / Monocytes**

* **Key markers**: `CD68`, `CD163`, `ITGAX`, `MPEG1`, `MS4A6A`, `APOE`
* **Notes**: Tumor-associated macrophages (TAMs), likely immunosuppressive.

---

### ✅ **Cluster 11 – Plasma B cells / Stromal mix**

* **Key markers**: `IGHG1-4`, `JCHAIN`, `XBP1`, `MZB1`, `PRDM1`
* **Notes**: Plasma cells with some stromal signatures (e.g. `DCN`, `FN1`).

---

### ✅ **Cluster 3 – Fibroblasts / CAFs (collagen-producing)**

* **Key markers**: `COL5A2`, `FN1`, `DCN`, `PDGFRA`, `SPARC`, `LUM`, `ACTA2`
* **Notes**: Cancer-associated fibroblasts (CAFs)

---

### ✅ **Cluster 5 – Endothelial cells**

* **Key markers**: `PECAM1`, `CD34`, `CLEC14A`, `PLVAP`, `FLT1`, `ANGPT2`, `RGS5`, `SOX18`
* **Notes**: Tumor vasculature / endothelial lineage

---

### ✅ **Cluster 7 – Mucinous / secretory epithelial cells or rare tumor cells**

* **Key markers**: `EPCAM`, `MUC5AC`, `AREG`, `IL1A`, `REG4`, `ERBB4`, `SOX2`
* **Notes**: Possibly airway secretory or variant malignant lineage

---

### ✅ **Cluster 8 – Fibroblasts / myeloid mix**

* **Key markers**: `SPARC`, `CRISPLD2`, `IGFBP7`, `DCN`, `LUM`, `PDGFRA`, `IL6`, `CD163`, `CXCL12`
* **Notes**: CAFs + inflammatory signals (TAM contamination possible)



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant proliferative epithelial', '1': 'T',
                                                      '2': 'B', '3': 'CAF', 
                         '4': 'Macrophage / Monocyte', '5': 'Endothelial', '6': 'Malignant EGFR/ERBB‚Å∫ epithelial',
                                                      '7': 'Mucinous / secretory epithelial', '8': 'CAF', 
                                                      '9': 'Malignant RTK epithelial', '10': 'T/Malignant', '11': 'Plasma',
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Lymphoid',
                                                      '2': 'Lymphoid', '3': 'Mesenchymal', 
                         '4': 'Myeloid', '5': 'Endothelial', '6': 'Malignant',
                                                      '7': 'Epithelial', '8': 'Mesenchymal', 
                                                      '9': 'Malignant', '10': 'Malignant', '11': 'Lymphoid',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI860

In [None]:
sample_id = 'NCBI860'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Sure — here is a **succinct cluster annotation** for your **lung fibrosis** dataset:

---

**Cluster 0** – *Myofibroblasts / Activated fibroblasts*

* Markers: `COL1A1`, `COL1A2`, `POSTN`, `PDGFRA`, `CTHRC1`, `FAP`, `SFRP2`
* ❌ Non-malignant

**Cluster 1** – *Alveolar epithelial cells (AT2 / mixed type)*

* Markers: `SFTPC`, `SFTPD`, `EPCAM`, `NKX2-1`, `AGER`, `CEACAM6`
* ⚠️ Possibly transformed if in fibrotic niche, but generally non-malignant

**Cluster 2** – *Mast cells / S100A8⁺ inflammatory cells*

* Markers: `S100A8`, `S100A9`, `TPSAB1`, `KIT`
* ❌ Non-malignant

**Cluster 3** – *Monocyte-derived macrophages (SPP1⁺/MARCO⁺)*

* Markers: `MARCO`, `CD68`, `SPP1`, `FABP4`, `TREM2`
* ❌ Non-malignant

**Cluster 4** – *Capillary / Endothelial cells*

* Markers: `PECAM1`, `CLDN5`, `RAMP2`, `ACKR1`
* ❌ Non-malignant

**Cluster 5** – *Pro-fibrotic macrophages*

* Markers: `LYZ`, `MRC1`, `COL1A1`, `S100A8`, `FCN1`
* ❌ Non-malignant

**Cluster 6** – *Pathogenic fibroblasts / myofibroblasts (SFRP4⁺)*

* Markers: `ACTA2`, `SFRP4`, `PDGFRB`, `SPARCL1`, `ZEB1`, `CCN2`
* ❌ Non-malignant

**Cluster 7** – *T cells (CD4⁺, CD8⁺, cytotoxic, and Tregs)*

* Markers: `CD3D`, `CD4`, `CD8A`, `GZMK`, `CTLA4`
* ❌ Non-malignant

**Cluster 8** – *KRT⁺ epithelial / basal-like cells (possibly bronchiolar)*

* Markers: `KRT5`, `KRT14`, `KRT17`, `CEACAM6`, `EPCAM`, `EGFR`, `TP63`
* ⚠️ Possibly malignant or dysplastic (basal cell metaplasia)

---

Let me know if you'd like help visualizing or scoring malignancy (e.g. inferCNV or cell-cycle).



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'AT2',
                                                      '2': 'Mast', '3': 'Monocyte-derived macrophage', 
                         '4': 'Capillary / Endothelial', '5': 'Pro-fibrotic macrophage', '6': 'Myofibroblast',
                                                      '7': 'T', '8': 'KRT‚Å∫ epithelial / basal-like', 
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Epithelial',
                                                      '2': 'Myeloid', '3': 'Myeloid', 
                         '4': 'Endothelial', '5': 'Myeloid', '6': 'Mesenchymal',
                                                      '7': 'Lymphoid', '8': 'Epithelial', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX121

In [None]:
sample_id = 'TENX121'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a succinct annotation of your **healthy liver tissue clusters**, ordered by cluster ID:

---

**Cluster 0** – *Hepatocytes*

* Markers: `CYP3A4`, `ADH4`, `TAT`, `CFHR1`, `CFHR3`, `AQP9`
* ❌ Non-malignant

**Cluster 1** – *Hepatocytes (zone 3 / periportal bias)*

* Markers: `CYP3A4`, `CFB`, `HPX`, `HMGCS2`, `ADH1C`, `KNG1`
* ❌ Non-malignant

**Cluster 2** – *Periportal hepatocytes / metabolic liver cells*

* Markers: `IGF1`, `CYP2B6`, `GATM`, `RBP5`, `PROX1`, `TFPI`
* ❌ Non-malignant

**Cluster 3** – *Kupffer cells / liver-resident macrophages*

* Markers: `CD68`, `CD163`, `MARCO`, `MRC1`, `MS4A6A`, `VSIG4`
* ❌ Non-malignant

**Cluster 4** – *Stellate cells / hepatic fibroblasts*

* Markers: `PDGFRA`, `ACTA2`, `FBN1`, `COL5A2`, `THY1`, `OGN`
* ❌ Non-malignant

**Cluster 5** – *Dendritic cells / monocyte-derived*

* Markers: `CLEC4E`, `CD1C`, `FCGR1A`, `FCN1`, `CD86`, `CSF2RA`
* ❌ Non-malignant

**Cluster 6** – *Endothelial cells (sinusoidal/vascular)*

* Markers: `LYVE1`, `APOLD1`, `PECAM1`, `RAMP2`, `CLEC14A`, `INMT`
* ❌ Non-malignant

**Cluster 7** – *T/NK cells*

* Markers: `CD3D`, `CD8A`, `GZMA`, `NKG7`, `KLRB1`, `PRF1`
* ❌ Non-malignant

**Cluster 8** – *Cholangiocytes / biliary epithelial cells*

* Markers: `EPCAM`, `KRT7`, `TM4SF4`, `GPX2`, `ERBB2`, `FSTL3`
* ❌ Non-malignant

---

All clusters here are consistent with **healthy liver cell types** and show **no evidence of malignancy**.



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Hepatocyte', '1': 'Hepatocyte',
                                                      '2': 'Periportal hepatocyte', '3': 'Kupffer', 
                         '4': 'Stellate', '5': 'Dendritic', '6': 'Endothelial',
                                                      '7': 'T/NK', '8': 'Cholangiocyte', 
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Epithelial', '1': 'Epithelial',
                                                      '2': 'Epithelial', '3': 'Myeloid', 
                         '4': 'Mesenchymal', '5': 'Myeloid', '6': 'Endothelial',
                                                      '7': 'Lymphoid', '8': 'Epithelial', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX95

In [None]:
sample_id = 'TENX95'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **succinct annotation** of your **breast cancer clusters**, in order:

---

**Cluster 0** – *Luminal tumor cells*

* Markers: `ESR1`, `PGR`, `FOXA1`, `GATA3`, `EPCAM`, `KRT8`, `CDH1`
* ✅ **Likely malignant**

**Cluster 1** – *Cancer-associated fibroblasts (CAFs)*

* Markers: `ACTA2`, `POSTN`, `PDGFRB`, `SFRP4`, `CCDC80`, `ZEB1`
* ⚠️ **Possibly tumor-supportive, not malignant**

**Cluster 2** – *Luminal tumor cells (high ER/PR)*

* Markers: Similar to cluster 0: `ESR1`, `PGR`, `EPCAM`, `ERBB2`
* ✅ **Likely malignant**

**Cluster 3** – *Luminal tumor cells (dividing/subtype variation)*

* Markers: `GATA3`, `FOXA1`, `ESR1`, `TACSTD2`, `CDH1`, `CCND1`
* ✅ **Likely malignant**

**Cluster 4** – *T/NK cells*

* Markers: `CD3E`, `CD8A`, `TRAC`, `GZMA`, `NKG7`, `CD4`
* ❌ Non-malignant

**Cluster 5** – *Myeloid/macrophages (M2-polarized)*

* Markers: `CD68`, `CD163`, `MRC1`, `FCER1G`, `CX3CR1`
* ❌ Non-malignant

**Cluster 6** – *Endothelial cells / pericytes*

* Markers: `PECAM1`, `VWF`, `RAMP2`, `ANGPT2`, `ACTA2`
* ❌ Non-malignant

**Cluster 7** – *Basal-like tumor cells / myoepithelial*

* Markers: `KRT5`, `KRT14`, `EGFR`, `ACTA2`, `MYLK`
* ✅ **Possibly malignant (basal subtype)**

**Cluster 8** – *B cells / plasma cells (mature and cycling)*

* Markers: `CD79A`, `CD27`, `TNFRSF17`, `MZB1`, `SLAMF7`
* ❌ Non-malignant

---

**Likely malignant clusters:** 0, 2, 3, 7
**Non-malignant or stromal/immune:** 1, 4, 5, 6, 8


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal', '1': 'CAF',
                                                      '2': 'Malignant luminal', '3': 'Malignant luminal dividing', 
                         '4': 'T/NK', '5': 'Macrophage M2', '6': 'Endothelial',
                                                      '7': 'Malignant basal-like', '8': 'B/Plasma', 
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Mesenchymal',
                                                      '2': 'Malignant', '3': 'Malignant', 
                         '4': 'Lymphoid', '5': 'Myeloid', '6': 'Endothelial',
                                                      '7': 'Malignant', '8': 'Lymphoid', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX138

In [None]:
sample_id = 'TENX138'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a concise annotation of your **brain cancer clusters**, in order:

---

**Cluster 0** – *Malignant glioma-like cells*

* Markers: `PDGFRA`, `EGFR`, `MKI67`, `CDK4`, `STMN1`, `CTNNB1`, `CDK1`, `KRAS`
* ✅ **Likely malignant**

**Cluster 1** – *Reactive astrocytes / mesenchymal-like malignant*

* Markers: `SPARCL1`, `APOE`, `ID4`, `SOCS3`, `STAT1/3`, `CD44`
* ⚠️ **Possibly malignant (mesenchymal)**

**Cluster 2** – *Mostly noise/control genes*

* Markers: Enriched in negative control probes
* 🚫 **Likely technical/noise**

**Cluster 3** – *Proliferative glioma or myeloid-like hybrid*

* Markers: `MKI67`, `CDK6`, `CDK1`, `TUBB`, `FCGR3A`, `CD74`, `EGFR`
* ✅ **Possibly malignant**

**Cluster 4** – *Myeloid cells (macrophages / microglia)*

* Markers: `CD163`, `CD68`, `CSF1R`, `ITGAX`, `FCGR2A`, `TREM2`, `AIF1`
* ❌ **Non-malignant**

**Cluster 5** – *Endothelial / pericyte-like cells*

* Markers: `PDGFRB`, `VWF`, `SPARC`, `CAV1`, `RAMP2`, `ACTA2`, `SPARCL1`
* ❌ **Non-malignant**

---

**Likely malignant clusters:** 0, 1 (partial), 3
**Non-malignant clusters:** 4, 5
**Likely noise/low-quality:** 2



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant glioma-like', '1': 'Malignant mesenchymal-like',
                                                      '2': 'Low-quality', '3': 'Malignant proliferative', 
                         '4': 'Myeloid', '5': 'Endothelial',
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Low-quality', '3': 'Malignant', 
                         '4': 'Myeloid', '5': 'Endothelial',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI875

In [None]:
sample_id = 'NCBI875'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a succinct annotation of the **lung fibrosis clusters**, in order:

---

**Cluster 0** – *Endothelial / mesenchymal hybrid*

* Markers: `PECAM1`, `KDR`, `ACTA2`, `POSTN`, `TGFB1`, `ZEB1`
* ⚠️ **Possible mesenchymal transition; mildly profibrotic**

**Cluster 1** – *Inflammatory myeloid cells*

* Markers: `S100A8/9/12`, `FCGR3A`, `LYZ`, `PTPRC`, `ITGAX`
* ❌ **Non-malignant**

**Cluster 2** – *Activated fibroblasts / myofibroblasts*

* Markers: `COL1A1`, `FN1`, `PDGFRA/B`, `ACTA2`, `CCN2`, `SPARCL1`
* ⚠️ **Pathogenic, but not malignant**

**Cluster 3** – *Alveolar epithelial cells (AT2-like)*

* Markers: `SFTPC`, `SFTPD`, `NAPSA`, `EPCAM`, `NKX2-1`
* ❌ **Non-malignant**

**Cluster 4** – *Macrophages / profibrotic immune cells*

* Markers: `CD68`, `MARCO`, `FABP4`, `TGFB1`, `CCL18`, `MRC1`
* ❌ **Non-malignant**

**Cluster 5** – *Pathogenic fibroblasts / myofibroblasts*

* Markers: `ACTA2`, `COL3A1`, `ITGB1`, `POSTN`, `YAP1`, `SPARCL1`
* ⚠️ **Profibrotic, but not malignant**

**Cluster 6** – *Dysregulated epithelial / transitional state*

* Markers: `CEACAM6`, `ITGB6`, `AGR3`, `EGFR`, `YAP1`, `TGFB2`
* ✅ **Potentially malignant-like / aberrant epithelial**

**Cluster 7** – *T/NK cells*

* Markers: `CD3E`, `TRAC`, `GZMA/B/K`, `CD8A`, `NKG7`
* ❌ **Non-malignant**

**Cluster 8** – *Secretory epithelial / club cells*

* Markers: `SCGB1A1`, `KRT8/18`, `SOX2`, `WWTR1`, `TP63`, `YAP1`
* ⚠️ **Aberrant epithelium, but not overtly malignant**

---

**Likely malignant-like / dysplastic:** 6
**Profibrotic / pathogenic but not malignant:** 2, 5
**Non-malignant (immune or normal epithelial):** 1, 3, 4, 7, 8
**Possible transition / EMT features:** 0




In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Inflammatory myeloid',
                                                      '2': 'Myofibroblast', '3': 'AT2', 
                         '4': 'Macrophage', '5': 'Myofibroblast', '6': 'Epithelial dysregulated', '7': 'T/NK',
                                                      '8': 'Secretory epithelial'
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Myeloid',
                                                      '2': 'Mesenchymal', '3': 'Epithelial', 
                         '4': 'Myeloid', '5': 'Mesenchymal', '6': 'Epithelial', '7': 'Lymphoid',
                                                      '8': 'Epithelial'
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX94

In [None]:
sample_id = 'TENX94'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct cluster annotation** in the context of **breast cancer**, with malignancy indications:

---

**Cluster 0** – *Luminal epithelial (ER+/HER2+)*

* Markers: `FOXA1`, `ESR1`, `ERBB2`, `EPCAM`, `GATA3`, `CCND1`
* ✅ **Malignant**

**Cluster 1** – *Similar luminal epithelial / noise mix*

* Markers: `ESR1`, `ANKRD30A`, `FOXA1`, plus many blanks/controls
* ✅ **Malignant** (but contains technical noise)

**Cluster 2** – *Myeloid / macrophage lineage*

* Markers: `CD68`, `FCGR3A`, `CD14`, `CD163`, `C1QA`, `MRC1`
* ❌ **Non-malignant**

**Cluster 3** – *Cancer-associated fibroblasts (CAFs)*

* Markers: `POSTN`, `PDGFRA`, `SFRP4`, `ZEB1`, `IGF1`, `ACTA2`
* ⚠️ **Not malignant, but tumor-supportive**

**Cluster 4** – *Luminal epithelial (mixed state)*

* Markers: `ESR1`, `EPCAM`, `FOXA1`, `PDGFRB`, `GATA3`, `CCND1`
* ✅ **Malignant**

**Cluster 5** – *Endothelial/perivascular-like*

* Markers: `PECAM1`, `VWF`, `CAV1`, `KDR`, `MMRN2`, `SOX17`
* ❌ **Non-malignant**

**Cluster 6** – *T cells / cytotoxic / regulatory mix*

* Markers: `CD3E`, `CD4`, `CD8A`, `GZMA`, `CXCR4`, `FOXP3`
* ❌ **Non-malignant**

---

**Likely malignant clusters:** 0, 1, 4
**Tumor-supportive CAFs:** 3
**Immune / endothelial (non-malignant):** 2, 5, 6


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal', '1': 'Malignant luminal',
                                                      '2': 'Macrophage', '3': 'CAF', 
                         '4': 'Malignant luminal', '5': 'Endothelial', '6': 'T', 
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Myeloid', '3': 'Mesenchymal', 
                         '4': 'Malignant', '5': 'Endothelial', '6': 'Lymphoid', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'TENX149'

In [None]:
sample_id = 'TENX149'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a concise assessment of the **clusters in colorectal cancer**, with malignancy indications:

---

**Cluster 0** – *Cancer-associated fibroblasts (CAFs)*

* Markers: `FAP`, `MMP11`, `INHBA`, `VCAN`, `PDGFRA`, `THBS1`, `GREM1`
* ⚠️ **Not malignant**, but **tumor-supportive**

**Cluster 1** – *Proliferative epithelial / cancer-like*

* Markers: `LGR5`, `MYC`, `SOX9`, `MKI67`, `UBE2C`, `AXIN2`, `RNF43`
* ✅ **Malignant**

**Cluster 2** – *T cells / B cells (immune infiltrate)*

* Markers: `CD3D`, `CD8A`, `CXCL13`, `CD79A`, `TRAC`, `GZMK`
* ❌ **Non-malignant**

**Cluster 3** – *Colorectal epithelial (cancer-like)*

* Markers: `OLFM4`, `CEACAM5`, `LGR5`, `RNF43`, `ASCL2`, `MYC`
* ✅ **Malignant**

**Cluster 4** – *Differentiated enterocytes / TA-like*

* Markers: `CEACAM1`, `MUC17`, `SLC12A2`, `CD24`, `REG1A`
* ⚠️ **Borderline; could include cancer-adjacent cells**

**Cluster 5** – *Tumor-associated macrophages (TAMs)*

* Markers: `CD68`, `CD163`, `C1QA`, `FCGR3A`, `MRC1`, `APOE`, `MMP9`
* ❌ **Non-malignant**, but **tumor-supportive**

**Cluster 6** – *Mature colonocytes / goblet-like*

* Markers: `MUC5B`, `REG4`, `PIGR`, `BEST2`, `CLCA1`, `CA2`
* ❌ **Non-malignant**

**Cluster 7** – *Endothelial / vascular-like*

* Markers: `PECAM1`, `VWF`, `PLVAP`, `CDH5`, `NOTCH3`, `ACTA2`
* ❌ **Non-malignant**

**Cluster 8** – *Myofibroblasts / CAFs*

* Markers: `ACTA2`, `TAGLN`, `FAP`, `THBS4`, `GREM1`, `CXCL12`
* ⚠️ **Not malignant**, but **tumor-supportive**

**Cluster 9** – *Absorptive epithelium / colonocytes*

* Markers: `PIGR`, `FABP1`, `CEACAM7`, `CA1`, `MUC12`, `SLC26A3`
* ❌ **Non-malignant**

**Cluster 10** – *Plasma / B cell-like (possibly TLS-related)*

* Markers: `IGHG3`, `MZB1`, `CD79A`, `TNFRSF17`, `CD27`
* ❌ **Non-malignant**

---

**Likely malignant clusters:** 1, 3
**Tumor-supportive (but not malignant):** 0, 5, 8
**Immune / endothelial / normal epithelial:** 2, 4, 6, 7, 9, 10



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'CAF', '1': 'Malignant proliferative',
                                                      '2': 'T/B', '3': 'Malignant', 
                         '4': 'Malignant TA-like', '5': 'Macrophage', '6': 'Mature colonocyte', '7': 'Endothelial',
                                                     '8':'Myofibroblast', '9': 'Absorptive epithelium', '10': 'Plasma/B'
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Malignant',
                                                      '2': 'Lymphoid', '3': 'Malignant', 
                         '4': 'Malignant', '5': 'Myeloid', '6': 'Epithelial', '7': 'Endothelial',
                                                     '8':'Mesenchymal', '9': 'Epithelial', '10': 'Lymphoid'
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI880

In [None]:
sample_id = 'NCBI880'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct malignancy assessment** of clusters in the **lung fibrosis** context:

---

**Cluster 0** – Alveolar/secretory epithelial (e.g. `SCGB1A1`, `MUC5B`, `KRT18`)
→ ❌ Non-malignant

**Cluster 1** – Myofibroblasts (`ACTA2`, `COL1A1`, `TGFB3`, `FAP`)
→ ⚠️ Tumor-supportive, not malignant

**Cluster 2** – Endothelial cells (`PECAM1`, `PLVAP`, `CLDN5`)
→ ❌ Non-malignant

**Cluster 3** – Activated fibroblasts (`SPARCL1`, `PDGFRB`, `ZEB1`, `RSPO3`)
→ ⚠️ Tumor-supportive, not malignant

**Cluster 4** – Basal-like epithelial (`KRT5`, `TP63`, `EGFR`, `MMP7`, `ITGB6`)
→ ✅ Likely malignant

**Cluster 5** – M2-like macrophages (`CD68`, `CD163`, `FCER1G`)
→ ❌ Non-malignant

**Cluster 6** – T cells (`CD3E`, `CD8A`, `GZMA`, `CTLA4`)
→ ❌ Non-malignant

**Cluster 7** – Macrophages (M2-like, `CD68`, `SPP1`, `FCGR3A`, `TREM2`)
→ ❌ Non-malignant

**Cluster 8** – Aberrant AT2-like epithelial (`SFTPC`, `CEACAM5/6`, `EGFR`, `MMP7`)
→ ✅ Likely malignant

**Cluster 9** – ER stress / plasma-like (`XBP1`, `HSPA5`, `SEC11C`, `CD79A`)
→ ❌ Non-malignant

**Cluster 10** – Fibro-mast hybrid (`PDGFRA`, `PTGDS`, `COL1A1`, `CPA3`)
→ ⚠️ Tumor-supportive, not malignant

**Cluster 11** – Inflammatory monocytes (`S100A8`, `S100A9`, `ITGAX`)
→ ❌ Non-malignant

**Cluster 12** – Endothelial / mesenchymal hybrid (`PECAM1`, `VIM`, `COL1A2`, `CTHRC1`)
→ ❌ Non-malignant

---

**Likely malignant clusters:** 4, 8
**Tumor-supportive but non-malignant:** 1, 3, 10
**Immune/endothelial/normal epithelial:** 0, 2, 5–7, 9, 11, 12




In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Epithelial alveolar/secretory', '1': 'Myofibroblast',
                                                      '2': 'Endothelial', '3': 'Activated fibroblast', 
                         '4': 'Epithelial basal-like', '5': 'Macrophage M2', '6': 'T', '7': 'Macrophage M2',
                                                     '8':'AT2', '9': 'Plasma', '10': 'Fibro-mast', '11': 'Inflammatory monocyte',
                                                      '12': 'Endothelial',
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Epithelial', '1': 'Mesenchymal',
                                                      '2': 'Endothelial', '3': 'Mesenchymal', 
                         '4': 'Epithelial', '5': 'Myeloid', '6': 'Lymphoid', '7': 'Myeloid',
                                                     '8':'Epithelial', '9': 'Lymphoid', '10': 'Mesenchymal', '11': 'Myeloid',
                                                      '12': 'Endothelial',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'NCBI861'

In [None]:
sample_id = 'NCBI861'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of each cluster in the context of **lung fibrosis**:

---

**Cluster 0** – *Myofibroblasts / activated fibroblasts*
→ `ACTA2`, `COL1A1`, `FAP`, `POSTN`, `PDGFRA`
→ ⚠️ **Tumor-supportive**

**Cluster 1** – *Club-like secretory epithelial*
→ `SCGB3A2`, `MMP7`, `CEACAM6`, `EGFR`, `KRT8`
→ ✅ **Aberrant epithelium**

**Cluster 2** – *Ciliated / secretory epithelial*
→ `SCGB1A1`, `FOXJ1`, `MUC5B`, `TP73`
→ ❌ **Normal epithelium**

**Cluster 3** – *T cells / cytotoxic and regulatory mix*
→ `CD3E`, `CD8A`, `GZMA`, `FOXP3`, `CTLA4`
→ ❌ **Immune**

**Cluster 4** – *Endothelial cells*
→ `PECAM1`, `PLVAP`, `CLDN5`, `RAMP2`
→ ❌ **Non-malignant**

**Cluster 5** – *Macrophages (M2-like, fibrotic)*
→ `CD68`, `SPP1`, `FCGR3A`, `MS4A7`, `TREM2`
→ ❌ **Immune**

**Cluster 6** – *Alveolar type II (AT2) epithelial*
→ `SFTPC`, `NAPSA`, `AGER`, `DMBT1`
→ ✅ **Aberrant epithelium**

**Cluster 7** – *Basal epithelial (proliferative)*
→ `KRT5`, `TP63`, `KRT14`, `EGFR`, `ITGB6`
→ ✅ **Aberrant epithelium**

**Cluster 8** – *Mast / fibro-hybrid*
→ `TPSAB1`, `CPA3`, `PDGFRA`, `COL1A2`, `JCHAIN`
→ ⚠️ **Tumor-supportive**

---

**Aberrant/likely malignant:** 1, 6, 7
**Fibrotic/tumor-supportive:** 0, 8
**Normal/immune/endothelial:** 2, 3, 4, 5


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'Epithelial club-like secretory',
                                                      '2': 'Epithelial ciliated / secretory', '3': 'T', 
                         '4': 'Endothelial', '5': 'Macrophage M2', '6': 'AT2', '7': 'Epithelial basal-like',
                                                     '8':'Fibro/Mast', 
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Epithelial',
                                                      '2': 'Epithelial', '3': 'Lymphoid', 
                         '4': 'Endothelial', '5': 'Myeloid', '6': 'Epithelial', '7': 'Epithelial',
                                                     '8':'Mesenchymal', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'TENX97'

In [None]:
sample_id = 'TENX97'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a succinct annotation of each cluster in the context of **breast cancer**:

---

**Cluster 0** – *Luminal epithelial (ER+)*
→ `ESR1`, `FOXA1`, `PGR`, `GATA3`, `EPCAM`
→ ✅ **Malignant (luminal subtype)**

**Cluster 1** – *Cancer-associated fibroblasts (CAFs)*
→ `POSTN`, `ACTA2`, `PDGFRB`, `FBLN1`, `SFRP4`
→ ⚠️ **Tumor-supportive**

**Cluster 2** – *Luminal epithelial (ER+)*
→ `ESR1`, `GATA3`, `FOXA1`, `PGR`, `MUC1`
→ ✅ **Malignant (luminal subtype)**

**Cluster 3** – *T cells (CD8+, CD4+), B cells*
→ `CD3E`, `CD8A`, `CD4`, `MS4A1`, `PTPRC`
→ ❌ **Non-malignant immune**

**Cluster 4** – *Luminal epithelial (ER+)*
→ `GATA3`, `ESR1`, `EPCAM`, `AGR3`, `CDH1`
→ ✅ **Malignant (luminal subtype)**

**Cluster 5** – *Mixed luminal epithelial + CAFs*
→ `ESR1`, `EPCAM`, `POSTN`, `ACTA2`, `CTHRC1`
→ ✅ **Malignant (luminal) + ⚠️ tumor-supportive fibroblasts**

**Cluster 6** – *Myeloid cells / macrophages (TAMs)*
→ `CD68`, `CD14`, `MRC1`, `CXCL12`, `AIF1`
→ ❌ **Non-malignant immune**

**Cluster 7** – *Endothelial cells*
→ `PECAM1`, `VWF`, `CAV1`, `RAMP2`, `KDR`
→ ❌ **Non-malignant stromal**

**Cluster 8** – *Basal-like / myoepithelial*
→ `KRT14`, `KRT5`, `ACTA2`, `EGFR`, `FOXC2`
→ ✅ **Possible malignant (basal subtype)**

---

**Malignant clusters**: 0, 2, 4, 5, 8
**Tumor-supportive stromal**: 1, 5
**Non-malignant immune/stromal**: 3, 6, 7


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal', '1': 'CAF',
                                                      '2': 'Malignant luminal', '3': 'T/B', 
                         '4': 'Malignant luminal', '5': 'Malignant luminal + stromal', '6': 'Macrophage', '7': 'Endothelial',
                                                     '8':'Malignant basal-like', 
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Mesenchymal',
                                                      '2': 'Malignant', '3': 'Lymphoid', 
                         '4': 'Malignant', '5': 'Malignant', '6': 'Myeloid', '7': 'Endothelial',
                                                     '8':'Malignant', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'TENX133'

In [None]:
sample_id = 'TENX133'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of each cluster in the context of **human bone marrow**:

---

**Cluster 0** – *Proliferating erythroid precursors*
→ `AHSP`, `ALAS2`, `GYPA`, `SLC4A1`, `MKI67`, `TOP2A`
→ ✅ **Normal hematopoiesis**

**Cluster 1** – *Mature neutrophils / granulocytes*
→ `CTSG`, `CEACAM8`, `LTF`, `MMP8`, `RETN`, `CHIT1`
→ ❌ **Normal myeloid**

**Cluster 2** – *Immature neutrophils / promyelocytes*
→ `S100A12`, `MMP8`, `FCN1`, `ARG1`, `OLFM4`, `MNDA`
→ ❌ **Normal myeloid**

**Cluster 3** – *Mesenchymal stromal / fibroblasts*
→ `CXCL12`, `COL1A1`, `PDGFRB`, `SPARC`, `VCAN`
→ ❌ **Non-hematopoietic support**

**Cluster 4** – *B lineage / plasma cells*
→ `MS4A1`, `CD79A`, `MZB1`, `TCL1A`, `SLAMF7`
→ ❌ **Normal lymphoid**

**Cluster 5** – *Late erythroid cells*
→ `ALAS2`, `GYPA`, `GYPB`, `HEMGN`, `SLC4A1`
→ ✅ **Normal hematopoiesis**

**Cluster 6** – *Cytotoxic T / NK-like cells*
→ `CD8A`, `PRF1`, `GZMA`, `IL2RA`, `CD3E`
→ ❌ **Normal lymphoid**

**Cluster 7** – *Monocytes / early macrophages*
→ `S100A12`, `CD14`, `MS4A6A`, `CD68`, `SPI1`
→ ❌ **Normal myeloid**

**Cluster 8** – *Endothelial cells*
→ `VWF`, `PECAM1`, `ADGRL4`, `ENG`, `EGFL7`
→ ❌ **Non-hematopoietic stromal**

**Cluster 9** – *Mature neutrophils / granulocytes*
→ `MMP8`, `CEACAM8`, `ARG1`, `OLFM4`, `CHIT1`
→ ❌ **Normal myeloid**

**Cluster 10** – *Vascular / stromal mix (mature)*
→ `PECAM1`, `VWF`, `CAVIN1`, `MMRN1`, `CD68`
→ ❌ **Non-hematopoietic stromal**

**Cluster 11** – *Mixed lymphoid (B + T)*
→ `CD79A`, `MS4A1`, `CD247`, `CD2`, `GZMA`
→ ❌ **Normal lymphoid**

**Cluster 12** – *Mesenchymal progenitors / osteolineage*
→ `COL1A1`, `SPP1`, `IBSP`, `RUNX2`, `PDGFRA`
→ ❌ **Non-hematopoietic stromal**

---

**Malignant-appearing clusters**: None clearly malignant
**Proliferative but normal**: 0, 5
**Non-hematopoietic / stromal**: 3, 8, 10, 12
**Immune (myeloid/lymphoid)**: 1, 2, 4, 6, 7, 9, 11


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Proliferating erythroid precursor', '1': 'Mature neutrophil',
                                                      '2': 'Immature neutrophil', '3': 'Mesenchymal stromal', 
                         '4': 'Plasma/B', '5': 'Late erythroid', '6': 'Cytotoxic T/NK', '7': 'Monocyte',
                                                     '8':'Endothelial', '9': 'Mature neutrophil', '10': 'Vascular / stromal', 
                                                      '11': 'Mixed lymphoid', '12': 'Mesenchymal progenitor'
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Myeloid', '1': 'Myeloid',
                                                      '2': 'Myeloid', '3': 'Mesenchymal', 
                         '4': 'Lymphoid', '5': 'Myeloid', '6': 'Lymphoid', '7': 'Myeloid',
                                                     '8':'Endothelial', '9': 'Myeloid', '10': 'Mesenchymal', 
                                                      '11': 'Lymphoid', '12': 'Mesenchymal'
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX116

In [None]:
sample_id = 'TENX116'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation of clusters** in the context of **pancreatic cancer (PDAC)**:

---

### **Cluster 0** – *Tumor-infiltrating immune cells (T cells, macrophages, myeloid)*

→ `PTPRC`, `CD68`, `CD14`, `CD8A`, `MS4A6A`, `CXCR4`, `HAVCR2`
→ ❌ **Non-malignant (immune infiltrate)**

### **Cluster 1** – *Acinar / digestive epithelial cells*

→ `AMY2A`, `ANPEP`, `SERPINB1`, `GSTA1`, `DPEP1`
→ ✅ **Possibly malignant (tumor-transformed acinar)**

### **Cluster 2** – *Epithelial / ductal-like tumor cells*

→ `EPCAM`, `CXCL6`, `ERBB2`, `KRT7`, `FSTL3`
→ ✅ **Likely malignant**

### **Cluster 3** – *Cancer-associated fibroblasts (CAFs)*

→ `PDGFRA`, `PDGFRB`, `ACTA2`, `FAP`, `CTHRC1`, `SFRP2`
→ ⚠️ **Tumor-supportive, non-malignant CAFs**

### **Cluster 4** – *Endothelial cells*

→ `PECAM1`, `VWF`, `CD34`, `RAMP2`, `EGFL7`
→ ❌ **Non-malignant vascular**

### **Cluster 5** – *Neuroendocrine / islet-like cells*

→ `INS`, `GCG`, `SST`, `CHGA`, `SCGN`
→ ⚠️ **Possibly malignant if neuroendocrine tumor (PanNET)**

### **Cluster 6** – *Malignant epithelial (mixed ductal / stress response)*

→ `EPCAM`, `CFTR`, `TMC5`, `ERBB2`, `MET`, `KRT7`, `SERPING1`
→ ✅ **Likely malignant**

---

### Summary:

* ✅ **Likely malignant**: Clusters **1, 2, 6**
* ⚠️ **Tumor-supportive / possibly malignant**: Clusters **3, 5**
* ❌ **Non-malignant (immune or stromal)**: Clusters **0, 4**


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Mixed myeloid & lymphoid', '1': 'Malignant acinar/digestive epithelial',
                                                      '2': 'Malignant ductal-like epithelial', '3': 'CAF', 
                         '4': 'Endothelial', '5': 'Neuroendocrine/islet', '6': 'Malignant ductal/stress epithalial',
                                                    })

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Myeloid/Lymphoid', '1': 'Malignant',
                                                      '2': 'Malignant', '3': 'Mesenchymal', 
                         '4': 'Endothelial', '5': 'Epithelial', '6': 'Malignant',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX99

In [None]:
sample_id = 'TENX99'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a succinct malignancy-oriented annotation of the **breast cancer clusters**:

---

### **Cluster 0** – *Luminal epithelial / ER+ tumor cells*

→ `ESR1`, `PGR`, `FOXA1`, `GATA3`, `CDH1`, `ERBB2`, `CCND1`
→ ✅ **Likely malignant**

### **Cluster 1** – *Cancer-associated fibroblasts (CAFs)*

→ `ACTA2`, `PDGFRB`, `CXCL12`, `POSTN`, `SFRP4`, `ZEB1`
→ ⚠️ **Tumor-supportive, non-malignant**

### **Cluster 2** – *Proliferative luminal epithelial / tumor cells*

→ Similar to Cluster 0, with `TPD52`, `KRT8`, `GATA3`, `ERBB2`, `AGR3`
→ ✅ **Likely malignant**

### **Cluster 3** – *Myeloid cells / macrophages*

→ `CD68`, `CD14`, `PTPRC`, `FCGR3A`, `CXCL16`, `CD163`
→ ❌ **Non-malignant immune infiltrate**

### **Cluster 4** – *T cells / NK / B cells*

→ `CD3E`, `GZMA`, `NKG7`, `CD8A`, `CD79A`, `FOXP3`, `PRF1`
→ ❌ **Non-malignant lymphoid**

### **Cluster 5** – *Endothelial / perivascular cells*

→ `PECAM1`, `VWF`, `CAV1`, `ANGPT2`, `EGFL7`, `MMRN2`
→ ❌ **Non-malignant vascular**

### **Cluster 6** – *Basal-like / myoepithelial tumor cells*

→ `KRT5`, `KRT14`, `EGFR`, `KRT6B`, `DST`, `ALDH1A3`
→ ✅ **Likely malignant (basal subtype)**

---

### Summary:

* ✅ **Likely malignant**: Clusters **0, 2, 6**
* ⚠️ **Tumor-supportive stromal**: Cluster **1**
* ❌ **Non-malignant (immune/stromal)**: Clusters **3, 4, 5**



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal', '1': 'CAF',
                                                      '2': 'Malignant proliferative luminal', '3': 'Macrophage', 
                         '4': 'T/NK/B', '5': 'Endothelial', '6': 'Malignant basal-like',
                                                    })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Mesenchymal',
                                                      '2': 'Malignant', '3': 'Myeloid', 
                         '4': 'Lymphoid', '5': 'Endothelial', '6': 'Malignant',
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'NCBI879'

In [None]:
sample_id = 'NCBI879'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **succinct annotation of clusters in lung fibrosis**, with emphasis on cell type identity and disease relevance:

---

### **Cluster 0** – *Endothelial cells*

→ `PECAM1`, `EPAS1`, `CD34`, `KDR`, `CLDN5`, `RAMP2`
→ ❌ **Non-fibrotic vasculature**

### **Cluster 1** – *Low-confidence / control codewords & mast cells*

→ `TPSAB1`, `CPA3`, `IRF7`, many unassigned markers
→ ⚠️ **Likely noise or rare immune (mast-like)**

### **Cluster 2** – *Activated fibroblasts / myofibroblasts*

→ `COL1A1`, `COL1A2`, `FN1`, `ACTA2`, `PDGFRA`, `YAP1`, `SFRP4`
→ ✅ **Pro-fibrotic core myofibroblasts**

### **Cluster 3** – *AT2 / epithelial cells (injured/pro-fibrotic)*

→ `SFTPC`, `SFTPD`, `EPCAM`, `DUOX1`, `YAP1`, `KRT18`, `EGFR`
→ ⚠️ **Pro-fibrotic alveolar epithelial (aberrant AT2)**

### **Cluster 4** – *Monocyte-derived macrophages / Mo-Macs*

→ `CD68`, `CD14`, `MS4A7`, `FCGR3A`, `AIF1`, `MRC1`, `TGFB1`
→ ⚠️ **Pro-fibrotic immune (Mo-Macs)**

### **Cluster 5** – *Inflammatory monocytes / granulocytes*

→ `S100A8`, `S100A9`, `ITGAX`, `LYZ`, `FCN1`, `TGFB1`, `IL1B`
→ ⚠️ **Inflammatory / fibrotic amplifiers**

### **Cluster 6** – *T cells (CD8+, cytotoxic, exhausted)*

→ `CD3E`, `CD8A`, `GZMB`, `CTLA4`, `TRAC`, `LEF1`, `KLRG1`
→ ❌ **Non-fibrotic lymphoid**

### **Cluster 7** – *TAM-like macrophages / M2-polarized*

→ `CD68`, `MRC1`, `HAVCR2`, `PPARG`, `TGFB1`, `TREM2`, `HLA-DRA`
→ ⚠️ **Fibrosis-associated macrophages (M2/TAM-like)**

### **Cluster 8** – *Aberrant epithelial / basal-like cells*

→ `EPCAM`, `CEACAM6`, `AGER`, `KRT8`, `FN1`, `YAP1`, `ITGB6`
→ ✅ **Pathogenic basaloid epithelial cells**

### **Cluster 9** – *Myofibroblasts (matrix-producing, invasive)*

→ `COL1A1`, `ACTA2`, `SPARCL1`, `YAP1`, `AXL`, `ITGB1`, `ZEB1`
→ ✅ **Pathogenic myofibroblasts**

### **Cluster 10** – *Club / secretory cells (dysregulated)*

→ `SCGB1A1`, `EPCAM`, `AGR3`, `KRT8`, `SOX2`, `YAP1`, `IDH1`
→ ⚠️ **Pro-fibrotic secretory phenotype**

### **Cluster 11** – *Hybrid epithelial / transitional cells*

→ `COL1A1`, `KRT8`, `SFRP2`, `WNT5A`, `SPINK1`, `DMBT1`, `SNAI2`
→ ✅ **Aberrant transitional epithelium**

### **Cluster 12** – *Vascular-associated fibroblasts / pericytes*

→ `GNG11`, `PDGFRB`, `PECAM1`, `RAMP2`, `ZEB1`, `TGFB1`, `FABP4`
→ ⚠️ **Pro-fibrotic fibrovascular**

---

### Summary:

✅ **Pro-fibrotic / pathogenic**: Clusters **2, 8, 9, 11**
⚠️ **Fibrosis-associated or supportive**: Clusters **1, 3, 4, 5, 7, 10, 12**
❌ **Non-fibrotic (immune or vascular)**: Clusters **0, 6**

In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Noise',
                                                      '2': 'Myofibroblast', '3': 'AT2', 
                         '4': 'Monocyte-derived macrophage', '5': 'Inflammatory monocyte', '6': 'T cytotoxic', 
                                                      '7': 'Macrophage M2', '8': 'Epithelial basal-like', 
                                                      '9': 'Myofibroblast', "10": "Epithelial club / secretory", 
                                                      "11": "Hybrid epithelial / transitional", "12": "Vascular-associated fibroblast"
                                                    })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Noise',
                                                      '2': 'Mesenchymal', '3': 'Epithelial', 
                         '4': 'Myeloid', '5': 'Myeloid', '6': 'Lymphoid', 
                                                      '7': 'Myeloid', '8': 'Epithelial', 
                                                      '9': 'Mesenchymal', "10": "Epithelial", 
                                                      "11": "Epithelial", "12": "Mesenchymal"
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# 'NCBI864'

In [None]:
sample_id = 'NCBI864'

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct cluster-wise cell type annotation** for the **lung fibrosis context**, with key marker genes for each:

---

* **Cluster 0**: **Myofibroblasts**
  ↳ *COL1A1, COL1A2, ACTA2, FN1, PDGFRA, SFRP2*

* **Cluster 1**: **T Cells (CD4⁺, CD8⁺, Regulatory, Cytotoxic)**
  ↳ *CD3E, CD2, CD8A, FOXP3, CTLA4, GZMK*

* **Cluster 2**: **Monocyte-derived Macrophages / Antigen-Presenting Cells**
  ↳ *HLA-DRA, CD68, FCER1G, CD14, MRC1, ITGAM*

* **Cluster 3**: **Alveolar Type II (AT2) Epithelial Cells**
  ↳ *SFTPC, NAPSA, LAMP3, EPCAM, CEACAM6*

* **Cluster 4**: **Endothelial Cells (vascular/lymphatic mix)**
  ↳ *PECAM1, CD34, CLDN5, PLVAP, RAMP2, EPAS1*

* **Cluster 5**: **Transitional / Mixed Epithelial-Mesenchymal Cells**
  ↳ *SFTPC, KRT8, EPCAM, SPARCL1, FN1, ITGA3*

* **Cluster 6**: **M2-like Alveolar Macrophages**
  ↳ *CD68, MRC1, CCL18, FABP4, PPARG, TREM2*

* **Cluster 7**: **Inflammatory Monocytes / Macrophages**
  ↳ *S100A8, S100A9, IL1B, FCN1, HMOX1, CCL2*

* **Cluster 8**: **Mast Cells / Stromal Hybrid**
  ↳ *CPA3, TPSAB1, KIT, FCER1A, LUM, PTGDS*

* **Cluster 9**: **Plasma / B Cells (including ER-stressed / active state)**
  ↳ *CD79A, JCHAIN, XBP1, TNFRSF17, HSPA5, PRDX4*

---

Let me know if you’d like to merge or relabel any categories!


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'T cytotoxic',
                                                      '2': 'Monocyte-derived Macrophage', '3': 'AT2', 
                         '4': 'Endothelial', '5': 'Transitional / Mixed Epithelial', '6': 'Macrophage M2', 
                                                      '7': 'Inflammatory Monocyte', '8': 'Mast / Stromal', 
                                                      '9': 'Plasma / B', 
                                                    })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Lymphoid',
                                                      '2': 'Myeloid', '3': 'Epithelial', 
                         '4': 'Endothelial', '5': 'Epithelial', '6': 'Myeloid', 
                                                      '7': 'Myeloid', '8': 'Myeloid', 
                                                      '9': 'Lymphoid', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# "TENX132"

In [None]:
sample_id = "TENX132"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **succinct annotation of bone marrow clusters**, with focus on cell identity and malignant relevance:

---

### **Cluster 0** – *Immature or stress hematopoietic cells*

→ `PTPRC`, `GYPA`, `CD2`, `IL7R`, `AGR3`, `CXCL9`, `HOXA10`
→ ⚠️ Mixed lineage/stress signals; possible **pre-leukemic or early progenitor-like**

---

### **Cluster 1** – *Osteolineage / stromal cells*

→ `COL1A1`, `RUNX2`, `CTSK`, `SFRP4`, `PDGFRA`, `CDH11`
→ ❌ **Non-malignant osteogenic mesenchyme**

---

### **Cluster 2** – *Osteocytes / niche mesenchyme*

→ `MEPE`, `SOST`, `DMP1`, `OGN`, `DKK1`, `HOXA10`
→ ❌ **Bone-resident / niche-supporting cells**

---

### **Cluster 3** – *Heterogeneous immune & stromal signals*

→ `FOXP2`, `IGF1`, `CNMD`, `IRF8`, `CD24`, `SLC4A1`
→ ⚠️ Possibly stromal/immune mix; unlikely malignant

---

### **Cluster 4** – *Proliferative / stress erythroid-like*

→ `GYPA`, `SLC4A1`, `PRF1`, `PCNA`, `IL2RA`, `SPP1`, `LAG3`
→ ⚠️ High proliferation and stress markers; **monitor for dysplasia/malignancy**

---

### **Cluster 5** – *Endothelial / perivascular niche*

→ `VWF`, `PECAM1`, `ENG`, `GAS6`, `CD34`, `CALD1`, `TCF4`
→ ❌ **Normal vascular niche**

---

### **Cluster 6** – *Adipocyte-associated vasculature*

→ `FABP4`, `CD34`, `CAV1`, `PPARG`, `ADIPOQ`, `TENT5C`
→ ❌ **Niche-supportive vasculature/adipogenic**

---

### **Cluster 7** – *Bone-lining mesenchyme / osteoblast-like*

→ `COL1A1`, `COMP`, `SFRP2`, `FBLN1`, `SPP1`, `CDH11`
→ ❌ **Normal stromal**

---

### **Cluster 8** – *Mixed / low-confidence signals*

→ `SFRP2`, `SPINK2`, `PDCD1`, `DKK1`, `COL10A1`, `PRRX1`
→ ⚠️ Possibly disordered mesenchyme; **requires context**

---

### **Cluster 9** – *Pericytes / endothelial with ACTA2+*

→ `ACTA2`, `PECAM1`, `CD34`, `GAS6`, `MYH11`, `VCAN`
→ ❌ **Smooth muscle–like niche cells**

---

### **Cluster 10** – *Osteoprogenitor / matrix-producing cells*

→ `RUNX2`, `IBSP`, `COL1A2`, `CTSK`, `MMP9`, `FGFR1`
→ ❌ **Osteolineage; matrix builders**

---

### **Cluster 11** – *Proliferating B-lineage / progenitors*

→ `CD79A`, `MKI67`, `TCL1A`, `CD19`, `LEF1`, `MZB1`, `PCNA`
→ ✅ **Suspicious for malignant/plasmablastic clone**

---

### **Cluster 12** – *Myeloid / monocytic cells*

→ `CD163`, `CD14`, `MS4A4A`, `FABP4`, `SPI1`, `MRC1`, `PTPRC`
→ ❌ **Non-malignant myeloid compartment**

---

### Summary:

✅ **Malignancy-suspect**:

* **Cluster 11** – proliferative B-like/plasmablast
* **Cluster 4** – erythroid/stress proliferative (monitor)

⚠️ **Watch / mixed features**:

* **Clusters 0, 3, 8** – stress, pre-leukemic, or ambiguous stromal/immune

❌ **Non-malignant supportive**:

* **Clusters 1, 2, 5, 6, 7, 9, 10, 12** – normal stroma, endothelium, or immune

Let me know if you'd like marker-based visualization or cell-type refinement.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Immature / stress hematopoietic', '1': 'Osteolineage',
                                                      '2': 'Osteocyte', '3': 'Immune / Stromal', 
                         '4': 'Proliferative / stress erythroid', '5': 'Endothelial / perivascular', '6': 'Adipocyte-associated vasculature', 
                                                      '7': 'Bone-lining mesenchyme', '8': 'Noise', 
                                                      '9': 'Endothelial / pericyte', "10": "Osteoprogenitor", 
                                                      "11": "Proliferating B-lineage", "12": "Myeloid"
                                                    })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Mesenchymal',
                                                      '2': 'Mesenchymal', '3': 'Mesenchymal', 
                         '4': 'Myeloid', '5': 'Endothelial', '6': 'Endothelial', 
                                                      '7': 'Mesenchymal', '8': 'Noise', 
                                                      '9': 'Endothelial', "10": "Mesenchymal", 
                                                      "11": "Lymphoid", "12": "Myeloid"
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX139

In [None]:
sample_id = "TENX139"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation of colorectal cancer clusters**, with representative marker genes:

---

* **Cluster 0**: **Proliferative Tumor Epithelial Cells**
  ↳ *EPCAM, CEACAM1, CEACAM6, MKI67, CDX2, ERBB3, GPRC5A*

* **Cluster 1**: **Differentiated Tumor Epithelial Cells**
  ↳ *EPCAM, CEACAM6, CDX2, MET, RNF43, PROX1, GPRC5A*

* **Cluster 2**: **Stem-like / EMT-like Tumor Epithelial Cells**
  ↳ *EPCAM, SOX2, SOX9, CXCL14, ID1, CEACAM6, PDCD1*

* **Cluster 3**: **Plasma / B Cells (Ig-producing)**
  ↳ *IGHG1, IGKC, JCHAIN, CD79A, PRDM1, MZB1, TNFRSF17*

* **Cluster 4**: **Cancer-Associated Fibroblasts (CAFs)**
  ↳ *SPARC, LUM, ACTA2, COL5A2, DCN, PDGFRA, THY1*

* **Cluster 5**: **Tumor-Associated Myeloid Cells / TAMs**
  ↳ *S100A9, IL1B, CXCL5, FCGR3A, CD14, ARG1, ITGAX*

* **Cluster 6**: **Interferon-Responsive / Inflamed Tumor Cells**
  ↳ *CEACAM6, EPCAM, IFIT3, ISG15, STAT1, IFNGR1, VEGFA*

---

Let me know if you'd like malignant probability, trajectory, or subtype classifications (e.g., CMS1–4).



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant proliferative', '1': 'Malignant differentiated',
                                                      '2': 'Malignant EMT-like', '3': 'Plasma / B', 
                         '4': 'CAF', '5': 'Macrophage', '6': 'Malignant inflamed', 
                                                    })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Malignant', '3': 'Lymphoid', 
                         '4': 'Mesenchymal', '5': 'Myeloid', '6': 'Malignant', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI867

In [None]:
sample_id = "NCBI867"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation of lung fibrosis clusters**, in order, with representative markers:

---

* **Cluster 0**: **Myofibroblasts / Activated Fibroblasts**
  ↳ *COL1A1, COL3A1, DCN, FN1, PDGFRA, SFRP2, TGFB3*

* **Cluster 1**: **Endothelial Cells (Capillary & Vascular)**
  ↳ *PECAM1, CLDN5, VWF, EPAS1, CD34*

* **Cluster 2**: **T Cells (CD4⁺, CD8⁺, Tregs)**
  ↳ *CD3D, CD3E, CD8A, FOXP3, IL7R, CTLA4*

* **Cluster 3**: **Mesenchymal / Transitional Fibroblasts**
  ↳ *ACTA2, SPARCL1, ITGB1, COL1A2, ZEB1, PDGFRB*

* **Cluster 4**: **Immature Neutrophils / Monocytes**
  ↳ *S100A8, S100A9, FCGR3A, IL1B, HAS1*

* **Cluster 5**: **Plasma / B Cells**
  ↳ *JCHAIN, CD79A, TNFRSF17, CD19, MZB1*

* **Cluster 6**: **Alveolar Macrophages (M2-like)**
  ↳ *CD68, CD163, MRC1, MARCO, HLA-DRA, FCGR3A*

* **Cluster 7**: **Secretory / Club / Goblet Epithelial Cells**
  ↳ *SCGB1A1, AGR3, CEACAM6, KRT5, MUC5AC*

* **Cluster 8**: **AT2 / Injured Epithelial Cells**
  ↳ *SFTPC, NAPSA, EPCAM, AGER, LAMP3, CEACAM6*

* **Cluster 9**: **Monocyte-Derived Macrophages / Inflammatory**
  ↳ *LYZ, FCER1G, CD14, S100A9, HAVCR2*

* **Cluster 10**: **Mast Cells**
  ↳ *CPA3, TPSAB1, KIT, IL4R, PTGDS*

* **Cluster 11**: **Endothelial / Pericyte-like Cells (Activated)**
  ↳ *PECAM1, FABP4, CCL21, KDR, PPARG*

* **Cluster 12**: **Stressed / Apoptotic Epithelial Cells**
  ↳ *KRT8, KRT18, HSPA5, SOD2, PDIA4, CHAC1*

---

Let me know if you'd like malignancy risk, trajectory paths, or spatial context for these populations.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'Endothelial',
                                                      '2': 'T', '3': 'Mesenchymal', 
                         '4': 'Immature Neutrophil', '5': 'Plasma / B', '6': 'Macrophage M2',
                        '7': 'Epithelial Secretory / Club / Goblet', '8': 'AT2', '9': 'Monocyte-Derived Macrophage', 
                        '10': 'Mast', '11': 'Endothelial', '12': 'Epithelial Stressed'
                                                    })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Endothelial',
                                                      '2': 'Lymphoid', '3': 'Mesenchymal', 
                         '4': 'Myeloid', '5': 'Lymphoid', '6': 'Myeloid',
                        '7': 'Epithelial', '8': 'Epithelial', '9': 'Myeloid', 
                        '10': 'Myeloid', '11': 'Endothelial', '12': 'Epithelial'
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI881

In [None]:
sample_id = "NCBI881"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation of lung fibrosis clusters**, in order, with representative marker genes supporting each assignment:

---

* **Cluster 0**: **Myofibroblasts / Activated Fibroblasts**
  ↳ *COL1A1, COL1A2, FN1, SPARCL1, PDGFRA, TGFB3, FAP*

* **Cluster 1**: **Endothelial Cells**
  ↳ *PECAM1, CD34, CLDN5, KDR, EPAS1*

* **Cluster 2**: **Mast Cells**
  ↳ *CPA3, TPSAB1, KIT, PI16, S100A8*

* **Cluster 3**: **Alveolar Type 2 (AT2) / Epithelial Cells**
  ↳ *SFTPC, NAPSA, EPCAM, CEACAM6, NKX2-1*

* **Cluster 4**: **Macrophages / Monocytes (M2-like)**
  ↳ *CD68, MRC1, FCGR3A, HLA-DRA, S100A9*

* **Cluster 5**: **Club / Goblet / Secretory Epithelial Cells**
  ↳ *SCGB1A1, AGR3, CEACAM6, SOX2, FOXJ1, KRT5*

* **Cluster 6**: **T Cells (CD4⁺, CD8⁺, Tregs)**
  ↳ *CD3D, CD3E, CD8A, FOXP3, IL7R, CTLA4*

* **Cluster 7**: **Mesenchymal Transitional Cells / Pericytes**
  ↳ *ACTA2, COL1A2, PDGFRB, AXL, ZEB1, YAP1, ITGA3*

* **Cluster 8**: **Monocyte-Derived Macrophages / Inflammatory**
  ↳ *LYZ, CD68, MARCO, HAVCR2, S100A8, CD14*

* **Cluster 9**: **Matrix Fibroblasts / Fibrogenic Niche**
  ↳ *DCN, COL3A1, LUM, FGF7, SFRP2, PDGFRA, FAP*

* **Cluster 10**: **Plasma / B Cells (Stress-Responsive)**
  ↳ *XBP1, JCHAIN, CD19, CD79A, TNFRSF17, HSPA5*

---

Let me know if you'd like spatial context, trajectory relationships (e.g., fibroblast to myofibroblast), or cross-condition comparisons.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'Endothelial',
                                                      '2': 'Mast', '3': 'AT2', 
                         '4': 'Macrophage M2', '5': 'Epithelial Secretory / Club / Goblet', '6': 'T',
                        '7': 'Mesenchymal', '8': 'Monocyte-Derived Macrophage', '9': 'Matrix Fibroblast', 
                        '10': 'Plasma / B', })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Endothelial',
                                                      '2': 'Myeloid', '3': 'Epithelial', 
                         '4': 'Myeloid', '5': 'Epithelial', '6': 'Lymphoid',
                        '7': 'Mesenchymal', '8': 'Myeloid', '9': 'Mesenchymal', 
                        '10': 'Lymphoid', 
                                                    })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI858

In [None]:
sample_id = "NCBI858"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct cell type annotation** for each cluster in **lung fibrosis context**, with supporting genes:

---

* **Cluster 0**: **Fibroblasts / Myofibroblasts**
  ↳ *COL1A1, COL1A2, FN1, SPARCL1, ACTA2, PDGFRA, FAP*

* **Cluster 1**: **Monocyte-Derived Macrophages (M2-like)**
  ↳ *CD68, MRC1, HLA-DRA, CD14, FCER1G, AIF1, CCL18*

* **Cluster 2**: **Endothelial Cells**
  ↳ *PECAM1, CD34, CLDN5, KDR, BMPR2, EPAS1*

* **Cluster 3**: **Alveolar Type 2 (AT2) / Epithelial Cells**
  ↳ *SFTPC, NAPSA, EPCAM, KRT8, NKX2-1, CEACAM6*

* **Cluster 4**: **Pericytes / Mesenchymal Transitional Cells**
  ↳ *CSPG4, ACTA2, PDGFRB, AXL, COL3A1, LGR6, WNT5A*

* **Cluster 5**: **Secretory Epithelial Cells (Club/Goblet)**
  ↳ *SCGB3A2, CEACAM6, KRT8, FOXJ1, MMP7, WFDC2, SOX2*

* **Cluster 6**: **Differentiated Epithelial (AT1 / Mixed)**
  ↳ *EPCAM, KRT18, AGER, CEACAM6, SFTPD, SLC25A4, VEGFA*

* **Cluster 7**: **T Cells (CD4⁺, CD8⁺, Effector/Regulatory)**
  ↳ *CD3D, CD3E, CD8A, FOXP3, GZMB, IL7R, CTLA4*

* **Cluster 8**: **Macrophages / Monocytes (Activated)**
  ↳ *CD68, HLA-DRA, MARCO, SPP1, PPARG, TREM2*

* **Cluster 9**: **Neutrophils / Inflammatory Myeloid Cells**
  ↳ *S100A8, S100A9, ITGAX, LYZ, IL1B, FCGR3A, S100A12*

* **Cluster 10**: **Matrix Fibroblasts / Fibrogenic Niche**
  ↳ *COL1A1, POSTN, FAP, TGFB3, SFRP2, DCN, PRDX4*

---

Let me know if you'd like grouping into superclusters (e.g., epithelial/mesenchymal/immune), trajectory relationships, or spatial context.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'Macrophage M2',
                                                      '2': 'Endothelial', '3': 'AT2', 
                         '4': 'Pericytes / Mesenchymal', '5': 'Epithelial Secretory / Club / Goblet', '6': 'AT1',
                        '7': 'T', '8': 'Monocyte-Derived Macrophage', '9': 'Neutrophil', 
                        '10': 'Matrix Fibroblast', })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Myeloid',
                                                      '2': 'Endothelial', '3': 'Epithelial', 
                         '4': 'Mesenchymal', '5': 'Epithelial', '6': 'Epithelial',
                        '7': 'Lymphoid', '8': 'Myeloid', '9': 'Myeloid', 
                        '10': 'Mesenchymal', })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX157

In [None]:
sample_id = "TENX157"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is the **annotated cell type list for prostate cancer clusters**, with notes on **malignancy likelihood** based on canonical markers:

---

* **Cluster 0**: **Luminal Epithelial Cells** — **Likely Malignant**
  ↳ *NKX3-1, TMPRSS2, KLK4, FOLH1, FOXA1, ACPP, EPCAM*

* **Cluster 1**: **Luminal Epithelial Cells (Variant)** — **Likely Malignant**
  ↳ *NKX3-1, TMPRSS2, PCA3, AMACR, FOXA1, GDF15*

* **Cluster 2**: **Stromal Fibroblasts / Smooth Muscle Cells** — **Non-Malignant**
  ↳ *MYLK, FHL1, FLNC, SMTN, COL4A1, CCN2*

* **Cluster 3**: **Cancer-Associated Fibroblasts (CAFs)** — **Non-Malignant (but tumor-promoting)**
  ↳ *THBS1, PDGFRA, COL5A1/2, FBLN5, AEBP1, CCN1*

* **Cluster 4**: **Myeloid (Macrophages / Monocytes)** — **Non-Malignant**
  ↳ *CD68, MRC1, CD14, FCGR2A, CSF1R, ITGAX*

* **Cluster 5**: **Endothelial Cells** — **Non-Malignant**
  ↳ *PECAM1, CD34, CLDN5, EPAS1, PLVAP, ENG*

* **Cluster 6**: **Basal/Progenitor Epithelial Cells** — **Possibly Malignant**
  ↳ *TP63, SOX9, ITGB4, MMP14, CLDN1, EGFR*

* **Cluster 7**: **Neutrophils / Innate-like Myeloid** — **Non-Malignant**
  ↳ *FCGR3B, FPR1, CXCR1, CXCR2, CSF3R*

* **Cluster 8**: **Luminal Epithelial Cells (Differentiating)** — **Likely Malignant**
  ↳ *TMPRSS2, ACPP, XBP1, FOXA1, CDH1, DPP4*

* **Cluster 9**: **Neutrophils / Inflammatory Myeloid** — **Non-Malignant**
  ↳ *S100A8, CSF3R, NCF2, TREM1, FPR2*

* **Cluster 10**: **Mixed Inflammatory / Proliferative Cells** — **Uncertain / Possibly Malignant**
  ↳ *DUX4, SOX2-OT, CSF3R, HMGA2, TREM1*

* **Cluster 11**: **T Cells (Cytotoxic / Memory)** — **Non-Malignant**
  ↳ *CD3E, CD8A, GZMK, CD2, CST7*

* **Cluster 12**: **Neutrophils / Granulocytes** — **Non-Malignant**
  ↳ *CXCR1, CXCR2, FCGR3B, NCF1, FPR1*

* **Cluster 13**: **Mast / Myeloid-like / Ambiguous** — **Non-Malignant**
  ↳ *MS4A2, KIT, IL1RL1, CMA1, ADAMDEC1*

---

Let me know if you'd like to stratify the malignant epithelial populations further (e.g., by androgen response, ERG fusion, neuroendocrine features).


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
## MODIF AFTER PATH ANNOT
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal', '1': 'Malignant luminal',
                                                      '2': 'Stromal Fibroblast', '3': 'CAF', 
                         '4': 'Macrophage / Monocyte', '5': 'Endothelial', '6': 'Epithelial basal/progenitor',
                        '7': 'Neutrophil', '8': 'Epithelial luminal differentiating', '9': 'Neutrophil', 
                        '10': 'Mixed Inflammatory / Proliferative', '11': 'T cytotoxic', '12': 'Neutrophil',
                                                      '13': 'Mast / Myeloid'})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Mesenchymal', '3': 'Mesenchymal', 
                         '4': 'Myeloid', '5': 'Endothelial', '6': 'Epithelial',
                        '7': 'Myeloid', '8': 'Epithelial', '9': 'Myeloid', 
                        '10': 'Malignant', '11': 'Lymphoid', '12': 'Myeloid',
                                                      '13': 'Myeloid'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI884

In [None]:
sample_id = "NCBI884"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here are **cell type annotations** for the lung fibrosis clusters, with notes on **malignancy likelihood** (malignancy is generally **not expected** in this context, but atypical activation is noted where relevant):

---

* **Cluster 0**: **Capillary / Vascular Endothelial Cells** — **Non-Malignant**
  ↳ *PECAM1, CD34, CLDN5, EPAS1, CA4, AGER*

* **Cluster 1**: **Pro-Inflammatory Myeloid (Neutrophil-like)** — **Non-Malignant**
  ↳ *S100A8, S100A9, ITGAX, FCGR3A, IL1B*

* **Cluster 2**: **Myofibroblasts / Activated Fibroblasts** — **Non-Malignant**
  ↳ *COL1A1, COL3A1, DCN, FN1, PDGFRA, PDGFRB*

* **Cluster 3**: **Alveolar Type II Cells (AT2)** — **Non-Malignant**
  ↳ *SFTPC, SFTPD, NAPSA, LAMP3, EPCAM*

* **Cluster 4**: **M2-like Macrophages / Monocyte-Derived** — **Non-Malignant**
  ↳ *CD68, MRC1, CD14, HLA-DRA, MS4A7*

* **Cluster 5**: **Myofibroblasts (Contractile)** — **Non-Malignant**
  ↳ *ACTA2, PDGFRB, ITGB1, VIM, SPARCL1*

* **Cluster 6**: **T Cells (CD8+ Effector-like)** — **Non-Malignant**
  ↳ *CD3E, CD8A, GZMA, TRAC, IL7R*

* **Cluster 7**: **Monocyte-Derived Macrophages (Lipid-laden)** — **Non-Malignant**
  ↳ *MARCO, MRC1, FABP4, C1QC, CD68*

* **Cluster 8**: **Aberrant / Transitional Epithelial Cells** — **Potentially Pathogenic**
  ↳ *CEACAM5/6, MSLN, EPCAM, SFTPC, KRT18*
  ✳ May reflect epithelial dysregulation or metaplasia in fibrosis.

* **Cluster 9**: **Club Cells / Ciliated Airway Epithelial Cells** — **Non-Malignant**
  ↳ *SCGB1A1, MUC5B, FOXJ1, KRT5, SOX2*

* **Cluster 10**: **Adventitial Fibroblasts / Lymphoid Niche-like Stroma** — **Non-Malignant**
  ↳ *CCL21, DCN, COL1A2, VIM, LUM, JCHAIN*

---

Let me know if you'd like classification into broader lineages (e.g. myeloid, epithelial) or highlighting of potential fibrotic drivers.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Pro-Inflammatory Myeloid',
                                                      '2': 'Myofibroblast', '3': 'AT2', 
                         '4': 'Macrophage M2', '5': 'Myofibroblast', '6': 'T cytotoxic',
                        '7': 'Monocyte-Derived Macrophage', '8': 'Epithelial Aberrant / Transitional', '9': 'Epithelial Club / Ciliated', 
                        '10': 'Adventitial Fibroblast',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Myeloid',
                                                      '2': 'Mesenchymal', '3': 'Epithelial', 
                         '4': 'Myeloid', '5': 'Mesenchymal', '6': 'Lymphoid',
                        '7': 'Myeloid', '8': 'Epithelial', '9': 'Epithelial', 
                        '10': 'Mesenchymal',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX123

In [None]:
sample_id = "TENX123"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of the clusters from **healthy skin**, in order, along with selected **marker genes** and cell type lineage assignments:

---

* **Cluster 0**: **Dermal Fibroblasts** — *Mesenchymal*
  ↳ *FBLN1, COL5A2, PDGFRA, MFAP5, SFRP2*

* **Cluster 1**: **Interfollicular / Differentiating Keratinocytes** — *Epithelial*
  ↳ *AQP3, CLCA2, LY6D, SERPINB2, KRTs, EGFR*

* **Cluster 2**: **Proliferating Basal Keratinocytes** — *Epithelial*
  ↳ *PCNA, MKI67, CDK1, CENPF, MYC, FGFBP1*

* **Cluster 3**: **Vascular Smooth Muscle Cells (vSMCs)** — *Mesenchymal*
  ↳ *ACTA2, MYH11, CNN1, MYLK, DES*

* **Cluster 4**: **Vascular Endothelial Cells** — *Endothelial*
  ↳ *PECAM1, CD34, VWF, CAV1, CLEC14A*

* **Cluster 5**: **Dendritic Cells / T Cells Mix** — *Lymphoid & Myeloid*
  ↳ *PTPRC, CD3E, CD1C, FOXP3, IL7R, CLEC10A*

* **Cluster 6**: **Suprabasal Keratinocytes / Secretory Epithelia** — *Epithelial*
  ↳ *SERPINB2, FGFBP1, GPC1, ERBB2, EGFR*

* **Cluster 7**: **Anti-inflammatory Macrophages (M2-like)** — *Myeloid*
  ↳ *CD163, MRC1, MS4A4A, LYVE1, FGL2*

* **Cluster 8**: **Lymphatic Endothelial Cells** — *Endothelial*
  ↳ *LYVE1, PROX1, MMRN1, EGFL7, PDPN*

* **Cluster 9**: **Secretory / Glandular Epithelial Cells (e.g., sweat gland)** — *Epithelial*
  ↳ *SCGB2A1, S100A1, CFTR, GPRC5A, EPCAM*

* **Cluster 10**: **Contractile Mesenchymal (vSMC-like)** — *Mesenchymal*
  ↳ *ACTA2, MYH11, CNN1, DES, CAV1*

* **Cluster 11**: **Adipocytes / Pre-adipocyte-like Fibroblasts** — *Mesenchymal*
  ↳ *PLIN4, ADIPOQ, PPARG, MFAP5, CAV1*

* **Cluster 12**: **Pericytes / Transitional Mesenchymal** — *Mesenchymal*
  ↳ *ACTA2, MYLK, DES, ASPN, GLIPR1*

---

Let me know if you'd like cell states, inferred functions, or spatial roles (e.g., papillary vs reticular dermis).


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Dermal Fibroblast', '1': 'Interfollicular / Differentiating Keratinocyte',
                                                      '2': 'Proliferating Basal Keratinocyte', '3': 'Vascular Smooth Muscle Cell', 
                         '4': 'Vascular Endothelial', '5': 'Dendritic Cells / T', '6': 'Suprabasal Keratinocyte',
                        '7': 'Macrophage M2', '8': 'Lymphatic Endothelial', '9': 'Secretory / Glandular Epithelial', 
                        '10': 'Contractile Mesenchymal', '11': 'Adipocytes / Pre-adipocyte-like Fibroblast',
                                                      '12': 'Pericyte' })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Epithelial',
                                                      '2': 'Epithelial', '3': 'Mesenchymal', 
                         '4': 'Endothelial', '5': 'Myeloid / Lymphoid', '6': 'Epithelial',
                        '7': 'Myeloid', '8': 'Endothelial', '9': 'Epithelial', 
                        '10': 'Mesenchymal', '11': 'Mesenchymal',
                                                      '12': 'Mesenchymal' })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX147

In [None]:
sample_id = "TENX147"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Yes — in the **context of colon cancer**, some clusters may reflect **malignant or transformed epithelial states**, **tumor-associated stromal cells**, or **immune remodeling**. Here's the **revised classification**, still succinct and ordered by cluster:

---

* **Cluster 0**: **Tumor Epithelial Cells** — *Epithelial*, **Likely Malignant**
  ↳ *CEACAM6, LCN2, S100P, MYC, IFITM1, MUC17, PPDPF*

* **Cluster 1**: **Mucin-Secreting Tumor Epithelium / Goblet-like** — *Epithelial*, **Possibly Malignant**
  ↳ *REG4, MUC12, CEACAM5, BEST2, GPX2*

* **Cluster 2**: **CD8⁺ T Cells (Effector/Exhausted)** — *Lymphoid*, **Non-malignant**
  ↳ *CD8A, GZMK, PRF1, LAG3, CTLA4*

* **Cluster 3**: **Transformed Enterocyte-like Cells** — *Epithelial*, **Likely Malignant**
  ↳ *GPX2, MUC5B, CEACAM5, LCN2, SOX9*

* **Cluster 4**: **Cancer-Associated Fibroblasts (CAFs)** — *Mesenchymal*, **Tumor-Associated, Non-malignant**
  ↳ *FAP, PDGFRA, IGFBP7, MMP1, LOX, THBS1*

* **Cluster 5**: **Tumor Endothelial Cells** — *Endothelial*, **Tumor-Associated, Non-malignant**
  ↳ *VWF, PECAM1, RGS5, PLVAP, TAGLN, NOTCH3*

* **Cluster 6**: **M2-like Tumor-Associated Macrophages** — *Myeloid*, **Non-malignant but immunosuppressive**
  ↳ *CD163, MRC1, MS4A7, APOE, MMP9*

* **Cluster 7**: **Plasma Cells (IgA⁺, possibly tumor-induced)** — *Lymphoid*, **Non-malignant**
  ↳ *MZB1, TNFRSF17, FKBP11, CD27*

* **Cluster 8**: **Myofibroblasts / Pericyte-like Stromal Cells** — *Mesenchymal*, **Tumor-Associated, Non-malignant**
  ↳ *ACTA2, TAGLN, RGS5, IGFBP7*

* **Cluster 9**: **B Cells / Early Plasma Cells (Reactive)** — *Lymphoid*, **Non-malignant**
  ↳ *CD79A, CD79B, IGHG3, CD27*

* **Cluster 10**: **Pro-inflammatory Monocytes / DCs** — *Myeloid*, **Non-malignant**
  ↳ *IL1B, FCGR3A, TNFAIP3, CCL4, MMP9*

* **Cluster 11**: **Differentiated Colonocytes** — *Epithelial*, **Likely Non-malignant**
  ↳ *CEACAM7, CA2, CA4, CD177, SLC26A3*

* **Cluster 12**: **Naive/Memory B Cells** — *Lymphoid*, **Non-malignant**
  ↳ *MS4A1, BANK1, PAX5, CXCR5*

* **Cluster 13**: **Stressed / EMT-like Epithelial Cells** — *Epithelial*, **Likely Malignant or Transitional**
  ↳ *LAMC2, MMP7, TGFBI, PERP, IFI27, ACTA2*

---

Let me know if you’d like groupings (e.g. epithelial subsets, immune composition, CAF diversity) or trajectory interpretation.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Epithelial Mucin-Secreting',
                                                      '2': 'T cytotoxic', '3': 'Malignant Transformed Enterocyte-like', 
                         '4': 'CAF', '5': 'Endothelial', '6': 'Macrophage M2',
                        '7': 'Plasma', '8': 'Myofibroblast', '9': 'B', 
                        '10': 'Pro-inflammatory Monocyte', '11': 'Epithelial Differentiated Colonocyte',
                                                      '12': 'Naive/Memory B', '13': 'Malignant EMT-like' })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Epithelial',
                                                      '2': 'Lymphoid', '3': 'Malignant', 
                         '4': 'Mesenchymal', '5': 'Endothelial', '6': 'Myeloid',
                        '7': 'Lymphoid', '8': 'Mesenchymal', '9': 'Lymphoid', 
                        '10': 'Myeloid', '11': 'Epithelial',
                                                      '12': 'Lymphoid', '13': 'Malignant' })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI870

In [None]:
sample_id = "NCBI870"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: In the context of **lung fibrosis**, here's a **succinct cell type annotation** for each cluster (by number), highlighting key markers and whether the cluster is typically considered **malignant**, **non-malignant**, or **reactive/pathogenic** in this disease context:

---

* **Cluster 0**: **Pathogenic Myofibroblasts** — *Mesenchymal*, **Reactive/Pathogenic**
  ↳ *COL1A1, ACTA2, PDGFRA, FN1, SFRP2, YAP1*

* **Cluster 1**: **T Cells (CD8⁺, Activated/Exhausted)** — *Lymphoid*, **Non-malignant**
  ↳ *CD3D, CD8A, GZMB, FOXP3, CXCR4, CTLA4*

* **Cluster 2**: **Vascular Endothelial Cells** — *Endothelial*, **Non-malignant**
  ↳ *PECAM1, PLVAP, CD34, CLDN5, KDR*

* **Cluster 3**: **Plasma Cells (ER-Stressed)** — *Lymphoid*, **Non-malignant**
  ↳ *CD79A, TNFRSF17, FKBP11, XBP1, JCHAIN*

* **Cluster 4**: **Low-quality / Ambient / Control-like** — *Likely technical noise*, **Non-informative**
  ↳ *UnassignedCodewords, Negative Control Probes*

* **Cluster 5**: **Activated/Contractile Myofibroblasts** — *Mesenchymal*, **Reactive/Pathogenic**
  ↳ *ACTA2, COL1A1, SFRP4, AXL, TGFB3, CCN2*

* **Cluster 6**: **Alveolar Epithelial Cells (AT2/Transitional)** — *Epithelial*, **Non-malignant**
  ↳ *SFTPC, KRT18, NAPSA, CEACAM6, EPCAM, SOX9*

* **Cluster 7**: **Monocyte-Derived Macrophages** — *Myeloid*, **Non-malignant**
  ↳ *CD68, MS4A7, MRC1, FCGR3A, S100A8/9*

* **Cluster 8**: **Naive/Memory B Cells** — *Lymphoid*, **Non-malignant**
  ↳ *MS4A1, CD79A/B, CXCR5, CXCL13, PTPRC*

* **Cluster 9**: **Mast Cells** — *Myeloid*, **Non-malignant**
  ↳ *CPA3, TPSAB1, KIT, FCER1A, IL4R*

* **Cluster 10**: **Lymphatic/Peribronchial Endothelial Cells** — *Endothelial*, **Non-malignant**
  ↳ *PECAM1, VIM, FABP4, CLDN5, PPARG, SFRP2*

---

Let me know if you'd like disease-specific interpretations (e.g. IPF vs healthy), or trajectory mapping across fibroblast states.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'T cytotoxic',
                                                      '2': 'Vascular Endothelial', '3': 'Plasma', 
                         '4': 'Noise', '5': 'Activated/Contractile Myofibroblast', '6': 'AT2',
                        '7': 'Monocyte-Derived Macrophage', '8': 'Naive/Memory B', '9': 'Mast', 
                        '10': 'Lymphatic/Peribronchial Endothelial',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Lymphoid',
                                                      '2': 'Endothelial', '3': 'Lymphoid', 
                         '4': 'Noise', '5': 'Mesenchymal', '6': 'Epithelial',
                        '7': 'Myeloid', '8': 'Lymphoid', '9': 'Myeloid', 
                        '10': 'Endothelial',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI876

In [None]:
sample_id = "NCBI876"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer:  Here is a **succinct annotation** of the clusters in the context of **lung fibrosis**, noting the **likely cell identity**, **lineage**, and whether it is typically **malignant**, **non-malignant**, or **reactive/pathogenic** in this setting:

---

* **Cluster 0**: **Endothelial Cells (Vascular/Capillary)** — *Endothelial*, **Non-malignant**
  ↳ *PECAM1, PLVAP, EPAS1, KDR, CLDN5, ACKR1*

* **Cluster 1**: **Pro-inflammatory Monocytes / Macrophages** — *Myeloid*, **Non-malignant**
  ↳ *S100A8, FCGR3A, ITGAX, IL1B, S100A12*

* **Cluster 2**: **Alveolar Type 2 (AT2) Epithelial Cells** — *Epithelial*, **Non-malignant**
  ↳ *SFTPC, NAPSA, SFTPD, EPCAM, DUOX1*

* **Cluster 3**: **Anti-inflammatory Macrophages / MoMFs** — *Myeloid*, **Non-malignant**
  ↳ *CD68, MRC1, MS4A7, HLA-DRA, AIF1*

* **Cluster 4**: **Pathogenic Myofibroblasts** — *Mesenchymal*, **Reactive/Pathogenic**
  ↳ *COL1A1, COL3A1, PDGFRA, FN1, DCN, SFRP2*

* **Cluster 5**: **AT1/AT2 Hybrid Cells or Transitional Epithelium** — *Epithelial*, **Reactive/Pathogenic**
  ↳ *EPCAM, CEACAM6, AGER, ITGB6, NKX2-1, ICAM1, EGFR*

* **Cluster 6**: **CD8⁺ Effector T Cells** — *Lymphoid*, **Non-malignant**
  ↳ *CD8A, CD3E, GZMA, KLRG1, CD69*

* **Cluster 7**: **Club/Ciliated Epithelial Cells** — *Epithelial*, **Non-malignant**
  ↳ *SCGB1A1, FOXJ1, KRT5, DUOX1, WFDC2*

* **Cluster 8**: **Anti-inflammatory Macrophages / Foam-like Macrophages** — *Myeloid*, **Non-malignant**
  ↳ *CD68, MRC1, MARCO, PPARG, PLIN2, HLA-DQA1*

---

Let me know if you’d like trajectory inference or fibrosis-associated marker enrichment across clusters.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Pro-inflammatory Monocyte',
                                                      '2': 'AT2', '3': 'Anti-inflammatory Macrophage', 
                         '4': 'Myofibroblast', '5': 'AT1/AT2', '6': 'T cytotoxic',
                        '7': 'Epithelial Club / Ciliated', '8': 'Anti-inflammatory Macrophage',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Myeloid',
                                                      '2': 'Epithelial', '3': 'Myeloid', 
                         '4': 'Mesenchymal', '5': 'Epithelial', '6': 'Lymphoid',
                        '7': 'Epithelial', '8': 'Myeloid',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX98

In [None]:
sample_id = "TENX98"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer:  Here are **succinct annotations** for each cluster in the context of **breast cancer**, including **cell type**, **malignancy likelihood**, and **key marker genes** that support each call:

---

* **Cluster 0**: **Luminal Epithelial Cells (ER⁺ tumor cells)** — *Epithelial*, **Malignant**
  ↳ *FOXA1, GATA3, ESR1, ERBB2, EPCAM, CDH1, TPD52, KRT8*

* **Cluster 1**: **TAMs (Tumor-Associated Macrophages)** — *Myeloid*, **Non-malignant / Immunosuppressive**
  ↳ *CD68, FCGR3A, CD163, C1QA, C1QC, MRC1, CXCL16, HAVCR2*

* **Cluster 2**: **Cancer-Associated Fibroblasts (CAFs)** — *Mesenchymal*, **Reactive/Pathogenic**
  ↳ *ACTA2, PDGFRA, PDGFRB, POSTN, MMP2, SFRP4, CXCL12*

* **Cluster 3**: **Mixed Tumor Cells (Luminal + EMT)** — *Epithelial*, **Malignant**
  ↳ *CDH1, KRT8, EPCAM, PDGFRA, ZEB1, ENAH, CXCL12, SFRP4*

* **Cluster 4**: **Luminal Epithelial / Tumor Cells** — *Epithelial*, **Malignant**
  ↳ *GATA3, FOXA1, KRT7, CDH1, TACSTD2, TPD52, ERBB2, AGR3*

* **Cluster 5**: **Vascular Endothelial Cells** — *Endothelial*, **Non-malignant**
  ↳ *PECAM1, VWF, CD93, CAV1, KDR, CLEC14A, RAMP2*

* **Cluster 6**: **Basal-like / Myoepithelial Tumor Cells** — *Epithelial*, **Malignant**
  ↳ *KRT5, KRT14, EGFR, DST, MYLK, FOXC2, SFRP1, PDGFRA*

---

Let me know if you’d like immune subtype decomposition (e.g., M1/M2 macrophages, T cell exhaustion), trajectory analysis, or CAF subtype breakdown.



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal', '1': 'Macrophage',
                                                      '2': 'CAF', '3': 'Malignant luminal + EMT', 
                         '4': 'Malignant luminal', '5': 'Vascular endothelial', '6': 'Malignant basal-like',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Myeloid',
                                                      '2': 'Mesenchymal', '3': 'Malignant', 
                         '4': 'Malignant', '5': 'Endothelial', '6': 'Malignant',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# "TENX134"

In [None]:
sample_id = "TENX134"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer:  Here is a **succinct cell type and malignancy annotation** for each cluster in the context of **bone marrow cancer**, based on the provided marker genes:

---

* **Cluster 0**: **Proliferating B cells / Pre-B ALL-like cells** — *Lymphoid*, **Likely malignant**
  ↳ *CD19, CD79A, TCL1A, MKI67, TOP2A, FOXP1, MYC, PCNA, CENPF*

* **Cluster 1**: **Tumor-associated myeloid cells (monocytic)** — *Myeloid*, **Non-malignant / immunosuppressive**
  ↳ *CD163, CD68, FCGR3A, BASP1, PTGDS, CD4*

* **Cluster 2**: **Ambiguous / low-quality / negative controls** — *Unclassified*, **Unclear (mostly control probes)**
  ↳ *LEF1, TCL1A* + mostly *NegControl* genes → likely technical

* **Cluster 3**: **Ambiguous / low-quality / negative controls** — *Unclassified*, **Unclear (mostly control probes)**
  ↳ *TCL1A* + mostly *NegControl* genes → likely technical

* **Cluster 4**: **Cytotoxic T cells (possibly exhausted)** — *Lymphoid*, **Non-malignant**
  ↳ *CD8A, TRAC, GZMK, PRF1, NKG7, HAVCR2, LAG3, SLAMF7*

* **Cluster 5**: **Fibroblasts / MSC-like stromal cells** — *Mesenchymal*, **Non-malignant / tumor-supportive**
  ↳ *COL1A1, COL6A3, PDGFRA, CXCL12, SPARC, THY1, PRRX1, SFRP4*

---

Let me know if you’d like malignant B-cell subtype inference (e.g., B-ALL vs. CLL), trajectory inference, or T-cell exhaustion scoring.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant proliferating B', '1': 'Tumor-associated myeloid',
                                                      '2': 'Noise', '3': 'Noise', 
                         '4': 'Cytotoxic T', '5': 'Fibroblasts / MSC-like',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Myeloid',
                                                      '2': 'Noise', '3': 'Noise', 
                         '4': 'Lymphoid', '5': 'Mesenchymal',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI865

In [None]:
sample_id = "NCBI865"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** for each cluster in the context of **lung fibrosis**, including **cell type**, **lineage**, and whether it's **pathogenic/reactive**, **homeostatic**, or **technical/low-quality**:

---

* **Cluster 0**: **Capillary / vascular endothelial cells** — *Endothelial*, **Homeostatic**
  ↳ *PECAM1, CLDN5, KDR, RAMP2, APLNR, ACKR1*

* **Cluster 1**: **Myofibroblasts (activated fibroblasts)** — *Mesenchymal*, **Pathogenic**
  ↳ *COL1A1, ACTA2, POSTN, PDGFRA, FN1, SFRP4*

* **Cluster 2**: **Naive / Memory T cells** — *Lymphoid*, **Homeostatic / Inflammatory**
  ↳ *TRAC, CD3E, IL7R, CD27, CD8A, FOXP3, CXCL13*

* **Cluster 3**: **Mast cells** — *Myeloid*, **Homeostatic / Inflammatory**
  ↳ *TPSAB1, CPA3, KIT, S100A8, S100A12*

* **Cluster 4**: **Interstitial / anti-inflammatory macrophages** — *Myeloid*, **Reactive / Fibrosis-promoting**
  ↳ *CD68, MRC1, HLA-DRA, CCL18, TREM2, AXL, TGFB1*

* **Cluster 5**: **Plasma B cells** — *Lymphoid*, **Reactive / Inflammatory**
  ↳ *CD79A, JCHAIN, TNFRSF17, FKBP11, HSPA5*

* **Cluster 6**: **Alveolar type 2 (AT2) epithelial cells** — *Epithelial*, **Homeostatic / Injured**
  ↳ *SFTPC, NAPSA, SFTPD, EPCAM, NKX2-1*

* **Cluster 7**: **Pro-fibrotic macrophages / Mo-Macs** — *Myeloid*, **Pathogenic**
  ↳ *CD68, MRC1, COL1A1, SFRP2, FN1, HLA-DRA*

* **Cluster 8**: **Basal-like / aberrant epithelial cells** — *Epithelial*, **Pathogenic / Reactive**
  ↳ *KRT5, KRT14, KRT17, SOX4, EGFR, YAP1, WFDC2*

* **Cluster 9**: **Myofibroblasts (hybrid mesenchymal)** — *Mesenchymal*, **Pathogenic**
  ↳ *ACTA2, COL1A1, PDGFRB, SNAI2, FN1, YAP1*

* **Cluster 10**: **Aberrant basaloid epithelial cells / Transitional** — *Epithelial*, **Pathogenic**
  ↳ *SCGB1A1, KRT5, KRT17, SOX2, SOX4, MUC5B, YAP1*

* **Cluster 11**: **CD8⁺ Effector T cells (cytotoxic)** — *Lymphoid*, **Inflammatory**
  ↳ *CD8A, GZMB, PRF1, CD3E, FGFBP2, HAVCR2*

* **Cluster 12**: **Perivascular fibroblasts / adventitial cells** — *Mesenchymal*, **Pathogenic**
  ↳ *FN1, VIM, PECAM1, TGFB1, SNAI2, COL1A1*

---

Let me know if you'd like to overlay spatial, trajectory, or pseudotime analysis across these clusters.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Capillary / vascular endothelial', '1': 'Myofibroblast',
                                                      '2': 'T', '3': 'Mast', 
                         '4': 'Interstitial / anti-inflammatory macrophage', '5': 'Plasma', '6': 'AT2',
                                                     '7': 'Pro-fibrotic macrophage', '8': 'Epithelial basal-like', 
                                                     '9': 'Myofibroblast', "10": "Epithelial basaloid", "11": "T cytotoxic",
                                                      "12": "Perivascular fibroblast"})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Mesenchymal',
                                                      '2': 'Lymphoid', '3': 'Myeloid', 
                         '4': 'Myeloid', '5': 'Lymphoid', '6': 'Epithelial',
                                                     '7': 'Myeloid', '8': 'Epithelial', 
                                                     '9': 'Mesenchymal', "10": "Epithelial", "11": "Lymphoid",
                                                      "12": "Mesenchymal"})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX118

In [None]:
sample_id = "TENX118"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of the clusters in the context of **lung cancer**, including **cell type**, **lineage**, and whether the population is **likely malignant**, **immune**, or **stromal**:

---

* **Cluster 0**: **Epithelial tumor cells (lung adenocarcinoma-like)** — *Epithelial*, **Likely malignant**
  ↳ *EPCAM, GPRC5A, EGFR, ERBB2, MUC5B, PCNA, MEST, KRT7*

* **Cluster 1**: **Cycling tumor cells / Proliferative epithelial** — *Epithelial*, **Likely malignant**
  ↳ *EPCAM, PCNA, TOP2A, MKI67, MYC, ERBB2, MEST*

* **Cluster 2**: **T cells (CD4⁺, CD8⁺, regulatory, cytotoxic)** — *Lymphoid*, **Non-malignant immune**
  ↳ *TRAC, CD3E, CD8A, GZMA, FOXP3, CTLA4, PRF1*

* **Cluster 3**: **Non-malignant epithelial / secretory-like** — *Epithelial*, **Possibly reactive / pre-malignant**
  ↳ *SCGB2A1, SFTA2, AGR3, SPDEF, CFTR, EPCAM*

* **Cluster 4**: **Cancer-associated fibroblasts (CAFs)** — *Mesenchymal*, **Stromal / pro-tumorigenic**
  ↳ *PDGFRA, PDGFRB, ACTA2, FAP, COL5A2, SFRP4*

* **Cluster 5**: **Tumor-associated macrophages (TAMs)** — *Myeloid*, **Pro-tumorigenic immune**
  ↳ *CD68, MRC1, FCGR3A, CD163, TREM2, AIF1, IL2RA*

* **Cluster 6**: **Endothelial cells (vascular / lymphatic)** — *Endothelial*, **Stromal / angiogenic**
  ↳ *PECAM1, VWF, CLEC14A, RAMP2, LYVE1, ANGPT2*

* **Cluster 7**: **Plasma / B cells (mature)** — *Lymphoid*, **Non-malignant immune**
  ↳ *CD19, CD79A, TNFRSF17, MZB1, PRDM1, SLAMF7*

* **Cluster 8**: **Ciliated / basal-like epithelial** — *Epithelial*, **Mixed — possibly pre-malignant or reactive**
  ↳ *FOXJ1, CFB, CCDC39, SOX2, EHF, AGR3, CLCA2*

---

Let me know if you’d like inference on EMT status, exhaustion scores, or tumor subtyping (e.g. LUAD vs. LUSC).



In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant proliferative',
                                                      '2': 'T', '3': 'Epithelial secretory', 
                         '4': 'CAF', '5': 'Macrophage', '6': 'Endothelial',
                                                     '7': 'Plasma/B', '8': 'Epithelial ciliated/basal-like',  })

In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Lymphoid', '3': 'Epithelial', 
                         '4': 'Mesenchymal', '5': 'Myeloid', '6': 'Endothelial',
                                                     '7': 'Lymphoid', '8': 'Epithelial',  })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX115

In [None]:
sample_id = "TENX115"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of the clusters in the context of **skin cancer** (likely melanoma or cutaneous SCC), with **cell type**, **lineage**, and a note on whether the cluster is **likely malignant**:

---

* **Cluster 0**: **Differentiated Melanocytic / Melanoma cells** — *Epithelial*, **Likely malignant**
  ↳ *MITF, MLANA, TYR, TYRP1, PMEL, S100B, MIA, DCT, VIM*

* **Cluster 1**: **Proliferating Melanoma cells** — *Epithelial*, **Likely malignant**
  ↳ *MITF, MLANA, TYR, MKI67, TOP2A, CENPF, STMN1, MYC*

* **Cluster 2**: **Tumor-associated macrophages / Fibroblasts** — *Myeloid / Mesenchymal*, **Stromal / immune**
  ↳ *CD68, AIF1, LYZ, THY1, PDGFRA, COL6A1-3, CXCL12*

* **Cluster 3**: **Pigmented Melanocytic cells / Melanoma** — *Epithelial*, **Likely malignant**
  ↳ *TYR, PMEL, MITF, MLANA, EDNRB, CALD1, KIT*

* **Cluster 4**: **Basal keratinocytes / squamous-like tumor** — *Epithelial*, **Possibly malignant**
  ↳ *KRT5, KRT15, TP63, COL17A1, AREG, TFAP2A*

* **Cluster 5**: **Control / empty droplets / noise** — *Artifacts*, **Discard**
  ↳ *BLANK and NegControl features only*

* **Cluster 6**: **Differentiating keratinocytes** — *Epithelial*, **Possibly malignant or reactive**
  ↳ *KRT16, KRT17, S100A14, SLPI, CRABP2, PERP*

* **Cluster 7**: **Endothelial / perivascular stromal cells** — *Endothelial / Mesenchymal*, **Non-malignant stromal**
  ↳ *VWF, CDH5, AQP1, RGS5, THY1, COL6A3, CALD1*

* **Cluster 8**: **Highly proliferative melanoma cells** — *Epithelial*, **Likely malignant**
  ↳ *MKI67, CENPF, TYR, MITF, UBE2C, TYMS, TOP2A*

---

Let me know if you'd like deeper analysis (e.g. EMT state, immune infiltration score, subtype markers).


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant differentiated melanocytic', '1': 'Malignant proliferative',
                                                      '2': 'Macrophage / Fibroblast', '3': 'Malignant pigmented melanocytic', 
                         '4': 'Epithelial basal keratinocytes', '5': 'Noise', '6': 'Epithelial differentiating keratinocytes',
                                                     '7': 'Endothelial', '8': 'Malignant proliferative',  })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Myeloid/Mesenchymal', '3': 'Malignant', 
                         '4': 'Epithelial', '5': 'Noise', '6': 'Epithelial',
                                                     '7': 'Endothelial', '8': 'Malignant',  })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI784

In [None]:
sample_id = "NCBI784"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here's a **succinct annotation** of your clusters in the context of **breast cancer**, keeping cluster order, identifying **lineage**, select **marker genes**, and flagging **likely malignant** clusters:

---

* **Cluster 0** – **T cells (CD8⁺ and CD4⁺ cytotoxic/exhausted)** — *Lymphoid*
  ↳ *CD3E, CD8A, GZMA, PRF1, CTLA4, PDCD1* → **Not malignant**

* **Cluster 1** – **Cancer-associated fibroblasts (CAFs)** — *Mesenchymal*
  ↳ *LUM, POSTN, SFRP4, PDGFRA/B, ZEB1, ACTA2* → **Not malignant, but tumor-supportive**

* **Cluster 2** – **Proliferating luminal epithelial cells (likely malignant)** — *Epithelial*
  ↳ *EPCAM, ERBB2, CCND1, FOXA1, KRT7/8, ENAH, MDM2, TOP2A* → **Malignant**

* **Cluster 3** – **Tumor-associated macrophages (M2/TAM-like)** — *Myeloid*
  ↳ *CD68, CD163, C1QA/B/C, AIF1, MRC1, CXCL16* → **Not malignant**

* **Cluster 4** – **Luminal epithelial cells (differentiated, ER⁺ features)** — *Epithelial*
  ↳ *EPCAM, ESR1, GATA3, FOXA1, MLPH, CEACAM6* → **Malignant**

* **Cluster 5** – **Endothelial cells (vascular and lymphatic)** — *Endothelial*
  ↳ *PECAM1, VWF, KDR, CLDN5, RAMP2, EGFL7* → **Not malignant**

* **Cluster 6** – **Basal-like epithelial cells (or EMT / hybrid)** — *Epithelial / Mesenchymal*
  ↳ *KRT14, KRT5, SFRP1, ACTA2, FOXC2, ALDH1A3, EGFR* → **Possibly malignant (basal subtype)**

* **Cluster 7** – **Luminal epithelial cells (ER⁺, less proliferative)** — *Epithelial*
  ↳ *ESR1, FOXA1, GATA3, CCND1, CDH1* → **Malignant**

* **Cluster 8** – **Plasma / B lineage (memory/plasma transition)** — *Lymphoid*
  ↳ *MZB1, CD79A/B, SLAMF7, PRDM1* → **Not malignant**

* **Cluster 9** – **Myeloid/monocytic cells (inflammatory TAMs)** — *Myeloid*
  ↳ *CD68, FCGR3A, MMP1/12, S100A8/A9, CLECL1* → **Not malignant**

* **Cluster 10** – **B cells (naive and memory)** — *Lymphoid*
  ↳ *CD19, MS4A1, CD79A/B, BANK1* → **Not malignant**

---

Let me know if you'd like these mapped to **PAM50 subtypes** (e.g., Luminal A, B, Basal-like, HER2⁺) or linked with **spatial or treatment resistance** features.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'T', '1': 'CAF',
                                                      '2': 'Malignant proliferating luminal', '3': 'Macrophage M2', 
                         '4': 'Malignant luminal ER+', '5': 'Endothelial', '6': 'Malignant basal',
                                                     '7': 'Malignant luminal ER+', '8': 'Plasma / B', 
                                                      '9': 'Inflammatory macrophage', '10': 'B'})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Mesenchymal',
                                                      '2': 'Malignant', '3': 'Myeloid', 
                         '4': 'Malignant', '5': 'Endothelial', '6': 'Malignant',
                                                     '7': 'Malignant', '8': 'Lymphoid', 
                                                      '9': 'Myeloid', '10': 'Lymphoid'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI785

In [None]:
sample_id = "NCBI785"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct cluster annotation** for your breast cancer data, keeping clusters in order, with **cell type**, **marker genes**, and whether the cluster is **likely malignant**:

---

* **Cluster 0** – **Luminal epithelial cells (proliferative)** — *Epithelial*
  ↳ *ERBB2, CCND1, EPCAM, FOXA1, KRT7/8, ENAH, MDM2, TOP2A* → **Malignant**

* **Cluster 1** – **T cells (cytotoxic & helper)** — *Lymphoid*
  ↳ *CD3E, CD8A, CD4, GZMA, PRF1, CTLA4, CXCR4* → **Not malignant**

* **Cluster 2** – **Cancer-associated fibroblasts (CAFs)** — *Mesenchymal*
  ↳ *LUM, POSTN, ACTA2, PDGFRA/B, SFRP4, ZEB1* → **Not malignant**

* **Cluster 3** – **Tumor-associated macrophages (TAMs)** — *Myeloid*
  ↳ *CD68, CD163, MRC1, FCGR3A, C1QA/B/C, ZEB2, APOC1* → **Not malignant**

* **Cluster 4** – **Luminal epithelial cells (differentiated)** — *Epithelial*
  ↳ *GATA3, FOXA1, ESR1, EPCAM, CEACAM6, CDH1* → **Malignant**

* **Cluster 5** – **Endothelial cells (vascular)** — *Endothelial*
  ↳ *PECAM1, VWF, KDR, RAMP2, CAV1, CLDN5* → **Not malignant**

* **Cluster 6** – **Luminal epithelial cells (ER⁺ HER2⁺)** — *Epithelial*
  ↳ *ERBB2, FOXA1, CCND1, EPCAM, MLPH, ENAH, AR* → **Malignant**

* **Cluster 7** – **Basal-like epithelial cells (EMT/high plasticity)** — *Epithelial / Mesenchymal*
  ↳ *KRT14, KRT5, SFRP1, EGFR, ACTA2, FOXC2, ALDH1A3* → **Likely malignant (basal subtype)**

* **Cluster 8** – **Luminal epithelial cells (HER2⁺ enriched)** — *Epithelial*
  ↳ *ERBB2, CCND1, FOXA1, ENAH, CTTN, MDM2, FASN* → **Malignant**

* **Cluster 9** – **Stressed or inflamed luminal-like cells / myeloid mix** — *Epithelial / Myeloid*
  ↳ *ERBB2, MMP12, KRT8, S100A8, MPO, CEACAM6, C15orf48* → **Likely malignant**

* **Cluster 10** – **Plasma / pre-B cells** — *Lymphoid*
  ↳ *MZB1, CD79A/B, PRDM1, SLAMF7, TNFRSF17* → **Not malignant**

* **Cluster 11** – **B cells (naive and memory)** — *Lymphoid*
  ↳ *CD19, MS4A1, CD79A/B, BANK1, CXCR4, CCR7* → **Not malignant**

---

Let me know if you want these linked with **PAM50 subtypes**, **immune scores**, or **trajectory analyses**.


In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant luminal proliferative', '1': 'T',
                                                      '2': 'CAF', '3': 'Macrophage', 
                         '4': 'Malignant luminal', '5': 'Endothelial', '6': 'Malignant luminal ER+ HER2+',
                                                     '7': 'Malignant basal', '8': 'Malignant luminal HER2+', 
                                                      '9': 'Malignant luminal stressed', '10': 'Plasma', '11': 'B'})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Lymphoid',
                                                      '2': 'Mesenchymal', '3': 'Myeloid', 
                         '4': 'Malignant', '5': 'Endothelial', '6': 'Malignant',
                                                     '7': 'Malignant', '8': 'Malignant', 
                                                      '9': 'Malignant', '10': 'Lymphoid', '11': 'Lymphoid'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX126

In [None]:
sample_id = "TENX126"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of the **pancreatic cancer clusters**, listing **cell type**, **lineage**, and whether likely **malignant** — based on gene markers:

---

* **Cluster 0**: **Cancer-associated fibroblasts (myofibroblastic)** — *Mesenchymal*
  ↳ *ACTA2, PDGFRB, SFRP2, THBS2, MYH11, SFRP4* → **Likely non-malignant**

* **Cluster 1**: **Acinar-like cells (digestive enzyme-producing)** — *Epithelial*
  ↳ *AMY2A, ANPEP, CFTR, DPEP1, KCNK3, AQP8* → **Likely non-malignant**

* **Cluster 2**: **T cells / Myeloid mix** — *Lymphoid / Myeloid*
  ↳ *CD3E, CD4, CD8A, CD68, CD163, MRC1, CTLA4* → **Non-malignant**

* **Cluster 3**: **Ductal / cancer cells (low/intermediate proliferation)** — *Epithelial*
  ↳ *EPCAM, CFTR, KRT7, EGFR, ERBB2, MLPH* → **Possibly malignant**

* **Cluster 4**: **Proliferative ductal / tumor cells (high-grade)** — *Epithelial*
  ↳ *MKI67, TOP2A, CDK1, MDM2, ERBB2, KRT7, EPCAM, MYC* → **Likely malignant**

* **Cluster 5**: **Endothelial cells (vascular / lymphatic)** — *Endothelial*
  ↳ *PECAM1, VWF, CD34, EGFL7, SOX17, ANGPT2* → **Non-malignant**

* **Cluster 6**: **Acinar-like / mixed secretory phenotype** — *Epithelial*
  ↳ *AMY2A, CFTR, KLK11, ANPEP, DIRAS3, KIT* → **Likely non-malignant**

* **Cluster 7**: **Endocrine islet cells (α, β, δ, PP)** — *Epithelial*
  ↳ *INS, GCG, SST, CHGA, PCSK2, PPY, SCGN* → **Non-malignant**

* **Cluster 8**: **Ductal epithelial cells (non-proliferative)** — *Epithelial*
  ↳ *EPCAM, FXYD2, CFTR, CA4, TM4SF18* → **Possibly malignant (low-grade)**

---

Let me know if you'd like spatial mapping, subtype (classical/basal-like) annotations, or clustering condensed by major compartments.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Myofibroblast', '1': 'Acinar',
                                                      '2': 'T / Myeloid', '3': 'Malignant ductal', 
                         '4': 'Malignant proliferative ductal', '5': 'Endothelial', '6': 'Acinar',
                                                     '7': 'Endocrine islet', '8': 'Malignant ductal', })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Epithelial',
                                                      '2': 'Lymphoid/Myeloid', '3': 'Malignant', 
                         '4': 'Malignant', '5': 'Endothelial', '6': 'Epithelial',
                                                     '7': 'Epithelial', '8': 'Malignant', })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX148

In [None]:
sample_id = "TENX148"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a concise annotation of the **colorectal cancer clusters**, each labeled with **cell type**, **lineage**, key **marker genes**, and whether the cluster is **likely malignant** or not:

---

* **Cluster 0**: **Colorectal cancer epithelial cells** — *Epithelial*
  ↳ *AXIN2, CEACAM5/6, MYC, RNF43, ASCL2, LGR5* → **Likely malignant**

* **Cluster 1**: **Colorectal cancer epithelial cells (proliferative/stem-like)** — *Epithelial*
  ↳ *CEACAM5/6, LGR5, MYC, ASCL2, KRT23, RNF43, MKI67* → **Likely malignant**

* **Cluster 2**: **Pro-inflammatory macrophages / monocytes** — *Myeloid*
  ↳ *IL1B, FCGR3A, ITGAX, CD14, CCL4, S100A12* → **Non-malignant**

* **Cluster 3**: **Endothelial cells / pericytes** — *Endothelial*
  ↳ *PECAM1, CDH5, VWF, PLVAP, RGS5, NOTCH3* → **Non-malignant**

* **Cluster 4**: **TAMs / M2-like macrophages** — *Myeloid*
  ↳ *CD68, CD163, MRC1, APOE, MMP9, C1QA/B/C* → **Non-malignant**

* **Cluster 5**: **T cells (CD4+/CD8+, cytotoxic and regulatory mix)** — *Lymphoid*
  ↳ *CD3D/E/G, CD8A, GZMK, CTLA4, TIGIT, PRF1* → **Non-malignant**

* **Cluster 6**: **Enterocyte-like / normal colon epithelium** — *Epithelial*
  ↳ *CA2, REG4, FABP2, OLFM4, CLCA1, CDHR5, ITLN1* → **Likely non-malignant**

* **Cluster 7**: **Colorectal cancer epithelial cells (cycling)** — *Epithelial*
  ↳ *MKI67, UBE2C, LGR5, MYC, SLC7A5, CEACAM6, ASCL2* → **Likely malignant**

* **Cluster 8**: **Cancer-associated fibroblasts (iCAF/myCAF mix)** — *Mesenchymal*
  ↳ *ACTA2, FAP, COL11A1, INHBA, MMP11, VCAN, IGFBP7* → **Non-malignant**

* **Cluster 9**: **B/plasma-like cells (IgA/IgG producing)** — *Lymphoid*
  ↳ *CD79A/B, MZB1, TNFRSF17, BANK1, IGHG3* → **Non-malignant**

* **Cluster 10**: **Stromal fibroblasts / myofibroblasts** — *Mesenchymal*
  ↳ *ACTA2, TAGLN, GREM1, CRYAB, SMOC2, TGFBI, RSPO3* → **Non-malignant**

---

Let me know if you'd like subtyping (CMS classification), tumor purity estimates, or ligand–receptor interaction predictions based on these clusters.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant proliferative/stem-like',
                                                      '2': 'Pro-inflammatory macrophage', '3': 'Endothelial', 
                         '4': 'Macrophage M2', '5': 'T', '6': 'Epithelial enterocyte',
                                                     '7': 'Malignant cycling', '8': 'CAF', '9': 'Plasma/B', '10': "Myofibroblast"})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Myeloid', '3': 'Endothelial', 
                         '4': 'Myeloid', '5': 'Lymphoid', '6': 'Epithelial',
                                                     '7': 'Malignant', '8': 'Mesenchymal', '9': 'Lymphoid', '10': "Mesenchymal"})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI883

In [None]:
sample_id = "NCBI883"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of the clusters in the context of **lung fibrosis**, with **cell types**, **lineages**, key **marker genes**, and whether they are **likely fibrogenic/pro-fibrotic** or not:

---

* **Cluster 0** – **Endothelial cells** — *Endothelial*
  ↳ *PECAM1, CD34, CLDN5, KDR, PLVAP* → **Not fibrogenic**

* **Cluster 1** – **Inflammatory/activated immune mix (neutrophil/DC-like)** — *Myeloid / Other*
  ↳ *S100A8/A12, ITGAX, CXCL13, LTB, MS4A1* → **Possibly pro-fibrotic**

* **Cluster 2** – **Macrophages (M2-like / fibrotic)** — *Myeloid*
  ↳ *CD68, MRC1, C1QC, CD163, TGFB1, SPP1* → **Fibrogenic**

* **Cluster 3** – **Alveolar epithelial cells (AT2/AT1 mix)** — *Epithelial*
  ↳ *SFTPC, SFTPD, NAPSA, EPCAM, KRT8, NKX2-1* → **Possibly damaged/transitioning (pro-fibrotic)**

* **Cluster 4** – **Activated fibroblasts / myofibroblasts** — *Mesenchymal*
  ↳ *COL1A1, ACTA2, PDGFRA/B, FAP, FN1, VIM, SNAI2* → **Fibrogenic**

* **Cluster 5** – **Fibroblasts / matrix fibroblasts** — *Mesenchymal*
  ↳ *COL1A1, COL3A1, FN1, DCN, SPARCL1, RSPO3, TGFB2* → **Fibrogenic**

* **Cluster 6** – **Alveolar epithelial cells (injured AT2-like)** — *Epithelial*
  ↳ *SFTPD, CEACAM6, EPCAM, KRT18, NAPSA, AXIN2, EGFR* → **Possibly pro-fibrotic**

* **Cluster 7** – **T cells (CD8+ cytotoxic and effector)** — *Lymphoid*
  ↳ *CD3D/E, CD8A/B, GZMK, KLRG1, CTLA4* → **Not fibrogenic**

---

Let me know if you'd like fibroblast subtype annotation (e.g., myoCAF/iCAF), EMT signature scoring, or pseudotime trajectory inference.



In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Inflammatory/activated myeloid mix',
                                                      '2': 'Macrophage M2', '3': 'AT1/AT2', 
                         '4': 'Myofibroblast', '5': 'Matrix fibroblast', '6': 'AT2',
                                                     '7': 'T cytotoxic', })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Myeloid',
                                                      '2': 'Myeloid', '3': 'Epithelial', 
                         '4': 'Mesenchymal', '5': 'Mesenchymal', '6': 'Epithelial',
                                                     '7': 'Lymphoid', })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI857

In [None]:
sample_id = "NCBI857"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here is a **succinct annotation** of the clusters in the context of **lung fibrosis**, including **cell types**, **lineages**, key **marker genes**, and whether they are **likely fibrogenic** or **not**:

---

* **Cluster 0** – **T cells (CD4+, CD8+, activated)** — *Lymphoid*
  ↳ *CD3D/E/G, CD8A, GZMK, FOXP3, CXCL13* → **Not fibrogenic**

* **Cluster 1** – **Myofibroblasts / activated fibroblasts** — *Mesenchymal*
  ↳ *COL1A1/2, ACTA2, FAP, PDGFRA, SFRP2, FN1* → **Fibrogenic**

* **Cluster 2** – **Monocyte-derived macrophages (M2-like)** — *Myeloid*
  ↳ *CD68, MRC1, SPP1, CD14, FCGR3A, TGFB1* → **Fibrogenic**

* **Cluster 3** – **Club cells / bronchiolar epithelial** — *Epithelial*
  ↳ *SCGB1A1, KRT5/8/17, CEACAM6, MUC5B, EPCAM* → **Not fibrogenic**

* **Cluster 4** – **Ciliated / bronchiolar epithelial cells** — *Epithelial*
  ↳ *FOXJ1, KRT8/18, NKX2-1, SOX2, EPCAM* → **Not fibrogenic**

* **Cluster 5** – **B cells / plasma B cells** — *Lymphoid*
  ↳ *MS4A1 (CD20), CD19, CD79A/B, TNFRSF13C* → **Not fibrogenic**

* **Cluster 6** – **Endoplasmic reticulum-stressed cells / plasma-like** — *Mixed (possibly B-lineage or stressed epithelium)*
  ↳ *XBP1, FKBP11, DERL3, PDIA4, SEC11C* → **Possibly fibrogenic**

* **Cluster 7** – **Endothelial cells** — *Endothelial*
  ↳ *PECAM1, CD34, PLVAP, CLDN5, KDR* → **Not fibrogenic**

* **Cluster 8** – **Myofibroblasts / invasive fibroblasts** — *Mesenchymal*
  ↳ *COL1A1/2, ACTA2, FN1, PDGFRB, AXL, ZEB1, TGFB3* → **Fibrogenic**

* **Cluster 9** – **Inflammatory macrophages / monocytes** — *Myeloid*
  ↳ *CD68, MARCO, ITGAX, FCER1G, SPP1, TGFB1* → **Fibrogenic**

* **Cluster 10** – **Mixed T cells with epithelial gene expression (possibly doublets)** — *Mixed*
  ↳ *CD3E/D, SCGB1A1, KRTs, CEACAM6, CD8A* → **Uncertain / likely artifact or mixed**

---

Let me know if you want help with trajectory inference (e.g., epithelial-to-mesenchymal transition) or more granular fibroblast subtyping.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'T', '1': 'Myofibroblast',
                                                      '2': 'Macrophage M2', '3': 'Epithelial club / bronchiolar', 
                         '4': 'Epithelial ciliated / bronchiolar', '5': 'Plasma/B', '6': 'Endoplasmic reticulum-stressed',
                                                     '7': 'Endothelial', '8': 'Myofibroblast',
                                                      '9': 'Inflammatory macrophage', '10': "Noise"})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Mesenchymal',
                                                      '2': 'Myeloid', '3': 'Epithelial', 
                         '4': 'Epithelial', '5': 'Lymphoid', '6': 'Lymphoid',
                                                     '7': 'Endothelial', '8': 'Mesenchymal',
                                                      '9': 'Myeloid', '10': "Noise"})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX105

In [None]:
sample_id = "TENX105"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **succinct annotation** of the clusters in the context of **kidney cancer**, with **cell types**, **lineages**, select **marker genes**, and whether they’re **likely malignant**:

---

* **Cluster 0** – **CD8⁺ T cells (cytotoxic, exhausted)** — *Lymphoid*
  ↳ *CD8A, GZMK, PRF1, PDCD1, CTLA4* → **Not malignant**

* **Cluster 1** – **Myofibroblasts / cancer-associated fibroblasts (CAFs)** — *Mesenchymal*
  ↳ *ACTA2, MYH11, PDGFRB, SFRP4, THY1* → **Possibly malignant-associated**

* **Cluster 2** – **Secretory / rare epithelial or neuroendocrine-like cells** — *Epithelial / mixed*
  ↳ *SST, SCGN, DMBT1, CPA3, PGR, CYP2A7* → **Unclear / likely non-malignant**

* **Cluster 3** – **Tumor-associated macrophages (TAMs), M2-like** — *Myeloid*
  ↳ *CD68, CD163, MRC1, VSIG4, AIF1* → **Not malignant**

* **Cluster 4** – **CD4⁺ and B cells (mixed T-helper and B-lineage)** — *Lymphoid*
  ↳ *CD4, IL7R, CD3D/E, CD19, CD79A* → **Not malignant**

* **Cluster 5** – **Endothelial cells** — *Endothelial*
  ↳ *PECAM1, VWF, CD34, EGFL7, CLEC14A* → **Not malignant**

* **Cluster 6** – **Smooth muscle cells / fibroblasts with myeloid contamination** — *Mesenchymal / Myeloid*
  ↳ *ACTA2, MYLK, CD68, FCGR3A, CNN1* → **Not malignant**

* **Cluster 7** – **Proliferating tumor-like cells or CAFs** — *Epithelial / Mesenchymal (ambiguous)*
  ↳ *MKI67, MYC, MET, EGFR, TNC, MDM2* → **Possibly malignant**

* **Cluster 8** – **Plasma B cells / cycling lymphoid-like** — *Lymphoid*
  ↳ *MZB1, CD79A, TNFRSF17, FKBP11* → **Not malignant**

---

Let me know if you want specific identification of malignant epithelial clusters using known **ccRCC / papillary RCC markers**.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'T cytotoxic', '1': 'Myofibroblast',
                                                      '2': 'Neuroendocrine-like', '3': 'Macrophage M2', 
                         '4': 'T/B', '5': 'Endothelial', '6': 'Smooth muscle cell',
                                                     '7': 'Malignant', '8': 'Plasma/B'})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Mesenchymal',
                                                      '2': 'Epithelial', '3': 'Myeloid', 
                         '4': 'Lymphoid', '5': 'Endothelial', '6': 'Mesenchymal',
                                                     '7': 'Malignant', '8': 'Lymphoid'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI783

In [None]:
sample_id = "NCBI783"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **succinct annotation** of your **breast cancer clusters**, with **cell type**, **supporting marker genes**, and indication of whether the cluster is **likely malignant**, ordered by cluster number:

---

* **Cluster 0** – **T cells (CD8⁺, cytotoxic & regulatory)** — *Lymphoid*
  ↳ *CD3E, CD8A, GZMA, PRF1, CTLA4, FOXP3* → **Not malignant**

* **Cluster 1** – **Cancer-associated fibroblasts (CAFs)** — *Mesenchymal*
  ↳ *LUM, POSTN, PDGFRA, SFRP4, MMP2, ZEB1/2* → **Not malignant**

* **Cluster 2** – **Tumor-associated macrophages (TAMs)** — *Myeloid*
  ↳ *CD68, CD163, MRC1, FCGR3A, FGL2, CXCL16* → **Not malignant**

* **Cluster 3** – **Basal-like epithelial cells** — *Epithelial (Basal subtype)*
  ↳ *KRT5, KRT14, ACTA2, EGFR, SFRP1, RUNX1* → **Likely malignant**

* **Cluster 4** – **Luminal epithelial cells (HER2⁺, ER⁺)** — *Epithelial*
  ↳ *ERBB2, FOXA1, EPCAM, KRT8, MLPH, CCND1, MDM2, TOP2A* → **Malignant**

* **Cluster 5** – **Plasma/B cell mix with fibroblast signals** — *Lymphoid / Mesenchymal*
  ↳ *MS4A1, CD19, BANK1 + CCDC80, SFRP4, LUM* → **Mixed, mostly not malignant**

* **Cluster 6** – **Endothelial cells** — *Endothelial*
  ↳ *PECAM1, VWF, CLEC14A, RAMP2, CAV1, SOX17, ANGPT2* → **Not malignant**

* **Cluster 7** – **Plasma cells** — *Lymphoid*
  ↳ *CD79A/B, MZB1, TNFRSF17, PRDM1, SLAMF7* → **Not malignant**

* **Cluster 8** – **Naive/activated B cells** — *Lymphoid*
  ↳ *CD19, CD83, SPIB, IL3RA, TCL1A, BANK1* → **Not malignant**

---

Let me know if you want these labeled by PAM50 subtype or visualized on UMAP.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'T', '1': 'CAF',
                                                      '2': 'Macrophage', '3': 'Malignant basal', 
                         '4': 'Malignant luminal ER+ HER2+', '5': 'Plasma/B', '6': 'Endothelial',
                                                     '7': 'Plasma', '8': 'B'})

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Mesenchymal',
                                                      '2': 'Myeloid', '3': 'Malignant', 
                         '4': 'Malignant', '5': 'Lymphoid', '6': 'Endothelial',
                                                     '7': 'Lymphoid', '8': 'Lymphoid'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX143

In [None]:
sample_id = "TENX143"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a **succinct annotation** of your **lymph node clusters**, with **cell type**, **marker genes**, and whether the cluster is **likely malignant**, in order:

---

* **Cluster 0** – **Naïve/central memory T cells** — *Lymphoid*
  ↳ *CD3E, TCF7, LEF1, CD6* → **Not malignant**

* **Cluster 1** – **B cells (naïve/follicular)** — *Lymphoid*
  ↳ *MS4A1, CD79A, CD19, PAX5* → **Not malignant**

* **Cluster 2** – **Mast cells** — *Myeloid*
  ↳ *MS4A2, CMA1, HDC, TPSAB1* → **Not malignant**

* **Cluster 3** – **Fibroblastic reticular/stromal cells** — *Mesenchymal*
  ↳ *PDGFRA, CXCL12, TNC, MMP14* → **Not malignant**

* **Cluster 4** – **Blood endothelial cells** — *Endothelial*
  ↳ *PECAM1, PLVAP, CD34, CLDN5* → **Not malignant**

* **Cluster 5** – **Lymphatic endothelial-like macrophages (LEC-like/TREM2⁺ macrophages)** — *Myeloid*
  ↳ *MRC1, MARCO, CLEC4M, CD209, TGFBR2* → **Not malignant**

* **Cluster 6** – **TAM-like macrophages** — *Myeloid*
  ↳ *CD68, CD163, MAF, SIGLEC1, AXL* → **Not malignant**

* **Cluster 7** – **Plasma cells** — *Lymphoid*
  ↳ *XBP1, MZB1, SLAMF7, PRDM1, TNFRSF17* → **Not malignant**

* **Cluster 8** – **Conventional dendritic cells (cDCs)** — *Myeloid*
  ↳ *ITGAX, SPI1, CLEC9A, BATF3* → **Not malignant**

* **Cluster 9** – **Plasmacytoid DCs / Activated monocyte-like cells** — *Myeloid*
  ↳ *CLEC4C, TCF4, IRF7, GZMB, SPIB* → **Not malignant**

* **Cluster 10** – **Proliferating germinal center B cells / plasmablasts** — *Lymphoid*
  ↳ *CD79A, BACH2, IRF8, PCNA, MCM5, TYMS* → **Not malignant**

* **Cluster 11** – **Adipocyte-like stromal / perivascular cells** — *Mesenchymal*
  ↳ *ADIPOQ, PLIN1, LEP, ALPL* → **Not malignant**

* **Cluster 12** – **Granulocytes / neutrophil-like** — *Myeloid*
  ↳ *FCGR3B, CXCR2, S100A12, ANXA3* → **Not malignant**

---

Let me know if you'd like UMAP overlays, trajectory inference, or a malignant cell classifier applied.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Na√Øve/central memory T', '1': 'B',
                                                      '2': 'Mast', '3': 'Fibroblastic reticular/stromal', 
                         '4': 'Blood endothelial', '5': 'Lymphatic endothelial-like macrophage', '6': 'TAM-like macrophage',
                                                     '7': 'Plasma', '8': 'Conventional dendritic cell', '9': 'Plasmacytoid DC', 
                                                      '10': 'Proliferating germinal center B', '11': 'Adipocyte-like stromal', 
                                                     '12': 'Granulocytes / neutrophil'})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Lymphoid',
                                                      '2': 'Myeloid', '3': 'Mesenchymal', 
                         '4': 'Endothelial', '5': 'Myeloid', '6': 'Myeloid',
                                                     '7': 'Lymphoid', '8': 'Myeloid', '9': 'Myeloid',
                                                      '10': 'Lymphoid', '11': 'Mesenchymal', 
                                                     '12': 'Myeloid'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI865

In [None]:
sample_id = "NCBI865"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here's a **succinct annotation** of your **lung fibrosis clusters**, with **cell types**, **marker genes**, and **malignancy relevance**, ordered by cluster number:

---

* **Cluster 0** – **Vascular Endothelial Cells (Capillary/Arterial)** — *Endothelial*
  ↳ *PECAM1, EPAS1, KDR, CLDN5, APLNR, RAMP2* → **Not malignant**

* **Cluster 1** – **Myofibroblasts / Activated Fibroblasts** — *Mesenchymal*
  ↳ *COL1A1, ACTA2, FAP, POSTN, SFRP2, CTHRC1* → **Key in fibrosis**

* **Cluster 2** – **T & B Cells (mixed)** — *Lymphoid*
  ↳ *CD3E, CD8A, CD19, MS4A1, FOXP3, TRAC* → **Not malignant**

* **Cluster 3** – **Mast Cells** — *Myeloid*
  ↳ *TPSAB1, CPA3, KIT, S100A12* → **Not malignant**

* **Cluster 4** – **TAMs / M2-like Macrophages** — *Myeloid*
  ↳ *CD68, MRC1, CCL18, AXL, HLA-DRA* → **Fibrosis-promoting**

* **Cluster 5** – **Plasma B Cells / Secretory-like** — *Lymphoid*
  ↳ *XBP1, TNFRSF17, FKBP11, CD27, DERL3* → **Not malignant**

* **Cluster 6** – **Alveolar Type 2 (AT2) / Tumor-like Epithelial** — *Epithelial*
  ↳ *SFTPC, SFTPD, NAPSA, NKX2-1, EPCAM* → **Can include dysplastic / pre-malignant cells**

* **Cluster 7** – **Fibro-inflammatory Macrophages** — *Myeloid*
  ↳ *CD68, MS4A7, COL1A1, COL3A1, LUM, DCN* → **Profibrotic**

* **Cluster 8** – **Basal & Secretory Epithelial Cells (mixed)** — *Epithelial*
  ↳ *KRT5, CEACAM6, EPCAM, AGR3, EGFR, TP63* → **Can harbor transformation in fibrosis**

* **Cluster 9** – **Pathogenic Myofibroblasts** — *Mesenchymal*
  ↳ *ACTA2, COL3A1, COL1A1, PDGFRB, FGF2, ZEB1* → **Key fibrosis drivers**

* **Cluster 10** – **Club & Goblet Cells / Mucociliary Epithelium** — *Epithelial*
  ↳ *SCGB1A1, MUC5B, FOXJ1, KRT8, CEACAM5, SOX2* → **Hyperplasia in fibrosis**

* **Cluster 11** – **CD8⁺ Cytotoxic T Cells / NK-like** — *Lymphoid*
  ↳ *GZMB, CD8A, NKG7, FGFBP2, HAVCR2* → **Not malignant**

* **Cluster 12** – **Transitional / EMT-like Fibroblasts** — *Mesenchymal*
  ↳ *VIM, FN1, COL1A1, SNAI2, PPARG, ZEB1* → **Fibrosis-associated**

---

Let me know if you'd like UMAPs annotated or epithelial subclusters parsed further (e.g. AT1 vs AT2 vs transitional).


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Vascular Endothelial', '1': 'Myofibroblast',
                                                      '2': 'T / B', '3': 'Mast', 
                         '4': 'Macrophage M2', '5': 'Plasma', '6': 'AT2',
                                                     '7': 'Fibro-inflammatory Macrophage',
                        '8': 'Epithelial basal / secretory', '9': 'Myofibroblast', '10': 'Epithelial club / goblet',
                             '11': 'T cytotoxic', '12': 'Transitional / EMT-like Fibroblast'
                                                     })

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Endothelial', '1': 'Mesenchymal',
                                                      '2': 'Lymphoid', '3': 'Myeloid', 
                         '4': 'Myeloid', '5': 'Lymphoid', '6': 'Epithelial',
                                                     '7': 'Myeloid',
                        '8': 'Epithelial', '9': 'Mesenchymal', '10': 'Epithelial',
                             '11': 'Lymphoid', '12': 'Mesenchymal'
                                                     })

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX140

In [None]:
sample_id = "TENX140"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a concise annotation of your **pancreatic cancer clusters**, with assigned **cell types** and whether they are likely **malignant**, based on canonical marker genes and expression patterns:

---

* **Cluster 0** – **Fibroblasts / Cancer-associated fibroblasts (CAFs)**
  ↳ *SPARC, FN1, LUM, ACTA2, DCN, TGFB1, PDGFRA* → **Not malignant**, but **pro-tumorigenic stromal**

* **Cluster 1** – **B Cells / Plasma Cells (mixed)**
  ↳ *IGHG1, MS4A1, CD19, CD79A, JCHAIN, MZB1* → **Not malignant**

* **Cluster 2** – **Epithelial / Ductal Tumor Cells**
  ↳ *CEACAM6, EPCAM, KRAS, ERBB2, SOX9, CTNNB1* → **Likely malignant**

* **Cluster 3** – **Myeloid-derived suppressor cells (MDSCs) / Immature myeloid**
  ↳ *S100A9, IL1B, CXCR2, IL1A, ARG1* → **Not malignant**, but **immunosuppressive**

* **Cluster 4** – **Monocyte-derived macrophages / TAM-like**
  ↳ *CD14, CXCR4, TREM2, FCGR2A, APOE, ITGAX* → **Not malignant**

* **Cluster 5** – **Epithelial / Highly proliferative Tumor Cells**
  ↳ *EPCAM, REG4, MKI67, CENPF, CDK1, KRAS, ERBB2, TP53* → **Malignant**

* **Cluster 6** – **Myofibroblasts / Stromal Fibroblasts**
  ↳ *SPARCL1, ACTA2, PDGFRA, DCN, IL1R1* → **Not malignant**, **fibrotic support**

* **Cluster 7** – **TAMs / M2-like Macrophages**
  ↳ *CD163, CSF1R, AIF1, IL10RA, MARCO, VSIG4* → **Not malignant**, but **tumor-promoting**

---

Let me know if you'd like clusters visualized or subclustered (e.g., CAF subtypes or epithelial hierarchy).


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'CAF', '1': 'Plasma/B',
                                                      '2': 'Malignant ductal', '3': 'Myeloid-derived suppressor cell', 
                         '4': 'Monocyte-derived macrophage', '5': 'Malignant proliferative', '6': 'Myofibroblast',
                                                     '7': 'Macrophage M2',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Lymphoid',
                                                      '2': 'Malignant', '3': 'Myeloid', 
                         '4': 'Myeloid', '5': 'Malignant', '6': 'Mesenchymal',
                                                     '7': 'Myeloid',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI866

In [None]:
sample_id = "NCBI866"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here’s a concise annotation of the **clusters** in your **lung fibrosis** dataset, with **cell types**, **supporting marker genes**, and notes on likely **malignancy**:

---

* **Cluster 0** – *T cells (mixed CD4⁺/CD8⁺)*
  → `CD3E`, `CD8A`, `CD4`, `IL7R`, `CXCL13`
  **Malignant?** No

* **Cluster 1** – *Myofibroblasts / Activated Fibroblasts*
  → `COL1A1`, `ACTA2`, `FAP`, `SFRP2`, `YAP1`, `CTHRC1`
  **Malignant?** No (fibrogenic)

* **Cluster 2** – *Endothelial Cells (vascular)*
  → `PECAM1`, `PLVAP`, `CLDN5`, `KDR`, `RAMP2`
  **Malignant?** No

* **Cluster 3** – *Plasma Cells / ER-stressed secretory cells*
  → `JCHAIN`, `XBP1`, `CD79A`, `PDIA4`, `SSR3`
  **Malignant?** No

* **Cluster 4** – *B cells (naive and memory)*
  → `MS4A1`, `CD19`, `CD79A`, `CXCR5`, `BANK1`
  **Malignant?** No

* **Cluster 5** – *Macrophages (M2-like / profibrotic)*
  → `CD68`, `CD163`, `MARCO`, `MS4A7`, `HLA-DRA`
  **Malignant?** No

* **Cluster 6** – *Fibroblasts / Myofibroblasts (pro-fibrotic)*
  → `COL1A1`, `FAP`, `PDGFRA`, `ACTA2`, `SFRP4`
  **Malignant?** No

* **Cluster 7** – *Pathologic Fibroblasts / Transitional (basaloid) cells*
  → `ACTA2`, `COL1A1`, `YAP1`, `ITGB1`, `ZEB1`, `SNAI2`, `LGALS1`
  **Malignant?** Possibly (metaplastic/EMT-like)

* **Cluster 8** – *Epithelial Cells (club / AT2-like / basal)*
  → `SCGB3A2`, `KRT5`, `KRT8`, `SOX2`, `EGFR`, `SOX9`
  **Malignant?** Possibly (if basal markers co-exist with stress/EMT)

* **Cluster 9** – *Monocyte-derived Macrophages (pro-inflammatory)*
  → `LYZ`, `S100A8`, `SPP1`, `TGFB1`, `FCGR3A`
  **Malignant?** No

* **Cluster 10** – *Alveolar Epithelial Cells (AT2/AT1 transition)*
  → `SFTPC`, `KRT18`, `EPCAM`, `AGER`, `ITGB6`, `YAP1`
  **Malignant?** Possibly (if stressed/dysplastic)

---

Let me know if you'd like to visualize this as a table or compare with idiopathic pulmonary fibrosis (IPF) datasets.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'T', '1': 'Myofibroblast',
                                                      '2': 'Endothelial', '3': 'Plasma', 
                         '4': 'B', '5': 'Macrophage M2', '6': 'Myofibroblast',
                                                     '7': 'Pathologic Fibroblast', '8': 'Epithelial AT2 / club / basal', 
                                                     '9': 'Monocyte-derived Macrophage', '10': 'AT1/AT2'})

In [None]:
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Lymphoid', '1': 'Mesenchymal',
                                                      '2': 'Endothelial', '3': 'Lymphoid', 
                         '4': 'Lymphoid', '5': 'Macrophage M2', '6': 'Myofibroblast',
                                                     '7': 'Pathologic Fibroblast', '8': 'Epithelial AT2 / club / basal', 
                                                     '9': 'Monocyte-derived Macrophage', '10': 'AT1/AT2'})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX111

In [None]:
sample_id = "TENX111"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Certainly! Here's your **colorectal cancer (CRC)** cluster annotation rewritten in the same succinct format as before — listing **cluster number**, **cell type**, **malignancy status**, and example **marker genes**:

---

* **Cluster 0** – **Proliferative tumor epithelium**
  ↳ *EPCAM, MKI67, TYMS, TOP2A, RRM2, LGR5, CEACAM5* → **Malignant**

* **Cluster 1** – **Differentiated tumor / normal-like epithelium**
  ↳ *EPCAM, CDH1, CEACAM6, OLFM4, LGR5, RNF43* → **Likely malignant**

* **Cluster 2** – **Myofibroblasts / cancer-associated fibroblasts (CAFs)**
  ↳ *ACTA2, COL1A1, PDGFRA, TNC, SFRP4, GREM1* → **Not malignant**

* **Cluster 3** – **T cells and tumor-associated macrophages (TAMs)**
  ↳ *CD3E, CD8A, CXCR4, CD14, CD163, GZMK, HAVCR2* → **Not malignant**

* **Cluster 4** – **Endothelial / pericytes / stromal**
  ↳ *PECAM1, VWF, RGS5, PDGFRB, COL5A2, PLVAP* → **Not malignant**

* **Cluster 5** – **Myofibroblast-like CAFs**
  ↳ *ACTA2, GREM1, TAGLN, IGFBP7, COL1A1, THBS1* → **Not malignant**

* **Cluster 6** – **Inflamed tumor epithelium / hybrid epithelial-immune**
  ↳ *EPCAM, CEACAM5, CTNNB1, CTSB, APOE, HAVCR2* → **Likely malignant**

---

Let me know if you'd like to assess CAF subtypes or immune cell states next.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant proliferative', '1': 'Malignant differentiated',
                                                      '2': 'CAF', '3': 'T / Macrophage', 
                         '4': 'Endothelial', '5': 'Myofibroblast', '6': 'Malignant inflamed',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Malignant', '1': 'Malignant',
                                                      '2': 'Mesenchymal', '3': 'Myeloid/Lymphoid', 
                         '4': 'Endothelial', '5': 'Mesenchymal', '6': 'Malignant',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX119

In [None]:
sample_id = "TENX119"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here's a succinct annotation of your **healthy heart clusters**, including **cell type** and example **marker genes**:

---

* **Cluster 0** – **Cardiomyocytes**
  ↳ *DES, S100A1, PTGDS*

* **Cluster 1** – **Fibroblasts / Stromal cells**
  ↳ *FHL2, LPL, SMYD2, GPC1, PPARG*

* **Cluster 2** – **Endothelial cells**
  ↳ *VWF, PECAM1, CD34, CAV1, RAMP2*

* **Cluster 3** – **Fibroblasts / Perivascular stromal cells**
  ↳ *FBLN1, PDGFRA, VCAN, MFAP5, THBS2*

* **Cluster 4** – **Smooth muscle cells / Pericytes**
  ↳ *ACTA2, MYH11, CNN1, PDGFRB, MYLK*

* **Cluster 5** – **Macrophages / Myeloid cells**
  ↳ *CD68, CD14, MRC1, CD163, PTPRC*

* **Cluster 6** – **T cells / immune mix**
  ↳ *CD3E, CD2, GZMA, PRF1, IL7R*

---

Let me know if you'd like functional annotations or comparisons to disease states next.


In [None]:
adata.obs.groupby('leiden').mean()

In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Cardiomyocyte', '1': 'Fibroblast',
                                                      '2': 'Endothelial', '3': 'Fibroblast', 
                         '4': 'Smooth muscle cell', '5': 'Macrophage', '6': 'T',})

In [None]:
adata.obs['celltypes'] =  adata.obs.leiden.replace({'0': 'Mesenchymal', '1': 'Mesenchymal',
                                                      '2': 'Endothelial', '3': 'Mesenchymal', 
                         '4': 'Mesenchymal', '5': 'Myeloid', '6': 'Lymphoid',})

In [None]:
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX158

In [None]:
sample_id = "TENX158"

In [None]:
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
sc.tl.leiden(adata, resolution=0.7)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
for cl in adata.obs.leiden.unique():
    print(cl)
    print(sc.get.rank_genes_groups_df(adata, group=cl).head(50).names.ravel())

ChatGPT answer: Here's a concise cluster annotation for your **skin cancer** dataset, including **cell type**, **malignancy status**, and example **marker genes**:

---

* **Cluster 0** – **Melanoma-like tumor cells**
  ↳ *MLANA, SOX10, PRAME, ERBB3, MMP14* → **Malignant**

* **Cluster 1** – **Melanoma-like tumor cells** (similar to cluster 0)
  ↳ *MLANA, SOX10, PRAME, CDK2, MMP14* → **Malignant**

* **Cluster 2** – **CD8⁺ T cells (cytotoxic)**
  ↳ *CD8A, GZMB, CD3E, LAG3, CTLA4* → **Not malignant**

* **Cluster 3** – **Macrophages / TAMs**
  ↳ *CD68, CD14, MRC1, CD163, CSF1R* → **Not malignant**

* **Cluster 4** – **Cancer-associated fibroblasts (CAFs)**
  ↳ *COL1A1, PDGFRB, POSTN, MMP14, FAP* → **Not malignant**

* **Cluster 5** – **Proliferative tumor cells (melanocytic origin)**
  ↳ *TYMS, PCNA, BIRC5, PRAME, CDK2* → **Malignant**

* **Cluster 6** – **Dedifferentiated tumor / EMT-like cells**
  ↳ *HOXB7, VGF, L1CAM, MME, LDHA* → **Malignant**

* **Cluster 7** – **Vascular endothelial cells**
  ↳ *PECAM1, CDH5, COL4A1, PLVAP, EGFL7* → **Not malignant**

* **Cluster 8** – **Monocyte-derived macrophages / MDSCs**
  ↳ *FCGR3A, CD68, MSR1, PLAUR, CXCR4* → **Not malignant**

* **Cluster 9** – **Plasma / B cells**
  ↳ *MZB1, CD79A, XBP1, PRDM1* → **Not malignant**

* **Cluster 10** – **Keratinocytes / Differentiated epithelial**
  ↳ *DSG1, KLF4, TP63, GJA1, CDH3* → **Possibly pre-malignant or bystanders**

* **Cluster 11** – **Macrophages / myeloid (inflammatory)**
  ↳ *CD68, CTSC, CYBB, GRN, FCGR2A* → **Not malignant**

* **Cluster 12** – **Noise / low-quality / ambiguous**
  ↳ *DUX4, DEFA1, FOXA1, various testis- or GI-associated genes* → **Uncertain / likely artifact**

* **Cluster 13** – **Noise / ambiguous transcriptome**
  ↳ *DUX4, DEFB4A, GH1, IL5, TEX14* → **Uncertain / likely artifact**

* **Cluster 14** – **Noise / ambiguous transcriptome**
  ↳ *DUX4, PCA3, POTEI, GH1, HTR4* → **Uncertain / likely artifact**

---

Let me know if you'd like help merging similar clusters (e.g., tumor subtypes) or identifying immune cell states.


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True skips non-numeric columns (they raise TypeError in pandas >= 2.0);
# observed=True avoids the deprecated default for a categorical group key.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained annotation: Leiden cluster id -> cell subtype.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
cellsubtype_by_cluster = {
    '0': 'Malignant melanocytic',
    '1': 'Malignant melanocytic',
    '2': 'T cytotoxic',
    '3': 'Macrophage',
    '4': 'CAF',
    '5': 'Malignant melanocytic proliferative',
    '6': 'Malignant mesenchymal',
    '7': 'Vascular endothelial',
    '8': 'Monocyte-derived macrophage',
    '9': 'Plasma/B',
    '10': 'Keratinocyte',
    '11': 'Macrophage',
    '12': 'Noise',
    '13': 'Noise',
    '14': 'Noise',
}
adata.obs['cellsubtypes'] = (
    adata.obs.leiden.astype(str).map(cellsubtype_by_cluster).astype('category')
)

In [None]:
# Coarse annotation: Leiden cluster id -> broad cell type.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
celltype_by_cluster = {
    '0': 'Malignant',
    '1': 'Malignant',
    '2': 'Lymphoid',
    '3': 'Myeloid',
    '4': 'Mesenchymal',
    '5': 'Malignant',
    '6': 'Malignant',
    '7': 'Endothelial',
    '8': 'Myeloid',
    '9': 'Lymphoid',
    '10': 'Epithelial',
    '11': 'Myeloid',
    '12': 'Noise',
    '13': 'Noise',
    '14': 'Noise',
}
adata.obs['celltypes'] = (
    adata.obs.leiden.astype(str).map(celltype_by_cluster).astype('category')
)

In [None]:
# Plot the UMAP twice: colored by the coarse ('celltypes') and fine ('cellsubtypes') annotations.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation columns to celltypes.csv inside this sample's directory under base_dir.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX114

In [None]:
# Sample processed by the cells of this section; also names its sub-directory under base_dir.
sample_id = "TENX114"

In [None]:
# Load the sample and run the shared preprocessing defined at the top of the notebook:
# keep observations with total_counts >= 10, normalize, log1p, PCA, neighbors, UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering at resolution 0.7; later cells read the labels from adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Plot the UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes for each Leiden cluster; the next cell reads the result with sc.get.rank_genes_groups_df.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster, in category order so the
# output is stable across runs.
for cluster in adata.obs.leiden.cat.categories:
    print(cluster)
    top_markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(50)
    # Series.ravel is deprecated in pandas >= 2.2; to_numpy() returns the same 1-D array.
    print(top_markers.names.to_numpy())

ChatGPT answer: Here's a succinct annotation of your **healthy colon clusters**, including **cell types** and representative **marker genes**:

---

* **Cluster 0** – **Mesenchymal (Fibroblasts / Stromal)**
  ↳ *IGFBP7, TIMP3, THBS1, FZD7, ROBO1*

* **Cluster 1** – **B cells / Plasma-like**
  ↳ *CD79A, TNFRSF17, FKBP11, VCAN, MS4A7*

* **Cluster 2** – **Proliferative Epithelial (Transit-amplifying)**
  ↳ *SLC12A2, MKI67, PCLAF, LGR5, TYMS*

* **Cluster 3** – **Enterocytes (Absorptive Epithelium)**
  ↳ *SLC26A3, CA1, FABP2, CEACAM7, CLCA4*

* **Cluster 4** – **Enterocytes (Differentiated)**
  ↳ *CA1, CEACAM5, FABP2, SELENBP1, REG4*

* **Cluster 5** – **T Cells (CD4⁺ / CD8⁺ mixed)**
  ↳ *CD3D, CD8A, TRAC, GZMK, IL7R*

* **Cluster 6** – **Macrophages / Monocytes**
  ↳ *CD163, C1QA, MS4A7, FCN1, CD14*

* **Cluster 7** – **Inflammatory Epithelium / Immune-Stressed**
  ↳ *IL1B, S100A12, CCL20, PI3, TNFAIP3*

* **Cluster 8** – **Enteroendocrine / Neuroendocrine-like**
  ↳ *CHGA, CHGB, CALB2, SCG2, SCGN*

---

Let me know if you'd like to group subtypes further (e.g., secretory lineage vs. absorptive), or compare with CRC.


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True skips non-numeric columns (they raise TypeError in pandas >= 2.0);
# observed=True avoids the deprecated default for a categorical group key.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained annotation: Leiden cluster id -> cell subtype.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
cellsubtype_by_cluster = {
    '0': 'Mesenchymal',
    '1': 'Plasma/B',
    '2': 'Epithelial Proliferative',
    '3': 'Enterocyte Absorptive',
    '4': 'Enterocyte Differentiated',
    '5': 'T',
    '6': 'Macrophage',
    '7': 'Inflammatory Epithelium',
    '8': 'Neuroendocrine-like',
}
adata.obs['cellsubtypes'] = (
    adata.obs.leiden.astype(str).map(cellsubtype_by_cluster).astype('category')
)

In [None]:
# Coarse annotation: Leiden cluster id -> broad cell type.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
celltype_by_cluster = {
    '0': 'Mesenchymal',
    '1': 'Lymphoid',
    '2': 'Epithelial',
    '3': 'Epithelial',
    '4': 'Epithelial',
    '5': 'Lymphoid',
    '6': 'Myeloid',
    '7': 'Epithelial',
    '8': 'Epithelial',
}
adata.obs['celltypes'] = (
    adata.obs.leiden.astype(str).map(celltype_by_cluster).astype('category')
)

In [None]:
# Plot the UMAP twice: colored by the coarse ('celltypes') and fine ('cellsubtypes') annotations.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation columns to celltypes.csv inside this sample's directory under base_dir.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX106

In [None]:
# Sample processed by the cells of this section; also names its sub-directory under base_dir.
sample_id = "TENX106"

In [None]:
# Load the sample and run the shared preprocessing defined at the top of the notebook:
# keep observations with total_counts >= 10, normalize, log1p, PCA, neighbors, UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering at resolution 0.7; later cells read the labels from adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Plot the UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes for each Leiden cluster; the next cell reads the result with sc.get.rank_genes_groups_df.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster, in category order so the
# output is stable across runs.
for cluster in adata.obs.leiden.cat.categories:
    print(cluster)
    top_markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(50)
    # Series.ravel is deprecated in pandas >= 2.2; to_numpy() returns the same 1-D array.
    print(top_markers.names.to_numpy())

ChatGPT answer: Here’s a **succinct annotation** of your clusters in the context of a **healthy kidney**, based on canonical markers:

---

**Cluster 0** — *Proximal tubule epithelial cells*
**Markers**: *ANPEP*, *SLC22A8*, *DPEP1*, *GATM*, *ACE2*

**Cluster 1** — *Fibroblasts / Mesangial-like cells*
**Markers**: *ACTA2*, *PDGFRA*, *VCAN*, *FBLN1*, *FBN1*

**Cluster 2** — *Distal tubule / Loop of Henle*
**Markers**: *S100A1*, *CDH16*, *FXYD2*, *UMOD*, *TMEM174*

**Cluster 3** — *Collecting duct principal cells*
**Markers**: *AQP2*, *EPCAM*, *CDH16*, *FOXI1*, *PLCG2*

**Cluster 4** — *Endothelial cells (vascular)*
**Markers**: *PECAM1*, *CD34*, *VWF*, *ERG*, *RAMP2*

**Cluster 5** — *Myeloid cells / Macrophages / T cells (mixed)*
**Markers**: *PTPRC*, *CD14*, *CD68*, *CD3E*, *CD8A*, *CXCR4*

**Cluster 6** — *Collecting duct intercalated cells / Principal cells (mixed)*
**Markers**: *UMOD*, *FXYD2*, *KNG1*, *CDH16*, *PROX1*, *GPC3*

**Cluster 7** — *Endothelial (lymphatic & vascular mixed)*
**Markers**: *CD34*, *PDPN*, *RAMP2*, *SOX18*, *EGFL7*

**Cluster 8** — *Distal tubule / collecting duct (cycling/proliferating subset)*
**Markers**: *EPCAM*, *CDH16*, *PCNA*, *CD83*, *MET*

**Cluster 9** — *Vascular smooth muscle / pericytes*
**Markers**: *ACTA2*, *MYH11*, *CNN1*, *DES*, *PDGFRB*

**Cluster 10** — *Interstitial fibroblasts*
**Markers**: *OGN*, *SFRP4*, *MFAP5*, *COL5A2*, *PDGFRA*

---

Let me know if you’d like malignant potential or trajectory information added.


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True skips non-numeric columns (they raise TypeError in pandas >= 2.0);
# observed=True avoids the deprecated default for a categorical group key.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained annotation: Leiden cluster id -> cell subtype.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
cellsubtype_by_cluster = {
    '0': 'Epithelial proximal tubule',
    '1': 'Fibroblast',
    '2': 'Epithelial distal tubule',
    '3': 'Epithelial collecting duct principal cells',
    '4': 'Endothelial',
    '5': 'Myeloid/T',
    '6': 'Epithelial collecting duct intercalated cells',
    '7': 'Endothelial',
    '8': 'Epithelial distal tubule / collecting duct',
    '9': 'Vascular smooth muscle',
    '10': 'Interstitial fibroblast',
}
adata.obs['cellsubtypes'] = (
    adata.obs.leiden.astype(str).map(cellsubtype_by_cluster).astype('category')
)

In [None]:
# Coarse annotation: Leiden cluster id -> broad cell type.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
celltype_by_cluster = {
    '0': 'Epithelial',
    '1': 'Mesenchymal',
    '2': 'Epithelial',
    '3': 'Epithelial',
    '4': 'Endothelial',
    '5': 'Myeloid/Lymphoid',
    '6': 'Epithelial',
    '7': 'Endothelial',
    '8': 'Epithelial',
    '9': 'Mesenchymal',
    '10': 'Mesenchymal',
}
adata.obs['celltypes'] = (
    adata.obs.leiden.astype(str).map(celltype_by_cluster).astype('category')
)

In [None]:
# Plot the UMAP twice: colored by the coarse ('celltypes') and fine ('cellsubtypes') annotations.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation columns to celltypes.csv inside this sample's directory under base_dir.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# NCBI859

In [None]:
# Sample processed by the cells of this section; also names its sub-directory under base_dir.
sample_id = "NCBI859"

In [None]:
# Load the sample and run the shared preprocessing defined at the top of the notebook:
# keep observations with total_counts >= 10, normalize, log1p, PCA, neighbors, UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering at resolution 0.7; later cells read the labels from adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Plot the UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes for each Leiden cluster; the next cell reads the result with sc.get.rank_genes_groups_df.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster, in category order so the
# output is stable across runs.
for cluster in adata.obs.leiden.cat.categories:
    print(cluster)
    top_markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(50)
    # Series.ravel is deprecated in pandas >= 2.2; to_numpy() returns the same 1-D array.
    print(top_markers.names.to_numpy())

ChatGPT answer: Here is a **succinct annotation** of your clusters in the context of **lung fibrosis**, including **cell type** and **example marker genes**:

---

* **Cluster 0** – **Myofibroblasts / Activated Fibroblasts**
  ↳ *COL1A1, COL3A1, POSTN, CTHRC1, FAP*

* **Cluster 1** – **T cells (CD8⁺ / CD4⁺ mix)**
  ↳ *CD3D, CD3E, CD8A, GZMA, CTLA4*

* **Cluster 2** – **Macrophages / Monocytes (M2-like / Alveolar)**
  ↳ *CD68, MRC1, FCGR3A, SPP1, HLA-DRA*

* **Cluster 3** – **Matrix fibroblasts / Remodeling stromal cells**
  ↳ *COL1A1, LUM, FGF7, FN1, PDGFRA*

* **Cluster 4** – **AT2 cells / Epithelial (alveolar)**
  ↳ *SFTPC, NAPSA, EPCAM, KRT18, CEACAM6*

* **Cluster 5** – **Endothelial (vascular / capillary)**
  ↳ *PECAM1, CD34, EPAS1, KDR, CLDN5*

* **Cluster 6** – **Pericytes / Myofibroblasts (contractile)**
  ↳ *ACTA2, PDGFRB, CSPG4, ITGB1, CCN2*

* **Cluster 7** – **Macrophages (lipid-laden / profibrotic)**
  ↳ *CD68, MARCO, PPARG, FABP4, TREM2*

* **Cluster 8** – **Mast cells / Basophils (activated)**
  ↳ *CPA3, TPSAB1, KIT, FCER1A, CD69*

* **Cluster 9** – **Plasma B cells / ER-stressed B-lineage**
  ↳ *XBP1, JCHAIN, TNFRSF17, CD79A, HSPA5*

* **Cluster 10** – **Endothelial (inflamed / venous-like)**
  ↳ *PECAM1, VIM, RAMP2, GNG11, CLDN5*

* **Cluster 11** – **Neutrophils / Pro-inflammatory Myeloid**
  ↳ *S100A8, S100A9, ITGAX, IL1B, FCN1*

---

Let me know if you'd like to flag clusters with high fibrotic remodeling potential or compare to normal lung.


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True skips non-numeric columns (they raise TypeError in pandas >= 2.0);
# observed=True avoids the deprecated default for a categorical group key.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained annotation: Leiden cluster id -> cell subtype.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
cellsubtype_by_cluster = {
    '0': 'Myofibroblast',
    '1': 'T',
    '2': 'Macrophage M2',
    '3': 'Matrix fibroblast',
    '4': 'AT2',
    '5': 'Endothelial',
    '6': 'Myofibroblast',
    '7': 'Macrophage lipid-laden',
    '8': 'Mast',
    '9': 'Plasma/B',
    '10': 'Endothelial',
    '11': 'Neutrophil',
}
adata.obs['cellsubtypes'] = (
    adata.obs.leiden.astype(str).map(cellsubtype_by_cluster).astype('category')
)

In [None]:
# Coarse annotation: Leiden cluster id -> broad cell type.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
celltype_by_cluster = {
    '0': 'Mesenchymal',
    '1': 'Lymphoid',
    '2': 'Myeloid',
    '3': 'Mesenchymal',
    '4': 'Epithelial',
    '5': 'Endothelial',
    '6': 'Mesenchymal',
    '7': 'Myeloid',
    '8': 'Myeloid',
    '9': 'Lymphoid',
    '10': 'Endothelial',
    '11': 'Myeloid',
}
adata.obs['celltypes'] = (
    adata.obs.leiden.astype(str).map(celltype_by_cluster).astype('category')
)

In [None]:
# Plot the UMAP twice: colored by the coarse ('celltypes') and fine ('cellsubtypes') annotations.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation columns to celltypes.csv inside this sample's directory under base_dir.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# TENX117

In [None]:
# Sample processed by the cells of this section; also names its sub-directory under base_dir.
sample_id = "TENX117"

In [None]:
# Load the sample and run the shared preprocessing defined at the top of the notebook:
# keep observations with total_counts >= 10, normalize, log1p, PCA, neighbors, UMAP.
adata = preprocess_adata(base_dir, sample_id, n_cut=10)

In [None]:
# Leiden clustering at resolution 0.7; later cells read the labels from adata.obs['leiden'].
sc.tl.leiden(adata, resolution=0.7)

In [None]:
# Plot the UMAP colored by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes for each Leiden cluster; the next cell reads the result with sc.get.rank_genes_groups_df.
sc.tl.rank_genes_groups(adata, groupby='leiden')

In [None]:
# Print the 50 top-ranked genes of every Leiden cluster, in category order so the
# output is stable across runs.
for cluster in adata.obs.leiden.cat.categories:
    print(cluster)
    top_markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(50)
    # Series.ravel is deprecated in pandas >= 2.2; to_numpy() returns the same 1-D array.
    print(top_markers.names.to_numpy())

ChatGPT answer: Here’s a concise annotation of your **skin cancer** clusters, including **cell type identity** and example **marker genes**:

---

* **Cluster 0** – **T cells (CD8⁺ / CD4⁺ mix)**
  ↳ *CD3D, CD8A, GZMA, CCR7, FOXP3*

* **Cluster 1** – **Macrophages / Monocytes (immune suppressive)**
  ↳ *CD68, CD163, HLA-DPB1, C1QA, CXCL10*

* **Cluster 2** – **Malignant Melanocytes / Melanoma cells**
  ↳ *TYR, PMEL, MLANA, MITF, SOX10*

* **Cluster 3** – **Malignant Melanocytes / Melanoma cells**
  ↳ *PMEL, TYRP1, TFAP2B, MITF, TYR*

* **Cluster 4** – **Cancer-associated Fibroblasts (CAFs)**
  ↳ *COL6A1, FN1, PDGFRB, SFRP2, POSTN*

* **Cluster 5** – **Endothelial Cells (vascular / lymphatic)**
  ↳ *PECAM1, VWF, CD34, PLVAP, IGFBP7*

* **Cluster 6** – **Keratinocytes / Epithelial-like Tumor Cells**
  ↳ *KRT5, KRT17, TP63, TACSTD2, CLDN1*

* **Cluster 7** – **Malignant Melanocytes / EMT-like**
  ↳ *MITF, SOX10, EDNRB, S100B, MLANA*

* **Cluster 8** – **Proliferating T Cells (cytotoxic / exhausted)**
  ↳ *MKI67, CD8A, GZMB, TIGIT, LAG3*

* **Cluster 9** – **Mixed Tumor (Melanocytic + CAF signature)**
  ↳ *TYRP1, COL6A3, FN1, MITF, PDGFRB*

---

Let me know if you'd like to flag potential malignant clusters explicitly or compare to healthy skin.


In [None]:
# Per-cluster mean of the numeric obs columns.
# numeric_only=True skips non-numeric columns (they raise TypeError in pandas >= 2.0);
# observed=True avoids the deprecated default for a categorical group key.
adata.obs.groupby('leiden', observed=True).mean(numeric_only=True)

In [None]:
# Fine-grained annotation: Leiden cluster id -> cell subtype.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
cellsubtype_by_cluster = {
    '0': 'T',
    '1': 'Macrophage',
    '2': 'Malignant melanocytic',
    '3': 'Malignant melanocytic',
    '4': 'CAF',
    '5': 'Endothelial',
    '6': 'Malignant keratinocytes',
    '7': 'Malignant mesenchymal',
    '8': 'T cytotoxic',
    '9': 'Noise',
}
adata.obs['cellsubtypes'] = (
    adata.obs.leiden.astype(str).map(cellsubtype_by_cluster).astype('category')
)

In [None]:
# Coarse annotation: Leiden cluster id -> broad cell type.
# Series.replace on a categorical column is deprecated (pandas >= 2.2), so the ids
# are mapped as strings and cast back to category; ids missing from the mapping
# become NaN.
celltype_by_cluster = {
    '0': 'Lymphoid',
    '1': 'Myeloid',
    '2': 'Malignant',
    '3': 'Malignant',
    '4': 'Mesenchymal',
    '5': 'Endothelial',
    '6': 'Malignant',
    '7': 'Malignant',
    '8': 'Lymphoid',
    '9': 'Noise',
}
adata.obs['celltypes'] = (
    adata.obs.leiden.astype(str).map(celltype_by_cluster).astype('category')
)

In [None]:
# Plot the UMAP twice: colored by the coarse ('celltypes') and fine ('cellsubtypes') annotations.
sc.pl.umap(adata, color=['celltypes', 'cellsubtypes'])

In [None]:
# Write both annotation columns to celltypes.csv inside this sample's directory under base_dir.
adata.obs[['celltypes','cellsubtypes']].to_csv(base_dir / sample_id / 'celltypes.csv')

# Checking files

In [None]:
# List every entry under base_dir and, for each sample directory, its contents plus
# the contents of any "embeddings" sub-directory.
# NOTE(review): base_dir replaces the hard-coded absolute path
# '/ewsc/yatesjos/Broad_SpatialFoundation/hest_processed_data/' so the check follows
# the directory the annotations were written to — confirm both point to the same place.
for sample_dir in sorted(base_dir.iterdir()):
    print(sample_dir.stem)
    # Skip the 'move_embeddings' entry, and plain files (iterdir() would raise on them).
    if sample_dir.stem == 'move_embeddings' or not sample_dir.is_dir():
        continue
    print('------------------------------')
    for entry in sorted(sample_dir.iterdir()):
        print(entry)
        # A file such as 'embeddings.npy' also has stem 'embeddings'; only descend
        # into real directories.
        if entry.stem == 'embeddings' and entry.is_dir():
            for emb in sorted(entry.iterdir()):
                print(emb)