In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path
import logging
import json
import pandas as pd
import numpy as np
import scanpy as sc
import anndata as ad

from tqdm import tqdm

In [None]:
# Root directory holding one sub-directory per processed VisiumHD sample.
base_dir = Path('../../../Broad_SpatialFoundation/VisiumHD-LUAD-processed/')
# Samples are directories (each one is later expected to contain adata.h5ad).
# Use .name rather than .stem, which would cut a sample id at its last dot,
# skip stray files, and sort because iterdir() yields entries in arbitrary order.
sample_list = sorted(spl.name for spl in base_dir.iterdir() if spl.is_dir())

In [None]:
# Display the sample names found under base_dir.
sample_list

# LIB-064889st1

In [None]:
# Paths for this sample: its directory and the AnnData file inside it.
sample_name = 'LIB-064889st1'
sample_path = base_dir / sample_name
adata_path = sample_path / "adata.h5ad"

In [None]:
# Load the sample's AnnData object.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of X as loaded, before the in-place normalization below.
# NOTE(review): presumably raw counts at this point — confirm.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale each observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding, used for the plots below.
sc.tl.umap(adata)

In [None]:
# Leiden clustering; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# UMAP coloured by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with the 'wilcoxon' method.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, de_table in de_genes.items():
    print(cluster)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the
    # same 1-D array, so the printed output is unchanged.
    print(de_table['names'].head(50).to_numpy())

| Cluster | Likely annotation (LUAD context)                                                                                 | Quick rationale (marker highlights)                                                       | Malignant likely?                        |
| ------- | ---------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- | ---------------------------------------- |
| **2**   | **Myofibroblastic CAFs (ECM-remodeling stroma)**                                                                 | COL1A1/1A2/3A1/6A3, FN1, SPARC, VCAN, THBS2; MMP2/11/14; TAGLN/MYL9 → classic CAF         | ❌ No (stromal)                           |
| **5**   | **Secretory/mucinous epithelial (airway/club–goblet)** → *could be benign airway or mucinous LUAD component*     | MUC5B/MUC6/MUC1, SCGB3A1, WFDC2, PIGR, AZGP1, NAPSA, EPCAM                                | ⚠️ Possible (mucinous LUAD admixture)    |
| **0**   | **Epithelial–stroma interface (mucinous + ECM)**                                                                 | MUC5B/SCGB3A1/PIGR/WFDC2 with collagens/TIMP1 → epithelium over desmoplastic stroma       | ⚠️ Mixed (tumor + stroma)                |
| **14**  | **Tumor epithelium under stress/immune signaling (non-mucinous LUAD-like)**                                      | SCGB3A1 with IL10RA, CD38, DNA repair/stress (RAD1, AIFM2, OXSR1)                         | ✅ Likely                                 |
| **1**   | **Club/WNT-active tumor epithelium (LUAD-like)**                                                                 | SCGB3A1, **AXIN2** (WNT), DYRK1B, SHANK3; epithelial program with stress/repair           | ✅ Likely                                 |
| **15**  | **EMT-like LUAD cells with stromal program**                                                                     | SCGB3A1 + ECM (COL1A1/5A1), TAGLN/CALD1, SULF1; Ig background                             | ✅ Likely                                 |
| **4**   | **Secretory epithelium with Ig transport (airway/club)**                                                         | SCGB3A1/3A2, **PIGR**, IGKC; epithelial regulators (QKI, BRD8) without plasma/B-cell core | ❌ Lean non-malignant epithelium          |
| **9**   | **Basal/adhesion-rich tumor epithelium (LUAD, basal-leaning)**                                                   | CLDN1, LOXL4, CEACAM6, IFI6; signaling (AKT2/GRB2) with SCGB3A1                           | ✅ Likely                                 |
| **8**   | **Secretory/mucinous epithelial (airway/club–goblet)** → *candidate mucinous LUAD if spatially tumor-associated* | MUC5B/MUC6, SCGB3A1, WFDC2, PIGR, SLPI, EPCAM                                             | ⚠️ Possible (context dependent)          |
| **6**   | **Inflammatory/secretory LUAD epithelium**                                                                       | SCGB3A1, EPCAM, **MMP7, LCN2**, IGHA1, PLAT; stress/anti-apoptotic (BCL2L1)               | ✅ Likely                                 |
| **3**   | **Lymphoid immune (T/B mix; tertiary lymphoid-like)**                                                            | **CCL19/13/18**, SELPLG, TNFRSF11B, **POU2AF1**                                           | ❌ No — **lymphoid**                      |
| **13**  | **CAF–epithelial hybrid near tumor (reactive stroma + LUAD cells)**                                              | COL1A1, TNC, LTBP2, **MMP2/MMP7**, EPCAM, IFITM3                                          | ✅ Likely                                 |
| **12**  | **Activated CAFs / perivascular smooth-muscle–like**                                                             | **POSTN, PRRX1, TAGLN, MYH11, MYLK, AQP1**, EDIL3; ECM/vascular                           | ❌ No — **stromal/perivascular**          |
| **7**   | **Low-signal epithelial/tumor-leaning (uncertain)**                                                              | SCGB3A1, TNXB, THSD1, ST6GAL1; weak lineage cues                                          | ⚠️ Unclear (borderline tumor epithelium) |
| **10**  | **EGFR-positive LUAD epithelium (non-mucinous)**                                                                 | **EGFR**, PROM2, IL17RD, ALDH1A1; ER/UPR trafficking (ATF6, LAMP1)                        | ✅ Likely                                 |
| **11**  | **Lymphoid immune (B-cell–enriched)**                                                                            | **CD79A, BTK**, CCL19, TNFAIP6                                                            | ❌ No — **lymphoid (B-enriched)**         |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
# NOTE(review): the marker table above leans non-malignant for cluster 4,
# while it is labelled 'Malignant' here — confirm which is intended.
subtype_by_cluster = {
    '0': 'Epithelial - stromal interface',
    '1': 'Malignant - WNT-active',
    '2': 'Stromal - CAF myofibroblastic',
    '3': 'Lymphoid - B/T',
    '4': 'Malignant - mucous/secretory',
    '5': 'Epithelial - mucous/secretory',
    '6': 'Malignant - inflammatory/secretory',
    '7': 'Noise',
    '8': 'Epithelial - mucous/secretory',
    '9': 'Malignant - basal',
    '10': 'Malignant - EGFR+',
    '11': 'Lymphoid - B',
    '12': 'Stromal - Activated CAF/perivascular',
    '13': 'Malignant - stromal interface',
    '14': 'Malignant - stressed/inflammatory',
    '15': 'Malignant - EMT',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(subtype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype label: {unmapped}")
adata.obs['cellsubtypes'] = leiden_ids.map(subtype_by_cluster).astype('category')

In [None]:
# Leiden cluster id -> coarse compartment for this sample. These labels are
# what the infercnv step later selects its reference populations from.
celltype_by_cluster = {
    '0': 'Epithelial',
    '1': 'Malignant',
    '2': 'Stromal',
    '3': 'Lymphoid',
    '4': 'Malignant',
    '5': 'Epithelial',
    '6': 'Malignant',
    '7': 'Noise',
    '8': 'Epithelial',
    '9': 'Malignant',
    '10': 'Malignant',
    '11': 'Lymphoid',
    '12': 'Stromal',
    '13': 'Malignant',
    '14': 'Malignant',
    '15': 'Malignant',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(celltype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type label: {unmapped}")
adata.obs['celltypes'] = leiden_ids.map(celltype_by_cluster).astype('category')

In [None]:
# UMAP coloured by the coarse and the fine annotation, side by side.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Save both annotation columns as celltypes.csv inside the sample directory.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute (last) column of a GTF line into a dict.

    Parameters
    ----------
    attr_str : str
        Semicolon-separated ``key value`` pairs, e.g.
        ``gene_id "ENSG..."; gene_name "TP53";``.

    Returns
    -------
    dict
        Maps each key to its value with surrounding whitespace and double
        quotes removed. A key that appears without a value maps to ``""``;
        when a key is repeated, the last occurrence wins.
    """
    d = {}
    for field in attr_str.split(";"):
        # split(None, 1) tolerates any run of whitespace between key and
        # value and yields a single item for a value-less field, so neither
        # case raises or leaves a stray opening quote in the value.
        parts = field.split(None, 1)
        if not parts:
            continue  # empty segment, e.g. after the trailing ';'
        key = parts[0]
        val = parts[1] if len(parts) > 1 else ""
        d[key] = val.strip().strip('"')
    return d

# Collect one record per "gene" feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        (seqname, _source, feature_type, begin, stop,
         _score, strand_sign, _frame, attribute_text) = raw_line.strip().split("\t")
        if feature_type == "gene":  # other feature rows are not needed
            parsed = parse_gtf_attributes(attribute_text)
            records.append(dict(
                gene_id=parsed.get("gene_id"),
                gene_name=parsed.get("gene_name"),
                chromosome=seqname.replace("chr",""),  # drop the 'chr' text
                start=int(begin),
                end=int(stop),
                strand=strand_sign,
            ))

# Index by gene symbol, keep symbols present in adata, first row per symbol.
gene_table = pd.DataFrame(records).set_index('gene_name')
shared_symbols = gene_table.index.intersection(adata.var_names)
gene_table = gene_table.loc[shared_symbols]
gtf_df = gene_table.loc[~gene_table.index.duplicated()]

In [None]:
# Attach genomic coordinates to adata.var. A left join keeps exactly the rows
# and order of adata.var (pd.concat(axis=1) builds a union index instead).
# Columns left over from an earlier run are dropped first, so re-running the
# cell (or loading an already annotated h5ad) cannot create duplicate columns.
stale_cols = [col for col in gtf_df.columns if col in adata.var.columns]
adata.var = adata.var.drop(columns=stale_cols).join(gtf_df)

# Put the 'chr' prefix back that was removed while parsing the GTF; genes
# without a GTF match keep a missing chromosome.
adata.var['chromosome'] = 'chr'+adata.var['chromosome']

In [None]:
# Put the stored counts back into X and drop scanpy's log1p bookkeeping.
# pop() instead of del keeps the cell re-runnable (no KeyError once the entry
# is gone); the copy keeps X from aliasing the 'counts' layer.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts'].copy()

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Reference populations ----
# 'celltypes' labels treated as the normal reference for CNV inference.
reference_groups = ["Lymphoid", "Stromal"]

# ---- Input for infercnvpy ----
# infercnvpy subtracts the reference expression assuming log-space values,
# but X holds the raw counts at this point. Build a temporary log-normalized
# layer from the stored counts (same recipe as the clustering step) and point
# infercnv at it, leaving X untouched.
cnv_input_layer = "lognorm_for_cnv"
adata.layers[cnv_input_layer] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

# ---- Run infercnv ----
cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer=cnv_input_layer,
)

# The layer was only needed as infercnv input; drop it so it is not written
# out together with the object.
del adata.layers[cnv_input_layer]


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y),
# with cells grouped by the fine-grained 'cellsubtypes' annotation.
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): adata_path is the same file this sample was loaded from, so
# this overwrites the input with the annotated object — confirm intended.
adata.write_h5ad(adata_path)

# LIB-065294st1

In [None]:
# Paths for this sample: its directory and the AnnData file inside it.
sample_name = 'LIB-065294st1'
sample_path = base_dir / sample_name
adata_path = sample_path / "adata.h5ad"

In [None]:
# Load the sample's AnnData object.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of X as loaded, before the in-place normalization below.
# NOTE(review): presumably raw counts at this point — confirm.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale each observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding, used for the plots below.
sc.tl.umap(adata)

In [None]:
# Leiden clustering; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# UMAP coloured by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with the 'wilcoxon' method.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, de_table in de_genes.items():
    print(cluster)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the
    # same 1-D array, so the printed output is unchanged.
    print(de_table['names'].head(50).to_numpy())

| Cluster | Likely annotation                                                     | Quick rationale (marker highlights)                                                                                                      | Immune compartment                                      | Malignant likely? |
| ------- | --------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | ----------------- |
| **1**   | **Basal/suprabasal squamous tumor cells (ISG-high)**                  | KRT5/6A/16/19, TACSTD2 (TROP2), **S100A8/A9**, IFI6/IFI27/ISG15, LCN2, SLC2A1 → LUSC basal w/ inflammatory & interferon response         | —                                                       | **✅ Yes**         |
| **2**   | **Basal squamous tumor cells (mitochondrial/ISG program)**            | KRT5, ITGB4, COL7A1 (basement membrane), FXYD3; strong MT genes, IFI6/IFI27, TRIM29 → stressed basal LUSC                                | —                                                       | **✅ Yes**         |
| **5**   | **Mast cells (activated)**                                            | **MS4A2**, RASGRP4, ACKR1; NR4A3/RND1; mixed background Ig reads                                                                         | **Myeloid (mast)**                                      | **❌ No**          |
| **12**  | **Epithelial tumor (luminal/EMT-leaning)**                            | KRT8, TACSTD2, TM4SF1, FXYD5, ANXA2, CD44, LLGL2; GABRP; EMT/trafficking (CD44, DVL3)                                                    | —                                                       | **✅ Yes**         |
| **7**   | **Alveolar type II cells (AT2)**                                      | **SFTPC, SFTPA1, SFTPB**, SLC34A2, CLDN18 → normal alveolar epithelium                                                                   | —                                                       | **❌ No**          |
| **0**   | **Basal/suprabasal squamous tumor cells (inflammatory)**              | Mirrors clust. 1: KRT5/6A/16, TACSTD2, **S100A8/A9**, IFI6/27, LCN2; glycolysis/stress                                                   | —                                                       | **✅ Yes**         |
| **4**   | **CAF / myofibroblasts, with plasma-cell admixture**                  | Collagens (COL1A1/1A2/3A1/6A3), SPARC, POSTN, VCAN, TAGLN + Ig genes (IGHG1/IGHA1/JCHAIN) & LYZ                                          | **Plasma present (adjacent)**; stroma is **non-immune** | **❌ No** (stroma) |
| **3**   | **Plasma cells**                                                      | **MZB1, JCHAIN, XBP1**, IGHG/IGHA/IGKC; PC differentiation genes (ELL2)                                                                  | **Plasma (lymphoid lineage)**                           | **❌ No**          |
| **8**   | **AT2 / reactive alveolar epithelium**                                | **SFTPB/C/A1**, SLC34A2, LPCAT1; immediate-early genes (FOS/JUNB/EGR1), DUSP1 → injury/IFN-responsive AT2                                | —                                                       | **❌ No**          |
| **10**  | **Low-quality/multiplet (AT2-contaminated) — unclear**                | Sparse cohesive biology; **SFTPC** plus many ectopic/testis/neuronal genes (ABCB5, PIWIL3, NEFH, ORs) → likely multiplets/ambient RNA    | **Unclear**                                             | **⚠️ Unclear**    |
| **6**   | **Myofibroblastic CAFs (activated)**                                  | COLs/SPARC/FN1/VCAN/POSTN/SFRP4, **TAGLN, ACTA2, CTGF**, NR4A1/3, THBS1 → activated CAF/invasive edge                                    | —                                                       | **❌ No**          |
| **9**   | **Tumor-associated macrophages (TAMs), endothelial/B-cell admixture** | **MARCO, TREM2, OLR1, MRC1, VSIG4**, APOE/OSM; some endothelial (ACKR1/THBD, AQP1/KLF2) & MS4A1 (B) bleed-through; some surfactant reads | **Myeloid (TAMs)**                                      | **❌ No**          |
| **11**  | **Tumor-associated macrophages / mixed myeloid**                      | **MARCO, SLC11A1, MRC1, OLR1**, CSF3; FABP4 (alveolar macrophages), OSM; minor fibro/AT2 bleed-through                                   | **Myeloid (TAMs/alveolar-like)**                        | **❌ No**          |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
# NOTE(review): 'Lymphoid - plasma' is lower-case here but spelled
# 'Lymphoid - Plasma' for other samples in this notebook — confirm which
# spelling downstream code expects before pooling samples.
subtype_by_cluster = {
    '0': 'Malignant - basal/suprabasal',
    '1': 'Malignant - basal/suprabasal',
    '2': 'Malignant - basal/squamous',
    '3': 'Lymphoid - plasma',
    '4': 'Stromal - CAF myofibroblastic',
    '5': 'Myeloid - mast',
    '6': 'Stromal - CAF myofibroblastic',
    '7': 'Epithelial - AT2',
    '8': 'Epithelial - AT2',
    '9': 'Myeloid - TAM',
    '10': 'Noise',
    '11': 'Myeloid - TAM',
    '12': 'Malignant - EMT',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(subtype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype label: {unmapped}")
adata.obs['cellsubtypes'] = leiden_ids.map(subtype_by_cluster).astype('category')

In [None]:
# Leiden cluster id -> coarse compartment for this sample. These labels are
# what the infercnv step later selects its reference populations from.
celltype_by_cluster = {
    '0': 'Malignant',
    '1': 'Malignant',
    '2': 'Malignant',
    '3': 'Lymphoid',
    '4': 'Stromal',
    '5': 'Myeloid',
    '6': 'Stromal',
    '7': 'Epithelial',
    '8': 'Epithelial',
    '9': 'Myeloid',
    '10': 'Noise',
    '11': 'Myeloid',
    '12': 'Malignant',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(celltype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type label: {unmapped}")
adata.obs['celltypes'] = leiden_ids.map(celltype_by_cluster).astype('category')

In [None]:
# UMAP coloured by the coarse and the fine annotation, side by side.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Save both annotation columns as celltypes.csv inside the sample directory.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute (last) column of a GTF line into a dict.

    Parameters
    ----------
    attr_str : str
        Semicolon-separated ``key value`` pairs, e.g.
        ``gene_id "ENSG..."; gene_name "TP53";``.

    Returns
    -------
    dict
        Maps each key to its value with surrounding whitespace and double
        quotes removed. A key that appears without a value maps to ``""``;
        when a key is repeated, the last occurrence wins.
    """
    d = {}
    for field in attr_str.split(";"):
        # split(None, 1) tolerates any run of whitespace between key and
        # value and yields a single item for a value-less field, so neither
        # case raises or leaves a stray opening quote in the value.
        parts = field.split(None, 1)
        if not parts:
            continue  # empty segment, e.g. after the trailing ';'
        key = parts[0]
        val = parts[1] if len(parts) > 1 else ""
        d[key] = val.strip().strip('"')
    return d

# Collect one record per "gene" feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        (seqname, _source, feature_type, begin, stop,
         _score, strand_sign, _frame, attribute_text) = raw_line.strip().split("\t")
        if feature_type == "gene":  # other feature rows are not needed
            parsed = parse_gtf_attributes(attribute_text)
            records.append(dict(
                gene_id=parsed.get("gene_id"),
                gene_name=parsed.get("gene_name"),
                chromosome=seqname.replace("chr",""),  # drop the 'chr' text
                start=int(begin),
                end=int(stop),
                strand=strand_sign,
            ))

# Index by gene symbol, keep symbols present in adata, first row per symbol.
gene_table = pd.DataFrame(records).set_index('gene_name')
shared_symbols = gene_table.index.intersection(adata.var_names)
gene_table = gene_table.loc[shared_symbols]
gtf_df = gene_table.loc[~gene_table.index.duplicated()]

In [None]:
# Attach genomic coordinates to adata.var. A left join keeps exactly the rows
# and order of adata.var (pd.concat(axis=1) builds a union index instead).
# Columns left over from an earlier run are dropped first, so re-running the
# cell (or loading an already annotated h5ad) cannot create duplicate columns.
stale_cols = [col for col in gtf_df.columns if col in adata.var.columns]
adata.var = adata.var.drop(columns=stale_cols).join(gtf_df)

# Put the 'chr' prefix back that was removed while parsing the GTF; genes
# without a GTF match keep a missing chromosome.
adata.var['chromosome'] = 'chr'+adata.var['chromosome']

In [None]:
# Put the stored counts back into X and drop scanpy's log1p bookkeeping.
# pop() instead of del keeps the cell re-runnable (no KeyError once the entry
# is gone); the copy keeps X from aliasing the 'counts' layer.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts'].copy()

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Reference populations ----
# 'celltypes' labels treated as the normal reference for CNV inference.
reference_groups = ["Lymphoid", "Stromal", "Myeloid"]

# ---- Input for infercnvpy ----
# infercnvpy subtracts the reference expression assuming log-space values,
# but X holds the raw counts at this point. Build a temporary log-normalized
# layer from the stored counts (same recipe as the clustering step) and point
# infercnv at it, leaving X untouched.
cnv_input_layer = "lognorm_for_cnv"
adata.layers[cnv_input_layer] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

# ---- Run infercnv ----
cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer=cnv_input_layer,
)

# The layer was only needed as infercnv input; drop it so it is not written
# out together with the object.
del adata.layers[cnv_input_layer]


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y),
# with cells grouped by the fine-grained 'cellsubtypes' annotation.
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): adata_path is the same file this sample was loaded from, so
# this overwrites the input with the annotated object — confirm intended.
adata.write_h5ad(adata_path)

# LIB-065295st1

In [None]:
# Paths for this sample: its directory and the AnnData file inside it.
sample_name = 'LIB-065295st1'
sample_path = base_dir / sample_name
adata_path = sample_path / "adata.h5ad"

In [None]:
# Load the sample's AnnData object.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of X as loaded, before the in-place normalization below.
# NOTE(review): presumably raw counts at this point — confirm.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale each observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding, used for the plots below.
sc.tl.umap(adata)

In [None]:
# Leiden clustering; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# UMAP coloured by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with the 'wilcoxon' method.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, de_table in de_genes.items():
    print(cluster)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the
    # same 1-D array, so the printed output is unchanged.
    print(de_table['names'].head(50).to_numpy())

| Cluster | Likely annotation                                            | Quick rationale (marker highlights)                                                                                  | Immune compartment                           | Malignant likely?                 |
| ------- | ------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------- | -------------------------------------------- | --------------------------------- |
| **2**   | Basal/suprabasal squamous tumor cells (proliferative/stress) | KRT5/14/15/17, KRT6A/19, **CLDN1**, EPCAM, **CEACAM5/6**, ITGB4, PERP, SLC2A1; high MT reads, SGK1/EIF4EBP1 (stress) | —                                            | ✅ Yes                             |
| **7**   | Basal squamous tumor cells (ISG/EGFR-adjacent program)       | KRT5/15/6A/17, CLDN1, EPCAM, CEACAM6, GSTP1, PTHLH, KRAS/KLF5; stress (HSPB1, EIF4EBP1)                              | —                                            | ✅ Yes                             |
| **9**   | Lymphoid (T/B zone–like), **low confidence**                 | CCL19, C7, LTA, NTRK3; but many atypical/olfactory/testis genes → likely low-quality/ambient mix                     | Lymphoid (unclear subtype)                   | ⚠️ Unclear (non-malignant likely) |
| **8**   | Tumor–stroma hybrid at invasive edge                         | Basal epi (KRT5/15/17) + strong **CAF/ECM** (COL1/3/6, SPARC, FN1) and **MMP1/2/11/12**, SPP1, CXCL8                 | —                                            | ⚠️ Mixed (malignant admixture)    |
| **1**   | Myofibroblastic CAFs (activated)                             | COL1A1/1A2/3A1, COL11A1/12A1, **TAGLN, ACTA2**, POSTN, THBS1/2, CTGF, VCAN, SULF1                                    | —                                            | ❌ No (stroma)                     |
| **3**   | Tumor-associated macrophages (TAMs) ± plasma admixture       | **CD68, MS4A7, CTSS/CTSB, C1QA/B/C, APOE/APOC1, GPNMB**, LGMN; Ig transcripts (JCHAIN/MZB1) likely bystander         | **Myeloid (macrophage/TAM)**                 | ❌ No                              |
| **5**   | Basal/suprabasal squamous tumor cells (inflammatory)         | Mirrors 2/7: KRT5/14/15/17, CLDN1, EPCAM, CEACAM6, SLC2A1, HSPB1; CXCL8; stress/MT genes                             | —                                            | ✅ Yes                             |
| **0**   | Multiplet/ambiguous (plasma/lymphoid + perivascular SMC)     | Ig genes (IGKC/IGHG/IGHA) + **MYH11, DES, COL14A1**, MGP; CCL19/C7 present → mixed cell capture                      | Lymphoid/plasma **and** perivascular (mixed) | ⚠️ Unclear (non-malignant)        |
| **4**   | TAMs with stromal/ECM program (myeloid–stroma mix)           | **CD68, C1QA/B/C, CTSS/CTSB, APOE, GPNMB**, plus **COL1/3/6, SPARC, FN1, MMP2**                                      | **Myeloid (macrophage/TAM)**                 | ❌ No                              |
| **6**   | Plasma cells in stromal niche                                | **XBP1, JCHAIN, MZB1**, IGKC/IGHG/IGHA; embedded with CAF ECM (COL1/3/6, SPARC)                                      | **Plasma (lymphoid lineage)**                | ❌ No                              |
| **10**  | Endothelial/perivascular (capillary–venule, with pericytes)  | **PECAM1, VWF, PLVAP, ENG, EGFL7**, MCAM; basement membrane **COL4A1/4A2**; PDGFRB/NOTCH3/pericyte features          | —                                            | ❌ No (stroma/vascular)            |
| **11**  | TAMs / inflammatory myeloid (matrix-interacting)             | **CD68, ITGAX, TYROBP, C1QA/B**, APOE, FCGR2A; ECM/collagens, SPP1, MMP12; macrophage activation                     | **Myeloid (macrophage/TAM)**                 | ❌ No                              |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
subtype_by_cluster = {
    '0': 'Noise',
    '1': 'Stromal - CAF myofibroblastic',
    '2': 'Malignant - basal/suprabasal',
    '3': 'Myeloid - TAM',
    '4': 'Myeloid - TAM',
    '5': 'Malignant - basal/suprabasal',
    '6': 'Lymphoid - Plasma',
    '7': 'Malignant - basal/squamous',
    '8': 'Malignant - stromal interface',
    '9': 'Noise',
    '10': 'Perivascular - Endothelial',
    '11': 'Myeloid - TAM',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(subtype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype label: {unmapped}")
adata.obs['cellsubtypes'] = leiden_ids.map(subtype_by_cluster).astype('category')

In [None]:
# Leiden cluster id -> coarse compartment for this sample. These labels are
# what the infercnv step later selects its reference populations from.
celltype_by_cluster = {
    '0': 'Noise',
    '1': 'Stromal',
    '2': 'Malignant',
    '3': 'Myeloid',
    '4': 'Myeloid',
    '5': 'Malignant',
    '6': 'Lymphoid',
    '7': 'Malignant',
    '8': 'Malignant',
    '9': 'Noise',
    '10': 'Perivascular',
    '11': 'Myeloid',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(celltype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type label: {unmapped}")
adata.obs['celltypes'] = leiden_ids.map(celltype_by_cluster).astype('category')

In [None]:
# UMAP coloured by the coarse and the fine annotation, side by side.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Save both annotation columns as celltypes.csv inside the sample directory.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute (last) column of a GTF line into a dict.

    Parameters
    ----------
    attr_str : str
        Semicolon-separated ``key value`` pairs, e.g.
        ``gene_id "ENSG..."; gene_name "TP53";``.

    Returns
    -------
    dict
        Maps each key to its value with surrounding whitespace and double
        quotes removed. A key that appears without a value maps to ``""``;
        when a key is repeated, the last occurrence wins.
    """
    d = {}
    for field in attr_str.split(";"):
        # split(None, 1) tolerates any run of whitespace between key and
        # value and yields a single item for a value-less field, so neither
        # case raises or leaves a stray opening quote in the value.
        parts = field.split(None, 1)
        if not parts:
            continue  # empty segment, e.g. after the trailing ';'
        key = parts[0]
        val = parts[1] if len(parts) > 1 else ""
        d[key] = val.strip().strip('"')
    return d

# Collect one record per "gene" feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        (seqname, _source, feature_type, begin, stop,
         _score, strand_sign, _frame, attribute_text) = raw_line.strip().split("\t")
        if feature_type == "gene":  # other feature rows are not needed
            parsed = parse_gtf_attributes(attribute_text)
            records.append(dict(
                gene_id=parsed.get("gene_id"),
                gene_name=parsed.get("gene_name"),
                chromosome=seqname.replace("chr",""),  # drop the 'chr' text
                start=int(begin),
                end=int(stop),
                strand=strand_sign,
            ))

# Index by gene symbol, keep symbols present in adata, first row per symbol.
gene_table = pd.DataFrame(records).set_index('gene_name')
shared_symbols = gene_table.index.intersection(adata.var_names)
gene_table = gene_table.loc[shared_symbols]
gtf_df = gene_table.loc[~gene_table.index.duplicated()]

In [None]:
# Attach genomic coordinates to adata.var. A left join keeps exactly the rows
# and order of adata.var (pd.concat(axis=1) builds a union index instead).
# Columns left over from an earlier run are dropped first, so re-running the
# cell (or loading an already annotated h5ad) cannot create duplicate columns.
stale_cols = [col for col in gtf_df.columns if col in adata.var.columns]
adata.var = adata.var.drop(columns=stale_cols).join(gtf_df)

# Put the 'chr' prefix back that was removed while parsing the GTF; genes
# without a GTF match keep a missing chromosome.
adata.var['chromosome'] = 'chr'+adata.var['chromosome']

In [None]:
# Put the stored counts back into X and drop scanpy's log1p bookkeeping.
# pop() instead of del keeps the cell re-runnable (no KeyError once the entry
# is gone); the copy keeps X from aliasing the 'counts' layer.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts'].copy()

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Reference populations ----
# 'celltypes' labels treated as the normal reference for CNV inference.
reference_groups = ["Lymphoid", "Stromal", "Myeloid"]

# ---- Input for infercnvpy ----
# infercnvpy subtracts the reference expression assuming log-space values,
# but X holds the raw counts at this point. Build a temporary log-normalized
# layer from the stored counts (same recipe as the clustering step) and point
# infercnv at it, leaving X untouched.
cnv_input_layer = "lognorm_for_cnv"
adata.layers[cnv_input_layer] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

# ---- Run infercnv ----
cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer=cnv_input_layer,
)

# The layer was only needed as infercnv input; drop it so it is not written
# out together with the object.
del adata.layers[cnv_input_layer]


In [None]:
# ---- Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y),
# with cells grouped by the fine-grained 'cellsubtypes' annotation.
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): adata_path is the same file this sample was loaded from, so
# this overwrites the input with the annotated object — confirm intended.
adata.write_h5ad(adata_path)

# LIB-064887st1

In [None]:
# Paths for this sample: its directory and the AnnData file inside it.
sample_name = 'LIB-064887st1'
sample_path = base_dir / sample_name
adata_path = sample_path / "adata.h5ad"

In [None]:
# Load the sample's AnnData object.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of X as loaded, before the in-place normalization below.
# NOTE(review): presumably raw counts at this point — confirm.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale each observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding, used for the plots below.
sc.tl.umap(adata)

In [None]:
# Leiden clustering; the labels are read back from adata.obs['leiden'] below.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# UMAP coloured by Leiden cluster.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank genes per Leiden cluster with the 'wilcoxon' method.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, de_table in de_genes.items():
    print(cluster)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() yields the
    # same 1-D array, so the printed output is unchanged.
    print(de_table['names'].head(50).to_numpy())

| Cluster | Likely annotation (LUAD)                                             | Quick rationale (key markers)                                                                                     | Immune compartment             | Malignant likely?                                                 |
| ------- | -------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------- | ------------------------------ | ----------------------------------------------------------------- |
| **2**   | **Reactive AT2/AT1 alveolar epithelium**                             | AT2/AT1 program: **SFTPC/B/A1, SFTPD, SLC34A2, NAPSA, LPCAT1, AGER**, PIGR/MUC1/SLPI; stress/injury (DUSP1, CTGF) | —                              | ⚠️ Possible (context-dependent)                                   |
| **7**   | **Tumor-associated macrophages (TAMs)**                              | **C1QA/B/C, CD68, MS4A7, ITGAX, MARCO, MRC1**, SLC11A1, LYZ; antigen processing (CTSB/CTSD/IFI30)                 | **Myeloid (macrophage)**       | ❌ No                                                              |
| **10**  | **Perivascular smooth muscle / pericytes (± endothelial admixture)** | **ACTG2, CNN1, MYH11, DES**, EMCN/SCUBE1; vessel-associated                                                       | —                              | ❌ No                                                              |
| **0**   | **LUAD tumor epithelium (AT2/club, non-mucinous)**                   | **SFTPC/B/A1, SLC34A2, ABCA3, NKX2-1**, CLDN18, **CEACAM6**, ITGB6, MUC1, PIGR, ROS1                              | —                              | ✅ Yes                                                             |
| **3**   | **Endothelial / perivascular niche**                                 | **VWF, PECAM1, CLDN5, EGFL7**, ENG; pericyte/smooth muscle (**PDGFRB, ACTA2, MYL9, NOTCH3**); ECM (COL4A1/2)      | —                              | ❌ No                                                              |
| **4**   | **Plasma cells**                                                     | **JCHAIN, XBP1, PRDM1**, IG heavy/light chains; CXCR4, TENT5C                                                     | **Plasma (lymphoid)**          | ❌ No                                                              |
| **12**  | **LUAD tumor epithelium (secretory/AT2-like; stress)**               | **SFTPC/B/A1, SFTPD, SLC34A2, NAPSA, ABCA3**, MUC1, **LAMP3**, SCGB3A1/WFDC2; **NKX2-1**, ROS1                    | —                              | ✅ Yes                                                             |
| **9**   | **Ambiguous multiplet (lymphoid + stromal mix)**                     | IGKC with **CTLA4** and smooth muscle (**DES, MYH11**), scattered neuronal/testis genes → mixed/low-quality       | **Lymphoid (unclear subtype)** | ⚠️ Unclear (likely non-malignant)                                 |
| **6**   | **Vascular endothelium (capillary/venule) ± pericytes**              | **VWF, ACKR1, CLDN5, PLVAP, EGFL7, ADGRL4, CALCRL/RAMP2/3, APLNR, EPHB4**; some MYH11                             | —                              | ❌ No                                                              |
| **8**   | **Secretory/AT2-leaning epithelium (mucinous-prone)**                | **SFTPB/A1/SFTPD**, **WFDC2, SERPINA1**, SCGB3A1, PIGR, NAPSA, ABCA3; **IFITM3/IFI6** stress                      | —                              | ⚠️ Possible (mucinous LUAD component if spatially tumor-adjacent) |
| **13**  | **Fibroblasts / CAFs (reactive stroma)**                             | **DCN, LUM, COL6A2/6A3, FN1, LTBP2/TNXB, CTGF**, GPX3; mild SFTPC/B contamination                                 | —                              | ❌ No                                                              |
| **5**   | **Plasma cells (GC/PC continuum)**                                   | **XBP1, PRDM1, JCHAIN, MZB1**, IGHG/IGKC; POU2AF1/CD79A traces                                                    | **Plasma (lymphoid)**          | ❌ No                                                              |
| **1**   | **CAFs (matrix-remodeling / inflammatory)**                          | **COL1A1/1A2/3A1, DCN, VCAN, CCDC80, SPARC**, IGF1/BPs, THBS2; cytokines (IL6), NNMT                              | —                              | ❌ No                                                              |
| **14**  | **B cells (germinal-center/activated)**                              | **MS4A1 (CD20), CD79A**, **CXCR4/CXCL13**, CIITA; co-stimulation/activation genes                                 | **Lymphoid (B-cells)**         | ❌ No                                                              |
| **11**  | **Inflammatory CAFs / stromal activation**                           | Collagens + **THBS1/2, SERPINE1, VEGFA, IL6**, NR4A1/2/3, DUSP1; wound-healing/EMT-adjacent stroma                | —                              | ❌ No                                                              |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
# NOTE(review): the marker table above rates clusters 2 and 8 only as
# "possible" malignant, while both are labelled 'Malignant' here — confirm.
subtype_by_cluster = {
    '0': 'Malignant - AT2/club',
    '1': 'Stromal - CAF inflammatory',
    '2': 'Malignant - AT1/AT2',
    '3': 'Perivascular - Endothelial',
    '4': 'Lymphoid - Plasma',
    '5': 'Lymphoid - Plasma',
    '6': 'Perivascular - Endothelial',
    '7': 'Myeloid - TAM',
    '8': 'Malignant - AT2/secretory',
    '9': 'Noise',
    '10': 'Perivascular - smooth muscle/pericytes',
    '11': 'Stromal - CAF inflammatory',
    '12': 'Malignant - AT2/secretory',
    '13': 'Stromal - CAF inflammatory',
    '14': 'Lymphoid - GC B',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(subtype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype label: {unmapped}")
adata.obs['cellsubtypes'] = leiden_ids.map(subtype_by_cluster).astype('category')

In [None]:
# Leiden cluster id -> coarse compartment for this sample. These labels are
# what the infercnv step later selects its reference populations from.
celltype_by_cluster = {
    '0': 'Malignant',
    '1': 'Stromal',
    '2': 'Malignant',
    '3': 'Perivascular',
    '4': 'Lymphoid',
    '5': 'Lymphoid',
    '6': 'Perivascular',
    '7': 'Myeloid',
    '8': 'Malignant',
    '9': 'Noise',
    '10': 'Perivascular',
    '11': 'Stromal',
    '12': 'Malignant',
    '13': 'Stromal',
    '14': 'Lymphoid',
}

# Series.replace on a categorical column is deprecated (pandas >= 2.2) and
# silently leaves unmapped cluster ids in place as "labels". Map explicitly
# and fail loudly if the clustering produced an id with no annotation.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(celltype_by_cluster))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type label: {unmapped}")
adata.obs['celltypes'] = leiden_ids.map(celltype_by_cluster).astype('category')

In [None]:
# UMAP coloured by the coarse and the fine annotation, side by side.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Save both annotation columns as celltypes.csv inside the sample directory.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute column (last column) of a GTF line into a dict.

    Attributes are ``;``-separated ``key value`` pairs; surrounding double
    quotes are removed from the value. A key that appears several times keeps
    its last value, and a key without a value maps to an empty string instead
    of raising.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # partition() tolerates a missing value, unlike split(" ", 1) + unpacking.
        key, _, val = field.partition(" ")
        # Strip whitespace first so repeated spaces do not shield the quotes.
        attributes[key] = val.strip().strip('"')
    return attributes

# Collect one record per `gene` feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        seqname, _source, feature_type, lo, hi, _score, strand_sign, _phase, attr_text = raw_line.strip().split("\t")
        if feature_type == "gene":  # transcript/exon rows are not needed
            gene_attrs = parse_gtf_attributes(attr_text)
            records.append(dict(
                gene_id=gene_attrs.get("gene_id"),
                gene_name=gene_attrs.get("gene_name"),
                chromosome=seqname.replace("chr", ""),  # stored without the 'chr' prefix
                start=int(lo),
                end=int(hi),
                strand=strand_sign,
            ))

# Index by gene symbol, keep only genes present in adata, and keep the first
# row when a symbol occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
shared_genes = gtf_df.index.intersection(adata.var_names)
gtf_df = gtf_df.loc[shared_genes]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the gene coordinates to adata.var. Reindexing on var_names keeps the rows in
# AnnData's own gene order (genes absent from the GTF get NaN), and assigning column by
# column overwrites coordinates from an earlier run instead of creating duplicate
# columns, which is what pd.concat(axis=1) did when this cell was executed twice or
# when a previously annotated file was reloaded.
gene_coords = gtf_df.reindex(adata.var_names)
for coord_col in gene_coords.columns:
    adata.var[coord_col] = gene_coords[coord_col].to_numpy()

# Restore the 'chr' prefix that was stripped while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Restore the raw counts as X and drop the `log1p` entry from uns, since X is no
# longer log-transformed. pop() instead of del so re-running the cell does not raise.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Run infercnvpy ----
# Cell types used as the reference population for the CNV estimate.
reference_groups = ["Lymphoid", "Stromal","Myeloid"]

# infercnvpy documents that its input must be log-transformed, but X was reset to raw
# counts in the previous cell. Build a temporary log-normalised layer from the stored
# counts (same target_sum as the normalisation used for clustering) and use it.
adata.layers["log1p_norm"] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer="log1p_norm",
)

# Drop the temporary layer so the AnnData keeps the same layers as before this cell.
del adata.layers["log1p_norm"]


In [None]:
# ---- Visualization ----
# Heatmap of the CNV profiles, cells grouped by fine-grained annotation
# (chromosomes along x, cells along y).
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): this overwrites the same adata.h5ad that was loaded at the start of this sample's section.
adata.write_h5ad(adata_path)

# LIB-065290st1

In [None]:
# Locations for this sample: its folder under base_dir and the AnnData file inside it.
sample_name = 'LIB-065290st1'
sample_path = base_dir.joinpath(sample_name)
adata_path = sample_path.joinpath("adata.h5ad")

In [None]:
# Load this sample's AnnData from disk.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of the matrix as loaded in a layer before X is normalised.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale every observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 1.5; the hand-written cluster -> label mappings below refer to these cluster ids.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# Show the Leiden clusters on the UMAP.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes for each Leiden cluster with the Wilcoxon test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Marker table of every Leiden cluster, keyed by cluster id.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, markers in de_genes.items():
    print(cluster)
    # to_numpy() replaces Series.ravel(), which pandas has deprecated; the printed array is the same.
    print(markers.head(50)['names'].to_numpy())

| Cluster | Likely annotation (LUAD)                                     | Quick rationale (key markers)                                                                                           | Immune compartment            | Malignant likely? |
| ------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------- | ----------------------------- | ----------------- |
| **3**   | **LUAD tumor epithelium (AT2/secretory, non-mucinous)**      | SLC34A2, **CEACAM6**, KRT7/8, **LPCAT1**, TACSTD2, **WFDC2**, MUC1, TCIM → AT2/secretory tumor program                  | —                             | **✅ Yes**         |
| **5**   | **CAF / desmoplastic stroma**                                | Collagens (COL1A1/1A2/3A1/6A3/5A1), **FN1, SPARC, VCAN**, **MMP1/2/11**, TAGLN/CTHRC1 → ECM remodeling                  | —                             | **❌ No**          |
| **2**   | **LUAD tumor epithelium (AT2-like)**                         | **SFTPA1/SFTPB**, SLC34A2, **LPCAT1**, CEACAM6, MUC1, **NAPSA**, ITGB6, CLDN4, WFDC2 → AT2 tumor                        | —                             | **✅ Yes**         |
| **0**   | **CAF stroma with plasma/B admixture**                       | Collagens/ECM (COL1/3/5/6, SPARC, VCAN, TAGLN) + **Ig genes (IGHG/IGHA/IGKC, JCHAIN, XBP1)** → stroma with plasma cells | **Plasma present (adjacent)** | **❌ No**          |
| **6**   | **B cells (GC/activated)**                                   | **MS4A1 (CD20), CD79A**, **POU2AF1**, CCR7/SELL, LTB; some plasma signals (XBP1/MZB1/TNFRSF17) → B-cell zone            | **Lymphoid (B cells)**        | **❌ No**          |
| **1**   | **Plasma cells**                                             | **JCHAIN, XBP1, MZB1, PRDM1**, IGHG/IGHA/IGHM/IGKC; ER/protein folding (DERL3/SEC11C)                                   | **Plasma (lymphoid)**         | **❌ No**          |
| **4**   | **CAF / matrix-remodeling fibroblasts**                      | **COL1A1/1A2/3A1/11A1/12A1**, SPARC, **VCAN**, THBS1/2, **POSTN**, INHBA, ITGBL1 → activated CAFs                       | —                             | **❌ No**          |
| **7**   | **Tumor-associated macrophages (TAMs)**                      | **CD68, TYROBP, ITGAX, C1QA/B/C, FCER1G**, SPP1, MARCO, MSR1, IFI30/CTSB/CTSZ                                           | **Myeloid (macrophage)**      | **❌ No**          |
| **9**   | **Airway secretory/club–ciliated compartment (benign)**      | **SCGB3A1/SCGB1A1/SCGB3A2**, BPIFB1/BPIFA1, ciliogenesis genes (DNAH9/12, CFAPs), LTF; some smooth muscle bleed-through | —                             | **❌ No**          |
| **8**   | **Endothelial/perivascular niche (with lymphoid admixture)** | **VWF, ACKR1, CALCRL/RAMP3, APLNR, ADGRL4, ECSCR**, SELP; some lymphoid (CCL19, GZMK) and pericyte (MYH11)              | —                             | **❌ No**          |
| **10**  | **Myeloid (mono/TAM) with matrix/angiogenic signals**        | LYZ, **TYROBP, MRC1, SLC11A1**, C1QA; **MMP1**, COL1/3, **KDR**/angiogenic context → macrophage/mono at invasive edge   | **Myeloid (mono/TAM)**        | **❌ No**          |
| **11**  | **Myeloid (inflammatory macrophages/monocytes)**             | **LYZ, C1QA, MS4A6A, IFI30**, TYROBP; mixed Ig traces; stress/IEGs (TWIST1, ZNFs) likely bystander                      | **Myeloid (macrophage/mono)** | **❌ No**          |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
cluster_to_subtype = {
    '0': 'Stromal - CAF inflammatory', '1': 'Lymphoid - Plasma',
    '2': 'Malignant - AT2', '3': 'Malignant - AT2/secretory',
    '4': 'Stromal - CAF matrix-remodeling', '5': 'Stromal - CAF matrix-remodeling',
    '6': 'Lymphoid - GC B', '7': 'Myeloid - TAM',
    '8': 'Perivascular - Endothelial', '9': 'Epithelial - secretory/club',
    '10': 'Myeloid - TAM/monocytes', '11': 'Myeloid - TAM/monocytes',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_subtype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['cellsubtypes'] = (
    leiden_ids.map(cluster_to_subtype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_subtype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Leiden cluster id -> coarse compartment for this sample.
cluster_to_celltype = {
    '0': 'Stromal', '1': 'Lymphoid',
    '2': 'Malignant', '3': 'Malignant',
    '4': 'Stromal', '5': 'Stromal',
    '6': 'Lymphoid', '7': 'Myeloid',
    '8': 'Perivascular', '9': 'Epithelial',
    '10': 'Myeloid', '11': 'Myeloid',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_celltype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['celltypes'] = (
    leiden_ids.map(cluster_to_celltype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_celltype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Show both annotation levels side by side on the UMAP.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation columns (with the obs index) into the sample's folder.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute column (last column) of a GTF line into a dict.

    Attributes are ``;``-separated ``key value`` pairs; surrounding double
    quotes are removed from the value. A key that appears several times keeps
    its last value, and a key without a value maps to an empty string instead
    of raising.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # partition() tolerates a missing value, unlike split(" ", 1) + unpacking.
        key, _, val = field.partition(" ")
        # Strip whitespace first so repeated spaces do not shield the quotes.
        attributes[key] = val.strip().strip('"')
    return attributes

# Collect one record per `gene` feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        seqname, _source, feature_type, lo, hi, _score, strand_sign, _phase, attr_text = raw_line.strip().split("\t")
        if feature_type == "gene":  # transcript/exon rows are not needed
            gene_attrs = parse_gtf_attributes(attr_text)
            records.append(dict(
                gene_id=gene_attrs.get("gene_id"),
                gene_name=gene_attrs.get("gene_name"),
                chromosome=seqname.replace("chr", ""),  # stored without the 'chr' prefix
                start=int(lo),
                end=int(hi),
                strand=strand_sign,
            ))

# Index by gene symbol, keep only genes present in adata, and keep the first
# row when a symbol occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
shared_genes = gtf_df.index.intersection(adata.var_names)
gtf_df = gtf_df.loc[shared_genes]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the gene coordinates to adata.var. Reindexing on var_names keeps the rows in
# AnnData's own gene order (genes absent from the GTF get NaN), and assigning column by
# column overwrites coordinates from an earlier run instead of creating duplicate
# columns, which is what pd.concat(axis=1) did when this cell was executed twice or
# when a previously annotated file was reloaded.
gene_coords = gtf_df.reindex(adata.var_names)
for coord_col in gene_coords.columns:
    adata.var[coord_col] = gene_coords[coord_col].to_numpy()

# Restore the 'chr' prefix that was stripped while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Restore the raw counts as X and drop the `log1p` entry from uns, since X is no
# longer log-transformed. pop() instead of del so re-running the cell does not raise.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Run infercnvpy ----
# Cell types used as the reference population for the CNV estimate.
reference_groups = ["Lymphoid", "Stromal","Myeloid"]

# infercnvpy documents that its input must be log-transformed, but X was reset to raw
# counts in the previous cell. Build a temporary log-normalised layer from the stored
# counts (same target_sum as the normalisation used for clustering) and use it.
adata.layers["log1p_norm"] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer="log1p_norm",
)

# Drop the temporary layer so the AnnData keeps the same layers as before this cell.
del adata.layers["log1p_norm"]


In [None]:
# ---- Visualization ----
# Heatmap of the CNV profiles, cells grouped by fine-grained annotation
# (chromosomes along x, cells along y).
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): this overwrites the same adata.h5ad that was loaded at the start of this sample's section.
adata.write_h5ad(adata_path)

# LIB-065292st1

In [None]:
# Locations for this sample: its folder under base_dir and the AnnData file inside it.
sample_name = 'LIB-065292st1'
sample_path = base_dir.joinpath(sample_name)
adata_path = sample_path.joinpath("adata.h5ad")

In [None]:
# Load this sample's AnnData from disk.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of the matrix as loaded in a layer before X is normalised.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale every observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 1.5; the hand-written cluster -> label mappings below refer to these cluster ids.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# Show the Leiden clusters on the UMAP.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes for each Leiden cluster with the Wilcoxon test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Marker table of every Leiden cluster, keyed by cluster id.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, markers in de_genes.items():
    print(cluster)
    # to_numpy() replaces Series.ravel(), which pandas has deprecated; the printed array is the same.
    print(markers.head(50)['names'].to_numpy())

| Cluster | Likely annotation                                           | Quick rationale (marker highlights)                                                                                                 | Immune compartment      | Malignant likely? |
| ------- | ----------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------- | ----------------- |
| **5**   | Suprabasal/keratinizing squamous tumor (inflammatory)       | **SPRR1B/2A/2D/3**, **KRT6A/16/17/80**, PERP, TACSTD2, KLF4; some ECM (COL1/3/6, SPARC) and Ig bleed                                | —                       | ✅ Yes             |
| **1**   | Basal squamous tumor (TP63/CLDN1+)                          | **KRT5/15/16/17**, **TP63**, **CLDN1**, PERP, TRIM29, **NFE2L2**, ALDH3A1; stress (EIF4EBP1, HSPB1)                                 | —                       | ✅ Yes             |
| **3**   | Basal/proliferative squamous tumor (CCND1/oxidative-stress) | **KRT5**, TP63, **EIF4EBP1**, CCND1-adjacent signature, GSTP1, TRIM29, CLDN1; mitochondrial/stress genes                            | —                       | ✅ Yes             |
| **7**   | Suprabasal squamous tumor (wound/UPR)                       | **SPRR family**, **KRT6A/6B/16/17**, TACSTD2, CLDN4, CEACAM6, LCN2, SPP1; keratinization/wound response                             | —                       | ✅ Yes             |
| **8**   | Airway secretory/AT2 epithelium (benign)                    | **SFTPA1/SFTPB**, **SLC34A2**, **LPCAT1**, **PIGR**, **WFDC2**, MUC1, CEACAM6; club/AT2 features                                    | —                       | ❌ No              |
| **4**   | Basal/suprabasal squamous tumor (hypoxia/glycolysis)        | NDRG1, HK2, **SLC2A1**, EIF4EBP1, PERP, **KRT6A/16/17/5/14**, IGFBP2; wound/ER-stress                                               | —                       | ✅ Yes             |
| **6**   | Basal squamous tumor (NOTCH/oxidative-stress)               | **KRT5/15/17**, TP63, **EIF4EBP1**, NFE2L2, ALDH1A1; CLDN1, GSTP1; basal program                                                    | —                       | ✅ Yes             |
| **11**  | Suprabasal squamous tumor (ISG-high/inflammatory)           | **KRT5/6/14/16/17**, **S100A8/A9**, PERP, TRIM29, CLDN1, SLC2A1; keratinization                                                     | —                       | ✅ Yes             |
| **9**   | CAF / desmoplastic stroma with epithelial bleed             | **COL1A1/1A2/3A1/6A**, **FN1, SPARC, VCAN**, TIMP1; some SLC34A2/CEACAM6 contamination                                              | —                       | ❌ No              |
| **0**   | CAF / myofibroblasts (ECM remodeling)                       | **COL1/3/5/6**, **SPARC, FN1, VCAN, POSTN**, **MMP2/11/14**, CTHRC1, THBS2, CDH11                                                   | —                       | ❌ No              |
| **10**  | Basal squamous tumor (TP63/CLDN1+, mitoch. stress)          | **KRT5/15**, **TP63**, **CLDN1**, EIF4EBP1, TRIM29, CCND1, DSC3; mt-encoded transcripts                                             | —                       | ✅ Yes             |
| **12**  | Basal squamous tumor (LY6D+/keratinization)                 | **KRT5/13/15/16**, **LY6D**, **S100A8/A9**, CLDN1, PERP, DMKN, DSC3; basal/keratinizing                                             | —                       | ✅ Yes             |
| **2**   | Plasma/B cells in stromal niche                             | Ig heavy/light (**IGHG/IGHA/IGHM, IGKC**), **JCHAIN, XBP1, MZB1**, CD79A/POU2AF1 → plasma/GCB continuum; ECM transcripts from niche | **Plasma/B (lymphoid)** | ❌ No              |
| **13**  | Basal squamous tumor (hypoxia/stress/maspin)                | **NDRG1, SERPINB5 (maspin), GPNMB**, TP63, **CLDN1**, HK2, SLC2A1; basal keratin program                                            | —                       | ✅ Yes             |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
cluster_to_subtype = {
    '0': 'Stromal - CAF myofibroblastic', '1': 'Malignant - basal/squamous',
    '2': 'Lymphoid - Plasma', '3': 'Malignant - basal/squamous',
    '4': 'Malignant - suprabasal/basal', '5': 'Malignant - suprabasal/squamous',
    '6': 'Malignant - basal/squamous', '7': 'Malignant - suprabasal/squamous',
    '8': 'Epithelial - AT2', '9': 'Stromal - CAF matrix-remodeling',
    '10': 'Malignant - basal/squamous', '11': 'Malignant - suprabasal/squamous',
    '12': 'Malignant - basal/squamous', '13': 'Malignant - basal/squamous',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_subtype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['cellsubtypes'] = (
    leiden_ids.map(cluster_to_subtype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_subtype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Leiden cluster id -> coarse compartment for this sample.
cluster_to_celltype = {
    '0': 'Stromal', '1': 'Malignant',
    '2': 'Lymphoid', '3': 'Malignant',
    '4': 'Malignant', '5': 'Malignant',
    '6': 'Malignant', '7': 'Malignant',
    '8': 'Epithelial', '9': 'Stromal',
    '10': 'Malignant', '11': 'Malignant',
    '12': 'Malignant', '13': 'Malignant',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_celltype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['celltypes'] = (
    leiden_ids.map(cluster_to_celltype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_celltype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Show both annotation levels side by side on the UMAP.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation columns (with the obs index) into the sample's folder.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute column (last column) of a GTF line into a dict.

    Attributes are ``;``-separated ``key value`` pairs; surrounding double
    quotes are removed from the value. A key that appears several times keeps
    its last value, and a key without a value maps to an empty string instead
    of raising.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # partition() tolerates a missing value, unlike split(" ", 1) + unpacking.
        key, _, val = field.partition(" ")
        # Strip whitespace first so repeated spaces do not shield the quotes.
        attributes[key] = val.strip().strip('"')
    return attributes

# Collect one record per `gene` feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        seqname, _source, feature_type, lo, hi, _score, strand_sign, _phase, attr_text = raw_line.strip().split("\t")
        if feature_type == "gene":  # transcript/exon rows are not needed
            gene_attrs = parse_gtf_attributes(attr_text)
            records.append(dict(
                gene_id=gene_attrs.get("gene_id"),
                gene_name=gene_attrs.get("gene_name"),
                chromosome=seqname.replace("chr", ""),  # stored without the 'chr' prefix
                start=int(lo),
                end=int(hi),
                strand=strand_sign,
            ))

# Index by gene symbol, keep only genes present in adata, and keep the first
# row when a symbol occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
shared_genes = gtf_df.index.intersection(adata.var_names)
gtf_df = gtf_df.loc[shared_genes]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the gene coordinates to adata.var. Reindexing on var_names keeps the rows in
# AnnData's own gene order (genes absent from the GTF get NaN), and assigning column by
# column overwrites coordinates from an earlier run instead of creating duplicate
# columns, which is what pd.concat(axis=1) did when this cell was executed twice or
# when a previously annotated file was reloaded.
gene_coords = gtf_df.reindex(adata.var_names)
for coord_col in gene_coords.columns:
    adata.var[coord_col] = gene_coords[coord_col].to_numpy()

# Restore the 'chr' prefix that was stripped while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Restore the raw counts as X and drop the `log1p` entry from uns, since X is no
# longer log-transformed. pop() instead of del so re-running the cell does not raise.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Run infercnvpy ----
# Cell types used as the reference population for the CNV estimate.
reference_groups = ["Stromal","Lymphoid"]

# infercnvpy documents that its input must be log-transformed, but X was reset to raw
# counts in the previous cell. Build a temporary log-normalised layer from the stored
# counts (same target_sum as the normalisation used for clustering) and use it.
adata.layers["log1p_norm"] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer="log1p_norm",
)

# Drop the temporary layer so the AnnData keeps the same layers as before this cell.
del adata.layers["log1p_norm"]


In [None]:
# ---- Visualization ----
# Heatmap of the CNV profiles, cells grouped by fine-grained annotation
# (chromosomes along x, cells along y).
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): this overwrites the same adata.h5ad that was loaded at the start of this sample's section.
adata.write_h5ad(adata_path)

# LIB-064885st1

In [None]:
# Locations for this sample: its folder under base_dir and the AnnData file inside it.
sample_name = 'LIB-064885st1'
sample_path = base_dir.joinpath(sample_name)
adata_path = sample_path.joinpath("adata.h5ad")

In [None]:
# Load this sample's AnnData from disk.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of the matrix as loaded in a layer before X is normalised.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale every observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 1.5; the hand-written cluster -> label mappings below refer to these cluster ids.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# Show the Leiden clusters on the UMAP.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes for each Leiden cluster with the Wilcoxon test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Marker table of every Leiden cluster, keyed by cluster id.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, markers in de_genes.items():
    print(cluster)
    # to_numpy() replaces Series.ravel(), which pandas has deprecated; the printed array is the same.
    print(markers.head(50)['names'].to_numpy())

| Cluster | Likely annotation                                          | Quick rationale (marker highlights)                                                             | Immune compartment            | Malignant likely? |
| ------- | ---------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------- | ----------------- |
| **5**   | Tumor-associated macrophages (TAMs)                        | **CD163, CD68, ITGAX, C1QA/B/C, MSR1, SLC11A1**, APOE/CTSB/CTSD/CTSZ; lipid/lysosomal program   | **Myeloid (macrophage/TAM)**  | ❌ No              |
| **12**  | Endothelial/perivascular stroma (activated)                | **COL4A1/4A2, ENG, EPAS1, CLDN5** with ECM (**COL1/3/6, SPARC, IGFBP7**), **TAGLN/MYL9**        | —                             | ❌ No              |
| **4**   | Alveolar epithelium (AT2/secretory; benign)                | **SFTPA1/B/C/D, SLC34A2, NAPSA, LPCAT1, PIGR**, MUC1; some stromal bleed                        | —                             | ❌ No              |
| **3**   | CAF stroma with plasma admixture                           | **COL1/3/5/6, SPARC, FN1, MMP2/14, TAGLN**, + Ig genes (**IGHG/IGHA/IGKC, XBP1**)               | **Plasma present (adjacent)** | ❌ No              |
| **9**   | Lymphoid (T/B mix) in vascular niche                       | **IGKC, TRAC/TRBC2, CCL19**, RGS1; endothelial/stromal proximity (AQP1/DCN/PECAM1)              | **Lymphoid (mixed)**          | ❌ No              |
| **0**   | Basal/suprabasal **squamous tumor** (stress)               | **KRT5, TP63, EPCAM**, **IGFBP2, SLC2A1, PERP**, GPX2; mitochondrial stress set                 | —                             | ✅ Yes             |
| **16**  | Inflammatory monocytes / TAM-like (neutro-adjacent)        | **CXCL8, TREM1/2, FCER1G, MS4A7, CD14**, ITGAX, SIGLEC9                                         | **Myeloid (mono/TAM)**        | ❌ No              |
| **14**  | Vascular–myeloid niche (endothelium/pericyte + macrophage) | **C1QA/B/C, ITGAX, FCGR3A** with **EGFL7/PTPRB/PDGFB/ACTA2**; wound/angiogenic signals          | **Myeloid + vascular stroma** | ❌ No              |
| **13**  | Plasma cells in stromal niche                              | **JCHAIN, XBP1, MZB1**, Ig heavy/light; embedded in **COL1/3/6** matrix                         | **Plasma (lymphoid)**         | ❌ No              |
| **6**   | TAMs (antigen-presentation/activated)                      | **CD163, ITGAX, CIITA, C1QC**, STAB1, AGER; macrophage activation                               | **Myeloid (macrophage/TAM)**  | ❌ No              |
| **10**  | Low-quality / ambient (OR/rare/testis genes)               | Dominated by OR/rare transcripts (BPIFC, ORs, PIWIL2…) → no coherent lineage                    | **Unclear**                   | ⚠️ Unclear        |
| **2**   | Basal **squamous tumor** (TP63/SOX2/IGFBP2+)               | **KRT5, TP63, SOX2, EPCAM**, **IGFBP2, SLC2A1**, GPX2/TXNRD1                                    | —                             | ✅ Yes             |
| **17**  | Ambiguous / low-quality (eosinophil-leaning?)              | Scatter incl. **CD177, CCL11** but many OR/odd genes; no stable program                         | **Unclear**                   | ⚠️ Unclear        |
| **11**  | Basal **squamous tumor** (replicating 2-like)              | **KRT5, TP63, SOX2**, IGFBP2, SLC2A1; same stress/mt signature as 2                             | —                             | ✅ Yes             |
| **8**   | Plasma cells (clean)                                       | **IGHG/IGHA/IGKC**, **JCHAIN, XBP1, MZB1**, DERL3                                               | **Plasma (lymphoid)**         | ❌ No              |
| **15**  | Plasma/B-cell continuum (GC/PC)                            | **POU2AF1, CD38, FCRL5**, with **XBP1/JCHAIN/MZB1**; ECM proximity                              | **B ↔ Plasma**                | ❌ No              |
| **1**   | Basal **squamous tumor** (TP63/IGFBP2+)                    | **KRT5/6/17, TP63**, **IGFBP2, PERP, SLC2A1**, GSTP1; keratin junctions (DSP/DSC)               | —                             | ✅ Yes             |
| **7**   | Basal **squamous tumor** (stress/glycolysis)               | **KRT5/17, TP63**, **PTHLH, IGFBP2, SLC2A1, NDRG1**, GPX2; mito/glycolytic stress               | —                             | ✅ Yes             |
| **18**  | Dendritic-cell–like / APC                                  | **ITGAX (CD11c), DOCK2/DOCK10, ARHGAP4, CD37**, CIITA-adjacent milieu; lymphoid-migratory genes | **Myeloid (dendritic-like)**  | ❌ No              |
| **19**  | Ambiguous multiplet (secretory/adipocyte/rare)             | Mixed airway secretory (BPIFB6), adipocyte (**ADIPOQ**), and rare/testis/OR genes               | **Unclear**                   | ⚠️ Unclear        |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
# 'Perivascular - Endothelial' is spelled with the capitalisation used for the
# LIB-064889st1 and LIB-065290st1 samples so the label is identical across samples.
cluster_to_subtype = {
    '0': 'Malignant - suprabasal/basal', '1': 'Malignant - basal/squamous',
    '2': 'Malignant - basal/squamous', '3': 'Stromal - CAF myofibroblastic',
    '4': 'Epithelial - AT2', '5': 'Myeloid - TAM',
    '6': 'Myeloid - TAM', '7': 'Malignant - basal/squamous',
    '8': 'Lymphoid - Plasma', '9': 'Lymphoid - T/B',
    '10': 'Noise', '11': 'Malignant - basal/squamous',
    '12': 'Perivascular - Endothelial', '13': 'Lymphoid - Plasma',
    '14': 'Noise', '15': 'Lymphoid - Plasma',
    '16': 'Myeloid - TAM/monocytes', '17': 'Noise',
    '18': 'Noise', '19': 'Noise',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_subtype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['cellsubtypes'] = (
    leiden_ids.map(cluster_to_subtype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_subtype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Leiden cluster id -> coarse compartment for this sample.
cluster_to_celltype = {
    '0': 'Malignant', '1': 'Malignant',
    '2': 'Malignant', '3': 'Stromal',
    '4': 'Epithelial', '5': 'Myeloid',
    '6': 'Myeloid', '7': 'Malignant',
    '8': 'Lymphoid', '9': 'Lymphoid',
    '10': 'Noise', '11': 'Malignant',
    '12': 'Perivascular', '13': 'Lymphoid',
    '14': 'Noise', '15': 'Lymphoid',
    '16': 'Myeloid', '17': 'Noise',
    '18': 'Noise', '19': 'Noise',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_celltype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['celltypes'] = (
    leiden_ids.map(cluster_to_celltype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_celltype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Show both annotation levels side by side on the UMAP.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation columns (with the obs index) into the sample's folder.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute column (last column) of a GTF line into a dict.

    Attributes are ``;``-separated ``key value`` pairs; surrounding double
    quotes are removed from the value. A key that appears several times keeps
    its last value, and a key without a value maps to an empty string instead
    of raising.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # partition() tolerates a missing value, unlike split(" ", 1) + unpacking.
        key, _, val = field.partition(" ")
        # Strip whitespace first so repeated spaces do not shield the quotes.
        attributes[key] = val.strip().strip('"')
    return attributes

# Collect one record per `gene` feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as gtf_handle:
    for raw_line in gtf_handle:
        if raw_line.startswith("#"):
            continue  # header / comment line
        seqname, _source, feature_type, lo, hi, _score, strand_sign, _phase, attr_text = raw_line.strip().split("\t")
        if feature_type == "gene":  # transcript/exon rows are not needed
            gene_attrs = parse_gtf_attributes(attr_text)
            records.append(dict(
                gene_id=gene_attrs.get("gene_id"),
                gene_name=gene_attrs.get("gene_name"),
                chromosome=seqname.replace("chr", ""),  # stored without the 'chr' prefix
                start=int(lo),
                end=int(hi),
                strand=strand_sign,
            ))

# Index by gene symbol, keep only genes present in adata, and keep the first
# row when a symbol occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
shared_genes = gtf_df.index.intersection(adata.var_names)
gtf_df = gtf_df.loc[shared_genes]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the gene coordinates to adata.var. Reindexing on var_names keeps the rows in
# AnnData's own gene order (genes absent from the GTF get NaN), and assigning column by
# column overwrites coordinates from an earlier run instead of creating duplicate
# columns, which is what pd.concat(axis=1) did when this cell was executed twice or
# when a previously annotated file was reloaded.
gene_coords = gtf_df.reindex(adata.var_names)
for coord_col in gene_coords.columns:
    adata.var[coord_col] = gene_coords[coord_col].to_numpy()

# Restore the 'chr' prefix that was stripped while parsing the GTF.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Restore the raw counts as X and drop the `log1p` entry from uns, since X is no
# longer log-transformed. pop() instead of del so re-running the cell does not raise.
adata.uns.pop('log1p', None)
adata.X = adata.layers['counts']

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- Run infercnvpy ----
# Cell types used as the reference population for the CNV estimate.
reference_groups = ["Stromal","Lymphoid","Myeloid"]

# infercnvpy documents that its input must be log-transformed, but X was reset to raw
# counts in the previous cell. Build a temporary log-normalised layer from the stored
# counts (same target_sum as the normalisation used for clustering) and use it.
adata.layers["log1p_norm"] = sc.pp.log1p(
    sc.pp.normalize_total(adata, target_sum=10000, layer="counts", inplace=False)["X"]
)

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer="log1p_norm",
)

# Drop the temporary layer so the AnnData keeps the same layers as before this cell.
del adata.layers["log1p_norm"]


In [None]:
# ---- Visualization ----
# Heatmap of the CNV profiles, cells grouped by fine-grained annotation
# (chromosomes along x, cells along y).
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE(review): this overwrites the same adata.h5ad that was loaded at the start of this sample's section.
adata.write_h5ad(adata_path)

# LIB-064890st1

In [None]:
# Locations for this sample: its folder under base_dir and the AnnData file inside it.
sample_name = 'LIB-064890st1'
sample_path = base_dir.joinpath(sample_name)
adata_path = sample_path.joinpath("adata.h5ad")

In [None]:
# Load this sample's AnnData from disk.
adata = sc.read_h5ad(adata_path)

In [None]:
# Keep a copy of the matrix as loaded in a layer before X is normalised.
adata.layers['counts'] = adata.X.copy()

In [None]:
# Scale every observation to a total of 10,000, then log1p-transform X.
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
# PCA with scanpy's default settings.
sc.tl.pca(adata)

In [None]:
# Neighbourhood graph with scanpy's default settings.
sc.pp.neighbors(adata)

In [None]:
# UMAP embedding with scanpy's default settings.
sc.tl.umap(adata)

In [None]:
# Leiden clustering at resolution 1.5; the hand-written cluster -> label mappings below refer to these cluster ids.
sc.tl.leiden(adata, resolution=1.5)

In [None]:
# Show the Leiden clusters on the UMAP.
sc.pl.umap(adata, color=['leiden'])

In [None]:
# Rank marker genes for each Leiden cluster with the Wilcoxon test.
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# Marker table of every Leiden cluster, keyed by cluster id.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs.leiden.unique()
}

In [None]:
# Print the 50 top-ranked marker genes of every cluster.
for cluster, markers in de_genes.items():
    print(cluster)
    # to_numpy() replaces Series.ravel(), which pandas has deprecated; the printed array is the same.
    print(markers.head(50)['names'].to_numpy())

| Cluster | Likely annotation                               | Quick rationale (marker highlights)                                                 | Immune compartment           | Malignant likely? |
| ------ | ----------------------------------------------- | ----------------------------------------------------------------------------------- | ---------------------------- | ---------- |
| **1**  | LUAD tumor epithelium (mucinous/secretory)      | **MUC5B**, **SFTPB**, **EPCAM**, **CEACAM5/6**, **NAPSA**, TFF3, MAL2, CCND1        | —                            | ✅          |
| **0**  | LUAD tumor epithelium (AT2/secretory)           | **SFTPB**, EPCAM, **CEACAM6**, **NAPSA**, AQP3, MAL2, GPX4; secretory program       | —                            | ✅          |
| **4**  | LUAD tumor epithelium (inflamed/invasive)       | AT2/secretory core with **CXCL8/5/20**, TGFA, ZBED2, LYPD3 → inflamed/EGFR-adjacent | —                            | ✅          |
| **3**  | LUAD tumor epithelium (AT2/secretory)           | **SFTPB**, EPCAM, **TFF3**, MUC5B, CCND1, MAL2, NAPSA                               | —                            | ✅          |
| **2**  | CAF / myofibroblasts (matrix-remodeling)        | **COL1/3/5/6**, **FN1, SPARC**, **TAGLN/MYL9/ACTA2**, **IGFBP7**, VCAN, THBS2       | —                            | ❌          |
| **5**  | Perivascular CAF / mural-like stroma            | ECM collagens + **RGS5/MYLK/ABCC9**, **ACTA2**, THY1 → pericyte/CAF mix             | —                            | ❌          |
| **6**  | CAF (matrix/angiogenic; endothelial bleed)      | Collagens/CAF set + **PECAM1/LAMB1/LAMA4**, **MMP14**, **LTBP2**                    | —                            | ❌          |
| **7**  | Ambiguous / low-quality                         | OR/rare/testis/neuronal genes; no coherent lineage                                  | *Unclear*                    | ⚠️ Unclear |
| **9**  | Ambiguous cytotoxic/innate mix (low confidence) | **SPP1**, **CXCL8**, GZMH with many atypical/OR genes → likely multiplets/ambient   | *Unclear*                    | ⚠️ Unclear |
| **8**  | Tumor-associated macrophages (TAMs)             | **CD163, CD68, ITGAX, C1QA/B/C, TYROBP**, LGMN, FCER1G; APOE/CTSB/CTSD              | **Myeloid (macrophage/TAM)** | ❌          |
| **11** | Endothelial/immune admixture (low confidence)   | **ECSCR, MMRN1**, CHRDL1, SIGLEC9, MRC1, CXCL8 → vascular + myeloid mix             | *Mixed (endo + myeloid)*     | ⚠️ Unclear |
| **10** | Endothelium (activated vascular niche)          | **VWF, EMILIN2, ADAMTSL4/1, STAB1**, FBN1; perivascular signaling                   | —                            | ❌          |
| **13** | LUAD tumor epithelium (stress/UPR)              | **MUC5B/MUC1**, **HSPA5**, DUSP1/23, RTN3, TIMP1, CEACAM set; epithelial metabolism | —                            | ✅          |
| **12** | LUAD tumor epithelium (progenitor/stress)       | **SOX4, DUSP6**, DSP, PLXND1, RAB25-like surface program; epithelial                | —                            | ✅          |
| **14** | LUAD tumor epithelium (CCND1/CEACAM6+)          | **CEACAM6**, **PRSS23**, **CCND1**, MDK/ERRFI1; secretory/stress pattern            | —                            | ✅          |


In [None]:
# Leiden cluster id -> fine-grained annotation for this sample.
# 'Perivascular - Endothelial' is spelled with the capitalisation used for the
# LIB-064889st1 and LIB-065290st1 samples so the label is identical across samples.
cluster_to_subtype = {
    '0': 'Malignant - AT2/secretory', '1': 'Malignant - mucinous/secretory',
    '2': 'Stromal - CAF myofibroblastic', '3': 'Malignant - AT2/secretory',
    '4': 'Malignant - inflamed/invasive', '5': 'Stromal - CAF perivascular',
    '6': 'Stromal - CAF matrix-remodeling', '7': 'Noise',
    '8': 'Myeloid - TAM', '9': 'Noise',
    '10': 'Perivascular - Endothelial', '11': 'Noise',
    '12': 'Malignant - progenitor', '13': 'Malignant - stress',
    '14': 'Malignant - stress',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_subtype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-subtype annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['cellsubtypes'] = (
    leiden_ids.map(cluster_to_subtype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_subtype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Leiden cluster id -> coarse compartment for this sample.
cluster_to_celltype = {
    '0': 'Malignant', '1': 'Malignant',
    '2': 'Stromal', '3': 'Malignant',
    '4': 'Malignant', '5': 'Stromal',
    '6': 'Stromal', '7': 'Noise',
    '8': 'Myeloid', '9': 'Noise',
    '10': 'Perivascular', '11': 'Noise',
    '12': 'Malignant', '13': 'Malignant',
    '14': 'Malignant',
}

# `.map` replaces `.replace`: pandas has deprecated `replace` on categorical columns,
# and `replace` would silently pass through any cluster id missing from the mapping.
leiden_ids = adata.obs['leiden'].astype(str)
unmapped = sorted(set(leiden_ids) - set(cluster_to_celltype))
if unmapped:
    raise ValueError(f"Leiden clusters without a cell-type annotation: {unmapped}")

# Categories follow the order in which labels first appear in the mapping.
adata.obs['celltypes'] = (
    leiden_ids.map(cluster_to_celltype)
    .astype(pd.CategoricalDtype(list(dict.fromkeys(cluster_to_celltype.values()))))
    .cat.remove_unused_categories()
)

In [None]:
# Show both annotation levels side by side on the UMAP.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
# Export both annotation columns (with the obs index) into the sample's folder.
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute (9th) column of a GTF line into a dict.

    Attributes are ``key value`` pairs separated by ``;``. Surrounding double
    quotes are removed from values, so every value is returned as a string.

    Parameters
    ----------
    attr_str : str
        Raw attribute column, e.g. ``'gene_id "ENSG..."; gene_name "TP53";'``.

    Returns
    -------
    dict
        Attribute name -> value. A key that occurs several times keeps its
        last value; a key without a value maps to ``""``.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # Split on any run of whitespace so repeated spaces or tabs between key
        # and value are tolerated, and a value-less key no longer raises.
        parts = field.split(None, 1)
        value = parts[1].strip().strip('"') if len(parts) > 1 else ""
        attributes[parts[0]] = value
    return attributes

# Build one record per "gene" feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as fh:
    for line in fh:
        if not line.startswith("#"):  # header/comment lines carry no features
            chrom, source, feature, start, end, score, strand, frame, attrs = line.strip().split("\t")
            if feature == "gene":  # transcript/exon/... rows are not needed
                attr_dict = parse_gtf_attributes(attrs)
                records.append(dict(
                    gene_id=attr_dict.get("gene_id"),
                    gene_name=attr_dict.get("gene_name"),
                    chromosome=chrom.replace("chr",""),  # stored without the 'chr' prefix
                    start=int(start),
                    end=int(end),
                    strand=strand,
                ))

# Index by gene symbol, restrict to genes present in adata, and keep only the
# first GTF row for any symbol that occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
gtf_df = gtf_df.loc[gtf_df.index.intersection(adata.var_names)]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the GTF annotation (gene_id, chromosome, start, end, strand) to adata.var.
# Annotation columns already present -- from an earlier run of this cell, or because
# the h5ad loaded above was previously overwritten with an annotated object -- are
# dropped first, so the cell can be re-run without creating duplicate columns.
adata.var = adata.var.drop(columns=list(gtf_df.columns), errors='ignore').join(gtf_df)

# gtf_df stores chromosome names without prefix; add 'chr' back.
# Genes without a GTF match keep a missing value here.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Remove scanpy's log1p bookkeeping and put the raw counts back into X.
# pop() rather than del, so re-running the cell does not raise KeyError.
adata.uns.pop('log1p', None)
# Copy so that X and layers['counts'] are independent matrices; without it an
# in-place transform of X would also modify the stored raw counts.
adata.X = adata.layers['counts'].copy()

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- 1. Preprocess for infercnv ----
# Reference populations; the labels must exist in adata.obs['celltypes'].
reference_groups = ["Stromal","Myeloid"]

# infercnvpy expects log-normalized expression, but X was reset to raw counts in the
# previous cell. Build a temporary log-normalized layer from the stored counts
# (same target_sum as used for clustering) and point infercnv at it.
lognorm = sc.pp.normalize_total(adata, target_sum=10000, layer='counts', inplace=False)['X']
adata.layers['infercnv_lognorm'] = sc.pp.log1p(lognorm, copy=True)
del lognorm

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer='infercnv_lognorm',
)

# The helper layer is only an input to infercnv; drop it so it is not written to disk.
del adata.layers['infercnv_lognorm']


In [None]:
# ---- 3. Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y)
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE: adata_path is the same file this sample was read from, so the input h5ad is
# overwritten in place with the annotated object.
adata.write_h5ad(adata_path)

# LIB-065291st1

In [None]:
sample_name = 'LIB-065291st1'
sample_path = base_dir / sample_name
adata_path = sample_path / "adata.h5ad"

In [None]:
adata = sc.read_h5ad(adata_path)

In [None]:
adata.layers['counts'] = adata.X.copy()

In [None]:
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
sc.tl.pca(adata)

In [None]:
sc.pp.neighbors(adata)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution=1.5)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for gr in de_genes:
    print(gr)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(de_genes[gr].head(50)['names'].to_numpy())

| Cluster | Likely annotation                                                              | Quick rationale (marker highlights)                                                                                                 | Immune compartment                        | Malignant likely?                     |
| ------- | ------------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------- | ------------------------------------- |
| **0**   | Basal/suprabasal **squamous tumor** (inflammatory/ISG-high)                    | **KRT5/6A/6B/6C/14/17**, CLDN1, PERP, **S100A8/A9/A14/A16**, PI3/SLPI, GSTP1, SLC2A1, TM4SF1                                        | —                                         | **✅ Yes**                             |
| **5**   | **CAF / myofibroblasts** (ECM remodeling)                                      | **COL1A1/1A2/3A1/6A1/6A2**, FN1, **SPARC**, VCAN, POSTN, **MMP2/11**, CTHRC1, LUM/DCN                                               | —                                         | **❌ No**                              |
| **3**   | **Squamous tumor** (metabolic/EMT-leaning)                                     | **SPP1**, NOTCH3/NRARP, ITGB8, **AKR1C1/2/3**, GPX2/4, NQO1, CES1, SLC7A5                                                           | —                                         | **✅ Yes**                             |
| **12**  | Basal **squamous tumor** (NOTCH program)                                       | **KRT5/6A/14/17**, CLCA2, PERP, **NOTCH3/NRARP**, PTHLH, SLC2A1, SDC1                                                               | —                                         | **✅ Yes**                             |
| **4**   | **Myeloid – neutrophils/activated monocytes** (TAM/neutro mix)                 | **CXCL8, ITGAX, SLC11A1, TREM1, FCAR, C5AR1**, LCP1, ALOX5AP, NCF2; inflammatory mediators (OSM, PLAUR) with some KRT bleed-through | **Myeloid (neutro/mono mix)**             | **❌ No**                              |
| **2**   | Basal/suprabasal **squamous tumor** (stress/glycolytic)                        | **KRT5/6A/16/17**, CLDN1, PERP, **S100A8/A9** (subset), SLC2A1, HSPB1, EIF4EBP1, GSTP1, TM4SF1                                      | —                                         | **✅ Yes**                             |
| **1**   | **CAF / myofibroblasts** with plasma admixture                                 | ECM core (COL1/3/5/6, **SPARC, FN1, POSTN, TAGLN/ACTA2**), **TIMP1/THBS2**, plus Ig (IGHG/IGKC/JCHAIN/XBP1)                         | Plasma present (adjacent)                 | **❌ No**                              |
| **9**   | **Myeloid – neutrophil-skewed / low-quality** (ambient OR/mucin contamination) | **CXCL8, SRGN, BCL2A1, PROK2, CCRL2**, FCAR; many olfactory/mucin genes → likely low-quality / mixed                                | **Myeloid (neutro-like), low confidence** | **⚠️ Unclear (non-malignant likely)** |
| **6**   | **Squamous tumor** (xenobiotic/stress program)                                 | **SPP1**, NOTCH3/NRARP, **AKR1C1/2/3, AKR1B10**, GPX2, ALDH3A1, WNT5A, SLC transporters                                             | —                                         | **✅ Yes**                             |
| **8**   | **Myeloid – neutrophils** (TREM1/FCGR3B axis)                                  | **CXCL8, TREM1, FCGR3B, CEACAM3, CSF3R**, FPR1/2, LSP1, NCF2, AQP9                                                                  | **Myeloid (neutrophils)**                 | **❌ No**                              |
| **7**   | **Low-quality / ambient multiplet**                                            | Dominated by **USP17L** family, OR/sex-germline/cosmic reads; no coherent lineage                                                   | **Unclear**                               | **⚠️ Unclear**                        |
| **10**  | **Myeloid – inflammatory neutro/mono** (ambient contamination)                 | **CXCL8, BCL2A1, OSM, SRGN, NCF2**, FCAR; numerous OR/testis hits → mixed/low-quality                                               | **Myeloid (neutro/mono), low confidence** | **❌ No**                              |
| **11**  | **Myeloid – mono/TAM** (with Ig bleed-through)                                 | **LYZ, C1QA, CD74, IFI30**, MS4A6A; scattered Ig and stress genes                                                                   | **Myeloid (monocytes/TAM)**               | **❌ No**                              |
| **13**  | **Ambiguous / low-quality multiplet**                                          | IGKC with many non-lung OR/reproductive/neuronal genes; no stable program                                                           | **Unclear**                               | **⚠️ Unclear**                        |
| **14**  | **Myeloid – neutrophil-like (low confidence)**                                 | **PLAUR**, **CEACAM3**, CES1; mixed metabolic/rare transcripts; likely neutro-adjacent with ambient                                 | **Myeloid (neutrophil-like)**             | **❌ No**                              |


In [None]:
# Collapse the Leiden clusters '0'-'14' into '<lineage> - <state>' labels, following the
# annotation table above; clusters judged low-quality are labelled 'Noise'.
# NOTE(review): cluster 11 is described in the table as 'Myeloid - mono/TAM' but is
# labelled 'Noise' here -- confirm this is intentional.
# NOTE(review): `leiden` is presumably categorical; pandas 2.2 deprecates Series.replace
# on categoricals when it changes the categories -- confirm with the installed version.
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({'0': 'Malignant - basal/suprabasal', '1': 'Stromal - CAF myofibroblastic', 
                                                      '2': 'Malignant - basal/suprabasal', '3': 'Malignant - squamous', 
                         '4': 'Myeloid - neutrophils/monocytes', '5': 'Stromal - CAF myofibroblastic',
                        '6': 'Malignant - squamous', '7': 'Noise', 
                         '8': 'Myeloid - neutrophils/monocytes', '9': 'Noise',
                            '10': 'Noise', '11': 'Noise', '12': 'Malignant - basal/squamous', 
                                                     '13': 'Noise', '14': 'Noise'})

In [None]:
# Coarse lineage per Leiden cluster; each value equals the part before ' - ' of the
# corresponding 'cellsubtypes' label. 'Stromal' and 'Myeloid' serve as the infercnv
# reference categories further down.
adata.obs['celltypes'] = adata.obs.leiden.replace({'0': 'Malignant', '1': 'Stromal', 
                                                      '2': 'Malignant', '3': 'Malignant', 
                         '4': 'Myeloid', '5': 'Stromal',
                        '6': 'Malignant', '7': 'Noise', 
                         '8': 'Myeloid', '9': 'Noise',
                            '10': 'Noise', '11': 'Noise', '12': 'Malignant', 
                                                     '13': 'Noise', '14': 'Noise'})

In [None]:
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute (9th) column of a GTF line into a dict.

    Attributes are ``key value`` pairs separated by ``;``. Surrounding double
    quotes are removed from values, so every value is returned as a string.

    Parameters
    ----------
    attr_str : str
        Raw attribute column, e.g. ``'gene_id "ENSG..."; gene_name "TP53";'``.

    Returns
    -------
    dict
        Attribute name -> value. A key that occurs several times keeps its
        last value; a key without a value maps to ``""``.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # Split on any run of whitespace so repeated spaces or tabs between key
        # and value are tolerated, and a value-less key no longer raises.
        parts = field.split(None, 1)
        value = parts[1].strip().strip('"') if len(parts) > 1 else ""
        attributes[parts[0]] = value
    return attributes

# Build one record per "gene" feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as fh:
    for line in fh:
        if not line.startswith("#"):  # header/comment lines carry no features
            chrom, source, feature, start, end, score, strand, frame, attrs = line.strip().split("\t")
            if feature == "gene":  # transcript/exon/... rows are not needed
                attr_dict = parse_gtf_attributes(attrs)
                records.append(dict(
                    gene_id=attr_dict.get("gene_id"),
                    gene_name=attr_dict.get("gene_name"),
                    chromosome=chrom.replace("chr",""),  # stored without the 'chr' prefix
                    start=int(start),
                    end=int(end),
                    strand=strand,
                ))

# Index by gene symbol, restrict to genes present in adata, and keep only the
# first GTF row for any symbol that occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
gtf_df = gtf_df.loc[gtf_df.index.intersection(adata.var_names)]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the GTF annotation (gene_id, chromosome, start, end, strand) to adata.var.
# Annotation columns already present -- from an earlier run of this cell, or because
# the h5ad loaded above was previously overwritten with an annotated object -- are
# dropped first, so the cell can be re-run without creating duplicate columns.
adata.var = adata.var.drop(columns=list(gtf_df.columns), errors='ignore').join(gtf_df)

# gtf_df stores chromosome names without prefix; add 'chr' back.
# Genes without a GTF match keep a missing value here.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Remove scanpy's log1p bookkeeping and put the raw counts back into X.
# pop() rather than del, so re-running the cell does not raise KeyError.
adata.uns.pop('log1p', None)
# Copy so that X and layers['counts'] are independent matrices; without it an
# in-place transform of X would also modify the stored raw counts.
adata.X = adata.layers['counts'].copy()

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- 1. Preprocess for infercnv ----
# Reference populations; the labels must exist in adata.obs['celltypes'].
reference_groups = ["Stromal","Myeloid"]

# infercnvpy expects log-normalized expression, but X was reset to raw counts in the
# previous cell. Build a temporary log-normalized layer from the stored counts
# (same target_sum as used for clustering) and point infercnv at it.
lognorm = sc.pp.normalize_total(adata, target_sum=10000, layer='counts', inplace=False)['X']
adata.layers['infercnv_lognorm'] = sc.pp.log1p(lognorm, copy=True)
del lognorm

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer='infercnv_lognorm',
)

# The helper layer is only an input to infercnv; drop it so it is not written to disk.
del adata.layers['infercnv_lognorm']


In [None]:
# ---- 3. Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y)
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE: adata_path is the same file this sample was read from, so the input h5ad is
# overwritten in place with the annotated object.
adata.write_h5ad(adata_path)

# LIB-064886st1

In [None]:
sample_name = 'LIB-064886st1'
sample_path = base_dir / sample_name
adata_path = sample_path / "adata.h5ad"

In [None]:
adata = sc.read_h5ad(adata_path)

In [None]:
# Add per-observation / per-gene QC columns (total_counts among them) to the object.
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Keep only observations with at least 50 total counts.
adata = adata[adata.obs['total_counts'] >= 50].copy()

In [None]:
adata.layers['counts'] = adata.X.copy()

In [None]:
sc.pp.normalize_total(adata, target_sum=10000)
sc.pp.log1p(adata)

In [None]:
sc.tl.pca(adata)

In [None]:
sc.pp.neighbors(adata)

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.leiden(adata, resolution=1)

In [None]:
sc.pl.umap(adata, color=['leiden'])

In [None]:
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster, keyed by cluster label.
de_genes = {
    cluster: sc.get.rank_genes_groups_df(adata, group=cluster)
    for cluster in adata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for gr in de_genes:
    print(gr)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(de_genes[gr].head(50)['names'].to_numpy())

| Cluster | Likely Annotation                               | Rationale                                                                                          | Malignant? |
| ------- | ----------------------------------------------- | -------------------------------------------------------------------------------------------------- | ---------- |
| **0**   | Malignant – basal/squamous (SOX2/FGFR1)         | SOX2, FGFR1, EPCAM, CLDN4, KRT17 → squamous-leaning malignant program with stress (HSP90, HSPB1).  | **Yes**    |
| **1**   | Malignant – basal/squamous (SOX2/FGFR1, stress) | Overlaps with 0 but more stress/chaperones (HSPs, DDIT4, ERO1A, DNAJB1).                           | **Yes**    |
| **2**   | Stromal – CAF (myofibroblastic)                 | Collagen (COL1A1/3/5/6), TAGLN, MMP2, MMP14, THBS1, DCN, FN1.                                      | No         |
| **3**   | Malignant – EMT/stress (hypoxia/invasive)       | Hypoxia/glycolysis (SLC2A1, CA9, NDUFA4L2), EMT (CDH2, MMP12), SOX4, P4HA1, TFRC.                  | **Yes**    |
| **4**   | Myeloid – inflamed/chemokine-high               | CXCL8, IL1B, TREM1, OSM, ITGAX, MMP12 → activated neutrophil/macrophage-like inflammatory cluster. | No         |
| **5**   | B cell / plasma                                 | IGKC, IGHG, IGHA1, IGHM, JCHAIN, MZB1, XBP1 → clear plasma cell program.                           | No         |
| **6**   | Myeloid – TAM (interferon/inflammatory)         | APOE, CD163, ITGAX, CXCL9/10, ISG set (IFI30, IFI6, IFI44L), SPI1.                                 | No         |
| **7**   | Myeloid – TAM (SPP1+/APOE+)                     | SPP1-high macrophages with APOE, CD163, ECM-linked signature (SPARC, COL6A\*).                     | No         |
| **8**   | Malignant – AT2/secretory                       | Canonical AT2/secretory genes (SFTPA1, SFTPB, SFTPC, SLC34A2, NAPSA, MUC1, SCGB3A1/2).             | **Yes**    |
| **9**   | Stromal – CAF (matrix-remodeling)               | Strong ECM program: COL1/3/5/6, POSTN, TIMP1/3, SERPINE1, VCAN, FN1, SPARC.                        | No         |
| **10**  | Perivascular – endothelial                      | ACKR1, PROX1-like endothelial signature, vascular-associated markers.                              | No         |
| **11**  | Malignant – basal/squamous (SOX2/FGFR1)         | SOX2, FGFR1, EPCAM, CRABP2, HHIP, VEGFA. Squamous-like malignant epithelial.                       | **Yes**    |


In [None]:
adata.obs['cellsubtypes'] = adata.obs.leiden.replace({
    '4':  'Myeloid - inflamed/chemokine-high',          # CXCL8/IL1B/TREM1/AQP9/FCAR/MMP12
    '0':  'Malignant - basal/squamous (SOX2/FGFR1)',    # SOX2/FGFR1/CLDN4/EPCAM/KRT17
    '1':  'Malignant - basal/squamous (SOX2/FGFR1)',    # SOX2/FGFR1/KRT15/KRT17 + HSPs (stress)
    '3':  'Malignant - EMT/stress (hypoxia/invasive)',  # SLC2A1/TFRC/P4HA1/CA9/NDUFA4L2/MMP12/CDH2
    '6':  'Myeloid - TAM',    # APOE/CD163/ITGAX/CXCL9/10/IFI* genes
    '9':  'Stromal - CAF matrix-remodeling',            # COL1A1/A2, COL6A*, POSTN/SERPINE1/TIMP3
    '5':  'Lymphoid - Plasma',                            # IGKC/IGHG*/IGHA1/JCHAIN/XBP1/MZB1
    '8':  'Malignant - AT2',                  # SFTPA1/SFTPB/SFTPC/SLC34A2/NAPSA/MUC1
    '2':  'Stromal - CAF myofibroblastic',              # COL1/3/5/6, TAGLN, MMP14, THBS1
    '10': 'Perivascular - endothelial',                 # ACKR1/PROX1-like signal, vascular markers
    '7':  'Myeloid - TAM',                # SPP1/APOE/CD163/LYZ + ECM-adjacent signal
    '11': 'Malignant - basal/squamous (SOX2/FGFR1)',    # SOX2/FGFR1/CRABP2/HHIP
})


In [None]:
adata.obs['celltypes'] = adata.obs['cellsubtypes'].str.split(' - ').str[0]

In [None]:
sc.pl.umap(adata, color=['celltypes','cellsubtypes'], wspace=0.35)

In [None]:
adata.obs[['celltypes','cellsubtypes']].to_csv(sample_path / 'celltypes.csv')

### InferCNV

In [None]:
import pandas as pd
import gzip

def parse_gtf_attributes(attr_str):
    """Parse the attribute (9th) column of a GTF line into a dict.

    Attributes are ``key value`` pairs separated by ``;``. Surrounding double
    quotes are removed from values, so every value is returned as a string.

    Parameters
    ----------
    attr_str : str
        Raw attribute column, e.g. ``'gene_id "ENSG..."; gene_name "TP53";'``.

    Returns
    -------
    dict
        Attribute name -> value. A key that occurs several times keeps its
        last value; a key without a value maps to ``""``.
    """
    attributes = {}
    for field in attr_str.strip().split(";"):
        field = field.strip()
        if not field:
            continue
        # Split on any run of whitespace so repeated spaces or tabs between key
        # and value are tolerated, and a value-less key no longer raises.
        parts = field.split(None, 1)
        value = parts[1].strip().strip('"') if len(parts) > 1 else ""
        attributes[parts[0]] = value
    return attributes

# Build one record per "gene" feature of the GENCODE annotation.
records = []
with gzip.open("../../../Broad_SpatialFoundation/gencode.v48.basic.annotation.gtf.gz", "rt") as fh:
    for line in fh:
        if not line.startswith("#"):  # header/comment lines carry no features
            chrom, source, feature, start, end, score, strand, frame, attrs = line.strip().split("\t")
            if feature == "gene":  # transcript/exon/... rows are not needed
                attr_dict = parse_gtf_attributes(attrs)
                records.append(dict(
                    gene_id=attr_dict.get("gene_id"),
                    gene_name=attr_dict.get("gene_name"),
                    chromosome=chrom.replace("chr",""),  # stored without the 'chr' prefix
                    start=int(start),
                    end=int(end),
                    strand=strand,
                ))

# Index by gene symbol, restrict to genes present in adata, and keep only the
# first GTF row for any symbol that occurs more than once.
gtf_df = pd.DataFrame(records).set_index('gene_name')
gtf_df = gtf_df.loc[gtf_df.index.intersection(adata.var_names)]
gtf_df = gtf_df.loc[~gtf_df.index.duplicated()]

In [None]:
# Attach the GTF annotation (gene_id, chromosome, start, end, strand) to adata.var.
# Annotation columns already present -- from an earlier run of this cell, or because
# the h5ad loaded above was previously overwritten with an annotated object -- are
# dropped first, so the cell can be re-run without creating duplicate columns.
adata.var = adata.var.drop(columns=list(gtf_df.columns), errors='ignore').join(gtf_df)

# gtf_df stores chromosome names without prefix; add 'chr' back.
# Genes without a GTF match keep a missing value here.
adata.var['chromosome'] = 'chr' + adata.var['chromosome']

In [None]:
# Remove scanpy's log1p bookkeeping and put the raw counts back into X.
# pop() rather than del, so re-running the cell does not raise KeyError.
adata.uns.pop('log1p', None)
# Copy so that X and layers['counts'] are independent matrices; without it an
# in-place transform of X would also modify the stored raw counts.
adata.X = adata.layers['counts'].copy()

In [None]:
import infercnvpy as cnv
import scanpy as sc

# ---- 1. Preprocess for infercnv ----
# Reference populations; the labels must exist in adata.obs['celltypes'].
reference_groups = ["Stromal","Myeloid","Lymphoid"]

# infercnvpy expects log-normalized expression, but X was reset to raw counts in the
# previous cell. Build a temporary log-normalized layer from the stored counts
# (same target_sum as used for clustering) and point infercnv at it.
lognorm = sc.pp.normalize_total(adata, target_sum=10000, layer='counts', inplace=False)['X']
adata.layers['infercnv_lognorm'] = sc.pp.log1p(lognorm, copy=True)
del lognorm

cnv.tl.infercnv(
    adata,
    reference_key="celltypes",
    reference_cat=reference_groups,
    window_size=250,
    n_jobs=1,
    layer='infercnv_lognorm',
)

# The helper layer is only an input to infercnv; drop it so it is not written to disk.
del adata.layers['infercnv_lognorm']


In [None]:
# ---- 3. Visualization ----
# Heatmap of CNV profiles (chromosomes along x, cells along y)
cnv.pl.chromosome_heatmap(adata, groupby="cellsubtypes")

In [None]:
# NOTE: adata_path is the same file this sample was read from, so the input h5ad is
# overwritten in place with the filtered, annotated object.
adata.write_h5ad(adata_path)

# Reannotate full cohort

In [None]:
# Read every per-sample AnnData except the two excluded entries and tag each
# object with the sample it came from.
adatas = []
for spl in tqdm(sample_list):
    if spl not in ('full_cohort', 'LIB-064888st1'):
        adata = sc.read_h5ad(base_dir / spl / 'adata.h5ad')
        adata.obs['sample_id'] = spl
        adatas.append(adata)

In [None]:
# Stack all loaded samples into one cohort-level AnnData.
# NOTE(review): AnnData.concatenate is deprecated upstream in favour of anndata.concat;
# the two differ in how obs/var columns are merged, so a switch needs checking against
# the columns used below (celltypes, sample_id) and the obs-name suffixes.
adata = adatas[0].concatenate(*adatas[1:])

In [None]:
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [None]:
adata = adata[adata.obs.total_counts>=50].copy()

In [None]:
adata = adata[adata.obs.celltypes!='Noise'].copy()

In [None]:
adata.layers['counts'] = adata.X.copy()

In [None]:
# Library-size normalisation followed by log1p on the pooled cohort.
# NOTE(review): the per-sample sections pass target_sum=10000, whereas no target_sum is
# given here -- confirm the difference is intended.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

In [None]:
sc.tl.pca(adata)

In [None]:
sc.external.pp.bbknn(adata, batch_key='sample_id')

In [None]:
sc.tl.umap(adata)

In [None]:
sc.pl.umap(adata, color=['sample_id','celltypes'])

# Malignant & epithelial

In [None]:
subadata = adata[(adata.obs.celltypes.isin(['Malignant','Epithelial']))].copy()

In [None]:
sc.external.pp.bbknn(subadata, batch_key='sample_id')

In [None]:
sc.tl.umap(subadata)

In [None]:
sc.tl.leiden(subadata, resolution=2, flavor="igraph", n_iterations=2)

In [None]:
sc.pl.umap(subadata, color=['sample_id','celltypes','leiden'])

In [None]:
sc.pl.umap(subadata, color=['sample_id','log1p_total_counts','pct_counts_in_top_50_genes'])

In [None]:
vc = subadata.obs.leiden.value_counts()
to_group = vc[vc<1000].index

In [None]:
subadata.obs.leiden = subadata.obs.leiden.replace({i: 'Other' for i in to_group})

In [None]:
subadata.obs[['leiden','sample_id']].value_counts().unstack()

In [None]:
sc.tl.rank_genes_groups(subadata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per cluster label (including the pooled 'Other').
dgex = {
    cluster: sc.get.rank_genes_groups_df(subadata, group=cluster)
    for cluster in subadata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for cl in dgex:
    print(cl)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(dgex[cl].head(50)['names'].to_numpy())

Super-condensed view

AT2-like adenocarcinoma: 2, 6, 10

Secretory / club-like adenocarcinoma: 0, 1

Inflamed / invasive SPP1–S100–high squamous carcinoma: 3, 12, 13

Basal–progenitor squamous carcinoma: 4, 5, 14

EMT / invasive-front tumor cells: 0 (edge), 11

Stem-like / SOX2–FGFR1–HOX–high: 7, 8, 9, Other

In [None]:
# Map the Leiden clusters of the malignant/epithelial subset to tumour-state labels,
# following the "Super-condensed view" above.
# NOTE(review): that summary lists cluster 0 under both the secretory/club-like and the
# EMT group (labelled EMT here), and lists 'Other' with the stem-like group while it is
# labelled 'Noise' here -- confirm both choices.
subadata.obs['cellsubtypes'] = subadata.obs.leiden.replace({'0': 'Malignant - EMT',
                             '1': 'Malignant - Secretory/club-like adenocarcinoma',
                            '2': 'Malignant - AT2-like adenocarcinoma',
                            '3': 'Malignant - Inflamed/invasive squamous-like carcinoma',
                            '4': 'Malignant - Basal/progenitor squamous carcinoma',
                            '5': 'Malignant - Basal/progenitor squamous carcinoma',
                            '6': 'Malignant - AT2-like adenocarcinoma',
                            '7': 'Malignant - Stem-like',
                            '8': 'Malignant - Stem-like',
                            '9': 'Malignant - Stem-like',
                            '10': 'Malignant - AT2-like adenocarcinoma',
                             '11': 'Malignant - EMT',
                            '12': 'Malignant - Inflamed/invasive squamous-like carcinoma',
                            '13': 'Malignant - Inflamed/invasive squamous-like carcinoma',
                            '14': 'Malignant - Basal/progenitor squamous carcinoma', 'Other': 'Noise'})

In [None]:
subadata.obs[['cellsubtypes','sample_id']].to_csv('nsclc_malep_ct.csv')

In [None]:
subadata.obs[['sample_id','cellsubtypes',]].value_counts().unstack()

In [None]:
arr = pd.read_csv('nsclc_malep_ct.csv',index_col=0)

In [None]:
subadata = adata[(adata.obs.celltypes=='Epithelial')].copy()

In [None]:
sc.external.pp.bbknn(subadata, batch_key='sample_id')

In [None]:
sc.tl.umap(subadata)

In [None]:
sc.tl.leiden(subadata, resolution=0.8, flavor="igraph", n_iterations=2)

In [None]:
sc.pl.umap(subadata, color=['sample_id','celltypes','leiden'])

In [None]:
sc.tl.rank_genes_groups(subadata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster of the epithelial subset.
dgex = {
    cluster: sc.get.rank_genes_groups_df(subadata, group=cluster)
    for cluster in subadata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for cl in dgex:
    print(cl)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(dgex[cl].head(50)['names'].to_numpy())

| Cluster | Likely normal identity                     | Status                          | ADC-associated flags (if any)  |
| ------- | ------------------------------------------ | ------------------------------- | ------------------------------ |
| 1       | **AT2 pneumocytes**                        | Healthy, IEG/hypoxia-responsive | —                              |
| 4       | **Club/goblet secretory epithelium**       | Likely healthy                  | MSLN, MMP7 (context-dependent) |
| 3       | **Club-like secretory epithelium**         | Likely healthy, OXPHOS-high     | CEACAM6 (soft)                 |
| 0       | **Injury/repair epithelium (partial EMT)** | Likely reactive/healthy         | MMP11, CTHRC1 (watchlist)      |
| 2       | **Inflamed epithelial–immune interface**   | Likely healthy                  | CEACAM6 (soft)                 |


In [None]:
# Label the five Leiden clusters of the epithelial subset.
# NOTE(review): the table above calls cluster 0 "Injury/repair epithelium (partial EMT)",
# status "Likely reactive/healthy", yet it is labelled 'Malignant - pEMT' here -- confirm.
subadata.obs['cellsubtypes'] = subadata.obs.leiden.replace({'0': 'Malignant - pEMT',
                             '1': 'Epithelial - AT2',
                            '2': 'Epithelial - immune interface',
                            '3': 'Epithelial - Club-like secretory',
                            '4': 'Epithelial - Club-like secretory',})

In [None]:
subadata.obs[['cellsubtypes','sample_id']].to_csv('nsclc_ep_ct.csv')

In [None]:
subadata.obs[['sample_id','cellsubtypes',]].value_counts().unstack()

# Lymphoid

In [None]:
subadata = adata[(adata.obs.celltypes=='Lymphoid')].copy()

In [None]:
sc.external.pp.bbknn(subadata, batch_key='sample_id')

In [None]:
sc.tl.umap(subadata)

In [None]:
sc.tl.leiden(subadata, resolution=0.8, flavor="igraph", n_iterations=2)

In [None]:
sc.pl.umap(subadata, color=['sample_id','celltypes','leiden'])

In [None]:
sc.tl.rank_genes_groups(subadata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster of the lymphoid subset.
dgex = {
    cluster: sc.get.rank_genes_groups_df(subadata, group=cluster)
    for cluster in subadata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for cl in dgex:
    print(cl)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(dgex[cl].head(50)['names'].to_numpy())

| Cluster | Lymphoid call               | Confidence  | Notes                                               |
| ------- | --------------------------- | ----------- | --------------------------------------------------- |
| 2       | CD8⁺ T / NK effector        | High        | GNLY/PRF1/GZMB core; some myeloid/stromal spillover |
| 6       | Activated B → plasmablast   | High        | IFN/APC/ER-stress genes; transitional ASC           |
| 8       | Plasmablasts / plasma cells | High        | PRDM1/XBP1/MZB1/CD38                                |
| 4       | Plasma cells (IgA/IgG)      | Medium-High | JCHAIN/XBP1; stromal admixture                      |
| 7       | Plasma cells (IgG-rich)     | Medium-High | IGHG1/3; stromal neighbors                          |
| 3       | TLS-like Tfh–B niche        | Medium      | CXCL13/CCL19/CXCL9/10; APC markers                  |
| 1       | AT2 epithelium              | — (exclude) | Epithelial contamination                            |
| 0       | Secretory/AT2 + club        | — (exclude) | Epithelial doublets                                 |
| 5       | Plasma + CAF mixture        | Medium      | Consider deconvolution / stricter filtering         |


In [None]:
# Label the Leiden clusters of the lymphoid subset, following the table above.
# Clusters 0 and 1 (epithelial contamination in the table) are re-assigned to
# 'Epithelial - AT2'; cluster 5 (plasma + CAF mixture) is set aside as 'Noise'.
subadata.obs['cellsubtypes'] = subadata.obs.leiden.replace({'0': 'Epithelial - AT2',
                             '1': 'Epithelial - AT2',
                            '2': 'Lymphoid - CD8+ T',
                            '3': 'Lymphoid - B',
                            '4': 'Lymphoid - Plasma (IgA/IgG)',
                            '5': 'Noise',
                            '6': 'Lymphoid - Plasma',
                            '7': 'Lymphoid - Plasma (IgG)',
                            '8': 'Lymphoid - Plasma',})

In [None]:
subadata.obs[['cellsubtypes','sample_id']].to_csv('nsclc_lymph_ct.csv')

In [None]:
subadata.obs[['sample_id','cellsubtypes',]].value_counts().unstack()

# Myeloid

In [None]:
subadata = adata[(adata.obs.celltypes=='Myeloid')].copy()

In [None]:
sc.external.pp.bbknn(subadata, batch_key='sample_id')

In [None]:
sc.tl.umap(subadata)

In [None]:
sc.tl.leiden(subadata, resolution=1.2, flavor="igraph", n_iterations=2)

In [None]:
sc.pl.umap(subadata, color=['sample_id','celltypes','leiden'])

In [None]:
subadata.obs[['leiden','sample_id']].value_counts().unstack()

In [None]:
sns.boxplot(data=pd.concat([subadata[:,'EPCAM'].to_df(),subadata.obs],axis=1), x='leiden', y='EPCAM')

In [None]:
sc.tl.rank_genes_groups(subadata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster of the myeloid subset.
dgex = {
    cluster: sc.get.rank_genes_groups_df(subadata, group=cluster)
    for cluster in subadata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for cl in dgex:
    print(cl)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(dgex[cl].head(50)['names'].to_numpy())

| Cluster | Myeloid call                        | Confidence | Key features                             |
| ------- | ----------------------------------- | ---------- | ---------------------------------------- |
| 3       | C1Q⁺ APC-high TAMs                  | High       | C1QA/B/C, CD68, IFI30/CTSS, APOE         |
| 6       | C1Q⁺ TAMs (plasma-rich niche)       | Med–High   | C1Q, CD68 + XBP1/MZB1/JCHAIN             |
| 8       | C1Q⁺ TAMs (stroma/plasma admixture) | Med–High   | C1Q, CD68, IFI30 + Igs, COLs             |
| 7       | IFN-activated macrophages           | Medium     | STAT1, CXCL10, TAPBP, C1Q                |
| 1       | NR4A/IEG mono-mac (activated)       | Low–Med    | FOS/JUN, NR4A1/2, KLF2/4; endo/epi bleed |
| 0       | Epithelial (squamous)               | —          | KRTs, TP63, CLDN1, SPP1/CXCL8            |
| 2       | Epithelial (inflamed)               | —          | KRTs + SPP1/CXCL8/S100A8/A9              |
| 4       | Epithelial (oncogenic)              | —          | SOX2/FGFR1/TP63, IL1B                    |
| 5       | Epithelial (oncogenic)              | —          | SOX2/FGFR1/CLDN4/KRT8                    |


In [None]:
# Label the Leiden clusters of the myeloid subset.
# NOTE(review): this mapping disagrees with the table above in several places --
#   * clusters 6 and 8 are described as C1Q+ TAMs but labelled 'Malignant - EMT';
#   * cluster 7 is described as IFN-activated macrophages but labelled 'Noise';
#   * clusters 4 and 5 are described as SOX2/FGFR1 "Epithelial (oncogenic)" but
#     labelled 'Malignant - Inflamed/invasive squamous-like carcinoma'.
# Confirm the labels against the clustering this cell was actually run on.
subadata.obs['cellsubtypes'] = subadata.obs.leiden.replace({'0': 'Epithelial - Squamous',
                             '1': 'Myeloid - Monocyte/macrophage',
                            '2': 'Epithelial - Inflamed',
                            '3': 'Myeloid - C1Q+ TAM',
                            '4': 'Malignant - Inflamed/invasive squamous-like carcinoma',
                            '5': 'Malignant - Inflamed/invasive squamous-like carcinoma',
                            '6': 'Malignant - EMT',
                            '7': 'Noise',
                            '8': 'Malignant - EMT',})

In [None]:
subadata.obs[['cellsubtypes','sample_id']].to_csv('nsclc_myeloid_ct.csv')

In [None]:
subadata.obs[['sample_id','cellsubtypes',]].value_counts().unstack()

# Stromal

In [None]:
subadata = adata[(adata.obs.celltypes=='Stromal')].copy()

In [None]:
sc.external.pp.bbknn(subadata, batch_key='sample_id')

In [None]:
sc.tl.umap(subadata)

In [None]:
sc.tl.leiden(subadata, resolution=1.2, flavor="igraph", n_iterations=2)

In [None]:
sc.pl.umap(subadata, color=['sample_id','celltypes','leiden'])

In [None]:
subadata.obs.leiden.value_counts()

In [None]:
sc.tl.rank_genes_groups(subadata, groupby='leiden', method='wilcoxon')

In [None]:
# One differential-expression table per Leiden cluster of the stromal subset.
dgex = {
    cluster: sc.get.rank_genes_groups_df(subadata, group=cluster)
    for cluster in subadata.obs['leiden'].unique()
}

In [None]:
# Print the first 50 gene names of every cluster's DE table.
for cl in dgex:
    print(cl)
    # Series.ravel() is deprecated since pandas 2.2; to_numpy() returns the same 1-D array.
    print(dgex[cl].head(50)['names'].to_numpy())

| Cluster | Stromal call                         | Confidence | Hallmarks                                     |
| ------- | ------------------------------------ | ---------- | --------------------------------------------- |
| 0       | myCAF (contractile/desmoplastic)     | High       | COLs, SPARC, TAGLN, ACTA2, POSTN, THBS2, MMPs |
| 6       | iCAF (IEG/inflammatory)              | Med–High   | FOS/JUN/NR4A, IL6, SERPINE1, CTGF + ECM       |
| 1       | Invasive COL11A1⁺ CAF                | High       | COL11A1/12A1, POSTN, THBS2, CTHRC1, MMPs      |
| 4       | Matrix CAFs w/ immune admixture      | Med–High   | ECM core + CD74/IFI30/JCHAIN/IGs              |
| 9       | myCAF (SMC-like) w/ plasma admixture | Medium     | ACTA2/TAGLN/MYL9 + XBP1/MZB1/Igs              |
| 8       | Macrophages (not fibroblast)         | —          | CD74, IFI30, C1Q, TYROBP                      |
| 7       | Epithelial – reactive basal/squamous (not fibroblast)               | —          | Keratin-high, injury/inflammation-associated  |
| 5       | Epithelial + endothelial – AT2–endothelial interface (not fibroblast) | —        | Alveolar surfactant genes + VWF/PLVAP         |
| 2       | Epithelial – club/AT2 secretory, inflammatory (not fibroblast)      | —          | SCGB3A1⁺ + IL6 / NR4A / PTGDS                 |
| 3       | Lymphoid – plasma (ASC) (not fibroblast)                            | —          | XBP1⁺ MZB1⁺ IGH⁺, mature antibody-secreting   |



In [None]:
# Label the Leiden clusters of the stromal subset, following the table above.
# Clusters the table identifies as non-fibroblast (2, 3, 5, 7, 8) are re-assigned to
# epithelial, lymphoid, malignant or myeloid labels instead of a 'Stromal - ...' one.
subadata.obs['cellsubtypes'] = subadata.obs.leiden.replace({'0': 'Stromal - myCAF',
                             '1': 'Stromal - Invasive CAF',
                            '2': 'Epithelial - AT2',
                            '3': 'Lymphoid - Plasma',
                            '4': 'Stromal - Matrix-remodeling',
                            '5': 'Epithelial - AT2',
                            '6': 'Stromal - iCAF',
                            '7': 'Malignant - Basal/progenitor squamous carcinoma',
                            '8': 'Myeloid - C1Q+ TAM',
                            '9': 'Stromal - myCAF',})

In [None]:
# Persist the stromal subtype calls (obs index + subtype + sample) so the
# "Group all together" section can reload them.
subadata.obs.loc[:, ['cellsubtypes', 'sample_id']].to_csv('nsclc_stromal_ct.csv')

In [None]:
# Cell counts per (sample_id, cellsubtypes) pair, pivoted so that the last
# level (cellsubtypes) becomes the columns.
(
    subadata.obs
    .loc[:, ['sample_id', 'cellsubtypes']]
    .value_counts()
    .unstack()
)

# Perivascular

In [None]:
# Independent copy of the cells whose `celltypes` label is exactly 'Perivascular'.
subadata = adata[adata.obs['celltypes'] == 'Perivascular', :].copy()

In [None]:
# Run BBKNN on the perivascular subset, using `sample_id` as the batch key.
# NOTE(review): presumably relies on an embedding carried over from `adata` — confirm.
sc.external.pp.bbknn(subadata, batch_key='sample_id')

In [None]:
# Compute the UMAP embedding for the perivascular subset.
sc.tl.umap(subadata)

In [None]:
# Leiden clustering of the subset (resolution 1.2, igraph flavor, 2 iterations).
# The cells below read the resulting cluster ids from `subadata.obs.leiden`.
sc.tl.leiden(subadata, resolution=1.2, flavor="igraph", n_iterations=2)

In [None]:
# UMAP of the subset coloured by sample, cell type and Leiden cluster.
sc.pl.umap(subadata, color=['sample_id','celltypes','leiden'])

In [None]:
# Number of cells assigned to each Leiden cluster.
subadata.obs['leiden'].value_counts()

In [None]:
# Rank genes for every Leiden cluster of the subset using the Wilcoxon method.
sc.tl.rank_genes_groups(subadata, groupby='leiden', method='wilcoxon')

In [None]:
# One ranked-genes table per Leiden cluster, keyed by cluster id.
dgex = {
    cluster: sc.get.rank_genes_groups_df(subadata, group=cluster)
    for cluster in subadata.obs['leiden'].unique()
}

In [None]:
# Print the 50 top-ranked gene names of every cluster for manual annotation.
# `.to_numpy()` replaces `Series.ravel()`, which is deprecated since pandas 2.2;
# the printed array is the same.
for cl, ranked_genes in dgex.items():
    print(cl)
    print(ranked_genes['names'].head(50).to_numpy())

| Cluster | Cell Type     | Subtype                                   | Confidence | Key Features                    |
| ------- | ------------- | ----------------------------------------- | ---------- | ------------------------------- |
| **10**  | Endothelial   | Activated venous/capillary                | High       | PECAM1⁺ CALCRL⁺ SPARCL1⁺ EGFL7⁺ |
| **6**   | Mural         | Endothelial–pericyte transitional         | High       | PECAM1⁺ COL4A1⁺ CSRP1⁺ CTGF⁺    |
| **5**   | Mural         | vSMC / contractile pericyte               | High       | ACTA2⁺ TAGLN⁺ COL4A1/2⁺         |
| **9**   | Endothelial   | Quiescent capillary                       | High       | VWF⁺ CLDN5⁺ ACKR1⁺ KLF2⁺        |
| **3**   | Stromal       | Perivascular fibroblast / adventitial CAF | Med–High   | ECM-rich (COLs, FN1, SPARC)     |
| **1**   | Immune        | Perivascular macrophage (C1Q⁺ TAM)        | High       | C1QA/B/C, TYROBP, CXCL10⁺       |
| **7**   | Immune        | SPP1⁺ pro-angiogenic macrophage           | High       | SPP1⁺ CXCL9/10⁺ CD68⁺           |
| **0**   | Immune/Stroma | Plasma–fibroblast mix                     | Medium     | Ig⁺ + collagen genes            |
| **8**   | Immune        | Plasma cells (ASC)                        | High       | XBP1⁺ MZB1⁺ IG⁺                 |
| **4**   | Immune        | Highly secretory plasma cells             | High       | XBP1⁺ HERPUD1⁺ SEC11C⁺          |
| **2**   | Epithelial    | Alveolar type II                          | High       | SFTPA/B/C⁺ NAPSA⁺ EPCAM⁺        |


In [None]:
# Curated subtype label for each Leiden cluster of the perivascular subset.
# Several clusters deliberately share a label (4/8, 5/6, 9/10).
# NOTE(review): the annotation table above describes cluster 0 as a
# "Plasma–fibroblast mix" (medium confidence) but it is labelled myCAF here — confirm.
perivascular_cluster_labels = {
    '0': 'Stromal - myCAF',
    '1': 'Myeloid - C1Q+ TAM',
    '2': 'Epithelial - AT2',
    '3': 'Stromal - perivascular fibroblast',
    '4': 'Lymphoid - Plasma',
    '5': 'Perivascular - Mural',
    '6': 'Perivascular - Mural',
    '7': 'Myeloid - SPP1+ TAM',
    '8': 'Lymphoid - Plasma',
    '9': 'Perivascular - endothelial venous/capillary',
    '10': 'Perivascular - endothelial venous/capillary',
}

# Cast to object before replacing: dict-based `Series.replace` on a categorical
# column (which `leiden` is expected to be) is deprecated since pandas 2.2 when it
# changes/merges categories. Cluster ids missing from the mapping are kept as-is,
# exactly like the previous `replace` call did.
subadata.obs['cellsubtypes'] = (
    subadata.obs['leiden']
    .astype(object)
    .replace(perivascular_cluster_labels)
    .astype('category')
)

In [None]:
# Persist the perivascular subtype calls (obs index + subtype + sample) so the
# "Group all together" section can reload them.
subadata.obs.loc[:, ['cellsubtypes', 'sample_id']].to_csv('nsclc_perivascular_ct.csv')

In [None]:
# Cell counts per (sample_id, cellsubtypes) pair, pivoted so that the last
# level (cellsubtypes) becomes the columns.
(
    subadata.obs
    .loc[:, ['sample_id', 'cellsubtypes']]
    .value_counts()
    .unstack()
)

# Group all together

In [None]:
# Reload the per-compartment annotation tables, using the first CSV column as index.
dfs = [
    pd.read_csv(f'nsclc_{compartment}_ct.csv', index_col=0)
    for compartment in ('malep', 'lymph', 'myeloid', 'stromal', 'perivascular')
]

In [None]:
# Stack the per-compartment tables row-wise; from here on `dfs` is a single DataFrame.
dfs = pd.concat(dfs, axis=0)

In [None]:
# The coarse cell type is whatever precedes the first ' - ' in the subtype label.
dfs['refined_celltypes'] = dfs['cellsubtypes'].str.split(' - ').str.get(0)

In [None]:
# Rename the subtype column so it matches the `refined_` naming of the new column.
dfs = dfs.rename({'cellsubtypes': 'refined_cellsubtypes'}, axis='columns')

In [None]:
# Drop the last '-'-separated token of every index entry, keeping what precedes it.
# `rpartition` yields '' when there is no '-', same as splitting and re-joining.
idx = [obs_name.rpartition('-')[0] for obs_name in dfs.index]

In [None]:
# New index of the form '<trimmed obs name>::<sample_id>'; both sides are aligned
# positionally (fresh 0..n-1 index on each).
dfs.index = pd.Series(idx).str.cat(
    dfs['sample_id'].reset_index(drop=True),
    sep='::',
)

In [None]:
# Write the combined refined annotation table (all compartments) to disk.
dfs.to_csv('full_refined_celltypes.csv')

# Homogenize previous labels

In [None]:
# Lookup from every historical subtype spelling to its harmonised label.
# Written as (harmonised label, spellings that collapse into it) groups and
# expanded into a flat {spelling: harmonised label} dict; key order matches the
# group order below.
cell_subtype_mapping = {
    alias: canonical
    for canonical, aliases in (
        # --- Stromal ---
        ("Stromal - CAF myofibroblastic", ("Stromal - CAF myofibroblastic",)),
        ("Stromal - CAF matrix-remodeling", ("Stromal - CAF matrix-remodeling",)),
        ("Stromal - CAF inflammatory", ("Stromal - CAF inflammatory",)),
        ("Stromal - CAF perivascular", (
            "Stromal - CAF perivascular",
            "Stromal - Activated CAF/perivascular",
        )),
        # --- Malignant (grouped by epithelial origin and features) ---
        ("Malignant - basal/squamous", (
            "Malignant - basal/suprabasal",
            "Malignant - basal/squamous (SOX2/FGFR1)",
            "Malignant - basal/squamous",
            "Malignant - squamous",
            "Malignant - basal",
            "Malignant - suprabasal/basal",
            "Malignant - suprabasal/squamous",
            "Malignant - WNT-active",
            "Malignant - EGFR+",
        )),
        ("Malignant - alveolar/secretory", (
            "Malignant - AT2/secretory",
            "Malignant - AT2",
            "Malignant - AT2/club",
            "Malignant - AT1/AT2",
        )),
        ("Malignant - secretory/mucinous", (
            "Malignant - mucinous/secretory",
            "Malignant - mucous/secretory",
            "Malignant - inflammatory/secretory",
        )),
        ("Malignant - EMT/stress", (
            "Malignant - EMT/stress (hypoxia/invasive)",
            "Malignant - EMT",
            "Malignant - stress",
            "Malignant - stressed/inflammatory",
            "Malignant - inflamed/invasive",
            "Malignant - stromal interface",
            "Malignant - progenitor",
        )),
        # --- Epithelial (non-malignant) ---
        ("Epithelial - alveolar/secretory", (
            "Epithelial - AT2",
            "Epithelial - mucous/secretory",
            "Epithelial - secretory/club",
        )),
        ("Epithelial - interface", ("Epithelial - stromal interface",)),
        # --- Myeloid ---
        ("Myeloid - macrophage/TAM", (
            "Myeloid - TAM",
            "Myeloid - TAM/monocytes",
        )),
        ("Myeloid - monocyte/neutrophil", ("Myeloid - neutrophils/monocytes",)),
        ("Myeloid - inflammatory", ("Myeloid - inflamed/chemokine-high",)),
        ("Myeloid - mast", ("Myeloid - mast",)),
        # --- Lymphoid ---
        ("Lymphoid - plasma cell", (
            "Lymphoid - Plasma",
            "Lymphoid - plasma",
        )),
        ("Lymphoid - B cell", (
            "Lymphoid - GC B",
            "Lymphoid - B",
        )),
        ("Lymphoid - mixed lymphoid", (
            "Lymphoid - B/T",
            "Lymphoid - T/B",
        )),
        # --- Perivascular ---
        ("Endothelial", (
            "Perivascular - Endothelial",
            "Perivascular - endothelial",
        )),
        ("Perivascular - smooth muscle", ("Perivascular - smooth muscle/pericytes",)),
    )
    for alias in aliases
}


In [None]:
# Collapse the historical subtype spellings onto their harmonised labels.
# - Uses `cell_subtype_mapping`, the dict built in the previous cell; the former
#   name `homogenized_labels` is not defined in the visible notebook and would
#   raise NameError on a fresh kernel.
#   NOTE(review): confirm no differently-populated `homogenized_labels` was intended.
# - Casts to object first because dict-based `replace` on a categorical column is
#   deprecated since pandas 2.2 when it merges categories; missing values and
#   labels absent from the mapping pass through unchanged.
# - Assigns through `obs[...]` rather than attribute access, so the column is
#   written even if it does not exist yet.
adata.obs['cellsubtypes'] = (
    adata.obs['cellsubtypes']
    .astype(object)
    .replace(cell_subtype_mapping)
    .astype('category')
)

In [None]:
# UMAP of the full object coloured by cell type and by cell subtype.
sc.pl.umap(adata, color=['celltypes','cellsubtypes'])

In [None]:
# Save the annotated object as `full_cohort.h5ad` directly inside `base_dir`.
# NOTE(review): `sample_list` at the top of the notebook is built from
# `base_dir.iterdir()`, so this file will show up there on the next run.
adata.write_h5ad(base_dir.joinpath('full_cohort.h5ad'))