### PBMC annotation

**Objective**
Assign broad immune cell types to the Leiden clusters of the PBMC dataset (cluster annotation).

*Which immune cell types correspond to the transcriptional clusters identified in the dataset?* 

**Input**
- Clustered PBMC object from Notebook 04

**Methods**
- Inspection of canonical marker genes
- Manual annotation of clusters (T, B, NK, Myeloid/Monocytes, DC, Platelet)

**Output**
- Annotated PBMC object
- Table: correspondence between each cluster and its assigned cell type

Clusters were annotated by comparing the mean expression and proportion of cells expressing canonical PBMC lineage markers across Leiden clusters.
Assignments were based on coherent multi-marker profiles rather than single genes.
This annotation step assigns biological meaning to the unsupervised clustering and enables downstream analyses focused on specific immune cell populations.


In [21]:
# Setup and load the clustered object

import os
import scanpy as sc
import pandas as pd

# Project root. Defaults to the original local checkout, but can be overridden
# with the SCRNA_PROJECT_ROOT environment variable so the notebook can run on
# another machine without editing this cell.
PROJECT_ROOT = os.environ.get(
    "SCRNA_PROJECT_ROOT",
    "/mnt/c/Users/yasmi/OneDrive/Desktop/Mini-Projets/scRNA_Influenza_Patients",
)
# Every "results/..." path used below is relative, so it resolves against this directory.
os.chdir(PROJECT_ROOT)

# Clustered AnnData object (listed as the Notebook 04 output in the header above).
adata = sc.read_h5ad("results/adata_cond_clustered.h5ad")

# Sanity checks: the metadata columns and the Leiden clustering must be present.
print("condition?", "condition" in adata.obs.columns)
print("sample_id?", "sample_id" in adata.obs.columns)
print("clusters:", adata.obs["leiden"].nunique())


condition? True
sample_id? True
clusters: 12


  utils.warn_names_duplicates("obs")


In [22]:
## Definition of canonical PBMC marker panels

import pandas as pd
import scanpy as sc

# Canonical marker panels, one list of gene symbols per lineage
# (must exist; if already defined above, keep this cell as-is)
PANELS = {
    "T": ["CD3D", "CD3E", "TRAC", "TRBC1", "TRBC2", "IL7R", "CCR7", "LTB", "LCK", "CD247"],
    "NK": ["NKG7", "GNLY", "PRF1", "GZMB", "FCGR3A"],
    "B": ["MS4A1", "CD79A", "CD74", "HLA-DRA"],
    "Myeloid": ["LYZ", "S100A8", "S100A9", "FCN1", "LST1", "CTSS", "MS4A7"],
    "DC": ["FCER1A", "CST3"],
    "Platelet": ["PPBP", "PF4"],
}

# Unique marker symbols over all panels, then keep only those found in adata.var_names
all_genes = sorted({marker for markers in PANELS.values() for marker in markers})
present = [marker for marker in all_genes if marker in adata.var_names]

# Per-cell expression of the available markers, tagged with the cell's Leiden cluster
df = sc.get.obs_df(adata, keys=present).assign(
    leiden=adata.obs["leiden"].astype(str).values
)

# Per-cluster summaries: mean expression and percentage of cells with expression > 0
by_cluster = df.groupby("leiden")[present]
mean_expr = by_cluster.mean()
pct_expr = by_cluster.apply(lambda block: (block > 0).mean() * 100)

# One score per (cluster, panel): average of the per-gene cluster means
panel_mean_by_cluster = pd.DataFrame(index=mean_expr.index)
for panel, genes in PANELS.items():
    genes_present = [g for g in genes if g in present]
    if not genes_present:
        # No marker of this panel is in the dataset: no column is created
        continue
    panel_mean_by_cluster[panel] = mean_expr[genes_present].mean(axis=1)

print("panel_mean_by_cluster created:", panel_mean_by_cluster.shape)


panel_mean_by_cluster created: (12, 6)


In [23]:
## Marker availability: which panel genes are in the dataset, and which are not

# Unique marker symbols across every panel, in alphabetical order
all_genes = sorted(set().union(*PANELS.values()))

# Membership flag per marker, then split the list into present / missing
in_dataset = [symbol in adata.var_names for symbol in all_genes]
present = [symbol for symbol, found in zip(all_genes, in_dataset) if found]
missing = [symbol for symbol, found in zip(all_genes, in_dataset) if not found]

print(f"Markers present: {len(present)}/{len(all_genes)}")
print("Example present markers:", present[:15])
print("Example missing markers:", missing[:15])

Markers present: 18/30
Example present markers: ['CD3D', 'CD74', 'CD79A', 'CST3', 'FCER1A', 'FCN1', 'GNLY', 'GZMB', 'HLA-DRA', 'IL7R', 'LYZ', 'MS4A1', 'NKG7', 'PF4', 'PPBP']
Example missing markers: ['CCR7', 'CD247', 'CD3E', 'CTSS', 'FCGR3A', 'LCK', 'LST1', 'LTB', 'MS4A7', 'S100A8', 'S100A9', 'TRBC1']


In [24]:

# Top clusters per lineage: the five clusters with the highest panel score
print("\nTop clusters per lineage (panel_mean_by_cluster):")
for lineage in panel_mean_by_cluster.columns:
    print("\n", lineage)
    print(panel_mean_by_cluster[lineage].sort_values(ascending=False).head(5))

# Save tables (for notebook + README)
mean_expr.to_csv("results/marker_mean_expr_by_cluster.csv")
pct_expr.to_csv("results/marker_pct_expr_by_cluster.csv")
panel_mean_by_cluster.to_csv("results/panel_mean_by_cluster.csv")

# The AnnData object is deliberately NOT written here: `cell_type` has not been
# assigned at this point, so saving it as "results/adata_annotated.h5ad" would
# leave an un-annotated file under the annotated name if the run stops early.
# The annotated object is written by the cell that applies the cluster mapping.


Top clusters per lineage (panel_mean_by_cluster):

 T
leiden
3    1.919293
2    1.123936
6    0.431746
7    0.271169
5    0.122997
Name: T, dtype: float32

 NK
leiden
2    2.966677
9    1.109618
8    0.697369
6    0.512841
7    0.500231
Name: NK, dtype: float32

 B
leiden
6    3.548680
9    2.180609
5    1.821485
4    1.816555
7    0.796505
Name: B, dtype: float32

 Myeloid
leiden
5    3.788818
8    3.782072
4    3.608060
1    1.300400
7    1.061355
Name: Myeloid, dtype: float32

 DC
leiden
5     1.947447
4     1.906664
9     1.806481
10    0.890690
0     0.493213
Name: DC, dtype: float32

 Platelet
leiden
10    5.375428
5     0.146165
9     0.057996
4     0.048092
11    0.044624
Name: Platelet, dtype: float32


In [25]:
## Marker expression profiles by cluster: mean expression and % of expressing cells

# Per-cell expression of the available markers plus the cluster label (as string)
cluster_labels = adata.obs["leiden"].astype(str).values
df = sc.get.obs_df(adata, keys=present).assign(leiden=cluster_labels)

# Group once, then derive both summaries from the same grouping
per_cluster = df.groupby("leiden")[present]
mean_expr = per_cluster.mean()
# A cell counts as "expressing" a gene when its value is strictly positive
pct_expr = per_cluster.apply(lambda cells: (cells > 0).mean() * 100)

In [26]:
## Lineage scoring from the canonical marker panels (one score per lineage: T, NK, B ...)

# Empty score tables, one row per cluster
panel_mean = pd.DataFrame(index=mean_expr.index)
panel_pct = pd.DataFrame(index=pct_expr.index)

for panel, genes in PANELS.items():
    genes_present = [g for g in genes if g in present]
    if genes_present:
        # Panel score = average over the panel's available genes of the
        # per-cluster mean expression / percentage of expressing cells
        panel_mean[panel] = mean_expr[genes_present].mean(axis=1)
        panel_pct[panel] = pct_expr[genes_present].mean(axis=1)
    # Panels with no available gene get no column

panel_mean


Unnamed: 0_level_0,T,NK,B,Myeloid,DC,Platelet
leiden,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.021295,0.089649,0.392296,0.970303,0.493213,0.018721
1,0.091615,0.234733,0.418871,1.3004,0.400865,0.037471
10,0.084396,0.202972,0.362282,0.697905,0.89069,5.375428
11,0.048333,0.132944,0.399118,0.772018,0.36506,0.044624
2,1.123936,2.966677,0.752002,0.528795,0.280129,0.021167
3,1.919293,0.393415,0.528603,0.501393,0.213951,0.014452
4,0.065266,0.245157,1.816555,3.60806,1.906664,0.048092
5,0.122997,0.341355,1.821485,3.788818,1.947447,0.146165
6,0.431746,0.512841,3.54868,0.475363,0.219392,0.019051
7,0.271169,0.500231,0.796505,1.061355,0.49118,0.039093


In [27]:
# Final manual mapping: Leiden cluster id (as string) -> broad cell type.
# NOTE(review): in the panel_mean table displayed above, cluster 1 scores highest
# on Myeloid (1.30 vs NK 0.23), cluster 4 on Myeloid (3.61 vs B 1.82) and
# cluster 11 on Myeloid (0.77 vs DC 0.37), which differs from the labels below.
# The mapping is left unchanged here; confirm these three assignments against
# the per-gene marker profiles.
cluster_to_celltype = {
    "0": "Myeloid",
    "1": "NK",
    "2": "NK",
    "3": "T",
    "4": "B",
    "5": "Myeloid",
    "6": "B",
    "7": "Myeloid",
    "8": "Myeloid",
    "9": "DC",
    "10": "Platelet",
    "11": "DC",
}

# Apply mapping. Series.map returns NaN for keys missing from the dict and
# value_counts() drops NaN by default, so an incomplete mapping would go
# unnoticed; fail loudly instead of saving partially annotated cells.
leiden_labels = adata.obs["leiden"].astype(str)
unmapped = sorted(set(leiden_labels) - set(cluster_to_celltype))
if unmapped:
    raise ValueError(f"Clusters without a cell-type label: {unmapped}")
adata.obs["cell_type"] = leiden_labels.map(cluster_to_celltype)

# Quick check
print("Has cell_type?", "cell_type" in adata.obs.columns)
print(adata.obs["cell_type"].value_counts())


# Save annotated AnnData object (THIS is what NB06 needs)
adata.write("results/adata_annotated.h5ad")

# Export annotation tables (for README / figures)
mean_expr.to_csv("results/marker_mean_expr_by_cluster.csv")
pct_expr.to_csv("results/marker_pct_expr_by_cluster.csv")
panel_mean.to_csv("results/panel_mean_by_cluster.csv")
panel_pct.to_csv("results/panel_pct_by_cluster.csv")

print("Saved: results/adata_annotated.h5ad")


Has cell_type? True
cell_type
Myeloid     44448
NK          36164
B           12275
T            9539
DC            485
Platelet      291
Name: count, dtype: int64
Saved: results/adata_annotated.h5ad
