# Set-up

In [None]:
import os

# NOTE: '__file__' is passed as a string literal, so realpath() resolves it
# against the current working directory; script_dir is therefore that directory.
script_dir = os.path.split(os.path.realpath('__file__'))[0]
parent_dir = os.path.split(script_dir)[0]

# Make the script directory the working directory
os.chdir(script_dir)

## Importing modules

In [None]:
# Import generic libraries
import os
import copy
import random
import multiprocessing
import scanpy as sc
import numpy as nps
import seaborn as sns
import matplotlib.pyplot as plt
import math
import sklearn.metrics
from scipy.stats import ranksums
from scipy.cluster.hierarchy import linkage, leaves_list
from statsmodels.stats.multitest import multipletests
import espressopro as ep

# Import mosaic libraries
import missionbio.mosaic as ms

# Detect the CPU cores and keep one of them free, while never going below one worker
num_cores = multiprocessing.cpu_count()
num_cores_to_use = num_cores - 1 if num_cores > 1 else 1

# Import graph_objects from the plotly package so that figures wrapped in
# go.Figure(...) are kept when the notebook is exported as HTML
import plotly.graph_objects as go

# Import additional packages for specific visuals
import plotly.offline as pyo
# Initialise plotly's offline notebook mode
pyo.init_notebook_mode()
import numpy as np
from itables import init_notebook_mode, show
from itables.sample_dfs import get_dict_of_test_dfs
import itables.options as opt

# Defining itables options
# NOTE(review): dict_of_test_dfs is not referenced again in the visible part of the notebook
dict_of_test_dfs = get_dict_of_test_dfs()
# all_interactive=True: presumably renders every displayed DataFrame as an interactive table — confirm
init_notebook_mode(all_interactive=True)
# Choices offered by the "rows per page" menu
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]
# NOTE(review): 0 presumably disables the size / column-count limits — confirm against the itables docs
opt.maxBytes = 0
opt.maxColumns = 0
# CSS classes applied to the rendered tables
opt.classes = ["display", "nowrap"]

# Other useful packages for downstream cluster analyses
import pandas as pd

# Note: when exporting the notebook as an HTML, plots that use the "go.Figure(fig)" command are saved

In [None]:
pip list

PYTHONHASHSEED was set to 0 as an environment variable as follows:
    
conda env config vars set PYTHONHASHSEED=0

In [None]:
# Seed NumPy's and Python's global random number generators
np.random.seed(42)
random.seed(42)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so this
# assignment presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = '0'

In [None]:
def ensure_pythonhashseed(seed=0):
    """Make sure the process runs with ``PYTHONHASHSEED`` equal to ``seed``.

    When the environment variable is missing or holds a different value, it is
    set and the current process is replaced (``os.execl``) by a new interpreter
    started with the same command-line arguments.

    Parameters
    ----------
    seed : int or str, default 0
        Desired value; converted with ``str()`` before the comparison.

    Returns
    -------
    None
        The function only returns normally when the variable already matches.
    """
    # Imported locally because `sys` is not among the notebook's visible imports;
    # without it the restart branch failed with a NameError.
    import sys

    seed = str(seed)
    current_seed = os.environ.get("PYTHONHASHSEED")

    if current_seed != seed:  # also covers the "variable not set" (None) case
        print(f'Setting PYTHONHASHSEED="{seed}"')
        os.environ["PYTHONHASHSEED"] = seed
        # restart the current process so the new value is picked up
        os.execl(sys.executable, sys.executable, *sys.argv)

In [None]:
import random

# Draw 128 random bits and print them as a 32-digit hexadecimal number.
# The value is stored as `random_bits` so that the built-in hash() is not shadowed.
random_bits = random.getrandbits(128)

print("hash value: %032x" % random_bits)

## Defining paths

In [None]:
# Exported figures are written to the "Figures" folder inside parent_dir
figures_path = os.path.join(parent_dir, "Figures")

# Load Data

In [None]:
# Load the "Multisample PBMC" example dataset through Mosaic.
# NOTE(review): single=False presumably returns a multi-sample group (its .samples list is used below) — confirm
PBMC_samples = ms.load_example_dataset(path="Multisample PBMC", single=False)

In [None]:
# Bind the first two samples of the group to their own names
PBMC_HD01, PBMC_HD02 = (PBMC_samples.samples[idx] for idx in (0, 1))

# <b> Data Overview </b>

In [None]:
# Overview of the protein assay of PBMC_HD01: the assay itself, then the keys of
# its row attributes, column attributes and layers, then every metadata entry
print("\'sample.protein\':", PBMC_HD01.protein, '\n')
for header, mapping in (
    ("\'row_attrs\':", PBMC_HD01.protein.row_attrs),
    ("\'col_attrs\':", PBMC_HD01.protein.col_attrs),
    ("\'layers\':", PBMC_HD01.protein.layers),
):
    print(header, "\n\t", list(mapping.keys()), '\n')
print("\'metadata\':", "\n")
for key in PBMC_HD01.protein.metadata.keys():
    print("\t", key, ": ", PBMC_HD01.protein.metadata[key], sep="")

In [None]:
# Overview of the protein assay of PBMC_HD02: the assay itself, then the keys of
# its row attributes, column attributes and layers, then every metadata entry
print("\'sample.protein\':", PBMC_HD02.protein, '\n')
for header, mapping in (
    ("\'row_attrs\':", PBMC_HD02.protein.row_attrs),
    ("\'col_attrs\':", PBMC_HD02.protein.col_attrs),
    ("\'layers\':", PBMC_HD02.protein.layers),
):
    print(header, "\n\t", list(mapping.keys()), '\n')
print("\'metadata\':", "\n")
for key in PBMC_HD02.protein.metadata.keys():
    print("\t", key, ": ", PBMC_HD02.protein.metadata[key], sep="")

## <b> Protein Analysis </b>

### Filtering non-informative proteins

In [None]:
# Drop the IgG1 / IgG2a / IgG2b features from both samples and keep the filtered assay.
# NOTE(review): presumably these are the isotype-control antibodies — confirm
PBMC_HD01.protein = PBMC_HD01.protein.drop(['IgG1', 'IgG2a', 'IgG2b'])
PBMC_HD02.protein = PBMC_HD02.protein.drop(['IgG1', 'IgG2a', 'IgG2b'])

### Normalisation

In [None]:
# Normalise, then scale, the protein data of each sample with the EspressoPro helpers.
# NOTE(review): return values are not captured, so the helpers presumably modify the
# samples in place and create the 'Normalized_reads' / 'Scaled_reads' layers read later — confirm
ep.Normalise_protein_data(PBMC_HD01)
ep.Scale_protein_data(PBMC_HD01)

ep.Normalise_protein_data(PBMC_HD02)
ep.Scale_protein_data(PBMC_HD02)

### Dimensionality reduction

In [None]:
# Exploratory PCA on the scaled reads with 45 components; show_plot=True requests the accompanying plot
PBMC_HD01.protein.run_pca(attribute='Scaled_reads', components=45,show_plot=True, random_state=42, svd_solver='randomized')
PBMC_HD02.protein.run_pca(attribute='Scaled_reads', components=45,show_plot=True, random_state=42, svd_solver='randomized')

In [None]:
# Re-run PCA keeping 8 components, without the plot.
# NOTE(review): presumably this replaces the 45-component result and is what the UMAP step reads — confirm
PBMC_HD01.protein.run_pca(attribute='Scaled_reads', components=8, show_plot=False, random_state=42, svd_solver='randomized')
PBMC_HD02.protein.run_pca(attribute='Scaled_reads', components=8, show_plot=False, random_state=42, svd_solver='randomized')

In [None]:
# Two-dimensional UMAP computed from the 'pca' attribute, with a fixed random_state
PBMC_HD01.protein.run_umap(attribute='pca', random_state=42, n_neighbors=50, min_dist=0.1, spread=8, n_components=2)
PBMC_HD02.protein.run_umap(attribute='pca', random_state=42, n_neighbors=50, min_dist=0.1, spread=8, n_components=2)

### Clustering

In [None]:
# Graph-community clustering on the 'umap' attribute (k=8, fixed random_state).
# NOTE(review): presumably this writes the 'label' row attribute plotted later — confirm
PBMC_HD01.protein.cluster(attribute='umap', method='graph-community', k=8, random_state=42)
PBMC_HD02.protein.cluster(attribute='umap', method='graph-community', k=8, random_state=42)

# EspressoPro

## Generate predictions

In [None]:
# Dimensions of the PBMC_HD01 protein assay after filtering
PBMC_HD01.protein.shape


In [None]:
# Dimensions of the PBMC_HD02 protein assay after filtering
PBMC_HD02.protein.shape


In [None]:
from pathlib import Path
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Inspect one pre-trained model bundle for scaling steps.
# NOTE(review): absolute, machine-specific path — adjust to the local copy of the pre-trained models.
p = Path("/Users/kgurashi/GitHub/2024__EspressoPro_Manuscript/Data/Pre_trained_models/TotalSeqD_Heme_Oncology_CAT399906/Luecken/Models/Broad_Immature/Broad_Immature_bundle.joblib")
bundle = joblib.load(p)

# 1) Top-level scaler in the bundle?
print("Top-level 'scaler' key present:", 'scaler' in bundle)
print("Top-level scaler object:", bundle.get('scaler'))

# 2) Inside the model: check for any StandardScaler in pipelines
est = bundle.get("model") or bundle.get("Stacked")
stacker = getattr(est, "estimator", est)   # unwrap CalibratedClassifierCV -> StackingClassifier

print("\nBase learners:")
for i, base in enumerate(getattr(stacker, "estimators_", [])):
    if isinstance(base, Pipeline):
        step_names = [name for name, _ in base.steps]
        print(f"  #{i} Pipeline steps:", step_names)
        for name, step in base.steps:
            if isinstance(step, StandardScaler):
                print("     -> StandardScaler found:", step)
    else:
        print(f"  #{i}", type(base).__name__)

# 3) Final estimator (usually LogisticRegression)
# Guarded like `estimators_` above: the attribute is missing when the bundle holds
# no model or when the unwrapped estimator is not fitted.
final_estimator = getattr(stacker, "final_estimator_", None)
if final_estimator is None:
    print("\nFinal estimator: not available")
else:
    print("\nFinal estimator:", type(final_estimator).__name__)


In [None]:
# Print the feature overlap between the sample and the pre-trained models, with a quick preview.
# The return value is discarded on purpose (assigned to `_`).
# NOTE(review): models_path is an absolute, machine-specific path — adjust to the local copy.
_ = ep.audit_feature_overlap(PBMC_HD01, models_path = "/Users/kgurashi/GitHub/2024__EspressoPro_Manuscript/Data/Pre_trained_models/TotalSeqD_Heme_Oncology_CAT399906", base_layer="Normalized_reads", show=15)

In [None]:
from pathlib import Path
import joblib

# Load the Broad_Immature bundle and report which features it was trained on
p = Path("/Users/kgurashi/GitHub/2024__EspressoPro_Manuscript/Data/Pre_trained_models/TotalSeqD_Heme_Oncology_CAT399906/Luecken/Models/Broad_Immature/Broad_Immature_bundle.joblib")
bundle = joblib.load(p)

# Feature list stored under the bundle's "columns" key (empty list when absent)
cols = bundle.get("columns", [])
print(
    f"[bundle] columns count: {len(cols)}",
    f"[bundle] columns: {cols}",
    sep="\n",
)

# Features recorded by the estimator itself, when it exposes them
model = bundle.get("model") or bundle.get("Stacked")
has_names = hasattr(model, "feature_names_in_")
if has_names:
    print(f"[model] feature_names_in_ count: {len(model.feature_names_in_)}")
has_count = hasattr(model, "n_features_in_")
if has_count:
    print(f"[model] n_features_in_: {model.n_features_in_}")


In [None]:
# Compare the bundle's training features with the protein IDs of PBMC_HD01
# (requires `bundle` and `PBMC_HD01` from the cells above)
cols = bundle.get("columns", [])
panel_ids = list(PBMC_HD01.protein.col_attrs["id"])

set_cols, set_ids = set(cols), set(panel_ids)

# features used in training but absent from the sample
missing = sorted(set_cols.difference(set_ids))

print(f"Trained features: {len(cols)}")
print(f"Sample protein IDs: {len(panel_ids)}")
print(f"Overlap count: {len(set_cols.intersection(set_ids))}")
print(f"Missing from sample: {len(missing)} -> {missing}")
print(f"Extra in sample: {len(set_ids.difference(set_cols))} -> {sorted(set_ids.difference(set_cols))}")


In [None]:
# 'Normalized_reads' of PBMC_HD01 as a table; its index and columns are later used as cell and feature names.
# NOTE(review): constraint='row+col' presumably labels both rows and columns — confirm
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
# Display the whole table
df_protein

In [None]:
# Run the EspressoPro predictions on PBMC_HD01; the returned object replaces the sample.
# NOTE(review): presumably this adds the '*.predscore' row attributes read later — confirm
PBMC_HD01 = ep.generate_predictions(obj=PBMC_HD01)

In [None]:
# Run the EspressoPro predictions on PBMC_HD02; the returned object replaces the sample
PBMC_HD02 = ep.generate_predictions(obj=PBMC_HD02)

In [None]:
# Inspect the row attributes of the PBMC_HD01 protein assay after the prediction step
PBMC_HD01.protein.row_attrs

In [None]:
# UMAP scatter plot coloured by the averaged "Simplified cDC" prediction score
fig = PBMC_HD01.protein.scatterplot(attribute='umap',colorby='Averaged.Simplified.cDC.predscore')
# Wrapped in go.Figure so that the plot is kept when the notebook is exported as HTML
go.Figure(fig)

## Use predictions for annotation

In [None]:
# Annotate PBMC_HD01 with EspressoPro; the returned object replaces the sample.
# NOTE(review): presumably this adds the '*.Celltype' row attributes used below — confirm
PBMC_HD01 = ep.annotate_data(obj=PBMC_HD01)

In [None]:
# Annotate PBMC_HD02 with EspressoPro; the returned object replaces the sample
PBMC_HD02 = ep.annotate_data(obj=PBMC_HD02)

## Exploring prediction scores in PBMC - HD01

In [None]:
# Inspect the row attributes of the PBMC_HD01 protein assay after the annotation step
PBMC_HD01.protein.row_attrs

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords

# Map internal column names (one "Broad: Immature" score per atlas) to pretty labels
pretty_names = {
    'Hao.Broad.Immature.predscore': 'Hao - Broad: Immature',
    'Zhang.Broad.Immature.predscore': 'Zhang - Broad: Immature',
    'Triana.Broad.Immature.predscore': 'Triana - Broad: Immature',
    'Luecken.Broad.Immature.predscore': 'Luecken - Broad: Immature'
}

score_cols = list(pretty_names.keys())

# --- Copy the four per-atlas scores into pbmc.obs ---
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        add_outline=True,
        cmap="magma",
        frameon=True,
        size=50,
        alpha=0.9,
        title=[pretty_names[c] for c in score_cols],
        return_fig=True,
        show=False
    )

    # Get all axes in the figure
    axs = fig.axes

    for ax in axs:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and x ticks
        # NOTE(review): y ticks are left untouched, presumably so colorbar tick labels survive — confirm
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])

    plt.tight_layout()

    # Save the figure at 300 dpi BEFORE plt.show(): once the figure has been shown,
    # plt.savefig() would write a new, empty figure. The explicit figure handle is
    # used and the output folder is created if it does not exist yet.
    os.makedirs(figures_path, exist_ok=True)
    fig.savefig(os.path.join(figures_path, "Atlases_Immature_prediction.png"),
                dpi=300, bbox_inches='tight')

    plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
from matplotlib import rc_context

# --- Minimal AnnData wrapper around the Mosaic protein assay (for Scanpy plotting) ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # X is a placeholder; only obs/obsm are used for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords

# Score column shown in this figure -> panel title
pretty_names = {
    'Averaged.Broad.Immature.predscore': 'Averaged - Broad: Immature',
}
score_cols = list(pretty_names)

# Copy the score from the Mosaic row attributes into pbmc.obs
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        title=[pretty_names[c] for c in score_cols],
        cmap="magma",
        add_outline=True,
        frameon=True,
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    # Every axes object of the figure gets the same cosmetic treatment
    axs = fig.axes
    for ax in axs:
        ax.set_title(ax.get_title(), fontweight='bold')  # bold title
        ax.set(xlabel="", ylabel="")                     # no axis labels
        ax.set_xticks([])                                # no x ticks (y ticks are left as they are)

    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
from matplotlib import rc_context

# NOTE(review): this cell appears to be a verbatim repeat of the preceding cell — consider removing one.

# --- Minimal AnnData wrapper around the Mosaic protein assay (for Scanpy plotting) ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # X is a placeholder; only obs/obsm are used for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords

# Score column shown in this figure -> panel title
pretty_names = {
    'Averaged.Broad.Immature.predscore': 'Averaged - Broad: Immature',
}
score_cols = list(pretty_names)

# Copy the score from the Mosaic row attributes into pbmc.obs
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        title=[pretty_names[c] for c in score_cols],
        cmap="magma",
        add_outline=True,
        frameon=True,
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    # Every axes object of the figure gets the same cosmetic treatment
    axs = fig.axes
    for ax in axs:
        ax.set_title(ax.get_title(), fontweight='bold')  # bold title
        ax.set(xlabel="", ylabel="")                     # no axis labels
        ax.set_xticks([])                                # no x ticks (y ticks are left as they are)

    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
from matplotlib import rc_context

# --- Minimal AnnData wrapper around the Mosaic protein assay (for Scanpy plotting) ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # X is a placeholder; only obs/obsm are used for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords

# Score columns shown in this figure (Broad -> Simplified -> Detailed, CD4 T branch) -> panel titles
pretty_names = {
    'Averaged.Broad.Mature.predscore': 'Broad: Mature',
    'Averaged.Constrained.Simplified.CD4_T.predscore': 'Simplified: CD4 T',
    'Averaged.Constrained.Detailed.CD4_T_Memory.predscore': 'Detailed: CD4 T Memory',
    'Averaged.Constrained.Detailed.CD4_T_Naive.predscore': 'Detailed: CD4 T Naive'
}
score_cols = list(pretty_names)

# Copy the four scores from the Mosaic row attributes into pbmc.obs
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        title=[pretty_names[c] for c in score_cols],
        cmap="magma",
        add_outline=True,
        frameon=True,
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    # Every axes object of the figure gets the same cosmetic treatment
    axs = fig.axes
    for ax in axs:
        ax.set_title(ax.get_title(), fontweight='bold')  # bold title
        ax.set(xlabel="", ylabel="")                     # no axis labels
        ax.set_xticks([])                                # no x ticks (y ticks are left as they are)

    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
from matplotlib import rc_context

# --- Minimal AnnData wrapper around the Mosaic protein assay (for Scanpy plotting) ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # X is a placeholder; only obs/obsm are used for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords

# Score columns shown in this figure (Broad -> Simplified -> Detailed, B branch) -> panel titles
pretty_names = {
    'Averaged.Broad.Mature.predscore': 'Broad: Mature',
    'Averaged.Constrained.Simplified.B.predscore': 'Simplified: B',
    'Averaged.Constrained.Detailed.B_Naive.predscore': 'Detailed: B Naive',
    'Averaged.Constrained.Detailed.B_Memory.predscore': 'Detailed: B Memory'
}
score_cols = list(pretty_names)

# Copy the four scores from the Mosaic row attributes into pbmc.obs
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        title=[pretty_names[c] for c in score_cols],
        cmap="magma",
        add_outline=True,
        frameon=True,
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    # Every axes object of the figure gets the same cosmetic treatment
    axs = fig.axes
    for ax in axs:
        ax.set_title(ax.get_title(), fontweight='bold')  # bold title
        ax.set(xlabel="", ylabel="")                     # no axis labels
        ax.set_xticks([])                                # no x ticks (y ticks are left as they are)

    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
from matplotlib import rc_context

# --- Minimal AnnData wrapper around the Mosaic protein assay (for Scanpy plotting) ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # X is a placeholder; only obs/obsm are used for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords

# Score columns shown in this figure (HSPC score without and with the hierarchical constraint) -> panel titles
pretty_names = {
    'Averaged.Broad.Immature.predscore': 'Broad: Immature',
    'Averaged.Simplified.HSPC.predscore': 'Simplified: HSPC',
    'Averaged.Constrained.Simplified.HSPC.predscore': 'Simplified: HSPC (with hierarchical constraint)'
}
score_cols = list(pretty_names)

# Copy the three scores from the Mosaic row attributes into pbmc.obs
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        title=[pretty_names[c] for c in score_cols],
        cmap="magma",
        add_outline=True,
        frameon=True,
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    # Every axes object of the figure gets the same cosmetic treatment
    axs = fig.axes
    for ax in axs:
        ax.set_title(ax.get_title(), fontweight='bold')  # bold title
        ax.set(xlabel="", ylabel="")                     # no axis labels
        ax.set_xticks([])                                # no x ticks (y ticks are left as they are)

    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
from anndata import AnnData
from matplotlib import rc_context

# --- Minimal AnnData wrapper around the Mosaic protein assay (for Scanpy plotting) ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # X is a placeholder; only obs/obsm are used for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords

# Score columns shown in this figure (Broad -> Simplified -> Detailed, HSPC branch) -> panel titles
pretty_names = {
    'Averaged.Broad.Immature.predscore': 'Broad: Immature',
    'Averaged.Constrained.Simplified.HSPC.predscore': 'Simplified: HSPC',
    'Averaged.Constrained.Detailed.HSC_MPP.predscore': 'Detailed: HSC MPP'
}
score_cols = list(pretty_names)

# Copy the three scores from the Mosaic row attributes into pbmc.obs
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        title=[pretty_names[c] for c in score_cols],
        cmap="magma",
        add_outline=True,
        frameon=True,
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    # Every axes object of the figure gets the same cosmetic treatment
    axs = fig.axes
    for ax in axs:
        ax.set_title(ax.get_title(), fontweight='bold')  # bold title
        ax.set(xlabel="", ylabel="")                     # no axis labels
        ax.set_xticks([])                                # no x ticks (y ticks are left as they are)

    plt.tight_layout()
    plt.show()


## Refining annotations

In [None]:
# For each sample: refine the Simplified and Detailed labels with
# ep.refine_labels_by_centroid_knn, then pass both refined columns through
# ep.mark_small_clusters (min_count=3).
# Note: n_fix and _ only keep the values returned by the last refinement call.

# --- PBMC_HD01 ---
for source_col, refined_col in (
    ("Simplified.Celltype", "Simplified.Celltype.Refined"),
    ("Detailed.Celltype", "Detailed.Celltype.Refined"),
):
    n_fix, _, PBMC_HD01 = ep.refine_labels_by_centroid_knn(
        PBMC_HD01, label_col=source_col, out_col=refined_col
    )

for refined_col in ("Simplified.Celltype.Refined", "Detailed.Celltype.Refined"):
    PBMC_HD01 = ep.mark_small_clusters(PBMC_HD01, refined_col, min_count=3)

# --- PBMC_HD02 ---
for source_col, refined_col in (
    ("Simplified.Celltype", "Simplified.Celltype.Refined"),
    ("Detailed.Celltype", "Detailed.Celltype.Refined"),
):
    n_fix, _, PBMC_HD02 = ep.refine_labels_by_centroid_knn(
        PBMC_HD02, label_col=source_col, out_col=refined_col
    )

for refined_col in ("Simplified.Celltype.Refined", "Detailed.Celltype.Refined"):
    PBMC_HD02 = ep.mark_small_clusters(PBMC_HD02, refined_col, min_count=3)

In [None]:
# Suggest cell type identities for PBMC_HD01

# Simplified level: called without rewrite and the return value is discarded
# NOTE(review): presumably this call only reports the suggestions — confirm
ep.suggest_cluster_celltype_identity(
    sample=PBMC_HD01,
    annotation="Simplified.Celltype.Refined")

# Detailed level: rewrite=True, and the returned object replaces the sample
PBMC_HD01 = ep.suggest_cluster_celltype_identity(
    sample=PBMC_HD01,
    annotation="Detailed.Celltype.Refined", rewrite=True)

# Suggest cell type identities for PBMC_HD02

# Simplified level: called without rewrite and the return value is discarded
ep.suggest_cluster_celltype_identity(
    sample=PBMC_HD02,
    annotation="Simplified.Celltype.Refined")

# Detailed level: rewrite=True, and the returned object replaces the sample
PBMC_HD02 = ep.suggest_cluster_celltype_identity(
    sample=PBMC_HD02,
    annotation="Detailed.Celltype.Refined", rewrite=True)

## Exploring annotations in PBMC - HD01

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])               # (n_cells, 2)
labels = np.asarray(PBMC_HD01.protein.row_attrs['Broad.Celltype']) # (n_cells,)

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends

# --- Custom palette (keys must match the label names) ---
custom_palette = {
    'Immature': "#0079ea",
    'Mature': "#CFCFCF"
}
# One colour per category, in category order; unknown labels fall back to grey
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# --- Plot with Scanpy + outlines ---
with rc_context({"figure.figsize": (3.35, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD01",
        size=50,
        alpha=0.9,
        return_fig=True,   # so we get the figure handle
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # `Legend.legendHandles` was renamed `legend_handles` in matplotlib 3.7
            # and removed in 3.9; support both spellings.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=len(palette),            # all in one row
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])               # (n_cells, 2)
labels = np.asarray(PBMC_HD01.protein.row_attrs['Simplified.Celltype.Refined']) # (n_cells,)

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends

# --- Custom palette (keys must match the label names) ---
custom_palette = {
    'HSPC': '#0079ea',
    'cDC': "#16D2E3",
    'pDC': "#69FFCB",
    'Monocyte': '#D27CE3',
    'Other_T': "#EDB416",
    'Erythroid': "#F30A1A",
    'CD4_T': '#C99546',
    'CD8_T': "#6B3317",
    'NK': "#FBEF0D",
    'B': '#68D827',
    'Small': "#292929"
}
# One colour per category, in category order; unknown labels fall back to grey
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# --- Plot with Scanpy + outlines ---
with rc_context({"figure.figsize": (3.30, 3.55)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD01",
        size=50,
        alpha=0.9,
        return_fig=True,   # so we get the figure handle
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # `Legend.legendHandles` was renamed `legend_handles` in matplotlib 3.7
            # and removed in 3.9; support both spellings.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=4,            # four legend columns
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])               # (n_cells, 2)
labels = np.asarray(PBMC_HD01.protein.row_attrs['Detailed.Celltype.Refined']) # (n_cells,)

pbmc = AnnData(X=PBMC_HD01.protein.layers['Scaled_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends

# --- Custom palette (keys must match the label names) ---
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    'gdT': "#EDB416",
    'pDC': "#69FFCB"
}

# One colour per category, in category order; unknown labels fall back to grey
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# --- Plot with Scanpy + outlines ---
with rc_context({"figure.figsize": (3.3, 4)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD01",
        size=50,
        alpha=0.9,
        return_fig=True,   # so we get the figure handle
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # `Legend.legendHandles` was renamed `legend_handles` in matplotlib 3.7
            # and removed in 3.9; support both spellings.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=3,            # three legend columns
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and per-cell labels are read from the protein assay's row attributes.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])   # expected shape (n_cells, 2)
labels = np.asarray(PBMC_HD01.protein.row_attrs['label'])  # expected shape (n_cells,)

# X carries the 'Normalized_reads' layer; this plot itself only uses obs/obsm.
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
# The same layer fetched with row/column labels supplies the cell and feature names.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends

# --- UMAP with outlined points, coloured by the 'label' row attribute ---
with rc_context({"figure.figsize": (3.4, 4.2)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        add_outline=True,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD01",
        size=50,
        alpha=0.9,
        return_fig=True,   # return the Figure so the axes can be post-processed below
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
            # 3.9 removed the old name; read whichever attribute exists.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=6,            # up to six legend entries per row
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and annotated cluster names are read from the protein assay's row attributes.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])               # expected shape (n_cells, 2)
labels = np.asarray(PBMC_HD01.protein.row_attrs['annotated_clusters']) # expected shape (n_cells,)

# X carries the 'Normalized_reads' layer. It is not a throw-away placeholder:
# the heatmap cell that follows plots these values.
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
# The same layer fetched with row/column labels supplies the cell and feature names.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends


# --- Custom palette: one fixed hex colour per annotated cell-type label ---
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    'gdT': "#EDB416",
    'pDC': "#69FFCB"
}

# Colour list in category order; labels missing from custom_palette fall back to light grey.
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# --- UMAP with outlined points, coloured by annotated cluster ---
with rc_context({"figure.figsize": (3.30, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        add_outline=True,
        palette=palette,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD01",
        size=50,
        alpha=0.9,
        return_fig=True,   # return the Figure so the axes can be post-processed below
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
            # 3.9 removed the old name; read whichever attribute exists.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=3,            # wrap legend entries into three columns
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
# Heatmap of every feature in pbmc, grouped by the 'clusters' annotation.
# Rebuild the colour list in category order (grey fallback for labels that
# are missing from custom_palette).
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# Store the colours under '<groupby>_colors' in .uns; the heatmap call below
# is given no 'palette' argument.
pbmc.uns['clusters_colors'] = palette

# var_names were set from the 'Normalized_reads' columns when pbmc was built,
# so "genes" here are whatever features that layer contains.
all_genes = pbmc.var_names.tolist()
genes_to_plot = all_genes  # or subset for readability

sc.pl.heatmap(
    pbmc,
    var_names=genes_to_plot,
    groupby="clusters",
    use_raw=False,
    cmap="magma",
    dendrogram=True,
    standard_scale="var",  # min-max scale each variable to [0, 1] (subtract min, divide by max); NOT a z-score
    figsize=(7.25, 9)
)


In [None]:
# Display the row attributes stored on the HD01 protein assay
# (e.g. the 'umap', 'label' and 'annotated_clusters' entries read in earlier cells).
PBMC_HD01.protein.row_attrs

In [None]:
# Signature map of the 'Normalized_reads' layer for HD01, split by 'Detailed.Celltype'.
# NOTE(review): the HD02 cell splits by 'Detailed.Celltype.Refined' instead —
# confirm the difference between the two samples is intentional.
PBMC_HD01.protein.signaturemap(
    'Normalized_reads',
    splitby='Detailed.Celltype',
)

## Exploring annotations in PBMC - HD02

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and per-cell labels are read from the protein assay's row attributes.
coords = np.asarray(PBMC_HD02.protein.row_attrs['umap'])   # expected shape (n_cells, 2)
labels = np.asarray(PBMC_HD02.protein.row_attrs['label'])  # expected shape (n_cells,)

# X carries the 'Normalized_reads' layer; this plot itself only uses obs/obsm.
pbmc = AnnData(X=PBMC_HD02.protein.layers['Normalized_reads'])
# The same layer fetched with row/column labels supplies the cell and feature names.
df_protein = PBMC_HD02.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends

# --- UMAP with outlined points, coloured by the 'label' row attribute ---
with rc_context({"figure.figsize": (3.4, 4.1)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        add_outline=True,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD02",
        size=50,
        alpha=0.9,
        return_fig=True,   # return the Figure so the axes can be post-processed below
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
            # 3.9 removed the old name; read whichever attribute exists.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=6,            # up to six legend entries per row
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and annotated cluster names are read from the protein assay's row attributes.
coords = np.asarray(PBMC_HD02.protein.row_attrs['umap'])               # expected shape (n_cells, 2)
labels = np.asarray(PBMC_HD02.protein.row_attrs['annotated_clusters']) # expected shape (n_cells,)

# X carries the 'Normalized_reads' layer. It is not a throw-away placeholder:
# the heatmap cell that follows plots these values.
pbmc = AnnData(X=PBMC_HD02.protein.layers['Normalized_reads'])
# The same layer fetched with row/column labels supplies the cell and feature names.
df_protein = PBMC_HD02.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends


# --- Custom palette: one fixed hex colour per annotated cell-type label ---
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    'gdT': "#EDB416",
    'pDC': "#69FFCB"
}

# Colour list in category order; labels missing from custom_palette fall back to light grey.
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# --- UMAP with outlined points, coloured by annotated cluster ---
with rc_context({"figure.figsize": (3.4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        add_outline=True,
        palette=palette,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=True,
        title="PBMC - HD02",
        size=50,
        alpha=0.9,
        return_fig=True,   # return the Figure so the axes can be post-processed below
        show=False,
    )

    for ax in fig.axes:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move legend outside below plot
        leg = ax.get_legend()
        if leg is not None:
            # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
            # 3.9 removed the old name; read whichever attribute exists.
            handles = getattr(leg, "legend_handles", None)
            if handles is None:
                handles = leg.legendHandles
            ax.legend(
                handles=handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=3,            # wrap legend entries into three columns
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
# Heatmap of every feature in pbmc, grouped by the 'clusters' annotation.
# Rebuild the colour list in category order (grey fallback for labels that
# are missing from custom_palette).
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# Store the colours under '<groupby>_colors' in .uns; the heatmap call below
# is given no 'palette' argument.
pbmc.uns['clusters_colors'] = palette

# var_names were set from the 'Normalized_reads' columns when pbmc was built,
# so "genes" here are whatever features that layer contains.
all_genes = pbmc.var_names.tolist()
genes_to_plot = all_genes  # or subset for readability

sc.pl.heatmap(
    pbmc,
    var_names=genes_to_plot,
    groupby="clusters",
    use_raw=False,
    cmap="magma",
    dendrogram=True,
    standard_scale="var",  # min-max scale each variable to [0, 1] (subtract min, divide by max); NOT a z-score
    figsize=(7.25, 9)
)




In [None]:
# Signature map of the 'Normalized_reads' layer for HD02, split by 'Detailed.Celltype.Refined'.
# NOTE(review): the HD01 cell splits by 'Detailed.Celltype' instead —
# confirm the difference between the two samples is intentional.
PBMC_HD02.protein.signaturemap(
    'Normalized_reads',
    splitby='Detailed.Celltype.Refined',
)