# Set-up

In [None]:
import os

# The quoted '__file__' is a plain file name, so realpath() resolves it against the
# current working directory; the head of that path is the directory itself.
script_dir = os.path.split(os.path.realpath('__file__'))[0]
parent_dir = os.path.split(script_dir)[0]

# Use the resolved directory as the working directory for every later cell.
os.chdir(script_dir)

## Importing modules

In [None]:
# Import generic libraries
import os
import copy
import random
import multiprocessing
import scanpy as sc
import numpy as nps
import seaborn as sns
import matplotlib.pyplot as plt
import math
import sklearn.metrics
from scipy.stats import ranksums
from scipy.cluster.hierarchy import linkage, leaves_list
from statsmodels.stats.multitest import multipletests
import espressopro as ep

# Import mosaic libraries
import missionbio.mosaic as ms

# Detect the CPU cores and keep one free, without ever dropping below one worker
num_cores = multiprocessing.cpu_count()
num_cores_to_use = num_cores - 1 if num_cores > 1 else 1

# Import graph_objects from the plotly package to display figures when saving the notebook as an HTML
import plotly.graph_objects as go

# Import additional packages for specific visuals
import plotly.offline as pyo
pyo.init_notebook_mode()
import numpy as np
from itables import init_notebook_mode, show
from itables.sample_dfs import get_dict_of_test_dfs
import itables.options as opt

# Defining itables options
# Fetch the itables sample tables; no later use is visible in this part of the notebook.
dict_of_test_dfs = get_dict_of_test_dfs()
# NOTE(review): all_interactive=True presumably renders every DataFrame through itables — confirm.
init_notebook_mode(all_interactive=True)
# Page sizes offered in the table's length menu.
opt.lengthMenu = [5, 10, 20, 50, 100, 200, 500]
# NOTE(review): 0 presumably disables the size/column down-sampling limits — confirm against the itables docs.
opt.maxBytes = 0
opt.maxColumns = 0
# CSS classes applied to the rendered tables.
opt.classes = ["display", "nowrap"]

# Other useful packages for downstream cluster analyses
import pandas as pd

# Note: when exporting the notebook as an HTML, plots that use the "go.Figure(fig)" command are saved

In [None]:
pip list

PYTHONHASHSEED was set to 0 as an environment variable, as follows:
    
conda env config vars set PYTHONHASHSEED=0

In [None]:
# Export the hash seed and seed the global `random` and NumPy generators.
# NOTE(review): PYTHONHASHSEED is presumably only read at interpreter start-up, so this
# assignment would affect child processes rather than the running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'
random.seed(42)
np.random.seed(42)

In [None]:
def ensure_pythonhashseed(seed=0):
    """Make sure the process runs with ``PYTHONHASHSEED`` equal to ``seed``.

    When the environment variable is missing or holds another value, it is set
    and the current process is replaced (``os.execl``) by a new interpreter
    started with the same executable and arguments.

    Args:
        seed: Desired hash seed; converted with ``str`` before the comparison.

    Returns:
        None. When a restart is triggered the call does not return.
    """
    # Local import: the notebook's import cells never bring ``sys`` into scope,
    # which previously made the restart branch fail with NameError.
    import sys

    seed = str(seed)
    if os.environ.get("PYTHONHASHSEED") == seed:
        return

    print(f'Setting PYTHONHASHSEED="{seed}"')
    os.environ["PYTHONHASHSEED"] = seed
    # restart the current process so the new environment value is picked up
    os.execl(sys.executable, sys.executable, *sys.argv)

In [None]:
import random

# Draw 128 bits from the `random` generator and print them as 32 hex digits.
# Stored as `hash_value` so the built-in `hash()` is not shadowed in later cells.
hash_value = random.getrandbits(128)

print("hash value: %032x" % hash_value)

## Defining paths

In [None]:
# Output folder for the figures saved by this notebook
figures_path = parent_dir + "/Figures/Model_Independent_Testing"

# exist_ok=True creates the folder only when missing, without a separate
# exists() check that could race with another process creating it.
os.makedirs(figures_path, exist_ok=True)

# Output folder for the annotation assignment tables
data_path = parent_dir + "/Data/Annotation_Assignments"

os.makedirs(data_path, exist_ok=True)

# Custom function

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score


# ---------- marker metrics helpers ----------
def _auc_safe(y, s):
    try:
        if np.unique(y).size < 2 or np.nanstd(s) == 0:
            return np.nan
        return float(roc_auc_score(y, s))
    except Exception:
        return np.nan

def _ap_safe(y, s):
    try:
        if np.unique(y).size < 2 or np.nanstd(s) == 0:
            return np.nan
        return float(average_precision_score(y, s))
    except Exception:
        return np.nan


def marker_fidelity_for_annotation(
    sample,
    label_key: str,
    layer: str,
    pos_marker_dict: dict,
    top_frac: float = 0.05,
    min_cells_per_label: int = 20,
):
    """Score how well each label's positive markers separate its cells from all others.

    For every label with at least ``min_cells_per_label`` cells and at least one
    marker present in the layer, each marker is scored one-vs-rest with
    ``_auc_safe`` / ``_ap_safe`` and the scores are averaged (NaNs ignored).

    Args:
        sample: Object exposing ``protein.get_attribute(layer, constraint="row+col")``
            and ``protein.row_attrs[label_key]``.
        label_key: Row attribute holding the per-cell labels.
        layer: Layer whose values are used as marker scores.
        pos_marker_dict: Mapping ``label -> list of positive marker names``.
        top_frac: Accepted for backward compatibility; not used by the computation.
        min_cells_per_label: Labels with fewer cells are skipped.

    Returns:
        ``(summary, per_label)`` where ``per_label`` is a DataFrame with one row per
        scored label and ``summary["marker_fidelity_weighted"]`` is the cell-count
        weighted mean of ``Fidelity_pos`` (``2 * AUC - 1``, i.e. in [-1, 1]).
        Labels whose fidelity is NaN are left out of the weighted mean and the
        weights are renormalised over the remaining labels; the summary is NaN
        when no label has a finite fidelity.
    """
    def _finite_mean(values):
        # Mean over the non-NaN entries; NaN (without a warning) when none remain.
        arr = np.asarray(values, dtype=float)
        arr = arr[~np.isnan(arr)]
        return float(arr.mean()) if arr.size else np.nan

    df = sample.protein.get_attribute(layer, constraint="row+col")
    labs = pd.Series(np.asarray(sample.protein.row_attrs[label_key]), index=df.index).astype("object")
    labs = labs.dropna().astype(str)

    # Keep only the cells that still carry a label
    df = df.loc[labs.index]

    per_label_rows = []
    for lbl, n_lbl in labs.value_counts().items():
        if n_lbl < min_cells_per_label:
            continue

        pos_list = [m for m in pos_marker_dict.get(lbl, []) if m in df.columns]
        if not pos_list:
            continue

        # One-vs-rest indicator for the current label
        y = (labs == lbl).astype(int).to_numpy()

        aucs, aps = [], []
        for m in pos_list:
            s = df[m].to_numpy()
            aucs.append(_auc_safe(y, s))
            aps.append(_ap_safe(y, s))

        auc_mean = _finite_mean(aucs)
        ap_mean = _finite_mean(aps)
        # Rescale AUC so that 0.5 (chance) maps to 0 and 1.0 maps to 1; range is [-1, 1]
        fidelity = float(2 * auc_mean - 1) if np.isfinite(auc_mean) else np.nan

        per_label_rows.append({
            "label": lbl,
            "n_cells": int(n_lbl),
            "n_pos_markers": int(len(pos_list)),
            "AUC_pos_mean": auc_mean,
            "AP_pos_mean": ap_mean,
            "Fidelity_pos": fidelity,
        })

    per_label = pd.DataFrame(per_label_rows)
    if per_label.empty:
        return {"marker_fidelity_weighted": np.nan}, per_label

    # Size-weighted fidelity across the labels that actually have a finite score.
    # Weights are renormalised so an unscorable label is not silently counted as 0.
    fidelity_values = per_label["Fidelity_pos"].to_numpy(dtype=float)
    sizes = per_label["n_cells"].to_numpy(dtype=float)
    scored = np.isfinite(fidelity_values)
    if not scored.any():
        return {"marker_fidelity_weighted": np.nan}, per_label

    w = sizes[scored] / sizes[scored].sum()
    marker_fidelity_weighted = float(np.sum(w * fidelity_values[scored]))

    return {"marker_fidelity_weighted": marker_fidelity_weighted}, per_label


# ---------- clustering alignment metrics ----------
def _entropy_from_counts(counts: np.ndarray) -> float:
    counts = counts.astype(float)
    s = counts.sum()
    if s <= 0:
        return np.nan
    p = counts / s
    p = p[p > 0]
    return float(-(p * np.log(p)).sum())

def cluster_alignment_metrics(
    sample,
    label_key: str,
    cluster_key: str = "cluster",
    collapse_subclusters: bool = True,
    cluster_delim: str = ":",
    min_cells_per_label: int = 20,
):
    """Measure how well an annotation agrees with a clustering.

    Args:
        sample: Object exposing ``protein.row_attrs`` with the two keys below.
        label_key: Row attribute holding the per-cell labels (missing values are dropped).
        cluster_key: Row attribute holding the per-cell cluster ids.
        collapse_subclusters: When True, only the part of the cluster id before the
            first ``cluster_delim`` is used.
        cluster_delim: Separator between cluster and sub-cluster in the cluster id.
        min_cells_per_label: Labels with fewer cells are removed from the table.

    Returns:
        ``(metrics, ct)`` where ``ct`` is the cluster x label contingency table
        (restricted to the kept labels) and ``metrics`` holds:

        * ``coherence``: ``1 - H(label | cluster) / H(label)``; NaN when
          ``H(label)`` is zero or undefined.
        * ``label_concentration``: label-size weighted mean of the largest
          fraction of each label that falls into a single cluster.

        Both are NaN when no label passes ``min_cells_per_label``.
    """
    ra = sample.protein.row_attrs

    labels = pd.Series(np.asarray(ra[label_key]), dtype="object").dropna().astype(str)
    # labels keeps its original positional index, so it selects the matching cluster rows
    clusters_raw = pd.Series(np.asarray(ra[cluster_key]), dtype="object").iloc[labels.index]

    if collapse_subclusters:
        clusters = clusters_raw.astype(str).str.split(cluster_delim, n=1).str[0]
    else:
        clusters = clusters_raw.astype(str)

    # contingency table: cluster x label
    ct = pd.crosstab(clusters, labels)

    # drop tiny labels (keeps the metrics meaningful)
    label_sizes = ct.sum(axis=0)
    keep_labels = label_sizes[label_sizes >= min_cells_per_label].index
    ct = ct[keep_labels]
    if ct.shape[1] == 0:
        return {"coherence": np.nan, "label_concentration": np.nan}, ct

    counts = ct.to_numpy(dtype=float)

    # H(label)
    label_total = counts.sum(axis=0)
    H_label = _entropy_from_counts(label_total)

    # H(label | cluster) = sum_c P(c) * H(label within c)
    cluster_sizes = counts.sum(axis=1)
    total_cells = cluster_sizes.sum()

    H_label_given_cluster = 0.0
    for size, row in zip(cluster_sizes, counts):
        # A cluster made up only of dropped labels has no cells left; its entropy
        # is NaN and 0 * NaN would poison the sum, so it is skipped.
        if size <= 0:
            continue
        H_label_given_cluster += (size / total_cells) * _entropy_from_counts(row)

    # normalized coherence score in [0,1]
    coherence = np.nan
    if np.isfinite(H_label) and H_label > 0:
        coherence = float(1.0 - (H_label_given_cluster / H_label))

    # label concentration: E_label [ max_c P(c|label) ] weighted by label size
    max_frac_per_label = (counts / label_total[None, :]).max(axis=0)  # max over clusters for each label

    w = label_total / label_total.sum()
    label_concentration = float(np.sum(w * max_frac_per_label))

    return {"coherence": coherence, "label_concentration": label_concentration}, ct


# ---------- master comparison ----------
def compare_annotation_schemes(
    sample,
    label_keys: list[str],
    pos_marker_dict: dict,
    layer: str = "Normalized_reads",
    cluster_key: str = "cluster",
    collapse_subclusters: bool = True,
    dominance_threshold: float | None = None,   # optional: if you want to pre-collapse labels using your dominance rule (usually None)
    min_cells_per_label: int = 20,
    weights=(0.45, 0.25, 0.30),
):
    w_coh, w_conc, w_mark = weights

    rows = []
    details = {}

    for lk in label_keys:
        # cluster alignment
        align, ct = cluster_alignment_metrics(
            sample,
            label_key=lk,
            cluster_key=cluster_key,
            collapse_subclusters=collapse_subclusters,
            min_cells_per_label=min_cells_per_label,
        )

        # marker fidelity
        mf_summary, mf_table = marker_fidelity_for_annotation(
            sample,
            label_key=lk,
            layer=layer,
            pos_marker_dict=pos_marker_dict,
            min_cells_per_label=min_cells_per_label,
        )

        coherence = align["coherence"]
        label_conc = align["label_concentration"]
        marker_fid = mf_summary["marker_fidelity_weighted"]

        total = np.nan
        if np.isfinite([coherence, label_conc, marker_fid]).all():
            total = float(w_coh * coherence + w_conc * label_conc + w_mark * marker_fid)

        rows.append({
            "annotation": lk,
            "coherence(1-H|/H)": coherence,
            "label_concentration": label_conc,
            "marker_fidelity_weighted": marker_fid,
            "total_score": total,
        })

        details[lk] = {
            "contingency_cluster_x_label": ct,
            "marker_fidelity_per_label": mf_table,
        }

    scoreboard = pd.DataFrame(rows).sort_values("total_score", ascending=False)
    return scoreboard, details


# Load Data

In [None]:
# Load the "Multisample PBMC" example dataset through Mosaic.
# NOTE(review): single=False presumably returns a multi-sample container — the next cell indexes `.samples`; confirm.
PBMC_samples = ms.load_example_dataset(path="Multisample PBMC", single=False)

In [None]:
# Name the first two samples of the loaded dataset
PBMC_HD01, PBMC_HD02 = PBMC_samples.samples[0], PBMC_samples.samples[1]

# <b> Data Overview </b>

In [None]:
# Overview of the PBMC_HD01 protein assay: object, attribute/layer names, then metadata
hd01_protein = PBMC_HD01.protein
print("\'sample.protein\':", hd01_protein, '\n')
for heading, mapping in (
    ("\'row_attrs\':", hd01_protein.row_attrs),
    ("\'col_attrs\':", hd01_protein.col_attrs),
    ("\'layers\':", hd01_protein.layers),
):
    print(heading, "\n\t", list(mapping.keys()), '\n')
print("\'metadata\':", "\n")
for meta_key in list(hd01_protein.metadata.keys()):
    print("\t", meta_key, ": ", hd01_protein.metadata[meta_key], sep="")

In [None]:
# Overview of the PBMC_HD02 protein assay: object, attribute/layer names, then metadata
hd02_protein = PBMC_HD02.protein
print("\'sample.protein\':", hd02_protein, '\n')
for heading, mapping in (
    ("\'row_attrs\':", hd02_protein.row_attrs),
    ("\'col_attrs\':", hd02_protein.col_attrs),
    ("\'layers\':", hd02_protein.layers),
):
    print(heading, "\n\t", list(mapping.keys()), '\n')
print("\'metadata\':", "\n")
for meta_key in list(hd02_protein.metadata.keys()):
    print("\t", meta_key, ": ", hd02_protein.metadata[meta_key], sep="")

## <b> Protein Analysis </b>

### Filtering non-informative proteins

In [None]:
# Remove the three IgG entries from the protein assay of both samples
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    pbmc_sample.protein = pbmc_sample.protein.drop(['IgG1', 'IgG2a', 'IgG2b'])

### Normalisation

In [None]:
# Normalise, then scale, the protein data of each sample in place
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    ep.Normalise_protein_data(pbmc_sample, inplace=True, axis=1, flavor="seurat")
    ep.Scale_protein_data(pbmc_sample, inplace=True)

### Dimensionality reduction

In [None]:
# Exploratory PCA with 45 components on the scaled reads, showing the plot for each sample
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    pbmc_sample.protein.run_pca(
        attribute='Scaled_reads',
        components=45,
        show_plot=True,
        random_state=42,
        svd_solver='randomized',
    )

In [None]:
# Re-run the PCA with 8 components (no plot); the UMAP cell below reads the 'pca' attribute
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    pbmc_sample.protein.run_pca(
        attribute='Scaled_reads',
        components=8,
        show_plot=False,
        random_state=42,
        svd_solver='randomized',
    )

In [None]:
# Two-dimensional UMAP on the PCA attribute, same settings for both samples
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    pbmc_sample.protein.run_umap(
        attribute='pca',
        random_state=42,
        n_neighbors=50,
        min_dist=0.1,
        spread=8,
        n_components=2,
    )

### Clustering

In [None]:
# Graph-community clustering on the UMAP attribute, same settings for both samples
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    pbmc_sample.protein.cluster(attribute='umap', method='graph-community', k=5, random_state=42)

In [None]:
# Display the protein row attributes of PBMC_HD01 (inspection only).
PBMC_HD01.protein.row_attrs

In [None]:
# UMAP scatter plot of PBMC_HD01 coloured by the 'label' row attribute.
PBMC_HD01.protein.scatterplot(attribute='umap', colorby='label')

In [None]:
# Keep a copy of the 'label' row attribute under 'Clusters' in each sample
for pbmc_sample in (PBMC_HD01, PBMC_HD02):
    pbmc_sample.protein.row_attrs['Clusters'] = copy.copy(pbmc_sample.protein.row_attrs['label'])

# EspressoPro

## Predictions on PBMC - HD01

### Generate predictions

In [None]:
# Pull the 'Normalized_reads' layer of PBMC_HD01 as a table and display it.
# NOTE(review): constraint='row+col' presumably labels both rows and columns (later cells read .index/.columns) — confirm.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
df_protein

In [None]:
# Generate predictions for each sample (HD01 first) and rebind both names to the returned objects
PBMC_HD01, PBMC_HD02 = [ep.generate_predictions(obj=s) for s in (PBMC_HD01, PBMC_HD02)]

### Use predictions for annotation

In [None]:
# Annotate each sample (HD01 first) and rebind both names to the returned objects
PBMC_HD01, PBMC_HD02 = [ep.annotate_data(obj=s) for s in (PBMC_HD01, PBMC_HD02)]

In [None]:
import pandas as pd
import numpy as np

# Count the cells per 'Averaged.Simplified.Celltype' value (missing values included)
celltypes = pd.Series(np.asarray(PBMC_HD01.protein.row_attrs['Averaged.Simplified.Celltype']))
counts = celltypes.value_counts(dropna=False)
percent = counts.div(counts.sum()).mul(100)

# Counts and rounded percentages side by side
summary = pd.DataFrame({'count': counts, 'percent': percent.round(2)})
print(summary)

### Exploring prediction scores

In [None]:
import numpy as np
import pandas as pd

# 1) collect the constrained Averaged.Simplified predscore columns
keys = sorted([
    k for k in PBMC_HD01.protein.row_attrs.keys()
    if k.startswith("Averaged.Simplified.") and k.endswith(".predscore.constrained")
])
print(f"Found {len(keys)} simplified constrained columns")

# 2) stack into matrix (n_cells x n_classes)
M = np.vstack([np.asarray(PBMC_HD01.protein.row_attrs[k]).reshape(-1) for k in keys]).T

# 3) argmax per row + top score
imax = M.argmax(axis=1)
top_key = np.array(keys, dtype=object)[imax]
top_score = M[np.arange(M.shape[0]), imax]

# 4) bring in Averaged.Simplified.Celltype
celltype = np.asarray(PBMC_HD01.protein.row_attrs["Averaged.Simplified.Celltype"]).astype(object)

# TopScore is a per-cell row attribute and is already aligned with the cells.
# It must not be indexed with `imax` (class indices), which would pair each cell
# with the score of an unrelated cell.
toppy = np.asarray(PBMC_HD01.protein.row_attrs["Averaged.Simplified.Celltype.TopScore"]).astype(object)

# 5) optional: include barcode (helpful for joining / tracing)
barcode = np.asarray(PBMC_HD01.protein.row_attrs["barcode"]).astype(object)

# 6) make a tidy table, one row per cell
out = pd.DataFrame({
    "barcode": barcode,
    "top_constrained_key": top_key,
    "top_constrained_score": top_score,
    "top_score": toppy,
    "Averaged.Simplified.Celltype": celltype,
})

out.head(10)


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in a minimal AnnData so scanpy can draw the UMAP ---
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])  # X is a placeholder; plotting only reads obs/obsm
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
pbmc.obsm['X_umap'] = coords

# Per-atlas score columns in panel order (row 1: Hao, Zhang; row 2: Triana, Luecken)
ordered_cols = [
    'Hao.Broad.Immature.predscore',
    'Zhang.Broad.Immature.predscore',
    'Triana.Broad.Immature.predscore',
    'Luecken.Broad.Immature.predscore'
]

# --- Copy the four score columns into pbmc.obs ---
for col in ordered_cols:
    pbmc.obs[col] = PBMC_HD01.protein.row_attrs[col]

# Panel titles: every panel gets an empty title
pretty_names = {col: '' for col in ordered_cols}

# 2x2 grid, one panel per atlas
with rc_context({"figure.figsize": (8, 7)}):
    fig, axs = plt.subplots(2, 2, figsize=(8, 7))
    for col, ax in zip(ordered_cols, axs.ravel()):
        sc.pl.umap(
            pbmc,
            color=col,
            add_outline=True,
            cmap="magma",
            frameon=False,
            size=50,
            alpha=0.9,
            title=pretty_names[col],
            ax=ax,
            show=False
        )
        # Bold title, no axis labels, no ticks
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    # wspace widens the gap between columns, hspace the gap between rows
    fig.subplots_adjust(wspace=0.40, hspace=0.30)
    # Save before anything shows the figure, so the backend cannot close it first
    plt.savefig(os.path.join(figures_path, "Atlases_Immature_prediction.png"),
                dpi=300, bbox_inches='tight')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in a minimal AnnData so scanpy can draw the UMAP ---
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])  # X is a placeholder; plotting only reads obs/obsm
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
pbmc.obsm['X_umap'] = coords

# Column to plot mapped to its panel title (empty here)
pretty_names = {
    'Averaged.Broad.Immature.predscore': '',
}
score_cols = list(pretty_names)

# --- Copy the averaged score into pbmc.obs ---
for score_col in score_cols:
    pbmc.obs[score_col] = PBMC_HD01.protein.row_attrs[score_col]

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        add_outline=True,
        cmap="magma",
        frameon=False,
        size=50,
        alpha=0.9,
        title=list(pretty_names.values()),
        return_fig=True,
        show=False
    )

    # Every axes object attached to the figure
    axs = fig.axes

    for ax in axs:
        # Bold title; blank axis labels; x ticks removed (y ticks are left untouched)
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[])

    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords

# --- Copy the four scores to plot into pbmc.obs ---
pbmc.obs['Averaged.Broad.Mature.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Broad.Mature.predscore']
pbmc.obs['Averaged.Simplified.CD4_T.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Simplified.CD4_T.predscore']
pbmc.obs['Averaged.Detailed.CD4_T_Memory.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Detailed.CD4_T_Memory.predscore']
pbmc.obs['Averaged.Detailed.CD4_T_Naive.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Detailed.CD4_T_Naive.predscore']

# Map internal column names to panel titles (all empty for this figure)
pretty_names = {
    'Averaged.Broad.Mature.predscore': '',
    'Averaged.Simplified.CD4_T.predscore': '',
    'Averaged.Detailed.CD4_T_Memory.predscore': '',
    'Averaged.Detailed.CD4_T_Naive.predscore': ''
}

score_cols = list(pretty_names.keys())

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        add_outline=True,
        cmap="magma",
        frameon=False,
        size=50,
        alpha=0.9,
        title=[pretty_names[c] for c in score_cols],
        return_fig=True,
        show=False
    )

    # Get all axes in the figure
    axs = fig.axes

    for ax in axs:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and x ticks (y ticks are left untouched)
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])

    # tight_layout() recomputes the subplot spacing, so the custom column gap
    # has to be applied after it or it is overwritten.
    plt.tight_layout()
    fig.subplots_adjust(wspace=0.40)

    # Save before showing: once plt.show() has run the backend may have closed
    # the figure, and a later savefig would write an empty image.
    fig.savefig(os.path.join(figures_path, "Longitudinal_CD4_T_subsets_prediction.png"),
                dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords

# --- Copy the four scores to plot into pbmc.obs ---
pbmc.obs['Averaged.Broad.Mature.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Broad.Mature.predscore']
pbmc.obs['Averaged.Simplified.CD4_T.predscore.constrained'] = PBMC_HD01.protein.row_attrs['Averaged.Simplified.CD4_T.predscore.constrained']
pbmc.obs['Averaged.Detailed.CD4_T_Memory.predscore.constrained'] = PBMC_HD01.protein.row_attrs['Averaged.Detailed.CD4_T_Memory.predscore.constrained']
pbmc.obs['Averaged.Detailed.CD4_T_Naive.predscore.constrained'] = PBMC_HD01.protein.row_attrs['Averaged.Detailed.CD4_T_Naive.predscore.constrained']

# Map internal column names to panel titles (all empty for this figure)
pretty_names = {
    'Averaged.Broad.Mature.predscore': '',
    'Averaged.Simplified.CD4_T.predscore.constrained': '',
    'Averaged.Detailed.CD4_T_Memory.predscore.constrained': '',
    'Averaged.Detailed.CD4_T_Naive.predscore.constrained': ''
}

score_cols = list(pretty_names.keys())

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        add_outline=True,
        cmap="magma",
        frameon=False,
        size=50,
        alpha=0.9,
        title=[pretty_names[c] for c in score_cols],
        return_fig=True,
        show=False
    )

    # Get all axes in the figure
    axs = fig.axes

    for ax in axs:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and x ticks (y ticks are left untouched)
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])

    # tight_layout() recomputes the subplot spacing, so the custom column gap
    # has to be applied after it or it is overwritten.
    plt.tight_layout()
    fig.subplots_adjust(wspace=0.40)

    # Save before showing: once plt.show() has run the backend may have closed
    # the figure, and a later savefig would write an empty image.
    fig.savefig(os.path.join(figures_path, "Longitudinal_CD4_T_subsets_prediction_with_constrains.png"),
                dpi=300, bbox_inches='tight')
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords

# --- Copy the four scores to plot into pbmc.obs ---
pbmc.obs['Averaged.Broad.Mature.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Broad.Mature.predscore']
pbmc.obs['Averaged.Simplified.B.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Simplified.B.predscore']
pbmc.obs['Averaged.Detailed.B_Naive.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Detailed.B_Naive.predscore']
pbmc.obs['Averaged.Detailed.B_Memory.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Detailed.B_Memory.predscore']

# Map internal column names to pretty labels
pretty_names = {
    'Averaged.Broad.Mature.predscore': 'Broad: Mature',
    'Averaged.Simplified.B.predscore': 'Simplified: B',
    'Averaged.Detailed.B_Naive.predscore': 'Detailed: B Naive',
    'Averaged.Detailed.B_Memory.predscore': 'Detailed: B Memory'
}

score_cols = list(pretty_names.keys())

with rc_context({"figure.figsize": (4, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        add_outline=True,
        cmap="magma",
        frameon=False,
        size=50,
        alpha=0.9,
        title=[pretty_names[c] for c in score_cols],
        return_fig=True,
        show=False
    )

    # Get all axes in the figure
    axs = fig.axes

    for ax in axs:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and x ticks (y ticks are left untouched)
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])

    plt.tight_layout()

    # Save before showing: once plt.show() has run the backend may have closed
    # the figure, and a later savefig would write an empty image.
    fig.savefig(os.path.join(figures_path, "Longitudinal_B_subsets_prediction.png"),
                dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])  # dummy X; we only need obs/obsm for plotting
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords

# --- Copy the three scores to plot into pbmc.obs ---
pbmc.obs['Averaged.Broad.Immature.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Broad.Immature.predscore']
pbmc.obs['Averaged.Simplified.HSPC.predscore'] = PBMC_HD01.protein.row_attrs['Averaged.Simplified.HSPC.predscore']
pbmc.obs['Averaged.Simplified.HSPC.predscore.constrained'] = PBMC_HD01.protein.row_attrs['Averaged.Simplified.HSPC.predscore.constrained']

# Map internal column names to panel titles (all empty for this figure)
pretty_names = {
    'Averaged.Broad.Immature.predscore': '',
    'Averaged.Simplified.HSPC.predscore': '',
    'Averaged.Simplified.HSPC.predscore.constrained': ''
}

score_cols = list(pretty_names.keys())

with rc_context({"figure.figsize": (3.5, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color=score_cols,
        add_outline=True,
        cmap="magma",
        frameon=False,
        size=50,
        alpha=0.9,
        title=[pretty_names[c] for c in score_cols],
        return_fig=True,
        show=False
    )

    # Get all axes in the figure
    axs = fig.axes

    for ax in axs:
        # Bold titles
        ax.set_title(ax.get_title(), fontweight='bold')
        # Remove axis labels and x ticks (y ticks are left untouched)
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])

    plt.tight_layout()

    # Save before showing: once plt.show() has run the backend may have closed
    # the figure, and a later savefig would write an empty image.
    fig.savefig(os.path.join(figures_path, "Longitudinal_HSPC_subsets_prediction_with_and_without_constraints.png"),
                dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
import os
import re
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import rc_context
from anndata import AnnData

# Minimal AnnData carrying the protein table's names and the UMAP coordinates
df_prot = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names = df_prot.index
pbmc.var_names = df_prot.columns
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
pbmc.obsm['X_umap'] = coords

# Row attributes named "<atlas>.Simplified.<label>.predscore" for the four atlases
atlases = ["Hao", "Zhang", "Triana", "Luecken"]
key_re = re.compile(rf"^({'|'.join(atlases)})\.Simplified\.([^.]+)\.predscore$")
row_keys = list(PBMC_HD01.protein.row_attrs.keys())

# Nested mapping: label -> {atlas: score Series indexed by cell name}
label_to_atlas = {}
for k in row_keys:
    m = key_re.match(k)
    if m is not None:
        atlas, label = m.groups()
        per_atlas = label_to_atlas.setdefault(label, {})
        per_atlas[atlas] = pd.Series(PBMC_HD01.protein.row_attrs[k], index=pbmc.obs_names)

def _get_averaged_series(label):
    """Return ``(series, source)`` with the averaged Simplified score for ``label``.

    The row attribute ``Averaged.Simplified.<label>.predscore.constrained`` is
    used when present (source ``"library"``). Otherwise the per-atlas series in
    ``label_to_atlas`` are averaged row-wise, skipping NaNs (source
    ``"computed"``). ``(None, None)`` is returned when neither is available.
    """
    row_attrs = PBMC_HD01.protein.row_attrs
    averaged_key = f"Averaged.Simplified.{label}.predscore.constrained"
    if averaged_key in row_attrs:
        return pd.Series(row_attrs[averaged_key], index=pbmc.obs_names), "library"

    per_atlas = label_to_atlas.get(label)
    if per_atlas:
        mat = pd.DataFrame(per_atlas).reindex(pbmc.obs_names)
        return mat.mean(axis=1, skipna=True), "computed"

    return None, None

def plot_label_panels(label, save_dir=None):
    """Plot the per-atlas and averaged Simplified scores of ``label`` on the UMAP.

    One panel is drawn per atlas that has a series for ``label`` in
    ``label_to_atlas``, followed by the averaged panel when
    ``_get_averaged_series`` provides one. Nothing is drawn when there is
    nothing to plot.

    Args:
        label: Simplified label name.
        save_dir: When given, the figure is written there as
            ``Simplified_<label>_predscores.png`` (characters outside
            ``A-Za-z0-9._-`` in the label are replaced by ``_``).

    Side effects:
        Adds or overwrites the plotted columns in ``pbmc.obs``.
    """
    cols = []
    titles = []
    # add per-atlas columns if present
    for atlas in atlases:
        ser = label_to_atlas.get(label, {}).get(atlas)
        if ser is not None:
            # NOTE(review): the column name ends in ".constrained" although these series are
            # collected from keys ending in ".predscore" — confirm the intended naming.
            col = f"{atlas}.Simplified.{label}.predscore.constrained"
            pbmc.obs[col] = ser.reindex(pbmc.obs_names).values
            cols.append(col)
            titles.append(f"{atlas} - Simplified: {label}")

    # averaged: prefer library-provided, else compute mean across present atlases
    avg_series, avg_src = _get_averaged_series(label)
    if avg_series is not None:
        avg_col = f"Averaged.Simplified.{label}.predscore.constrained"
        pbmc.obs[avg_col] = avg_series.values
        cols.append(avg_col)
        # avg_src is already either "library" or "computed"
        titles.append(f"Averaged ({avg_src}) - Simplified: {label}")

    if not cols:
        return  # nothing to plot

    with rc_context({"figure.figsize": (4, 3.5)}):
        fig = sc.pl.umap(
            pbmc,
            color=cols,
            add_outline=True,
            cmap="magma",
            frameon=False,
            size=50,
            alpha=0.9,
            title=titles,
            return_fig=True,
            show=False,
        )
        for ax in fig.axes:
            ax.set_title(ax.get_title(), fontweight='bold')
            ax.set_xlabel("")
            ax.set_ylabel("")
            ax.set_xticks([])
            ax.set_yticks([])
        plt.tight_layout()
        if save_dir:
            safe_label = re.sub(r"[^A-Za-z0-9._-]+", "_", label)
            out = os.path.join(save_dir, f"Simplified_{safe_label}_predscores.png")
            # Save through the figure handle, before showing
            fig.savefig(out, dpi=300, bbox_inches='tight')
        plt.show()
        # This function runs once per label; close the figure so figures do not
        # pile up in memory on backends that keep them open after show().
        plt.close(fig)

# One figure per Simplified label, in alphabetical order, saved to the figures folder
for lbl in sorted(label_to_atlas):
    plot_label_panels(lbl, save_dir=figures_path)

In [None]:
import os
import re
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from matplotlib import rc_context
from anndata import AnnData

# Minimal AnnData carrying the protein table's names and the UMAP coordinates
df_prot = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names = df_prot.index
pbmc.var_names = df_prot.columns
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
pbmc.obsm['X_umap'] = coords

# Row attributes named "<atlas>.Detailed.<label>.predscore" for the four atlases
atlases = ["Hao", "Zhang", "Triana", "Luecken"]
key_re = re.compile(rf"^({'|'.join(atlases)})\.Detailed\.([^.]+)\.predscore$")
row_keys = list(PBMC_HD01.protein.row_attrs.keys())

# Nested mapping: label -> {atlas: score Series indexed by cell name}
label_to_atlas = {}
for k in row_keys:
    m = key_re.match(k)
    if m is not None:
        atlas, label = m.groups()
        per_atlas = label_to_atlas.setdefault(label, {})
        per_atlas[atlas] = pd.Series(PBMC_HD01.protein.row_attrs[k], index=pbmc.obs_names)

def _get_averaged_series(label):
    """Return the averaged Detailed prediction score for ``label`` and its origin.

    Looks for the row attribute ``Averaged.Detailed.<label>.predscore.constrained``
    first. If it is absent, falls back to the row-wise mean (NaNs skipped) of the
    per-atlas series collected in ``label_to_atlas``.

    Returns
    -------
    tuple
        ``(Series, "library")`` when the row attribute exists,
        ``(Series, "computed")`` when the mean was taken here,
        ``(None, None)`` when no score is available for ``label``.
    """
    row_attrs = PBMC_HD01.protein.row_attrs
    library_key = f"Averaged.Detailed.{label}.predscore.constrained"
    if library_key in row_attrs:
        return pd.Series(row_attrs[library_key], index=pbmc.obs_names), "library"

    per_atlas = label_to_atlas.get(label)
    if not per_atlas:
        return None, None

    scores = pd.DataFrame(per_atlas).reindex(pbmc.obs_names)
    return scores.mean(axis=1, skipna=True), "computed"

def plot_label_panels(label, save_dir=None):
    """Draw UMAP panels of the Detailed prediction scores for one label.

    One panel is drawn per atlas in ``atlases`` that has a score series for
    ``label`` in ``label_to_atlas``, followed by one averaged panel obtained
    from ``_get_averaged_series``. Every plotted series is also written into
    ``pbmc.obs`` (side effect on the module-level ``pbmc``).

    Parameters
    ----------
    label : str
        Detailed label whose scores should be plotted.
    save_dir : str or None, optional
        When truthy, the figure is written to
        ``<save_dir>/Detailed_<sanitised label>_predscores.png`` at 300 dpi.

    Returns
    -------
    None
        Returns early without plotting when no score column exists for ``label``.
    """
    cols = []
    titles = []

    # Per-atlas panels, in the fixed order given by `atlases`
    for atlas in atlases:
        ser = label_to_atlas.get(label, {}).get(atlas)
        if ser is None:
            continue
        col = f"{atlas}.Detailed.{label}.predscore.constrained"
        pbmc.obs[col] = ser.reindex(pbmc.obs_names).values
        cols.append(col)
        titles.append(f"{atlas} - Detailed: {label}")

    # Averaged panel: avg_src is "library" or "computed" whenever a series is returned
    avg_series, avg_src = _get_averaged_series(label)
    if avg_series is not None:
        avg_col = f"Averaged.Detailed.{label}.predscore.constrained"
        pbmc.obs[avg_col] = avg_series.values
        cols.append(avg_col)
        titles.append(f"Averaged ({avg_src}) - Detailed: {label}")

    if not cols:
        return  # nothing to plot

    with rc_context({"figure.figsize": (4, 3.5)}):
        fig = sc.pl.umap(
            pbmc,
            color=cols,
            add_outline=True,
            cmap="magma",
            frameon=False,
            size=50,
            alpha=0.9,
            title=titles,
            return_fig=True,
            show=False,
        )
        for ax in fig.axes:
            ax.set_title(ax.get_title(), fontweight='bold')
            ax.set_xlabel("")
            ax.set_ylabel("")
            ax.set_xticks([])
            ax.set_yticks([])
        fig.tight_layout()
        if save_dir:
            safe_label = re.sub(r"[^A-Za-z0-9._-]+", "_", label)
            out = os.path.join(save_dir, f"Detailed_{safe_label}_predscores.png")
            # Save through the figure handle (not pyplot's "current figure") and
            # before show(), which may release the canvas.
            fig.savefig(out, dpi=300, bbox_inches='tight')
        plt.show()
        # This function is called once per label; close the figure explicitly so
        # open figures do not pile up on backends that keep them after show().
        plt.close(fig)

# Render the panel set for every Detailed label in alphabetical order;
# each call is handed figures_path as its save_dir.
for detailed_label in sorted(label_to_atlas):
    plot_label_panels(detailed_label, save_dir=figures_path)

### Annotation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patheffects, rc_context
from matplotlib.lines import Line2D
from matplotlib.patches import Rectangle
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and the per-cell Broad labels are read from protein row attributes.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Averaged.Broad.Celltype'])

# X is only a carrier; the plot below reads obs['clusters'] and obsm['X_umap'].
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # categorical so colours map per category

# --- Custom palette (keys must match the label names) ---
custom_palette = {
    'Immature': "#0079ea",
    'Mature': "#CFCFCF"
}
cats = list(pbmc.obs['clusters'].cat.categories)
# One colour per category, in category order; unknown labels fall back to grey
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',      # legend is drawn as a separate figure below
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        # Strip axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    fig1.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure
# ============================================
fig2, ax_legend = plt.subplots(figsize=(6, 0.5))
ax_legend.axis('off')

# One marker-only handle (filled dot, thin black edge) per category colour
handles = [
    Line2D(
        [0], [0],
        marker='o',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Horizontal legend: every category in a single row
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=len(cats),
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5
)

# White stroke behind the legend text for readability
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE show(): once show() has released the figure, pyplot-level savefig
# would write a new, empty canvas. Saving via fig2 also makes the target explicit.
# NOTE(review): only the legend figure is written to disk here, as in the original
# call order; the UMAP (fig1) is displayed but not saved — confirm that is intended.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Broad_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patheffects, rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and the per-cell Simplified labels are read from protein row attributes.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Averaged.Simplified.Celltype'])

# X is only a carrier; the plot below reads obs['clusters'] and obsm['X_umap'].
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # categorical so colours map per category

# --- Custom palette (keys must match the label names) ---
custom_palette = {
    'HSPC': '#0079ea',
    'cDC': "#16D2E3",
    'pDC': "#69FFCB",
    'Monocyte': '#D27CE3',
    "Myeloid": '#473B76',
    'Other_T': "#EDB416",
    'Erythroid': "#F30A1A",
    'CD4_T': '#C99546',
    'CD8_T': "#6B3317",
    'NK': "#FBEF0D",
    'B': '#4CAF50',
    "Plasma": "#9DC012",
    'Small': "#292929"
}
cats = list(pbmc.obs['clusters'].cat.categories)
# One colour per category, in category order; unknown labels fall back to grey
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',      # legend is drawn as a separate figure below
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        # Strip axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    fig1.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
fig2, ax_legend = plt.subplots(figsize=(8, 1.2))
ax_legend.axis('off')

# One marker-only handle (filled dot, thin black edge) per category colour
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',            # line colour; no line is drawn (linestyle='None')
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend laid out in 4 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=4,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=0.8
)

# White stroke behind the legend text for readability
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE show(): once show() has released the figure, pyplot-level savefig
# would write a new, empty canvas. Saving via fig2 also makes the target explicit.
# NOTE(review): only the legend figure is written to disk here, as in the original
# call order; the UMAP (fig1) is displayed but not saved — confirm that is intended.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Simplified_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patheffects, rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and the per-cell Detailed labels are read from protein row attributes.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Averaged.Detailed.Celltype'])

# X is only a carrier; the plot below reads obs['clusters'] and obsm['X_umap'].
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # categorical so colours map per category

# --- Custom palette (keys must match the label names) ---
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'CD4_CTL': "#645846",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'MEP': "#FF6B9D",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pre-B': "#4CAF50",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    'GdT': "#EDB416",
    'pDC': "#69FFCB",
    'Treg': "#43401F"
}

cats = list(pbmc.obs['clusters'].cat.categories)
# One colour per category, in category order; unknown labels fall back to grey
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',      # legend is drawn as a separate figure below
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        # Strip axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    fig1.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One marker-only handle (filled dot, thin black edge) per category colour
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',            # line colour; no line is drawn (linestyle='None')
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend laid out in 3 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White stroke behind the legend text for readability
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE show(): once show() has released the figure, pyplot-level savefig
# would write a new, empty canvas. Saving via fig2 also makes the target explicit.
# NOTE(review): only the legend figure is written to disk here, as in the original
# call order; the UMAP (fig1) is displayed but not saved — confirm that is intended.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Detailed_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Mosaic signature map of the 'Normalized_reads' attribute, split by the
# Averaged.Detailed.Celltype annotation. Kept as the cell's last expression so
# whatever the call returns is rendered by the notebook.
PBMC_HD01.protein.signaturemap(
    attribute="Normalized_reads",
    splitby="Averaged.Detailed.Celltype")

### Comparison to MBio Truth annotator

In [None]:
# Reset Python's and NumPy's global RNG seeds immediately before normalising.
random.seed(42)
np.random.seed(42)
# 'NSP' is the method name handed to Mosaic's normalize_reads.
# NOTE(review): which layer this call writes is not visible here — confirm in the Mosaic docs.
PBMC_HD01.protein.normalize_reads('NSP')

In [None]:
import os
import plotly.io as pio
from missionbio.demultiplex.protein.truth import Truth

truth = Truth.builtin()

# Output directory. NOTE: this rebinds the notebook-wide `figures_path`, so every
# later cell that saves into `figures_path` writes to this folder.
figures_path = os.path.join(parent_dir, "Figures", "Model_Independent_Testing")
os.makedirs(figures_path, exist_ok=True)

out_png = os.path.join(figures_path, "Mosaic_Truth_Reference_Annotation.png")
out_html = os.path.join(figures_path, "Mosaic_Truth_Reference_Annotation.html")

# Generate plot
fig = truth.plot()

# plotly.io has no API for recovering "the last rendered figure" (the former
# fallback called a non-existent pio._orca.get_last_figure and died with an
# AttributeError), so a missing return value is reported directly.
if fig is None:
    raise RuntimeError("Could not retrieve Plotly figure from truth.plot()")

# Interactive HTML first: it needs no extra dependency, so it is still produced
# when the static export below fails.
pio.write_html(fig, out_html)

# Static image (PNG) — requires kaleido
pio.write_image(fig, out_png, width=1200, height=900, scale=2)


In [None]:
# Reset Python's and NumPy's global RNG seeds immediately before clustering/labelling.
random.seed(42)
np.random.seed(42)

# Run Mosaic's cluster_and_label on the protein assay; the return value is kept in `pace`.
# NOTE(review): later cells read the 'label' row attribute — presumably it is written by
# this call; confirm against the Mosaic documentation.
pace = PBMC_HD01.protein.cluster_and_label(
    max_adjusted_mixing=0.3,  # This parameter controls which mixed clusters would be labelled "Mixed Like"
    min_distance_for_doublet=5,  # Increase this if too many mixed cells are observed.
    cluster=True,  # If True, graph-community clustering is run, otherwise the existing cluster labels are used
)

In [None]:
import os
import pandas as pd
import numpy as np

# ------------------------------------------------------------------
# Folder holding one TSV per synced annotation (created if missing)
data_path = os.path.join(parent_dir, "Data", "Annotation_Assignments")
os.makedirs(data_path, exist_ok=True)

# ------------------------------------------------------------------
# Row attributes to sync: the 'label' attribute first, then the three
# Averaged.* annotation levels
annotation_keys = ["label"]
annotation_keys.extend([
    "Averaged.Broad.Celltype",
    "Averaged.Simplified.Celltype",
    "Averaged.Detailed.Celltype",
])

# ------------------------------------------------------------------
def sync_row_attr(obj, attr_key, out_dir):
    """Synchronise one protein row attribute with a TSV file on disk.

    If ``<out_dir>/<attr_key>.tsv`` exists, its single data column is loaded as a
    NumPy array of strings and assigned to ``obj.protein.row_attrs[attr_key]``.
    Otherwise the current value of that row attribute is written to the file.

    The file stores a positional index only (0..n-1); values are matched to cells
    purely by row order, so the object must hold the same cells in the same order
    as when the file was written.

    Parameters
    ----------
    obj : object
        Anything exposing a dict-like ``obj.protein.row_attrs``.
    attr_key : str
        Row-attribute name; also used as the file stem and column header.
    out_dir : str
        Directory containing (or receiving) the TSV file.

    Raises
    ------
    ValueError
        If the file does not hold exactly one data column, or if its number of
        rows differs from the length of the attribute already on ``obj``.
    KeyError
        If the file is absent and ``attr_key`` is not present in the row attributes.
    """
    out_file = os.path.join(out_dir, f"{attr_key}.tsv")

    if os.path.exists(out_file):
        # -------- LOAD --------
        df = pd.read_csv(out_file, sep="\t", index_col=0)

        if df.shape[1] != 1:
            raise ValueError(f"{out_file} must contain exactly one column")

        # Values are stored on the object as a NumPy array of strings
        arr = df.iloc[:, 0].astype(str).to_numpy()

        # Rows are matched by position only, so a length mismatch against the
        # attribute already on the object means the file belongs to another cell set.
        if attr_key in obj.protein.row_attrs:
            n_expected = len(obj.protein.row_attrs[attr_key])
            if len(arr) != n_expected:
                raise ValueError(
                    f"{out_file} has {len(arr)} rows but row attribute "
                    f"'{attr_key}' has {n_expected}; refusing to overwrite"
                )

        obj.protein.row_attrs[attr_key] = arr
        print(f"[LOAD] {attr_key} -> numpy array {arr.dtype}")

    else:
        # -------- SAVE --------
        arr = np.asarray(obj.protein.row_attrs[attr_key])

        # Written with the default positional index (no cell identifiers)
        pd.DataFrame({attr_key: arr}).to_csv(out_file, sep="\t")

        print(f"[SAVE] {attr_key} <- numpy array {arr.dtype}")

# ------------------------------------------------------------------
# Sync every configured row attribute of PBMC_HD01 against data_path
for annotation_key in annotation_keys:
    sync_row_attr(PBMC_HD01, annotation_key, data_path)


In [None]:
# Colour per Mosaic Truth label, registered on the protein assay via set_palette.
# NOTE(review): 'CD8+ T cell' and 'Terminal Memory CD8+ T cell' share one colour,
# so the two groups cannot be told apart in plots — confirm this is intended.
custom_palette = {
    "B cell":                      "#1C511D",
    "IgM memory B cell":           "#68D827",
    "Monocyte":                    "#D27CE3",
    "Terminal Memory CD4+ T cell": "#C1AF93",
    "CD4+ T cell":                 "#C99546",
    "CD8+ T cell":                 "#6B3317",
    "Terminal Memory CD8+ T cell": "#6B3317",
    "T cell":                      "#645846",
    "Proerythroblast":             "#D1235A",
    "NK T cell":                   "#F3AC1F",
    "CD16+ NK cell":               "#FBEF0D",
    "Unassigned-3":                "#292929",
    "Unassigned-2":                "#3D0B0B",
    "Dendritic":                   "#16D2E3",
    "Memory T cell":               "#EDB416",
    "Plasmacytoid dendritic":      "#69FFCB",
    "Unassigned-1":                "#43401F",
}

PBMC_HD01.protein.set_palette(custom_palette)

In [None]:
# Order barcodes with Mosaic's clustered_barcodes (no sub-clustering) and draw the
# heatmap of the "normalized_counts" layer in that order.
# NOTE(review): this cell reads "normalized_counts" while the other cells read
# 'Normalized_reads' — presumably the former is the layer produced by
# normalize_reads('NSP'); confirm.
bars = PBMC_HD01.protein.clustered_barcodes("normalized_counts", subcluster=False)
fig = PBMC_HD01.protein.heatmap("normalized_counts", bars_order=bars)
fig.layout.width = 1000
# Last expression of the cell, so the figure is rendered by the notebook
fig

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import patheffects, rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# UMAP coordinates and the per-cell 'label' annotation are read from protein row attributes.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['label'])

# X is only a carrier; the plot below reads obs['clusters'] and obsm['X_umap'].
pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # categorical so colours map per category

# --- Custom palette (keys must match the label names) ---
# NOTE(review): 'CD8+ T cell' and 'Terminal Memory CD8+ T cell' share one colour,
# so the two groups cannot be told apart on the UMAP — confirm this is intended.
custom_palette = {
    'B cell': "#1C511D",
    'IgM memory B cell': '#68D827',
    'Monocyte': "#D27CE3",
    'Terminal Memory CD4+ T cell': "#C1AF93",
    'CD4+ T cell': "#C99546",
    'CD8+ T cell': "#6B3317",
    'Terminal Memory CD8+ T cell': "#6B3317",
    'T cell': "#645846",
    'Proerythroblast': "#D1235A",
    'NK T cell': "#F3AC1F",
    'CD16+ NK cell': "#FBEF0D",
    'Unassigned-3': "#292929",
    'Unassigned-2': "#3D0B0B",
    'Dendritic': "#16D2E3",
    'Memory T cell': "#EDB416",
    'Plasmacytoid dendritic': "#69FFCB",
    'Unassigned-1': "#43401F"
}

cats = list(pbmc.obs['clusters'].cat.categories)
# One colour per category, in category order; unknown labels fall back to grey
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        add_outline=True,
        palette=palette,
        legend_loc='none',      # legend is drawn as a separate figure below
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        # Strip axis labels and ticks
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    fig1.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One marker-only handle (filled dot, thin black edge) per category colour
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',            # line colour; no line is drawn (linestyle='None')
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend laid out in 3 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White stroke behind the legend text for readability
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE show(): once show() has released the figure, pyplot-level savefig
# would write a new, empty canvas. Saving via fig2 also makes the target explicit.
# NOTE(review): only the legend figure is written to disk here, as in the original
# call order; the UMAP (fig1) is displayed but not saved — confirm that is intended.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Mosaic_Truth_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

# Per-cell grouping variable: the 'label' row attribute
label_annotation = np.asarray(PBMC_HD01.protein.row_attrs['label'])

# Expression matrix, densified when stored sparse
expr_data = PBMC_HD01.protein.layers['Normalized_reads']
if sparse.issparse(expr_data):
    expr_data = expr_data.toarray()

# Marker names are the column labels of the labelled attribute frame
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
marker_names = df_protein.columns.tolist()

# Cells x markers frame with the grouping column appended
df = pd.DataFrame(expr_data, columns=marker_names)
df['label'] = label_annotation

# Median of every marker within every cell type
median_expr = df.groupby('label')[marker_names].median()

print(f"Celltypes: {len(median_expr)}")
print(f"Markers: {len(marker_names)}")
print(f"\nMedian expression matrix shape: {median_expr.shape}")

# ----------------------------------------
# Cluster celltypes (rows/y-axis)
# ----------------------------------------
# Best effort: if clustering cannot be computed, keep the groupby order — but
# report why, and do not trap KeyboardInterrupt/SystemExit as a bare `except:` would.
try:
    celltype_dist = pdist(median_expr.values, metric='euclidean')
    celltype_linkage = linkage(celltype_dist, method='ward')
    celltype_order = leaves_list(celltype_linkage)
    celltypes_sorted = median_expr.index[celltype_order].tolist()
    print("\nCelltypes clustered successfully")
except Exception as err:
    celltypes_sorted = median_expr.index.tolist()
    print(f"\nCelltype clustering failed ({type(err).__name__}: {err}), using original order")

# ----------------------------------------
# Keep markers in original order (no clustering)
# ----------------------------------------
markers_sorted = marker_names
print("Markers kept in original order (not clustered)")

# Reorder the matrix
median_expr_sorted = median_expr.loc[celltypes_sorted, markers_sorted]

# ----------------------------------------
# Create heatmap with square cells
# ----------------------------------------
# Figure height follows the markers/celltypes ratio so that cells come out square
n_celltypes = len(celltypes_sorted)
n_markers = len(markers_sorted)
aspect_ratio = n_markers / n_celltypes

fig, ax = plt.subplots(figsize=(14, 14 / aspect_ratio))

# Plot with black gridlines and square cells
sns.heatmap(
    median_expr_sorted,
    cmap='magma',
    cbar_kws={'label': 'Median Normalized Reads', 'orientation': 'horizontal',
              'shrink': 0.35, 'aspect': 30, 'pad': 0.08},
    xticklabels=markers_sorted,
    yticklabels=celltypes_sorted,
    linewidths=1,           # Width of grid lines
    linecolor='black',      # Black grid lines
    square=True,            # Make cells square
    ax=ax
)

# Move x-axis labels to top
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

# Move y-axis labels to right
ax.yaxis.tick_right()
ax.yaxis.set_label_position('right')

# Move colorbar to bottom
cbar = ax.collections[0].colorbar
cbar.ax.set_position([0.25, 0, 0.5, 0.02])  # [x, y, width, height]

ax.set_xlabel('', fontsize=12, fontweight='bold')
ax.set_ylabel('', fontsize=12, fontweight='bold')
ax.set_title('Median Expression - Mosaic Truth Method',
             fontsize=14, fontweight='bold', pad=40)  # extra padding: x tick labels sit on top
# Style the tick labels on the heatmap axes explicitly instead of pyplot's current axes
plt.setp(ax.get_xticklabels(), rotation=90, ha='left', fontsize=9)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=7)

fig.tight_layout()
plt.show()

# Print summary statistics
print("\n" + "="*70)
print("EXPRESSION SUMMARY")
print("="*70)

# Overall statistics
print(f"\nOverall median expression range: {median_expr_sorted.min().min():.2f} - {median_expr_sorted.max().max():.2f}")
print(f"Mean of medians: {median_expr_sorted.values.mean():.2f}")

# Top markers for each celltype
print("\nTop 3 markers per cell type:")
print("-"*70)
for celltype in celltypes_sorted:
    top_markers = median_expr_sorted.loc[celltype].nlargest(3)
    print(f"\n{celltype}:")
    for marker, value in top_markers.items():
        print(f"  {marker}: {value:.2f}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from scipy.cluster.hierarchy import linkage, leaves_list
from scipy.spatial.distance import pdist

# Per-cell grouping variable: the 'Averaged.Detailed.Celltype' row attribute
label_annotation = np.asarray(PBMC_HD01.protein.row_attrs['Averaged.Detailed.Celltype'])

# Expression matrix, densified when stored sparse
expr_data = PBMC_HD01.protein.layers['Normalized_reads']
if sparse.issparse(expr_data):
    expr_data = expr_data.toarray()

# Marker names are the column labels of the labelled attribute frame
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
marker_names = df_protein.columns.tolist()

# Cells x markers frame with the grouping column appended
df = pd.DataFrame(expr_data, columns=marker_names)
df['label'] = label_annotation

# Median of every marker within every cell type
median_expr = df.groupby('label')[marker_names].median()

print(f"Celltypes: {len(median_expr)}")
print(f"Markers: {len(marker_names)}")
print(f"\nMedian expression matrix shape: {median_expr.shape}")

# ----------------------------------------
# Cluster celltypes (rows/y-axis)
# ----------------------------------------
# Best effort: if clustering cannot be computed, keep the groupby order — but
# report why, and do not trap KeyboardInterrupt/SystemExit as a bare `except:` would.
try:
    celltype_dist = pdist(median_expr.values, metric='euclidean')
    celltype_linkage = linkage(celltype_dist, method='ward')
    celltype_order = leaves_list(celltype_linkage)
    celltypes_sorted = median_expr.index[celltype_order].tolist()
    print("\nCelltypes clustered successfully")
except Exception as err:
    celltypes_sorted = median_expr.index.tolist()
    print(f"\nCelltype clustering failed ({type(err).__name__}: {err}), using original order")

# ----------------------------------------
# Keep markers in original order (no clustering)
# ----------------------------------------
markers_sorted = marker_names
print("Markers kept in original order (not clustered)")

# Reorder the matrix
median_expr_sorted = median_expr.loc[celltypes_sorted, markers_sorted]

# ----------------------------------------
# Create heatmap with square cells
# ----------------------------------------
# Figure height follows the markers/celltypes ratio so that cells come out square
n_celltypes = len(celltypes_sorted)
n_markers = len(markers_sorted)
aspect_ratio = n_markers / n_celltypes

fig, ax = plt.subplots(figsize=(14, 14 / aspect_ratio))

# Plot with black gridlines and square cells
sns.heatmap(
    median_expr_sorted,
    cmap='magma',
    cbar_kws={'label': 'Median Normalized Reads', 'orientation': 'horizontal',
              'shrink': 0.35, 'aspect': 30, 'pad': 0.08},
    xticklabels=markers_sorted,
    yticklabels=celltypes_sorted,
    linewidths=1,           # Width of grid lines
    linecolor='black',      # Black grid lines
    square=True,            # Make cells square
    ax=ax
)

# Move x-axis labels to top
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

# Move y-axis labels to right
ax.yaxis.tick_right()
ax.yaxis.set_label_position('right')

# Move colorbar to bottom
cbar = ax.collections[0].colorbar
cbar.ax.set_position([0.25, 0, 0.5, 0.02])  # [x, y, width, height]

ax.set_xlabel('', fontsize=12, fontweight='bold')
ax.set_ylabel('', fontsize=12, fontweight='bold')
# The title names the annotation actually used for grouping in this cell (it
# previously carried the "Mosaic Truth Method" title copied from the 'label' heatmap).
ax.set_title('Median Expression - Averaged.Detailed.Celltype',
             fontsize=14, fontweight='bold', pad=40)  # extra padding: x tick labels sit on top
# Style the tick labels on the heatmap axes explicitly instead of pyplot's current axes
plt.setp(ax.get_xticklabels(), rotation=90, ha='left', fontsize=9)
plt.setp(ax.get_yticklabels(), rotation=0, fontsize=7)

fig.tight_layout()
plt.show()

# Print summary statistics
print("\n" + "="*70)
print("EXPRESSION SUMMARY")
print("="*70)

# Overall statistics
print(f"\nOverall median expression range: {median_expr_sorted.min().min():.2f} - {median_expr_sorted.max().max():.2f}")
print(f"Mean of medians: {median_expr_sorted.values.mean():.2f}")

# Top markers for each celltype
print("\nTop 3 markers per cell type:")
print("-"*70)
for celltype in celltypes_sorted:
    top_markers = median_expr_sorted.loc[celltype].nlargest(3)
    print(f"\n{celltype}:")
    for marker, value in top_markers.items():
        print(f"  {marker}: {value:.2f}")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The two per-cell annotations being compared
label_annotation = PBMC_HD01.protein.row_attrs['label']
detailed_annotation = PBMC_HD01.protein.row_attrs['Averaged.Detailed.Celltype']

# One row per cell, one column per annotation
df_compare = pd.DataFrame({
    'label': label_annotation,
    'Averaged.Detailed.Celltype': detailed_annotation
})

# Cell counts for every (label, detailed) pair
confusion_matrix = pd.crosstab(
    df_compare['label'],
    df_compare['Averaged.Detailed.Celltype'],
)

# Row-normalise: percentage of each 'label' category landing in each detailed type
row_totals = confusion_matrix.sum(axis=1)
confusion_matrix_norm = confusion_matrix.div(row_totals, axis=0).mul(100)

# Most abundant categories first on both axes
row_order = row_totals.sort_values(ascending=False).index
col_order = confusion_matrix.sum(axis=0).sort_values(ascending=False).index

confusion_matrix_norm_sorted = confusion_matrix_norm.loc[row_order, col_order]

# Annotated heatmap of the row percentages on a fixed 0-100 colour scale
fig, ax = plt.subplots(figsize=(14, 10))

heatmap_options = dict(
    annot=True,
    fmt='.1f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Percentage (%)'},
    linewidths=0.5,
    linecolor='gray',
    vmin=0,
    vmax=100,
)
sns.heatmap(confusion_matrix_norm_sorted, ax=ax, **heatmap_options)

ax.set_xlabel('Averaged.Detailed.Celltype', fontsize=12, fontweight='bold')
ax.set_ylabel('Label', fontsize=12, fontweight='bold')
ax.set_title('Agreement Heatmap: Label vs Averaged.Detailed.Celltype\n(% of each Label row)', 
             fontsize=14, fontweight='bold', pad=20)

# Tick label orientation
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.yticks(rotation=0, fontsize=9)

plt.tight_layout()
plt.show()

# Text summary
banner = "=" * 70
print("\n" + banner)
print("AGREEMENT SUMMARY")
print(banner)

# The two annotations use different category names, so only the cell total and
# the per-label mappings are reported (no exact-match agreement rate)
total_cells = len(df_compare)
print(f"\nTotal cells: {total_cells}")

# Up to three largest non-zero target categories for every 'label' category
print("\nTop mappings for each 'label' category:")
print("-" * 70)
for truth_label in row_order:
    pct_row = confusion_matrix_norm_sorted.loc[truth_label]
    group_size = confusion_matrix.loc[truth_label].sum()
    print(f"\n{truth_label} (n={group_size}):")
    for target_label, share in pct_row.nlargest(3).items():
        if not share > 0:
            continue
        hits = confusion_matrix.loc[truth_label, target_label]
        print(f"  → {target_label}: {share:.1f}% ({hits} cells)")

print("\n" + banner)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from scipy.cluster.hierarchy import linkage, leaves_list, dendrogram
from scipy.spatial.distance import pdist

# Per-cell annotations to compare
protein_assay = PBMC_HD01.protein
label_annotation = np.asarray(protein_assay.row_attrs['label'])
detailed_annotation = np.asarray(protein_assay.row_attrs['Averaged.Detailed.Celltype'])

# Expression matrix, densified when stored as a scipy sparse matrix
expr_data = protein_assay.layers['Normalized_reads']
expr_data = expr_data.toarray() if sparse.issparse(expr_data) else expr_data

# Antibody names are taken from the labelled DataFrame view of the same layer
df_protein = protein_assay.get_attribute('Normalized_reads', constraint='row+col')
gene_names = df_protein.columns.tolist()

# One table holding expression values plus both annotation columns
df_full = pd.DataFrame(expr_data, columns=gene_names).assign(**{
    'label': label_annotation,
    'Averaged.Detailed.Celltype': detailed_annotation,
})

# Label categories, most frequent first
label_counts = pd.Series(label_annotation).value_counts()
unique_labels = label_counts.index.tolist()

print(f"Found {len(unique_labels)} unique label categories")
print(f"Total cells: {len(label_annotation)}")

# ============================================
# Create paired heatmaps for each label
# ============================================
# For every `label` category with at least 5 cells, one figure is drawn with:
#   left   : cells ordered by Ward clustering
#   middle : the same cells grouped by 'Averaged.Detailed.Celltype'
#   right  : the antibody dendrogram shared by both heatmaps
#   bottom : a horizontal colour bar for the shared colour scale
from matplotlib import cm
from matplotlib.colors import Normalize


def _ward_leaf_order(matrix):
    """Ward-cluster the rows of ``matrix`` and return ``(leaf_order, linkage_matrix)``.

    Clustering here is a best-effort display ordering: if it fails (for example
    because there are fewer than two rows), ``(None, None)`` is returned so the
    caller can fall back to the input order. Only ``Exception`` is caught, so a
    KeyboardInterrupt still stops a long-running loop.
    """
    try:
        link = linkage(pdist(matrix, metric='euclidean'), method='ward')
        return leaves_list(link), link
    except Exception:
        return None, None


for label_cat in unique_labels:
    # Filter cells belonging to this label
    mask = df_full['label'] == label_cat
    df_subset = df_full[mask].copy()

    n_cells = mask.sum()
    if n_cells < 5:  # Skip if too few cells
        print(f"Skipping {label_cat} (only {n_cells} cells)")
        continue

    print(f"\nProcessing {label_cat} ({n_cells} cells)...")

    # No z-score normalization - use raw normalized reads
    expr_data_subset = df_subset[gene_names].values

    # Shared colour scale for both heatmaps and the colour bar (capped at the 99th percentile)
    vmax = np.percentile(expr_data_subset, 99)
    heatmap_style = dict(cmap='magma', vmin=0, vmax=vmax, xticklabels=False, cbar=False)

    # ----------------------------------------
    # Cluster genes (markers) based on expression patterns
    # ----------------------------------------
    gene_order, gene_linkage = _ward_leaf_order(expr_data_subset.T)
    if gene_order is None:
        genes_sorted = gene_names
        expr_genes_sorted = expr_data_subset
    else:
        genes_sorted = [gene_names[i] for i in gene_order]
        expr_genes_sorted = expr_data_subset[:, gene_order]

    # Create figure with two heatmaps and right dendrogram
    fig = plt.figure(figsize=(18, 10))
    gs = fig.add_gridspec(2, 3, height_ratios=[5, 0.5], width_ratios=[5, 5, 0.5],
                          hspace=0.25, wspace=0.1)

    # ----------------------------------------
    # LEFT PANEL: Heatmap only (no dendrogram), cells ordered by clustering
    # ----------------------------------------
    ax_heat_left = fig.add_subplot(gs[0, 0])
    cell_order, _ = _ward_leaf_order(expr_genes_sorted)
    expr_sorted = expr_genes_sorted if cell_order is None else expr_genes_sorted[cell_order, :]

    sns.heatmap(
        expr_sorted.T,
        yticklabels=genes_sorted,
        ax=ax_heat_left,
        **heatmap_style,
    )
    ax_heat_left.set_title(f'{label_cat}',
                           fontsize=12, fontweight='bold')
    ax_heat_left.set_xlabel('Cells', fontsize=10)
    ax_heat_left.set_ylabel('Antibodies', fontsize=10)

    # ----------------------------------------
    # MIDDLE: Heatmap grouped by detailed celltype (no colorbar)
    # ----------------------------------------
    ax_heat_right = fig.add_subplot(gs[0, 1])

    detailed_cats = df_subset['Averaged.Detailed.Celltype'].values
    detailed_unique = pd.Series(detailed_cats).value_counts().index.tolist()

    # Sort cells by detailed celltype (most frequent first), clustering within each group
    detailed_order = []
    detailed_boundaries = []
    detailed_labels_for_x = []
    start_pos = 0

    for det_cat in detailed_unique:
        det_indices = np.where(detailed_cats == det_cat)[0]
        if len(det_indices) == 0:
            continue

        sorted_indices = det_indices
        if len(det_indices) > 2:
            sub_order, _ = _ward_leaf_order(expr_genes_sorted[det_indices, :])
            if sub_order is not None:
                sorted_indices = det_indices[sub_order]

        detailed_order.extend(sorted_indices)
        mid_pos = start_pos + len(sorted_indices) / 2
        detailed_boundaries.append(start_pos + len(sorted_indices))
        detailed_labels_for_x.append((mid_pos, det_cat))
        start_pos += len(sorted_indices)

    expr_detailed = expr_genes_sorted[detailed_order, :]

    sns.heatmap(
        expr_detailed.T,
        yticklabels=False,  # Don't repeat gene names
        ax=ax_heat_right,
        **heatmap_style,
    )
    ax_heat_right.set_title(f'{label_cat}',
                            fontsize=12, fontweight='bold')
    ax_heat_right.set_xlabel('Cells', fontsize=10)

    # Add vertical lines to separate detailed celltypes
    for boundary in detailed_boundaries[:-1]:  # Don't add line at the end
        ax_heat_right.axvline(x=boundary, color='white', linewidth=2)

    # Add detailed celltype labels at bottom (90 degrees)
    for mid_pos, cat in detailed_labels_for_x:
        ax_heat_right.text(mid_pos, len(genes_sorted) + 1, cat,
                           ha='center', va='top', fontsize=9, rotation=90)

    # ----------------------------------------
    # RIGHT: Gene dendrogram
    # ----------------------------------------
    ax_dend_right = fig.add_subplot(gs[0, 2])
    if gene_linkage is not None:
        dendrogram(gene_linkage, ax=ax_dend_right, orientation='right',
                   no_labels=True, above_threshold_color='black',
                   color_threshold=0)
        # scipy places the first leaf at the bottom, whereas the heatmap rows run
        # top-to-bottom; flip the axis so each branch lines up with its antibody row.
        ax_dend_right.invert_yaxis()
        ax_dend_right.set_xticks([])
        ax_dend_right.set_yticks([])
        for side in ('top', 'right', 'left', 'bottom'):
            ax_dend_right.spines[side].set_visible(False)
    else:
        ax_dend_right.axis('off')

    # ----------------------------------------
    # BOTTOM LEFT: Colorbar under first plot (half height)
    # ----------------------------------------
    ax_cbar = fig.add_subplot(gs[1, 0])
    sm = cm.ScalarMappable(cmap='magma', norm=Normalize(vmin=0, vmax=vmax))
    sm.set_array([])

    cbar = fig.colorbar(sm, cax=ax_cbar, orientation='horizontal')
    cbar.set_label('Normalized Reads', fontsize=10)

    fig.suptitle('Expression Comparison: Original Label vs Detailed Celltype',
                 fontsize=14, fontweight='bold', y=0.95)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # one figure per label: release it before the next iteration

    # Print breakdown
    print("  Detailed celltype breakdown:")
    for mid_pos, det_cat in detailed_labels_for_x:
        count = (detailed_cats == det_cat).sum()
        pct = 100 * count / n_cells
        print(f"    {det_cat}: {count} cells ({pct:.1f}%)")

print("\nDone!")

### Propagating annotations

In [None]:
# Replace the 'label' row attribute with a shallow copy of 'Averaged.Detailed.Celltype'.
# NOTE(review): this overwrites the original 'label' values in place, so the comparison cells
# above that read row_attrs['label'] would compare the annotation with itself if re-run after
# this cell — presumably intended for a single top-to-bottom run; confirm.
PBMC_HD01.protein.row_attrs['label'] = copy.copy(PBMC_HD01.protein.row_attrs['Averaged.Detailed.Celltype'])

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# Purpose: draw the UMAP coloured by the 'Clusters' row attribute and a separate legend-only
# figure; both figures are saved as PNG files in `figures_path` and displayed.

# --------- CONFIG ---------
# Assumes you already have:
#   PBMC_HD01 (MissionBio Sample)
#   figures_path (output directory)
os.makedirs(figures_path, exist_ok=True)

# Figure size in inches, shared by the UMAP and the legend figure
FIGSIZE = (3.35, 3.35)

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs["umap"])
labels = np.asarray(PBMC_HD01.protein.row_attrs["Clusters"], dtype=object)

df_protein = PBMC_HD01.protein.get_attribute("Normalized_reads", constraint="row+col")

pbmc = AnnData(X=np.asarray(df_protein.values))
pbmc.obs_names = df_protein.index.astype(str)
pbmc.var_names = df_protein.columns.astype(str)
pbmc.obsm["X_umap"] = coords
pbmc.obs["Clusters"] = pd.Categorical(labels.astype(str))  # ensure categorical + str

# Category names, in categorical order; reused as legend labels below
cats = list(pbmc.obs["Clusters"].cat.categories)

# --------- COLORS: let Scanpy generate a consistent categorical palette ---------
# This throwaway plot is made only so that pbmc.uns["Clusters_colors"] exists;
# the figure itself is closed immediately.
tmp = sc.pl.umap(pbmc, color="Clusters", show=False, return_fig=True)
plt.close(tmp)
palette = list(pbmc.uns["Clusters_colors"])

# ============================================
# PLOT 1: UMAP without legend (colored) — SAVE + RENDER
# ============================================
fig1 = sc.pl.umap(
    pbmc,
    color="Clusters",
    add_outline=True,
    legend_loc="none",
    frameon=False,
    title="",
    size=50,
    alpha=0.9,
    palette=palette,
    return_fig=True,
    show=False,
)

# Strip axis labels and ticks from every axes of the UMAP figure
for ax in fig1.axes:
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_xticks([])
    ax.set_yticks([])

fig1.set_size_inches(*FIGSIZE)
fig1.tight_layout()
# Save before plt.show() so the file is written while the figure is still open
fig1.savefig(
    os.path.join(figures_path, "PBMC_HD01_Clusters_umap.png"),
    dpi=300,
    bbox_inches="tight",
)
plt.show()
plt.close(fig1)  # release the figure now that it has been saved and shown

# ============================================
# PLOT 2: Separate legend figure with DOTS — RENDER + SAVE
# ============================================
fig2, ax_legend = plt.subplots(figsize=FIGSIZE)
ax_legend.axis("off")

# One round marker per palette colour; paired with `cats` by position in the legend call
handles = [
    Line2D(
        [0], [0],
        marker="o",
        linestyle="None",
        markerfacecolor=color,
        markeredgecolor="black",
        markeredgewidth=0.5,
        color="w",
        markersize=9,
    )
    for color in palette
]

legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc="center",
    ncol=6,
    fontsize=8,
    frameon=False,
    columnspacing=1.2,
    labelspacing=0.9,
    handletextpad=0.6,
)

fig2.tight_layout()
fig2.savefig(
    os.path.join(figures_path, "PBMC_HD01_Clusters_legend.png"),
    dpi=300,
    bbox_inches="tight",
)

plt.show()
plt.close(fig2)

# Report the two output paths
print(
    "Saved:\n"
    f"- {os.path.join(figures_path, 'PBMC_HD01_Clusters_umap.png')}\n"
    f"- {os.path.join(figures_path, 'PBMC_HD01_Clusters_legend.png')}"
)


In [None]:
# Run espressopro's cluster-identity suggestion using the per-cell 'Averaged.Detailed.Celltype'
# annotation and the 'Clusters' grouping. The call rebinds PBMC_HD01 and also returns
# `summary` and `pivot`.
# NOTE(review): later cells read row_attrs['annotated_clusters'], presumably written by this call
# because rewrite=True; the meaning of dominance_threshold=0.35 is not visible here — confirm
# both against the espressopro documentation.
PBMC_HD01, summary, pivot = ep.suggest_cluster_celltype_identity(
    sample=PBMC_HD01,
    dominance_threshold=0.35,
    annotation_col="Averaged.Detailed.Celltype",
    cluster_col="Clusters",
    rewrite=True,
    verbose=True
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['annotated_clusters'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical so Scanpy treats it as discrete groups

# --- Fixed colour per cell-type label; labels missing from this table fall back to grey ---
custom_palette = {
    # B lineage
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    # Monocytes
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    # T cells
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'CD4_CTL': "#645846",
    # Remaining labels, in the order they were originally listed
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'MEP': "#FF6B9D",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pre-B': "#4CAF50",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    'GdT': "#EDB416",
    'pDC': "#69FFCB",
    'Treg': "#43401F",
}

cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
umap_style = dict(
    color="clusters",
    add_outline=True,
    palette=palette,
    legend_loc='none',  # the legend is drawn as its own figure
    frameon=False,
    title="",
    size=50,
    alpha=0.9,
    return_fig=True,
    show=False,
)

with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(pbmc, **umap_style)

    for ax in fig1.axes:
        # Bold title, and no axis labels or ticks
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Draws a legend-only figure (one dot per category in `cats`, coloured by `palette`)
# and writes it to `figures_path`.
import matplotlib.patheffects as path_effects  # explicit import instead of relying on plt.matplotlib

fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# Legend handles: circular markers only, no connecting line
handles = [
    Line2D(
        [0], [0],
        marker='o',           # Circle marker
        color='w',            # Line color (invisible)
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,        # Size of the dot
        linestyle='None'      # No line, just the marker
    )
    for color in palette
]

# Grid legend in 3 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White outline around the legend text for better visibility
for text in legend.get_texts():
    text.set_path_effects([
        path_effects.Stroke(linewidth=2, foreground='white'),
        path_effects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown it may already be closed
# (the notebook inline backend does this), and plt.savefig would then write a new, empty canvas.
# NOTE(review): as in the original call order, this file holds the legend figure only;
# the UMAP drawn above is not written to disk in this cell — confirm that is intended.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Propagated_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig2)

In [None]:
# -*- coding: utf-8 -*-
"""
FULL script: block-ordered Scanpy heatmap where ONLY the Y categories (gene blocks)
can be inverted, while the X order of clusters is locked.

- X (columns): clusters in a fixed order (dendrogram-derived once, then frozen)
- Y (rows): genes grouped by the cluster in which they peak (z-scored means);
            you can invert JUST these Y blocks without touching X.
"""

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse

from anndata import AnnData  # used below; do not rely on an earlier cell having imported it

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['annotated_clusters'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # categorical grouping column

# ------------------ CONFIG ------------------
groupby = "clusters"     # column in pbmc.obs with cluster labels
use_raw = False          # True -> use pbmc.raw if present
genes_subset = None      # e.g. ['CD3D','CD3E', ...]; None = all genes

WITHIN_DESC = True       # sort genes inside each block by descending expression
INVERT_Y_BLOCKS = True   # flip ONLY the Y category order (top↔bottom)

cmap = "magma"
fig_h = 9
# -------------------------------------------

# 0) Ensure categorical; get a sensible cluster order once
if not pbmc.obs[groupby].dtype.name.startswith("category"):
    pbmc.obs[groupby] = pbmc.obs[groupby].astype("category")

if f"dendrogram_{groupby}" not in pbmc.uns:
    sc.tl.dendrogram(pbmc, groupby=groupby)

# Scanpy stores the leaf order under 'categories_ordered'. The two older spellings are kept
# as fallbacks; before this fix only they were queried, so the dendrogram order was never
# found and the plain category order was always used instead.
d = pbmc.uns.get(f"dendrogram_{groupby}", {})
cluster_order = None
for order_key in ("categories_ordered", "categories_order", "ordered_categories"):
    candidate = d.get(order_key)
    if candidate is not None and len(candidate) > 0:
        cluster_order = list(candidate)
        break
if cluster_order is None:
    cluster_order = list(pbmc.obs[groupby].cat.categories)

cluster_order = cluster_order[::-1]  # dendrogram gives inverted order

# --- LOCK the X order (clusters) and never touch it again ---
pbmc.obs[groupby] = pbmc.obs[groupby].cat.reorder_categories(cluster_order, ordered=True)
cluster_order_fixed = list(pbmc.obs[groupby].cat.categories)

# (optional) keep the palette consistent when `custom_palette` was defined by an earlier cell
try:
    pbmc.uns[f"{groupby}_colors"] = [custom_palette.get(c, "#cccccc") for c in cluster_order_fixed]
except NameError:
    pass

# 1) Mean expression per cluster, arranged as genes × clusters in the locked X order
adata = pbmc
if use_raw and pbmc.raw is not None:
    adata = pbmc.raw
all_genes = list(adata.var_names)
if genes_subset is None:
    genes = all_genes
else:
    genes = [g for g in genes_subset if g in all_genes]

X = adata[:, genes].X
if sparse.issparse(X):
    X = X.toarray()

df = pd.DataFrame(X, index=adata.obs_names, columns=genes)
df["__cluster__"] = pbmc.obs[groupby].values
M = (
    df.groupby("__cluster__", observed=True)
    .mean()
    .T
    .reindex(columns=cluster_order_fixed)
)

# 2) Z-score every gene across clusters; zero-variance genes become NaN
eps = 1e-9
std = M.std(axis=1).replace(0, np.nan)
Mz = M.sub(M.mean(axis=1), axis=0).div(std + eps, axis=0)

# Genes with any NaN cannot be ranked, so they are left out of the ordering
keep = ~Mz.isna().any(axis=1)
Mz = Mz.loc[keep]
genes_kept = list(Mz.index)

# 3) Assign each gene to the cluster where its z-score peaks, then rank genes within the cluster
col_index = {c: i for i, c in enumerate(cluster_order_fixed)}
z_values = Mz.values
peak_idx = np.nanargmax(z_values, axis=1)
peak_cluster = [cluster_order_fixed[i] for i in peak_idx]

genes_by_cluster = {}
for c, ci in col_index.items():
    members = [i for i, pc in enumerate(peak_cluster) if pc == c]
    members.sort(key=lambda i: z_values[i, ci], reverse=WITHIN_DESC)
    genes_by_cluster[c] = [genes_kept[i] for i in members]

# Genes that ended up in no block (e.g. the constant genes dropped above)
assigned_genes = {g for block in genes_by_cluster.values() for g in block}
leftovers = [g for g in genes if g not in assigned_genes]
has_other = bool(leftovers)

# 4) Y block order: clusters in X order plus an optional "Other" block, reversed on request
orig_blocks = [*cluster_order_fixed, "Other"] if has_other else list(cluster_order_fixed)
block_order = orig_blocks[::-1] if INVERT_Y_BLOCKS else orig_blocks

# Final gene order and the (first, last) row span of every non-empty block
genes_final = []
var_group_positions, var_group_labels = [], []
start = 0
for c in block_order:
    block = leftovers if c == "Other" else genes_by_cluster.get(c, [])
    if not block:
        continue
    genes_final.extend(block)
    var_group_positions.append((start, start + len(block) - 1))
    var_group_labels.append(c)
    start += len(block)

# 5) Plot (dendrogram OFF so both X and Y custom orders are respected)
# Genes are passed in reversed `genes_final` order, as before.
w = 4.5
g = sc.pl.heatmap(
    pbmc,
    var_names=genes_final[::-1],
    groupby=groupby,
    use_raw=False,
    cmap=cmap,
    dendrogram=False,
    var_group_rotation=0,
    figsize=(8, 12),
    show=False
)

plt.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown it may already be closed
# (the notebook inline backend does this), and plt.savefig would then write an empty canvas.
plt.savefig(os.path.join(figures_path, "PBMC_HD01_Propagated_Identities_Heatmap.png"),
            dpi=300, bbox_inches='tight')
plt.show()


### Evaluating signatures

In [None]:
# Run espressopro's Mast annotation on the 'Normalized_reads' layer and rebind PBMC_HD01.
# NOTE(review): the next cells read row_attrs["Mast_signature_score"], presumably written by
# this call — confirm against the espressopro documentation.
PBMC_HD01 = ep.add_mast_annotation(
    PBMC_HD01,
    layer="Normalized_reads",
    verbose=True
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from scipy.stats import norm

# Mast signature scores; non-finite entries are excluded from the model
scores_all = np.asarray(PBMC_HD01.protein.row_attrs["Mast_signature_score"], dtype=float)
scores = scores_all[np.isfinite(scores_all)]

tail_q = 0.80
posterior_threshold = 0.95

# Split the scores at the tail_q quantile into a lower "bulk" and an upper "tail"
tail_cut = np.quantile(scores, tail_q)
is_tail = scores >= tail_cut

# --- bulk model: one Gaussian described by the mean / std of the scores below the cut ---
bulk_scores = scores[~is_tail]
bulk_mu = float(bulk_scores.mean())
bulk_sd = float(bulk_scores.std() + 1e-12)

# --- tail model: two-component GMM fitted to the tail scores only ---
tail_scores = scores[is_tail].reshape(-1, 1)
gmm = GaussianMixture(n_components=2, random_state=42).fit(tail_scores)
means = gmm.means_.ravel()
vars_ = gmm.covariances_.ravel()
weights = gmm.weights_.ravel()
hi = int(np.argmax(means))  # index of the higher-mean tail component

# Posterior of the high component for every score; a cell is "called" only inside the tail
X_all = scores.reshape(-1, 1)
post_hi = gmm.predict_proba(X_all)[:, hi]
called = is_tail & (post_hi >= posterior_threshold)

# First grid position inside the tail where the posterior reaches the threshold (for plotting)
xs = np.linspace(scores.min(), scores.max(), 2000)
post_grid = gmm.predict_proba(xs.reshape(-1, 1))[:, hi]
cross = np.flatnonzero((xs >= tail_cut) & (post_grid >= posterior_threshold))
if cross.size:
    cut = float(xs[cross[0]])
else:
    cut = np.nan

# ---- Plot ----
fig, ax = plt.subplots(figsize=(4.35, 4.35))

# Score histogram as the backdrop
ax.hist(scores, bins=60, density=True, color="lightgrey", edgecolor="black", alpha=0.8)

# Bulk Gaussian curve
ax.plot(xs, norm.pdf(xs, bulk_mu, bulk_sd), linewidth=2,
        label=f"Bulk Gaussian (μ={bulk_mu:.3f})")

# Weighted tail components (visual guides; they describe the tail, not the bulk)
for i, (mu_i, var_i, w_i) in enumerate(zip(means, vars_, weights)):
    ax.plot(xs, w_i * norm.pdf(xs, mu_i, np.sqrt(var_i)),
            linewidth=2, label=f"Tail comp {i} (μ={mu_i:.3f})")

# Reference lines: tail boundary and, when it exists, the posterior cut
ax.axvline(tail_cut, color="black", linestyle=":", linewidth=1,
           label=f"tail_q={tail_q:.2f} @ {tail_cut:.3f}")
if np.isfinite(cut):
    ax.axvline(cut, color="black", linestyle="--", linewidth=1,
               label=f"P(hi)≥{posterior_threshold:.2f} @ {cut:.3f}")

# Summary box in the top-left corner
summary_text = f"hi comp={hi}\ncalled={called.sum()}/{scores.size} ({100*called.mean():.2f}%)"
ax.text(
    0.02, 0.98,
    summary_text,
    transform=ax.transAxes, ha="left", va="top", fontsize=9,
    bbox=dict(boxstyle="round,pad=0.25", fc="white", ec="none", alpha=0.85),
)

ax.set_title("Mast signature score — bulk+tail model", fontweight="bold")
ax.set_xlabel("Mast signature score")
ax.set_ylabel("Density")
ax.legend(frameon=False, fontsize=8)

plt.tight_layout()
plt.show()


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# -------- CONFIG --------
# `figures_path` (output directory) must already be defined by an earlier cell.
out_png = os.path.join(figures_path, "PBMC_HD01_Mast_signature_score_umap.png")

# -------- Build minimal AnnData --------
coords = np.asarray(PBMC_HD01.protein.row_attrs["umap"])
if coords.ndim != 2 or coords.shape[1] < 2:
    raise ValueError(f"row_attrs['umap'] must be (n_cells, >=2). Got {coords.shape}")
coords = coords[:, :2]  # keep only the first two embedding dimensions

df_protein = PBMC_HD01.protein.get_attribute("Normalized_reads", constraint="row+col")

pbmc = AnnData(X=np.asarray(df_protein.values))
pbmc.obs_names = df_protein.index.astype(str)
pbmc.var_names = df_protein.columns.astype(str)
pbmc.obsm["X_umap"] = coords

# Per-cell score to colour the UMAP by
pbmc.obs["Mast_signature_score"] = np.asarray(PBMC_HD01.protein.row_attrs["Mast_signature_score"], dtype=float)

# -------- Plot (1x1) --------
# The figure size is passed to plt.subplots directly, so no rc_context override is needed.
fig, ax = plt.subplots(1, 1, figsize=(4.35, 4.35))

sc.pl.umap(
    pbmc,
    color="Mast_signature_score",
    add_outline=True,
    cmap="magma",
    frameon=False,
    size=50,
    alpha=0.9,
    title="",
    ax=ax,
    show=False,
)

ax.set_xlabel("")
ax.set_ylabel("")
ax.set_xticks([])
ax.set_yticks([])

fig.tight_layout()

# Save before showing so the file is written while the figure is still open
fig.savefig(out_png, dpi=300, bbox_inches="tight")
plt.show()
plt.close(fig)

print("Saved:", out_png)

In [None]:
# Run espressopro's signature annotation for a CD14 monocyte marker set on the
# 'Normalized_reads' layer and rebind PBMC_HD01.
# NOTE(review): later cells read row_attrs["CD14_mono_signature_score"], presumably named after
# cell_type_label. negative_markers is passed as an empty string while positive_markers is a
# list — confirm that '' is accepted as "no negative markers" by espressopro.
PBMC_HD01 = ep.add_signature_annotation(
    PBMC_HD01,
    layer="Normalized_reads",
    positive_markers=['CD14', 'CD33', 'CD11b', 'CD64'],
    negative_markers='',
    cell_type_label="CD14_mono",
    verbose=True
)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from scipy.stats import norm

def call_tail_or_bimodal_gmm(
    scores_all: np.ndarray,
    *,
    random_state: int = 42,
    posterior_threshold_bimodal: float = 0.90,
    posterior_threshold_tail: float = 0.95,
    tail_q: float = 0.80,
    sep_threshold: float = 1.25,   # higher => more conservative about calling "bimodal"
    bins: int = 60,
    title: str = "Signature score — adaptive GMM",
    figsize=(4.35, 4.35),
    show: bool = True,
):
    """
    Call signature-positive cells from a 1-D score vector with an adaptive 2-component GMM.

    A 2-GMM is first fitted to all finite scores and a separation score
    ``|mu1 - mu0| / sqrt(var0 + var1)`` is computed.

      - ``sep >= sep_threshold`` ("bimodal"): positives are cells whose posterior for the
        higher-mean component is at least ``posterior_threshold_bimodal`` AND whose score lies
        above the lower component's mean. The second condition keeps the call one-sided: when
        the higher-mean component is the wider one, its posterior also approaches 1 in the
        far-left tail, which would otherwise mark very low scores as positive.
      - otherwise ("unimodal + tail"): a second 2-GMM is fitted to the scores at or above the
        ``tail_q`` quantile; positives are tail cells whose posterior for the right-most tail
        component is at least ``posterior_threshold_tail``.

    Parameters
    ----------
    scores_all : array-like of float; flattened to 1-D. Non-finite entries are never called.
    random_state : seed passed to both GaussianMixture fits.
    posterior_threshold_bimodal, posterior_threshold_tail : posterior gates for the two modes.
    tail_q : quantile of the finite scores that defines the start of the tail.
    sep_threshold : minimum separation score for the bimodal mode.
    bins, title, figsize : histogram bin count, plot title and figure size.
    show : when True, draw a diagnostic histogram with the fitted components.

    Returns
    -------
    mask : bool array (same length as the flattened ``scores_all``)
    info : dict with ``status``, ``method``, ``n_called``, ``n_total_finite`` and, depending on
        the path taken, the separation score, thresholds, component means and cut position.
        ``status`` is ``"ok"``, ``"degenerate"`` (fewer than 10 finite scores or zero spread)
        or ``"tail_too_small"`` (fewer than 10 scores in the tail).
    """
    scores_all = np.asarray(scores_all, dtype=float).reshape(-1)
    finite = np.isfinite(scores_all)
    finite_idx = np.flatnonzero(finite)
    s = scores_all[finite]

    mask = np.zeros_like(scores_all, dtype=bool)
    info = {"status": "ok"}

    def _plot_histogram_only(suffix, tail_line=None):
        """Draw only the score histogram (early-exit paths), optionally with the tail boundary."""
        fig, ax = plt.subplots(figsize=figsize)
        if s.size:  # nothing to bin when every score was non-finite
            ax.hist(s, bins=bins, density=True, color="lightgrey", edgecolor="black", alpha=0.8)
        if tail_line is not None:
            ax.axvline(tail_line, color="black", linestyle=":", linewidth=1, label=f"tail_q={tail_q:.2f}")
            ax.legend(frameon=False, fontsize=8)
        ax.set_title(title + suffix, fontweight="bold")
        ax.set_xlabel("Signature score")
        ax.set_ylabel("Density")
        plt.tight_layout()
        plt.show()

    if s.size < 10 or np.std(s) == 0:
        info.update({
            "status": "degenerate",
            "method": "none",
            "n_called": 0,
            "n_total_finite": int(s.size),
        })
        if show:
            _plot_histogram_only(" (degenerate)")
        return mask, info

    # -------------------------
    # 1) full-data 2-GMM fit (decides "bimodal vs tail"; also the calling model when bimodal)
    # -------------------------
    X = s.reshape(-1, 1)
    gmm_full = GaussianMixture(n_components=2, random_state=random_state).fit(X)
    means = gmm_full.means_.ravel()
    vars_ = gmm_full.covariances_.ravel()
    weights = gmm_full.weights_.ravel()

    # separation score: |μ1-μ0| / sqrt(σ0^2 + σ1^2)
    sep = float(np.abs(means[1] - means[0]) / (np.sqrt(vars_[0] + vars_[1]) + 1e-12))
    use_bimodal = sep >= sep_threshold

    # shared plotting grid
    xs = np.linspace(s.min(), s.max(), 2000)

    # -------------------------
    # 2) calling rule
    # -------------------------
    if use_bimodal:
        # positives = higher-mean component in full-data model, right of the lower mean only
        hi = int(np.argmax(means))
        lo_mean = float(means[1 - hi])
        post_hi = gmm_full.predict_proba(X)[:, hi]
        called_local = (s > lo_mean) & (post_hi >= posterior_threshold_bimodal)

        # posterior threshold cut, searched on the same side as the calls
        post_grid = gmm_full.predict_proba(xs.reshape(-1, 1))[:, hi]
        cross = np.flatnonzero((xs > lo_mean) & (post_grid >= posterior_threshold_bimodal))
        cut = float(xs[cross[0]]) if cross.size else np.nan

        # write the calls back at the positions of the finite scores
        mask[finite_idx] = called_local

        info.update({
            "method": "bimodal_full_2gmm",
            "sep": sep,
            "sep_threshold": sep_threshold,
            "posterior_threshold": posterior_threshold_bimodal,
            "means": [float(m) for m in means],
            "n_called": int(mask.sum()),
            "n_total_finite": int(s.size),
            "cut": cut,
        })

        plot_mode = "full"
        plot_params = (means, vars_, weights, hi, posterior_threshold_bimodal, cut)

    else:
        # unimodal+tail: fit 2-GMM on upper tail only; positives = right-most tail component
        tail_cut = float(np.quantile(s, tail_q))
        in_tail = s >= tail_cut

        # if tail too small, bail safely
        if in_tail.sum() < 10:
            info.update({
                "status": "tail_too_small",
                "method": "tail",
                "sep": sep,
                "tail_q": tail_q,
                "tail_cut": tail_cut,
                "n_tail": int(in_tail.sum()),
                "n_called": 0,
                "n_total_finite": int(s.size),
            })
            if show:
                _plot_histogram_only(" (tail too small)", tail_line=tail_cut)
            return mask, info

        gmm_tail = GaussianMixture(n_components=2, random_state=random_state).fit(s[in_tail].reshape(-1, 1))
        means_t = gmm_tail.means_.ravel()
        vars_t = gmm_tail.covariances_.ravel()
        weights_t = gmm_tail.weights_.ravel()
        hi = int(np.argmax(means_t))  # right-most tail comp

        # posterior for ALL points under tail model; calls are restricted to the tail
        post_hi = gmm_tail.predict_proba(X)[:, hi]
        called_local = in_tail & (post_hi >= posterior_threshold_tail)

        # posterior threshold cut (within tail region)
        post_grid = gmm_tail.predict_proba(xs.reshape(-1, 1))[:, hi]
        cross = np.flatnonzero((xs >= tail_cut) & (post_grid >= posterior_threshold_tail))
        cut = float(xs[cross[0]]) if cross.size else np.nan

        # write the calls back at the positions of the finite scores
        mask[finite_idx] = called_local

        info.update({
            "method": "unimodal_tail_2gmm_rightcomp",
            "sep": sep,
            "sep_threshold": sep_threshold,
            "tail_q": tail_q,
            "tail_cut": tail_cut,
            "posterior_threshold": posterior_threshold_tail,
            "means_tail": [float(m) for m in means_t],
            "n_called": int(mask.sum()),
            "n_total_finite": int(s.size),
            "n_tail": int(in_tail.sum()),
            "cut": cut,
        })

        plot_mode = "tail"
        plot_params = (means_t, vars_t, weights_t, hi, posterior_threshold_tail, cut, tail_cut)

    # -------------------------
    # 3) plot
    # -------------------------
    if show:
        fig, ax = plt.subplots(figsize=figsize)
        ax.hist(s, bins=bins, density=True, color="lightgrey", edgecolor="black", alpha=0.8)

        if plot_mode == "full":
            m, v, w, hi, pthr, cut = plot_params
            for i in range(2):
                ax.plot(xs, w[i] * norm.pdf(xs, m[i], np.sqrt(v[i])),
                        linewidth=2, label=f"Component {i} (μ={m[i]:.3f})")
            method_txt = f"full 2-GMM (sep={sep:.2f} ≥ {sep_threshold})"
        else:
            m, v, w, hi, pthr, cut, tail_cut = plot_params
            for i in range(2):
                ax.plot(xs, w[i] * norm.pdf(xs, m[i], np.sqrt(v[i])),
                        linewidth=2, label=f"Tail comp {i} (μ={m[i]:.3f})")
            ax.axvline(tail_cut, color="black", linestyle=":", linewidth=1,
                       label=f"tail_q={tail_q:.2f} @ {tail_cut:.3f}")
            method_txt = f"tail 2-GMM (sep={sep:.2f} < {sep_threshold})"

        if np.isfinite(cut):
            ax.axvline(cut, color="black", linestyle="--", linewidth=1,
                       label=f"P(hi)≥{pthr:.2f} @ {cut:.3f}")

        called_local = mask[finite_idx]
        ax.text(
            0.02, 0.98,
            f"{method_txt}\nhi comp={hi}\ncalled={called_local.sum()}/{s.size} ({100*called_local.mean():.2f}%)",
            transform=ax.transAxes, ha="left", va="top", fontsize=9,
            bbox=dict(boxstyle="round,pad=0.25", fc="white", ec="none", alpha=0.85),
        )

        ax.set_title(title, fontweight="bold")
        ax.set_xlabel("Signature score")
        ax.set_ylabel("Density")
        ax.legend(frameon=False, fontsize=8)

        plt.tight_layout()
        plt.show()

    return mask, info


In [None]:
# Call CD14 monocyte-positive cells from the signature score with call_tail_or_bimodal_gmm,
# then write a refined copy of the detailed annotation in which called cells are relabelled.
scores = np.asarray(PBMC_HD01.protein.row_attrs["CD14_mono_signature_score"], dtype=float)

# Both posterior gates are set to 0.90 here (the function's own default for the tail gate is 0.95)
mask, info = call_tail_or_bimodal_gmm(
    scores,
    sep_threshold=1.25,            # tune 1.0–1.5
    posterior_threshold_bimodal=0.90,
    posterior_threshold_tail=0.90,
    tail_q=0.80,
    title="CD14 Mono signature score — adaptive GMM",
)
print(info)

# Apply to refined labels: start from a copy so 'Averaged.Detailed.Celltype' itself is untouched
# NOTE(review): the new label is "CD14 Mono" (with a space), while the palette keys used
# elsewhere in this notebook spell it 'CD14_Mono' — confirm which spelling the existing
# annotation uses, otherwise this creates a separate category.
base = np.asarray(PBMC_HD01.protein.row_attrs["Averaged.Detailed.Celltype"], dtype=object)
refined = base.copy()
refined[mask] = "CD14 Mono"
PBMC_HD01.protein.row_attrs["Averaged.Detailed.Celltype.Refined"] = refined


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# -------- CONFIG --------
# `figures_path` must already be defined by an earlier cell.
out_png = os.path.join(figures_path, "PBMC_HD01_CD14_mono_signature_score_umap.png")

# -------- Build minimal AnnData --------
coords = np.asarray(PBMC_HD01.protein.row_attrs["umap"])
if coords.ndim != 2 or coords.shape[1] < 2:
    raise ValueError(f"row_attrs['umap'] must be (n_cells, >=2). Got {coords.shape}")
coords = coords[:, :2]  # keep only the first two embedding dimensions

df_protein = PBMC_HD01.protein.get_attribute("Normalized_reads", constraint="row+col")

# Fail early with a clear message if the embedding and the matrix disagree on cell count.
if coords.shape[0] != len(df_protein.index):
    raise ValueError(
        f"UMAP has {coords.shape[0]} rows but Normalized_reads has "
        f"{len(df_protein.index)} cells"
    )

pbmc = AnnData(X=np.asarray(df_protein.values))
pbmc.obs_names = df_protein.index.astype(str)
pbmc.var_names = df_protein.columns.astype(str)
pbmc.obsm["X_umap"] = coords

# add score column
pbmc.obs["CD14_mono_signature_score"] = np.asarray(
    PBMC_HD01.protein.row_attrs["CD14_mono_signature_score"], dtype=float
)

# -------- Plot (1x1) --------
# The figure size is set explicitly here, so no rc_context override is needed.
fig, ax = plt.subplots(1, 1, figsize=(4.35, 4.35))

sc.pl.umap(
    pbmc,
    color="CD14_mono_signature_score",
    add_outline=True,
    cmap="magma",
    frameon=False,
    size=50,
    alpha=0.9,
    title="",
    ax=ax,
    show=False,
)

# Strip axis decorations for a clean embedding panel.
ax.set_xlabel("")
ax.set_ylabel("")
ax.set_xticks([])
ax.set_yticks([])

fig.tight_layout()

# Save before showing so the written file contains the rendered figure.
fig.savefig(out_png, dpi=300, bbox_inches="tight")
plt.show()
plt.close(fig)

print("Saved:", out_png)

## Single atlas predictions on PBMC - HD01

### Hao

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.patches import Rectangle
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Hao.Broad.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'Immature': "#0079ea",
    'Mature': "#CFCFCF"
}
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure
# ============================================
# Explicit imports: do not rely on another package having loaded the
# matplotlib.patheffects submodule, and do not re-import inside the loop.
from matplotlib import patheffects
from matplotlib.lines import Line2D

fig2, ax_legend = plt.subplots(figsize=(6, 0.5))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Single-row legend; max(1, ...) keeps ncol valid if there are no categories.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=max(1, len(cats)),
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Hao_Broad_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Hao.Simplified.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'HSPC': '#0079ea',
    'cDC': "#16D2E3",
    'pDC': "#69FFCB",
    'Monocyte': '#D27CE3',
    "Myeloid": '#473B76',
    'Other_T': "#EDB416",
    'Erythroid': "#F30A1A",
    'CD4_T': '#C99546',
    'CD8_T': "#6B3317",
    'NK': "#FBEF0D",
    'B': '#4CAF50',
    "Plasma": "#9DC012",
    'Small': "#292929"
}
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on another package having loaded the submodule.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(8, 1.2))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend, 4 columns.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=4,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=0.8
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Hao_Simplified_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Hao.Detailed.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'CD4_CTL': "#645846",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'MEP': "#FF6B9D",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pre-B': "#4CAF50",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    # NOTE(review): the other Hao-detailed cell spells this key 'gdT'; both
    # spellings are kept so the colour applies whichever label the data uses.
    'GdT': "#EDB416",
    'gdT': "#EDB416",
    'pDC': "#69FFCB",
    'Treg': "#43401F"
}

cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on another package having loaded the submodule.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend, 3 columns.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Hao_Detailed_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Hao.Detailed.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
# Kept in sync with the other Hao-detailed cell (same attribute), which also
# defines CD4_CTL, MEP, Pre-B and Treg.
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'CD4_CTL': "#645846",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'MEP': "#FF6B9D",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pre-B': "#4CAF50",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    # NOTE(review): the other Hao-detailed cell spells this key 'GdT'; both
    # spellings are kept so the colour applies whichever label the data uses.
    'gdT': "#EDB416",
    'GdT': "#EDB416",
    'pDC': "#69FFCB",
    'Treg': "#43401F"
}

cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# --- Plot with Scanpy + outlines, legend relocated below the axes ---
with rc_context({"figure.figsize": (3.20, 3.5)}):
    fig = sc.pl.umap(
        pbmc,
        color="clusters",
        add_outline=True,
        palette=palette,
        legend_fontsize=6,
        legend_fontoutline=2,
        frameon=False,
        title="Hao - Detailed",
        size=50,
        alpha=0.9,
        return_fig=True,   # so we get the figure handle
        show=False,
    )

    for ax in fig.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

        # Move the legend created by scanpy to below the plot.
        leg = ax.get_legend()
        if leg is not None:
            # `legendHandles` was renamed to `legend_handles` in Matplotlib 3.7
            # and the old name was later removed; support both.
            legend_handles = getattr(leg, "legend_handles", None)
            if legend_handles is None:
                legend_handles = leg.legendHandles
            ax.legend(
                handles=legend_handles,
                labels=[t.get_text() for t in leg.get_texts()],
                loc='upper center',
                bbox_to_anchor=(0.5, -0.05),  # below plot
                ncol=3,                       # three-column grid
                fontsize=6,
                frameon=False
            )

    plt.tight_layout()
    plt.show()



In [None]:
# Signature map of the normalized reads, split by the Hao detailed cell-type labels.
PBMC_HD01.protein.signaturemap(splitby="Hao.Detailed.Celltype", attribute="Normalized_reads")

### Zhang

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.patches import Rectangle
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Zhang.Broad.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'Immature': "#0079ea",
    'Mature': "#CFCFCF"
}
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure
# ============================================
# Explicit imports: do not rely on another package having loaded the
# matplotlib.patheffects submodule, and do not re-import inside the loop.
from matplotlib import patheffects
from matplotlib.lines import Line2D

fig2, ax_legend = plt.subplots(figsize=(6, 0.5))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Single-row legend; max(1, ...) keeps ncol valid if there are no categories.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=max(1, len(cats)),
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Zhang_Broad_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Zhang.Simplified.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'HSPC': '#0079ea',
    'cDC': "#16D2E3",
    'pDC': "#69FFCB",
    'Monocyte': '#D27CE3',
    "Myeloid": '#473B76',
    'Other_T': "#EDB416",
    'Erythroid': "#F30A1A",
    'CD4_T': '#C99546',
    'CD8_T': "#6B3317",
    'NK': "#FBEF0D",
    'B': '#4CAF50',
    "Plasma": "#9DC012",
    'Small': "#292929"
}
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on another package having loaded the submodule.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(8, 1.2))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend, 4 columns.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=4,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=0.8
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Zhang_Simplified_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Zhang.Detailed.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'B_Memory': "#68D827",
    'B_Naive': '#1C511D',
    'CD14_Mono': "#D27CE3",
    'CD16_Mono': "#8D43CD",
    'CD4_T_Memory': "#C1AF93",
    'CD4_T_Naive': "#C99546",
    'CD8_T_Memory': "#6B3317",
    'CD8_T_Naive': "#4D382E",
    'CD4_CTL': "#645846",
    'ErP': "#D1235A",
    'Erythroblast': "#F30A1A",
    'GMP': "#C5E4FF",
    'HSC_MPP': '#0079ea',
    'Immature_B': "#91FF7B",
    'LMPP': "#17BECF",
    'MAIT': "#BCBD22",
    'MEP': "#FF6B9D",
    'Myeloid_progenitor': "#AEC7E8",
    'NK_CD56_bright': "#F3AC1F",
    'NK_CD56_dim': "#FBEF0D",
    'Plasma': "#9DC012",
    'Pre-B': "#4CAF50",
    'Pro-B': "#66BB6A",
    'Small': "#292929",
    'cDC1': "#76A7CB",
    'cDC2': "#16D2E3",
    'GdT': "#EDB416",
    'pDC': "#69FFCB",
    'Treg': "#43401F",
    'EoBaMaP': "#AA77AA"
}

cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on another package having loaded the submodule.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='w',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Grid legend, 3 columns.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Zhang_Detailed_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Signature map of the normalized reads, split by the Zhang detailed cell-type labels.
PBMC_HD01.protein.signaturemap(splitby="Zhang.Detailed.Celltype", attribute="Normalized_reads")

### Triana

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.patches import Rectangle
import scanpy as sc
from anndata import AnnData

# --- Build a minimal AnnData from Mosaic arrays ---
# Only obs/obsm matter for plotting; X is filled with the normalized-read layer.
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Triana.Broad.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Custom palette; labels not listed fall back to grey ---
custom_palette = {
    'Immature': "#0079ea",
    'Mature': "#CFCFCF"
}
cats = list(pbmc.obs['clusters'].cat.categories)
palette = [custom_palette.get(c, '#cccccc') for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        add_outline=True,
        legend_loc='none',
        frameon=False,
        title="",
        size=50,
        alpha=0.9,
        return_fig=True,
        show=False,
    )

    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set_xlabel("")
        ax.set_ylabel("")
        ax.set_xticks([])
        ax.set_yticks([])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure
# ============================================
# Explicit imports: do not rely on another package having loaded the
# matplotlib.patheffects submodule, and do not re-import inside the loop.
from matplotlib import patheffects
from matplotlib.lines import Line2D

fig2, ax_legend = plt.subplots(figsize=(6, 0.5))
ax_legend.axis('off')

# One filled dot per category, in category order.
handles = [
    Line2D(
        [0], [0],
        marker='o',
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,
        linestyle='None',
    )
    for color in palette
]

# Single-row legend; max(1, ...) keeps ncol valid if there are no categories.
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=max(1, len(cats)),
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5
)

# White outline around legend text for better visibility.
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()
# Save BEFORE plt.show(): once the figure has been shown (notably with the
# notebook inline backend) plt.savefig() would target a new, empty figure.
# NOTE(review): only the legend figure is written, as in the original cell;
# confirm whether the UMAP (fig1) should be saved as well.
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Triana_Broad_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
# X is the `Normalized_reads` layer; cell and antibody names are taken from
# the labelled frame returned by get_attribute.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Triana.Simplified.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Fixed colour per simplified cell type ---
custom_palette = {
    "HSPC": "#0079ea", "cDC": "#16D2E3", "pDC": "#69FFCB",
    "Monocyte": "#D27CE3", "Myeloid": "#473B76",
    "Other_T": "#EDB416", "Erythroid": "#F30A1A",
    "CD4_T": "#C99546", "CD8_T": "#6B3317", "NK": "#FBEF0D",
    "B": "#4CAF50", "Plasma": "#9DC012", "Small": "#292929",
}
cats = list(pbmc.obs['clusters'].cat.categories)
# Colours in category order; labels missing from the map fall back to grey.
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        title="",
        legend_loc='none',      # legend is drawn as its own figure
        add_outline=True,
        frameon=False,
        size=50,
        alpha=0.9,
        show=False,
        return_fig=True,
    )

    # Bold title, no axis labels or ticks
    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on matplotlib.patheffects having been loaded
# implicitly by another package.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(8, 1.2))
ax_legend.axis('off')

# One proxy handle per category: a filled dot with a thin black edge.
handles = [
    Line2D(
        [0], [0],
        marker='o',           # Circle marker
        color='w',            # Line color (invisible)
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,        # Size of the dot
        linestyle='None',     # No line, just the marker
    )
    for _cat, color in zip(cats, palette)
]

# Grid legend laid out in 4 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=4,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=0.8
)

# White outline around the legend text
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and, with the inline
# notebook backend, closed) a pyplot-level savefig would write an empty canvas.
# This file contains the legend figure (fig2).
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Triana_Simplified_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
# X is the `Normalized_reads` layer; cell and antibody names are taken from
# the labelled frame returned by get_attribute.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Triana.Detailed.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Fixed colour per detailed cell type ---
custom_palette = {
    "B_Memory": "#68D827", "B_Naive": "#1C511D",
    "CD14_Mono": "#D27CE3", "CD16_Mono": "#8D43CD",
    "CD4_T_Memory": "#C1AF93", "CD4_T_Naive": "#C99546",
    "CD8_T_Memory": "#6B3317", "CD8_T_Naive": "#4D382E",
    "CD4_CTL": "#645846",
    "ErP": "#D1235A", "Erythroblast": "#F30A1A",
    "GMP": "#C5E4FF", "HSC_MPP": "#0079ea",
    "Immature_B": "#91FF7B", "LMPP": "#17BECF",
    "MAIT": "#BCBD22", "MEP": "#FF6B9D",
    "Myeloid_progenitor": "#AEC7E8",
    "NK_CD56_bright": "#F3AC1F", "NK_CD56_dim": "#FBEF0D",
    "Plasma": "#9DC012", "Pre-B": "#4CAF50", "Pro-B": "#66BB6A",
    "Small": "#292929",
    "cDC1": "#76A7CB", "cDC2": "#16D2E3",
    "GdT": "#EDB416", "pDC": "#69FFCB", "Treg": "#43401F",
}

cats = list(pbmc.obs['clusters'].cat.categories)
# Colours in category order; labels missing from the map fall back to grey.
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        title="",
        legend_loc='none',      # legend is drawn as its own figure
        add_outline=True,
        frameon=False,
        size=50,
        alpha=0.9,
        show=False,
        return_fig=True,
    )

    # Bold title, no axis labels or ticks
    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on matplotlib.patheffects having been loaded
# implicitly by another package.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One proxy handle per category: a filled dot with a thin black edge.
handles = [
    Line2D(
        [0], [0],
        marker='o',           # Circle marker
        color='w',            # Line color (invisible)
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,        # Size of the dot
        linestyle='None',     # No line, just the marker
    )
    for _cat, color in zip(cats, palette)
]

# Grid legend laid out in 3 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White outline around the legend text
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and, with the inline
# notebook backend, closed) a pyplot-level savefig would write an empty canvas.
# This file contains the legend figure (fig2).
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Triana_Detailed_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Signature map of the `Normalized_reads` layer, split by the Triana detailed labels.
PBMC_HD01.protein.signaturemap(splitby="Triana.Detailed.Celltype", attribute="Normalized_reads")

### Luecken

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.patches import Rectangle
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
# X is the `Normalized_reads` layer; cell and antibody names are taken from
# the labelled frame returned by get_attribute.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Luecken.Broad.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Fixed colour per broad class ---
custom_palette = {"Immature": "#0079ea", "Mature": "#CFCFCF"}
cats = list(pbmc.obs['clusters'].cat.categories)
# Colours in category order; labels missing from the map fall back to grey.
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        title="",
        legend_loc='none',      # legend is drawn as its own figure
        add_outline=True,
        frameon=False,
        size=50,
        alpha=0.9,
        show=False,
        return_fig=True,
    )

    # Bold title, no axis labels or ticks
    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure
# ============================================
# Explicit imports: Line2D is imported once here rather than on every loop
# iteration, and patheffects no longer relies on being loaded implicitly.
from matplotlib import patheffects
from matplotlib.lines import Line2D

fig2, ax_legend = plt.subplots(figsize=(6, 0.5))
ax_legend.axis('off')

# One proxy handle per category: a filled dot with a thin black edge.
handles = [
    Line2D(
        [0], [0],
        marker='o',              # Circle marker (dot)
        markerfacecolor=color,   # Fill color
        markeredgecolor='black', # Border color
        markeredgewidth=0.5,     # Border width
        markersize=10,           # Size of the dot
        linestyle='None',        # No connecting line
    )
    for _cat, color in zip(cats, palette)
]

# Horizontal legend: every category in a single row
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=len(cats),
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5
)

# White outline around the legend text
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and, with the inline
# notebook backend, closed) a pyplot-level savefig would write an empty canvas.
# This file contains the legend figure (fig2).
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Luecken_Broad_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
# X is the `Normalized_reads` layer; cell and antibody names are taken from
# the labelled frame returned by get_attribute.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Luecken.Simplified.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Fixed colour per simplified cell type ---
custom_palette = {
    "HSPC": "#0079ea", "cDC": "#16D2E3", "pDC": "#69FFCB",
    "Monocyte": "#D27CE3", "Myeloid": "#473B76",
    "Other_T": "#EDB416", "Erythroid": "#F30A1A",
    "CD4_T": "#C99546", "CD8_T": "#6B3317", "NK": "#FBEF0D",
    "B": "#4CAF50", "Plasma": "#9DC012", "Small": "#292929",
}
cats = list(pbmc.obs['clusters'].cat.categories)
# Colours in category order; labels missing from the map fall back to grey.
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        title="",
        legend_loc='none',      # legend is drawn as its own figure
        add_outline=True,
        frameon=False,
        size=50,
        alpha=0.9,
        show=False,
        return_fig=True,
    )

    # Bold title, no axis labels or ticks
    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on matplotlib.patheffects having been loaded
# implicitly by another package.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(8, 1.2))
ax_legend.axis('off')

# One proxy handle per category: a filled dot with a thin black edge.
handles = [
    Line2D(
        [0], [0],
        marker='o',           # Circle marker
        color='w',            # Line color (invisible)
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,        # Size of the dot
        linestyle='None',     # No line, just the marker
    )
    for _cat, color in zip(cats, palette)
]

# Grid legend laid out in 4 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=4,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=0.8
)

# White outline around the legend text
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and, with the inline
# notebook backend, closed) a pyplot-level savefig would write an empty canvas.
# This file contains the legend figure (fig2).
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Luecken_Simplified_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
# X is the `Normalized_reads` layer; cell and antibody names are taken from
# the labelled frame returned by get_attribute.
df_protein = PBMC_HD01.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD01.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD01.protein.row_attrs['Luecken.Detailed.Celltype'])

pbmc = AnnData(X=PBMC_HD01.protein.layers['Normalized_reads'])
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Fixed colour per detailed cell type ---
custom_palette = {
    "B_Memory": "#68D827", "B_Naive": "#1C511D",
    "CD14_Mono": "#D27CE3", "CD16_Mono": "#8D43CD",
    "CD4_T_Memory": "#C1AF93", "CD4_T_Naive": "#C99546",
    "CD8_T_Memory": "#6B3317", "CD8_T_Naive": "#4D382E",
    "CD4_CTL": "#645846",
    "ErP": "#D1235A", "Erythroblast": "#F30A1A",
    "GMP": "#C5E4FF", "HSC_MPP": "#0079ea",
    "Immature_B": "#91FF7B", "LMPP": "#17BECF",
    "MAIT": "#BCBD22", "MEP": "#FF6B9D",
    "Myeloid_progenitor": "#AEC7E8",
    "NK_CD56_bright": "#F3AC1F", "NK_CD56_dim": "#FBEF0D",
    "Plasma": "#9DC012", "Pre-B": "#4CAF50", "Pro-B": "#66BB6A",
    "Small": "#292929",
    "cDC1": "#76A7CB", "cDC2": "#16D2E3",
    "GdT": "#EDB416", "pDC": "#69FFCB", "Treg": "#43401F",
    "Pre-Pro-B": "#87BA87",
}

cats = list(pbmc.obs['clusters'].cat.categories)
# Colours in category order; labels missing from the map fall back to grey.
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        title="",
        legend_loc='none',      # legend is drawn as its own figure
        add_outline=True,
        frameon=False,
        size=50,
        alpha=0.9,
        show=False,
        return_fig=True,
    )

    # Bold title, no axis labels or ticks
    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on matplotlib.patheffects having been loaded
# implicitly by another package.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One proxy handle per category: a filled dot with a thin black edge.
handles = [
    Line2D(
        [0], [0],
        marker='o',           # Circle marker
        color='w',            # Line color (invisible)
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,        # Size of the dot
        linestyle='None',     # No line, just the marker
    )
    for _cat, color in zip(cats, palette)
]

# Grid legend laid out in 3 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White outline around the legend text
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and, with the inline
# notebook backend, closed) a pyplot-level savefig would write an empty canvas.
# This file contains the legend figure (fig2).
fig2.savefig(os.path.join(figures_path, "PBMC_HD01_Luecken_Detailed_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Signature map of the `Normalized_reads` layer, split by the Luecken detailed labels.
PBMC_HD01.protein.signaturemap(splitby="Luecken.Detailed.Celltype", attribute="Normalized_reads")

### Comparison of marker fidelity

In [None]:
# Positive surface markers expected for each detailed cell type.
# Every entry is a bare marker name (no '+'/'-' suffix), so that all names
# follow one convention when they are looked up downstream.
# NOTE(review): B_Naive and B_Memory now list the same markers (both include
# CD27) — confirm whether CD27 is really meant as a positive marker for B_Naive.
POS = {
    'MEP': ['CD34', 'CD117', 'CD71'],
    'HSC_MPP': ['CD34', 'CD117', 'CD90', 'CD49f'],
    'Pre-Pro-B': ['CD19', 'CD34', 'CD10'],
    'Pro-B': ['CD19', 'CD34', 'CD10', 'CD20'],
    'Pre-B': ['CD19', 'CD20', 'CD10', 'CD34'],
    'Immature_B': ['CD19', 'CD22', 'CD10'],
    'B_Naive': ['CD19', 'CD22', 'CD1c', 'CD27'],
    'B_Memory': ['CD19', 'CD22', 'CD1c', 'CD27'],
    'Plasma': ['CD19', 'CD38'],
    'CD4_T_Naive': ['CD3', 'CD2', 'CD4'],
    'CD4_T_Memory': ['CD3', 'CD2', 'CD4', 'CD45RO'],
    'CD8_T_Naive': ['CD3', 'CD2', 'CD8'],
    'CD8_T_Memory': ['CD3', 'CD2', 'CD8', 'CD45RO'],
    'CD4_CTL': ['CD3', 'CD4'],
    'Treg': ['CD3', 'CD2', 'CD4', 'CD25'],
    'GdT': ['CD3', 'CD2', 'CD5'],
    'MAIT': ['CD3', 'CD8', 'CD2'],
    'NK_CD56_bright': ['CD7', 'CD56'],
    'NK_CD56_dim': ['CD7', 'CD56', 'CD16'],
    'CD14_Mono': ['CD14', 'CD11b', 'CD33'],
    'CD16_Mono': ['CD13', 'CD11c', 'CD16'],
    'cDC1': ['CD11c', 'CD1c'],
    'cDC2': ['CD11c', 'CD1c'],
    'pDC': ['CD123', 'CD303', 'CD304'],
    'Erythroblast': ['CD71', 'CD141'],
    'ErP': ['CD34', 'CD117', 'CD71', 'CD38'],
    'Myeloid_progenitor': ['CD33', 'CD38', 'CD62L'],
    'GMP': ['CD34', 'CD117', 'CD38', 'CD62L'],
}

In [None]:
# Row-attribute columns holding the detailed annotation of each scheme.
label_keys = [
    f"{source}.Detailed.Celltype"
    for source in ("Averaged", "Hao", "Zhang", "Triana", "Luecken")
]

# Score every annotation scheme on PBMC_HD01 against the POS marker table.
scoreboard, details = compare_annotation_schemes(
    PBMC_HD01,
    label_keys=label_keys,
    cluster_key="Clusters",
    layer="Normalized_reads",
    pos_marker_dict=POS,
    min_cells_per_label=5,
    collapse_subclusters=True,
    weights=(0.45, 0.25, 0.30),
)

print(scoreboard)


In [None]:
import os
import re
import numpy as np
import matplotlib.pyplot as plt

# --------- CONFIG ---------
# Annotation columns, in the left-to-right order used for the bars.
label_keys = [
    f"{source}.Detailed.Celltype"
    for source in ("Averaged", "Hao", "Zhang", "Triana", "Luecken")
]

# Bar colour per annotation column (same order as label_keys).
colors = dict(zip(
    label_keys,
    ("#454546", "#892921", "#C09F62", "#335DB0", "#316C32"),
))

# (scoreboard column, plot title, draw x tick labels?)
metrics = [
    ("coherence(1-H|/H)", "Coherence", False),
    ("label_concentration", "Label concentration", False),
    ("marker_fidelity_weighted", "Marker fidelity (weighted)", True),
    ("total_score", "Total score", True),
]

# Set your output directory (created if it does not exist yet).
# NOTE(review): this is a machine-specific absolute path, and the assignment
# rebinds `figures_path` for every cell that runs after this one — consider
# defining it once in the notebook's configuration cell instead.
figures_path = "/Users/kgurashi/GitHub/2024__EspressoPro_Manuscript/Figures/Model_Independent_Testing"
os.makedirs(figures_path, exist_ok=True)

def _safe_slug(s: str) -> str:
    """Return `s` with each run of characters outside [A-Za-z0-9._-] collapsed to one underscore."""
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", s)

# --------- PREP ---------
# Rows of the scoreboard rearranged into `label_keys` order.
scoreboard_ordered = scoreboard.set_index("annotation").reindex(label_keys).reset_index()

# Bar positions, short tick labels (suffix stripped) and bar colours, all in
# `label_keys` order.
x = np.arange(len(label_keys))
xlabels = list(map(lambda key: key.replace(".Detailed.Celltype", ""), label_keys))
bar_colors = list(map(colors.__getitem__, label_keys))

# --------- PLOTS + SAVE ---------
# One bar chart per metric: saved to `figures_path`, then displayed and closed.
for col, title, show_xaxis in metrics:
    vals_raw = scoreboard_ordered[col].to_numpy(dtype=float)
    is_na = ~np.isfinite(vals_raw)
    # Non-finite scores are drawn as zero-height bars and labelled "NA" below.
    vals = np.where(is_na, 0.0, vals_raw)

    # Score of the averaged annotation, used as the reference line.
    averaged_val = scoreboard_ordered.loc[
        scoreboard_ordered["annotation"] == "Averaged.Detailed.Celltype", col
    ].iloc[0]

    fig, ax = plt.subplots(figsize=(4.5, 3.5))

    # 1) dashed reference line, lowest zorder
    if np.isfinite(averaged_val):
        ax.axhline(
            averaged_val,
            color="#454546",
            linestyle="--",
            linewidth=1.2,
            alpha=0.8,
            zorder=1,
        )

    # 2) bars
    bars = ax.bar(x, vals, color=bar_colors, zorder=2)

    ax.set_title(title, fontweight="bold")
    ax.set(ylim=(0, 1.05), ylabel="Score")

    # x tick labels only on the plots flagged in `metrics`
    if not show_xaxis:
        ax.set_xticks([])
        ax.set_xticklabels([])
    else:
        ax.set_xticks(x)
        ax.set_xticklabels(xlabels, rotation=0)

    # 3) value labels above each bar, highest zorder
    for bar, raw_value, missing in zip(bars, vals_raw, is_na):
        height = bar.get_height()
        if missing:
            y_text = max(0.02, height) + 0.02
            label = "NA"
            extra = {"fontweight": "bold"}
        else:
            y_text = height + 0.02
            label = f"{raw_value:.3f}"
            extra = {}
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            y_text,
            label,
            ha="center",
            va="bottom",
            fontsize=9,
            zorder=3,
            **extra,
        )

    ax.yaxis.grid(True, linestyle="-", linewidth=0.5, alpha=0.3)
    ax.set_axisbelow(True)
    fig.tight_layout()

    # ---- SAVE (before show) ----
    outpath = os.path.join(figures_path, f"PBMC_HD01_{_safe_slug(col)}_Metrics.png")
    fig.savefig(outpath, dpi=300, bbox_inches="tight")

    plt.show()
    plt.close(fig)

print(f"Saved plots to: {figures_path}")

## Predictions on PBMC - HD02

In [None]:
# Store a shallow copy of the averaged detailed annotation under the 'label' row attribute.
PBMC_HD02.protein.row_attrs['label'] = copy.copy(
    PBMC_HD02.protein.row_attrs['Averaged.Detailed.Celltype']
)

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --------- CONFIG ---------
# Requires, from earlier cells:
#   PBMC_HD02     (sample object with a `.protein` assay)
#   figures_path  (output directory)
os.makedirs(figures_path, exist_ok=True)

FIGSIZE = (3.35, 3.35)

# --- Build a minimal AnnData from the Mosaic protein assay ---
df_protein = PBMC_HD02.protein.get_attribute("Normalized_reads", constraint="row+col")
coords = np.asarray(PBMC_HD02.protein.row_attrs["umap"])
labels = np.asarray(PBMC_HD02.protein.row_attrs["Clusters"], dtype=object)

pbmc = AnnData(X=np.asarray(df_protein.values))
pbmc.obs_names = df_protein.index.astype(str)
pbmc.var_names = df_protein.columns.astype(str)
pbmc.obsm["X_umap"] = coords
# Cluster ids are cast to str before being made categorical.
pbmc.obs["Clusters"] = pd.Categorical(labels.astype(str))

cats = list(pbmc.obs["Clusters"].cat.categories)

# --------- COLORS: take the categorical palette Scanpy assigns ---------
# Plot once only to populate pbmc.uns["Clusters_colors"]; the throwaway
# figure is closed straight away.
tmp = sc.pl.umap(pbmc, color="Clusters", return_fig=True, show=False)
plt.close(tmp)
palette = list(pbmc.uns["Clusters_colors"])

# ============================================
# PLOT 1: UMAP without legend (colored) — SAVE, THEN RENDER
# ============================================
fig1 = sc.pl.umap(
    pbmc,
    color="Clusters",
    palette=palette,
    title="",
    legend_loc="none",
    add_outline=True,
    frameon=False,
    size=50,
    alpha=0.9,
    show=False,
    return_fig=True,
)

# No axis labels or ticks
for ax in fig1.axes:
    ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

fig1.set_size_inches(*FIGSIZE)
fig1.tight_layout()
fig1.savefig(
    os.path.join(figures_path, "PBMC_HD02_Clusters_umap.png"),
    bbox_inches="tight",
    dpi=300,
)
plt.show()
plt.close(fig1)

# ============================================
# PLOT 2: Separate legend figure with DOTS — SAVE + RENDER
# ============================================
fig2, ax_legend = plt.subplots(figsize=FIGSIZE)
ax_legend.axis("off")

# One proxy handle per palette colour: a filled dot with a thin black edge.
handles = []
for color in palette:
    handles.append(
        Line2D(
            [0], [0],
            color="w",
            linestyle="None",
            marker="o",
            markersize=9,
            markerfacecolor=color,
            markeredgecolor="black",
            markeredgewidth=0.5,
        )
    )

# Legend laid out in 6 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc="center",
    ncol=6,
    frameon=False,
    fontsize=8,
    handletextpad=0.6,
    columnspacing=1.2,
    labelspacing=0.9,
)

fig2.tight_layout()
fig2.savefig(
    os.path.join(figures_path, "PBMC_HD02_Clusters_legend.png"),
    bbox_inches="tight",
    dpi=300,
)

plt.show()
plt.close(fig2)

# Report both output files written by this cell.
print(
    "Saved:\n"
    + "\n".join(
        f"- {os.path.join(figures_path, name)}"
        for name in ("PBMC_HD02_Clusters_umap.png", "PBMC_HD02_Clusters_legend.png")
    )
)


In [None]:
# Suggest a cell-type identity per cluster from the averaged detailed annotation.
# The call returns the sample together with `summary` and `pivot`.
PBMC_HD02, summary, pivot = ep.suggest_cluster_celltype_identity(
    sample=PBMC_HD02,
    cluster_col="Clusters",
    annotation_col="Averaged.Detailed.Celltype",
    dominance_threshold=0.35,
    rewrite=True,
    verbose=True,
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc_context
from matplotlib.lines import Line2D
import scanpy as sc
from anndata import AnnData

# --- Wrap the Mosaic protein assay in an AnnData so Scanpy can plot it ---
# X is the `Normalized_reads` layer; cell and antibody names are taken from
# the labelled frame returned by get_attribute.
df_protein = PBMC_HD02.protein.get_attribute('Normalized_reads', constraint='row+col')
coords = np.asarray(PBMC_HD02.protein.row_attrs['umap'])
labels = np.asarray(PBMC_HD02.protein.row_attrs['annotated_clusters'])

pbmc = AnnData(X=PBMC_HD02.protein.layers['Normalized_reads'])
pbmc.obs_names, pbmc.var_names = df_protein.index, df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)  # categorical -> discrete colours

# --- Fixed colour per detailed cell type ---
custom_palette = {
    "B_Memory": "#68D827", "B_Naive": "#1C511D",
    "CD14_Mono": "#D27CE3", "CD16_Mono": "#8D43CD",
    "CD4_T_Memory": "#C1AF93", "CD4_T_Naive": "#C99546",
    "CD8_T_Memory": "#6B3317", "CD8_T_Naive": "#4D382E",
    "CD4_CTL": "#645846",
    "ErP": "#D1235A", "Erythroblast": "#F30A1A",
    "GMP": "#C5E4FF", "HSC_MPP": "#0079ea",
    "Immature_B": "#91FF7B", "LMPP": "#17BECF",
    "MAIT": "#BCBD22", "MEP": "#FF6B9D",
    "Myeloid_progenitor": "#AEC7E8",
    "NK_CD56_bright": "#F3AC1F", "NK_CD56_dim": "#FBEF0D",
    "Plasma": "#9DC012", "Pre-B": "#4CAF50", "Pro-B": "#66BB6A",
    "Small": "#292929",
    "cDC1": "#76A7CB", "cDC2": "#16D2E3",
    "GdT": "#EDB416", "pDC": "#69FFCB", "Treg": "#43401F",
}

cats = list(pbmc.obs['clusters'].cat.categories)
# Colours in category order; labels missing from the map fall back to grey.
palette = [custom_palette[c] if c in custom_palette else '#cccccc' for c in cats]

# ============================================
# PLOT 1: UMAP without legend
# ============================================
with rc_context({"figure.figsize": (3.35, 3.35)}):
    fig1 = sc.pl.umap(
        pbmc,
        color="clusters",
        palette=palette,
        title="",
        legend_loc='none',      # legend is drawn as its own figure
        add_outline=True,
        frameon=False,
        size=50,
        alpha=0.9,
        show=False,
        return_fig=True,
    )

    # Bold title, no axis labels or ticks
    for ax in fig1.axes:
        ax.set_title(ax.get_title(), fontweight='bold')
        ax.set(xlabel="", ylabel="", xticks=[], yticks=[])

    plt.tight_layout()
    plt.show()

# ============================================
# PLOT 2: Separate legend figure with DOTS
# ============================================
# Explicit import: do not rely on matplotlib.patheffects having been loaded
# implicitly by another package.
from matplotlib import patheffects

fig2, ax_legend = plt.subplots(figsize=(10, 3))
ax_legend.axis('off')

# One proxy handle per category: a filled dot with a thin black edge.
handles = [
    Line2D(
        [0], [0],
        marker='o',           # Circle marker
        color='w',            # Line color (invisible)
        markerfacecolor=color,
        markeredgecolor='black',
        markeredgewidth=0.5,
        markersize=10,        # Size of the dot
        linestyle='None',     # No line, just the marker
    )
    for _cat, color in zip(cats, palette)
]

# Grid legend laid out in 3 columns
legend = ax_legend.legend(
    handles=handles,
    labels=cats,
    loc='center',
    ncol=3,
    fontsize=8,
    frameon=False,
    handlelength=1.5,
    handleheight=1.5,
    columnspacing=1.5,
    labelspacing=1.0
)

# White outline around the legend text
for text in legend.get_texts():
    text.set_path_effects([
        patheffects.Stroke(linewidth=2, foreground='white'),
        patheffects.Normal()
    ])

fig2.tight_layout()

# Save BEFORE plt.show(): once the figure has been shown (and, with the inline
# notebook backend, closed) a pyplot-level savefig would write an empty canvas.
# This file contains the legend figure (fig2).
fig2.savefig(os.path.join(figures_path, "PBMC_HD02_Propagated_Annotation.png"),
             dpi=300, bbox_inches='tight')
plt.show()

In [None]:
"""
FULL script: block-ordered Scanpy heatmap where ONLY the Y categories (gene blocks)
can be inverted, while the X order of clusters is locked.

- X (columns): clusters in a fixed order (dendrogram-derived once, then frozen)
- Y (rows): genes grouped by the cluster in which they peak (z-scored means);
            you can invert JUST these Y blocks without touching X.
"""

import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
from scipy import sparse

# --- Build a minimal AnnData from Mosaic arrays ---
coords = np.asarray(PBMC_HD02.protein.row_attrs['umap'])               # UMAP coordinates, one row per cell
labels = np.asarray(PBMC_HD02.protein.row_attrs['annotated_clusters']) # cluster label per cell

# X is the 'Normalized_reads' layer: it is the matrix the cluster means and the
# heatmap further down are computed from.
# `sc.AnnData` is used (rather than a bare `AnnData`) because this cell only
# imports scanpy; it is the same class re-exported by scanpy.
pbmc = sc.AnnData(X=PBMC_HD02.protein.layers['Normalized_reads'])

# Row/column names come from the labelled DataFrame view of the same layer.
# NOTE(review): assumes the DataFrame rows/columns are in the same order as the
# raw layer array — confirm against the Mosaic API.
df_protein = PBMC_HD02.protein.get_attribute('Normalized_reads', constraint='row+col')
pbmc.obs_names = df_protein.index
pbmc.var_names = df_protein.columns
pbmc.obsm['X_umap'] = coords
pbmc.obs['clusters'] = pd.Categorical(labels)     # must be categorical for nice legends

# ------------------ CONFIG ------------------
groupby = "clusters"     # column in pbmc.obs with cluster labels
use_raw = False          # True -> use pbmc.raw if present
genes_subset = None      # e.g. ['CD3D','CD3E', ...]; None = all genes

WITHIN_DESC = True       # sort genes inside each block by descending expression
INVERT_Y_BLOCKS = True   # flip ONLY the Y category order (top↔bottom)

cmap = "magma"
fig_h = 9
# -------------------------------------------

# 0) Ensure categorical; get a sensible cluster order once
if not pbmc.obs[groupby].dtype.name.startswith("category"):
    pbmc.obs[groupby] = pbmc.obs[groupby].astype("category")

# Compute the dendrogram only if it has not been stored yet.
if f"dendrogram_{groupby}" not in pbmc.uns:
    sc.tl.dendrogram(pbmc, groupby=groupby)

d = pbmc.uns.get(f"dendrogram_{groupby}", {})

# Scanpy stores the leaf order under "categories_ordered"; the two other spellings
# are kept as fallbacks. Explicit `is not None` checks are used instead of an
# `or` chain because the stored value may be a NumPy array, whose truth value is
# ambiguous.
cluster_order = None
for _key in ("categories_ordered", "categories_order", "ordered_categories"):
    _candidate = d.get(_key)
    if _candidate is not None and len(_candidate) > 0:
        cluster_order = list(_candidate)
        break

# reorder_categories() requires exactly the existing categories; if the stored
# order is missing or does not match them, fall back to the categorical's own order.
_categories = list(pbmc.obs[groupby].cat.categories)
if (
    cluster_order is None
    or len(cluster_order) != len(_categories)
    or set(cluster_order) != set(_categories)
):
    cluster_order = _categories

cluster_order = cluster_order[::-1]  # reversed on purpose (the order is displayed mirrored otherwise)

# --- LOCK the X order (clusters) and never touch it again ---
pbmc.obs[groupby] = pbmc.obs[groupby].cat.reorder_categories(cluster_order, ordered=True)
cluster_order_fixed = list(pbmc.obs[groupby].cat.categories)

# (optional) keep the palette consistent if a `custom_palette` mapping exists;
# clusters missing from it get a neutral grey. The NameError guard makes this a
# no-op when `custom_palette` was never defined in the notebook.
try:
    pbmc.uns[f"{groupby}_colors"] = [custom_palette.get(c, "#cccccc") for c in cluster_order_fixed]
except NameError:
    pass

# 1) Per-cluster mean expression, arranged as genes (rows) × clusters (columns)
if use_raw and pbmc.raw is not None:
    adata = pbmc.raw
else:
    adata = pbmc

all_genes = list(adata.var_names)
if genes_subset is None:
    genes = all_genes
else:
    # keep the requested order, silently dropping names that are not present
    genes = [g for g in genes_subset if g in all_genes]

X = adata[:, genes].X
X = X.toarray() if sparse.issparse(X) else X   # densify sparse matrices

df = pd.DataFrame(X, index=adata.obs_names, columns=genes)
df["__cluster__"] = pbmc.obs[groupby].values
M = (
    df.groupby("__cluster__", observed=True)
    .mean()
    .T                                          # genes × clusters
    .reindex(columns=cluster_order_fixed)       # columns follow the locked cluster order
)

# 2) Z-score every gene (row) across the cluster means
eps = 1e-9
std = M.std(axis=1).replace(0, np.nan)          # zero spread -> NaN so the gene is dropped below
Mz = M.sub(M.mean(axis=1), axis=0).div(std + eps, axis=0)

# Only genes without any NaN take part in the ordering
keep = ~Mz.isna().any(axis=1)
Mz = Mz.loc[keep]
genes_kept = list(Mz.index)

# 3) Build blocks: genes grouped by the cluster where they peak (following X order)
col_index = {c: i for i, c in enumerate(cluster_order_fixed)}
_mz_values = Mz.values                                                  # materialise once, reused by the sort keys
peak_idx = np.nanargmax(_mz_values, axis=1)                             # index into cluster_order_fixed
peak_cluster = [cluster_order_fixed[i] for i in peak_idx]

# Bucket row indices by peak cluster in a single pass (row order is preserved,
# so ties are broken exactly as a per-cluster scan would break them).
_rows_by_cluster = {c: [] for c in cluster_order_fixed}
for _row, _pc in enumerate(peak_cluster):
    _rows_by_cluster[_pc].append(_row)

genes_by_cluster = {c: [] for c in cluster_order_fixed}
for c in cluster_order_fixed:
    ci = col_index[c]
    # sort genes by their (z-scored) value in cluster c
    sub = sorted(_rows_by_cluster[c], key=lambda i: _mz_values[i, ci], reverse=WITHIN_DESC)
    genes_by_cluster[c] = [genes_kept[i] for i in sub]

# optional: leftovers (e.g., constant genes that were dropped above).
# The set of assigned genes is built once instead of once per gene.
_assigned = {g for _block in genes_by_cluster.values() for g in _block}
leftovers = [g for g in genes if g not in _assigned]
has_other = len(leftovers) > 0

# 4) INVERT ONLY THE Y categories (gene blocks) if requested
orig_blocks = list(cluster_order_fixed) + (["Other"] if has_other else [])
block_order = list(reversed(orig_blocks)) if INVERT_Y_BLOCKS else orig_blocks

# rebuild the gene order to follow that Y category order
genes_final = []
for c in block_order:
    if c == "Other":
        genes_final.extend(leftovers)
    else:
        genes_final.extend(genes_by_cluster.get(c, []))

# rebuild the block annotations to match the (possibly inverted) Y order:
# one inclusive (first, last) position range per non-empty block, indexed into
# `genes_final`.
# NOTE(review): the heatmap call that follows does not pass these two lists and
# plots `genes_final[::-1]`, so they would need recomputing for the reversed
# order before being used as var_group_positions / var_group_labels.
var_group_positions, var_group_labels = [], []
start = 0
for c in block_order:
    block = leftovers if c == "Other" else genes_by_cluster.get(c, [])
    n = len(block)
    if n == 0:
        continue
    var_group_positions.append((start, start + n - 1))
    var_group_labels.append(c)
    start += n

# 5) Plot (dendrogram OFF so the custom cluster and gene orders are respected)
# NOTE(review): `swap_axes` is not set, so by Scanpy's default the var_names run
# along x and the grouped cells along y — the header describes the transposed
# layout; confirm which one is intended.
w = 4.5  # not used by the call below; kept in case later cells read it
g = sc.pl.heatmap(
    pbmc,
    var_names=genes_final[::-1],   # gene order is reversed once more at plot time
    groupby=groupby,
    use_raw=False,
    cmap=cmap,
    dendrogram=False,
    var_group_rotation=0,
    figsize=(8, 12),
    show=False                     # keep the figure open so it can be saved below
)

plt.tight_layout()

# Save BEFORE showing: with the notebook inline backend, plt.show() closes the
# figure, so a savefig issued afterwards writes an empty image.
plt.savefig(os.path.join(figures_path, "PBMC_HD02_Propagated_Identities_Heatmap.png"),
            dpi=300, bbox_inches='tight')
plt.show()
