# Set-up

In [None]:
import os

# NOTE: '__file__' is a string literal here, not the module attribute, so realpath()
# resolves it relative to the current working directory and script_dir ends up being
# the directory the kernel is running in.
script_dir = os.path.dirname(os.path.realpath('__file__'))
# One level up; later cells build the Data/, Figures/ and Scripts/ paths from it.
parent_dir = os.path.dirname(script_dir)

## Importing modules

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import cellhint
import harmonypy as hm
import seaborn as sns
import random
import matplotlib.pyplot as plt
import espressopro as ep

In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and implicit-modification warnings raised by
# the libraries used below.
warnings.filterwarnings('ignore')

Loading custom scripts

In [None]:
def assign_labels(dataset, reduction, n_neighbors, label_input, label_output, frequency_threshold):
    """Propagate the dominant label of each Leiden cluster to all of its cells.

    A neighbourhood graph is computed with ``use_rep=reduction``, the cells are
    clustered with Leiden at resolution 10 (stored in ``dataset.obs['clusters']``)
    and, for every cluster whose most frequent ``label_input`` value covers more
    than ``frequency_threshold`` of its cells, that value is written to
    ``label_output`` for the whole cluster. Cells of all other clusters keep the
    value they had in ``label_input``.

    Parameters
    ----------
    dataset : object passed to ``sc.pp.neighbors`` / ``sc.tl.leiden``; its ``.obs`` is modified.
    reduction : representation name forwarded as ``use_rep``.
    n_neighbors : neighbourhood size forwarded to ``sc.pp.neighbors``.
    label_input : name of the ``.obs`` column holding the existing labels.
    label_output : name of the ``.obs`` column to create or overwrite.
    frequency_threshold : fraction that the dominant label must strictly exceed.

    Returns
    -------
    The same ``dataset`` object (modified in place).
    """
    # Compute the neighborhood graph
    sc.pp.neighbors(dataset, use_rep=reduction, n_neighbors=n_neighbors)

    # Perform the clustering
    sc.tl.leiden(dataset, key_added='clusters', resolution=10)

    # Initialize the new column with the existing labels
    dataset.obs[label_output] = dataset.obs[label_input]

    # For each cluster, find the most frequent label and assign it to all cells in the cluster
    for cluster in dataset.obs['clusters'].unique():
        # Build the membership mask once and reuse it for reading and writing
        in_cluster = dataset.obs['clusters'] == cluster
        cluster_labels = dataset.obs.loc[in_cluster, label_input]

        # mode() is empty when every label in the cluster is missing; leave such clusters as they are
        modes = cluster_labels.mode()
        if modes.empty:
            continue

        most_frequent_label = modes.iloc[0]
        frequency = (cluster_labels == most_frequent_label).mean()

        if frequency > frequency_threshold:
            dataset.obs.loc[in_cluster, label_output] = most_frequent_label

    return dataset

In [None]:
def grouped_obs_mean(adata, group_key, layer=None, gene_symbols=None):
    """Return the per-group mean of every variable.

    Parameters
    ----------
    adata : AnnData-like object providing ``obs``, ``var``, ``var_names``, ``shape``,
        ``X`` / ``layers`` and positional row indexing.
    group_key : column of ``adata.obs`` to group the observations by.
    layer : name of the layer to average; ``adata.X`` is used when ``None``.
    gene_symbols : optional column of ``adata.var`` whose values label the rows of
        the result instead of ``adata.var_names``.

    Returns
    -------
    pandas.DataFrame with one row per variable and one column per group.
    """
    # Row labels of the result. The original looked up `adata.var[idx]`, a name that is
    # not bound at this point, so passing `gene_symbols` always raised.
    if gene_symbols is not None:
        new_idx = pd.Index(adata.var[gene_symbols])
    else:
        new_idx = adata.var_names

    grouped = adata.obs.groupby(group_key)
    out = pd.DataFrame(
        np.zeros((adata.shape[1], len(grouped)), dtype=np.float64),
        columns=list(grouped.groups.keys()),
        index=new_idx
    )

    # `indices` maps each group to the integer positions of its observations
    for group, idx in grouped.indices.items():
        subset = adata[idx]
        X = subset.layers[layer] if layer is not None else subset.X
        # ravel() flattens the (1, n_vars) result that matrix-like inputs return
        out[group] = np.ravel(X.mean(axis=0, dtype=np.float64))
    return out

In [None]:
import sys
sys.path.append(parent_dir + '/Scripts/SingleCellUtils')

import SCUtils

In [None]:
pip list

PYTHONHASHSEED was set to 0 as an environment variable, as follows:
    
conda env config vars set PYTHONHASHSEED=0

In [None]:
# Seed Python's and NumPy's global random generators for the stochastic steps below.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so this
# assignment presumably only reaches child processes — the conda env var described
# above is what would cover the running kernel. Confirm.
os.environ['PYTHONHASHSEED'] = '0'
random.seed(42)
np.random.seed(42)

In [None]:
def ensure_pythonhashseed(seed=0):
    """Re-execute the interpreter unless PYTHONHASHSEED already equals ``seed``.

    ``seed`` is compared as a string with the current environment value. When
    they match nothing happens; otherwise the variable is set and the running
    process is replaced via ``os.execl`` using ``sys.executable`` and ``sys.argv``.
    """
    wanted = str(seed)
    if os.environ.get("PYTHONHASHSEED") == wanted:
        return

    print(f'Setting PYTHONHASHSEED="{wanted}"')
    os.environ["PYTHONHASHSEED"] = wanted
    # Replace the current process with a fresh interpreter started with the same arguments
    os.execl(sys.executable, sys.executable, *sys.argv)

In [None]:
import random

# Draw 128 random bits and print them as 32 hex digits — presumably a quick check that
# the seed set above yields the same value on every run.
# NOTE(review): the name `hash` shadows the built-in hash() for the rest of the session.
hash = random.getrandbits(128)

print("hash value: %032x" % hash)

## Defining data path

In [None]:
# Input data and figure output folders, both located under parent_dir
data_path = parent_dir + "/Data"
figures_path = parent_dir + "/Figures/Label_Harmonisation"

# Create the figure folder if needed; exist_ok=True makes the call safe to repeat
os.makedirs(figures_path, exist_ok=True)

# Processing Zhang X. et al. (2024) dataset

In [None]:
Zhang_dataset = sc.read_h5ad(data_path + '/References/Zhang' + '/adata_combined_rna_adt_annotated-titrated.h5ad')

## Dataset Description

In [None]:
Zhang_dataset

In [None]:
# Store both UMAP embeddings as plain NumPy arrays. np.asarray yields the same array as
# `.values` and leaves an existing ndarray untouched, so re-running this cell is harmless.
Zhang_dataset.obsm['X_umap'] = np.asarray(Zhang_dataset.obsm['X_umap'])
Zhang_dataset.obsm['X_umap-titration'] = np.asarray(Zhang_dataset.obsm['X_umap-titration'])

In [None]:
# Plot the UMAP coloured by the 'Level 3 Multimodal' labels; show=False so the figure
# can be restyled below before it is saved and shown
sc.pl.embedding(Zhang_dataset, 
                color='Level 3 Multimodal', 
                basis='X_umap', 
                legend_loc='right margin', 
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(4.5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Zhang X. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Original annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Place the legend below the plot
legend = ax.legend(loc='upper center', 
                   bbox_to_anchor=(0.5, -0.05),
                   prop={'size': 4.8},
                   ncol=5)

# Reduce the size of the dots in the legend (writes the handles' private `_sizes` attribute)
for handle in legend.legend_handles:
    handle._sizes = [10]

# Adjust the layout to make room for the legend
plt.subplots_adjust(bottom=0.3)

# Save the figure at 300 dpi
plt.savefig(figures_path + "/01_ZhangX_original_annotation.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
# Feature names to retain, spelled as they appear in Zhang_dataset.var_names before renaming
adt_features_to_keep = [
    'CD230', 'Hu.C5L2', 'Hu.CD10', 'Hu.CD101', 'Hu.CD102', 'Hu.CD103', 'Hu.CD105_43A3', 'Hu.CD106', 'Hu.CD109', 'Hu.CD110', 
    'Hu.CD112', 'Hu.CD115', 'Hu.CD116', 'Hu.CD117', 'Hu.CD119', 'Hu.CD11a', 'Hu.CD11b', 'Hu.CD11c', 'Hu.CD123', 'Hu.CD127', 
    'Hu.CD13', 'Hu.CD133_S16016B', 'Hu.CD135', 'Hu.CD138_DL.101', 'Hu.CD140b', 'Hu.CD141', 'Hu.CD14_M5E2', 'Hu.CD150', 'Hu.CD151', 
    'Hu.CD154', 'Hu.CD155', 'Hu.CD158e1', 'Hu.CD158f', 'Hu.CD15_W6D3', 'Hu.CD16', 'Hu.CD162', 'Hu.CD163', 'Hu.CD164', 'Hu.CD172a', 
    'Hu.CD177', 'Hu.CD18', 'Hu.CD183', 'Hu.CD185', 'Hu.CD186', 'Hu.CD19', 'Hu.CD192', 'Hu.CD1a', 'Hu.CD1d', 'Hu.CD2', 'Hu.CD200', 
    'Hu.CD201', 'Hu.CD202b', 'Hu.CD205', 'Hu.CD226_TX25', 'Hu.CD235a', 'Hu.CD24', 'Hu.CD25', 'Hu.CD26', 'Hu.CD27', 'Hu.CD271', 
    'Hu.CD274', 'Hu.CD279', 'Hu.CD28', 'Hu.CD29', 'Hu.CD304', 'Hu.CD305_LAIR1', 'Hu.CD309', 'Hu.CD32', 'Hu.CD325', 'Hu.CD326', 
    'Hu.CD33', 'Hu.CD335', 'Hu.CD34', 'Hu.CD35', 'Hu.CD354', 'Hu.CD36', 'Hu.CD366', 'Hu.CD37', 'Hu.CD38_HIT2', 'Hu.CD41', 'Hu.CD42b', 
    'Hu.CD43', 'Hu.CD45RA', 'Hu.CD45RB', 'Hu.CD45RO', 'Hu.CD45_2D1', 'Hu.CD47', 'Hu.CD49b', 'Hu.CD4_RPA.T4', 'Hu.CD5', 'Hu.CD52', 
    'Hu.CD54', 'Hu.CD55', 'Hu.CD56', 'Hu.CD57', 'Hu.CD58', 'Hu.CD59', 'Hu.CD61', 'Hu.CD62L', 'Hu.CD62P', 'Hu.CD63', 'Hu.CD64', 
    'Hu.CD69', 'Hu.CD7', 'Hu.CD71', 'Hu.CD72', 'Hu.CD73', 'Hu.CD8', 'Hu.CD81', 'Hu.CD82', 'Hu.CD83', 'Hu.CD84', 'Hu.CD85g', 'Hu.CD9', 
    'Hu.CD90', 'Hu.CD93', 'Hu.CD98', 'Hu.CLEC1B', 'Hu.Cadherin.11', 'Hu.FR.b', 'Hu.FceRIa', 'Hu.GARP', 'Hu.GPR56', 'Hu.Galectin.9', 
    'Hu.HLA.ABC', 'Hu.HLA.DR.DP.DQ', 'Hu.KLRG1', 'Hu.TIM.4', 'Hu.TSPAN33', 'HuMs.CD44', 'HuMs.CD49f', 'HuMs.integrin.b7', 
    'Isotype_G0114F7', 'Isotype_HTK888', 'Isotype_MOPC.173', 'Isotype_MOPC.21', 'Isotype_MPC.11', 'Isotype_RTK2071', 'Isotype_RTK2758', 
    'Isotype_RTK4174', 'Isotype_RTK4530', 'Hu.IgG.Fc'
]

# Boolean mask over the current var_names. It keeps the name `vars_to_keep` because
# later cells index the per-sample matrices with it. np.isin replaces the deprecated np.in1d.
vars_to_keep = np.isin(Zhang_dataset.var_names, adt_features_to_keep)
# Copy so that the in-place edits made in later cells act on a real object, not a view
Zhang_dataset = Zhang_dataset[:, vars_to_keep].copy()

In [None]:
def zhang_dataset_adt_rename(dataset):
    """Clean the feature names in ``dataset.var`` in place.

    Every occurrence of 'Hu.' and then 'HuMs.' is removed from each index label.
    Labels that afterwards match one of the entries below are replaced by the
    short name listed for them. Nothing is returned.
    """
    short_names = {
        'FceRIa': 'FcεRIα',
        'CD4_RPA.T4': 'CD4',
        'CD45_2D1': 'CD45',
        'CD38_HIT2': 'CD38',
        'CD305_LAIR1': 'CD305',
        'CD226_TX25': 'CD226',
        'CD15_W6D3': 'CD15',
        'CD14_M5E2': 'CD14',
        'CD138_DL.101': 'CD138',
        'CD133_S16016B': 'CD133',
        'CD105_43A3': 'CD105',
    }

    def _clean(label):
        # Strip the prefixes first, then look the result up in the table
        stripped = label.replace('Hu.', '').replace('HuMs.', '')
        return short_names.get(stripped, stripped)

    dataset.var.rename(index=_clean, inplace=True)

zhang_dataset_adt_rename(Zhang_dataset)

In [None]:
def _read_zhang_sample(sample_id):
    """Read one GSE245108 filtered 10x matrix and keep the columns selected by `vars_to_keep`."""
    h5_path = data_path + '/References/Zhang' + '/GSE245108_' + sample_id + '_filtered_feature_bc_matrix.h5'
    # NOTE(review): vars_to_keep is a boolean mask computed on Zhang_dataset.var_names, so this
    # assumes every raw matrix lists its features in that same order — confirm.
    return sc.read_10x_h5(h5_path, gex_only=False)[:, vars_to_keep]

BF21_CD34 = _read_zhang_sample('BF21-CD34')
BF21_CD271 = _read_zhang_sample('BF21-CD271')
BF21_TNC = _read_zhang_sample('BF21-TNC')

WF26_CD34 = _read_zhang_sample('WF26-CD34')
WF26_CD271 = _read_zhang_sample('WF26-CD271')
WF26_TNC = _read_zhang_sample('WF26-TNC')

BM27_CD34 = _read_zhang_sample('BM27-CD34')
BM27_CD271 = _read_zhang_sample('BM27-CD271')
BM27_TNC = _read_zhang_sample('BM27-TNC')

WM34_CD34 = _read_zhang_sample('WM34-CD34')
WM34_CD271 = _read_zhang_sample('WM34-CD271')
WM34_TNC = _read_zhang_sample('WM34-TNC')

In [None]:
# Apply the shared feature renaming to every per-sample matrix, in the same order as before
for sample_adata in (
    BF21_CD34, BF21_CD271, BF21_TNC,
    WF26_CD34, WF26_CD271, WF26_TNC,
    BM27_CD34, BM27_CD271, BM27_TNC,
    WM34_CD34, WM34_CD271, WM34_TNC,
):
    zhang_dataset_adt_rename(sample_adata)

In [None]:
# Append a per-sample suffix to every barcode.
# NOTE: running this cell twice appends the suffix twice.
for sample_adata, sample_suffix in (
    (BF21_CD34, '.BF21_032123_CD34'),
    (BF21_CD271, '.BF21_032123_CD271'),
    (BF21_TNC, '.BF21_032123_TNC'),
    (WF26_CD34, '.WF26_031423_CD34'),
    (WF26_CD271, '.WF26_031423_CD271'),
    (WF26_TNC, '.WF26_031423_TNC'),
    (BM27_CD34, '.BM27_120522_CD34'),
    (BM27_CD271, '.BM27_120522_CD271'),
    (BM27_TNC, '.BM27_120522_TNC'),
    (WM34_CD34, '.WM34_120522_CD34'),
    (WM34_CD271, '.WM34_120522_CD271'),
    (WM34_TNC, '.WM34_120522_TNC'),
):
    sample_adata.obs_names = sample_adata.obs_names + sample_suffix

In [None]:
WM34_CD34

In [None]:
# Stack the twelve per-sample matrices along the observation axis (axis=0)
merged_adata = anndata.concat([BF21_CD34, BF21_CD271, BF21_TNC,
                               WF26_CD34, WF26_CD271, WF26_TNC,
                               BM27_CD34, BM27_CD271, BM27_TNC,
                               WM34_CD34, WM34_CD271, WM34_TNC], axis=0)

In [None]:
# Keep only the barcodes that also occur in the annotated object
# (np.isin replaces the deprecated np.in1d) ...
obs_to_keep = np.isin(merged_adata.obs_names, Zhang_dataset.obs_names)
merged_adata = merged_adata[obs_to_keep, :]
# ... then index by label so that the rows follow Zhang_dataset's cell order
merged_adata = merged_adata[Zhang_dataset.obs_names]

In [None]:
# Replace the annotated object's matrix with the one read from the per-sample 10x files.
# Rows were put in Zhang_dataset's cell order in the previous cell.
# NOTE(review): column alignment relies on both objects having been subset with the same
# `vars_to_keep` mask — confirm that the feature order matches.
Zhang_dataset.X = merged_adata.X

In [None]:
Zhang_dataset.obs['Batch'] = Zhang_dataset.obs['sample'].values

In [None]:
Zhang_dataset.obs['Chemistry'] = 'BioLegend TotalSeqA'

# Loading Hao Y. et al. (2021) dataset

In [None]:
Hao_dataset = sc.read_h5ad(data_path + "/References/Hao" + "/228AB_healthy_donors_PBMNCs.h5ad")

## Dataset Description

In [None]:
Hao_dataset

In [None]:
# NOTE(review): leftover inspection — `ax` is only defined by an earlier plotting cell.
type(ax)

In [None]:
# Plot the 'X_wnn.umap' embedding coloured by the 'celltype.l3' labels; show=False so the
# figure can be restyled below before it is saved and shown
sc.pl.embedding(Hao_dataset, 
                color='celltype.l3', 
                basis='X_wnn.umap', 
                legend_loc='right margin', 
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(4.5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Hao Y. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Original annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Place the legend below the plot
legend = ax.legend(loc='upper center', 
                   bbox_to_anchor=(0.5, -0.05),
                   prop={'size': 4.8},
                   ncol=5)

# Reduce the size of the dots in the legend (writes the handles' private `_sizes` attribute)
for handle in legend.legend_handles:
    handle._sizes = [10]

# Adjust the layout to make room for the legend
plt.subplots_adjust(bottom=0.3)

# Save the figure at 300 dpi
plt.savefig(figures_path + "/02_HaoY_original_annotation.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
# Drop the features returned by SCUtils.Filter_duplicate_vars
# (presumably the duplicated ones — the helper is defined outside this notebook).
# np.isin replaces the deprecated np.in1d.
var_to_drop = np.isin(Hao_dataset.var_names, SCUtils.Filter_duplicate_vars(Hao_dataset))
# Copy so that the obs columns added in the next cells are written to a real object, not a view
Hao_dataset = Hao_dataset[:, ~var_to_drop].copy()

In [None]:
Hao_dataset.obs['Chemistry'] = 'BioLegend TotalSeqA'

In [None]:
Hao_dataset.obs['Batch'] = Hao_dataset.obs['donor'].values

# Loading Triana S. et al. (2021) dataset

In [None]:
Triana_dataset = sc.read_h5ad(data_path + "/References/Triana" + "/97AB_young_and_old_adult_healthy_donor_BMMNCs.h5ad")

## Dataset Description

In [None]:
Triana_dataset

In [None]:
# Plot the 'X_mofaumap' embedding coloured by the 'CellTypes' labels; show=False so the
# figure can be restyled below before it is saved and shown
sc.pl.embedding(Triana_dataset, 
                color='CellTypes', 
                basis='X_mofaumap', 
                legend_loc='right margin', 
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5.5, 5.5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Original annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Place the legend below the plot
legend = ax.legend(loc='upper center', 
                   bbox_to_anchor=(0.5, -0.05),
                   prop={'size': 4.8},
                   ncol=3)

# Reduce the size of the dots in the legend (writes the handles' private `_sizes` attribute)
for handle in legend.legend_handles:
    handle._sizes = [10]

# Adjust the layout to make room for the legend
plt.subplots_adjust(bottom=0.3)

# Save the figure at 300 dpi
plt.savefig(figures_path + "/03_TrianaS_original_annotation.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

Recording the antibody chemistry as metadata, for use in the inter-dataset integration

In [None]:
# NOTE(review): unlike the Zhang, Hao and Luecken sections, no 'Batch' column is assigned
# for this dataset, yet Harmony is later run on 'Batch' — confirm that the loaded obs
# already contains it.
Triana_dataset.obs['Chemistry'] = 'BD AbSeq'

Renaming feature labels to match across datasets

In [None]:
# Rename both features in a single pass
Triana_dataset.var.rename(index={'HLA.DR': 'HLA-DR', 'FCER1A': 'FcεRIα'}, inplace=True)

# Loading Luecken M.D. et al. (2021) dataset

In [None]:
Luecken_dataset = sc.read_h5ad(data_path + "/References/Luecken" + "/140AB_adult_healthy_donor_BMMNCs.h5ad")

In [None]:
# Keep only the features whose 'feature_types' entry is 'ADT'
adt = Luecken_dataset.var['feature_types'] == 'ADT'
# Copy before assigning .X: without it the assignment below is made on a view of the
# full object rather than on an independent subset.
Luecken_dataset = Luecken_dataset[:, adt].copy()
# Use the 'counts' layer as the main matrix (copied so X and the layer do not share storage)
Luecken_dataset.X = Luecken_dataset.layers['counts'].copy()

## Dataset Description

In [None]:
Luecken_dataset

In [None]:
Luecken_dataset.obs['Batch'] = Luecken_dataset.obs['batch'].values

We are computing new embeddings as the original embeddings are not clear

In [None]:
# NOTE(review): scanpy, numpy and harmonypy are already imported at the top of the notebook,
# and `issparse` is not used in this cell.
import scanpy as sc
from scipy.sparse import issparse
import numpy as np
import harmonypy as hm

# Work on a copy: only the Harmony-corrected PCs are written back to Luecken_dataset,
# whose X is left as it is.
adata = Luecken_dataset.copy()

# Normalize in-place on AnnData so you don't lose structure
ep.Normalise_protein_data(adata, inplace=True, axis=1, flavor="seurat")

ep.Scale_protein_data(adata, inplace=True)

random.seed(42)
np.random.seed(42)

# PCA (cells x PCs)
sc.tl.pca(adata, n_comps=30, svd_solver="arpack")

random.seed(42)
np.random.seed(42)

# The PCA matrix is transposed to (PCs x cells) before being handed to Harmony
Z = np.asarray(adata.obsm["X_pca"].T, dtype=np.float64)

# Batch-correct using the 'Batch' column of obs
ho = hm.run_harmony(Z, adata.obs, ["Batch"], max_iter_harmony=30, random_state=42)

# Transpose the corrected matrix back — assumes ho.Z_corr is (PCs x cells); TODO confirm
# for the installed harmonypy version
adata.obsm["X_pcahm"] = ho.Z_corr.T

Luecken_dataset.obsm['X_pcahm'] = adata.obsm['X_pcahm']


In [None]:
# Re-seed before each stochastic step
random.seed(42)
np.random.seed(42)

# Neighbour graph on the Harmony-corrected PCs (n_pcs=13 is passed together with use_rep)
sc.pp.neighbors(Luecken_dataset, n_neighbors=30, n_pcs=13, use_rep="X_pcahm", random_state = 42)

random.seed(42)
np.random.seed(42)

# New UMAP, plotted in the next cell via basis='X_umap'
sc.tl.umap(Luecken_dataset, random_state = 42)

In [None]:
# Plot the recomputed UMAP coloured by the 'cell_type' labels; show=False so the figure
# can be restyled below before it is saved and shown
sc.pl.embedding(Luecken_dataset, 
                color='cell_type', 
                basis='X_umap', 
                legend_loc='right margin', 
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5.5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Original annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Place the legend below the plot
legend = ax.legend(loc='upper center', 
                   bbox_to_anchor=(0.5, -0.05),
                   prop={'size': 4.8},
                   ncol=4)

# Reduce the size of the dots in the legend (writes the handles' private `_sizes` attribute)
for handle in legend.legend_handles:
    handle._sizes = [10]

# Adjust the layout to make room for the legend
plt.subplots_adjust(bottom=0.3)

# Save the figure at 300 dpi
plt.savefig(figures_path + "/04_LueckenMD_original_annotation.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
Luecken_dataset.obs['Chemistry'] = 'BioLegend TotalSeqB'

In [None]:
Luecken_dataset.var.rename(index={'FceRIa': 'FcεRIα'}, inplace=True)

# Label harmonisation

### All cellular types

In [None]:
# Collect each dataset's own annotation column into one categorical, in the order
# Zhang, Hao, Triana, Luecken.
# NOTE(review): a later cell assigns this by position to the merged object, so it relies
# on the datasets being concatenated in this same order with no cells dropped in between.
original_labels = pd.Categorical(np.concatenate((Zhang_dataset.obs['Level 3 Multimodal'].values, 
                                                 Hao_dataset.obs['celltype.l3'].values, 
                                                 Triana_dataset.obs['CellTypes'].values, 
                                                 Luecken_dataset.obs['cell_type'].values)))

In [None]:
# Datasets keyed by the value that ends up in obs['dataset_name']
datasets_by_name = {
    "Zhang": Zhang_dataset,
    "Hao": Hao_dataset,
    "Triana": Triana_dataset,
    "Luecken": Luecken_dataset,
}

# Outer join, with the source of every cell recorded in 'dataset_name'
adatas_merged = anndata.concat(datasets_by_name, label="dataset_name", join="outer")

In [None]:
# NOTE(review): star import; only `venny4py` is used in this cell.
from venny4py.venny4py import *

# Feature names of each dataset as sets for the Venn diagram
sets = {'Zhang': set(Zhang_dataset.var_names),
        'Hao': set(Hao_dataset.var_names),
        'Triana': set(Triana_dataset.var_names),
        'Luecken': set(Luecken_dataset.var_names)}

# Define custom colors for each dataset
colors = ['#1F77B4',  # Zhang - blue
          '#FE8010',  # Hao - orange  
          '#2EA02E',  # Triana - green
          '#D62828']  # Luecken - red

venny4py(sets=sets, out=figures_path, ext='png', colors=colors)

# Display the plot
plt.show()

# Give the two files produced under figures_path their numbered names.
# os.replace overwrites an existing target on every platform, so the cell can be re-run.
for current_file_name, new_file_name in (
    (figures_path + "/Venn_4.png", figures_path + "/05_Shared_ADTs_across_all_datasets.png"),
    (figures_path + "/Intersections_4.txt", figures_path + "/05_Shared_ADTs_across_all_datasets.txt"),
):
    os.replace(current_file_name, new_file_name)

In [None]:
common = SCUtils.Intersect_lists(Zhang_dataset.var_names, 
                                 Hao_dataset.var_names, 
                                 Triana_dataset.var_names, 
                                 Luecken_dataset.var_names)

In [None]:
# Restrict to the shared features; copy so the obs column added next is written to a
# real object rather than to a view
adatas_merged = adatas_merged[:, common].copy()

In [None]:
adatas_merged.obs['Original_annotation'] = original_labels

In [None]:
# Work on a copy: only the Harmony-corrected PCs are written back to adatas_merged
adata = adatas_merged.copy()

# Normalize in-place on AnnData so you don't lose structure
ep.Normalise_protein_data(adata, inplace=True, axis=1, flavor="seurat")

ep.Scale_protein_data(adata, inplace=True)

random.seed(42)
np.random.seed(42)

# PCA (cells x PCs)
sc.tl.pca(adata, n_comps=30, svd_solver="arpack")

random.seed(42)
np.random.seed(42)

# The PCA matrix is transposed to (PCs x cells) before being handed to Harmony
Z = np.asarray(adata.obsm["X_pca"].T, dtype=np.float64)

# Batch-correct using the 'Batch' column of obs
ho = hm.run_harmony(Z, adata.obs, ["Batch"], max_iter_harmony=30, random_state=42)

# Transpose the corrected matrix back — assumes ho.Z_corr is (PCs x cells); TODO confirm
# for the installed harmonypy version
adata.obsm["X_pcahm"] = ho.Z_corr.T

adatas_merged.obsm['X_pcahm'] = adata.obsm['X_pcahm']


In [None]:
random.seed(42)
np.random.seed(42)

# Cosine neighbour graph on the Harmony-corrected PCs
sc.pp.neighbors(adatas_merged, use_rep="X_pcahm", n_neighbors=30, metric='cosine', random_state = 42)

# Round the stored graph weights to 5 decimals. The rounding is applied to the `.data`
# array of each sparse matrix explicitly instead of passing the sparse matrix to np.round.
for graph_key in ("connectivities", "distances"):
    graph = adatas_merged.obsp[graph_key].copy()
    graph.data = np.round(graph.data, decimals=5)
    adatas_merged.obsp[graph_key] = graph

In [None]:
random.seed(42)
np.random.seed(42)

sc.tl.umap(adatas_merged, random_state = 42,  min_dist=0.3)

In [None]:
# Plot the merged UMAP coloured by source dataset; show=False so the figure can be
# restyled below before it is saved and shown
sc.pl.embedding(adatas_merged, 
                color='dataset_name', 
                basis='X_umap', 
                legend_loc='right margin', 
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.05)

# Place the legend inside the axes, anchored at the top edge towards the right
legend = ax.legend(loc='upper center', 
                   bbox_to_anchor=(0.9, 1),
                   prop={'size': 4.8},
                   ncol=1)

# Reduce the size of the dots in the legend (writes the handles' private `_sizes` attribute)
for handle in legend.legend_handles:
    handle._sizes = [10]

# Leave extra space at the bottom of the figure
plt.subplots_adjust(bottom=0.3)

# Save the figure at 300 dpi
plt.savefig(figures_path + "/06_Merged_datasets__coloured_by_datasets.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
# Get unique datasets
datasets = adatas_merged.obs['dataset_name'].unique()

# One panel per dataset, two per row (a 2 x 2, 10 x 10 inch figure for four datasets).
# squeeze=False keeps `axes` two-dimensional even when there is a single row.
n_cols = 2
n_rows = max(1, (len(datasets) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for i, dataset in enumerate(datasets):
    # Filter data for current dataset
    dataset_mask = adatas_merged.obs['dataset_name'] == dataset
    dataset_data = adatas_merged[dataset_mask]
    
    # Plot UMAP for current dataset
    sc.pl.embedding(dataset_data, 
                    color='dataset_name', 
                    basis='X_umap', 
                    legend_loc='none',
                    add_outline=False,
                    frameon=False,
                    show=False,
                    ax=axes[i])
    
    # Set axis labels and title for each subplot
    axes[i].set_xlabel('UMAP 1', fontsize=12)
    axes[i].set_ylabel('UMAP 2', fontsize=12)
    axes[i].set_title(f'{dataset} dataset', fontsize=12, fontweight='bold')

# Remove every panel that did not receive a dataset (previously only axes[3] was handled)
for unused_ax in axes[len(datasets):]:
    fig.delaxes(unused_ax)

# Adjust layout
plt.tight_layout()

# Save the figure at 300 dpi
plt.savefig(figures_path + "/07_Merged_datasets_split_by_dataset.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
random.seed(42)
np.random.seed(42)

sc.tl.leiden(adatas_merged, resolution=3.5, random_state = 42, 
             n_iterations=10)

In [None]:
# Plot the merged UMAP coloured by Leiden cluster, with the cluster ids drawn on the data
sc.pl.embedding(adatas_merged, 
                color='leiden', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Cluster annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Save the figure at 300 dpi
plt.savefig(figures_path + "/08_Merged_datasets__coloured_by_leiden_clusters.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
random.seed(42)
np.random.seed(42)

alignment = cellhint.harmonize(adatas_merged, 'dataset_name', 'Original_annotation', 
                               use_rep='X_pcahm', metric='cosine')

In [None]:
cellhint.treeplot(alignment, save=figures_path + "/09_Merged_datasets__CellHint_Preannotated_Classes.png")

In [None]:
# Copy CellHint's per-cell 'reannotation' and 'group' columns into obs as
# 'low_hierarchy' and 'high_hierarchy', looked up by cell name.
# NOTE(review): the label-based .loc lookup presumes obs_names are unique across the four
# concatenated datasets — confirm.
adatas_merged.obs[['low_hierarchy', 'high_hierarchy']] = alignment.reannotation.loc[adatas_merged.obs_names, ['reannotation', 'group']]

In [None]:
adatas_merged.obs['low_hierarchy'] = pd.Categorical(adatas_merged.obs['low_hierarchy'])
adatas_merged.obs['high_hierarchy'] = pd.Categorical(adatas_merged.obs['high_hierarchy'])

In [None]:
# Plot the merged UMAP coloured by the CellHint 'high_hierarchy' groups (displayed only;
# the save call below is commented out)
sc.pl.embedding(adatas_merged, 
                color='high_hierarchy', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Cluster annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# # Save the figure at 300 dpi
# plt.savefig(figures_path + "/Merged_datasets_leiden_annotation.png", 
#             dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
adatas_merged

In [None]:
# Plot the merged UMAP coloured by one dataset's own annotation column (displayed only)
sc.pl.embedding(adatas_merged, 
                color='Level 3 Multimodal',  # other columns to try: Level 3 Multimodal, celltype.l2, CellTypes, cell_type
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Cluster annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Same Leiden-coloured UMAP as the earlier cluster plot, shown again here without saving
sc.pl.embedding(adatas_merged, 
                color='leiden', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Cluster annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
adatas_merged.obs['leiden']

In [None]:
# Leiden cluster to inspect (edit by hand)
cluster = '47'

# Per-cluster counts of the CellHint groups; print the five largest for this cluster
summary = adatas_merged.obs.groupby('leiden')['high_hierarchy'].value_counts()
print("Top 5 high_hierarchy for cluster", cluster)
print(summary.loc[cluster].nlargest(5))
print()

# Same for the original per-dataset annotations
summary = adatas_merged.obs.groupby('leiden')['Original_annotation'].value_counts()
print("Top 5 Original_annotation for cluster", cluster)
print(summary.loc[cluster].nlargest(5))
print()

# Get the top high_hierarchy group for this cluster to find related alignment info
# (takes the first entry of the value_counts result for the cluster)
top_hierarchy = adatas_merged.obs.groupby('leiden')['high_hierarchy'].value_counts().loc[cluster].index[0]
# Use the groups column directly instead of index filtering
matching_groups = alignment.relation[alignment.groups == top_hierarchy]
print("Related alignment groups:")
print(matching_groups)

In [None]:
# NOTE(review): numpy and pandas are already imported at the top of the notebook.
import numpy as np
import pandas as pd

# 0) Make sure leiden is string
adatas_merged.obs["leiden"] = adatas_merged.obs["leiden"].astype(str)

# 1) RESET: wipe any previous assignments (the column is overwritten again in step 3)
adatas_merged.obs["Consensus_annotation_detailed"] = pd.NA

# 2) Mapping from Leiden cluster id to label (covers clusters "0" to "55")
cluster_to_label = {
    "0":  "Progenitor",
    "1":  "CD4 T Naive",
    "2":  "B Naive",
    "3":  "Progenitor",
    "4":  "CD14 Mono",
    "5":  "CD8 T Naive",
    "6":  "NK CD56 dim",
    "7":  "Immature B",
    "8":  "CD14 Mono",
    "9":  "CD4 T Memory",
    "10": "CD14 Mono",
    "11": "CD14 Mono",
    "12": "NK CD56 dim",
    "13": "Progenitor",
    "14": "pDC",
    "15": "CD8 T Memory",
    "16": "B Memory",
    "17": "CD16 Mono",
    "18": "Progenitor",
    "19": "Progenitor",
    "20": "cDC2",
    "21": "CD8 T Memory",
    "22": "CD4 T Memory",
    "23": "CD4 T Memory",
    "24": "CD8 T Memory",
    "25": "Progenitor",
    "26": "CD8 T Memory",
    "27": "CD4 T Memory",
    "28": "MAIT",
    "29": "CD4 T Naive",
    "30": "NK CD56 bright",
    "31": "Treg",
    "32": "CD14 Mono",
    "33": "Progenitor",
    "34": "CD4 CTL",
    "35": "CD14 Mono",
    "36": "CD8 T Memory",
    "37": "CD14 Mono",
    "38": "CD8 T Memory",
    "39": "Plasma",
    "40": "CD8 T Naive",
    "41": "B Memory",
    "42": "NK CD56 dim",
    "43": "NK CD56 dim",
    "44": "Progenitor",
    "45": "Progenitor",
    "46": "Mesenchymal",
    "47": "cDC1",
    "48": "DnT",
    "49": "Progenitor",
    "50": "CD14 Mono",
    "51": "B Memory",
    "52": "Macrophage",
    "53": "CD14 Mono",
    "54": "CD14 Mono",
    "55": "B Naive",
}

# 3) Assign in one shot (clusters absent from the mapping end up as missing values)
adatas_merged.obs["Consensus_annotation_detailed"] = adatas_merged.obs["leiden"].map(cluster_to_label)

# 4) Sanity checks — these only print; nothing is raised when clusters are unmapped
all_clusters = sorted(adatas_merged.obs["leiden"].unique(), key=lambda x: int(x))
missing = [c for c in all_clusters if c not in cluster_to_label]
unassigned_n = adatas_merged.obs["Consensus_annotation_detailed"].isna().sum()

print(f"Unique clusters in data: {len(all_clusters)}")
print(f"Clusters missing from mapping: {missing}")
print(f"Unassigned cells after mapping: {unassigned_n}")


In [None]:
# Clear any existing color palettes to force scanpy to regenerate them
if 'Consensus_annotation_detailed_colors' in adatas_merged.uns:
    del adatas_merged.uns['Consensus_annotation_detailed_colors']

# Plot the merged UMAP coloured by the draft consensus labels
sc.pl.embedding(adatas_merged, 
                color='Consensus_annotation_detailed', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Draft consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Save the figure at 300 dpi
# NOTE(review): the file name says "broad" while the plotted column is the detailed annotation.
plt.savefig(figures_path + "/10_Merged_datasets__coloured_by_preliminary_consensus_annotation_broad.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
# Extract CD4 T Naive, CD4 T Memory, CD8 T Memory, CD8 T Naive, MAIT, and DnT cells
t_cell_types = ['CD4 T Naive', 'CD4 T Memory', 'CD8 T Memory', 'CD8 T Naive', 'MAIT', 'DnT']
t_cell_mask = adatas_merged.obs['Consensus_annotation_detailed'].isin(t_cell_types)
# Independent copy, so the graph/UMAP computed below do not touch adatas_merged
t_cell_subset = adatas_merged[t_cell_mask].copy()

print(f"Number of T cells: {t_cell_subset.n_obs}")
print(f"Leiden clusters containing T cells: {t_cell_subset.obs['leiden'].unique()}")

# Check distribution of cell types
print("\nDistribution of T cell types:")
print(t_cell_subset.obs['Consensus_annotation_detailed'].value_counts())

random.seed(42)
np.random.seed(42)

# Perform subclustering on T cells (reuses the Harmony-corrected PCs of the merged object)
sc.pp.neighbors(t_cell_subset, use_rep="X_pcahm", n_neighbors=15, metric='cosine', random_state=42)
sc.tl.leiden(t_cell_subset, resolution=3, random_state=42, key_added='t_cell_subclusters')

random.seed(42)
np.random.seed(42)

# Create UMAP for the subset
sc.tl.umap(t_cell_subset, random_state=42, min_dist=0.3)

# Plot the subclusters
sc.pl.embedding(t_cell_subset, 
                color='t_cell_subclusters', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=6,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('T Cell Subclustering', fontsize=12, fontweight='bold', y=1.1)

plt.show()

# Plot the draft consensus labels on the same embedding
sc.pl.embedding(t_cell_subset, 
                color='Consensus_annotation_detailed', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=6,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('T Cell Original Annotations', fontsize=12, fontweight='bold', y=1.1)

plt.show()

# Check original annotations within each subcluster (five most frequent per subcluster)
print("\nOriginal annotations per subcluster:")
for cluster in sorted(t_cell_subset.obs['t_cell_subclusters'].unique()):
    cluster_cells = t_cell_subset.obs[t_cell_subset.obs['t_cell_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['Original_annotation'].value_counts().head())

# Rank the features of each subcluster with a Wilcoxon test on X (use_raw=False)
sc.tl.rank_genes_groups(t_cell_subset, 't_cell_subclusters', method='wilcoxon', use_raw=False)
sc.pl.rank_genes_groups(t_cell_subset, n_genes=5, sharey=False, ncols=3, fontsize=12)

plt.show()

In [None]:
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# -----------------------------------------------------------------------------
# T-cell reassignment
# Relabel selected T-cell subclusters in adatas_merged.obs[LABEL_COL_MAIN];
# every other cell keeps the label it currently has.
#
# User-editable: NEW LABEL -> list of subcluster IDs that should receive it
# -----------------------------------------------------------------------------
SUBCLUSTER_TO_LABEL = {
    "Gamma delta T": ["37"],
}

LABEL_COL_MAIN = "Consensus_annotation_detailed"
SUBCLUSTER_COL = "t_cell_subclusters"

# -----------------------------------------------------------------------------
# 0) Ensure subcluster IDs are strings for matching
# -----------------------------------------------------------------------------
t_cell_subset.obs[SUBCLUSTER_COL] = t_cell_subset.obs[SUBCLUSTER_COL].astype(str)

# -----------------------------------------------------------------------------
# 1) Snapshot the labels BEFORE anything is written back.
#    The change summary in step 5 compares against this snapshot; taking it
#    after the write-back would always report "no changes".
# -----------------------------------------------------------------------------
original_labels = adatas_merged.obs.loc[t_cell_subset.obs_names, LABEL_COL_MAIN].astype(str).copy()

# -----------------------------------------------------------------------------
# 2) Ensure target labels exist as categories in the main AnnData (if categorical)
#    Assigning a value that is not a known category to a categorical column fails.
# -----------------------------------------------------------------------------
target_labels = list(SUBCLUSTER_TO_LABEL.keys())

if isinstance(adatas_merged.obs[LABEL_COL_MAIN].dtype, pd.CategoricalDtype):
    missing = [c for c in target_labels if c not in adatas_merged.obs[LABEL_COL_MAIN].cat.categories]
    if missing:
        adatas_merged.obs[LABEL_COL_MAIN] = adatas_merged.obs[LABEL_COL_MAIN].cat.add_categories(missing)

# -----------------------------------------------------------------------------
# 3) Initialise the subset tracking column with the pre-change labels
# -----------------------------------------------------------------------------
t_cell_subset.obs["t_cell_label_updated"] = original_labels.values

# -----------------------------------------------------------------------------
# 4) Build mask per target label and write back ONLY those cells
# -----------------------------------------------------------------------------
for new_label, subcluster_ids in SUBCLUSTER_TO_LABEL.items():
    mask = t_cell_subset.obs[SUBCLUSTER_COL].isin([str(x) for x in subcluster_ids])
    idx = t_cell_subset.obs_names[mask]
    # Update subset tracking column
    t_cell_subset.obs.loc[idx, "t_cell_label_updated"] = new_label
    # Update main object ONLY for those cells
    adatas_merged.obs.loc[idx, LABEL_COL_MAIN] = new_label

# -----------------------------------------------------------------------------
# 5) Summary of CHANGES only (snapshot from step 1 vs. labels now in the main object)
# -----------------------------------------------------------------------------
after_labels = adatas_merged.obs.loc[t_cell_subset.obs_names, LABEL_COL_MAIN].astype(str)
changes = pd.DataFrame({"original": original_labels, "updated": after_labels}, index=t_cell_subset.obs_names)
changes = changes.loc[changes["original"] != changes["updated"]]

print("Reassignment summary (changed cells only):")
if changes.empty:
    print("No cells were reassigned.")
else:
    print(changes["updated"].value_counts().to_string())

# -----------------------------------------------------------------------------
# Show the updated labels on the T-cell subset UMAP
# -----------------------------------------------------------------------------
t_cell_reassign_style = dict(
    basis="X_umap",
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(t_cell_subset, color="t_cell_label_updated", **t_cell_reassign_style)

# Resize and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_title("T Cell Reassignments (others unchanged)", fontsize=12, fontweight="bold", y=1.02)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
plt.show()


In [None]:
# Extract Immature B cells
# The subset is an independent copy, so the graph, clustering and UMAP computed
# below are stored on the copy only.
immature_b_mask = adatas_merged.obs['Consensus_annotation_detailed'] == 'Immature B'
immature_b_subset = adatas_merged[immature_b_mask].copy()

# Report the subset size and which clusters of the merged object it spans
print(f"Number of Immature B cells: {immature_b_subset.n_obs}")
print(f"Leiden clusters containing Immature B: {immature_b_subset.obs['leiden'].unique()}")

# Seed the global Python and NumPy RNGs before graph construction/clustering
random.seed(42)
np.random.seed(42)

# Perform subclustering on Immature B cells
# (kNN graph on "X_pcahm" with cosine metric, then Leiden at resolution 0.5)
sc.pp.neighbors(immature_b_subset, use_rep="X_pcahm", n_neighbors=15, metric='cosine', random_state=42)
sc.tl.leiden(immature_b_subset, resolution=0.5, random_state=42, key_added='immature_b_subclusters')

# Re-seed again right before the embedding step
random.seed(42)
np.random.seed(42)

# Create UMAP for the subset
sc.tl.umap(immature_b_subset, random_state=42, min_dist=0.3)

# UMAP of the Immature B subset, coloured by its Leiden subclusters
immature_b_umap_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(immature_b_subset, color='immature_b_subclusters', **immature_b_umap_style)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_title('Immature B Subclustering', fontsize=12, fontweight='bold', y=1.1)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

plt.show()

# For every subcluster (in order of first appearance), list the five most
# common source annotations among its cells.
print("\nOriginal annotations per subcluster:")
immature_b_subcluster_col = immature_b_subset.obs['immature_b_subclusters']
for cluster in immature_b_subcluster_col.unique():
    cluster_cells = immature_b_subset.obs.loc[immature_b_subcluster_col == cluster]
    print(f"\nSubcluster {cluster}:")
    top_annotations = cluster_cells['Original_annotation'].value_counts().head()
    print(top_annotations)

# Wilcoxon marker ranking per subcluster (on .X, not .raw), then plot the top 5 per group
sc.tl.rank_genes_groups(
    immature_b_subset,
    'immature_b_subclusters',
    method='wilcoxon',
    use_raw=False,
)
sc.pl.rank_genes_groups(immature_b_subset, n_genes=5, sharey=False, ncols=3, fontsize=12)

plt.show()

In [None]:
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# =============================================================================
# IMMATURE B reassignment helper
# - Only the specified Immature B subclusters are relabeled
# - Everything else in adatas_merged.obs[LABEL_COL_MAIN] stays unchanged
# =============================================================================

MAIN_ADATA = adatas_merged
SUBSET_ADATA = immature_b_subset  # <-- your Immature B subset AnnData

LABEL_COL_MAIN = "Consensus_annotation_detailed"
SUBCLUSTER_COL = "immature_b_subclusters"
UPDATED_COL_IN_SUBSET = "immature_b_label_updated"

# -------------------------------------------------------------------------
# USER-EDITABLE: map NEW LABEL -> list of Immature B subcluster IDs to change
# Example placeholder below; edit these IDs/labels as needed.
# -------------------------------------------------------------------------
SUBCLUSTER_TO_LABEL = {
    "Progenitor": ["3"],          # example
}

# -----------------------------------------------------------------------------
# 0) Ensure subcluster IDs are strings
# -----------------------------------------------------------------------------
SUBSET_ADATA.obs[SUBCLUSTER_COL] = SUBSET_ADATA.obs[SUBCLUSTER_COL].astype(str)

# -----------------------------------------------------------------------------
# 1) Snapshot ORIGINAL labels for accurate change summary
#    (must happen before step 4 writes into MAIN_ADATA)
# -----------------------------------------------------------------------------
original_labels = MAIN_ADATA.obs.loc[SUBSET_ADATA.obs_names, LABEL_COL_MAIN].astype(str).copy()

# -----------------------------------------------------------------------------
# 2) Ensure target labels exist as categories in MAIN (if categorical)
#    The dtype is checked with isinstance(..., pd.CategoricalDtype), which
#    replaces the deprecated pd.api.types.is_categorical_dtype helper.
# -----------------------------------------------------------------------------
target_labels = list(SUBCLUSTER_TO_LABEL.keys())
if isinstance(MAIN_ADATA.obs[LABEL_COL_MAIN].dtype, pd.CategoricalDtype):
    missing = [c for c in target_labels if c not in MAIN_ADATA.obs[LABEL_COL_MAIN].cat.categories]
    if missing:
        MAIN_ADATA.obs[LABEL_COL_MAIN] = MAIN_ADATA.obs[LABEL_COL_MAIN].cat.add_categories(missing)

# -----------------------------------------------------------------------------
# 3) Initialize subset tracking column to ORIGINAL labels (others unchanged)
# -----------------------------------------------------------------------------
SUBSET_ADATA.obs[UPDATED_COL_IN_SUBSET] = original_labels.values

# -----------------------------------------------------------------------------
# 4) Apply reassignments ONLY for specified subclusters
# -----------------------------------------------------------------------------
for new_label, subcluster_ids in SUBCLUSTER_TO_LABEL.items():
    subcluster_ids = [str(x) for x in subcluster_ids]
    mask = SUBSET_ADATA.obs[SUBCLUSTER_COL].isin(subcluster_ids)
    idx = SUBSET_ADATA.obs_names[mask]

    # Track in subset
    SUBSET_ADATA.obs.loc[idx, UPDATED_COL_IN_SUBSET] = new_label
    # Update MAIN only for those cells
    MAIN_ADATA.obs.loc[idx, LABEL_COL_MAIN] = new_label

# -----------------------------------------------------------------------------
# 5) Summary of CHANGES (original vs updated) among Immature B subset cells only
# -----------------------------------------------------------------------------
after_labels = MAIN_ADATA.obs.loc[SUBSET_ADATA.obs_names, LABEL_COL_MAIN].astype(str)
changes = pd.DataFrame({"original": original_labels, "updated": after_labels}, index=SUBSET_ADATA.obs_names)
changes = changes.loc[changes["original"] != changes["updated"]]

print("Immature B reassignment summary (changed cells only):")
if changes.empty:
    print("No cells were reassigned.")
else:
    print("\nCounts by updated label:")
    print(changes["updated"].value_counts().to_string())

    print("\nCounts by (original -> updated):")
    print(changes.groupby(["original", "updated"]).size().sort_values(ascending=False).to_string())

# -----------------------------------------------------------------------------
# Show the updated labels on the Immature B subset UMAP
# -----------------------------------------------------------------------------
immature_b_reassign_style = dict(
    basis="X_umap",
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(SUBSET_ADATA, color=UPDATED_COL_IN_SUBSET, **immature_b_reassign_style)

# Resize and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_title("Immature B Reassignments (others unchanged)", fontsize=12, fontweight="bold", y=1.02)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
plt.show()


In [None]:
# UMAP of the merged object, coloured by the draft detailed consensus labels
draft_detailed_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged, color='Consensus_annotation_detailed', **draft_detailed_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Draft consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/11_Merged_datasets_Consensus_annotation_detailed_draft_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Binary maturity label: every cell starts as 'Mature', then cells whose
# detailed label is 'Progenitor' are switched to 'Immature'.
categories = ['Mature', 'Immature']

adatas_merged.obs['Consensus_annotation_broad'] = pd.Categorical(
    ['Mature'] * adatas_merged.n_obs,
    categories=categories,
)

is_progenitor = adatas_merged.obs['Consensus_annotation_detailed'] == 'Progenitor'
adatas_merged.obs.loc[is_progenitor, 'Consensus_annotation_broad'] = 'Immature'

In [None]:
# UMAP of the merged object, coloured by the draft Mature/Immature label
draft_broad_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged, color='Consensus_annotation_broad', **draft_broad_style)

# Square 5x5 inch figure with labelled axes
ax = plt.gca()
ax.figure.set_size_inches(5, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Draft consensus broad annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/12_Merged_datasets__coloured_by_preliminary_consensus_annotation_broad_binary.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Push both consensus columns from the merged object back into each source
# dataset. Rows are looked up by cell name in adatas_merged and written
# positionally (.values), in the dataset's own row order.
for source_dataset in (Zhang_dataset, Hao_dataset, Triana_dataset, Luecken_dataset):
    for annotation_col in ('Consensus_annotation_detailed', 'Consensus_annotation_broad'):
        source_dataset.obs[annotation_col] = adatas_merged.obs.loc[
            source_dataset.obs_names, annotation_col
        ].values

### HSPC

In [None]:
# Collect each dataset's own cell-type labels into a single categorical, in the
# order Zhang -> Triana -> Luecken. The result carries no cell index, so any later
# assignment into an AnnData is purely positional and must use this same order.
original_labels = pd.Categorical(np.concatenate((Zhang_dataset.obs['Level 3 Multimodal'].values,
                                                 Triana_dataset.obs['CellTypes'].values, 
                                                 Luecken_dataset.obs['cell_type'].values)))

In [None]:
# Stack the three HSPC-containing datasets (Zhang, Triana, Luecken, in that
# order). The dict keys become the values of obs['dataset_name'], and the
# outer join keeps the union of features.
adatas_merged_HSPC = anndata.concat(
    {
        "Zhang": Zhang_dataset,
        "Triana": Triana_dataset,
        "Luecken": Luecken_dataset,
    },
    label="dataset_name",
    join="outer",
)

In [None]:
# Features shared by all three HSPC-containing datasets; `common` is used below
# to subset the columns of adatas_merged_HSPC.
# NOTE(review): no import of `SCUtils` is visible in this part of the notebook —
# confirm it is brought into scope earlier, and confirm what order
# Intersect_lists returns the names in.
common = SCUtils.Intersect_lists(Zhang_dataset.var_names, 
                                 Triana_dataset.var_names, 
                                 Luecken_dataset.var_names)

In [None]:
# Import only the plotting function instead of a wildcard, so no unrelated
# names are pulled into the notebook namespace.
from venny4py.venny4py import venny4py

# Feature sets of the three HSPC-containing datasets
sets = {'Zhang': set(list(Zhang_dataset.var_names)),
        'Triana': set(list(Triana_dataset.var_names)),
        'Luecken': set(list(Luecken_dataset.var_names))}

# Define custom colors for each dataset
colors = ['#1F77B4',  # Zhang - blue
          '#2EA02E',  # Triana - green
          '#D62828']  # Luecken - red

# Draws the Venn diagram; the code below expects it to leave "Venn_3.png" and
# "Intersections_3.txt" in figures_path.
venny4py(sets=sets, out=figures_path, ext='png', colors=colors)

# Display the plot
plt.show()

# Give the generated files their numbered figure names.
# os.replace overwrites an existing target on every platform, so re-running this
# cell does not fail where os.rename would refuse to overwrite.
for current_file_name, new_file_name in (
    (figures_path + "/Venn_3.png",
     figures_path + "/13_Shared_ADTs_across_hspcs_containing_datasets.png"),
    (figures_path + "/Intersections_3.txt",
     figures_path + "/13_Shared_ADTs_across_hspcs_containing_datasets_list.txt"),
):
    os.replace(current_file_name, new_file_name)

In [None]:
# Keep only the features listed in `common`; all cells are retained.
adatas_merged_HSPC = adatas_merged_HSPC[:, common]

In [None]:
# Attach the source labels collected in `original_labels`. That object is a plain
# categorical without a cell index, so this relies on the rows of adatas_merged_HSPC
# still being in the order the labels were concatenated in — TODO confirm if the
# concatenation order is ever changed.
adatas_merged_HSPC.obs['Original_annotation'] = original_labels

In [None]:
# Cells of the merged object that are flagged 'Immature' and do not come from
# the Hao dataset.
immature_obs_names = adatas_merged.obs_names[(adatas_merged.obs['Consensus_annotation_broad'] == 'Immature') & (adatas_merged.obs['dataset_name'] != 'Hao')]

# Boolean mask over adatas_merged_HSPC rows: True where the cell name is in the
# list above. np.isin replaces np.in1d, which is deprecated in NumPy 2.0.
obs_to_keep = np.isin(adatas_merged_HSPC.obs_names, immature_obs_names)

In [None]:
# Keep only the rows flagged True in the boolean mask `obs_to_keep`; all features are retained.
adatas_merged_HSPC = adatas_merged_HSPC[obs_to_keep,:]

In [None]:
# Work on a copy so that normalisation/scaling do not touch adatas_merged_HSPC itself;
# only the final corrected embedding is copied back at the end of this cell.
adata = adatas_merged_HSPC.copy()

# Normalize in-place on AnnData so you don't lose structure
# NOTE(review): axis=1 / flavor="seurat" are passed straight to the espressopro helper —
# check its documentation for the exact transformation applied.
ep.Normalise_protein_data(adata, inplace=True, axis=1, flavor="seurat")

ep.Scale_protein_data(adata, inplace=True)

# Seed the global Python and NumPy RNGs before PCA
random.seed(42)
np.random.seed(42)

# PCA (cells x PCs)
sc.tl.pca(adata, n_comps=30, svd_solver="arpack")

# Re-seed before Harmony
random.seed(42)
np.random.seed(42)

# Harmony expects PCs x cells
# (the PCA matrix is transposed here and the result is transposed back below;
# assumes this matches the installed harmonypy's layout — TODO confirm)
Z = np.asarray(adata.obsm["X_pca"].T, dtype=np.float64)

# Batch-correct the PCs using obs["Batch"] as the only covariate
ho = hm.run_harmony(Z, adata.obs, ["Batch"], max_iter_harmony=30, random_state=42)

# Corrected PCs back to (cells x PCs)
adata.obsm["X_pcahm"] = ho.Z_corr.T

# Store the corrected embedding on the original object; `adata` is a copy of it,
# so both have the same cells in the same order.
adatas_merged_HSPC.obsm['X_pcahm'] = adata.obsm['X_pcahm']

In [None]:
# Seed the global Python and NumPy RNGs before building the graph
random.seed(42)
np.random.seed(42)

# kNN graph (30 neighbours, cosine metric) on the "X_pcahm" embedding stored above
sc.pp.neighbors(adatas_merged_HSPC, use_rep="X_pcahm", n_neighbors=30, metric='cosine', random_state = 42)

# Round the stored graph weights and distances to 5 decimals
# (presumably to damp tiny floating-point differences between runs — TODO confirm intent)
adatas_merged_HSPC.obsp["connectivities"] = np.round(adatas_merged_HSPC.obsp["connectivities"], decimals=5)
adatas_merged_HSPC.obsp["distances"] = np.round(adatas_merged_HSPC.obsp["distances"], decimals=5)

In [None]:
# Seed the global Python and NumPy RNGs before the embedding step
random.seed(42)
np.random.seed(42)

# UMAP of the HSPC merge, computed from the neighbour graph built in the previous cell
sc.tl.umap(adatas_merged_HSPC, random_state = 42,  min_dist=0.2)

In [None]:
# UMAP of the HSPC merge, one colour per source dataset
hspc_dataset_style = dict(
    basis='X_umap',
    legend_loc='right margin',
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged_HSPC, color='dataset_name', **hspc_dataset_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(7, 8)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, slightly lifted above the axes
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.05)

# Rebuild the legend: its upper-centre point is pinned at axes coordinates
# (0.1, 1), single column, small font
legend = ax.legend(
    loc='upper center',
    bbox_to_anchor=(0.1, 1),
    prop={'size': 4.8},
    ncol=1,
)

# Shrink the marker drawn next to each legend entry
for handle in legend.legend_handles:
    handle._sizes = [10]

# Reserve extra space at the bottom of the figure
plt.subplots_adjust(bottom=0.3)

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/14_Merged_datasets_hspcs__coloured_by_datasets.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# One UMAP panel per source dataset on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

# Get unique datasets
datasets = adatas_merged_HSPC.obs['dataset_name'].unique()

# Define colors for each dataset
dataset_colors = {
    'Zhang': '#1F77B4',   # blue
    'Triana': '#2EA02E',  # green  
    'Luecken': '#D62828'  # red
}

for i, dataset in enumerate(datasets):
    # Filter data for current dataset
    dataset_mask = adatas_merged_HSPC.obs['dataset_name'] == dataset
    dataset_data = adatas_merged_HSPC[dataset_mask]
    
    # Plot UMAP for current dataset with specific color
    sc.pl.embedding(dataset_data, 
                    color='dataset_name', 
                    basis='X_umap', 
                    legend_loc='none',
                    add_outline=False,
                    frameon=False,
                    show=False,
                    ax=axes[i],
                    palette=[dataset_colors[dataset]])
    
    # Set axis labels and title for each subplot
    axes[i].set_xlabel('UMAP 1', fontsize=12)
    axes[i].set_ylabel('UMAP 2', fontsize=12)
    axes[i].set_title(f'{dataset} dataset', fontsize=12, fontweight='bold')

# Remove every grid cell that did not receive a dataset. The previous check only
# dropped the fourth panel, leaving blank axes whenever fewer than three datasets
# were present.
for unused_ax in axes[len(datasets):]:
    fig.delaxes(unused_ax)

# Adjust layout
plt.tight_layout()

# Save the figure at 300 dpi
plt.savefig(figures_path + "/15_Merged_datasets_hspcs__split_by_datasets.png", 
            dpi=300, bbox_inches='tight')

# Show the figure
plt.show()

In [None]:
# Seed the global Python and NumPy RNGs before clustering
random.seed(42)
np.random.seed(42)

# Leiden clustering of the HSPC merge at resolution 3 (10 iterations); no key_added
# is given, and later cells read the result from obs['leiden'].
sc.tl.leiden(adatas_merged_HSPC, resolution=3, random_state = 42, n_iterations=10)

In [None]:
# UMAP of the HSPC merge, coloured by Leiden cluster
hspc_leiden_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged_HSPC, color='leiden', **hspc_leiden_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Cluster annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/16_Merged_datasets_hspcs_leiden_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# UMAP of the HSPC merge, coloured by one of the per-dataset label columns.
# Swap `color` between 'Level 3 Multimodal', 'CellTypes' and 'cell_type' to
# inspect each dataset's own annotation. This figure is displayed only, not saved.
hspc_source_label_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged_HSPC, color='cell_type', **hspc_source_label_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Cluster annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

plt.show()

In [None]:
# Seed the global Python and NumPy RNGs before label harmonisation
random.seed(42)
np.random.seed(42)

# Align the per-dataset labels in obs['Original_annotation'] across the datasets named in
# obs['dataset_name'], using the "X_pcahm" embedding with a cosine metric.
# The returned object is used below for the tree plot and the reannotation table.
alignment = cellhint.harmonize(adatas_merged_HSPC, 'dataset_name', 'Original_annotation', 
                               use_rep='X_pcahm', metric='cosine')

In [None]:
# Draw the CellHint harmonisation tree for `alignment` and save it as figure 17 in figures_path.
cellhint.treeplot(alignment, save=figures_path + "/17_Merged_datasets_hspcs__CellHint_Preannotated_Classes.png")

In [None]:
# Copy CellHint's per-cell 'reannotation' and 'group' columns (rows selected by cell name)
# into obs as 'low_hierarchy' and 'high_hierarchy' respectively.
adatas_merged_HSPC.obs[['low_hierarchy', 'high_hierarchy']] = alignment.reannotation.loc[adatas_merged_HSPC.obs_names, ['reannotation', 'group']]

In [None]:
# Store both CellHint hierarchy columns as categoricals.
for hierarchy_col in ('low_hierarchy', 'high_hierarchy'):
    adatas_merged_HSPC.obs[hierarchy_col] = pd.Categorical(adatas_merged_HSPC.obs[hierarchy_col])

In [None]:
# Leiden cluster to inspect; edit and re-run to look at another one.
cluster = '28'

hspc_by_leiden = adatas_merged_HSPC.obs.groupby('leiden')

# Five most frequent values of each label column inside the chosen cluster
for label_col, heading in (
    ('high_hierarchy', "Top 5 high_hierarchy for cluster"),
    ('Original_annotation', "Top 5 Original_annotation for cluster"),
):
    summary = hspc_by_leiden[label_col].value_counts()
    print(heading, cluster)
    print(summary.loc[cluster].nlargest(5))
    print()

# First high_hierarchy entry listed for this cluster, then the alignment rows
# belonging to that group
cluster_hierarchy_counts = hspc_by_leiden['high_hierarchy'].value_counts().loc[cluster]
top_hierarchy = cluster_hierarchy_counts.index[0]
matching_groups = alignment.relation[alignment.groups == top_hierarchy]
print("Related alignment groups:")
print(matching_groups)

In [None]:
# Display the per-cell Leiden cluster assignments (inspection only; nothing is modified).
adatas_merged_HSPC.obs['leiden']

In [None]:
import numpy as np
import pandas as pd

# 0) Make sure leiden is string so it matches the string keys of the mapping
adatas_merged_HSPC.obs["leiden"] = adatas_merged_HSPC.obs["leiden"].astype(str)

# 1) Mapping: Leiden cluster ID -> consensus detailed label (covers "0" to "28")
cluster_to_label = {
    "0":  "CD14 Mono",
    "1":  "LMPP",
    "2":  "Erythroblast",
    "3":  "MPP",
    "4":  "Erythroblast",
    "5":  "MEP",
    "6":  "GMP",
    "7":  "CD14 Mono",
    "8":  "HSC",
    "9":  "MEP",
    "10": "MEP",
    "11": "MEP",
    "12": "Erythroblast",
    "13": "MEP",
    "14": "Pro-B",
    "15": "cDC2",
    "16": "GMP",
    "17": "Pre-Pro-B",
    "18": "LMPP",
    "19": "MkP",
    "20": "CD4 T Naive",
    "21": "MEP",
    "22": "MEP",
    "23": "GMP",
    "24": "Erythroblast",
    "25": "CD16 Mono",
    "26": "Erythroblast",
    "27": "HSC",
    "28": "MEP",
}

# 2) Assign in one shot. The whole column is overwritten, so no separate reset is
#    needed. It is stored as a categorical explicitly, because a later cell reads
#    `.cat.categories` from it and should not depend on another step having
#    converted the column first. Clusters absent from the mapping become NaN.
adatas_merged_HSPC.obs["Consensus_annotation_detailed"] = (
    adatas_merged_HSPC.obs["leiden"].map(cluster_to_label).astype("category")
)

# 3) Sanity checks (report only; nothing is raised)
all_clusters = sorted(adatas_merged_HSPC.obs["leiden"].unique(), key=lambda x: int(x))
missing = [c for c in all_clusters if c not in cluster_to_label]
unassigned_n = adatas_merged_HSPC.obs["Consensus_annotation_detailed"].isna().sum()

print(f"Unique clusters in data: {len(all_clusters)}")
print(f"Clusters missing from mapping: {missing}")
print(f"Unassigned cells after mapping: {unassigned_n}")


In [None]:
# Drop any cached palette for this column so scanpy builds a fresh one
# (pop with a default is a no-op when the key is absent)
adatas_merged_HSPC.uns.pop('Consensus_annotation_detailed_colors', None)

# UMAP of the HSPC merge, coloured by the draft detailed consensus labels
hspc_draft_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged_HSPC, color='Consensus_annotation_detailed', **hspc_draft_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Draft consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/18_Merged_datasets_hspcs__coloured_by_preliminary_consensus_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

## Joint annotation

In [None]:
# Category lists of the detailed annotation in the full merge and in the HSPC merge
adatas_categories = list(adatas_merged.obs['Consensus_annotation_detailed'].cat.categories)
adatas_hspc_categories = list(adatas_merged_HSPC.obs['Consensus_annotation_detailed'].cat.categories)

# Union of both lists, first occurrence wins (dict keys keep insertion order)
merged_categories = list(dict.fromkeys(adatas_categories + adatas_hspc_categories))

# Re-declare the column on the full merge with the combined category set, so
# HSPC labels can be written into it afterwards
adatas_merged.obs['Consensus_annotation_detailed'] = pd.Categorical(
    adatas_merged.obs['Consensus_annotation_detailed'],
    categories=merged_categories,
)

In [None]:
# Overwrite the detailed label of every cell present in the HSPC merge with its HSPC-level label.
# NOTE(review): .astype(str) would turn a missing label into the literal string 'nan' —
# confirm every cluster was mapped before running this.
adatas_merged.obs.loc[adatas_merged_HSPC.obs.index, 'Consensus_annotation_detailed'] = adatas_merged_HSPC.obs['Consensus_annotation_detailed'].astype(str)

In [None]:
# UMAP of the merged object, coloured by the final detailed consensus labels
final_detailed_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged, color='Consensus_annotation_detailed', **final_detailed_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/19_Merged_datasets_Consensus_annotation_detailed_final_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Binary maturity label: every cell starts as 'Mature', then cells carrying one
# of the progenitor-type detailed labels are switched to 'Immature'.
categories = ['Mature', 'Immature']

immature_detailed_labels = [
    'HSC', 'MPP', 'MEP', 'GMP', 'Pro-B',
    'Pre-Pro-B', 'EoBaMaP', 'LMPP', 'MkP', 'Progenitor',
]

adatas_merged.obs['Consensus_annotation_broad'] = pd.Categorical(
    ['Mature'] * adatas_merged.n_obs,
    categories=categories,
)

is_immature = adatas_merged.obs['Consensus_annotation_detailed'].isin(immature_detailed_labels)
adatas_merged.obs.loc[is_immature, 'Consensus_annotation_broad'] = 'Immature'

In [None]:
# UMAP of the merged object, coloured by the Mature/Immature label
final_broad_style = dict(
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(adatas_merged, color='Consensus_annotation_broad', **final_broad_style)

# Resize the figure and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle
plt.suptitle('Consensus broad annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    figures_path + "/20_Merged_datasets__coloured_by_intermediate_consensus_annotation_binary.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Derive 'Consensus_annotation_simplified' from 'Consensus_annotation_detailed'
# on the merged object. Uses `pd` from the notebook's import cell.

# Fixed category order for the simplified labels
categories = [
    'HSPC','Monocyte','NK','CD4 T','CD8 T','B',
    'Erythroid','Doublet','Stroma','Other T','cDC','pDC','Mesenchymal'
]

# -------------------------------
# EDIT THIS ONLY
# simplified label -> detailed labels that collapse into it
# -------------------------------
DETAILED_TO_SIMPLIFIED = {
    # HSPC
    'HSPC': [
        'HSC', 'MPP', 'MEP', 'GMP', 'Pro-B', 'MkP', 'LMPP',
        'Progenitor', 'EoBaMaP', 'Pre-Pro-B'
    ],
    # Monocyte
    'Monocyte': ['CD14 Mono','CD16 Mono','Macrophage'],
    # NK
    'NK': ['NK CD56 dim','NK CD56 bright'],
    # CD4 T
    'CD4 T': ['CD4 T Naive','CD4 T Memory','Treg','CD4 CTL'],
    # CD8 T
    'CD8 T': ['CD8 T Naive','CD8 T Memory','MAIT'],
    # B
    'B': ['B Naive','B Memory','Plasma','Immature B','Pre-B'],
    # Erythroid
    'Erythroid': ['Erythroblast'],
    # Singletons
    'Doublet': ['Doublet'],
    'Stroma': ['Stroma'],
    'Other T': ['Gamma delta T', 'DnT'],
    'cDC': ['cDC1','cDC2'],
    'Mesenchymal': ['Mesenchymal'],
    'pDC': ['pDC'],
}

# A simplified label missing from `categories` would silently turn into NaN
# when the categorical dtype is applied, so fail loudly instead.
missing_categories = sorted(set(DETAILED_TO_SIMPLIFIED) - set(categories))
if missing_categories:
    raise ValueError(f"Simplified labels missing from `categories`: {missing_categories}")

# -------------------------------
# Build reverse lookup: detailed -> simplified
# -------------------------------
reverse_map = {}
for simp, detailed_list in DETAILED_TO_SIMPLIFIED.items():
    for d in detailed_list:
        # A detailed label listed under two groups would otherwise be
        # silently assigned to whichever group comes last.
        if d in reverse_map:
            raise ValueError(
                f"Detailed label {d!r} is listed under both "
                f"{reverse_map[d]!r} and {simp!r}"
            )
        reverse_map[d] = simp

# -------------------------------
# Apply the mapping and enforce the category order in one step.
# Detailed labels without an entry in `reverse_map` become NaN.
# -------------------------------
adatas_merged.obs['Consensus_annotation_simplified'] = (
    adatas_merged.obs['Consensus_annotation_detailed']
    .astype('object')
    .map(reverse_map)
    .astype(pd.CategoricalDtype(categories=categories))
)

# -------------------------------
# Sanity check: detailed labels that did not receive a simplified label
# -------------------------------
unmapped = adatas_merged.obs.loc[
    adatas_merged.obs['Consensus_annotation_simplified'].isna(),
    'Consensus_annotation_detailed',
].value_counts()
# value_counts() on a categorical column also lists categories with zero
# cells; drop them so only genuinely unmapped labels are reported.
unmapped = unmapped[unmapped > 0]

print("Unmapped detailed labels:")
print(unmapped)


In [None]:
# UMAP of the merged reference, one colour per simplified consensus label
simplified_umap_kwargs = dict(
    basis='X_umap',
    color='Consensus_annotation_simplified',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,  # keep the figure open so it can be styled and saved below
)
sc.pl.embedding(adatas_merged, **simplified_umap_kwargs)

# Resize the figure scanpy drew on and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the axes, with a smaller grey subtitle beneath it
ax.set_title('Merged datasets', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus simplified annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi, then display it
plt.savefig(
    f"{figures_path}/21_Merged_datasets__coloured_by_intermediate_consensus_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Write the merged AnnData object to the References folder as an .h5ad file
merged_reference_path = data_path + "/References/Merged_references.h5ad"
adatas_merged.write_h5ad(merged_reference_path)

## Re-labelling

In [None]:
# Copy the three consensus annotation levels from the merged object onto the
# Zhang cells, looking each cell up by its observation name.
for consensus_col in ('Consensus_annotation_broad',
                      'Consensus_annotation_simplified',
                      'Consensus_annotation_detailed'):
    Zhang_dataset.obs[consensus_col] = adatas_merged.obs.loc[Zhang_dataset.obs_names, consensus_col].values

In [None]:
# Copy the three consensus annotation levels from the merged object onto the
# Hao cells, looking each cell up by its observation name.
for consensus_col in ('Consensus_annotation_broad',
                      'Consensus_annotation_simplified',
                      'Consensus_annotation_detailed'):
    Hao_dataset.obs[consensus_col] = adatas_merged.obs.loc[Hao_dataset.obs_names, consensus_col].values

In [None]:
# Copy the three consensus annotation levels from the merged object onto the
# Triana cells, looking each cell up by its observation name.
for consensus_col in ('Consensus_annotation_broad',
                      'Consensus_annotation_simplified',
                      'Consensus_annotation_detailed'):
    Triana_dataset.obs[consensus_col] = adatas_merged.obs.loc[Triana_dataset.obs_names, consensus_col].values

In [None]:
# Copy the three consensus annotation levels from the merged object onto the
# Luecken cells, looking each cell up by its observation name.
for consensus_col in ('Consensus_annotation_broad',
                      'Consensus_annotation_simplified',
                      'Consensus_annotation_detailed'):
    Luecken_dataset.obs[consensus_col] = adatas_merged.obs.loc[Luecken_dataset.obs_names, consensus_col].values

### Zhang dataset

In [None]:
# Drop categories that no Zhang cell carries from the simplified and detailed columns
for categorical_col in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed'):
    Zhang_dataset.obs[categorical_col] = (
        Zhang_dataset.obs[categorical_col].cat.remove_unused_categories()
    )

In [None]:
# Number of Zhang cells per detailed consensus label
counts = (
    Zhang_dataset.obs['Consensus_annotation_detailed']
    .value_counts()
)

In [None]:
# Show the per-label cell counts computed in the previous cell
print(counts)

In [None]:
# Keep only cells whose detailed label is carried by at least 10 cells
# (`counts` comes from the value_counts() cell above).
filtered_categories = counts[counts >= 10].index

# Select with a boolean mask rather than round-tripping through index labels,
# and take an explicit copy: the subset would otherwise be an AnnData *view*
# that later cells mutate (.obs / .uns), forcing an implicit copy behind a
# warning.
keep_cells = Zhang_dataset.obs['Consensus_annotation_detailed'].isin(filtered_categories).to_numpy()
Zhang_dataset = Zhang_dataset[keep_cells, :].copy()

In [None]:
# Plot the Zhang UMAP coloured by the dataset's 'Level 3 Multimodal' labels
sc.pl.embedding(Zhang_dataset, 
                color='Level 3 Multimodal', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=2,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes
ax.set_title('Zhang X. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Subtitle names the column that is actually plotted; it previously read
# 'Consensus detailed annotation - smoothed', copied from the consensus plot.
plt.suptitle('Level 3 Multimodal annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Drop any stored palette for the detailed labels so scanpy builds a fresh one
detailed_palette_key = 'Consensus_annotation_detailed_colors'
if detailed_palette_key in Zhang_dataset.uns:
    del Zhang_dataset.uns[detailed_palette_key]

# UMAP of the Zhang dataset, one colour per detailed consensus label
detailed_umap_kwargs = dict(
    basis='X_umap',
    color='Consensus_annotation_detailed',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,  # keep the figure open so it can be styled below
)
sc.pl.embedding(Zhang_dataset, **detailed_umap_kwargs)

# Resize the figure scanpy drew on and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the axes, with a smaller grey subtitle beneath it
ax.set_title('Zhang X. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Display the figure (this cell does not save it)
plt.show()

In [None]:
# Column holding the dataset's own labels, and the three consensus columns
# that the rules below write to.
LEVEL_COL = "Level 3 Multimodal"
OUT_BROAD = "Consensus_annotation_broad"
OUT_SIMPL = "Consensus_annotation_simplified"
OUT_DETAIL = "Consensus_annotation_detailed"

# -----------------------------------------------------------------------------
# 1) Mapping rules: Level 3 Multimodal -> (broad, simplified, detailed)
#    Each entry is ([Level 3 labels], broad, simplified, detailed).
# -----------------------------------------------------------------------------
RULES = [
    # MEP (incl. MEP-Eryth) -> MEP
    (['MEP-1', 'MEP-2', 'MEP-Eryth-1', 'MEP-Eryth-2'],
     'Immature', 'HSPC', 'MEP'),

    # Intermediate / classical monocytes -> CD14 Mono
    (['Intermediate Mono-1','Intermediate Mono-2','Intermediate Mono-3','Classical-Mono'],
     'Mature', 'Monocyte', 'CD14 Mono'),

    # Non-classical monocytes -> CD16 Mono.
    # Both spellings are listed so this table matches the later copy of the
    # rules in this notebook; a spelling absent from the data is simply unused.
    (['Non-classical-Mono-1','Non-classical-Mono-2', 'Non-Classical Mono-1','Non-Classical Mono-2'],
     'Mature', 'Monocyte', 'CD16 Mono'),

    # BMCP -> EoBaMaP
    (['BMCP-1','BMCP-2'],
     'Immature', 'HSPC', 'EoBaMaP'),

    # pre-DC-1/2 and cDC1 -> cDC1
    (['pre-DC-2','pre-DC-1','cDC1'],
     'Mature', 'cDC', 'cDC1'),

    # pre-DC-3, cDC2 and ASDC -> cDC2
    (['pre-DC-3','cDC2-1','cDC2-2','ASDC'],
     'Mature', 'cDC', 'cDC2'),

    # pDC
    (['pDC'],
     'Mature', 'pDC', 'pDC'),

    # ERP-1..8 -> ErP (broad 'Immature', simplified 'Erythroid')
    (['ERP-1','ERP-2','ERP-3','ERP-4','ERP-5','ERP-6','ERP-7','ERP-8'],
     'Immature', 'Erythroid', 'ErP'),

    # Erythroblasts
    (['Erythroblast-1','Erythroblast-2','Erythroblast-3'],
     'Mature', 'Erythroid', 'Erythroblast'),

    # LMPP-1 (incl. cycling) -> MPP
    (['LMPP-1-cycling','LMPP-1'],
     'Immature', 'HSPC', 'MPP'),

    # CLP -> Pre-Pro-B
    (['CLP'],
     'Immature', 'HSPC', 'Pre-Pro-B'),

    # CD8 naive
    (['T CD8 Naive'],
     'Mature', 'CD8 T', 'CD8 T Naive'),

    # Myeloid intermediates, Mono-1/2 and cMOP -> Myeloid progenitor
    # (note: filed under broad 'Mature')
    (['Myeloid intermediate 1','Myeloid intermediate 2','Myeloid intermediate 3','Mono-1','Mono-2','cMOP'],
     'Mature', 'Myeloid', 'Myeloid progenitor'),

    # preNeu / immNeu -> GMP
    (['preNeu','immNeu-1','immNeu-2'],
     'Immature', 'HSPC', 'GMP'),

    # MPP-MEP -> MPP
    (['MPP-MEP'],
     'Immature', 'HSPC', 'MPP'),

    # HSC-1/2 and MPP-1/2 -> HSC
    (['HSC-1','HSC-2','MPP-1','MPP-2'],
     'Immature', 'HSPC', 'HSC'),

    # MultiLin / MultiLin-GMP, LMPP-2 and MDP -> LMPP
    (['MultiLin-GMP-1','MultiLin-GMP-2','MultiLin-GMP-3','Multilin-1','Multilin-2','Multilin-3',
      'LMPP-2','MDP-1','MDP-2'],
     'Immature', 'HSPC', 'LMPP'),

    # Pro-B (broad 'Immature', simplified 'B')
    (['Pro-B-Early-cycling','Pro-B-Early','Pro-B-cycling-1','Pro-B-cycling-2','Pro-B-2','Pro-B-3','Pro-B-1'],
     'Immature', 'B', 'Pro-B'),

    # Transitional-B-2 -> Pre-B
    (['Transitional-B-2'],
     'Immature', 'B', 'Pre-B'),

    # Transitional-B-1 / pre-B -> Immature B
    (['Transitional-B-1','pre-B'],
     'Mature', 'B', 'Immature B'),
]

# -----------------------------------------------------------------------------
# 2) Build lookup dicts automatically from the rules
#    (one dict per output level, keyed by Level 3 label)
# -----------------------------------------------------------------------------
broad_map, simpl_map, detail_map = {}, {}, {}
for keys, broad, simpl, detail in RULES:
    for k in keys:
        broad_map[k] = broad
        simpl_map[k] = simpl
        detail_map[k] = detail

# -----------------------------------------------------------------------------
# 3) Ensure output columns exist and are OBJECT dtype (avoid categorical setitem)
#    Writing a value that is not yet a category into a categorical column
#    raises, so the columns are relaxed to object before assignment.
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    if col not in Zhang_dataset.obs.columns:
        Zhang_dataset.obs[col] = pd.NA
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(Zhang_dataset.obs[col].dtype, pd.CategoricalDtype):
        Zhang_dataset.obs[col] = Zhang_dataset.obs[col].astype("object")

# -----------------------------------------------------------------------------
# 4) Apply mapping (index-aligned, write only matched rows)
#    Rows whose Level 3 label has no rule keep their current consensus labels.
# -----------------------------------------------------------------------------
new_broad  = Zhang_dataset.obs[LEVEL_COL].map(broad_map)
new_simpl  = Zhang_dataset.obs[LEVEL_COL].map(simpl_map)
new_detail = Zhang_dataset.obs[LEVEL_COL].map(detail_map)

mask = Zhang_dataset.obs[LEVEL_COL].isin(list(broad_map))

Zhang_dataset.obs.loc[mask, OUT_BROAD]  = new_broad.loc[mask]
Zhang_dataset.obs.loc[mask, OUT_SIMPL]  = new_simpl.loc[mask]
Zhang_dataset.obs.loc[mask, OUT_DETAIL] = new_detail.loc[mask]

# -----------------------------------------------------------------------------
# 5) Convert to categoricals AFTER assignment (categories inferred from values)
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    Zhang_dataset.obs[col] = pd.Categorical(Zhang_dataset.obs[col])

# -----------------------------------------------------------------------------
# 6) Sanity checks
# -----------------------------------------------------------------------------
unmapped = Zhang_dataset.obs.loc[~mask, LEVEL_COL].value_counts()
# With a categorical Level 3 column, value_counts() also lists categories that
# have zero unmatched cells; drop them so mapped labels are not reported.
unmapped = unmapped[unmapped > 0].head(30)
print(f"Matched rows: {int(mask.sum())} / {Zhang_dataset.n_obs}")
print("Top unmatched Level 3 Multimodal labels (first 30):")
print(unmapped.to_string())


In [None]:
# Drop categories that no Zhang cell carries from the simplified and detailed columns
for categorical_col in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed'):
    Zhang_dataset.obs[categorical_col] = (
        Zhang_dataset.obs[categorical_col].cat.remove_unused_categories()
    )

Smoothing labels

In [None]:
# Number of Zhang cells per detailed consensus label, printed for inspection
counts = (
    Zhang_dataset.obs['Consensus_annotation_detailed']
    .value_counts()
)
print(counts)

In [None]:
# Keep only cells whose detailed label is carried by at least 10 cells
# (`counts` comes from the value_counts() cell above).
filtered_categories = counts[counts >= 10].index

# Select with a boolean mask rather than round-tripping through index labels,
# and take an explicit copy: the subset would otherwise be an AnnData *view*
# that the following cells mutate (.obs), forcing an implicit copy behind a
# warning.
keep_cells = Zhang_dataset.obs['Consensus_annotation_detailed'].isin(filtered_categories).to_numpy()
Zhang_dataset = Zhang_dataset[keep_cells, :].copy()

In [None]:
# silhouette_score stays imported so the name remains available to other cells
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Silhouette-based refinement of the detailed consensus labels.
# Cells whose silhouette score (on the UMAP embedding) is very poor are
# relabelled to the most common label among their nearest neighbours, provided
# that label is different and sufficiently dominant in the neighbourhood.

# Tunable parameters
POOR_SILHOUETTE_THRESHOLD = -0.1   # cells below this score are candidates
N_NEIGHBORS = 30                   # neighbours queried (the first hit is dropped as the cell itself)
MIN_NEIGHBOR_FRACTION = 0.4        # minimum share of neighbours that must agree

# Calculate silhouette scores for current annotations
print("Calculating silhouette scores for Zhang dataset...")

# Use the UMAP representation for silhouette analysis
X_embed = Zhang_dataset.obsm['X_umap']
labels = Zhang_dataset.obs['Consensus_annotation_detailed'].astype('category').cat.codes

# Per-cell scores; the average is their mean, which is what silhouette_score
# returns, so the expensive pairwise pass runs once instead of twice.
sample_silhouette_values = silhouette_samples(X_embed, labels)
silhouette_avg = sample_silhouette_values.mean()

print(f"Average silhouette score: {silhouette_avg:.3f}")

# Add silhouette scores to the dataset
Zhang_dataset.obs['silhouette_score'] = sample_silhouette_values

# Identify cells with negative silhouette scores
negative_silhouette_mask = sample_silhouette_values < 0
print(f"Number of cells with negative silhouette scores: {negative_silhouette_mask.sum()}")
print(f"Percentage of cells with negative silhouette scores: {negative_silhouette_mask.sum()/len(sample_silhouette_values)*100:.2f}%")

# Show distribution of silhouette scores by cell type
# (observed=True: only labels that actually occur, no empty category rows)
silhouette_by_type = (
    Zhang_dataset.obs
    .groupby('Consensus_annotation_detailed', observed=True)['silhouette_score']
    .agg(['mean', 'std', 'min', 'max', 'count'])
)
print("\nSilhouette scores by cell type:")
print(silhouette_by_type.sort_values('mean'))

# Initialize refined annotations (start with the current detailed annotations)
Zhang_dataset.obs['Consensus_annotation_detailed_refined'] = Zhang_dataset.obs['Consensus_annotation_detailed'].copy()

# Perform silhouette-based reassignment
print("\n=== PERFORMING SILHOUETTE-BASED REASSIGNMENT ===")

# Identify cells with very poor silhouette scores
very_poor_silhouette = Zhang_dataset.obs['silhouette_score'] < POOR_SILHOUETTE_THRESHOLD

if very_poor_silhouette.sum() > 0:
    print(f"Found {very_poor_silhouette.sum()} cells with very poor silhouette scores (< {POOR_SILHOUETTE_THRESHOLD})")

    # Fit nearest neighbors (never ask for more neighbours than there are cells)
    nn = NearestNeighbors(n_neighbors=min(N_NEIGHBORS, X_embed.shape[0]), metric='euclidean')
    nn.fit(X_embed)

    # Positional indices of poorly assigned cells
    poor_indices = np.where(very_poor_silhouette)[0]

    # Query all poor cells in a single batched call, then drop the first
    # neighbour of each row, which is taken to be the query cell itself.
    neighbor_indices = nn.kneighbors(X_embed[poor_indices], return_distance=False)[:, 1:]

    detailed_labels = Zhang_dataset.obs['Consensus_annotation_detailed']
    refined_labels = Zhang_dataset.obs['Consensus_annotation_detailed_refined'].copy()

    reassignments_made = 0

    for idx, cell_neighbors in zip(poor_indices, neighbor_indices):
        # Annotations of this cell's neighbours (always the ORIGINAL labels,
        # so earlier reassignments do not influence later ones)
        neighbor_annotations = detailed_labels.iloc[cell_neighbors]

        # Most common annotation among neighbours (first mode on ties)
        most_common = neighbor_annotations.mode()
        if len(most_common) == 0:
            continue

        new_annotation = most_common.iloc[0]

        # Only reassign if the most common neighbour annotation is different
        if new_annotation == detailed_labels.iloc[idx]:
            continue

        # ...and if a large enough share of the neighbours carry it
        fraction = (neighbor_annotations == new_annotation).sum() / len(neighbor_annotations)
        if fraction >= MIN_NEIGHBOR_FRACTION:
            refined_labels.iloc[idx] = new_annotation
            reassignments_made += 1

    # Write all reassignments back in one assignment
    Zhang_dataset.obs['Consensus_annotation_detailed_refined'] = refined_labels

    print(f"Reassigned {reassignments_made} cells based on neighborhood consensus")

    # Recalculate silhouette scores after reassignment
    new_labels = Zhang_dataset.obs['Consensus_annotation_detailed_refined'].astype('category').cat.codes
    new_silhouette_scores = silhouette_samples(X_embed, new_labels)
    silhouette_avg_corrected = new_silhouette_scores.mean()

    # Store corrected scores
    Zhang_dataset.obs['silhouette_score_corrected'] = new_silhouette_scores

    print(f"\n=== REASSIGNMENT RESULTS ===")
    print(f"Original average silhouette: {silhouette_avg:.3f}")
    print(f"Refined average silhouette: {silhouette_avg_corrected:.3f}")
    print(f"Improvement: {silhouette_avg_corrected - silhouette_avg:.3f}")

    print(f"Original negative silhouette cells: {negative_silhouette_mask.sum()}")
    print(f"Refined negative silhouette cells: {(new_silhouette_scores < 0).sum()}")

    # Show what changes were made
    if reassignments_made > 0:
        changes_mask = (Zhang_dataset.obs['Consensus_annotation_detailed'] !=
                        Zhang_dataset.obs['Consensus_annotation_detailed_refined'])
        changes = Zhang_dataset.obs[changes_mask]

        print(f"\n=== SPECIFIC REASSIGNMENTS ===")
        # observed=True: without it, grouping by two categorical columns
        # yields every label pair, including pairs with 0 cells.
        change_summary = changes.groupby([
            'Consensus_annotation_detailed',
            'Consensus_annotation_detailed_refined'
        ], observed=True).size().reset_index(name='count')

        for _, row in change_summary.iterrows():
            print(f"{row['Consensus_annotation_detailed']} -> {row['Consensus_annotation_detailed_refined']}: {row['count']} cells")

else:
    print("No cells with very poor silhouette scores found.")
    # Create corrected scores column that's identical to original
    Zhang_dataset.obs['silhouette_score_corrected'] = Zhang_dataset.obs['silhouette_score'].copy()
    silhouette_avg_corrected = silhouette_avg

# Create a reassignment status column for visualization
reassignment_mask = (Zhang_dataset.obs['Consensus_annotation_detailed'] !=
                     Zhang_dataset.obs['Consensus_annotation_detailed_refined'])
Zhang_dataset.obs['reassignment_status'] = 'Unchanged'
Zhang_dataset.obs.loc[reassignment_mask, 'reassignment_status'] = 'Reassigned'

# Final summary
print(f"\n=== FINAL SUMMARY ===")
print(f"Total cells: {len(Zhang_dataset)}")
print(f"Cells reassigned: {reassignment_mask.sum()}")
print(f"Final cell type distribution:")
final_counts = Zhang_dataset.obs['Consensus_annotation_detailed_refined'].value_counts()
print(final_counts)

# 2x2 overview of the refinement, every panel drawn on the same UMAP
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Arguments shared by all four panels
shared_umap_kwargs = dict(basis='X_umap', add_outline=False, frameon=False, show=False)
# Label-on-data legend styling used by the two annotation panels
on_data_legend_kwargs = dict(legend_loc='on data', legend_fontsize=5, legend_fontoutline=2)

# (axes, title, panel-specific scanpy arguments), in drawing order
refinement_panels = [
    # Panel 1: annotations before refinement
    (axes[0,0], 'Original Smoothed Annotations',
     dict(color='Consensus_annotation_detailed', **on_data_legend_kwargs)),
    # Panel 2: annotations after refinement
    (axes[0,1], 'Silhouette-Refined Annotations',
     dict(color='Consensus_annotation_detailed_refined', **on_data_legend_kwargs)),
    # Panel 3: which cells changed label
    (axes[1,0], 'Reassignment Status',
     dict(color='reassignment_status',
          palette={'Unchanged': 'lightgray', 'Reassigned': 'red'},
          legend_loc='right margin')),
    # Panel 4: silhouette score of every cell under the refined labels
    (axes[1,1], 'Silhouette Scores (Refined)',
     dict(color='silhouette_score_corrected', color_map='RdBu_r')),
]

for panel_ax, panel_title, panel_kwargs in refinement_panels:
    sc.pl.embedding(Zhang_dataset, ax=panel_ax, **shared_umap_kwargs, **panel_kwargs)
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')

# Tighten the layout, save at 300 dpi, then display
plt.tight_layout()
plt.savefig(f"{figures_path}/22_Zhang_dataset_silhouette_refinement_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# Side-by-side histograms of per-cell silhouette scores, before and after refinement
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

# (axes, scores, bar colour, title) for the original and the refined labels
silhouette_histograms = (
    (ax1, sample_silhouette_values, 'lightblue',
     f'Original Silhouette Distribution\n(Avg: {silhouette_avg:.3f})'),
    (ax2, Zhang_dataset.obs['silhouette_score_corrected'], 'lightgreen',
     f'Refined Silhouette Distribution\n(Avg: {silhouette_avg_corrected:.3f})'),
)

for hist_ax, hist_scores, hist_colour, hist_title in silhouette_histograms:
    hist_ax.hist(hist_scores, bins=50, alpha=0.7, edgecolor='black', color=hist_colour)
    # Dashed reference line at a silhouette score of zero
    hist_ax.axvline(x=0, color='red', linestyle='--', label='Silhouette = 0')
    hist_ax.set_xlabel('Silhouette Score')
    hist_ax.set_ylabel('Number of Cells')
    hist_ax.set_title(hist_title)
    hist_ax.legend()

# Tighten the layout, save at 300 dpi, then display
plt.tight_layout()
plt.savefig(f"{figures_path}/23_Zhang_dataset_silhouette_distribution_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Drop categories that no Zhang cell carries from the simplified and refined detailed columns
for categorical_col in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed_refined'):
    Zhang_dataset.obs[categorical_col] = (
        Zhang_dataset.obs[categorical_col].cat.remove_unused_categories()
    )

In [None]:
# Work on a copy so the normalisation does not touch Zhang_dataset itself
Zhang_dataset_normalized = Zhang_dataset.copy()
ep.Normalise_protein_data(Zhang_dataset_normalized, inplace=True, axis=1, flavor="seurat")

# Rank features per refined detailed label with a Wilcoxon test and plot the top 10 of each
sc.tl.rank_genes_groups(Zhang_dataset_normalized, 'Consensus_annotation_detailed_refined', method='wilcoxon')
# show=False keeps the figure open: if scanpy displays it first, the figure can
# already be closed by the time plt.savefig runs and an empty image is written.
# The other plotting cells in this notebook follow the same pattern.
sc.pl.rank_genes_groups(Zhang_dataset_normalized, n_genes=10, sharey=False, ncols = 3, fontsize = 14,
                        show=False)

plt.savefig(figures_path + "/24_Zhang_dataset_top10_markers.png", dpi=300, bbox_inches='tight')

# Display the figure after it has been saved
plt.show()

In [None]:
# Summarise the normalised data per refined detailed label using the notebook's
# grouped_obs_mean helper, and hold the result as a DataFrame.
AveragedExpression = grouped_obs_mean(
    Zhang_dataset_normalized,
    'Consensus_annotation_detailed_refined',
)
df = pd.DataFrame(AveragedExpression)

In [None]:
# Pairwise Pearson correlation between the columns of `df`
corr = df.corr(method='pearson')

# Boolean mask covering the diagonal and upper triangle, so each pair is drawn once
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes the heatmap is drawn on
f, ax = plt.subplots(figsize=(11, 9))

# Diverging palette for the correlation values
cmap = sns.diverging_palette(235, 15, as_cmap=True)

# Annotated, square-celled heatmap of the unmasked (lower) triangle
heatmap_kwargs = dict(
    mask=mask,
    cmap=cmap,
    annot=True,
    square=True,
    linewidths=.6,
    cbar_kws={"shrink": 1},
    annot_kws={"fontsize":5},
)
heatmap = sns.heatmap(corr, ax=ax, **heatmap_kwargs)

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)

# Save at 300 dpi
plt.savefig(f"{figures_path}/25_Zhang_dataset_correlation_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
import pandas as pd

# Source column with the dataset's own labels, and the three consensus
# columns that the rules below feed.
LEVEL_COL  = "Level 3 Multimodal"
OUT_BROAD  = "Consensus_annotation_broad"
OUT_SIMPL  = "Consensus_annotation_simplified"
OUT_DETAIL = "Consensus_annotation_detailed"

# -----------------------------------------------------------------------------
# 1) Rule table. Every entry reads
#        ([Level 3 Multimodal labels], broad, simplified, detailed)
# -----------------------------------------------------------------------------
RULES = [
    # MEP, including the MEP-Eryth clusters
    (
        ['MEP-1', 'MEP-2', 'MEP-Eryth-1', 'MEP-Eryth-2'],
        'Immature', 'HSPC', 'MEP',
    ),
    # Intermediate and classical monocytes -> CD14 Mono
    (
        ['Intermediate Mono-1', 'Intermediate Mono-2', 'Intermediate Mono-3', 'Classical-Mono'],
        'Mature', 'Monocyte', 'CD14 Mono',
    ),
    # Non-classical monocytes -> CD16 Mono (both spellings are listed)
    (
        ['Non-classical-Mono-1', 'Non-classical-Mono-2', 'Non-Classical Mono-1', 'Non-Classical Mono-2'],
        'Mature', 'Monocyte', 'CD16 Mono',
    ),
    # BMCP -> EoBaMaP
    (
        ['BMCP-1', 'BMCP-2'],
        'Immature', 'HSPC', 'EoBaMaP',
    ),
    # pre-DC-1/2 and cDC1 -> cDC1
    (
        ['pre-DC-2', 'pre-DC-1', 'cDC1'],
        'Mature', 'cDC', 'cDC1',
    ),
    # pre-DC-3, cDC2 and ASDC -> cDC2
    (
        ['pre-DC-3', 'cDC2-1', 'cDC2-2', 'ASDC'],
        'Mature', 'cDC', 'cDC2',
    ),
    # pDC
    (
        ['pDC'],
        'Mature', 'pDC', 'pDC',
    ),
    # ERP-1..8 -> ErP (broad 'Immature', simplified 'Erythroid')
    (
        ['ERP-1', 'ERP-2', 'ERP-3', 'ERP-4', 'ERP-5', 'ERP-6', 'ERP-7', 'ERP-8'],
        'Immature', 'Erythroid', 'ErP',
    ),
    # Erythroblasts
    (
        ['Erythroblast-1', 'Erythroblast-2', 'Erythroblast-3'],
        'Mature', 'Erythroid', 'Erythroblast',
    ),
    # LMPP-1 (incl. cycling) -> MPP
    (
        ['LMPP-1-cycling', 'LMPP-1'],
        'Immature', 'HSPC', 'MPP',
    ),
    # CLP -> Pre-Pro-B
    (
        ['CLP'],
        'Immature', 'HSPC', 'Pre-Pro-B',
    ),
    # Naive CD8 T cells
    (
        ['T CD8 Naive'],
        'Mature', 'CD8 T', 'CD8 T Naive',
    ),
    # Myeloid intermediates, Mono-1/2 and cMOP -> Myeloid progenitor (broad 'Mature')
    (
        ['Myeloid intermediate 1', 'Myeloid intermediate 2', 'Myeloid intermediate 3', 'Mono-1', 'Mono-2', 'cMOP'],
        'Mature', 'Myeloid', 'Myeloid progenitor',
    ),
    # preNeu / immNeu -> GMP
    (
        ['preNeu', 'immNeu-1', 'immNeu-2'],
        'Immature', 'HSPC', 'GMP',
    ),
    # MPP-MEP -> MPP
    (
        ['MPP-MEP'],
        'Immature', 'HSPC', 'MPP',
    ),
    # HSC-1/2 and MPP-1/2 -> HSC
    (
        ['HSC-1', 'HSC-2', 'MPP-1', 'MPP-2'],
        'Immature', 'HSPC', 'HSC',
    ),
    # MultiLin / MultiLin-GMP, LMPP-2 and MDP -> LMPP
    (
        ['MultiLin-GMP-1', 'MultiLin-GMP-2', 'MultiLin-GMP-3', 'Multilin-1', 'Multilin-2', 'Multilin-3', 'LMPP-2', 'MDP-1', 'MDP-2'],
        'Immature', 'HSPC', 'LMPP',
    ),
    # Pro-B (broad 'Immature', simplified 'B')
    (
        ['Pro-B-Early-cycling', 'Pro-B-Early', 'Pro-B-cycling-1', 'Pro-B-cycling-2', 'Pro-B-2', 'Pro-B-3', 'Pro-B-1'],
        'Immature', 'B', 'Pro-B',
    ),
    # Transitional-B-2 -> Pre-B
    (
        ['Transitional-B-2'],
        'Immature', 'B', 'Pre-B',
    ),
    # Transitional-B-1 and pre-B -> Immature B
    (
        ['Transitional-B-1', 'pre-B'],
        'Mature', 'B', 'Immature B',
    ),
]

# -----------------------------------------------------------------------------
# 2) Build lookup dicts (one per output level, keyed by Level 3 label)
# -----------------------------------------------------------------------------
broad_map, simpl_map, detail_map = {}, {}, {}
for keys, broad, simpl, detail in RULES:
    for k in keys:
        broad_map[k]  = broad
        simpl_map[k]  = simpl
        detail_map[k] = detail

# -----------------------------------------------------------------------------
# 3) Apply mapping safely (avoid categorical setitem issues on reruns)
#    Writing a value that is not yet a category into a categorical column
#    raises, so the columns are relaxed to object before assignment.
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    if col not in Zhang_dataset.obs.columns:
        Zhang_dataset.obs[col] = pd.NA
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(Zhang_dataset.obs[col].dtype, pd.CategoricalDtype):
        Zhang_dataset.obs[col] = Zhang_dataset.obs[col].astype("object")

# Rows whose Level 3 label has a rule; all other rows keep their current labels
mask = Zhang_dataset.obs[LEVEL_COL].isin(list(broad_map))

Zhang_dataset.obs.loc[mask, OUT_BROAD]  = Zhang_dataset.obs.loc[mask, LEVEL_COL].map(broad_map)
Zhang_dataset.obs.loc[mask, OUT_SIMPL]  = Zhang_dataset.obs.loc[mask, LEVEL_COL].map(simpl_map)
Zhang_dataset.obs.loc[mask, OUT_DETAIL] = Zhang_dataset.obs.loc[mask, LEVEL_COL].map(detail_map)

# Make these categoricals after assignment (categories inferred from the values)
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    Zhang_dataset.obs[col] = pd.Categorical(Zhang_dataset.obs[col])

# -----------------------------------------------------------------------------
# 4) Build Consensus_annotation_simplified_final from refined detailed (or fallback)
# -----------------------------------------------------------------------------
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"
REFINED_DETAIL_COL = "Consensus_annotation_detailed_refined"  # preferred input
DETAIL_INPUT = REFINED_DETAIL_COL if REFINED_DETAIL_COL in Zhang_dataset.obs.columns else OUT_DETAIL

# Fixed category order of the simplified_final column
simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'Macrophage', 'Stroma', 'Myeloid', 'Doublet', 'Other T', 'Plasma', 'Mesenchymal'
]

# simplified label -> detailed labels that collapse into it.
# Each detailed label appears exactly once, so the result does not depend on
# assignment order:
#   * 'ErP' is listed under 'Erythroid' only (it used to be written as 'HSPC'
#     and then overwritten by 'Erythroid'; the end result is unchanged).
#   * 'Doublet' is listed so the 'Doublet' category can actually be produced.
#   * 'DnT' joins 'Gamma delta T' under 'Other T', matching the grouping used
#     for the merged dataset in this notebook.
ZHANG_SIMPLIFIED_GROUPS = {
    'HSPC': ['HSC','MPP','LMPP','EoBaMaP','Pre-Pro-B','Pro-B','GMP','MkP','MEP'],
    'Monocyte': ['CD14 Mono','CD16 Mono'],
    'NK': ['NK CD56 dim','NK CD56 bright'],
    'CD4 T': ['CD4 T Naive','CD4 T Memory','Treg','CD4 CTL'],
    'CD8 T': ['CD8 T Naive','CD8 T Memory','MAIT'],
    'B': ['B Naive','B Memory','Immature B','Pre-B'],
    'cDC': ['cDC1','cDC2'],
    'Erythroid': ['Erythroblast','ErP'],
    'Myeloid': ['Myeloid progenitor'],
    'pDC': ['pDC'],
    'Other T': ['Gamma delta T','DnT'],
    'Macrophage': ['Macrophage'],
    'Mesenchymal': ['Mesenchymal'],
    'Stroma': ['Stroma'],
    'Plasma': ['Plasma'],
    'Doublet': ['Doublet'],
}

# Reverse lookup: detailed -> simplified (fail loudly on a duplicated label)
detail_to_simpl_final = {}
for simpl_label, detail_labels in ZHANG_SIMPLIFIED_GROUPS.items():
    for detail_label in detail_labels:
        if detail_label in detail_to_simpl_final:
            raise ValueError(
                f"Detailed label {detail_label!r} is listed under both "
                f"{detail_to_simpl_final[detail_label]!r} and {simpl_label!r}"
            )
        detail_to_simpl_final[detail_label] = simpl_label

s = Zhang_dataset.obs[DETAIL_INPUT]

# Rebuilt from scratch on every run; detailed labels without an entry stay NA.
# Fixed categories/order are enforced in the same step.
Zhang_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(
    s.astype("object").map(detail_to_simpl_final),
    categories=simpl_categories,
)

# -----------------------------------------------------------------------------
# 5) Build Consensus_annotation_broad_final (Zhang-style)
# -----------------------------------------------------------------------------
BROAD_FINAL_COL = "Consensus_annotation_broad_final"
broad_categories = ["Immature", "Mature", "Doublet"]

sf = Zhang_dataset.obs[SIMPL_FINAL_COL]
d = Zhang_dataset.obs[DETAIL_INPUT]

# Immature progenitors (the HSPC detailed labels plus ErP)
immature_details = ['HSC','MPP','LMPP','EoBaMaP','Pre-Pro-B','Pro-B','GMP','MkP','ErP','MEP']

# Rebuilt from scratch on every run
broad_final = pd.Series(pd.NA, index=Zhang_dataset.obs.index, dtype="object")

# Mature: anything assigned in simplified_final and not Immature/Doublet.
# Cells WITHOUT a simplified_final label are deliberately left NA so the
# sanity check can report them, rather than being counted as 'Mature'.
broad_final.loc[sf.notna().to_numpy()] = "Mature"

# Immature progenitors override the default
broad_final.loc[d.isin(immature_details).to_numpy()] = "Immature"

# Doublets, if present in simplified_final
broad_final.loc[sf.eq("Doublet").to_numpy()] = "Doublet"

# Enforce fixed categories/order
Zhang_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(broad_final, categories=broad_categories)

# -----------------------------------------------------------------------------
# 6) Sanity checks: report rule coverage and any unassigned final labels
# -----------------------------------------------------------------------------
n_cells = Zhang_dataset.n_obs

# How many cells had a Level 3 label covered by a rule
print(f"Matched Level 3 rows: {int(mask.sum())} / {n_cells}")

# Most frequent Level 3 labels among the cells no rule matched
unmapped_level3 = (
    Zhang_dataset.obs.loc[~mask, LEVEL_COL]
    .value_counts()
    .head(30)
)
print("\nTop unmatched Level 3 Multimodal labels (first 30):")
print(unmapped_level3.to_string())

# Cells left without a simplified_final / broad_final label
simpl_final = Zhang_dataset.obs[SIMPL_FINAL_COL]
broad_final_labels = Zhang_dataset.obs[BROAD_FINAL_COL]

unassigned_simpl_final = int(simpl_final.isna().sum())
print(f"\nUnassigned '{SIMPL_FINAL_COL}' (NA) rows: {unassigned_simpl_final} / {n_cells}")

unassigned_broad_final = int(broad_final_labels.isna().sum())
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {unassigned_broad_final} / {n_cells}")

# Label distributions, with NA counted as its own row
print("\nBroad_final value counts:")
print(broad_final_labels.value_counts(dropna=False).to_string())

print("\nSimplified_final value counts (top 30):")
print(simpl_final.value_counts(dropna=False).head(30).to_string())


In [None]:
# Copy the refined detailed labels into the final detailed annotation column
Zhang_dataset.obs['Consensus_annotation_detailed_final'] = Zhang_dataset.obs['Consensus_annotation_detailed_refined']

In [None]:
# Drop cells labelled 'Gamma delta T' from the final Zhang object, reporting the
# label distribution before and after, then redraw the detailed UMAP.
print(f"Before removing Gamma delta T cells: {len(Zhang_dataset)} cells")
print("Cell type counts before:")
print(Zhang_dataset.obs['Consensus_annotation_detailed_final'].value_counts())

# True for every cell that is kept
mask_not_gdt = Zhang_dataset.obs['Consensus_annotation_detailed_final'].ne('Gamma delta T')

# Materialise the subset so later cells work on a real object, not a view
Zhang_dataset = Zhang_dataset[mask_not_gdt, :].copy()

# Prune categories that no longer occur in any of the final annotation columns
for final_annotation_col in (
    'Consensus_annotation_detailed_final',
    'Consensus_annotation_simplified_final',
    'Consensus_annotation_broad_final',
):
    Zhang_dataset.obs[final_annotation_col] = (
        Zhang_dataset.obs[final_annotation_col].cat.remove_unused_categories()
    )

print(f"After removing Gamma delta T cells: {len(Zhang_dataset)} cells")
print("Cell type counts after:")
print(Zhang_dataset.obs['Consensus_annotation_detailed_final'].value_counts())

# Detailed annotation on the UMAP after filtering
sc.pl.embedding(
    Zhang_dataset,
    basis='X_umap',
    color='Consensus_annotation_detailed_final',
    legend_loc='on data',
    legend_fontsize=4,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

plt.tight_layout()
plt.show()

In [None]:
# Prune categories that no longer occur (simplified first, then detailed)
for final_annotation_col in (
    'Consensus_annotation_simplified_final',
    'Consensus_annotation_detailed_final',
):
    Zhang_dataset.obs[final_annotation_col] = (
        Zhang_dataset.obs[final_annotation_col].cat.remove_unused_categories()
    )

In [None]:
# Zhang UMAP coloured by the final broad consensus annotation
sc.pl.embedding(
    Zhang_dataset,
    basis='X_umap',
    color='Consensus_annotation_broad_final',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Zhang X. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus broad annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk (300 dpi) before displaying it
plt.savefig(
    figures_path + "/26_Zhang_dataset_final_consensus_annotation_broad_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Zhang UMAP coloured by the final simplified consensus annotation
sc.pl.embedding(
    Zhang_dataset,
    basis='X_umap',
    color='Consensus_annotation_simplified_final',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Zhang X. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus simplified annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk (300 dpi) before displaying it
plt.savefig(
    figures_path + "/27_Zhang_dataset_final_consensus_annotation_simplified_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Zhang UMAP coloured by the final detailed consensus annotation
sc.pl.embedding(
    Zhang_dataset,
    basis='X_umap',
    color='Consensus_annotation_detailed_final',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Zhang X. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk (300 dpi) before displaying it
plt.savefig(
    figures_path + "/28_Zhang_dataset_final_Consensus_annotation_detailed_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

### Hao dataset

In [None]:
# Prune categories that no longer occur (simplified first, then detailed)
for annotation_col in (
    'Consensus_annotation_simplified',
    'Consensus_annotation_detailed',
):
    Hao_dataset.obs[annotation_col] = (
        Hao_dataset.obs[annotation_col].cat.remove_unused_categories()
    )

In [None]:
# Keep only the cells that are not labelled as 'Doublet' in the original
# celltype.l2 annotation. `.copy()` turns the subset into a standalone AnnData:
# later cells write to `.obs`, and writing into a view forces an implicit copy
# (with an ImplicitModificationWarning). The Zhang filtering cell does the same.
Hao_dataset = Hao_dataset[Hao_dataset.obs['celltype.l2'] != 'Doublet'].copy()

In [None]:
# Number of cells per detailed consensus label; `counts` is reused by the next
# cell to drop rare labels.
counts = Hao_dataset.obs['Consensus_annotation_detailed'].value_counts()
print(counts)

In [None]:
# Keep only detailed labels represented by at least 10 cells (`counts` comes
# from the preceding cell). The boolean mask is applied directly instead of
# being round-tripped through obs names, which would misbehave if obs names
# were not unique, and the subset is copied so that later `.obs` writes act on
# a real AnnData rather than a view.
filtered_categories = counts[counts >= 10].index
keep_cells = Hao_dataset.obs['Consensus_annotation_detailed'].isin(filtered_categories).to_numpy()
Hao_dataset = Hao_dataset[keep_cells, :].copy()

In [None]:
# Clear any existing color palette to force scanpy to regenerate it.
# scanpy stores the palette of obs column <name> under uns['<name>_colors'];
# the key without the '_colors' suffix never exists, so checking for it would
# leave the stale palette in place.
if 'Consensus_annotation_detailed_colors' in Hao_dataset.uns:
    del Hao_dataset.uns['Consensus_annotation_detailed_colors']

# Hao WNN UMAP coloured by the (smoothed) detailed consensus annotation
sc.pl.embedding(Hao_dataset, 
                color='Consensus_annotation_detailed', 
                basis='X_wnn.umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize its figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the plot
ax.set_title('Hao Y. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Small grey subtitle
plt.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Hao WNN UMAP coloured by the ORIGINAL celltype.l2 labels, for comparison with
# the consensus annotation plotted in the previous cell.
sc.pl.embedding(Hao_dataset, 
                color='celltype.l2', 
                basis='X_wnn.umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize its figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the plot (same wording as the other Hao figures)
ax.set_title('Hao Y. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# The subtitle names what is actually plotted: this panel shows celltype.l2,
# not the consensus detailed annotation.
plt.suptitle('Original annotation (celltype.l2)', fontsize=8, y=0.925,
            color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Cells whose detailed consensus label is 'Progenitor' are relabelled as CD14
# monocytes at all three annotation levels. The mask is taken once, before the
# detailed column itself is overwritten in the last iteration.
progenitor_mask = Hao_dataset.obs['Consensus_annotation_detailed'] == 'Progenitor'
for annotation_col, replacement_label in (
    ('Consensus_annotation_broad', 'Mature'),
    ('Consensus_annotation_simplified', 'Monocyte'),
    ('Consensus_annotation_detailed', 'CD14 Mono'),
):
    Hao_dataset.obs.loc[progenitor_mask, annotation_col] = replacement_label

In [None]:
import pandas as pd

OUT_BROAD  = "Consensus_annotation_broad"
OUT_SIMPL  = "Consensus_annotation_simplified"
OUT_DETAIL = "Consensus_annotation_detailed"

# -----------------------------------------------------------------------------
# 0) Ensure output columns exist and are OBJECT dtype (safe on reruns)
#    Categorical columns reject labels outside their categories, so they are
#    cast to object before the assignments below.
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    if col not in Hao_dataset.obs.columns:
        Hao_dataset.obs[col] = pd.NA
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(Hao_dataset.obs[col].dtype, pd.CategoricalDtype):
        Hao_dataset.obs[col] = Hao_dataset.obs[col].astype("object")

# Convenience handles (avoid KeyError if a column is absent)
l2 = Hao_dataset.obs["celltype.l2"] if "celltype.l2" in Hao_dataset.obs.columns else pd.Series(index=Hao_dataset.obs.index, dtype="object")
l3 = Hao_dataset.obs["celltype.l3"] if "celltype.l3" in Hao_dataset.obs.columns else pd.Series(index=Hao_dataset.obs.index, dtype="object")

# -----------------------------------------------------------------------------
# 1) Apply your overrides
#    One row per rule: (source column, source label, broad, simplified, detailed).
#    Rules run top to bottom, so a later rule wins when a cell matches several.
# -----------------------------------------------------------------------------
overrides = [
    (l2, "MAIT",     "Mature",   "CD8 T",     "MAIT"),
    (l2, "Eryth",    "Mature",   "Erythroid", "Erythroblast"),
    (l2, "gdT",      "Mature",   "Other T",   "GdT"),
    (l2, "dnT",      "Mature",   "Other T",   "DnT"),
    (l3, "ASDC_mDC", "Mature",   "cDC",       "cDC2"),
    # HSPC -> MPP is a progenitor population: broad label is "Immature", in line
    # with the immature_details lists used for the *_broad_final columns.
    (l3, "HSPC",     "Immature", "HSPC",      "MPP"),
    (l3, "CD8 TEM",  "Mature",   "CD8 T",     "CD8 T Memory"),
    (l2, "CD4 TCM",  "Mature",   "CD4 T",     "CD4 T Memory"),
    (l3, "ILC",      "Mature",   "ILC",       "ILC"),
    (l3, "Platelet", "Mature",   "Erythroid", "Platelet"),   # keeping your original choice
]

for source_labels, source_value, broad_label, simpl_label, detail_label in overrides:
    m = source_labels.eq(source_value)
    Hao_dataset.obs.loc[m, OUT_BROAD]  = broad_label
    Hao_dataset.obs.loc[m, OUT_SIMPL]  = simpl_label
    Hao_dataset.obs.loc[m, OUT_DETAIL] = detail_label

# Immature B -> simplified B (and keep detailed label). This rule keys on the
# detailed column as it stands AFTER the overrides above.
m = Hao_dataset.obs[OUT_DETAIL].eq("Immature B")
Hao_dataset.obs.loc[m, OUT_BROAD] = "Mature"
Hao_dataset.obs.loc[m, OUT_SIMPL] = "B"
Hao_dataset.obs.loc[m, OUT_DETAIL] = "Immature B"

# -----------------------------------------------------------------------------
# 2) Optional: convert back to categoricals AFTER all assignments
#    (categories inferred; safe and rerunnable)
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    Hao_dataset.obs[col] = pd.Categorical(Hao_dataset.obs[col])

# -----------------------------------------------------------------------------
# 3) Sanity checks
# -----------------------------------------------------------------------------
print("Value counts (simplified) top 20:")
print(Hao_dataset.obs[OUT_SIMPL].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (detailed) top 20:")
print(Hao_dataset.obs[OUT_DETAIL].value_counts(dropna=False).head(20).to_string())


In [None]:
# Prune categories that no longer occur (simplified first, then detailed)
for annotation_col in (
    'Consensus_annotation_simplified',
    'Consensus_annotation_detailed',
):
    Hao_dataset.obs[annotation_col] = (
        Hao_dataset.obs[annotation_col].cat.remove_unused_categories()
    )

In [None]:
# Number of cells per detailed consensus label after the manual overrides
counts = Hao_dataset.obs['Consensus_annotation_detailed'].value_counts()
print(counts)

In [None]:
# NOTE(review): this cell is identical to the preceding one and recomputes the
# same `counts`; one of the two can be removed.
counts = Hao_dataset.obs['Consensus_annotation_detailed'].value_counts()
print(counts)

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Calculate silhouette scores for current annotations
print("Calculating silhouette scores for Hao dataset...")

# Use the WNN UMAP representation for silhouette analysis; the detailed labels
# are encoded as integer category codes.
X_embed = Hao_dataset.obsm['X_wnn.umap']
labels = Hao_dataset.obs['Consensus_annotation_detailed'].astype('category').cat.codes

# Per-cell silhouettes. With its default sample_size=None, silhouette_score is
# the mean of silhouette_samples, so the average is taken from the per-cell
# values instead of repeating the pairwise-distance computation.
sample_silhouette_values = silhouette_samples(X_embed, labels)
silhouette_avg = np.mean(sample_silhouette_values)

print(f"Average silhouette score: {silhouette_avg:.3f}")

# Add silhouette scores to the dataset
Hao_dataset.obs['silhouette_score'] = sample_silhouette_values

# Identify cells with negative silhouette scores
negative_silhouette_mask = sample_silhouette_values < 0
print(f"Number of cells with negative silhouette scores: {negative_silhouette_mask.sum()}")
print(f"Percentage of cells with negative silhouette scores: {negative_silhouette_mask.sum()/len(sample_silhouette_values)*100:.2f}%")

# Show distribution of silhouette scores by cell type
silhouette_by_type = Hao_dataset.obs.groupby('Consensus_annotation_detailed')['silhouette_score'].agg(['mean', 'std', 'min', 'max', 'count'])
print("\nSilhouette scores by cell type:")
print(silhouette_by_type.sort_values('mean'))

# Initialize refined annotations (start with original smoothed annotations)
Hao_dataset.obs['Consensus_annotation_detailed_refined'] = Hao_dataset.obs['Consensus_annotation_detailed'].copy()

# Perform silhouette-based reassignment: cells with a silhouette below -0.1 take
# the majority label of their nearest neighbours in WNN UMAP space, provided
# that label differs from their own and covers at least 40% of the neighbours.
print("\n=== PERFORMING SILHOUETTE-BASED REASSIGNMENT ===")

# Identify cells with very poor silhouette scores (< -0.1)
very_poor_silhouette = Hao_dataset.obs['silhouette_score'] < -0.1

if very_poor_silhouette.sum() > 0:
    print(f"Found {very_poor_silhouette.sum()} cells with very poor silhouette scores (< -0.1)")
    
    # Fit nearest neighbors (30 = the query cell itself + 29 neighbours)
    nn = NearestNeighbors(n_neighbors=30, metric='euclidean')
    nn.fit(X_embed)
    
    # Positional indices of poorly assigned cells
    poor_indices = np.where(very_poor_silhouette)[0]
    
    # One batched query for all poor cells instead of one query per cell
    _, poor_neighbor_indices = nn.kneighbors(X_embed[poor_indices])
    
    detailed_labels = Hao_dataset.obs['Consensus_annotation_detailed']
    # Column position for positional writes; `idx` is a row position, so iloc
    # stays correct even if obs names are not unique.
    refined_col_pos = Hao_dataset.obs.columns.get_loc('Consensus_annotation_detailed_refined')
    
    reassignments_made = 0
    
    for idx, neighbor_indices in zip(poor_indices, poor_neighbor_indices):
        # Exclude the cell itself explicitly (it is not guaranteed to come first
        # when other cells share its coordinates) and keep at most 29 neighbours.
        neighbor_indices = neighbor_indices[neighbor_indices != idx][:29]
        
        # Get annotations of neighbors
        neighbor_annotations = detailed_labels.iloc[neighbor_indices]
        
        # Most common annotation among neighbors (empty if all are missing)
        most_common = neighbor_annotations.mode()
        if len(most_common) == 0:
            continue
        
        new_annotation = most_common.iloc[0]
        current_annotation = detailed_labels.iloc[idx]
        
        # Only reassign if the most common neighbor annotation is different
        if new_annotation == current_annotation:
            continue
        
        # Require at least 40% of the neighbours to carry the new annotation
        fraction = (neighbor_annotations == new_annotation).sum() / len(neighbor_annotations)
        if fraction >= 0.4:
            Hao_dataset.obs.iloc[idx, refined_col_pos] = new_annotation
            reassignments_made += 1
    
    print(f"Reassigned {reassignments_made} cells based on neighborhood consensus")
    
    # Recalculate silhouette scores after reassignment. The average is the mean
    # of the per-cell values, which is what silhouette_score returns by default,
    # so the pairwise distances are only computed once.
    new_labels = Hao_dataset.obs['Consensus_annotation_detailed_refined'].astype('category').cat.codes
    new_silhouette_scores = silhouette_samples(X_embed, new_labels)
    silhouette_avg_corrected = np.mean(new_silhouette_scores)
    
    # Store corrected scores
    Hao_dataset.obs['silhouette_score_corrected'] = new_silhouette_scores
    
    print(f"\n=== REASSIGNMENT RESULTS ===")
    print(f"Original average silhouette: {silhouette_avg:.3f}")
    print(f"Refined average silhouette: {silhouette_avg_corrected:.3f}")
    print(f"Improvement: {silhouette_avg_corrected - silhouette_avg:.3f}")
    
    print(f"Original negative silhouette cells: {negative_silhouette_mask.sum()}")
    print(f"Refined negative silhouette cells: {(new_silhouette_scores < 0).sum()}")
    
    # Show what changes were made
    if reassignments_made > 0:
        changes_mask = (Hao_dataset.obs['Consensus_annotation_detailed'] != 
                       Hao_dataset.obs['Consensus_annotation_detailed_refined'])
        changes = Hao_dataset.obs[changes_mask]
        
        print(f"\n=== SPECIFIC REASSIGNMENTS ===")
        # observed=True: both keys are categorical, and without it groupby
        # reports every category pair, including the ones with zero cells.
        change_summary = changes.groupby([
            'Consensus_annotation_detailed', 
            'Consensus_annotation_detailed_refined'
        ], observed=True).size().reset_index(name='count')
        
        for _, row in change_summary.iterrows():
            print(f"{row['Consensus_annotation_detailed']} -> {row['Consensus_annotation_detailed_refined']}: {row['count']} cells")

else:
    print("No cells with very poor silhouette scores found.")
    # Create corrected scores column that's identical to original
    Hao_dataset.obs['silhouette_score_corrected'] = Hao_dataset.obs['silhouette_score'].copy()
    silhouette_avg_corrected = silhouette_avg

# Flag, per cell, whether the refined label differs from the original one; the
# flag is used as a colour key in the figure below.
reassignment_mask = (
    Hao_dataset.obs['Consensus_annotation_detailed']
    != Hao_dataset.obs['Consensus_annotation_detailed_refined']
)
Hao_dataset.obs['reassignment_status'] = 'Unchanged'
Hao_dataset.obs.loc[reassignment_mask, 'reassignment_status'] = 'Reassigned'

# Final summary
final_counts = Hao_dataset.obs['Consensus_annotation_detailed_refined'].value_counts()
print(f"\n=== FINAL SUMMARY ===")
print(f"Total cells: {len(Hao_dataset)}")
print(f"Cells reassigned: {reassignment_mask.sum()}")
print(f"Final cell type distribution:", final_counts, sep="\n")

# 2x2 overview: original labels, refined labels, which cells changed, and the
# per-cell silhouette after refinement.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Legend settings shared by the two annotation panels
on_data_legend = dict(legend_loc='on data', legend_fontsize=5, legend_fontoutline=2)

# (axis, panel title, panel-specific scanpy arguments), drawn in this order
panel_specs = [
    (axes[0, 0], 'Original Smoothed Annotations',
     dict(color='Consensus_annotation_detailed', **on_data_legend)),
    (axes[0, 1], 'Silhouette-Refined Annotations',
     dict(color='Consensus_annotation_detailed_refined', **on_data_legend)),
    (axes[1, 0], 'Reassignment Status',
     dict(color='reassignment_status',
          palette={'Unchanged': 'lightgray', 'Reassigned': 'red'},
          legend_loc='right margin')),
    (axes[1, 1], 'Silhouette Scores (Refined)',
     dict(color='silhouette_score_corrected', color_map='RdBu_r')),
]

for panel_ax, panel_title, panel_kwargs in panel_specs:
    sc.pl.embedding(
        Hao_dataset,
        basis='X_wnn.umap',
        add_outline=False,
        frameon=False,
        show=False,
        ax=panel_ax,
        **panel_kwargs,
    )
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(figures_path + "/29_Hao_dataset_silhouette_refinement_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# Side-by-side histograms of the per-cell silhouette before and after refinement
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

# (axis, scores, bar colour, title) for the original and the refined distribution
histogram_specs = [
    (ax1, sample_silhouette_values, 'lightblue',
     f'Original Silhouette Distribution\n(Avg: {silhouette_avg:.3f})'),
    (ax2, Hao_dataset.obs['silhouette_score_corrected'], 'lightgreen',
     f'Refined Silhouette Distribution\n(Avg: {silhouette_avg_corrected:.3f})'),
]

for hist_ax, hist_scores, bar_color, hist_title in histogram_specs:
    hist_ax.hist(hist_scores, bins=50, alpha=0.7, edgecolor='black', color=bar_color)
    # Dashed reference line at silhouette = 0
    hist_ax.axvline(x=0, color='red', linestyle='--', label='Silhouette = 0')
    hist_ax.set_xlabel('Silhouette Score')
    hist_ax.set_ylabel('Number of Cells')
    hist_ax.set_title(hist_title)
    hist_ax.legend()

plt.tight_layout()
plt.savefig(figures_path + "/30_Hao_dataset_silhouette_distribution_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Prune categories that no longer occur (simplified first, then refined detailed)
for annotation_col in (
    'Consensus_annotation_simplified',
    'Consensus_annotation_detailed_refined',
):
    Hao_dataset.obs[annotation_col] = (
        Hao_dataset.obs[annotation_col].cat.remove_unused_categories()
    )

In [None]:
# Normalise a copy of the Hao object in place, then rank marker features per
# refined detailed label with a Wilcoxon test.
Hao_dataset_normalized = Hao_dataset.copy()
ep.Normalise_protein_data(Hao_dataset_normalized, inplace=True, axis=1, flavor="seurat")
sc.tl.rank_genes_groups(Hao_dataset_normalized, 'Consensus_annotation_detailed_refined', method='wilcoxon')

# show=False keeps the figure open: with scanpy's default the plot is shown (and,
# under the inline backend, closed) inside the call, so the savefig that follows
# would write an empty canvas.
sc.pl.rank_genes_groups(Hao_dataset_normalized, n_genes=10, sharey=False, ncols = 3, fontsize = 14, show=False)

plt.savefig(figures_path + "/31_Hao_dataset_top10_markers.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Mean normalised expression per refined detailed label (features x labels)
AveragedExpression = grouped_obs_mean(
    Hao_dataset_normalized,
    group_key='Consensus_annotation_detailed_refined',
)
df = pd.DataFrame(data=AveragedExpression)

In [None]:
# Pairwise Pearson correlation between the per-label mean expression profiles
corr = df.corr(method='pearson')

# Hide the upper triangle (diagonal included) so each pair appears once
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Canvas and a blue-to-red diverging colour map
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(235, 15, as_cmap=True)

# Annotated, square-celled heatmap of the lower triangle
heatmap = sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    annot=True,
    square=True,
    linewidths=.6,
    cbar_kws={"shrink": 1},
    annot_kws={"fontsize":5},
)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)

plt.savefig(figures_path + "/32_Hao_dataset_correlation_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
import pandas as pd

# =============================================================================
# Hao_dataset: build *_final columns (broad_final + simplified_final) Zhang-style
# - Safe on reruns (outputs are recreated from scratch)
# - Deterministic (resets outputs each run)
# - Enforces fixed category orders at the end
# =============================================================================

# -----------------------------
# Column names
# -----------------------------
DETAIL_REFINED_COL = "Consensus_annotation_detailed_refined"
DETAIL_FALLBACK_COL = "Consensus_annotation_detailed"

BROAD_FINAL_COL = "Consensus_annotation_broad_final"
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"

# If refined detailed does not exist, fall back
DETAIL_INPUT = DETAIL_REFINED_COL if DETAIL_REFINED_COL in Hao_dataset.obs.columns else DETAIL_FALLBACK_COL

# -----------------------------
# Fixed category orders
# -----------------------------
broad_categories = ["Immature", "Mature", "Doublet"]

simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'ILC', 'Other T', 'Macrophage', 'Stroma', 'Myeloid', 'Doublet', 'Plasma', 'Platelet'
]

# -----------------------------
# 0) Reset the *_final columns so reruns are deterministic
#    Whole-column assignment replaces any column left over from a previous run
#    (categorical or not), so no dtype pre-check is needed.
# -----------------------------
for col in [BROAD_FINAL_COL, SIMPL_FINAL_COL]:
    Hao_dataset.obs[col] = pd.NA

# -----------------------------
# 1) Build simplified_final from detailed_refined (or fallback detailed)
# -----------------------------
d = Hao_dataset.obs[DETAIL_INPUT]

# simplified label -> detailed labels that roll up into it.
# If you have additional refined labels and want them routed
# (Macrophage, Stroma, Myeloid, Doublet), add them here.
simplified_members = {
    'HSPC':      ['MPP'],
    'Monocyte':  ['CD14 Mono', 'CD16 Mono'],
    'NK':        ['NK CD56 dim', 'NK CD56 bright'],
    'CD4 T':     ['CD4 T Naive', 'CD4 T Memory', 'Treg', 'CD4 CTL'],
    'CD8 T':     ['CD8 T Naive', 'CD8 T Memory', 'MAIT'],
    'B':         ['Pre-B', 'B Naive', 'B Memory', 'Immature B'],
    'Erythroid': ['Erythroblast'],
    'Platelet':  ['Platelet'],
    'ILC':       ['ILC'],
    'Other T':   ['GdT', 'DnT'],
    'cDC':       ['cDC1', 'cDC2'],
    'pDC':       ['pDC'],
    'Plasma':    ['Plasma'],
}

for simplified_label, detailed_labels in simplified_members.items():
    Hao_dataset.obs.loc[d.isin(detailed_labels), SIMPL_FINAL_COL] = simplified_label

# Enforce fixed categories/order for simplified_final
Hao_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(Hao_dataset.obs[SIMPL_FINAL_COL], categories=simpl_categories)

# -----------------------------
# 2) Build broad_final
#    Assignments run in this order, later ones overwrite earlier ones:
#      - simplified_final is Doublet          -> Doublet
#      - detailed label is a progenitor       -> Immature
#      - every row still NA after those two   -> Mature
#    There is no "Other" bucket: rows without a simplified_final label also
#    end up as Mature, so broad_final never contains NA.
# -----------------------------
sf = Hao_dataset.obs[SIMPL_FINAL_COL]

# Doublet (only if you ever assign it in simplified_final)
Hao_dataset.obs.loc[sf.eq("Doublet"), BROAD_FINAL_COL] = "Doublet"

# Immature progenitors (expand this list if your refined labels include more progenitors)
# NOTE(review): this list differs from the one used for Zhang_dataset (which has
# 'EoBaMaP' but neither 'Pre-B' nor 'CLP') -- confirm the difference is intended.
immature_details = [
    "HSC", "MPP", "LMPP", "GMP", "MEP", "ErP", "MkP",
    "Pre-Pro-B", "Pro-B", "Pre-B", "CLP"
]
Hao_dataset.obs.loc[d.isin(immature_details), BROAD_FINAL_COL] = "Immature"

# Mature: everything not labelled Doublet or Immature above
Hao_dataset.obs.loc[Hao_dataset.obs[BROAD_FINAL_COL].isna(), BROAD_FINAL_COL] = "Mature"

# Enforce fixed categories/order for broad_final
Hao_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(Hao_dataset.obs[BROAD_FINAL_COL], categories=broad_categories)

# -----------------------------
# 3) Sanity checks
# -----------------------------
na_simpl = int(Hao_dataset.obs[SIMPL_FINAL_COL].isna().sum())
na_broad = int(Hao_dataset.obs[BROAD_FINAL_COL].isna().sum())

print(f"Using detail input column: {DETAIL_INPUT}")
print(f"Unassigned '{SIMPL_FINAL_COL}' (NA) rows: {na_simpl} / {Hao_dataset.n_obs}")
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {na_broad} / {Hao_dataset.n_obs}")

print("\nBroad final value counts:")
print(Hao_dataset.obs[BROAD_FINAL_COL].value_counts(dropna=False).to_string())

print("\nSimplified final value counts (top 30):")
print(Hao_dataset.obs[SIMPL_FINAL_COL].value_counts(dropna=False).head(30).to_string())

# Diagnostics for what is unmapped in simplified_final
unmapped = (
    Hao_dataset.obs.loc[Hao_dataset.obs[SIMPL_FINAL_COL].isna(), DETAIL_INPUT]
    .astype("object")
    .fillna("<<NA in detailed>>")
    .value_counts()
    .head(30)
)
print(f"\nTop unmapped '{DETAIL_INPUT}' labels among NA simplified_final rows (top 30):")
print(unmapped.to_string())


In [None]:
# Copy the silhouette-refined detailed labels into the final detailed annotation column
Hao_dataset.obs['Consensus_annotation_detailed_final'] = Hao_dataset.obs['Consensus_annotation_detailed_refined']

In [None]:
# Prune categories that no longer occur (simplified first, then detailed)
for final_annotation_col in (
    'Consensus_annotation_simplified_final',
    'Consensus_annotation_detailed_final',
):
    Hao_dataset.obs[final_annotation_col] = (
        Hao_dataset.obs[final_annotation_col].cat.remove_unused_categories()
    )

In [None]:
# Hao WNN UMAP coloured by the final broad consensus annotation
sc.pl.embedding(
    Hao_dataset,
    basis='X_wnn.umap',
    color='Consensus_annotation_broad_final',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Hao Y. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus broad annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk (300 dpi) before displaying it
plt.savefig(
    figures_path + "/33_Hao_dataset_final_consensus_annotation_broad_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Hao WNN UMAP coloured by the final simplified consensus annotation
sc.pl.embedding(
    Hao_dataset,
    basis='X_wnn.umap',
    color='Consensus_annotation_simplified_final',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Hao Y. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus simplified annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk (300 dpi) before displaying it
plt.savefig(
    figures_path + "/34_Hao_dataset_final_consensus_annotation_simplified_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

In [None]:
# Hao WNN UMAP coloured by the final detailed consensus annotation
sc.pl.embedding(
    Hao_dataset,
    basis='X_wnn.umap',
    color='Consensus_annotation_detailed_final',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Hao Y. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk (300 dpi) before displaying it
plt.savefig(
    figures_path + "/35_Hao_dataset_final_Consensus_annotation_detailed_annotation.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

### Triana dataset

In [None]:
# Prune categories that no longer occur (simplified first, then detailed)
for annotation_col in (
    'Consensus_annotation_simplified',
    'Consensus_annotation_detailed',
):
    Triana_dataset.obs[annotation_col] = (
        Triana_dataset.obs[annotation_col].cat.remove_unused_categories()
    )

In [None]:
# Number of cells per detailed consensus label; `counts` is reused by the next
# cell to drop rare labels.
counts = Triana_dataset.obs['Consensus_annotation_detailed'].value_counts()
print(counts)

In [None]:
# Keep only detailed labels represented by at least 10 cells (`counts` comes
# from the preceding cell). The boolean mask is applied directly instead of
# being round-tripped through obs names, which would misbehave if obs names
# were not unique, and the subset is copied so that later `.obs` writes act on
# a real AnnData rather than a view.
filtered_categories = counts[counts >= 10].index
keep_cells = Triana_dataset.obs['Consensus_annotation_detailed'].isin(filtered_categories).to_numpy()
Triana_dataset = Triana_dataset[keep_cells, :].copy()

In [None]:
# Clear the stored palette of the consensus detailed annotation so scanpy
# regenerates it (the categories were filtered in the previous cell).
if 'Consensus_annotation_detailed_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_colors']

# Triana MOFA UMAP coloured by the ORIGINAL 'CellTypes' labels, for comparison
# with the consensus annotation plotted in the next cell.
sc.pl.embedding(Triana_dataset, 
                color='CellTypes', 
                basis='X_mofaumap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize its figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the plot
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# The subtitle names what is actually plotted: this panel shows 'CellTypes',
# not the consensus detailed annotation.
plt.suptitle('Original annotation (CellTypes)', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Drop the stored palette for the detailed annotation so scanpy builds a new one
if 'Consensus_annotation_detailed_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_colors']

# Triana MOFA UMAP coloured by the (smoothed) detailed consensus annotation
sc.pl.embedding(
    Triana_dataset,
    basis='X_mofaumap',
    color='Consensus_annotation_detailed',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axis scanpy just drew on, resize its figure and label the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title lifted above the panel, with a small grey subtitle beneath it
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Display only; this figure is not written to disk
plt.show()

In [None]:
import pandas as pd

OUT_BROAD  = "Consensus_annotation_broad"
OUT_SIMPL  = "Consensus_annotation_simplified"
OUT_DETAIL = "Consensus_annotation_detailed"

# -----------------------------------------------------------------------------
# 0) Backup current annotations into "<column>_tmp"
#    NOTE: re-running this cell overwrites the backup with the already
#    overridden values.
# -----------------------------------------------------------------------------
for col in (OUT_BROAD, OUT_SIMPL, OUT_DETAIL):
    Triana_dataset.obs[f"{col}_tmp"] = Triana_dataset.obs[col]

# -----------------------------------------------------------------------------
# 1) Ensure output columns exist and are OBJECT dtype, so that labels which are
#    not yet categories can be assigned freely
# -----------------------------------------------------------------------------
for col in (OUT_BROAD, OUT_SIMPL, OUT_DETAIL):
    if col not in Triana_dataset.obs.columns:
        Triana_dataset.obs[col] = pd.NA
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(Triana_dataset.obs[col].dtype, pd.CategoricalDtype):
        Triana_dataset.obs[col] = Triana_dataset.obs[col].astype("object")

# Convenience handles (fall back to an empty object Series when the column is absent)
ct = Triana_dataset.obs["CellTypes"] if "CellTypes" in Triana_dataset.obs.columns else pd.Series(index=Triana_dataset.obs.index, dtype="object")
# dtmp is not read in this cell; kept so the name stays available to other cells
dtmp = Triana_dataset.obs[f"{OUT_DETAIL}_tmp"] if f"{OUT_DETAIL}_tmp" in Triana_dataset.obs.columns else pd.Series(index=Triana_dataset.obs.index, dtype="object")

# -----------------------------------------------------------------------------
# 2) Apply the overrides: original 'CellTypes' label -> (broad, simplified, detailed)
# -----------------------------------------------------------------------------
CELLTYPE_OVERRIDES = {
    "Early promyelocytes": ("Immature", "HSPC", "GMP"),
    # NOTE(review): the original comment read "-> pDC progenitors (HSPC)" but the
    # code has always assigned the detailed label "GMP"; behaviour kept — confirm.
    "Plasmacytoid dendritic cell progenitors": ("Immature", "HSPC", "GMP"),
    "Late promyelocytes": ("Mature", "Myeloid", "Myeloid progenitor"),
    "Myelocytes": ("Mature", "Myeloid", "Myeloid progenitor"),
    "Late erythroid progenitor": ("Mature", "Erythroid", "ErP"),
    "Eosinophil-basophil-mast cell progenitors": ("Immature", "HSPC", "EoBaMaP"),
    "GammaDelta T cells": ("Mature", "CD8 T", "GdT"),
    "Conventional dendritic cell 1": ("Mature", "cDC", "cDC1"),
    "Conventional dendritic cell 2": ("Mature", "cDC", "cDC2"),
}

for source_label, (broad_label, simplified_label, detailed_label) in CELLTYPE_OVERRIDES.items():
    # Boolean mask of the cells carrying this original label
    m = ct.eq(source_label)
    Triana_dataset.obs.loc[m, OUT_BROAD]  = broad_label
    Triana_dataset.obs.loc[m, OUT_SIMPL]  = simplified_label
    Triana_dataset.obs.loc[m, OUT_DETAIL] = detailed_label

# -----------------------------------------------------------------------------
# 3) Convert back to categoricals AFTER all assignments
#    (categories are inferred from the values present)
# -----------------------------------------------------------------------------
for col in (OUT_BROAD, OUT_SIMPL, OUT_DETAIL):
    Triana_dataset.obs[col] = pd.Categorical(Triana_dataset.obs[col])

# -----------------------------------------------------------------------------
# 4) Sanity checks: 20 most frequent labels per annotation level (NaN included)
# -----------------------------------------------------------------------------
print("Value counts (broad) top 20:")
print(Triana_dataset.obs[OUT_BROAD].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (simplified) top 20:")
print(Triana_dataset.obs[OUT_SIMPL].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (detailed) top 20:")
print(Triana_dataset.obs[OUT_DETAIL].value_counts(dropna=False).head(20).to_string())



In [None]:
# Forget the stored palette so scanpy regenerates colours for the current set of labels
if 'Consensus_annotation_detailed_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_colors']

# Render the 'X_mofaumap' embedding, one colour per detailed consensus label,
# with the label names written directly on the data
sc.pl.embedding(
    Triana_dataset,
    basis='X_mofaumap',
    color='Consensus_annotation_detailed',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,  # keep the figure open so it can be decorated below
)

# Pick up the current axes, resize the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title, lifted above the axes (y=1.1) to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', y=1.1, fontsize=12, fontweight='bold')

# Small grey subtitle at figure level
plt.suptitle('Consensus detailed annotation - smoothed', color=(0.5, 0.5, 0.5), fontsize=8, y=0.925)

# Display the finished figure
plt.show()

In [None]:
# Subset the dataset to the cells currently labelled 'HSC' (independent copy)
HSC_mask = Triana_dataset.obs['Consensus_annotation_detailed'] == 'HSC'
hsc_subset = Triana_dataset[HSC_mask].copy()

print(f"Number of HSC cells: {hsc_subset.n_obs}")
print(f"Original clusters containing HSC: {hsc_subset.obs['CellTypes'].unique()}")

# Composition of the subset in terms of the original 'CellTypes' labels
print("\nDistribution of original CellTypes within HSC:")
print(hsc_subset.obs['CellTypes'].value_counts())

# Python's and NumPy's global RNGs are re-seeded before each step below
random.seed(42)
np.random.seed(42)

# Neighbour graph built on the subset's 'X_mofaumap' coordinates
sc.pp.neighbors(
    hsc_subset,
    n_neighbors=15,
    use_rep="X_mofaumap",
    metric='euclidean',
    random_state=42,
)

random.seed(42)
np.random.seed(42)

# Leiden clustering; labels are stored in obs['hsc_subclusters']
sc.tl.leiden(hsc_subset, key_added='hsc_subclusters', resolution=0.5, random_state=42)

random.seed(42)
np.random.seed(42)

# Subset-specific UMAP, plotted below through basis 'X_umap'
sc.tl.umap(hsc_subset, min_dist=0.3, random_state=42)

# Subset UMAP coloured by subcluster, ids drawn on the data
sc.pl.embedding(
    hsc_subset,
    basis='X_umap',
    color='hsc_subclusters',
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('HSC Subclustering', y=1.1, fontsize=12, fontweight='bold')

plt.show()

# For every subcluster, list which original 'CellTypes' labels it contains
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(hsc_subset.obs['hsc_subclusters'].unique()):
    cluster_cells = hsc_subset.obs.loc[hsc_subset.obs['hsc_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['CellTypes'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_mofaumap"
SUBCOL = "hsc_subclusters"
OUTCOL = "hsc_subclusters_on_full"

# Step 1: new obs column on the full dataset, initially missing for every cell
Triana_dataset.obs[OUTCOL] = pd.NA

# Step 2: write the subcluster id (as a string) for the cells that belong to
# the HSC subset; rows are matched on cell name
Triana_dataset.obs.loc[hsc_subset.obs_names, OUTCOL] = hsc_subset.obs[SUBCOL].astype(str)

# Step 3: full-dataset embedding coloured by HSC subcluster; cells outside the
# HSC subset keep the missing value
sc.pl.embedding(
    Triana_dataset,
    basis=FULL_BASIS,
    color=OUTCOL,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Resize and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title(
    "Triana dataset: HSC subclusters on full embedding",
    y=1.1,
    fontsize=12,
    fontweight="bold",
)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "hsc_subclusters"
TARGET_COL = "Consensus_annotation_detailed"
DEFAULT_LABEL = "MPP"

# Subcluster id -> refined population. Ids not listed here receive DEFAULT_LABEL.
# The ids are hard-coded against one particular clustering result.
subcluster_to_pop = {
    "5": "HSC",
    "7": "HSC",
    "1": "LMPP",
    "3": "LMPP",
    "10": "LMPP",
    "4": "LMPP",
}

# Guard against a stale mapping: report ids that the current clustering does not contain
unknown_ids = sorted(set(subcluster_to_pop) - set(hsc_subset.obs[SUBCLUSTER_COL].astype(str)))
if unknown_ids:
    print(f"WARNING: ids in subcluster_to_pop not present in '{SUBCLUSTER_COL}': {unknown_ids}")

# Map subclusters -> labels (NaN where the id is not in the dict)
mapped = hsc_subset.obs[SUBCLUSTER_COL].astype(str).map(subcluster_to_pop)

mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = hsc_subset.obs.index[mask_mapped]
unmapped_cell_ids = hsc_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Every label that is about to be written (mapped labels plus the default)
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

# A categorical column rejects values outside its categories, so register the
# new labels first on both the subset and the full dataset
for ad in [hsc_subset, Triana_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

# Assign the dict-mapped labels (HSC / LMPP) first
hsc_subset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
Triana_dataset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values

# Assign the default label (MPP) to the remaining cells of the subset
hsc_subset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL
Triana_dataset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(hsc_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(hsc_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())



In [None]:
# Forget the stored palette so scanpy regenerates colours for the current set of labels
if 'Consensus_annotation_detailed_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_colors']

# Render the 'X_mofaumap' embedding, one colour per detailed consensus label,
# with the label names written directly on the data
sc.pl.embedding(
    Triana_dataset,
    basis='X_mofaumap',
    color='Consensus_annotation_detailed',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,  # keep the figure open so it can be decorated below
)

# Pick up the current axes, resize the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title, lifted above the axes (y=1.1) to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', y=1.1, fontsize=12, fontweight='bold')

# Small grey subtitle at figure level
plt.suptitle('Consensus detailed annotation - smoothed', color=(0.5, 0.5, 0.5), fontsize=8, y=0.925)

# Display the finished figure
plt.show()

In [None]:
# Subset the dataset to the cells currently labelled 'GMP' (independent copy)
GMP_mask = Triana_dataset.obs['Consensus_annotation_detailed'] == 'GMP'
gmp_subset = Triana_dataset[GMP_mask].copy()

print(f"Number of GMP cells: {gmp_subset.n_obs}")
print(f"Original clusters containing GMP: {gmp_subset.obs['CellTypes'].unique()}")
# Composition of the subset in terms of the original 'CellTypes' labels
print("\nDistribution of original CellTypes within GMP:")
print(gmp_subset.obs['CellTypes'].value_counts())

# Python's and NumPy's global RNGs are re-seeded before each step below
random.seed(42)
np.random.seed(42)

# Neighbour graph built on the subset's 'X_mofaumap' coordinates
sc.pp.neighbors(
    gmp_subset,
    n_neighbors=15,
    use_rep="X_mofaumap",
    metric='euclidean',
    random_state=42,
)

random.seed(42)
np.random.seed(42)

# Leiden clustering; labels are stored in obs['gmp_subclusters']
sc.tl.leiden(gmp_subset, key_added='gmp_subclusters', resolution=0.5, random_state=42)

random.seed(42)
np.random.seed(42)

# Subset-specific UMAP, plotted below through basis 'X_umap'
sc.tl.umap(gmp_subset, min_dist=0.3, random_state=42)

# Subset UMAP coloured by subcluster, ids drawn on the data
sc.pl.embedding(
    gmp_subset,
    basis='X_umap',
    color='gmp_subclusters',
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('GMP Subclustering', y=1.1, fontsize=12, fontweight='bold')

plt.show()

# For every subcluster, list which original 'CellTypes' labels it contains
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(gmp_subset.obs['gmp_subclusters'].unique()):
    cluster_cells = gmp_subset.obs.loc[gmp_subset.obs['gmp_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['CellTypes'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_mofaumap"
SUBCOL = "gmp_subclusters"
OUTCOL = "gmp_subclusters_on_full"

# Step 1: new obs column on the full dataset, initially missing for every cell
Triana_dataset.obs[OUTCOL] = pd.NA

# Step 2: write the subcluster id (as a string) for the cells that belong to
# the GMP subset; rows are matched on cell name
Triana_dataset.obs.loc[gmp_subset.obs_names, OUTCOL] = gmp_subset.obs[SUBCOL].astype(str)

# Step 3: full-dataset embedding coloured by GMP subcluster; cells outside the
# GMP subset keep the missing value
sc.pl.embedding(
    Triana_dataset,
    basis=FULL_BASIS,
    color=OUTCOL,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Resize and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title(
    "Triana dataset: GMP subclusters on full embedding",
    y=1.1,
    fontsize=12,
    fontweight="bold",
)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "gmp_subclusters"
TARGET_COL = "Consensus_annotation_detailed"
DEFAULT_LABEL = "GMP"

# Subcluster id -> refined population. Ids not listed here receive DEFAULT_LABEL.
# The ids are hard-coded against one particular clustering result.
subcluster_to_pop = {
    "4": "LMPP",
    "5": "LMPP",
    "2": "LMPP",
    "10": "LMPP",
    "17": "MPP",
    "14": "MEP"
}

# Guard against a stale mapping: report ids that the current clustering does not contain
unknown_ids = sorted(set(subcluster_to_pop) - set(gmp_subset.obs[SUBCLUSTER_COL].astype(str)))
if unknown_ids:
    print(f"WARNING: ids in subcluster_to_pop not present in '{SUBCLUSTER_COL}': {unknown_ids}")

# Map subclusters -> labels (NaN where the id is not in the dict)
mapped = gmp_subset.obs[SUBCLUSTER_COL].astype(str).map(subcluster_to_pop)

mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = gmp_subset.obs.index[mask_mapped]
unmapped_cell_ids = gmp_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Every label that is about to be written (mapped labels plus the default)
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

# A categorical column rejects values outside its categories, so register the
# new labels first on both the subset and the full dataset
for ad in [gmp_subset, Triana_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

# Assign the dict-mapped labels (LMPP / MPP / MEP) first
gmp_subset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
Triana_dataset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values

# Assign the default label (GMP) to the remaining cells of the subset
gmp_subset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL
Triana_dataset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(gmp_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(gmp_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())

In [None]:
# Forget the stored palette so scanpy regenerates colours for the current set of labels
if 'Consensus_annotation_detailed_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_colors']

# Render the 'X_mofaumap' embedding, one colour per detailed consensus label,
# with the label names written directly on the data
sc.pl.embedding(
    Triana_dataset,
    basis='X_mofaumap',
    color='Consensus_annotation_detailed',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,  # keep the figure open so it can be decorated below
)

# Pick up the current axes, resize the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title, lifted above the axes (y=1.1) to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', y=1.1, fontsize=12, fontweight='bold')

# Small grey subtitle at figure level
plt.suptitle('Consensus detailed annotation - smoothed', color=(0.5, 0.5, 0.5), fontsize=8, y=0.925)

# Display the finished figure
plt.show()

In [None]:
# Subset the dataset to the cells currently labelled 'LMPP' (independent copy)
LMPP_mask = Triana_dataset.obs['Consensus_annotation_detailed'] == 'LMPP'
lmpp_subset = Triana_dataset[LMPP_mask].copy()

print(f"Number of LMPP cells: {lmpp_subset.n_obs}")
print(f"Original clusters containing LMPP: {lmpp_subset.obs['CellTypes'].unique()}")
# Composition of the subset in terms of the original 'CellTypes' labels
print("\nDistribution of original CellTypes within LMPP:")
print(lmpp_subset.obs['CellTypes'].value_counts())

# Python's and NumPy's global RNGs are re-seeded before each step below
random.seed(42)
np.random.seed(42)

# Neighbour graph built on the subset's 'X_mofaumap' coordinates
sc.pp.neighbors(
    lmpp_subset,
    n_neighbors=15,
    use_rep="X_mofaumap",
    metric='euclidean',
    random_state=42,
)

random.seed(42)
np.random.seed(42)

# Leiden clustering; labels are stored in obs['lmpp_subclusters']
sc.tl.leiden(lmpp_subset, key_added='lmpp_subclusters', resolution=0.5, random_state=42)

random.seed(42)
np.random.seed(42)

# Subset-specific UMAP, plotted below through basis 'X_umap'
sc.tl.umap(lmpp_subset, min_dist=0.3, random_state=42)

# Subset UMAP coloured by subcluster, ids drawn on the data
sc.pl.embedding(
    lmpp_subset,
    basis='X_umap',
    color='lmpp_subclusters',
    legend_loc='on data',
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('LMPP Subclustering', y=1.1, fontsize=12, fontweight='bold')

plt.show()

# For every subcluster, list which original 'CellTypes' labels it contains
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(lmpp_subset.obs['lmpp_subclusters'].unique()):
    cluster_cells = lmpp_subset.obs.loc[lmpp_subset.obs['lmpp_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['CellTypes'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_mofaumap"
SUBCOL = "lmpp_subclusters"
OUTCOL = "lmpp_subclusters_on_full"

# Step 1: new obs column on the full dataset, initially missing for every cell
Triana_dataset.obs[OUTCOL] = pd.NA

# Step 2: write the subcluster id (as a string) for the cells that belong to
# the LMPP subset; rows are matched on cell name
Triana_dataset.obs.loc[lmpp_subset.obs_names, OUTCOL] = lmpp_subset.obs[SUBCOL].astype(str)

# Step 3: full-dataset embedding coloured by LMPP subcluster; cells outside the
# LMPP subset keep the missing value
sc.pl.embedding(
    Triana_dataset,
    basis=FULL_BASIS,
    color=OUTCOL,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Resize and label the axes scanpy just drew on
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title(
    "Triana dataset: LMPP subclusters on full embedding",
    y=1.1,
    fontsize=12,
    fontweight="bold",
)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "lmpp_subclusters"
TARGET_COL = "Consensus_annotation_detailed"
DEFAULT_LABEL = "LMPP"

# Subcluster id -> refined population. Ids not listed here receive DEFAULT_LABEL.
# The id is hard-coded against one particular clustering result.
subcluster_to_pop = {
    "6": "Pre-Pro-B",
}

# Guard against a stale mapping: report ids that the current clustering does not contain
unknown_ids = sorted(set(subcluster_to_pop) - set(lmpp_subset.obs[SUBCLUSTER_COL].astype(str)))
if unknown_ids:
    print(f"WARNING: ids in subcluster_to_pop not present in '{SUBCLUSTER_COL}': {unknown_ids}")

# Map subclusters -> labels (NaN where the id is not in the dict)
mapped = lmpp_subset.obs[SUBCLUSTER_COL].astype(str).map(subcluster_to_pop)

mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = lmpp_subset.obs.index[mask_mapped]
unmapped_cell_ids = lmpp_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Every label that is about to be written (mapped labels plus the default)
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

# A categorical column rejects values outside its categories, so register the
# new labels first on both the subset and the full dataset
for ad in [lmpp_subset, Triana_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

# Assign the dict-mapped label (Pre-Pro-B) first
lmpp_subset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
Triana_dataset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values

# Assign the default label (LMPP) to the remaining cells of the subset
lmpp_subset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL
Triana_dataset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(lmpp_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(lmpp_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())

In [None]:
import pandas as pd

OUT_BROAD  = "Consensus_annotation_broad"
OUT_SIMPL  = "Consensus_annotation_simplified"
OUT_DETAIL = "Consensus_annotation_detailed"

# -----------------------------------------------------------------------------
# 0) Backup current annotations into "<column>_tmp"
#    NOTE: this overwrites any "<column>_tmp" backup that already exists.
# -----------------------------------------------------------------------------
for col in (OUT_BROAD, OUT_SIMPL, OUT_DETAIL):
    Triana_dataset.obs[f"{col}_tmp"] = Triana_dataset.obs[col]

# -----------------------------------------------------------------------------
# 1) Ensure output columns exist and are OBJECT dtype, so that labels which are
#    not yet categories can be assigned freely
# -----------------------------------------------------------------------------
for col in (OUT_BROAD, OUT_SIMPL, OUT_DETAIL):
    if col not in Triana_dataset.obs.columns:
        Triana_dataset.obs[col] = pd.NA
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(Triana_dataset.obs[col].dtype, pd.CategoricalDtype):
        Triana_dataset.obs[col] = Triana_dataset.obs[col].astype("object")

# Convenience handles (fall back to an empty object Series when the column is absent)
ct = Triana_dataset.obs["CellTypes"] if "CellTypes" in Triana_dataset.obs.columns else pd.Series(index=Triana_dataset.obs.index, dtype="object")
# dtmp is not read in this cell; kept so the name stays available to other cells
dtmp = Triana_dataset.obs[f"{OUT_DETAIL}_tmp"] if f"{OUT_DETAIL}_tmp" in Triana_dataset.obs.columns else pd.Series(index=Triana_dataset.obs.index, dtype="object")

# -----------------------------------------------------------------------------
# 2) Apply the overrides: original 'CellTypes' label -> (broad, simplified, detailed)
# -----------------------------------------------------------------------------
B_LINEAGE_OVERRIDES = {
    "Pro-B cells": ("Immature", "HSPC", "Pro-B"),
    "Pre-B cells": ("Mature", "B", "Pre-B"),
}

for source_label, (broad_label, simplified_label, detailed_label) in B_LINEAGE_OVERRIDES.items():
    # Boolean mask of the cells carrying this original label
    m = ct.eq(source_label)
    Triana_dataset.obs.loc[m, OUT_BROAD]  = broad_label
    Triana_dataset.obs.loc[m, OUT_SIMPL]  = simplified_label
    Triana_dataset.obs.loc[m, OUT_DETAIL] = detailed_label

# -----------------------------------------------------------------------------
# 3) Convert back to categoricals AFTER all assignments
#    (categories are inferred from the values present)
# -----------------------------------------------------------------------------
for col in (OUT_BROAD, OUT_SIMPL, OUT_DETAIL):
    Triana_dataset.obs[col] = pd.Categorical(Triana_dataset.obs[col])

# -----------------------------------------------------------------------------
# 4) Sanity checks: 20 most frequent labels per annotation level (NaN included)
# -----------------------------------------------------------------------------
print("Value counts (broad) top 20:")
print(Triana_dataset.obs[OUT_BROAD].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (simplified) top 20:")
print(Triana_dataset.obs[OUT_SIMPL].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (detailed) top 20:")
print(Triana_dataset.obs[OUT_DETAIL].value_counts(dropna=False).head(20).to_string())


In [None]:
# Forget the stored palette so scanpy regenerates colours for the current set of labels
if 'Consensus_annotation_detailed_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_colors']

# Render the 'X_mofaumap' embedding, one colour per detailed consensus label,
# with the label names written directly on the data
sc.pl.embedding(
    Triana_dataset,
    basis='X_mofaumap',
    color='Consensus_annotation_detailed',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,  # keep the figure open so it can be decorated below
)

# Pick up the current axes, resize the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title, lifted above the axes (y=1.1) to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', y=1.1, fontsize=12, fontweight='bold')

# Small grey subtitle at figure level
plt.suptitle('Consensus detailed annotation - smoothed', color=(0.5, 0.5, 0.5), fontsize=8, y=0.925)

# Display the finished figure
plt.show()

In [None]:
# Number of cells per detailed consensus label, most frequent first
counts = (
    Triana_dataset.obs['Consensus_annotation_detailed']
    .value_counts()
)
print(counts)

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Calculate silhouette scores for current annotations
print("Calculating silhouette scores for Triana dataset...")

# Silhouettes are computed on the coordinates stored in obsm['X_mofaumap']
# (the same embedding used for the UMAP plots), with the detailed consensus
# labels encoded as integer category codes.
# NOTE(review): a missing label would get code -1 and be scored as its own group — confirm none are NaN.
X_embed = Triana_dataset.obsm['X_mofaumap']
labels = Triana_dataset.obs['Consensus_annotation_detailed'].astype('category').cat.codes

# Dataset-wide mean silhouette and the per-cell silhouette values
silhouette_avg = silhouette_score(X_embed, labels)
sample_silhouette_values = silhouette_samples(X_embed, labels)

print(f"Average silhouette score: {silhouette_avg:.3f}")

# Add silhouette scores to the dataset
Triana_dataset.obs['silhouette_score'] = sample_silhouette_values

# Identify cells with negative silhouette scores
negative_silhouette_mask = sample_silhouette_values < 0
print(f"Number of cells with negative silhouette scores: {negative_silhouette_mask.sum()}")
print(f"Percentage of cells with negative silhouette scores: {negative_silhouette_mask.sum()/len(sample_silhouette_values)*100:.2f}%")

# Per-label summary of the silhouette values, printed from lowest to highest mean
silhouette_by_type = Triana_dataset.obs.groupby('Consensus_annotation_detailed')['silhouette_score'].agg(['mean', 'std', 'min', 'max', 'count'])
print("\nSilhouette scores by cell type:")
print(silhouette_by_type.sort_values('mean'))

# Initialize the refined annotations as a copy of the current detailed annotations
Triana_dataset.obs['Consensus_annotation_detailed_refined'] = Triana_dataset.obs['Consensus_annotation_detailed'].copy()

# Silhouette-based reassignment: cells whose silhouette is below -0.1 adopt the
# most common label among their nearest neighbours in X_embed, provided that
# label differs from their own and is carried by at least 40% of the neighbours.
print("\n=== PERFORMING SILHOUETTE-BASED REASSIGNMENT ===")

# Identify cells with very poor silhouette scores (< -0.1)
very_poor_silhouette = Triana_dataset.obs['silhouette_score'] < -0.1

if very_poor_silhouette.sum() > 0:
    print(f"Found {very_poor_silhouette.sum()} cells with very poor silhouette scores (< -0.1)")

    # Fit nearest neighbors on the whole embedding
    nn = NearestNeighbors(n_neighbors=30, metric='euclidean')
    nn.fit(X_embed)

    # Positional indices of the poorly assigned cells
    poor_indices = np.where(very_poor_silhouette)[0]

    # One batched query for all poor cells instead of one query per cell
    distances, all_neighbor_indices = nn.kneighbors(X_embed[poor_indices])

    # Votes are always read from the ORIGINAL labels, never from labels already reassigned
    detailed_annotations = Triana_dataset.obs['Consensus_annotation_detailed']

    reassignments_made = 0

    for idx, neighbor_row in zip(poor_indices, all_neighbor_indices):
        # The first hit is the query cell itself (distance 0), so drop it
        neighbor_indices = neighbor_row[1:]

        # Get annotations of neighbors
        neighbor_annotations = detailed_annotations.iloc[neighbor_indices]

        # Find most common annotation among neighbors
        most_common = neighbor_annotations.mode()

        if len(most_common) > 0:
            new_annotation = most_common.iloc[0]
            current_annotation = detailed_annotations.iloc[idx]

            # Only reassign if the most common neighbor annotation is different
            if new_annotation != current_annotation:
                # Check if at least 40% of neighbors have this annotation
                fraction = (neighbor_annotations == new_annotation).sum() / len(neighbor_annotations)

                if fraction >= 0.4:
                    Triana_dataset.obs.loc[Triana_dataset.obs.index[idx], 'Consensus_annotation_detailed_refined'] = new_annotation
                    reassignments_made += 1

    print(f"Reassigned {reassignments_made} cells based on neighborhood consensus")

    # Recalculate silhouette scores after reassignment
    new_labels = Triana_dataset.obs['Consensus_annotation_detailed_refined'].astype('category').cat.codes
    new_silhouette_scores = silhouette_samples(X_embed, new_labels)
    silhouette_avg_corrected = silhouette_score(X_embed, new_labels)

    # Store corrected scores
    Triana_dataset.obs['silhouette_score_corrected'] = new_silhouette_scores

    print(f"\n=== REASSIGNMENT RESULTS ===")
    print(f"Original average silhouette: {silhouette_avg:.3f}")
    print(f"Refined average silhouette: {silhouette_avg_corrected:.3f}")
    print(f"Improvement: {silhouette_avg_corrected - silhouette_avg:.3f}")

    print(f"Original negative silhouette cells: {negative_silhouette_mask.sum()}")
    print(f"Refined negative silhouette cells: {(new_silhouette_scores < 0).sum()}")

    # Show what changes were made
    if reassignments_made > 0:
        changes_mask = (Triana_dataset.obs['Consensus_annotation_detailed'] !=
                       Triana_dataset.obs['Consensus_annotation_detailed_refined'])
        changes = Triana_dataset.obs[changes_mask]

        print(f"\n=== SPECIFIC REASSIGNMENTS ===")
        # observed=True: with categorical group keys the default would emit every
        # possible (old, new) label pair, most of them with a count of 0
        change_summary = changes.groupby([
            'Consensus_annotation_detailed',
            'Consensus_annotation_detailed_refined'
        ], observed=True).size().reset_index(name='count')

        for _, row in change_summary.iterrows():
            print(f"{row['Consensus_annotation_detailed']} -> {row['Consensus_annotation_detailed_refined']}: {row['count']} cells")

else:
    print("No cells with very poor silhouette scores found.")
    # Create corrected scores column that's identical to original
    Triana_dataset.obs['silhouette_score_corrected'] = Triana_dataset.obs['silhouette_score'].copy()
    silhouette_avg_corrected = silhouette_avg
    # Define the same result variables as the reassignment branch so that code
    # reading them afterwards works on either path
    reassignments_made = 0
    new_silhouette_scores = sample_silhouette_values.copy()

# Create a reassignment status column for visualization: 'Reassigned' where the
# refined label differs from the original detailed label, 'Unchanged' otherwise
reassignment_mask = (Triana_dataset.obs['Consensus_annotation_detailed'] != 
                    Triana_dataset.obs['Consensus_annotation_detailed_refined'])
Triana_dataset.obs['reassignment_status'] = 'Unchanged'
Triana_dataset.obs.loc[reassignment_mask, 'reassignment_status'] = 'Reassigned'

# Final summary: total cells, number reassigned, and refined label counts
print(f"\n=== FINAL SUMMARY ===")
print(f"Total cells: {len(Triana_dataset)}")
print(f"Cells reassigned: {reassignment_mask.sum()}")
print(f"Final cell type distribution:")
final_counts = Triana_dataset.obs['Consensus_annotation_detailed_refined'].value_counts()
print(final_counts)

# Plot comprehensive analysis as a 2x2 grid; every panel is drawn on the
# 'X_mofaumap' embedding and passed its own Axes via ax=
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1 (top-left): annotations before silhouette refinement
sc.pl.embedding(Triana_dataset, 
                color='Consensus_annotation_detailed', 
                basis='X_mofaumap',
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False,
                ax=axes[0,0])
axes[0,0].set_title('Original Smoothed Annotations', fontsize=14, fontweight='bold')

# Plot 2 (top-right): annotations after silhouette refinement
sc.pl.embedding(Triana_dataset, 
                color='Consensus_annotation_detailed_refined', 
                basis='X_mofaumap',
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False,
                ax=axes[0,1])
axes[0,1].set_title('Silhouette-Refined Annotations', fontsize=14, fontweight='bold')

# Plot 3 (bottom-left): reassigned cells in red over unchanged cells in light gray
sc.pl.embedding(Triana_dataset, 
                color='reassignment_status', 
                basis='X_mofaumap',
                palette={'Unchanged': 'lightgray', 'Reassigned': 'red'},
                add_outline=False,
                legend_loc='right margin', 
                frameon=False,
                show=False,
                ax=axes[1,0])
axes[1,0].set_title('Reassignment Status', fontsize=14, fontweight='bold')

# Plot 4 (bottom-right): per-cell silhouette values recomputed with the refined labels
sc.pl.embedding(Triana_dataset, 
                color='silhouette_score_corrected', 
                basis='X_mofaumap',
                color_map='RdBu_r',
                add_outline=False,
                frameon=False,
                show=False,
                ax=axes[1,1])
axes[1,1].set_title('Silhouette Scores (Refined)', fontsize=14, fontweight='bold')

# Save the grid as a 300-dpi PNG, then display it
# NOTE(review): figures_path is defined outside this cell — presumably an output directory; confirm.
plt.tight_layout()
plt.savefig(figures_path + "/36_Triana_dataset_silhouette_refinement_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# Side-by-side histograms of the silhouette scores before and after refinement
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

# (axis, scores to bin, bar colour, title) — original on the left, refined on the right
histogram_specs = (
    (ax1, sample_silhouette_values, 'lightblue',
     f'Original Silhouette Distribution\n(Avg: {silhouette_avg:.3f})'),
    (ax2, Triana_dataset.obs['silhouette_score_corrected'], 'lightgreen',
     f'Refined Silhouette Distribution\n(Avg: {silhouette_avg_corrected:.3f})'),
)

for hist_ax, hist_scores, bar_color, hist_title in histogram_specs:
    hist_ax.hist(hist_scores, bins=50, alpha=0.7, edgecolor='black', color=bar_color)
    # Dashed reference line at zero, the boundary between positive and negative scores
    hist_ax.axvline(x=0, color='red', linestyle='--', label='Silhouette = 0')
    hist_ax.set_xlabel('Silhouette Score')
    hist_ax.set_ylabel('Number of Cells')
    hist_ax.set_title(hist_title)
    hist_ax.legend()

plt.tight_layout()
plt.savefig(figures_path + "/37_Triana_dataset_silhouette_distribution_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Drop category levels that no longer have any cells in the two annotation columns
for annotation_col in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed_refined'):
    Triana_dataset.obs[annotation_col] = Triana_dataset.obs[annotation_col].cat.remove_unused_categories()

In [None]:
# Keep only cells whose refined label is carried by at least 10 cells.
counts = Triana_dataset.obs['Consensus_annotation_detailed_refined'].value_counts()
filtered_categories = counts[counts >= 10].index

# Select with a positional boolean mask rather than obs index labels, and
# materialise the result with .copy(): subsetting an AnnData yields a view, and the
# later cells write new .obs columns, which would otherwise force an implicit copy.
keep_mask = Triana_dataset.obs['Consensus_annotation_detailed_refined'].isin(filtered_categories)
Triana_dataset = Triana_dataset[keep_mask.to_numpy(), :].copy()

In [None]:
# Work on a copy so the normalisation does not touch Triana_dataset itself
Triana_dataset_normalized = Triana_dataset.copy()
ep.Normalise_protein_data(Triana_dataset_normalized, inplace=True, axis=1, flavor="seurat")

# Rank features per refined label (Wilcoxon test) and plot the top 10 of each group
sc.tl.rank_genes_groups(Triana_dataset_normalized, 'Consensus_annotation_detailed_refined', method='wilcoxon')

# show=False keeps the figure open: with scanpy's default the plot is shown (and
# released) inside the call, so the savefig below would write an empty canvas.
sc.pl.rank_genes_groups(Triana_dataset_normalized, n_genes=10, sharey=False, ncols=3, fontsize=14, show=False)

plt.savefig(figures_path + "/38_Triana_dataset_top10_markers.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Mean of the normalised data per refined annotation label, wrapped as a DataFrame
AveragedExpression = grouped_obs_mean(
    Triana_dataset_normalized,
    'Consensus_annotation_detailed_refined',
)
df = pd.DataFrame(data=AveragedExpression)

In [None]:
# Pairwise Pearson correlation between the columns of df
corr = df.corr(method='pearson')

# Boolean mask hiding the upper triangle (diagonal included), so each pair is drawn once
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that will hold the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap for the correlation values
cmap = sns.diverging_palette(235, 15, as_cmap=True)

# Annotated, square-celled heatmap drawn on the axes created above
heatmap_options = dict(
    mask=mask,
    cmap=cmap,
    annot=True,
    square=True,
    linewidths=.6,
    cbar_kws={"shrink": 1},
    annot_kws={"fontsize": 5},
)
heatmap = sns.heatmap(corr, ax=ax, **heatmap_options)

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 18}, pad=12)

plt.savefig(figures_path + "/39_Triana_dataset_correlation_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
import pandas as pd

# =============================================================================
# Triana_dataset: build *_final columns (broad_final + simplified_final) Zhang-style
# - Safe on reruns (both output columns are rebuilt from scratch)
# - Deterministic (outputs depend only on the DETAIL_INPUT column)
# - Enforces fixed category orders at the end
# =============================================================================

# -----------------------------
# Column names
# -----------------------------
DETAIL_REFINED_COL  = "Consensus_annotation_detailed_refined"
DETAIL_FALLBACK_COL = "Consensus_annotation_detailed"

BROAD_FINAL_COL = "Consensus_annotation_broad_final"
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"

# If refined detailed does not exist, fall back
DETAIL_INPUT = DETAIL_REFINED_COL if DETAIL_REFINED_COL in Triana_dataset.obs.columns else DETAIL_FALLBACK_COL

# -----------------------------
# Fixed category orders (from your Triana simplified_final spec)
# -----------------------------
broad_categories = ["Immature", "Mature", "Doublet"]

simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'ILC', 'Stroma', 'Myeloid', 'Other T', 'Plasma', 'Mesenchymal'
]

# -----------------------------
# 1) Build simplified_final from the DETAIL_INPUT column
# -----------------------------
d = Triana_dataset.obs[DETAIL_INPUT]

# Simplified bucket -> detailed labels it absorbs. Every detailed label appears in
# exactly one bucket. NOTE: 'ILC' is a simplified category with no detailed label
# mapped to it, so it stays empty unless a mapping is added here.
simplified_to_details = {
    'HSPC': ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP'],
    'Monocyte': ['CD14 Mono', 'CD16 Mono'],
    'Myeloid': ['Myeloid progenitor'],
    'NK': ['NK CD56 dim', 'NK CD56 bright'],
    'CD4 T': ['CD4 T Naive', 'CD4 T Memory', 'Treg', 'CD4 CTL'],
    'CD8 T': ['CD8 T Naive', 'CD8 T Memory', 'MAIT'],
    'B': ['B Naive', 'B Memory', 'Immature B', 'Pre-B'],
    'Erythroid': ['ErP', 'Erythroblast'],
    'Mesenchymal': ['Mesenchymal'],
    'cDC': ['cDC1', 'cDC2'],
    'Other T': ['GdT'],
    'pDC': ['pDC'],
    'Stroma': ['Stroma'],
    'Plasma': ['Plasma'],
}
detail_to_simplified = {
    detail_label: simplified_label
    for simplified_label, detail_labels in simplified_to_details.items()
    for detail_label in detail_labels
}

# Detailed labels without a mapping become NA; the categorical enforces the fixed order
Triana_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(
    d.astype("object").map(detail_to_simplified),
    categories=simpl_categories,
)

# -----------------------------
# 2) Build broad_final
#    Priority:
#      - If simplified_final is Doublet -> Doublet
#      - If detailed label indicates progenitor -> Immature
#      - Else if simplified_final assigned -> Mature
#      - Else -> left as NA (counted by the sanity check below)
# -----------------------------
sf = Triana_dataset.obs[SIMPL_FINAL_COL]

# Immature progenitors (expand if needed)
immature_details = ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP', 'ErP']

# Fill from lowest to highest priority so that later assignments win.
broad_final = pd.Series(pd.NA, index=Triana_dataset.obs.index, dtype="object")
# Mature only where a simplified label exists; unmapped cells must not be called Mature
broad_final.loc[sf.notna()] = "Mature"
broad_final.loc[d.isin(immature_details)] = "Immature"
# Doublet (only if you ever assign it in simplified_final)
broad_final.loc[sf.astype("object").eq("Doublet")] = "Doublet"

# Enforce fixed categories/order for broad_final
Triana_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(broad_final, categories=broad_categories)

# -----------------------------
# 3) Sanity checks
# -----------------------------
na_simpl = int(Triana_dataset.obs[SIMPL_FINAL_COL].isna().sum())
na_broad = int(Triana_dataset.obs[BROAD_FINAL_COL].isna().sum())

print(f"Using detail input column: {DETAIL_INPUT}")
print(f"Unassigned '{SIMPL_FINAL_COL}' (NA) rows: {na_simpl} / {Triana_dataset.n_obs}")
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {na_broad} / {Triana_dataset.n_obs}")

print("\nBroad final value counts:")
print(Triana_dataset.obs[BROAD_FINAL_COL].value_counts(dropna=False).to_string())

print("\nSimplified final value counts (top 30):")
print(Triana_dataset.obs[SIMPL_FINAL_COL].value_counts(dropna=False).head(30).to_string())

# Diagnostics for what is unmapped in simplified_final
unmapped = (
    Triana_dataset.obs.loc[Triana_dataset.obs[SIMPL_FINAL_COL].isna(), DETAIL_INPUT]
    .astype("object")
    .fillna("<<NA in detailed>>")
    .value_counts()
    .head(30)
)
print(f"\nTop unmapped '{DETAIL_INPUT}' labels among NA simplified_final rows (top 30):")
print(unmapped.to_string())


In [None]:
# Start the final detailed annotation as a copy of the silhouette-refined labels
Triana_dataset.obs['Consensus_annotation_detailed_final'] = (
    Triana_dataset.obs['Consensus_annotation_detailed_refined'].copy()
)

In [None]:
# Drop category levels that have no cells in the two *_final annotation columns
for final_col in ('Consensus_annotation_simplified_final', 'Consensus_annotation_detailed_final'):
    Triana_dataset.obs[final_col] = Triana_dataset.obs[final_col].cat.remove_unused_categories()

In [None]:
# Drop the cached palette for this column so scanpy regenerates the colours
final_palette_key = 'Consensus_annotation_detailed_final_colors'
if final_palette_key in Triana_dataset.uns:
    del Triana_dataset.uns[final_palette_key]

# Embedding coloured by the final detailed annotation, labels drawn on the data
final_embedding_kwargs = dict(
    color='Consensus_annotation_detailed_final',
    basis='X_mofaumap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Triana_dataset, **final_embedding_kwargs)

# Grab the axes scanpy just drew on, then size the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
for set_axis_label, axis_text in ((ax.set_xlabel, 'UMAP 1'), (ax.set_ylabel, 'UMAP 2')):
    set_axis_label(axis_text, fontsize=12)

# Bold size-12 title, lifted above the axes to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle placed between the title and the plot
ax.figure.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Extract the cells currently labelled 'NK CD56 dim' from the Triana dataset
NK_mask = Triana_dataset.obs['Consensus_annotation_detailed_final'] == 'NK CD56 dim'
nk_subset = Triana_dataset[NK_mask].copy()

print(f"Number of NK CD56 dim cells: {nk_subset.n_obs}")
print(f"Original clusters containing NK CD56 dim: {nk_subset.obs['CellTypes'].unique()}")
# Check distribution of original cell types within NK CD56 dim
print("\nDistribution of original CellTypes within NK CD56 dim:")
print(nk_subset.obs['CellTypes'].value_counts())

# The global RNGs are re-seeded before each stochastic step so that every step
# starts from the same state regardless of what ran before it.
random.seed(42)
np.random.seed(42)

# Neighbourhood graph of the subset, built on the 'X_mofaumap' representation
sc.pp.neighbors(nk_subset, use_rep="X_mofaumap", n_neighbors=15, metric='euclidean', random_state=42)

random.seed(42)
np.random.seed(42)

# Leiden subclusters, stored in obs['nk_subclusters']
sc.tl.leiden(nk_subset, resolution=0.5, random_state=42, key_added='nk_subclusters')

random.seed(42)
np.random.seed(42)

# Create UMAP for the NK CD56 dim subset
sc.tl.umap(nk_subset, random_state=42, min_dist=0.3)

# Plot the subclusters
sc.pl.embedding(nk_subset, 
                color='nk_subclusters', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=6,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('NK CD56 dim Subclustering', fontsize=12, fontweight='bold', y=1.1)

plt.show()

# Check original annotations within each subcluster.
# Leiden ids are numeric strings; key=int lists them as 0, 1, 2, ..., 10 instead of
# the lexicographic order 0, 1, 10, 11, ..., 2.
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(nk_subset.obs['nk_subclusters'].unique(), key=int):
    cluster_cells = nk_subset.obs[nk_subset.obs['nk_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['CellTypes'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_mofaumap"
SUBCOL = "nk_subclusters"
OUTCOL = "nk_subclusters_on_full"

# 1) Align the subset's subcluster labels to every cell of the full dataset by
#    cell name; cells outside the NK CD56 dim subset get a missing value.
nk_labels_on_full = nk_subset.obs[SUBCOL].astype(str).reindex(Triana_dataset.obs_names)

# 2) Store the result as an explicit categorical. An object column mixing strings
#    with missing values is not reliably converted to categorical by scanpy, and a
#    non-categorical string column cannot be drawn as discrete colours.
Triana_dataset.obs[OUTCOL] = pd.Categorical(nk_labels_on_full)

# 3) Plot: full embedding, coloured by NK CD56 dim subclusters (all other cells are NA)
sc.pl.embedding(
    Triana_dataset,
    color=OUTCOL,
    basis=FULL_BASIS,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title("Triana dataset: NK CD56 dim subclusters on full embedding", fontsize=12, fontweight="bold", y=1.1)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "nk_subclusters"
TARGET_COL = "Consensus_annotation_detailed_final"
DEFAULT_LABEL = "NK CD56 dim"

# Subcluster id -> corrected label. Subclusters not listed keep DEFAULT_LABEL.
subcluster_to_pop = {
    "23": "MPP",
    "22": "NK CD56 bright",
}

subcluster_ids = nk_subset.obs[SUBCLUSTER_COL].astype(str)

# The ids above are hard-coded, so say so loudly when one of them does not exist in
# the current clustering instead of silently relabelling nothing.
absent_ids = sorted(set(subcluster_to_pop) - set(subcluster_ids))
if absent_ids:
    print(f"WARNING: subcluster ids not found in '{SUBCLUSTER_COL}': {absent_ids}")

# Map subclusters -> labels
mapped = subcluster_ids.map(subcluster_to_pop)
mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = nk_subset.obs.index[mask_mapped]
unmapped_cell_ids = nk_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Ensure categories exist if TARGET_COL is categorical
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

for ad in [nk_subset, Triana_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # isinstance on the dtype replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

# Assign the labels from subcluster_to_pop first (subset and full dataset alike)
nk_subset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
Triana_dataset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values

# Assign DEFAULT_LABEL to the rest of the subset
nk_subset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL
Triana_dataset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(nk_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(nk_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())

In [None]:
# Extract the cells currently labelled 'CD8 T Memory' from the Triana dataset
CD8_mask = Triana_dataset.obs['Consensus_annotation_detailed_final'] == 'CD8 T Memory'
cd8_subset = Triana_dataset[CD8_mask].copy()

print(f"Number of CD8 T Memory cells: {cd8_subset.n_obs}")
print(f"Original clusters containing CD8 T Memory: {cd8_subset.obs['CellTypes'].unique()}")
# Check distribution of original cell types within CD8 T Memory
print("\nDistribution of original CellTypes within CD8 T Memory:")
print(cd8_subset.obs['CellTypes'].value_counts())

# The global RNGs are re-seeded before each stochastic step so that every step
# starts from the same state regardless of what ran before it.
random.seed(42)
np.random.seed(42)

# Neighbourhood graph of the subset, built on the 'X_mofaumap' representation
sc.pp.neighbors(cd8_subset, use_rep="X_mofaumap", n_neighbors=15, metric='euclidean', random_state=42)

random.seed(42)
np.random.seed(42)

# Leiden subclusters, stored in obs['cd8_subclusters']
sc.tl.leiden(cd8_subset, resolution=0.5, random_state=42, key_added='cd8_subclusters')

random.seed(42)
np.random.seed(42)

# Create UMAP for the CD8 T Memory subset
sc.tl.umap(cd8_subset, random_state=42, min_dist=0.3)
# Plot the subclusters
sc.pl.embedding(cd8_subset, 
                color='cd8_subclusters', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=6,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('CD8 T Memory Subclustering', fontsize=12, fontweight='bold', y=1.1)

plt.show()

# Check original annotations within each subcluster.
# Leiden ids are numeric strings; key=int lists them as 0, 1, 2, ..., 10 instead of
# the lexicographic order 0, 1, 10, 11, ..., 2.
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(cd8_subset.obs['cd8_subclusters'].unique(), key=int):
    cluster_cells = cd8_subset.obs[cd8_subset.obs['cd8_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['CellTypes'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_mofaumap"
SUBCOL = "cd8_subclusters"
OUTCOL = "cd8_subclusters_on_full"

# 1) Align the subset's subcluster labels to every cell of the full dataset by
#    cell name; cells outside the CD8 T Memory subset get a missing value.
cd8_labels_on_full = cd8_subset.obs[SUBCOL].astype(str).reindex(Triana_dataset.obs_names)

# 2) Store the result as an explicit categorical. An object column mixing strings
#    with missing values is not reliably converted to categorical by scanpy, and a
#    non-categorical string column cannot be drawn as discrete colours.
Triana_dataset.obs[OUTCOL] = pd.Categorical(cd8_labels_on_full)

# 3) Plot: full embedding, coloured by CD8 T Memory subclusters (all other cells are NA)
sc.pl.embedding(
    Triana_dataset,
    color=OUTCOL,
    basis=FULL_BASIS,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title("Triana dataset: CD8 T Memory subclusters on full embedding", fontsize=12, fontweight="bold", y=1.1)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "cd8_subclusters"
TARGET_COL = "Consensus_annotation_detailed_final"
DEFAULT_LABEL = "CD8 T Memory"

# Subcluster id -> corrected label. Subclusters not listed keep DEFAULT_LABEL.
subcluster_to_pop = {
    "13": "GdT",
    "25": "CD4 T Memory",
}

subcluster_ids = cd8_subset.obs[SUBCLUSTER_COL].astype(str)

# The ids above are hard-coded, so say so loudly when one of them does not exist in
# the current clustering instead of silently relabelling nothing.
absent_ids = sorted(set(subcluster_to_pop) - set(subcluster_ids))
if absent_ids:
    print(f"WARNING: subcluster ids not found in '{SUBCLUSTER_COL}': {absent_ids}")

# Map subclusters -> labels
mapped = subcluster_ids.map(subcluster_to_pop)
mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = cd8_subset.obs.index[mask_mapped]
unmapped_cell_ids = cd8_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Ensure categories exist if TARGET_COL is categorical
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

for ad in [cd8_subset, Triana_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # isinstance on the dtype replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

# Assign the labels from subcluster_to_pop first (subset and full dataset alike)
cd8_subset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
Triana_dataset.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values

# Assign DEFAULT_LABEL to the rest of the subset
cd8_subset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL
Triana_dataset.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(cd8_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(cd8_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())

In [None]:
# NOTE(review): this draws 'Consensus_annotation_detailed', not the
# 'Consensus_annotation_detailed_final' column relabelled by the preceding cells —
# confirm that the pre-relabelling view is what is wanted here.

# Drop the cached palette for this column so scanpy regenerates the colours
smoothed_palette_key = 'Consensus_annotation_detailed_colors'
if smoothed_palette_key in Triana_dataset.uns:
    del Triana_dataset.uns[smoothed_palette_key]

# Embedding coloured by the detailed annotation, labels drawn on the data
smoothed_embedding_kwargs = dict(
    color='Consensus_annotation_detailed',
    basis='X_mofaumap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Triana_dataset, **smoothed_embedding_kwargs)

# Grab the axes scanpy just drew on, then size the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
for set_axis_label, axis_text in ((ax.set_xlabel, 'UMAP 1'), (ax.set_ylabel, 'UMAP 2')):
    set_axis_label(axis_text, fontsize=12)

# Bold size-12 title, lifted above the axes to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle placed between the title and the plot
ax.figure.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
import pandas as pd

# =============================================================================
# Triana_dataset: build *_final columns (broad_final + simplified_final) Zhang-style
# - Safe on reruns (both output columns are rebuilt from scratch)
# - Deterministic (outputs depend only on the DETAIL_INPUT column)
# - Enforces fixed category orders at the end
# =============================================================================

# -----------------------------
# Column names
# -----------------------------
DETAIL_REFINED_COL  = "Consensus_annotation_detailed_final"
DETAIL_FALLBACK_COL = "Consensus_annotation_detailed_refined"

BROAD_FINAL_COL = "Consensus_annotation_broad_final"
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"

# If the final detailed column does not exist, fall back to the refined one
DETAIL_INPUT = DETAIL_REFINED_COL if DETAIL_REFINED_COL in Triana_dataset.obs.columns else DETAIL_FALLBACK_COL

# -----------------------------
# Fixed category orders (from your Triana simplified_final spec)
# -----------------------------
broad_categories = ["Immature", "Mature", "Doublet"]

simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'ILC', 'Stroma', 'Myeloid', 'Other T', 'Plasma', 'Mesenchymal'
]

# -----------------------------
# 1) Build simplified_final from the DETAIL_INPUT column
# -----------------------------
d = Triana_dataset.obs[DETAIL_INPUT]

# Simplified bucket -> detailed labels it absorbs. Every detailed label appears in
# exactly one bucket. NOTE: 'ILC' is a simplified category with no detailed label
# mapped to it, so it stays empty unless a mapping is added here.
simplified_to_details = {
    'HSPC': ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP'],
    'Monocyte': ['CD14 Mono', 'CD16 Mono'],
    'Myeloid': ['Myeloid progenitor'],
    'NK': ['NK CD56 dim', 'NK CD56 bright'],
    'CD4 T': ['CD4 T Naive', 'CD4 T Memory', 'Treg', 'CD4 CTL'],
    'CD8 T': ['CD8 T Naive', 'CD8 T Memory', 'MAIT'],
    'B': ['B Naive', 'B Memory', 'Immature B', 'Pre-B'],
    'Erythroid': ['ErP', 'Erythroblast'],
    'Mesenchymal': ['Mesenchymal'],
    'cDC': ['cDC1', 'cDC2'],
    'Other T': ['GdT'],
    'pDC': ['pDC'],
    'Stroma': ['Stroma'],
    'Plasma': ['Plasma'],
}
detail_to_simplified = {
    detail_label: simplified_label
    for simplified_label, detail_labels in simplified_to_details.items()
    for detail_label in detail_labels
}

# Detailed labels without a mapping become NA; the categorical enforces the fixed order
Triana_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(
    d.astype("object").map(detail_to_simplified),
    categories=simpl_categories,
)

# -----------------------------
# 2) Build broad_final
#    Priority:
#      - If simplified_final is Doublet -> Doublet
#      - If detailed label indicates progenitor -> Immature
#      - Else if simplified_final assigned -> Mature
#      - Else -> left as NA (counted by the sanity check below)
# -----------------------------
sf = Triana_dataset.obs[SIMPL_FINAL_COL]

# Immature progenitors (expand if needed)
immature_details = ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP', 'ErP']

# Fill from lowest to highest priority so that later assignments win.
broad_final = pd.Series(pd.NA, index=Triana_dataset.obs.index, dtype="object")
# Mature only where a simplified label exists; unmapped cells must not be called Mature
broad_final.loc[sf.notna()] = "Mature"
broad_final.loc[d.isin(immature_details)] = "Immature"
# Doublet (only if you ever assign it in simplified_final)
broad_final.loc[sf.astype("object").eq("Doublet")] = "Doublet"

# Enforce fixed categories/order for broad_final
Triana_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(broad_final, categories=broad_categories)

# -----------------------------
# 3) Sanity checks
# -----------------------------
na_simpl = int(Triana_dataset.obs[SIMPL_FINAL_COL].isna().sum())
na_broad = int(Triana_dataset.obs[BROAD_FINAL_COL].isna().sum())

print(f"Using detail input column: {DETAIL_INPUT}")
print(f"Unassigned '{SIMPL_FINAL_COL}' (NA) rows: {na_simpl} / {Triana_dataset.n_obs}")
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {na_broad} / {Triana_dataset.n_obs}")

print("\nBroad final value counts:")
print(Triana_dataset.obs[BROAD_FINAL_COL].value_counts(dropna=False).to_string())

print("\nSimplified final value counts (top 30):")
print(Triana_dataset.obs[SIMPL_FINAL_COL].value_counts(dropna=False).head(30).to_string())

# Diagnostics for what is unmapped in simplified_final
unmapped = (
    Triana_dataset.obs.loc[Triana_dataset.obs[SIMPL_FINAL_COL].isna(), DETAIL_INPUT]
    .astype("object")
    .fillna("<<NA in detailed>>")
    .value_counts()
    .head(30)
)
print(f"\nTop unmapped '{DETAIL_INPUT}' labels among NA simplified_final rows (top 30):")
print(unmapped.to_string())


In [None]:
# NOTE(review): this draws 'Consensus_annotation_detailed', not the
# 'Consensus_annotation_detailed_final' column relabelled by earlier cells —
# confirm that the pre-relabelling view is what is wanted here.

# Drop the cached palette for this column so scanpy regenerates the colours
detailed_palette_key = 'Consensus_annotation_detailed_colors'
if detailed_palette_key in Triana_dataset.uns:
    del Triana_dataset.uns[detailed_palette_key]

# Embedding coloured by the detailed annotation, labels drawn on the data
detailed_embedding_kwargs = dict(
    color='Consensus_annotation_detailed',
    basis='X_mofaumap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Triana_dataset, **detailed_embedding_kwargs)

# Grab the axes scanpy just drew on, then size the figure and label both axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
for set_axis_label, axis_text in ((ax.set_xlabel, 'UMAP 1'), (ax.set_ylabel, 'UMAP 2')):
    set_axis_label(axis_text, fontsize=12)

# Bold size-12 title, lifted above the axes to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle placed between the title and the plot
ax.figure.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
import pandas as pd

# =============================================================================
# Triana_dataset: build *_final columns (broad_final + simplified_final) Zhang-style
# - Deterministic: both output columns are rebuilt from scratch on every run
# - Enforces fixed category orders at the end
# - Cells whose detailed label has no simplified mapping stay NA in BOTH outputs,
#   so the sanity checks at the bottom actually surface them
# =============================================================================

# -----------------------------
# Column names
# -----------------------------
DETAIL_REFINED_COL  = "Consensus_annotation_detailed_final"
DETAIL_FALLBACK_COL = "Consensus_annotation_detailed_refined"

BROAD_FINAL_COL = "Consensus_annotation_broad_final"
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"

# If refined detailed does not exist, fall back
DETAIL_INPUT = DETAIL_REFINED_COL if DETAIL_REFINED_COL in Triana_dataset.obs.columns else DETAIL_FALLBACK_COL

# -----------------------------
# Fixed category orders (from your Triana simplified_final spec)
# -----------------------------
broad_categories = ["Immature", "Mature", "Doublet"]

simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'ILC', 'Stroma', 'Myeloid', 'Other T', 'Plasma', 'Mesenchymal'
]

# -----------------------------
# 0) Reset both outputs so reruns are deterministic.
#    Assigning a scalar replaces the whole column (whatever dtype it had), which
#    is why no separate "cast categorical to object" step is needed.
# -----------------------------
Triana_dataset.obs[BROAD_FINAL_COL] = pd.NA
Triana_dataset.obs[SIMPL_FINAL_COL] = pd.NA

# -----------------------------
# 1) Build simplified_final from detailed_refined (or fallback detailed)
# -----------------------------
d = Triana_dataset.obs[DETAIL_INPUT]

# simplified label -> detailed labels that collapse into it.
# Every detailed label appears under exactly one simplified label.
simplified_to_detailed = {
    'HSPC':        ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP'],
    'Monocyte':    ['CD14 Mono', 'CD16 Mono'],
    'Myeloid':     ['Myeloid progenitor'],
    'NK':          ['NK CD56 dim', 'NK CD56 bright'],
    'CD4 T':       ['CD4 T Naive', 'CD4 T Memory', 'Treg', 'CD4 CTL'],
    'CD8 T':       ['CD8 T Naive', 'CD8 T Memory', 'MAIT'],
    'B':           ['B Naive', 'B Memory', 'Immature B', 'Pre-B'],
    'Erythroid':   ['ErP', 'Erythroblast'],
    'Mesenchymal': ['Mesenchymal'],
    'cDC':         ['cDC1', 'cDC2'],
    'Other T':     ['GdT'],
    'pDC':         ['pDC'],
    'Stroma':      ['Stroma'],
    'Plasma':      ['Plasma'],
}
# NOTE: 'ILC' is listed in simpl_categories but no detailed label maps to it, so
# that category stays empty; any detailed label missing from the table above is
# reported by the "unmapped" diagnostics at the end of this cell.

# Invert to a flat detailed -> simplified lookup
detailed_to_simplified = {
    detail: simplified
    for simplified, details in simplified_to_detailed.items()
    for detail in details
}

# Map on an object copy of the labels (works whether or not the input column is
# categorical); unmapped labels become NaN. Fixed categories/order are enforced here.
Triana_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(
    d.astype("object").map(detailed_to_simplified),
    categories=simpl_categories,
)

# -----------------------------
# 2) Build broad_final
#    Priority (highest first):
#      - If simplified_final is Doublet -> Doublet
#      - If detailed label indicates progenitor -> Immature
#      - Else if simplified_final assigned -> Mature
#      - Else -> left unassigned (NA)
# -----------------------------
sf = Triana_dataset.obs[SIMPL_FINAL_COL]

# Immature progenitors (expand if needed)
immature_details = ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP', 'ErP']

# Assign from lowest to highest priority so later lines win.
broad_final = pd.Series(pd.NA, index=Triana_dataset.obs.index, dtype="object")

# Mature: ONLY cells that received a simplified label. Previously every
# remaining row was labelled Mature, which silently hid unmapped cells.
broad_final.loc[sf.notna()] = "Mature"
broad_final.loc[d.isin(immature_details)] = "Immature"
# Doublet (only takes effect if 'Doublet' is ever a simplified_final value;
# compare on object dtype because 'Doublet' is not one of the categories)
broad_final.loc[sf.astype("object").eq("Doublet")] = "Doublet"

# Enforce fixed categories/order for broad_final
Triana_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(broad_final, categories=broad_categories)

# -----------------------------
# 3) Sanity checks
# -----------------------------
na_simpl = int(Triana_dataset.obs[SIMPL_FINAL_COL].isna().sum())
na_broad = int(Triana_dataset.obs[BROAD_FINAL_COL].isna().sum())

print(f"Using detail input column: {DETAIL_INPUT}")
print(f"Unassigned '{SIMPL_FINAL_COL}' (NA) rows: {na_simpl} / {Triana_dataset.n_obs}")
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {na_broad} / {Triana_dataset.n_obs}")

print("\nBroad final value counts:")
print(Triana_dataset.obs[BROAD_FINAL_COL].value_counts(dropna=False).to_string())

print("\nSimplified final value counts (top 30):")
print(Triana_dataset.obs[SIMPL_FINAL_COL].value_counts(dropna=False).head(30).to_string())

# Diagnostics for what is unmapped in simplified_final
unmapped = (
    Triana_dataset.obs.loc[Triana_dataset.obs[SIMPL_FINAL_COL].isna(), DETAIL_INPUT]
    .astype("object")
    .fillna("<<NA in detailed>>")
    .value_counts()
    .head(30)
)
print(f"\nTop unmapped '{DETAIL_INPUT}' labels among NA simplified_final rows (top 30):")
print(unmapped.to_string())


In [None]:
# Draw the embedding coloured by the broad final label; show=False keeps the
# figure open so it can be decorated and saved below.
sc.pl.embedding(
    Triana_dataset,
    color='Consensus_annotation_broad_final',
    basis='X_mofaumap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axes scanpy just drew on, then size and label the figure
ax = plt.gca()
fig = ax.figure
fig.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle naming the annotation that is shown
fig.suptitle('Consensus broad annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi before displaying it
plt.savefig(
    figures_path + "/40_Triana_dataset_final_consensus_annotation_broad_annotation.png",
    dpi=300,
    bbox_inches='tight',
)

# Render the figure
plt.show()

In [None]:
# Draw the embedding coloured by the simplified final label; show=False keeps
# the figure open so it can be decorated and saved below.
sc.pl.embedding(
    Triana_dataset,
    color='Consensus_annotation_simplified_final',
    basis='X_mofaumap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axes scanpy just drew on, then size and label the figure
ax = plt.gca()
fig = ax.figure
fig.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle naming the annotation that is shown
fig.suptitle('Consensus simplified annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi before displaying it
plt.savefig(
    figures_path + "/41_Triana_dataset_final_consensus_annotation_simplified_annotation.png",
    dpi=300,
    bbox_inches='tight',
)

# Render the figure
plt.show()

In [None]:
# Drop the cached palette (if any) so scanpy assigns fresh colours to the
# categories currently present in the column.
if 'Consensus_annotation_detailed_final_colors' in Triana_dataset.uns:
    del Triana_dataset.uns['Consensus_annotation_detailed_final_colors']

# Draw the embedding coloured by the detailed final label; show=False keeps the
# figure open so it can be decorated and saved below.
sc.pl.embedding(
    Triana_dataset,
    color='Consensus_annotation_detailed_final',
    basis='X_mofaumap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axes scanpy just drew on, then size and label the figure
ax = plt.gca()
fig = ax.figure
fig.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Triana S. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle naming the annotation that is shown
fig.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Write the figure to disk at 300 dpi before displaying it
plt.savefig(
    figures_path + "/42_Triana_dataset_final_Consensus_annotation_detailed_annotation.png",
    dpi=300,
    bbox_inches='tight',
)

# Render the figure
plt.show()

### Luecken dataset

In [None]:
# Drop category levels that no longer occur in either consensus annotation column
for column in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed'):
    Luecken_dataset.obs[column] = Luecken_dataset.obs[column].cat.remove_unused_categories()

In [None]:
# Number of cells per detailed consensus label; the rare-label filter in a later cell reads `counts`
counts = Luecken_dataset.obs['Consensus_annotation_detailed'].value_counts()

In [None]:
# Show the per-label cell counts computed in the previous cell
print(counts)

In [None]:
# Keep only cells whose detailed consensus label has at least 10 cells.
filtered_categories = counts[counts >= 10].index

# Select positionally with a boolean mask (no round-trip through obs names, so
# duplicated obs names cannot change the selection) and materialise the subset
# with .copy(): later cells add and overwrite .obs columns, which should not be
# done on an AnnData view.
keep_mask = Luecken_dataset.obs['Consensus_annotation_detailed'].isin(filtered_categories)
Luecken_dataset = Luecken_dataset[keep_mask.to_numpy(), :].copy()

In [None]:
# Plot the UMAP coloured by the dataset's own `cell_type` column.
# show=False keeps the figure open so it can be decorated below.
sc.pl.embedding(Luecken_dataset, 
                color='cell_type', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis and set figure size and axis labels
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Subtitle: this panel shows `cell_type`, not the consensus annotation
# (the previous subtitle text was copied from the consensus plot).
plt.suptitle('Original cell type annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Enumerate the category levels of the cell_type column
cell_type_categories = list(Luecken_dataset.obs['cell_type'].cat.categories)
print("Cell type categories in Luecken dataset:")
print(cell_type_categories)
print(f"\nNumber of categories: {len(cell_type_categories)}")

# Then the number of cells per level, to see how they are distributed
print("\nValue counts:")
print(Luecken_dataset.obs['cell_type'].value_counts())

In [None]:
# Drop the cached palette (if any) so scanpy assigns fresh colours to the
# categories currently present in the column.
if 'Consensus_annotation_detailed_colors' in Luecken_dataset.uns:
    del Luecken_dataset.uns['Consensus_annotation_detailed_colors']

# Draw the embedding coloured by the detailed consensus label; show=False keeps
# the figure open so it can be decorated below.
sc.pl.embedding(
    Luecken_dataset,
    color='Consensus_annotation_detailed',
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)

# Grab the axes scanpy just drew on, then size and label the figure
ax = plt.gca()
fig = ax.figure
fig.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes to leave room for the subtitle
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Grey subtitle naming the annotation that is shown
fig.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Render the figure
plt.show()

In [None]:
import pandas as pd

# =============================================================================
# Luecken_dataset: manual overrides of the consensus annotation columns.
# Each rule selects cells either by the dataset's own `cell_type` label or by
# the CURRENT detailed consensus label, and rewrites all three consensus columns
# (broad / simplified / detailed) for the selected cells.
# =============================================================================

OUT_BROAD  = "Consensus_annotation_broad"
OUT_SIMPL  = "Consensus_annotation_simplified"
OUT_DETAIL = "Consensus_annotation_detailed"

CELLTYPE_COL = "cell_type"

# -----------------------------------------------------------------------------
# 0) Ensure output columns exist and are OBJECT dtype (safe on reruns):
#    assigning a label that is not an existing category to a categorical column
#    would fail, so categoricals are cast to object first.
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    if col not in Luecken_dataset.obs.columns:
        Luecken_dataset.obs[col] = pd.NA
    # isinstance check replaces the deprecated pd.api.types.is_categorical_dtype
    if isinstance(Luecken_dataset.obs[col].dtype, pd.CategoricalDtype):
        Luecken_dataset.obs[col] = Luecken_dataset.obs[col].astype("object")

# Original cell_type labels; an all-NA series if the column is absent, so that
# the cell_type-based rules simply match nothing instead of raising KeyError.
ct = Luecken_dataset.obs[CELLTYPE_COL] if CELLTYPE_COL in Luecken_dataset.obs.columns else pd.Series(index=Luecken_dataset.obs.index, dtype="object")

# -----------------------------------------------------------------------------
# 1) Apply your overrides (vectorized, rerunnable)
#    Rules run top to bottom. "detailed" rules are evaluated against the LIVE
#    detailed column, i.e. they see the labels written by the rules above them,
#    so the order of this table matters.
#    Columns: (match on, labels to match, broad, simplified, detailed)
# -----------------------------------------------------------------------------
override_rules = [
    # Proerythroblast / Erythroblast / Reticulocyte -> Erythroblast (Erythroid, Mature)
    ("cell_type", ['Proerythroblast', 'Erythroblast', 'Reticulocyte'], 'Mature', 'Erythroid', 'Erythroblast'),
    # MK/E prog -> ErP (Erythroid, Mature)
    ("cell_type", ['MK/E prog'], 'Mature', 'Erythroid', 'ErP'),
    # Existing detailed MkP / MEP -> MEP (HSPC, Immature).
    # Broad was 'Mature' here, which contradicted the HSPC simplified label and
    # the other HSPC rules below; progenitors are Immature.
    ("detailed", ['MkP', 'MEP'], 'Immature', 'HSPC', 'MEP'),
    # Existing detailed LMPP / GMP -> GMP (HSPC, Immature)
    ("detailed", ['LMPP', 'GMP'], 'Immature', 'HSPC', 'GMP'),
    # MAIT -> MAIT (CD8 T, Mature)
    ("cell_type", ['MAIT'], 'Mature', 'CD8 T', 'MAIT'),
    # cDC1 -> cDC1 (cDC, Mature)
    ("cell_type", ['cDC1'], 'Mature', 'cDC', 'cDC1'),
    # gdT TCRVD2+ -> GdT (Other T, Mature)
    ("cell_type", ['gdT TCRVD2+'], 'Mature', 'Other T', 'GdT'),
    # dnT -> DnT (Other T, Mature)
    ("cell_type", ['dnT'], 'Mature', 'Other T', 'DnT'),
    # CD4 activated / cycling -> CD4 T Memory (CD4 T, Mature)
    ("cell_type", ['CD4+ T activated IntegrinB7+', 'CD4+ T activated', 'T prog cycling'], 'Mature', 'CD4 T', 'CD4 T Memory'),
    # CD4 naive -> CD4 T Naive (CD4 T, Mature)
    ("cell_type", ['CD4+ T Naive'], 'Mature', 'CD4 T', 'CD4 T Naive'),
    # Existing detailed CD45RO+ / MPP / EoBaMaP -> MPP (HSPC, Immature)
    ("detailed", ['CD45RO+', 'MPP', 'EoBaMaP'], 'Immature', 'HSPC', 'MPP'),
    # Existing detailed Pre-Pro-B -> Pre-Pro-B (HSPC, Immature)
    ("detailed", ['Pre-Pro-B'], 'Immature', 'HSPC', 'Pre-Pro-B'),
]

for match_on, match_labels, broad_label, simpl_label, detail_label in override_rules:
    # Re-read the detailed column for every rule so earlier overrides are visible
    match_source = ct if match_on == "cell_type" else Luecken_dataset.obs[OUT_DETAIL]
    m = match_source.isin(match_labels)
    Luecken_dataset.obs.loc[m, OUT_BROAD]  = broad_label
    Luecken_dataset.obs.loc[m, OUT_SIMPL]  = simpl_label
    Luecken_dataset.obs.loc[m, OUT_DETAIL] = detail_label

# -----------------------------------------------------------------------------
# 2) Convert back to categoricals AFTER all assignments
#    (later cells call .cat on these columns)
# -----------------------------------------------------------------------------
for col in [OUT_BROAD, OUT_SIMPL, OUT_DETAIL]:
    Luecken_dataset.obs[col] = pd.Categorical(Luecken_dataset.obs[col])

# -----------------------------------------------------------------------------
# 3) Sanity checks
# -----------------------------------------------------------------------------
print("Value counts (broad) top 20:")
print(Luecken_dataset.obs[OUT_BROAD].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (simplified) top 20:")
print(Luecken_dataset.obs[OUT_SIMPL].value_counts(dropna=False).head(20).to_string())

print("\nValue counts (detailed) top 20:")
print(Luecken_dataset.obs[OUT_DETAIL].value_counts(dropna=False).head(20).to_string())

In [None]:
# Plot UMAP with Consensus_annotation_detailed (after the manual overrides).
# show=False keeps the figure open so it can be decorated below.
sc.pl.embedding(Luecken_dataset, 
                color='Consensus_annotation_detailed', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis and set figure size and axis labels
ax = plt.gca()
ax.figure.set_size_inches(8, 6)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes. The data plotted is Luecken_dataset,
# so use the same title as the other Luecken figures (was 'Merged datasets').
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Plot UMAP with Consensus_annotation_detailed.
# NOTE(review): this cell repeats the preceding plotting cell verbatim (same
# dataset, column and basis) — confirm whether a different column was intended
# here or whether the cell can be deleted.
sc.pl.embedding(Luecken_dataset, 
                color='Consensus_annotation_detailed', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis and set figure size and axis labels
ax = plt.gca()
ax.figure.set_size_inches(8, 6)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12 pt title, lifted above the axes. The data plotted is Luecken_dataset,
# so use the same title as the other Luecken figures (was 'Merged datasets').
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
# Drop category levels that no longer occur in either consensus annotation column
for column in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed'):
    Luecken_dataset.obs[column] = Luecken_dataset.obs[column].cat.remove_unused_categories()

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

# silhouette_score is no longer called in this cell (the mean of the per-cell
# values is used instead) but the import is kept for any cell that relies on it.
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.neighbors import NearestNeighbors

# =============================================================================
# Luecken_dataset: silhouette-based refinement of the detailed consensus labels
#
# 1. Score every cell's detailed label with the silhouette coefficient computed
#    on the UMAP coordinates.
# 2. For cells scoring below SILHOUETTE_REASSIGN_THRESHOLD (and not carrying a
#    protected label), look at their nearest neighbours in UMAP space and adopt
#    the neighbours' most frequent ORIGINAL label if enough of them agree.
# 3. Re-score, summarise and plot.
#
# Writes to Luecken_dataset.obs: 'silhouette_score',
# 'Consensus_annotation_detailed_refined', 'silhouette_score_corrected',
# 'reassignment_status'.
# =============================================================================

# -----------------------------------------------------------------------------
# Tunable parameters
# -----------------------------------------------------------------------------
SILHOUETTE_REASSIGN_THRESHOLD = -0.1   # cells scoring below this are reassignment candidates
N_NEIGHBORS = 30                       # neighbours queried per candidate (one slot is the cell itself)
MIN_NEIGHBOR_AGREEMENT = 0.4           # minimum fraction of neighbours that must share the winning label

# -----------------------------------------------------------------------------
# Clear any existing color palettes to force scanpy to regenerate them
# -----------------------------------------------------------------------------
if 'Consensus_annotation_detailed_colors' in Luecken_dataset.uns:
    del Luecken_dataset.uns['Consensus_annotation_detailed_colors']

if 'Consensus_annotation_detailed_refined_colors' in Luecken_dataset.uns:
    del Luecken_dataset.uns['Consensus_annotation_detailed_refined_colors']

# -----------------------------------------------------------------------------
# Calculate silhouette scores for current annotations
# -----------------------------------------------------------------------------
print("Calculating silhouette scores for Luecken dataset...")

# Use the UMAP representation for silhouette analysis
X_embed = Luecken_dataset.obsm['X_umap']
labels = Luecken_dataset.obs['Consensus_annotation_detailed'].astype('category').cat.codes

# Per-cell silhouette values. The average is the mean of these values, which is
# what silhouette_score returns; computing it this way avoids running the
# expensive pairwise-distance computation a second time.
sample_silhouette_values = silhouette_samples(X_embed, labels)
silhouette_avg = float(np.mean(sample_silhouette_values))

print(f"Average silhouette score: {silhouette_avg:.3f}")

# Add silhouette scores to the dataset
Luecken_dataset.obs['silhouette_score'] = sample_silhouette_values

# Identify cells with negative silhouette scores
negative_silhouette_mask = sample_silhouette_values < 0
print(f"Number of cells with negative silhouette scores: {negative_silhouette_mask.sum()}")
print(f"Percentage of cells with negative silhouette scores: {negative_silhouette_mask.sum()/len(sample_silhouette_values)*100:.2f}%")

# Show distribution of silhouette scores by cell type
# (observed=True: only report labels that actually occur)
silhouette_by_type = (
    Luecken_dataset.obs
    .groupby('Consensus_annotation_detailed', observed=True)['silhouette_score']
    .agg(['mean', 'std', 'min', 'max', 'count'])
)
print("\nSilhouette scores by cell type:")
print(silhouette_by_type.sort_values('mean'))

# -----------------------------------------------------------------------------
# Initialize refined annotations (start with original smoothed annotations)
# -----------------------------------------------------------------------------
Luecken_dataset.obs['Consensus_annotation_detailed_refined'] = Luecken_dataset.obs['Consensus_annotation_detailed'].copy()

# -----------------------------------------------------------------------------
# Perform silhouette-based reassignment with RESTRICTION
# -----------------------------------------------------------------------------
print("\n=== PERFORMING SILHOUETTE-BASED REASSIGNMENT (WITH RESTRICTION) ===")

# Cells with very poor silhouette scores
very_poor = Luecken_dataset.obs['silhouette_score'] < SILHOUETTE_REASSIGN_THRESHOLD

# Restriction: exclude these labels from reassignment
protected_labels = ['cDC1', 'ErP', 'Pre-Pro-B']
protected_mask = Luecken_dataset.obs['Consensus_annotation_detailed'].isin(protected_labels)

# Eligible = very poor AND not protected
very_poor_silhouette = very_poor & (~protected_mask)

print(f"Found {int(very_poor.sum())} cells with very poor silhouette (< {SILHOUETTE_REASSIGN_THRESHOLD}) before restriction")
print(f"Protected cells (never reassigned): {int(protected_mask.sum())}")
print(f"Cells eligible for reassignment after restriction: {int(very_poor_silhouette.sum())}")

if very_poor_silhouette.sum() > 0:
    # Fit nearest neighbors
    nn = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric='euclidean')
    nn.fit(X_embed)

    # Positional indices of eligible poorly assigned cells
    poor_indices = np.flatnonzero(very_poor_silhouette.values)

    # Query all candidates in ONE call instead of once per cell
    _, neighbor_matrix = nn.kneighbors(X_embed[poor_indices])

    # Neighbour votes always use the ORIGINAL labels, so the outcome does not
    # depend on the order in which candidates are processed.
    original_labels = Luecken_dataset.obs['Consensus_annotation_detailed']

    reassignments_made = 0

    for idx, neighbor_row in zip(poor_indices, neighbor_matrix):
        # Exclude the cell itself by index. It is normally the first hit, but
        # that is not guaranteed when several cells share identical coordinates.
        # Truncate so every candidate is judged on N_NEIGHBORS - 1 neighbours.
        neighbor_indices = neighbor_row[neighbor_row != idx][:N_NEIGHBORS - 1]

        neighbor_annotations = original_labels.iloc[neighbor_indices]

        # Most common annotation among neighbors
        most_common = neighbor_annotations.mode()
        if len(most_common) == 0:
            continue

        new_annotation = most_common.iloc[0]
        current_annotation = original_labels.iloc[idx]

        # Only reassign if different
        if new_annotation == current_annotation:
            continue

        # Require a minimum share of the neighbours to agree
        fraction = (neighbor_annotations == new_annotation).sum() / len(neighbor_annotations)

        if fraction >= MIN_NEIGHBOR_AGREEMENT:
            Luecken_dataset.obs.loc[Luecken_dataset.obs.index[idx], 'Consensus_annotation_detailed_refined'] = new_annotation
            reassignments_made += 1

    print(f"Reassigned {reassignments_made} cells based on neighborhood consensus")

    # Recalculate silhouette scores after reassignment (single computation, see above)
    new_labels = Luecken_dataset.obs['Consensus_annotation_detailed_refined'].astype('category').cat.codes
    new_silhouette_scores = silhouette_samples(X_embed, new_labels)
    silhouette_avg_corrected = float(np.mean(new_silhouette_scores))

    # Store corrected scores
    Luecken_dataset.obs['silhouette_score_corrected'] = new_silhouette_scores

    print(f"\n=== REASSIGNMENT RESULTS ===")
    print(f"Original average silhouette: {silhouette_avg:.3f}")
    print(f"Refined average silhouette: {silhouette_avg_corrected:.3f}")
    print(f"Improvement: {silhouette_avg_corrected - silhouette_avg:.3f}")

    print(f"Original negative silhouette cells: {int(negative_silhouette_mask.sum())}")
    print(f"Refined negative silhouette cells: {int((new_silhouette_scores < 0).sum())}")

    # Show what changes were made
    if reassignments_made > 0:
        changes_mask = (
            Luecken_dataset.obs['Consensus_annotation_detailed'] !=
            Luecken_dataset.obs['Consensus_annotation_detailed_refined']
        )
        changes = Luecken_dataset.obs[changes_mask]

        print(f"\n=== SPECIFIC REASSIGNMENTS ===")
        # observed=True: both keys are categorical, and without it groupby
        # emits every (original, refined) category combination, flooding the
        # summary with "X -> Y: 0 cells" lines.
        change_summary = (
            changes.groupby(
                ['Consensus_annotation_detailed', 'Consensus_annotation_detailed_refined'],
                observed=True,
            )
            .size()
            .reset_index(name='count')
        )

        for _, row in change_summary.iterrows():
            print(f"{row['Consensus_annotation_detailed']} -> {row['Consensus_annotation_detailed_refined']}: {row['count']} cells")

else:
    print("No eligible cells with very poor silhouette scores found (after restriction).")
    # Create corrected scores column that's identical to original
    Luecken_dataset.obs['silhouette_score_corrected'] = Luecken_dataset.obs['silhouette_score'].copy()
    silhouette_avg_corrected = silhouette_avg
    reassignments_made = 0

# -----------------------------------------------------------------------------
# Create a reassignment status column for visualization
# -----------------------------------------------------------------------------
reassignment_mask = (
    Luecken_dataset.obs['Consensus_annotation_detailed'] !=
    Luecken_dataset.obs['Consensus_annotation_detailed_refined']
)
Luecken_dataset.obs['reassignment_status'] = 'Unchanged'
Luecken_dataset.obs.loc[reassignment_mask, 'reassignment_status'] = 'Reassigned'

# Final summary
print(f"\n=== FINAL SUMMARY ===")
print(f"Total cells: {len(Luecken_dataset)}")
print(f"Cells reassigned: {int(reassignment_mask.sum())}")
print(f"Final cell type distribution:")
final_counts = Luecken_dataset.obs['Consensus_annotation_detailed_refined'].value_counts()
print(final_counts)

# -----------------------------------------------------------------------------
# Clear ALL color palettes before plotting to ensure fresh colors
# -----------------------------------------------------------------------------
color_keys_to_clear = [
    'Consensus_annotation_detailed_colors',
    'Consensus_annotation_detailed_refined_colors',
    'reassignment_status_colors',
    'silhouette_score_corrected_colors'
]
for key in color_keys_to_clear:
    if key in Luecken_dataset.uns:
        del Luecken_dataset.uns[key]

# -----------------------------------------------------------------------------
# Plot comprehensive analysis (2 x 2 panel)
# -----------------------------------------------------------------------------
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Original annotations
sc.pl.embedding(
    Luecken_dataset,
    color='Consensus_annotation_detailed',
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
    ax=axes[0, 0]
)
axes[0, 0].set_title('Original Smoothed Annotations', fontsize=14, fontweight='bold')

# Plot 2: Refined annotations
sc.pl.embedding(
    Luecken_dataset,
    color='Consensus_annotation_detailed_refined',
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
    ax=axes[0, 1]
)
axes[0, 1].set_title('Silhouette-Refined Annotations', fontsize=14, fontweight='bold')

# Plot 3: Reassignment status
sc.pl.embedding(
    Luecken_dataset,
    color='reassignment_status',
    basis='X_umap',
    palette={'Unchanged': 'lightgray', 'Reassigned': 'red'},
    add_outline=False,
    legend_loc='right margin',
    frameon=False,
    show=False,
    ax=axes[1, 0]
)
axes[1, 0].set_title('Reassignment Status', fontsize=14, fontweight='bold')

# Plot 4: Corrected silhouette scores
sc.pl.embedding(
    Luecken_dataset,
    color='silhouette_score_corrected',
    basis='X_umap',
    color_map='RdBu_r',
    add_outline=False,
    frameon=False,
    show=False,
    ax=axes[1, 1]
)
axes[1, 1].set_title('Silhouette Scores (Refined)', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig(figures_path + "/43_Luecken_dataset_silhouette_refinement_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# -----------------------------------------------------------------------------
# Additional histogram comparison (before vs. after refinement)
# -----------------------------------------------------------------------------
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 6))

# Original silhouette distribution
ax1.hist(sample_silhouette_values, bins=50, alpha=0.7, edgecolor='black', color='lightblue')
ax1.axvline(x=0, color='red', linestyle='--', label='Silhouette = 0')
ax1.set_xlabel('Silhouette Score')
ax1.set_ylabel('Number of Cells')
ax1.set_title(f'Original Silhouette Distribution\n(Avg: {silhouette_avg:.3f})')
ax1.legend()

# Refined silhouette distribution
ax2.hist(Luecken_dataset.obs['silhouette_score_corrected'], bins=50, alpha=0.7, edgecolor='black', color='lightgreen')
ax2.axvline(x=0, color='red', linestyle='--', label='Silhouette = 0')
ax2.set_xlabel('Silhouette Score')
ax2.set_ylabel('Number of Cells')
ax2.set_title(f'Refined Silhouette Distribution\n(Avg: {silhouette_avg_corrected:.3f})')
ax2.legend()

plt.tight_layout()
plt.savefig(figures_path + "/44_Luecken_dataset_silhouette_distribution_comparison.png", dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# Drop category levels that no longer occur in the simplified and refined detailed columns
for column in ('Consensus_annotation_simplified', 'Consensus_annotation_detailed_refined'):
    Luecken_dataset.obs[column] = Luecken_dataset.obs[column].cat.remove_unused_categories()

In [None]:
# Work on a copy so the normalisation does not alter Luecken_dataset itself
Luecken_dataset_normalized = Luecken_dataset.copy()
ep.Normalise_protein_data(Luecken_dataset_normalized, inplace=True, axis=1, flavor="seurat")

# Rank features per refined detailed label (Wilcoxon rank-sum test)
sc.tl.rank_genes_groups(Luecken_dataset_normalized, 'Consensus_annotation_detailed_refined', method='wilcoxon')

# show=False keeps the figure open: with scanpy's default the plot is displayed
# (and closed) inside this call, so the savefig below would write an empty figure.
sc.pl.rank_genes_groups(Luecken_dataset_normalized, n_genes=10, sharey=False, ncols=3, fontsize=14, show=False)

# Save the figure at 300 dpi, then display it
plt.savefig(figures_path + "/45_Luecken_dataset_top10_markers.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Aggregate the normalised data per refined detailed label using the notebook's
# grouped_obs_mean helper (presumably a per-group mean of each feature — see its
# definition near the top of the notebook).
AveragedExpression = grouped_obs_mean(Luecken_dataset_normalized, 'Consensus_annotation_detailed_refined')
# NOTE(review): the helper appears to build a DataFrame already, in which case
# this wrap is redundant — confirm. `df` is consumed by the correlation cell below.
df = pd.DataFrame(AveragedExpression)

In [None]:
# Pairwise Pearson correlation between the columns of df
corr = df.corr(method='pearson')

# Boolean mask that hides the upper triangle (diagonal included)
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes for the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap for the correlation values
cmap = sns.diverging_palette(235, 15, as_cmap=True)

# Annotated, square-celled heatmap drawn on the axes created above
heatmap = sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    annot=True,
    square=True,
    linewidths=.6,
    cbar_kws={"shrink": 1},
    annot_kws={"fontsize":5},
    ax=ax,
)

heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12)

# Save the figure at 300 dpi
plt.savefig(figures_path + "/46_Luecken_dataset_correlation_heatmap.png", dpi=300, bbox_inches='tight')

In [None]:
import pandas as pd

# =============================================================================
# Luecken_dataset: build *_final columns (broad_final + simplified_final) Zhang-style
# - Safe on reruns (both output columns are rebuilt from scratch every run)
# - Deterministic (no state from a previous run is reused)
# - Enforces fixed category orders at the end
# =============================================================================

# -----------------------------
# Column names
# -----------------------------
DETAIL_REFINED_COL  = "Consensus_annotation_detailed_refined"
DETAIL_FALLBACK_COL = "Consensus_annotation_detailed"

BROAD_FINAL_COL = "Consensus_annotation_broad_final"
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"

# If refined detailed does not exist, fall back
DETAIL_INPUT = DETAIL_REFINED_COL if DETAIL_REFINED_COL in Luecken_dataset.obs.columns else DETAIL_FALLBACK_COL

# -----------------------------
# Fixed category orders
# -----------------------------
broad_categories = ["Immature", "Mature", "Doublet"]

simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'Macrophage', 'ILC', 'Stroma', 'Myeloid', 'Other T', 'Plasma'
]

# -----------------------------
# 1) Build simplified_final from detailed_refined (or fallback detailed)
# -----------------------------
# Each detailed label appears exactly once. 'ErP' is listed under Erythroid and
# 'Plasma' under Plasma: these are the labels they ended up with before, when
# later assignments silently overrode earlier HSPC / B assignments.
simplified_to_details = {
    'HSPC':       ['MPP', 'HSC', 'MEP', 'GMP', 'Pre-Pro-B', 'Pro-B'],
    'Monocyte':   ['CD14 Mono', 'CD16 Mono'],
    'NK':         ['NK CD56 dim', 'NK CD56 bright'],
    'CD4 T':      ['CD4 T Naive', 'CD4 T Memory', 'Treg', 'CD4 CTL'],
    'CD8 T':      ['CD8 T Naive', 'CD8 T Memory', 'MAIT'],
    'B':          ['B Naive', 'B Memory', 'Immature B', 'Pre-B'],
    'Erythroid':  ['Erythroblast', 'ErP'],
    'Myeloid':    ['Myeloid progenitor'],
    'cDC':        ['cDC1', 'cDC2'],
    'pDC':        ['pDC'],
    'Other T':    ['Double negative T', 'Gamma delta T'],
    'Macrophage': ['Macrophage'],
    'ILC':        ['ILC'],
    'Plasma':     ['Plasma'],
}
detail_to_simplified = {
    detail: simplified
    for simplified, details in simplified_to_details.items()
    for detail in details
}

d = Luecken_dataset.obs[DETAIL_INPUT]

# Cast to object before mapping so the result is a plain label column; detailed
# labels missing from the table become NaN. The fixed category order is
# enforced in the same step.
Luecken_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(
    d.astype("object").map(detail_to_simplified),
    categories=simpl_categories,
)

# -----------------------------
# 2) Build broad_final
#    Priority:
#      - If detailed label indicates progenitor -> Immature
#      - Else if simplified_final is Doublet -> Doublet
#      - Else -> Mature
#    NOTE(review): cells whose simplified_final is NA are also labelled Mature.
#    An earlier version of this comment announced an "Other" bucket, but the
#    code never produced one - confirm that Mature is the intended fallback.
# -----------------------------
sf = Luecken_dataset.obs[SIMPL_FINAL_COL]

# Immature progenitors (adjust if your refined labels include additional progenitor states)
immature_details = [
    "HSC", "MPP", "LMPP", "GMP", "MEP", "ErP", "MkP",
    "Pre-Pro-B", "Pro-B", "Pre-B", "CLP", "Progenitor",
    "Neutrophil progenitor", "pDC progenitor", "EoBaMaP"
]

# np.select returns the choice of the first condition that holds, hence the
# priority order above. 'Doublet' only matters if it is ever assigned in
# simplified_final.
broad_labels = np.select(
    [
        d.isin(immature_details).to_numpy(),
        sf.astype("object").eq("Doublet").to_numpy(),
    ],
    ["Immature", "Doublet"],
    default="Mature",
)

# Enforce fixed categories/order for broad_final
Luecken_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(broad_labels, categories=broad_categories)

# -----------------------------
# 3) Sanity checks
# -----------------------------
na_simpl = int(Luecken_dataset.obs[SIMPL_FINAL_COL].isna().sum())
na_broad = int(Luecken_dataset.obs[BROAD_FINAL_COL].isna().sum())

print(f"Using detail input column: {DETAIL_INPUT}")
print(f"Unassigned '{SIMPL_FINAL_COL}' (NA) rows: {na_simpl} / {Luecken_dataset.n_obs}")
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {na_broad} / {Luecken_dataset.n_obs}")

print("\nBroad final value counts:")
print(Luecken_dataset.obs[BROAD_FINAL_COL].value_counts(dropna=False).to_string())

print("\nSimplified final value counts (top 30):")
print(Luecken_dataset.obs[SIMPL_FINAL_COL].value_counts(dropna=False).head(30).to_string())

# Diagnostics: which detailed labels were left without a simplified_final label
unmapped = (
    Luecken_dataset.obs.loc[Luecken_dataset.obs[SIMPL_FINAL_COL].isna(), DETAIL_INPUT]
    .astype("object")
    .fillna("<<NA in detailed>>")
    .value_counts()
    .head(30)
)
print(f"\nTop unmapped '{DETAIL_INPUT}' labels among NA simplified_final rows (top 30):")
print(unmapped.to_string())


In [None]:
# Seed the final detailed annotation from the refined detailed labels; the
# subcluster relabelling cells below overwrite entries of this column.
Luecken_dataset.obs['Consensus_annotation_detailed_final'] = Luecken_dataset.obs['Consensus_annotation_detailed_refined']

In [None]:
# Drop categories that no longer label any cell, for both *_final columns
for _final_col in ('Consensus_annotation_simplified_final',
                   'Consensus_annotation_detailed_final'):
    _trimmed = Luecken_dataset.obs[_final_col].cat.remove_unused_categories()
    Luecken_dataset.obs[_final_col] = _trimmed

In [None]:
# Extract the cells whose final detailed annotation is GMP from the Luecken dataset
GMP_mask = Luecken_dataset.obs['Consensus_annotation_detailed_final'] == 'GMP'
gmp_subset = Luecken_dataset[GMP_mask].copy()

print(f"Number of GMP cells: {gmp_subset.n_obs}")
print(f"Original clusters containing GMP: {gmp_subset.obs['cell_type'].unique()}")
# Check distribution of original cell types within GMP
print("\nDistribution of original CellTypes within GMP:")
print(gmp_subset.obs['cell_type'].value_counts())

# The global RNGs are re-seeded before each stochastic step
# (neighbors, leiden, umap) in addition to passing random_state.
random.seed(42)
np.random.seed(42)

# Neighbour graph for the GMP cells, built on the representation stored under "X_umap"
sc.pp.neighbors(gmp_subset, use_rep="X_umap", n_neighbors=15, metric='euclidean', random_state=42)

random.seed(42)
np.random.seed(42)

# Leiden subclusters are written to obs['gmp_subclusters']
sc.tl.leiden(gmp_subset, resolution=0.5, random_state=42, key_added='gmp_subclusters')

random.seed(42)
np.random.seed(42)

# Create UMAP for the GMP subset.
# NOTE(review): this presumably replaces the "X_umap" coordinates of the subset
# copy, so the plot below shows the subset-specific layout - confirm.
sc.tl.umap(gmp_subset, random_state=42, min_dist=0.3)
# Plot the subclusters
sc.pl.embedding(gmp_subset, 
                color='gmp_subclusters', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=6,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('GMP Subclustering', fontsize=12, fontweight='bold', y=1.1)

plt.show()

# Check original annotations within each subcluster.
# Leiden ids are strings of integers; sort them numerically so that '10' is
# listed after '2' rather than before it.
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(gmp_subset.obs['gmp_subclusters'].unique(), key=int):
    cluster_cells = gmp_subset.obs[gmp_subset.obs['gmp_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['cell_type'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_umap"
SUBCOL = "gmp_subclusters"
OUTCOL = "gmp_subclusters_on_full"

# Start from an all-NA column, then copy the subset's subcluster ids onto the
# matching cells of the full dataset (matched by obs name)
Luecken_dataset.obs[OUTCOL] = pd.NA
gmp_ids_as_text = gmp_subset.obs[SUBCOL].astype(str)
Luecken_dataset.obs.loc[gmp_subset.obs_names, OUTCOL] = gmp_ids_as_text

# Full embedding coloured by GMP subcluster; cells outside the subset stay NA
gmp_overlay_style = dict(
    color=OUTCOL,
    basis=FULL_BASIS,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Luecken_dataset, **gmp_overlay_style)

# Resize the current figure, then label and title the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title(
    "Luecken dataset: GMP subclusters on full embedding",
    fontsize=12,
    fontweight="bold",
    y=1.1,
)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "gmp_subclusters"
TARGET_COL = "Consensus_annotation_detailed_final"
DEFAULT_LABEL = "GMP"

# Subcluster id -> new detailed label. The ids are those stored in
# gmp_subset.obs[SUBCLUSTER_COL]; subclusters not listed here get DEFAULT_LABEL.
subcluster_to_pop = {
    "0": "Myeloid progenitor",
}

# Map subclusters -> labels (NaN where the subcluster has no entry in the dict)
mapped = gmp_subset.obs[SUBCLUSTER_COL].astype(str).map(subcluster_to_pop)
mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = gmp_subset.obs.index[mask_mapped]
unmapped_cell_ids = gmp_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Every label that is about to be written (mapped labels + the default)
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

# Apply the same update to the subset and to the full dataset
for ad in [gmp_subset, Luecken_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # A categorical column only accepts known categories, so register the new
    # labels first. The isinstance check replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

    # Labels from the dict first, then the default label for all other subset cells
    ad.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
    ad.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(gmp_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(gmp_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())

In [None]:
# Extract the cells whose final simplified annotation is Monocyte from the Luecken dataset
Monocyte_mask = Luecken_dataset.obs['Consensus_annotation_simplified_final'] == 'Monocyte'
monocyte_subset = Luecken_dataset[Monocyte_mask].copy()

print(f"Number of Monocyte cells: {monocyte_subset.n_obs}")
print(f"Original clusters containing Monocyte: {monocyte_subset.obs['cell_type'].unique()}")
# Check distribution of original cell types within Monocyte
print("\nDistribution of original CellTypes within Monocyte:")
print(monocyte_subset.obs['cell_type'].value_counts())

# The global RNGs are re-seeded before each stochastic step
# (neighbors, leiden, umap) in addition to passing random_state.
random.seed(42)
np.random.seed(42)

# Neighbour graph for the Monocyte cells, built on the representation stored under "X_umap"
sc.pp.neighbors(monocyte_subset, use_rep="X_umap", n_neighbors=15, metric='euclidean', random_state=42)

random.seed(42)
np.random.seed(42)

# Leiden subclusters are written to obs['monocyte_subclusters']
sc.tl.leiden(monocyte_subset, resolution=0.5, random_state=42, key_added='monocyte_subclusters')

random.seed(42)
np.random.seed(42)

# Create UMAP for the Monocyte subset.
# NOTE(review): this presumably replaces the "X_umap" coordinates of the subset
# copy, so the plot below shows the subset-specific layout - confirm.
sc.tl.umap(monocyte_subset, random_state=42, min_dist=0.3)
# Plot the subclusters
sc.pl.embedding(monocyte_subset, 
                color='monocyte_subclusters', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=6,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)
ax.set_title('Monocyte Subclustering', fontsize=12, fontweight='bold', y=1.1)

plt.show()

# Check original annotations within each subcluster.
# Leiden ids are strings of integers; sort them numerically so that '10' is
# listed after '2' rather than before it.
print("\nOriginal CellTypes per subcluster:")
for cluster in sorted(monocyte_subset.obs['monocyte_subclusters'].unique(), key=int):
    cluster_cells = monocyte_subset.obs[monocyte_subset.obs['monocyte_subclusters'] == cluster]
    print(f"\nSubcluster {cluster}:")
    print(cluster_cells['cell_type'].value_counts())

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

FULL_BASIS = "X_umap"
SUBCOL = "monocyte_subclusters"
OUTCOL = "monocyte_subclusters_on_full"

# Start from an all-NA column, then copy the subset's subcluster ids onto the
# matching cells of the full dataset (matched by obs name)
Luecken_dataset.obs[OUTCOL] = pd.NA
monocyte_ids_as_text = monocyte_subset.obs[SUBCOL].astype(str)
Luecken_dataset.obs.loc[monocyte_subset.obs_names, OUTCOL] = monocyte_ids_as_text

# Full embedding coloured by Monocyte subcluster; cells outside the subset stay NA
monocyte_overlay_style = dict(
    color=OUTCOL,
    basis=FULL_BASIS,
    legend_loc="on data",
    legend_fontsize=6,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Luecken_dataset, **monocyte_overlay_style)

# Resize the current figure, then label and title the axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel("UMAP 1", fontsize=12)
ax.set_ylabel("UMAP 2", fontsize=12)
ax.set_title(
    "Luecken dataset: Monocyte subclusters on full embedding",
    fontsize=12,
    fontweight="bold",
    y=1.1,
)
plt.show()


In [None]:
import pandas as pd

SUBCLUSTER_COL = "monocyte_subclusters"
TARGET_COL = "Consensus_annotation_detailed_final"
DEFAULT_LABEL = "CD14 Mono"

# Subcluster id -> new detailed label. The ids are those stored in
# monocyte_subset.obs[SUBCLUSTER_COL]; subclusters not listed here get DEFAULT_LABEL.
subcluster_to_pop = {
    "39": "ErP",
    "34": "Macrophage",
    '13': "CD16 Mono",
    '16': "CD16 Mono",
    '36': "CD16 Mono",
    '35': "CD16 Mono",
}

# Map subclusters -> labels (NaN where the subcluster has no entry in the dict)
mapped = monocyte_subset.obs[SUBCLUSTER_COL].astype(str).map(subcluster_to_pop)
mask_mapped = mapped.notna()
mask_unmapped = mapped.isna()

mapped_cell_ids = monocyte_subset.obs.index[mask_mapped]
unmapped_cell_ids = monocyte_subset.obs.index[mask_unmapped]

print(f"Mapped via dict: {mask_mapped.sum()} cells")
print(f"Default to '{DEFAULT_LABEL}': {mask_unmapped.sum()} cells")

# Every label that is about to be written (mapped labels + the default)
new_labels = pd.Index(mapped.loc[mask_mapped].unique()).append(pd.Index([DEFAULT_LABEL])).unique()

# Apply the same update to the subset and to the full dataset
for ad in [monocyte_subset, Luecken_dataset]:
    if TARGET_COL not in ad.obs.columns:
        ad.obs[TARGET_COL] = pd.NA

    # A categorical column only accepts known categories, so register the new
    # labels first. The isinstance check replaces the deprecated
    # pd.api.types.is_categorical_dtype.
    if isinstance(ad.obs[TARGET_COL].dtype, pd.CategoricalDtype):
        missing = new_labels.difference(ad.obs[TARGET_COL].cat.categories)
        if len(missing) > 0:
            ad.obs[TARGET_COL] = ad.obs[TARGET_COL].cat.add_categories(list(missing))

    # Labels from the dict first, then the default label for all other subset cells
    ad.obs.loc[mapped_cell_ids, TARGET_COL] = mapped.loc[mask_mapped].values
    ad.obs.loc[unmapped_cell_ids, TARGET_COL] = DEFAULT_LABEL

print("\nUpdated label counts in subset:")
print(monocyte_subset.obs[TARGET_COL].value_counts(dropna=False).to_string())

print("\nSubcluster -> label breakdown (subset):")
print(monocyte_subset.obs.groupby(SUBCLUSTER_COL)[TARGET_COL].value_counts().unstack(fill_value=0).to_string())

In [None]:
# Clear the stored palette of the column plotted below so scanpy regenerates it
# for the current set of categories. The palette key is "<obs column>_colors",
# so it must name 'Consensus_annotation_detailed_final', the column being plotted.
if 'Consensus_annotation_detailed_final_colors' in Luecken_dataset.uns:
    del Luecken_dataset.uns['Consensus_annotation_detailed_final_colors']

# Plot UMAP with color
sc.pl.embedding(Luecken_dataset, 
                color='Consensus_annotation_detailed_final', 
                basis='X_umap', 
                legend_loc='on data', 
                legend_fontsize=5,
                legend_fontoutline=2,
                add_outline=False,
                frameon=False,
                show=False)

# Get the current axis, resize the figure and set the axis labels
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Set the title with font size 12, bold, and increased distance from the plot
ax.set_title('Luecken et al. dataset', fontsize=12, fontweight='bold', y=1.1)

# Add a subtitle
plt.suptitle('Consensus detailed annotation - smoothed', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Show the figure
plt.show()

In [None]:
import pandas as pd

# =============================================================================
# Luecken_dataset: rebuild *_final columns (broad_final + simplified_final)
# from the detailed_final labels, i.e. after the subcluster relabelling above
# - Safe on reruns (both output columns are rebuilt from scratch every run)
# - Deterministic (no state from a previous run is reused)
# - Enforces fixed category orders at the end
# =============================================================================

# -----------------------------
# Column names
# -----------------------------
DETAIL_REFINED_COL  = "Consensus_annotation_detailed_final"
DETAIL_FALLBACK_COL = "Consensus_annotation_detailed_refined"

BROAD_FINAL_COL = "Consensus_annotation_broad_final"
SIMPL_FINAL_COL = "Consensus_annotation_simplified_final"

# If detailed_final does not exist, fall back to detailed_refined
DETAIL_INPUT = DETAIL_REFINED_COL if DETAIL_REFINED_COL in Luecken_dataset.obs.columns else DETAIL_FALLBACK_COL

# -----------------------------
# Fixed category orders
# -----------------------------
broad_categories = ["Immature", "Mature", "Doublet"]

simpl_categories = [
    'HSPC', 'Monocyte', 'CD4 T', 'CD8 T', 'Erythroid', 'B', 'cDC', 'pDC', 'NK',
    'ILC', 'Stroma', 'Myeloid', 'Other T', 'Plasma', 'Mesenchymal', 'Macrophage'
]

# -----------------------------
# 1) Build simplified_final from detailed_final (or fallback detailed_refined)
# -----------------------------
# Each detailed label appears exactly once.
# - 'ILC' is mapped explicitly: it is a declared simplified category and would
#   otherwise end up as NA.
# - 'Other T' accepts both the short ('DnT', 'GdT') and the long
#   ('Double negative T', 'Gamma delta T') spellings; the first pass on this
#   dataset uses the long ones.
simplified_to_details = {
    'HSPC':        ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP'],
    'Monocyte':    ['CD14 Mono', 'CD16 Mono'],
    'Myeloid':     ['Myeloid progenitor'],
    'NK':          ['NK CD56 dim', 'NK CD56 bright'],
    'CD4 T':       ['CD4 T Naive', 'CD4 T Memory', 'Treg', 'CD4 CTL'],
    'CD8 T':       ['CD8 T Naive', 'CD8 T Memory', 'MAIT'],
    'B':           ['B Naive', 'B Memory', 'Immature B', 'Pre-B'],
    'Erythroid':   ['ErP', 'Erythroblast'],
    'Macrophage':  ['Macrophage'],
    'Other T':     ['DnT', 'GdT', 'Double negative T', 'Gamma delta T'],
    'Mesenchymal': ['Mesenchymal'],
    'cDC':         ['cDC1', 'cDC2'],
    'pDC':         ['pDC'],
    'ILC':         ['ILC'],
    'Stroma':      ['Stroma'],
    'Plasma':      ['Plasma'],
}
detail_to_simplified = {
    detail: simplified
    for simplified, details in simplified_to_details.items()
    for detail in details
}

d = Luecken_dataset.obs[DETAIL_INPUT]

# Cast to object before mapping so the result is a plain label column; detailed
# labels missing from the table become NaN. The fixed category order is
# enforced in the same step.
Luecken_dataset.obs[SIMPL_FINAL_COL] = pd.Categorical(
    d.astype("object").map(detail_to_simplified),
    categories=simpl_categories,
)

# -----------------------------
# 2) Build broad_final
#    Priority:
#      - If detailed label indicates progenitor -> Immature
#      - Else if simplified_final is Doublet -> Doublet
#      - Else -> Mature
#    NOTE(review): cells whose simplified_final is NA are also labelled Mature.
#    An earlier version of this comment announced an "Other" bucket, but the
#    code never produced one - confirm that Mature is the intended fallback.
# -----------------------------
sf = Luecken_dataset.obs[SIMPL_FINAL_COL]

# Immature progenitors (expand if needed)
immature_details = ['HSC', 'MPP', 'LMPP', 'EoBaMaP', 'MkP', 'MEP', 'Pre-Pro-B', 'Pro-B', 'GMP', 'ErP']

# np.select returns the choice of the first condition that holds, hence the
# priority order above. 'Doublet' only matters if it is ever assigned in
# simplified_final.
broad_labels = np.select(
    [
        d.isin(immature_details).to_numpy(),
        sf.astype("object").eq("Doublet").to_numpy(),
    ],
    ["Immature", "Doublet"],
    default="Mature",
)

# Enforce fixed categories/order for broad_final
Luecken_dataset.obs[BROAD_FINAL_COL] = pd.Categorical(broad_labels, categories=broad_categories)

# -----------------------------
# 3) Sanity checks
# -----------------------------
na_simpl = int(Luecken_dataset.obs[SIMPL_FINAL_COL].isna().sum())
na_broad = int(Luecken_dataset.obs[BROAD_FINAL_COL].isna().sum())
print(f"Using detail input column: {DETAIL_INPUT}")
print(f"Unassigned '{SIMPL_FINAL_COL}' (NA) rows: {na_simpl} / {Luecken_dataset.n_obs}")
print(f"Unassigned '{BROAD_FINAL_COL}' (NA) rows: {na_broad} / {Luecken_dataset.n_obs}")

print("\nBroad final value counts:")
print(Luecken_dataset.obs[BROAD_FINAL_COL].value_counts(dropna=False).to_string())

print("\nSimplified final value counts (top 30):")
print(Luecken_dataset.obs[SIMPL_FINAL_COL].value_counts(dropna=False).head(30).to_string())
# Diagnostics: which detailed labels were left without a simplified_final label
unmapped = (
    Luecken_dataset.obs.loc[Luecken_dataset.obs[SIMPL_FINAL_COL].isna(), DETAIL_INPUT]
    .astype("object")
    .fillna("<<NA in detailed>>")
    .value_counts()
    .head(30)
)
print(f"\nTop unmapped '{DETAIL_INPUT}' labels among NA simplified_final rows (top 30):")
print(unmapped.to_string())


In [None]:
# UMAP of the full dataset coloured by the final broad consensus annotation
broad_plot_style = dict(
    color='Consensus_annotation_broad_final',
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Luecken_dataset, **broad_plot_style)

# Resize the current figure and label its axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title raised above the axes, with a grey subtitle
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus broad annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Export at 300 dpi, then display
broad_plot_png = figures_path + "/Luecken_dataset_final_consensus_annotation_broad_annotation.png"
plt.savefig(broad_plot_png, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Forget the stored palette for this column so scanpy builds a fresh one
simplified_palette_key = 'Consensus_annotation_simplified_final_colors'
if simplified_palette_key in Luecken_dataset.uns:
    del Luecken_dataset.uns[simplified_palette_key]

# UMAP of the full dataset coloured by the final simplified consensus annotation
simplified_plot_style = dict(
    color='Consensus_annotation_simplified_final',
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Luecken_dataset, **simplified_plot_style)

# Resize the current figure and label its axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title raised above the axes, with a grey subtitle
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus simplified annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Export at 300 dpi, then display
simplified_plot_png = figures_path + "/Luecken_dataset_final_consensus_annotation_simplified_annotation.png"
plt.savefig(simplified_plot_png, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Forget the stored palette for this column so scanpy builds a fresh one
detailed_palette_key = 'Consensus_annotation_detailed_final_colors'
if detailed_palette_key in Luecken_dataset.uns:
    del Luecken_dataset.uns[detailed_palette_key]

# UMAP of the full dataset coloured by the final detailed consensus annotation
detailed_plot_style = dict(
    color='Consensus_annotation_detailed_final',
    basis='X_umap',
    legend_loc='on data',
    legend_fontsize=5,
    legend_fontoutline=2,
    add_outline=False,
    frameon=False,
    show=False,
)
sc.pl.embedding(Luecken_dataset, **detailed_plot_style)

# Resize the current figure and label its axes
ax = plt.gca()
ax.figure.set_size_inches(6, 5)
ax.set_xlabel('UMAP 1', fontsize=12)
ax.set_ylabel('UMAP 2', fontsize=12)

# Bold 12-pt title raised above the axes, with a grey subtitle
ax.set_title('Luecken M.D. et al. dataset', fontsize=12, fontweight='bold', y=1.1)
plt.suptitle('Consensus detailed annotation', fontsize=8, y=0.925, color=(0.5, 0.5, 0.5))

# Export at 300 dpi, then display
detailed_plot_png = figures_path + "/Luecken_dataset_final_Consensus_annotation_detailed_annotation.png"
plt.savefig(detailed_plot_png, dpi=300, bbox_inches='tight')
plt.show()

# Final results

In [None]:
# Get the remaining cell barcodes from all four processed datasets
print("Getting remaining cell barcodes from all processed datasets...")

remaining_barcodes = set()
remaining_barcodes.update(Zhang_dataset.obs_names)
remaining_barcodes.update(Hao_dataset.obs_names)
remaining_barcodes.update(Triana_dataset.obs_names)
remaining_barcodes.update(Luecken_dataset.obs_names)

print(f"Total remaining cells across all datasets: {len(remaining_barcodes)}")

# Filter the original adatas object to keep only remaining cells
print("Filtering original merged adatas object...")
remaining_mask = adatas_merged.obs_names.isin(remaining_barcodes)
adatas_final = adatas_merged[remaining_mask].copy()

print(f"Original adatas shape: {adatas_merged.shape}")
print(f"Filtered adatas_final shape: {adatas_final.shape}")
print(f"Cells removed: {adatas_merged.n_obs - adatas_final.n_obs}")

# Check which cells remain from each dataset
print(f"\nCells remaining per dataset:")
print(adatas_final.obs['dataset_name'].value_counts())

# Initialize final annotation columns
adatas_final.obs['Consensus_annotation_detailed_final'] = ''
adatas_final.obs['Consensus_annotation_simplified_final'] = ''
adatas_final.obs['Consensus_annotation_broad_final'] = ''

# Assign final annotations from each processed dataset
print("\nAssigning final consensus annotations...")

# Zhang dataset assignments
zhang_mask = adatas_final.obs['dataset_name'] == 'Zhang'
zhang_indices = adatas_final.obs_names[zhang_mask]
zhang_overlap = zhang_indices.intersection(Zhang_dataset.obs_names)

if len(zhang_overlap) > 0:
    adatas_final.obs.loc[zhang_overlap, 'Consensus_annotation_detailed_final'] = Zhang_dataset.obs.loc[zhang_overlap, 'Consensus_annotation_detailed_final'].values
    adatas_final.obs.loc[zhang_overlap, 'Consensus_annotation_simplified_final'] = Zhang_dataset.obs.loc[zhang_overlap, 'Consensus_annotation_simplified_final'].values
    adatas_final.obs.loc[zhang_overlap, 'Consensus_annotation_broad_final'] = Zhang_dataset.obs.loc[zhang_overlap, 'Consensus_annotation_broad_final'].values
    print(f"Zhang: Assigned annotations to {len(zhang_overlap)} cells")

# Hao dataset assignments
hao_mask = adatas_final.obs['dataset_name'] == 'Hao'
hao_indices = adatas_final.obs_names[hao_mask]
hao_overlap = hao_indices.intersection(Hao_dataset.obs_names)

if len(hao_overlap) > 0:
    adatas_final.obs.loc[hao_overlap, 'Consensus_annotation_detailed_final'] = Hao_dataset.obs.loc[hao_overlap, 'Consensus_annotation_detailed_final'].values
    adatas_final.obs.loc[hao_overlap, 'Consensus_annotation_simplified_final'] = Hao_dataset.obs.loc[hao_overlap, 'Consensus_annotation_simplified_final'].values
    adatas_final.obs.loc[hao_overlap, 'Consensus_annotation_broad_final'] = Hao_dataset.obs.loc[hao_overlap, 'Consensus_annotation_broad_final'].values
    print(f"Hao: Assigned annotations to {len(hao_overlap)} cells")

# Triana dataset assignments
triana_mask = adatas_final.obs['dataset_name'] == 'Triana'
triana_indices = adatas_final.obs_names[triana_mask]
triana_overlap = triana_indices.intersection(Triana_dataset.obs_names)

if len(triana_overlap) > 0:
    adatas_final.obs.loc[triana_overlap, 'Consensus_annotation_detailed_final'] = Triana_dataset.obs.loc[triana_overlap, 'Consensus_annotation_detailed_final'].values
    adatas_final.obs.loc[triana_overlap, 'Consensus_annotation_simplified_final'] = Triana_dataset.obs.loc[triana_overlap, 'Consensus_annotation_simplified_final'].values
    adatas_final.obs.loc[triana_overlap, 'Consensus_annotation_broad_final'] = Triana_dataset.obs.loc[triana_overlap, 'Consensus_annotation_broad_final'].values
    print(f"Triana: Assigned annotations to {len(triana_overlap)} cells")

# Luecken dataset assignments
luecken_mask = adatas_final.obs['dataset_name'] == 'Luecken'
luecken_indices = adatas_final.obs_names[luecken_mask]
luecken_overlap = luecken_indices.intersection(Luecken_dataset.obs_names)

if len(luecken_overlap) > 0:
    adatas_final.obs.loc[luecken_overlap, 'Consensus_annotation_detailed_final'] = Luecken_dataset.obs.loc[luecken_overlap, 'Consensus_annotation_detailed_final'].values
    adatas_final.obs.loc[luecken_overlap, 'Consensus_annotation_simplified_final'] = Luecken_dataset.obs.loc[luecken_overlap, 'Consensus_annotation_simplified_final'].values
    adatas_final.obs.loc[luecken_overlap, 'Consensus_annotation_broad_final'] = Luecken_dataset.obs.loc[luecken_overlap, 'Consensus_annotation_broad_final'].values
    print(f"Luecken: Assigned annotations to {len(luecken_overlap)} cells")

# Convert the three final annotation columns to categorical dtype
_final_annotation_cols = (
    'Consensus_annotation_detailed_final',
    'Consensus_annotation_simplified_final',
    'Consensus_annotation_broad_final',
)
for _final_col in _final_annotation_cols:
    adatas_final.obs[_final_col] = pd.Categorical(adatas_final.obs[_final_col])

# Remove any cells that didn't get annotations assigned (shouldn't happen but safety check).
# A cell counts as unassigned when its detailed label is the empty string.
unassigned_mask = adatas_final.obs['Consensus_annotation_detailed_final'] == ''
if unassigned_mask.sum() > 0:
    print(f"Warning: {unassigned_mask.sum()} cells did not receive final annotations. Removing them.")
    # .copy() turns the subset into a standalone AnnData instead of a view of the
    # unfiltered object, so later writes to .obs/.uns do not trigger an implicit copy.
    adatas_final = adatas_final[~unassigned_mask].copy()
    # Drop categories that no longer have cells (e.g. '') so they do not appear
    # with a zero count in value_counts() or as empty legend entries.
    for _final_col in _final_annotation_cols:
        adatas_final.obs[_final_col] = adatas_final.obs[_final_col].cat.remove_unused_categories()

# Text summary of the merged object: shape, cells per dataset, and label counts per annotation level
print("\n=== FINAL DATASET SUMMARY ===")
print(f"Final dataset shape: {adatas_final.shape}")
print("Final cells per dataset:")
print(adatas_final.obs['dataset_name'].value_counts())

for _level in ('broad', 'simplified', 'detailed'):
    print(f"\nFinal {_level} annotation distribution:")
    print(adatas_final.obs[f'Consensus_annotation_{_level}_final'].value_counts())

# Create comprehensive visualization: five UMAP panels plus one text panel on a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# One entry per UMAP panel: (grid position, obs column, panel title, legend options)
_umap_panels = [
    # Plot 1: Dataset distribution
    ((0, 0), 'dataset_name', 'Dataset Distribution',
     {'legend_loc': 'right margin'}),
    # Plot 2: Final broad annotations
    ((0, 1), 'Consensus_annotation_broad_final', 'Final Broad Annotations',
     {'legend_loc': 'right margin'}),
    # Plot 3: Final simplified annotations
    ((0, 2), 'Consensus_annotation_simplified_final', 'Final Simplified Annotations',
     {'legend_loc': 'on data', 'legend_fontsize': 6, 'legend_fontoutline': 2}),
    # Plot 4: Final detailed annotations
    ((1, 0), 'Consensus_annotation_detailed_final', 'Final Detailed Annotations',
     {'legend_loc': 'on data', 'legend_fontsize': 4, 'legend_fontoutline': 2}),
    # Plot 5: Chemistry distribution
    ((1, 1), 'Chemistry', 'Chemistry Distribution',
     {'legend_loc': 'right margin'}),
]

for _grid_pos, _obs_key, _panel_title, _legend_kwargs in _umap_panels:
    _panel_ax = axes[_grid_pos]
    sc.pl.embedding(
        adatas_final,
        color=_obs_key,
        basis='X_umap',
        add_outline=False,
        frameon=False,
        show=False,
        ax=_panel_ax,
        **_legend_kwargs,
    )
    _panel_ax.set_title(_panel_title, fontsize=14, fontweight='bold')
    _panel_ax.set_xlabel('UMAP 1', fontsize=12)
    _panel_ax.set_ylabel('UMAP 2', fontsize=12)

# Plot 6: Summary statistics as text (axis frame hidden, monospace block anchored top-left)
_summary_ax = axes[1, 2]
_summary_ax.axis('off')
summary_text = f"""
Final Dataset Summary:

Total cells: {len(adatas_final):,}
Total features: {adatas_final.n_vars}

Cells per dataset:
Zhang: {(adatas_final.obs['dataset_name'] == 'Zhang').sum():,}
Hao: {(adatas_final.obs['dataset_name'] == 'Hao').sum():,}
Triana: {(adatas_final.obs['dataset_name'] == 'Triana').sum():,}
Luecken: {(adatas_final.obs['dataset_name'] == 'Luecken').sum():,}

Broad categories:
{chr(10).join([f"{cat}: {count:,}" for cat, count in adatas_final.obs['Consensus_annotation_broad_final'].value_counts().items()])}

Cell types identified:
Detailed: {adatas_final.obs['Consensus_annotation_detailed_final'].nunique()}
Simplified: {adatas_final.obs['Consensus_annotation_simplified_final'].nunique()}
"""

_summary_ax.text(
    0.05, 0.95, summary_text,
    transform=_summary_ax.transAxes,
    fontsize=10,
    verticalalignment='top',
    fontfamily='monospace',
)

plt.tight_layout()
plt.savefig(
    figures_path + "/Final_merged_datasets_comprehensive_overview.png",
    dpi=300,
    bbox_inches='tight',
)
plt.show()

print("\nFinal merged dataset with consensus annotations is ready!")

In [None]:
import os
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from matplotlib.lines import Line2D

# ---------------------------
# Output dir (fallback if not preset)
# ---------------------------
try:
    figures_path
except NameError:
    # No earlier cell defined the output folder; use a local default
    figures_path = "./figures"
os.makedirs(figures_path, exist_ok=True)

# ---------- Custom palette ----------
# Fixed label -> hex colour; labels missing from this table use `fallback`
custom_palette = {
    "B Memory": "#68D827",
    "B Naive": "#1C511D",
    "CD14 Mono": "#D27CE3",
    "CD16 Mono": "#8D43CD",
    "CD4 T Memory": "#C1AF93",
    "CD4 T Naive": "#C99546",
    "CD8 T Memory": "#6B3317",
    "CD8 T Naive": "#4D382E",
    "ErP": "#D1235A",
    "Erythroblast": "#F30A1A",
    "GMP": "#C5E4FF",
    "HSC": "#0079ea",
    "MPP": "#79b6ac",
    "Immature B": "#91FF7B",
    "LMPP": "#17BECF",
    "MAIT": "#BCBD22",
    "Myeloid progenitor": "#AEC7E8",
    "NK CD56 bright": "#F3AC1F",
    "NK CD56 dim": "#FBEF0D",
    "Plasma": "#9DC012",
    "Pro-B": "#66BB6A",
    "Small": "#292929",
    "cDC1": "#76A7CB",
    "cDC2": "#16D2E3",
    "GdT": "#EDB416",
    "pDC": "#69FFCB",
    "CD4 CTL": "#D7D2CB",
    "MEP": "#E364B0",
    "Pre-B": "#2DBD67",
    "Pre-Pro-B": "#92AC8E",
    "EoBaMaP": "#728245",
    "MkP": "#69424D",
    "Stroma": "#727272",
    "Macrophage": "#5F4761",
    "ILC": "#F7CF94",
    "DnT": "#504423",
    "GdT_DnT": "#B07A2A",
}
fallback = "#BBBBBB"

# ---------- Column to plot ----------
col = "Consensus_annotation_detailed_final"
if col not in adatas_final.obs.columns:
    raise KeyError(f"Column '{col}' not found in adatas_final.obs")

# =============================================================================
# 1) PERMANENTLY DROP unwanted cell types
# =============================================================================
remove_labels = {"Mesenchymal", "Platelet", "Macrophage", "ILC", "DnT"}

# Compare on the string form so the test works for categorical and object columns alike.
# NOTE: this rebinds `adatas_final`, so the removal persists for every later cell.
mask_keep = ~adatas_final.obs[col].astype(str).isin(remove_labels)
adatas_final = adatas_final[mask_keep].copy()

# Drop unused categories if categorical.
# isinstance(..., pd.CategoricalDtype) replaces the deprecated
# pd.api.types.is_categorical_dtype helper.
if isinstance(adatas_final.obs[col].dtype, pd.CategoricalDtype):
    adatas_final.obs[col] = adatas_final.obs[col].cat.remove_unused_categories()

# =============================================================================
# 2) DEFINE GLOBAL CATEGORY ORDER + GLOBAL NUMBERING
# =============================================================================
# A separate "<col>_plot" column carries the plotting categories so the source column stays untouched
plot_col = f"{col}_plot"
adatas_final.obs[plot_col] = pd.Categorical(adatas_final.obs[col].astype(str))
cats_global = adatas_final.obs[plot_col].cat.categories.tolist()
# 1-based number per category, following category order
cat_to_num_global = dict(zip(cats_global, range(1, len(cats_global) + 1)))

# ---------- Color alignment ----------
# scanpy reads "<key>_colors" from .uns; entries must follow category order
palette_list = [custom_palette.get(name, fallback) for name in cats_global]
adatas_final.uns[f"{plot_col}_colors"] = palette_list

# ---------- Compute centroids ----------
umap_key = "X_umap"
if umap_key not in adatas_final.obsm.keys():
    raise KeyError(f"Embedding '{umap_key}' not found in adatas_final.obsm")

umap_coords = adatas_final.obsm[umap_key]
df = pd.DataFrame(
    umap_coords[:, :2],
    index=adatas_final.obs_names,
    columns=["UMAP1", "UMAP2"],
)
df[plot_col] = adatas_final.obs[plot_col].astype(str)

# Mean embedding position per label, one row per category in global order
centroids = (
    df.groupby(plot_col)[["UMAP1", "UMAP2"]].mean().reindex(cats_global).reset_index()
)

# =============================================================================
# 3) UMAP PLOT
# =============================================================================
fig, ax = plt.subplots(figsize=(5.5, 4.5), dpi=300)

# Base scatter with scanpy's own legend disabled; numbered centroid markers are overlaid below
sc.pl.embedding(
    adatas_final,
    basis=umap_key,
    color=plot_col,
    title="",
    legend_loc=None,
    add_outline=True,
    outline_width=(1, 0.1),
    size=1,
    frameon=False,
    show=False,
    ax=ax,
)

ax.set_aspect("auto")

# ---------- Draw centroid circles ----------
# Each category gets a semi-transparent disc in its palette colour with its global number on top
for label, cx, cy in zip(centroids[plot_col], centroids["UMAP1"], centroids["UMAP2"]):
    ax.scatter(
        cx, cy,
        s=120,
        facecolor=custom_palette.get(label, fallback),
        edgecolor="black",
        alpha=0.6,
        linewidth=0.7,
        zorder=10,
    )

    txt = ax.text(
        cx, cy, str(cat_to_num_global[label]),
        ha="center", va="center",
        fontsize=5.5, color="white", weight="bold", zorder=11,
    )
    # Black stroke keeps the white digits readable on light colours
    txt.set_path_effects([PathEffects.withStroke(linewidth=1.1, foreground="black")])

plt.tight_layout(pad=0.2)

# ---------- Save UMAP plot ----------
plot_png = os.path.join(figures_path, "Final_merged_datasets_annotation_plot.png")
plot_pdf = os.path.join(figures_path, "Final_merged_datasets_annotation_plot.pdf")
for _target in (plot_png, plot_pdf):
    fig.savefig(_target, dpi=300, bbox_inches="tight", transparent=True)
plt.show()
plt.close(fig)

# =============================================================================
# 4) LEGEND FIGURE
# =============================================================================
# One marker-only handle per category, labelled "<number>. <cell type>"
handles = [
    Line2D(
        [0], [0],
        marker="o",
        color="none",
        markerfacecolor=custom_palette.get(cat, fallback),
        markeredgecolor="black",
        lw=0,
        markersize=6,
    )
    for cat in cats_global
]
labels = [f"{cat_to_num_global[cat]}. {cat}" for cat in cats_global]

# Stand-alone figure that contains only the legend
fig_leg = plt.figure(figsize=(5.5, 1.2), dpi=300)
fig_leg.legend(
    handles,
    labels,
    loc="center",
    ncol=5,
    frameon=False,
    fontsize=5.0,
    handletextpad=0.5,
    columnspacing=0.9,
    labelspacing=0.3,
)
fig_leg.gca().axis("off")

legend_png = os.path.join(figures_path, "Final_merged_datasets_annotation_legend.png")
legend_pdf = os.path.join(figures_path, "Final_merged_datasets_annotation_legend.pdf")
for _target in (legend_png, legend_pdf):
    fig_leg.savefig(_target, dpi=300, bbox_inches="tight", transparent=True)
plt.close(fig_leg)

# =============================================================================
# 5) EXPORT number -> cell type mapping
# =============================================================================
# One row per category: its number in the plot, its name, and the colour it was drawn with
mapping_df = pd.DataFrame(
    {
        "Number": [cat_to_num_global[name] for name in cats_global],
        "CellType": cats_global,
        "Color": [custom_palette.get(name, fallback) for name in cats_global],
    }
)
mapping_csv = os.path.join(figures_path, "Final_merged_datasets_annotation_mapping.csv")
mapping_df.to_csv(mapping_csv, index=False)

print("[INFO] Saved plot, legend, and mapping:")
for _saved_path in (plot_png, plot_pdf, legend_png, legend_pdf, mapping_csv):
    print(" -", _saved_path)


In [None]:
# Display the AnnData repr of the merged object for a quick structural check
adatas_final

In [None]:
# Inspect the 'Chemistry' obs column before plotting it in the next cell
adatas_final.obs['Chemistry']

In [None]:
import os
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from matplotlib.lines import Line2D

# ---------------------------
# Output dir (fallback if not preset)
# ---------------------------
try:
    figures_path
except NameError:
    # No earlier cell defined the output folder; use a local default
    figures_path = "./figures"
os.makedirs(figures_path, exist_ok=True)

# ---------------------------
# Plot Chemistry (3 categories) with fixed colours
# ---------------------------
col = "Chemistry"
if col not in adatas_final.obs.columns:
    raise KeyError(f"Column '{col}' not found in adatas_final.obs")

# Fixed chemistry -> hex colour; any other value uses `fallback`
chem_palette = dict(
    [
        ("BD AbSeq", "#D1235A"),
        ("BioLegend TotalSeqA", "#F3AC1F"),
        ("BioLegend TotalSeqB", "#8D43CD"),
    ]
)
fallback = "#BBBBBB"

# Optional: filter nothing (keep all). `remove_labels` is empty, so the mask keeps
# every cell; the copy leaves `adatas_final` untouched by the plot-only columns below.
remove_labels = set()
mask_keep = ~adatas_final.obs[col].astype("object").isin(remove_labels)
adatas_plot = adatas_final[mask_keep].copy()

plot_col = f"{col}_plot"
adatas_plot.obs[plot_col] = adatas_plot.obs[col].astype("object")
adatas_plot.obs[plot_col] = pd.Categorical(adatas_plot.obs[plot_col])

# Align colours to categories present (in category order)
cats = list(adatas_plot.obs[plot_col].cat.categories)
adatas_plot.uns[f"{plot_col}_colors"] = [chem_palette.get(c, fallback) for c in cats]

# Compute centroids.
# Only the first two embedding components are used; slicing keeps the two-column
# DataFrame construction valid when X_umap holds more than two components
# (consistent with the detailed-annotation plot).
umap_coords = adatas_plot.obsm["X_umap"]
df = pd.DataFrame(umap_coords[:, :2], columns=["UMAP1", "UMAP2"], index=adatas_plot.obs_names)
df[plot_col] = adatas_plot.obs[plot_col].astype(str)

# Mean position per category, one row per category in category order
centroids = (
    df.groupby(plot_col)[["UMAP1", "UMAP2"]]
      .mean()
      .reindex(cats)
      .reset_index()
)

# 1-based number per category, following category order
cat_to_num = {cat: i + 1 for i, cat in enumerate(cats)}

# ---------------------------
# Main plot
# ---------------------------
fig, ax = plt.subplots(figsize=(5.5, 4.5), dpi=300)

sc.pl.embedding(
    adatas_plot,
    basis="X_umap",
    color=plot_col,
    title="",
    legend_loc=None,
    add_outline=True,
    outline_width=(1, 0.1),
    size=1,
    frameon=False,
    show=False,
    ax=ax,
)

ax.set_aspect("auto")

# Centroid markers and numbers are still drawn, but alpha=0 and fontsize=0 are used
# so that they are not visible in the figure
for label, cx, cy in zip(centroids[plot_col], centroids["UMAP1"], centroids["UMAP2"]):
    ax.scatter(
        cx, cy,
        s=120,
        facecolor=chem_palette.get(label, fallback),
        edgecolor="black",
        alpha=0,
        linewidth=0.7,
        zorder=10,
    )

    txt = ax.text(
        cx, cy, str(cat_to_num[label]),
        ha="center", va="center",
        fontsize=0, color="white", weight="bold", zorder=11,
    )
    txt.set_path_effects([PathEffects.withStroke(linewidth=1.1, foreground="black")])

plt.tight_layout(pad=0.2)

# Save (PNG + PDF, transparent)
plot_png = os.path.join(figures_path, "Final_merged_datasets_Chemistry_plot.png")
plot_pdf = os.path.join(figures_path, "Final_merged_datasets_Chemistry_plot.pdf")
for _target in (plot_png, plot_pdf):
    fig.savefig(_target, dpi=300, bbox_inches="tight", transparent=True)

plt.show()
plt.close(fig)

# ---------------------------
# Legend figure (only) - max 2 columns
# ---------------------------
# One marker-only handle per category, labelled "<number>. <chemistry>"
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='none',
        markerfacecolor=chem_palette.get(cat, fallback),
        markeredgecolor='black',
        lw=0,
        markersize=6,
    )
    for cat in cats
]
labels = [f"{cat_to_num[cat]}. {cat}" for cat in cats]

# Column count is clamped to the range 1..2
_n_legend_cols = min(2, max(1, len(labels)))

fig_leg = plt.figure(figsize=(5.5, 1.2), dpi=300)
fig_leg.legend(
    handles,
    labels,
    loc='center',
    ncol=_n_legend_cols,
    frameon=False,
    fontsize=6.0,
    handletextpad=0.6,
    columnspacing=1.0,
    labelspacing=0.4,
)
fig_leg.gca().axis('off')

legend_png = os.path.join(figures_path, "Final_merged_datasets_Chemistry_legend.png")
legend_pdf = os.path.join(figures_path, "Final_merged_datasets_Chemistry_legend.pdf")
for _target in (legend_png, legend_pdf):
    fig_leg.savefig(_target, dpi=300, bbox_inches="tight", transparent=True)
plt.close(fig_leg)

# ---------------------------
# Export number->chemistry mapping
# ---------------------------
# One row per category: its number, its name, and the colour it was drawn with
mapping_df = pd.DataFrame(
    {
        "Number": [cat_to_num[name] for name in cats],
        "Chemistry": cats,
        "Color": [chem_palette.get(name, fallback) for name in cats],
    }
)
mapping_csv = os.path.join(figures_path, "Final_merged_datasets_Chemistry_mapping.csv")
mapping_df.to_csv(mapping_csv, index=False)

print("[INFO] Saved Chemistry plot, legend, and mapping:")
for _saved_path in (plot_png, plot_pdf, legend_png, legend_pdf, mapping_csv):
    print(" -", _saved_path)


In [None]:
# Inspect the 'dataset_name' obs column before plotting it in the next cell
adatas_final.obs['dataset_name']

In [None]:
import os
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from matplotlib.lines import Line2D

# ---------------------------
# Output dir (fallback if not preset)
# ---------------------------
try:
    figures_path
except NameError:
    # No earlier cell defined the output folder; use a local default
    figures_path = "./figures"
os.makedirs(figures_path, exist_ok=True)

# ---------------------------
# Plot dataset_name with fixed colours
# ---------------------------
col = "dataset_name"
if col not in adatas_final.obs.columns:
    raise KeyError(f"Column '{col}' not found in adatas_final.obs")

# Fixed dataset -> hex colour; any other value uses `fallback`
dataset_palette = dict(
    [
        ("Zhang", "#CDA72B"),
        ("Hao", "#D23A3A"),
        ("Triana", "#3381D0"),
        ("Luecken", "#31AF33"),
    ]
)
fallback = "#BBBBBB"

# Optional: filter nothing (keep all). `remove_labels` is empty, so the mask keeps
# every cell; the copy leaves `adatas_final` untouched by the plot-only columns below.
remove_labels = set()
mask_keep = ~adatas_final.obs[col].astype("object").isin(remove_labels)
adatas_plot = adatas_final[mask_keep].copy()

plot_col = f"{col}_plot"
adatas_plot.obs[plot_col] = adatas_plot.obs[col].astype("object")
adatas_plot.obs[plot_col] = pd.Categorical(adatas_plot.obs[plot_col])

# ---------------------------
# Force draw order: Luecken bottom, Hao top
# ---------------------------
cats0 = list(adatas_plot.obs[plot_col].cat.categories)
middle = [c for c in cats0 if c not in ["Hao", "Luecken"]]

new_order = []
if "Luecken" in cats0:
    new_order.append("Luecken")
new_order += middle
if "Hao" in cats0:
    new_order.append("Hao")

adatas_plot.obs[plot_col] = adatas_plot.obs[plot_col].cat.reorder_categories(
    new_order, ordered=True
)

# Reordering the categories only changes colour/legend order: scanpy draws
# categorical points in the order the cells are stored. Sort the cells by category
# code so earlier categories (Luecken) are drawn first and the last one (Hao) ends
# up on top. The stable sort keeps the original cell order within each dataset.
_draw_order = adatas_plot.obs[plot_col].cat.codes.to_numpy().argsort(kind="stable")
adatas_plot = adatas_plot[_draw_order].copy()

# Align colours to categories present (in the new category order)
cats = list(adatas_plot.obs[plot_col].cat.categories)
adatas_plot.uns[f"{plot_col}_colors"] = [dataset_palette.get(c, fallback) for c in cats]

# Compute centroids (centroids will also follow category order).
# Only the first two embedding components are used; slicing keeps the two-column
# DataFrame construction valid when X_umap holds more than two components.
umap_coords = adatas_plot.obsm["X_umap"]
df = pd.DataFrame(umap_coords[:, :2], columns=["UMAP1", "UMAP2"], index=adatas_plot.obs_names)
df[plot_col] = adatas_plot.obs[plot_col].astype(str)

centroids = (
    df.groupby(plot_col)[["UMAP1", "UMAP2"]]
      .mean()
      .reindex(cats)
      .reset_index()
)

# 1-based number per category, following the forced category order
cat_to_num = {cat: i + 1 for i, cat in enumerate(cats)}

# ---------------------------
# Main plot
# ---------------------------
fig, ax = plt.subplots(figsize=(5.5, 4.5), dpi=300)

sc.pl.embedding(
    adatas_plot,
    basis="X_umap",
    color=plot_col,
    title="",
    legend_loc=None,
    add_outline=True,
    outline_width=(1, 0.1),
    size=1,
    frameon=False,
    show=False,
    ax=ax,
)

ax.set_aspect("auto")

# Centroid markers and numbers are still drawn, but alpha=0 and fontsize=0 are used
# so that they are not visible in the figure
for label, cx, cy in zip(centroids[plot_col], centroids["UMAP1"], centroids["UMAP2"]):
    ax.scatter(
        cx, cy,
        s=120,
        facecolor=dataset_palette.get(label, fallback),
        edgecolor="black",
        alpha=0,
        linewidth=0.7,
        zorder=10,
    )

    txt = ax.text(
        cx, cy, str(cat_to_num[label]),
        ha="center", va="center",
        fontsize=0, color="white", weight="bold", zorder=11,
    )
    txt.set_path_effects([PathEffects.withStroke(linewidth=1.1, foreground="black")])

plt.tight_layout(pad=0.2)

# Save (PNG + PDF, transparent)
plot_png = os.path.join(figures_path, "Final_merged_datasets_dataset_name_plot.png")
plot_pdf = os.path.join(figures_path, "Final_merged_datasets_dataset_name_plot.pdf")
for _target in (plot_png, plot_pdf):
    fig.savefig(_target, dpi=300, bbox_inches="tight", transparent=True)

plt.show()
plt.close(fig)

# ---------------------------
# Legend figure (only)
# ---------------------------
# One marker-only handle per category, labelled "<number>. <dataset>"
handles = [
    Line2D(
        [0], [0],
        marker='o',
        color='none',
        markerfacecolor=dataset_palette.get(cat, fallback),
        markeredgecolor='black',
        lw=0,
        markersize=6,
    )
    for cat in cats
]
labels = [f"{cat_to_num[cat]}. {cat}" for cat in cats]

fig_leg = plt.figure(figsize=(5.5, 1.2), dpi=300)
fig_leg.legend(
    handles,
    labels,
    loc='center',
    ncol=2,
    frameon=False,
    fontsize=6.0,
    handletextpad=0.6,
    columnspacing=1.0,
    labelspacing=0.4,
)
fig_leg.gca().axis('off')

legend_png = os.path.join(figures_path, "Final_merged_datasets_dataset_name_legend.png")
legend_pdf = os.path.join(figures_path, "Final_merged_datasets_dataset_name_legend.pdf")
for _target in (legend_png, legend_pdf):
    fig_leg.savefig(_target, dpi=300, bbox_inches="tight", transparent=True)
plt.close(fig_leg)

# ---------------------------
# Mapping CSV
# ---------------------------
# One row per category: its number, its name, and the colour it was drawn with
mapping_df = pd.DataFrame(
    {
        "Number": [cat_to_num[name] for name in cats],
        "dataset_name": cats,
        "Color": [dataset_palette.get(name, fallback) for name in cats],
    }
)
mapping_csv = os.path.join(figures_path, "Final_merged_datasets_dataset_name_mapping.csv")
mapping_df.to_csv(mapping_csv, index=False)

print("[INFO] Saved dataset_name plot, legend, and mapping:")
for _saved_path in (plot_png, plot_pdf, legend_png, legend_pdf, mapping_csv):
    print(" -", _saved_path)


In [None]:
import os
import pandas as pd

# ============================================================
# Permanently remove selected cell types (if present) BEFORE exporting h5ad
# ============================================================

# Cell types to remove
REMOVE_LABELS = {"Mesenchymal", "Platelet", "Macrophage", "ILC", "DnT"}

# Which annotation columns to try (first found will be used)
ANNOT_COL_CANDIDATES = [
    "Consensus_annotation_detailed_final",
    "Consensus_annotation_detailed_refined",
    "Consensus_annotation_detailed",
    "Consensus_annotation_simplified_final",
    "Consensus_annotation_simplified",
]

def remove_celltypes_if_exist(
    adata,
    remove_labels=REMOVE_LABELS,
    col_candidates=ANNOT_COL_CANDIDATES,
    dataset_name="adata",
):
    """
    Permanently drops cells whose annotation label is in remove_labels, using the
    first available annotation column in col_candidates.

    Args:
        adata: object exposing a pandas `.obs` DataFrame, boolean-mask indexing
            and `.copy()` (an AnnData in this notebook).
        remove_labels: iterable of labels whose cells are dropped.
        col_candidates: annotation column names in priority order; only the first
            one present in `.obs` is used to decide which cells to drop.
        dataset_name: label used in the printed log messages only.

    Returns:
        AnnData (possibly filtered). If filtered, this is a fresh .copy() (not a view).
        If no candidate column exists, or none of the labels occur, the input
        object itself is returned unchanged.
    """
    # pick first available annotation column
    ann_col = next((c for c in col_candidates if c in adata.obs.columns), None)
    if ann_col is None:
        print(f"[WARN] {dataset_name}: none of {col_candidates} found in .obs; skipping removal.")
        return adata

    remove_set = set(remove_labels)

    # object dtype so membership tests behave the same for categorical and plain columns
    s = adata.obs[ann_col].astype("object")
    present = sorted(set(s.dropna().unique()) & remove_set)

    if not present:
        print(f"[INFO] {dataset_name}: none of {sorted(remove_set)} present in '{ann_col}'. No filtering applied.")
        return adata

    mask_keep = ~s.isin(remove_set)
    removed_n = int((~mask_keep).sum())
    kept_n = int(mask_keep.sum())
    print(
        f"[INFO] {dataset_name}: permanently removing {removed_n} cells "
        f"(keeping {kept_n}) with labels {present} using column '{ann_col}'."
    )

    # slice + copy to ensure permanent removal downstream (no views)
    adata2 = adata[mask_keep].copy()

    # clean unused categories for any categorical annotation columns we care about.
    # isinstance(..., pd.CategoricalDtype) replaces the deprecated
    # pd.api.types.is_categorical_dtype helper.
    for col in col_candidates:
        if col in adata2.obs.columns and isinstance(adata2.obs[col].dtype, pd.CategoricalDtype):
            adata2.obs[col] = adata2.obs[col].cat.remove_unused_categories()

    return adata2

# -------------------
# Apply to each dataset (PERMANENT filtering)
# -------------------
# Each name is rebound to the returned object, so the removal persists in later cells
Zhang_dataset = remove_celltypes_if_exist(adata=Zhang_dataset, dataset_name="Zhang_dataset")
Hao_dataset = remove_celltypes_if_exist(adata=Hao_dataset, dataset_name="Hao_dataset")
Triana_dataset = remove_celltypes_if_exist(adata=Triana_dataset, dataset_name="Triana_dataset")
Luecken_dataset = remove_celltypes_if_exist(adata=Luecken_dataset, dataset_name="Luecken_dataset")

# -------------------
# Export (ensure dirs exist)
# -------------------
for _reference_name in ("Zhang", "Hao", "Triana", "Luecken"):
    os.makedirs(data_path + "/References/" + _reference_name, exist_ok=True)

def make_obs_h5ad_safe_no_nullable_strings(adata, dataset_name="adata"):
    """
    Rewrite string-like columns of adata.obs in place so they export cleanly to h5ad.

    For every column of adata.obs:
      - a string dtype that is not plain object is first cast to object;
      - an object column has each non-missing value converted with str(), while
        missing values become pd.NA;
      - that object column is then stored as a pandas Categorical.
    Columns of any other dtype are left unchanged.

    Args:
        adata: object exposing a pandas `.obs` DataFrame (modified in place).
        dataset_name: label used in the printed log message only.

    Returns:
        None.
    """
    def _as_plain_str(value):
        # Missing entries stay pd.NA; everything else becomes a plain Python str
        return pd.NA if value is pd.NA else str(value)

    for column in adata.obs.columns.tolist():
        original = adata.obs[column]

        # Nullable / non-object string dtypes are downgraded to object first
        if pd.api.types.is_string_dtype(original) and not pd.api.types.is_object_dtype(original):
            adata.obs[column] = original.astype("object")

        if not pd.api.types.is_object_dtype(adata.obs[column]):
            continue

        # Normalise every kind of missing value to pd.NA, then stringify the rest
        as_object = adata.obs[column]
        adata.obs[column] = as_object.where(pd.notna(as_object), pd.NA).map(_as_plain_str)
        # Categorical storage for compactness and stable export
        adata.obs[column] = pd.Categorical(adata.obs[column])

    print(f"[INFO] {dataset_name}: obs columns sanitized for h5ad export (no nullable strings).")

# (object, log label, output path relative to data_path) for each reference dataset
_export_targets = (
    (Zhang_dataset, "Zhang_dataset", "/References/Zhang/Zhang_adata_annotated.h5ad"),
    (Hao_dataset, "Hao_dataset", "/References/Hao/228AB_healthy_donors_PBMNCs_annotated.h5ad"),
    (Triana_dataset, "Triana_dataset", "/References/Triana/97AB_young_and_old_adult_healthy_donor_BMMNCs_annotated.h5ad"),
    (Luecken_dataset, "Luecken_dataset", "/References/Luecken/140AB_adult_healthy_donor_BMMNCs_annotated.h5ad"),
)

# Sanitize every .obs table first (in place), then write all files
for _adata, _label, _ in _export_targets:
    make_obs_h5ad_safe_no_nullable_strings(_adata, _label)

for _adata, _, _relative_path in _export_targets:
    _adata.write_h5ad(data_path + _relative_path)

print("[INFO] Export complete.")


In [None]:
import os
import scanpy as sc
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects

# ---------------------------
# Output dir (fallback if not preset)
# ---------------------------
try:
    figures_path
except NameError:
    # No earlier cell defined the output folder; use a local default
    figures_path = "./figures"
os.makedirs(figures_path, exist_ok=True)

# ---------- Custom palette ----------
# Fixed label -> hex colour; labels missing from this table use `fallback`
custom_palette = {
    "B Memory": "#68D827",
    "B Naive": "#1C511D",
    "CD14 Mono": "#D27CE3",
    "CD16 Mono": "#8D43CD",
    "CD4 T Memory": "#C1AF93",
    "CD4 T Naive": "#C99546",
    "CD8 T Memory": "#6B3317",
    "CD8 T Naive": "#4D382E",
    "ErP": "#D1235A",
    "Erythroblast": "#F30A1A",
    "GMP": "#C5E4FF",
    "HSC": "#0079ea",
    "MPP": "#79b6ac",
    "Immature B": "#91FF7B",
    "LMPP": "#17BECF",
    "MAIT": "#BCBD22",
    "Myeloid progenitor": "#AEC7E8",
    "NK CD56 bright": "#F3AC1F",
    "NK CD56 dim": "#FBEF0D",
    "Plasma": "#9DC012",
    "Pro-B": "#66BB6A",
    "Small": "#292929",
    "cDC1": "#76A7CB",
    "cDC2": "#16D2E3",
    "GdT": "#EDB416",
    "pDC": "#69FFCB",
    "CD4 CTL": "#D7D2CB",
    "MEP": "#E364B0",
    "Pre-B": "#2DBD67",
    "Pre-Pro-B": "#92AC8E",
    "EoBaMaP": "#728245",
    "MkP": "#69424D",
    "Stroma": "#727272",
    "Macrophage": "#5F4761",
    "ILC": "#F7CF94",
    "DnT": "#504423",
    "GdT_DnT": "#B07A2A",
}
fallback = "#BBBBBB"

# ---------- Column to plot ----------
col = "Consensus_annotation_detailed_final"

# ---------- Labels to remove permanently ----------
remove_labels = {"Mesenchymal", "Platelet", "Macrophage", "ILC", "DnT"}

# =============================================================================
# 0) PERMANENTLY DROP unwanted labels from ALL datasets used below
#    (and from adatas_final if we derive global numbering from it)
# =============================================================================
def drop_labels_permanently(adata, dataset_name: str, col: str, remove_labels: set):
    """
    Drop every cell whose label in `col` is one of `remove_labels`.

    Args:
        adata: object exposing a pandas `.obs` DataFrame, boolean-mask indexing
            and `.copy()` (an AnnData in this notebook).
        dataset_name: label used in log and error messages only.
        col: name of the `.obs` column holding the labels.
        remove_labels: labels whose cells are removed.

    Returns:
        A filtered `.copy()` with unused categories of `col` removed when at least
        one cell matched; otherwise the input object itself, unchanged.

    Raises:
        KeyError: if `col` is not a column of `adata.obs`.
    """
    if col not in adata.obs:
        raise KeyError(f"[{dataset_name}] Column '{col}' not found in .obs")

    # object dtype so membership tests behave the same for categorical and plain columns
    s = adata.obs[col].astype("object")
    mask_keep = ~s.isin(remove_labels)
    removed_n = int((~mask_keep).sum())

    if removed_n > 0:
        present = sorted(set(s.dropna().unique()) & set(remove_labels))
        print(
            f"[INFO] {dataset_name}: permanently removing {removed_n} cells with labels {present} "
            f"using column '{col}'."
        )
        adata2 = adata[mask_keep].copy()
        # clean unused categories.
        # isinstance(..., pd.CategoricalDtype) replaces the deprecated
        # pd.api.types.is_categorical_dtype helper.
        if isinstance(adata2.obs[col].dtype, pd.CategoricalDtype):
            adata2.obs[col] = adata2.obs[col].cat.remove_unused_categories()
        return adata2

    print(f"[INFO] {dataset_name}: none of {sorted(remove_labels)} present in '{col}'. No filtering applied.")
    return adata

# Permanently filter the per-dataset objects (each name is rebound to the returned object)
Zhang_dataset = drop_labels_permanently(
    adata=Zhang_dataset, dataset_name="Zhang_dataset", col=col, remove_labels=remove_labels
)
Luecken_dataset = drop_labels_permanently(
    adata=Luecken_dataset, dataset_name="Luecken_dataset", col=col, remove_labels=remove_labels
)
Triana_dataset = drop_labels_permanently(
    adata=Triana_dataset, dataset_name="Triana_dataset", col=col, remove_labels=remove_labels
)
Hao_dataset = drop_labels_permanently(
    adata=Hao_dataset, dataset_name="Hao_dataset", col=col, remove_labels=remove_labels
)

# If we will derive global numbering from the integrated object, filter it permanently as well.
# That is only the case when the numbering variables are not both defined already.
if not ("cats_global" in globals() and "cat_to_num_global" in globals()) and "adatas_final" in globals():
    adatas_final = drop_labels_permanently(
        adata=adatas_final, dataset_name="adatas_final", col=col, remove_labels=remove_labels
    )

# =============================================================================
# GLOBAL NUMBERING: reuse `cats_global` / `cat_to_num_global` when both already
# exist (e.g. from the integrated plot cell), so every per-dataset plot shares
# the numbering of the integrated plot. Otherwise rebuild both from `adatas_final`.
# =============================================================================
if not ("cats_global" in globals() and "cat_to_num_global" in globals()):
    # Rebuilding needs the integrated object and its annotation column
    if "adatas_final" not in globals():
        raise NameError(
            "Expected `cats_global`/`cat_to_num_global` from the integrated plot, or `adatas_final` to derive them."
        )
    if col not in adatas_final.obs:
        raise KeyError(f"[adatas_final] Column '{col}' not found in .obs")

    _plot_col_global = f"{col}_plot"
    adatas_final.obs[_plot_col_global] = pd.Categorical(
        adatas_final.obs[col].astype("object").astype(str)
    )
    cats_global = adatas_final.obs[_plot_col_global].cat.categories.tolist()
    # 1-based number per category, following category order
    cat_to_num_global = dict(zip(cats_global, range(1, len(cats_global) + 1)))

    print(f"[INFO] Built global numbering from adatas_final (n_categories={len(cats_global)}).")

def plot_numbered_umap(
    adata,
    dataset_name: str,
    basis_key: str,
    *,
    col: str = col,
    figures_path: str = figures_path,
    custom_palette: dict = custom_palette,
    fallback: str = fallback,
    cats_global: list = cats_global,
    cat_to_num_global: dict = cat_to_num_global,
    figsize=(4, 3.5),
    dpi=300,
):
    """Plot one dataset's embedding coloured by `col`, with globally numbered centroid markers.

    The label column is copied to ``f"{col}_plot"`` as an ordered categorical using
    `cats_global`, so colours and numbers are consistent across datasets. A numbered
    circle is drawn at the mean embedding position of every category present, and the
    figure is saved as PNG and PDF under `figures_path`.

    Parameters
    ----------
    adata
        Object exposing `.obs`, `.obsm` and `.uns` (used as an AnnData).
    dataset_name
        Used in messages and as the output file-name prefix.
    basis_key
        Key in `adata.obsm` holding the embedding; only the first two columns are used.
    col
        Name of the `.obs` column with the labels.
    figures_path
        Output directory; created if it does not exist.
    custom_palette
        Mapping label -> colour; labels missing from it use `fallback`.
    fallback
        Colour for labels absent from `custom_palette`.
    cats_global
        Category order shared by all plots.
    cat_to_num_global
        Mapping label -> number printed inside the centroid marker.
    figsize, dpi
        Passed to `plt.subplots`; `dpi` is also used when saving.

    Raises
    ------
    KeyError
        If `col` is missing from `.obs` or `basis_key` from `.obsm`.

    Notes
    -----
    * The keyword defaults are captured from the notebook globals when this cell
      is executed; re-run the cell after changing them.
    * The plot column is written into the caller's `adata.obs`. If cells with
      labels outside `cats_global` are found, plotting continues on a filtered
      copy; otherwise the colour list is also written to the caller's `adata.uns`.
    """
    # Imported here so the function does not rely on another cell having bound `PathEffects`.
    import matplotlib.patheffects as PathEffects

    if col not in adata.obs:
        raise KeyError(f"[{dataset_name}] Column '{col}' not found in .obs")

    # ---- Ensure embedding exists
    if basis_key not in adata.obsm_keys():
        raise KeyError(f"[{dataset_name}] Embedding '{basis_key}' not found in .obsm")

    # ---- Plot column with GLOBAL category order
    plot_col = f"{col}_plot"

    adata.obs[plot_col] = adata.obs[col].astype("object").astype(str)
    adata.obs[plot_col] = pd.Categorical(adata.obs[plot_col], categories=cats_global, ordered=True)

    # ---- Drop cells not in global set (keeps numbering stable)
    unknown_mask = adata.obs[plot_col].isna()
    n_unknown = int(unknown_mask.sum())
    if n_unknown > 0:
        unknown_vals = sorted(set(adata.obs[col].astype("object").astype(str)[unknown_mask].unique().tolist()))
        print(
            f"[WARN] {dataset_name}: dropping {n_unknown} cells with labels not in global set "
            f"(sample: {unknown_vals[:10]})"
        )
        adata = adata[~unknown_mask].copy()
        # Subsetting an AnnData removes unused categories by default, which would
        # shorten the category list and shift the positional colour mapping below.
        # Re-impose the global category set on the filtered copy.
        adata.obs[plot_col] = adata.obs[plot_col].cat.set_categories(cats_global, ordered=True)

    # ---- Color alignment: one colour per category actually carried by the column,
    #      in the column's own order, so the list can never be out of step with it.
    adata.uns[f"{plot_col}_colors"] = [
        custom_palette.get(cat, fallback) for cat in adata.obs[plot_col].cat.categories
    ]

    # ---- Centroids (GLOBAL order; categories absent from this dataset are dropped)
    coords = adata.obsm[basis_key][:, :2]
    df = pd.DataFrame(coords, columns=["UMAP1", "UMAP2"], index=adata.obs_names)
    df[plot_col] = adata.obs[plot_col].astype(str)

    centroids = (
        df.groupby(plot_col)[["UMAP1", "UMAP2"]]
          .mean()
          .reindex(cats_global)
          .dropna()  # no cells (or no finite coordinates) for this category -> no marker
    )

    # ---- Plot
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)

    sc.pl.embedding(
        adata,
        color=plot_col,
        basis=basis_key,
        title="",
        legend_loc=None,
        outline_width=(0.8, 0.05),
        add_outline=True,
        size=3,
        frameon=False,
        show=False,
        ax=ax,
    )

    ax.set_aspect("auto")

    # ---- Numbered centroid circles (GLOBAL numbering)
    for label, x, y in centroids.itertuples(name=None):
        color = custom_palette.get(label, fallback)
        num = cat_to_num_global[label]

        ax.scatter(
            x, y,
            s=160, facecolor=color, edgecolor="black",
            alpha=0.6, linewidth=1, zorder=10
        )

        txt = ax.text(
            x, y, str(num),
            ha="center", va="center",
            fontsize=6.5, color="white", weight="bold", zorder=11
        )
        # Thin black stroke keeps the white number readable on light marker colours.
        txt.set_path_effects([PathEffects.withStroke(linewidth=1.1, foreground="black")])

    fig.tight_layout(pad=0.2)

    # ---- Save
    if figures_path:
        # Avoid a failed save on a fresh checkout where the folder is missing.
        os.makedirs(figures_path, exist_ok=True)
    out_png = os.path.join(figures_path, f"{dataset_name}_annotation_plot.png")
    out_pdf = os.path.join(figures_path, f"{dataset_name}_annotation_plot.pdf")
    fig.savefig(out_png, dpi=dpi, bbox_inches="tight", transparent=True)
    fig.savefig(out_pdf, dpi=dpi, bbox_inches="tight", transparent=True)

    plt.show()
    plt.close(fig)

    print(f"[INFO] Saved {dataset_name}:")
    print(" -", out_png)
    print(" -", out_pdf)

# =============================================================================
# Render the numbered plot for every dataset, each on its own embedding key
# =============================================================================
for _dataset, _dataset_name, _basis_key in (
    (Zhang_dataset,   "Zhang_dataset",   "X_umap"),
    (Luecken_dataset, "Luecken_dataset", "X_umap"),
    (Triana_dataset,  "Triana_dataset",  "X_mofaumap"),
    (Hao_dataset,     "Hao_dataset",     "X_wnn.umap"),
):
    plot_numbered_umap(_dataset, dataset_name=_dataset_name, basis_key=_basis_key)
