In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata as ad
from sfaira.consts.ontologies import OntologyContainerSfaira



In [None]:
# Load the atlas object that carries the updated manual annotation.
adata = sc.read_h5ad(
    "/storage/data/organoid_atlas_adatas/230620_10_updated_manual_annotation.h5ad"
)

# Add final metadata annotation updates

In [None]:
# Revision metadata table (tab-separated); its first column is used as the row index.
updated_labels = pd.read_csv(
    "/storage/data/2405_revision/new_metadata/metadata_240523.tsv.gz",
    sep="\t",
    index_col=0,
    low_memory=False,
)

# Columns of `updated_labels` that are copied into `adata.obs` further down.
newfields = [
    "CorrDiff_max_Braun_Bhaduri",
    "CorrDiff_top5mean_Braun_Bhaduri",
    "GO_canonical_glycolysis",
    "GO_canonical_glycolysis_unscaled",
    "GO_mitochondrial_ATP_synthesis_coupled_electron_transport",
    "Hallmark_Glycolysis",
    "Hallmark_Hypoxia",
    "Hallmark_Oxidative_Phosphorylation",
    "annot_level_3_rev2",
    "annot_level_4_rev2",
    "annot_ntt_rev2",
    "annot_region_rev2",
    "organoid_age_months",
    "organoid_age_weeks",
    "publication_protocol",
]

In [None]:
# (column, old value, new value) substitutions applied to adata.obs, one after the other.
obs_value_fixes = [
    # Update publication details for Atamian et al. 2024 which has now been published
    ("publication", "Quadrato, 2023", "Atamian, 2024"),
    ("doi", "no_doi_quadrato", "10.1016/j.stem.2023.11.013"),
    # Clean up obs columns: empty strings become "unknown"
    ("state_exact", "", "unknown"),
    ("treatment", "", "unknown"),
    # these samples have wildtype MECP2 expression and should not be labelled as diseased
    ("disease", "Rett syndrome", "unknown"),
]

for column, old_value, new_value in obs_value_fixes:
    adata.obs[column] = adata.obs[column].replace(old_value, new_value)

In [None]:
# Copy the revision columns into adata.obs one by one; each column is aligned on the obs index.
# The selection is done up front so that a missing column fails before anything is written.
revision_columns = updated_labels[newfields]
for field in newfields:
    adata.obs[field] = revision_columns[field]

In [None]:
# Final column order of adata.obs, written group by group.
new_obs_order = [
    # sample, donor and protocol metadata
    "assay_differentiation", "assay_sc", "assay_sc_original", "assay_type_differentiation",
    "bio_sample", "cell_line", "cell_line_original", "cell_type", "cell_type_original",
    "development_stage", "development_stage_original", "disease", "disease_original",
    "ethnicity", "ethnicity_original", "gm", "id", "individual", "organ", "organ_original",
    "organism", "organism_original", "sample_source", "sex", "sex_original", "source_doi",
    "state_exact", "suspension_type", "suspension_type_original", "tech_sample", "treatment",
    "organoid_age_days", "organoid_age_weeks", "organoid_age_months",
    "publication", "publication_protocol", "doi", "batch",
    # per-cell count statistics
    "n_genes_by_counts", "log1p_n_genes_by_counts",
    "total_counts", "log1p_total_counts",
    "total_counts_mt", "log1p_total_counts_mt", "pct_counts_mt",
    # leiden / snapseed columns of the two PCA representations
    "leiden_pca_unintegrated_1", "leiden_pca_unintegrated_80",
    "leiden_pca_rss_1", "leiden_pca_rss_80",
    *[
        f"snapseed_pca_{representation}_level_{level}"
        for representation in ("unintegrated", "rss")
        for level in ("1", "2", "3", "4", "5", "12", "123", "1234", "12345")
    ],
    # leiden / snapseed columns of the scPoli representation
    "leiden_scpoli_1", "leiden_scpoli_80",
    *[
        f"snapseed_scpoli_level_{level}"
        for level in ("1", "2", "3", "4", "5", "12", "123", "1234", "12345")
    ],
    # *_raw columns
    "ECM_raw", "ROCK_inhibitor_raw", "BMP_activator_raw", "TGF_B_activator_raw",
    "TGF_B_inhibitor_raw", "BMP_inhibitor_raw", "WNT_activator_raw", "WNT_inhibitor_raw",
    "EGF_raw", "FGF2_raw", "FGF8_raw", "SHH_agonist_raw", "RA_raw",
    "MEK_ERK_inhibitor_raw", "Notch_inhibitor_raw",
    # annotation columns
    "manual_annot_ct", "manual_annot_ct_fine", "manual_annot_region",
    "annot_level_1", "annot_level_2",
    "annot_level_3", "annot_level_3_rev2",
    "annot_level_4", "annot_level_4_rev2",
    "annot_region", "annot_region_rev2",
    "annot_ntt_rev2",
    # CorrDiff_*, GO_* and Hallmark_* columns
    "CorrDiff_max_Braun_Bhaduri", "CorrDiff_top5mean_Braun_Bhaduri",
    "GO_canonical_glycolysis", "GO_canonical_glycolysis_unscaled",
    "GO_mitochondrial_ATP_synthesis_coupled_electron_transport",
    "Hallmark_Glycolysis", "Hallmark_Hypoxia", "Hallmark_Oxidative_Phosphorylation",
]

In [None]:
# Keep exactly the columns named in new_obs_order, in that order (a missing column raises KeyError).
adata.obs = adata.obs.loc[:, new_obs_order].copy()

In [None]:
# Show the AnnData summary after the obs columns were reordered.
adata

In [None]:
# Use a copy of the "lognorm" layer as the main matrix X.
adata.X = adata.layers["lognorm"].copy()

In [None]:
# write full object for Zenodo (all metadata columns, gzip-compressed)
adata.write(
    "/storage/data/final_adata_object_export/hnoca_allmeta.h5ad",
    compression="gzip",
)

# Clean adata for ArchMap and CellxGene

In [None]:
# Whitelists for the cleaned export: one list per AnnData slot, naming the entries that are kept.

# obs columns. Relative to new_obs_order this leaves out source_doi, organoid_age_weeks,
# organoid_age_months, publication_protocol, every leiden_* and snapseed_* column, the *_raw
# columns, the manual_annot_* columns, the un-suffixed annot_level_3 / annot_level_4 /
# annot_region columns, and all CorrDiff_*, GO_* and Hallmark_* columns except Hallmark_Glycolysis.
filtered_obs = [
    "assay_differentiation", "assay_sc", "assay_sc_original", "assay_type_differentiation",
    "bio_sample", "cell_line", "cell_line_original", "cell_type", "cell_type_original",
    "development_stage", "development_stage_original", "disease", "disease_original",
    "ethnicity", "ethnicity_original", "gm", "id", "individual", "organ", "organ_original",
    "organism", "organism_original", "sample_source", "sex", "sex_original",
    "state_exact", "suspension_type", "suspension_type_original", "tech_sample", "treatment",
    "organoid_age_days",
    "publication", "doi", "batch",
    "n_genes_by_counts", "log1p_n_genes_by_counts",
    "total_counts", "log1p_total_counts",
    "total_counts_mt", "log1p_total_counts_mt", "pct_counts_mt",
    "annot_level_1", "annot_level_2", "annot_level_3_rev2", "annot_level_4_rev2",
    "annot_region_rev2", "annot_ntt_rev2",
    "Hallmark_Glycolysis",
]

# var columns
filtered_var = [
    "ensembl", "gene_symbol", "mt",
    "n_cells_by_counts", "mean_counts", "log1p_mean_counts", "pct_dropout_by_counts",
    "total_counts", "log1p_total_counts",
    "gene_length",
    "highly_variable", "highly_variable_rank",
    "means", "variances", "variances_norm",
    "highly_variable_nbatches",
]

# obsm: only the scPoli embedding and its UMAP
# (not kept: the X_benchmark_* keys, X_pca_rss, X_pca_unintegrated, X_rss and the other X_umap_* keys)
filtered_obsm = ["X_scpoli", "X_umap_scpoli"]

# obsp: only the scPoli kNN graph (not kept: knn_pca_rss_* and knn_pca_unintegrated_*)
filtered_obsp = ["knn_scpoli_connectivities", "knn_scpoli_distances"]

# uns (not kept: knn_pca_rss and knn_pca_unintegrated)
filtered_uns = ["hvg", "knn_scpoli", "log1p"]

# layers (not kept: counts and lognorm)
filtered_layers = ["counts_lengthnorm"]

In [None]:
# Restrict obs and var to the whitelisted columns.
adata.obs = adata.obs.loc[:, filtered_obs].copy()
adata.var = adata.var.loc[:, filtered_var].copy()

# uns, obsm, obsp and layers are pruned in place: every key missing from its whitelist is deleted.
for slot, keep in (
    (adata.uns, filtered_uns),
    (adata.obsm, filtered_obsm),
    (adata.obsp, filtered_obsp),
    (adata.layers, filtered_layers),
):
    # collect the keys first so the mapping is not modified while it is being iterated
    for key in [name for name in slot.keys() if name not in keep]:
        del slot[key]

In [None]:
# Show the AnnData summary after all slots were restricted to the whitelists.
adata

In [None]:
# Cleaned object, all cells and all genes.
adata.write(
    "/storage/data/final_adata_object_export/hnoca_cleanedmeta.h5ad",
    compression="gzip",
)

In [None]:
# Cleaned object without the cells whose publication is "Treutlein, 2023".
is_public = adata.obs["publication"] != "Treutlein, 2023"
adata[is_public].write(
    "/storage/data/final_adata_object_export/hnoca_publiconly_cleanedmeta.h5ad",
    compression="gzip",
)

In [None]:
# Cleaned object restricted to the genes flagged in var["highly_variable"].
is_hvg = adata.var["highly_variable"]
adata[:, is_hvg].write(
    "/storage/data/final_adata_object_export/hnoca_cleanedmeta_hvg.h5ad",
    compression="gzip",
)

In [None]:
# Both restrictions at once: no "Treutlein, 2023" cells, highly variable genes only.
is_public = adata.obs["publication"] != "Treutlein, 2023"
is_hvg = adata.var["highly_variable"]
adata[is_public, is_hvg].write(
    "/storage/data/final_adata_object_export/hnoca_publiconly_cleanedmeta_hvg.h5ad",
    compression="gzip",
)

## CellxGene

In [None]:
# Start the CellxGene conversion from the cleaned object written above.
adata = sc.read_h5ad(
    "/storage/data/final_adata_object_export/hnoca_cleanedmeta.h5ad"
)

In [None]:
# sfaira ontology container; its convert_to_id() methods are used below to turn labels into ontology term IDs.
oc = OntologyContainerSfaira()

In [None]:
# remove the duplicated indices from the fiorenzano standard organoid sample (subset anndata for faster computation of duplicates)
silk_sample = "homosapiens_midbraintegmentum_2021_10x3v3_fiorenzanoalessandro_001_d10_1038_s41467_021_27464_509silk"
standard_sample = "homosapiens_midbraintegmentum_2021_10x3v3_fiorenzanoalessandro_001_d10_1038_s41467_021_27464_55standardorgday30"

a = adata[adata.obs["tech_sample"].isin([silk_sample, standard_sample])].copy()

# One hash per cell, computed over its complete expression row (the index is not hashed).
# `.toarray()` is used instead of the `.A` shorthand because `.A` is not guaranteed to exist on
# every SciPy sparse container, whereas `.toarray()` is the documented densification method.
row_hashes = pd.util.hash_pandas_object(
    pd.DataFrame(a.X.toarray(), index=a.obs_names, columns=a.var_names),
    index=False,
)

# keep=False flags every member of a group of identical rows; of those, only the cells that
# belong to the standard organoid sample are collected for removal.
dups = a.obs_names[
    row_hashes.duplicated(keep=False) & (a.obs["tech_sample"] == standard_sample)
].tolist()

dups += ["homosapiens_midbraintegmentum_2021_10x3v3_fiorenzanoalessandro_001_d10_1038_s41467_021_27464_5_16549"] # additional duplicate arising after cellxgene processing

adata = adata[~adata.obs_names.isin(dups)].copy()

In [None]:
# remove 'original' metadata columns apart from celltype
original_columns = [
    column
    for column in adata.obs_keys()
    if column.endswith("_original") and column != "cell_type_original"
]
for column in original_columns:
    del adata.obs[column]

In [None]:
# add cellxgene-specific .uns metadata
cellxgene_uns = {
    "title": "The Human Neural Organoid Atlas",
    "batch_condition": "batch",
    "default_embedding": "X_umap_scpoli",
}
for uns_key, uns_value in cellxgene_uns.items():
    adata.uns[uns_key] = uns_value

In [None]:
# var index should be Ensembl ID
# The values of the "ensembl" column become the new var index; the column itself is kept.
adata.var.index = adata.var["ensembl"].tolist()

In [None]:
# store raw counts in raw.X
# The "counts_lengthnorm" layer is rounded and attached as .raw, then removed from .layers.
adata.raw = ad.AnnData(
    X=np.round(adata.layers["counts_lengthnorm"]),
    var=adata.var,
    obs=adata.obs,
)
del adata.layers["counts_lengthnorm"]

In [None]:
# Replace the label columns by the *_ontology_term_id columns of the CellxGene export.
# Every source column is deleted right after its replacement has been created.
n_cells = adata.shape[0]

# Constant columns: the same value for every cell
adata.obs["organism_ontology_term_id"] = pd.Categorical(["NCBITaxon:9606"] * n_cells)
del adata.obs["organism"]

adata.obs["tissue_type"] = pd.Categorical(["organoid"] * n_cells)
del adata.obs["sample_source"]

# Sex: only "male" and "female" are translated, every other value is left as it is
sex_term_ids = {"male": "PATO:0000384", "female": "PATO:0000383"}
adata.obs["sex_ontology_term_id"] = adata.obs["sex"].replace(sex_term_ids)
del adata.obs["sex"]

# donor_id is a copy of the cell line; "cell_line" itself stays in obs
adata.obs["donor_id"] = adata.obs["cell_line"].copy()

# Label -> ontology ID lookups are built from the categories of each column.
assay_term_ids = {
    label: oc.assay_sc.convert_to_id(label)
    for label in adata.obs["assay_sc"].cat.categories
}
adata.obs["assay_ontology_term_id"] = adata.obs["assay_sc"].replace(assay_term_ids)
del adata.obs["assay_sc"]

# "unknown" is not converted and stays "unknown"
ethnicity_term_ids = {
    label: oc.ethnicity['Homo sapiens'].convert_to_id(label)
    for label in adata.obs["ethnicity"].cat.categories
    if label != "unknown"
}
adata.obs["self_reported_ethnicity_ontology_term_id"] = adata.obs["ethnicity"].replace(ethnicity_term_ids)
del adata.obs["ethnicity"]

organ_term_ids = {
    label: oc.organ.convert_to_id(label)
    for label in adata.obs["organ"].cat.categories
}
adata.obs["tissue_ontology_term_id"] = adata.obs["organ"].replace(organ_term_ids)
del adata.obs["organ"]

# Disease: "unknown" is first relabelled "healthy", then the labels are converted to IDs.
# NOTE(review): "healthy" only becomes an ID if it is already one of the disease categories - confirm.
disease_term_ids = {
    label: oc.disease.convert_to_id(label)
    for label in adata.obs["disease"].cat.categories
    if label != "unknown"
}
adata.obs["disease_ontology_term_id"] = (
    adata.obs["disease"].replace({"unknown": "healthy"}).replace(disease_term_ids)
)
del adata.obs["disease"]

stage_term_ids = {
    label: oc.development_stage['Homo sapiens'].convert_to_id(label)
    for label in adata.obs["development_stage"].cat.categories
    if label != "unknown"
}
adata.obs["development_stage_ontology_term_id"] = adata.obs["development_stage"].replace(stage_term_ids)
del adata.obs["development_stage"]

cell_type_term_ids = {
    label: oc.cell_type.convert_to_id(label)
    for label in adata.obs["cell_type"].cat.categories
    if label != "unknown"
}
adata.obs["cell_type_ontology_term_id"] = adata.obs["cell_type"].replace(cell_type_term_ids)
del adata.obs["cell_type"]

In [None]:
# CellxGene-formatted object, all cells.
adata.write(
    "/storage/data/final_adata_object_export/hnoca_cellxgene.h5ad",
    compression="gzip",
)

In [None]:
# CellxGene-formatted object without the cells whose publication is "Treutlein, 2023".
is_public = adata.obs["publication"] != "Treutlein, 2023"
adata[is_public].write(
    "/storage/data/final_adata_object_export/hnoca_publiconly_cellxgene.h5ad",
    compression="gzip",
)

# HNOCA-Extended

In [None]:
# Reload the cleaned core object as the starting point for the extended atlas.
adata = sc.read(
    "/storage/data/final_adata_object_export/hnoca_cleanedmeta.h5ad"
)

In [None]:
# Export the gene table (var index plus the "ensembl" and "gene_symbol" columns) as CSV.
feature_table = adata.var[["ensembl", "gene_symbol"]]
feature_table.to_csv("/storage/data/2404_revision/hnoca_features.csv")

In [None]:
# Whitelists for the extended object.

# obs columns: the per-cell count statistics (n_genes_by_counts, total_counts, pct_counts_mt, ...)
# are not kept here, and the "hnoca_core" flag is added.
filtered_obs = [
    "assay_differentiation", "assay_sc", "assay_sc_original", "assay_type_differentiation",
    "bio_sample", "cell_line", "cell_line_original", "cell_type", "cell_type_original",
    "development_stage", "development_stage_original", "disease", "disease_original",
    "ethnicity", "ethnicity_original", "gm", "id", "individual", "organ", "organ_original",
    "organism", "organism_original", "sample_source", "sex", "sex_original",
    "state_exact", "suspension_type", "suspension_type_original", "tech_sample", "treatment",
    "organoid_age_days",
    "publication", "doi", "batch",
    "annot_level_1", "annot_level_2", "annot_level_3_rev2", "annot_level_4_rev2",
    "annot_region_rev2", "annot_ntt_rev2",
    "Hallmark_Glycolysis",
    "hnoca_core",
]

# var columns: identifiers, gene length and the highly-variable-gene columns only
# (not kept: mt, the *_counts statistics, means, variances, variances_norm)
filtered_var = [
    "ensembl", "gene_symbol",
    "gene_length",
    "highly_variable", "highly_variable_rank", "highly_variable_nbatches",
]

In [None]:
# Flag every cell of the core object; "hnoca_core" is one of the obs columns kept for the extended object.
adata.obs["hnoca_core"] = True

In [None]:
# Stack the core object and the query object (hnoca_ce_query_allgenes.h5ad) along the cell axis.
# join="outer" keeps the union of the variables of both objects.
adata_extended = ad.concat(
    [
        adata,
        ad.read_h5ad("/storage/data/2404_revision/ce_output/hnoca_ce_query_allgenes.h5ad"),
    ],
    join="outer",
)

In [None]:
# Keep only the whitelisted obs columns of the concatenated object.
adata_extended.obs = adata_extended.obs[filtered_obs].copy()
# The var annotation is taken from the core object `adata`, not from `adata_extended` itself.
# NOTE(review): assumes the outer join left the variables identical to, and ordered like, adata.var - confirm.
adata_extended.var = adata.var[filtered_var].copy()
# Remove everything stored in obsm.
del adata_extended.obsm

In [None]:
# save HVG object without embeddings for mapping
(
    adata_extended[:, adata_extended.var["highly_variable"]]
    .copy()
    .write(
        "/storage/data/final_adata_object_export/hnoca_ce_hvg_noemb.h5ad",
        compression="gzip",
    )
)

In [None]:
# add embeddings and celltype predictions from HVG mapped object
embeddings = ad.read_h5ad("/storage/data/2404_revision/ce_output/hnoca_community_joint.h5ad")

# NOTE(review): the graphs and embeddings are copied as whole arrays, so this assumes that
# `embeddings` holds the same cells in the same order as `adata_extended` - confirm.
for graph_key in ("knn_scpoli_connectivities", "knn_scpoli_distances"):
    adata_extended.obsp[graph_key] = embeddings.obsp[graph_key].copy()

for embedding_key in ("X_scpoli", "X_umap_scpoli"):
    adata_extended.obsm[embedding_key] = embeddings.obsm[embedding_key].copy()

# Start from annot_level_2; cells that have no value there receive their annot_level_2_query value (as str).
adata_extended.obs["annot_level_2_extended"] = embeddings.obs["annot_level_2"]
lacks_label = adata_extended.obs["annot_level_2_extended"].isna()
adata_extended.obs.loc[lacks_label, "annot_level_2_extended"] = (
    embeddings.obs.loc[lacks_label, "annot_level_2_query"].astype(str)
)

In [None]:
# Show the AnnData summary of the extended object after embeddings and labels were added.
adata_extended

In [None]:
# Extended object, all cells and all genes.
adata_extended.write(
    "/storage/data/final_adata_object_export/hnoca_extended.h5ad",
    compression="gzip",
)

In [None]:
# Extended object without the cells whose publication is "Treutlein, 2023".
is_public = adata_extended.obs["publication"] != "Treutlein, 2023"
adata_extended[is_public].write(
    "/storage/data/final_adata_object_export/hnoca_publiconly_extended.h5ad",
    compression="gzip",
)

In [None]:
# Extended object restricted to the genes flagged in var["highly_variable"].
is_hvg = adata_extended.var["highly_variable"]
adata_extended[:, is_hvg].write(
    "/storage/data/final_adata_object_export/hnoca_extended_hvg.h5ad",
    compression="gzip",
)

In [None]:
# Both restrictions at once: no "Treutlein, 2023" cells, highly variable genes only.
is_public = adata_extended.obs["publication"] != "Treutlein, 2023"
is_hvg = adata_extended.var["highly_variable"]
adata_extended[is_public, is_hvg].write(
    "/storage/data/final_adata_object_export/hnoca_publiconly_extended_hvg.h5ad",
    compression="gzip",
)

## CellxGene

In [None]:
# sfaira ontology container; its convert_to_id() methods are used below to turn labels into ontology term IDs.
oc = OntologyContainerSfaira()

In [None]:
# remove the duplicated indices found in the fiorenzano standard organoid sample (subset anndata for faster computation of duplicates)
silk_sample = "homosapiens_midbraintegmentum_2021_10x3v3_fiorenzanoalessandro_001_d10_1038_s41467_021_27464_509silk"
standard_sample = "homosapiens_midbraintegmentum_2021_10x3v3_fiorenzanoalessandro_001_d10_1038_s41467_021_27464_55standardorgday30"

a = adata_extended[adata_extended.obs["tech_sample"].isin([silk_sample, standard_sample])].copy()

# One hash per cell, computed over its complete expression row (the index is not hashed).
# `.toarray()` is used instead of the `.A` shorthand because `.A` is not guaranteed to exist on
# every SciPy sparse container, whereas `.toarray()` is the documented densification method.
row_hashes = pd.util.hash_pandas_object(
    pd.DataFrame(a.X.toarray(), index=a.obs_names, columns=a.var_names),
    index=False,
)

# keep=False flags every member of a group of identical rows; of those, only the cells that
# belong to the standard organoid sample are collected for removal.
# `.tolist()` is required: without it `dups` is a pandas Index and the `+=` below performs
# element-wise Index arithmetic instead of appending, so no cell name would match afterwards.
dups = a.obs_names[
    row_hashes.duplicated(keep=False) & (a.obs["tech_sample"] == standard_sample)
].tolist()

dups += ["homosapiens_midbraintegmentum_2021_10x3v3_fiorenzanoalessandro_001_d10_1038_s41467_021_27464_5_16549"] # additional duplicate arising after cellxgene processing

adata_extended = adata_extended[~adata_extended.obs_names.isin(dups)].copy()

In [None]:
# remove 'original' metadata columns apart from celltype
original_columns = [
    column
    for column in adata_extended.obs_keys()
    if column.endswith("_original") and column != "cell_type_original"
]
for column in original_columns:
    del adata_extended.obs[column]

In [None]:
# add cellxgene-specific .uns metadata
cellxgene_uns = {
    "title": "HNOCA Extended: The Human Neural Organoid Atlas",
    "batch_condition": "batch",
    "default_embedding": "X_umap_scpoli",
}
for uns_key, uns_value in cellxgene_uns.items():
    adata_extended.uns[uns_key] = uns_value

In [None]:
# var index should be Ensembl ID
# The values of the "ensembl" column become the new var index; the column itself is kept.
adata_extended.var.index = adata_extended.var["ensembl"].tolist()

In [None]:
# store raw counts in raw.X
# The "counts_lengthnorm" layer is rounded and attached as .raw, then removed from .layers.
adata_extended.raw = ad.AnnData(
    X=np.round(adata_extended.layers["counts_lengthnorm"]),
    var=adata_extended.var,
    obs=adata_extended.obs,
)
del adata_extended.layers["counts_lengthnorm"]

In [None]:
# Replace the label columns by the *_ontology_term_id columns of the CellxGene export.
# Every source column is deleted right after its replacement has been created.
n_cells = adata_extended.shape[0]

# Constant columns: the same value for every cell
adata_extended.obs["organism_ontology_term_id"] = pd.Categorical(["NCBITaxon:9606"] * n_cells)
del adata_extended.obs["organism"]

adata_extended.obs["tissue_type"] = pd.Categorical(["organoid"] * n_cells)
del adata_extended.obs["sample_source"]

# Sex: only "male" and "female" are translated, every other value is left as it is
sex_term_ids = {"male": "PATO:0000384", "female": "PATO:0000383"}
adata_extended.obs["sex_ontology_term_id"] = adata_extended.obs["sex"].replace(sex_term_ids)
del adata_extended.obs["sex"]

# donor_id is a copy of the cell line; "cell_line" itself stays in obs
adata_extended.obs["donor_id"] = adata_extended.obs["cell_line"].copy()

# Label -> ontology ID lookups are built from the categories of each column.
assay_term_ids = {
    label: oc.assay_sc.convert_to_id(label)
    for label in adata_extended.obs["assay_sc"].cat.categories
}
adata_extended.obs["assay_ontology_term_id"] = adata_extended.obs["assay_sc"].replace(assay_term_ids)
del adata_extended.obs["assay_sc"]

# "unknown" is not converted and stays "unknown"
ethnicity_term_ids = {
    label: oc.ethnicity['Homo sapiens'].convert_to_id(label)
    for label in adata_extended.obs["ethnicity"].cat.categories
    if label != "unknown"
}
adata_extended.obs["self_reported_ethnicity_ontology_term_id"] = adata_extended.obs["ethnicity"].replace(ethnicity_term_ids)
del adata_extended.obs["ethnicity"]

organ_term_ids = {
    label: oc.organ.convert_to_id(label)
    for label in adata_extended.obs["organ"].cat.categories
}
adata_extended.obs["tissue_ontology_term_id"] = adata_extended.obs["organ"].replace(organ_term_ids)
del adata_extended.obs["organ"]

# Disease: "unknown" is first relabelled "healthy", then the labels are converted to IDs.
# NOTE(review): "healthy" only becomes an ID if it is already one of the disease categories - confirm.
disease_term_ids = {
    label: oc.disease.convert_to_id(label)
    for label in adata_extended.obs["disease"].cat.categories
    if label != "unknown"
}
adata_extended.obs["disease_ontology_term_id"] = (
    adata_extended.obs["disease"].replace({"unknown": "healthy"}).replace(disease_term_ids)
)
del adata_extended.obs["disease"]

stage_term_ids = {
    label: oc.development_stage['Homo sapiens'].convert_to_id(label)
    for label in adata_extended.obs["development_stage"].cat.categories
    if label != "unknown"
}
adata_extended.obs["development_stage_ontology_term_id"] = adata_extended.obs["development_stage"].replace(stage_term_ids)
del adata_extended.obs["development_stage"]

cell_type_term_ids = {
    label: oc.cell_type.convert_to_id(label)
    for label in adata_extended.obs["cell_type"].cat.categories
    if label != "unknown"
}
adata_extended.obs["cell_type_ontology_term_id"] = adata_extended.obs["cell_type"].replace(cell_type_term_ids)
del adata_extended.obs["cell_type"]

In [None]:
# CellxGene-formatted extended object, all cells.
adata_extended.write(
    "/storage/data/final_adata_object_export/hnoca_extended_cellxgene.h5ad",
    compression="gzip",
)

In [None]:
# CellxGene-formatted extended object without the cells whose publication is "Treutlein, 2023".
is_public = adata_extended.obs["publication"] != "Treutlein, 2023"
adata_extended[is_public].write(
    "/storage/data/final_adata_object_export/hnoca_publiconly_extended_cellxgene.h5ad",
    compression="gzip",
)