In [1]:
import pandas as pd
import torch
import pickle
from datetime import datetime
from src.utils.ontology_utils import load_ontology
from src.data_pipeline.data_loader import load_filtered_cell_metadata
from src.data_pipeline.preprocess_ontology import preprocess_data_ontology
from src.utils.paths import PROJECT_ROOT
from src.utils.ontology_utils import get_sub_DAG

In [2]:
# Step 1: obtain the ontology object through the project helper
cl = load_ontology()

# CL id used as the root when collecting the sub-DAG below
root_cl_id = 'CL:0000988'  # hematopoietic cell

cell_types = get_sub_DAG(cl, root_cl_id)
print(f"Total cell types including root: {len(cell_types)}")

# Sanity check: show the first five collected entries
preview = list(cell_types)[:5]
for cell_type in preview:
    print(cell_type)


Loading cached ontology from /Users/jzhao/dev/Welch-lab/McCell/data/processed/ontology.pkl...
Ontology loaded successfully.
Total cell types including root: 708
Term('CL:0000816', name='immature B cell')
Term('CL:0000928', name='activated CD4-negative, CD8-negative type I NK T cell')
Term('CL:2000001', name='peripheral blood mononuclear cell')
Term('CL:0000042', name='neutrophilic myeloblast')
Term('CL:0009048', name='anorectum macrophage')


In [4]:

# Step 2: pull the filtered per-cell metadata (CellXGene Census) for the chosen root
cell_obs_metadata = load_filtered_cell_metadata(cl, root_cl_id=root_cl_id)

# Step 3: run the ontology preprocessing against the cell-type id column
target_column = 'cell_type_ontology_term_id'

print("Starting ontology preprocessing...")
(
    mapping_dict,
    leaf_values,
    internal_values,
    ontology_df,
    cell_parent_mask,
) = preprocess_data_ontology(
    cl,
    cell_obs_metadata,
    target_column,
    upper_limit=root_cl_id,
    cl_only=True,
    include_leafs=False,
)

print(f"Preprocessing complete. Found {len(leaf_values)} leaf values and {len(internal_values)} internal values.")

Fetching descendants of CL:0000988...
Connecting to CellXGene Census...


The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.


Reading cell metadata to filter cell types...
Found 160 cell types with > 5000 cells.
Querying for final cell metadata...
Finished loading and filtering cell metadata.
Starting ontology preprocessing...
Preprocessing complete. Found 41 leaf values and 100 internal values.


In [5]:
# Emit the heading and the mask on separate lines with a single print call
print("Cell Parent Mask:", cell_parent_mask, sep="\n")

Cell Parent Mask:
tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 0.,  ..., 1., 1., 1.],
        [0., 1., 1.,  ..., 0., 0., 1.],
        ...,
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.]])


In [8]:

# Invert mapping_dict (value -> key) so a mask column index can be turned
# back into the ontology id it encodes.
reverse_mapping_dict = {v: k for k, v in mapping_dict.items()}

# Find the first ontology term named 'blood cell'; None when nothing matches.
blood_cell_term = next(
    (term for term in cl.terms() if term.name == 'blood cell'),
    None,
)

if blood_cell_term:
    print(f"Found blood cell term: {blood_cell_term.id}")
    # Get all subclasses of blood cell
    blood_cell_subclasses = list(blood_cell_term.subclasses())
    print(f"Found {len(blood_cell_subclasses)} blood cell subclasses.")

    # Build the id lookup once: the original rebuilt this collection as a list
    # on every loop iteration and scanned it linearly for each column.
    blood_cell_subclass_ids = {subclass.id for subclass in blood_cell_subclasses}

    # Print the parent mask for each subclass
    for i in range(cell_parent_mask.shape[1]):
        cell_id = reverse_mapping_dict.get(i)  # leaf nodes
        if not cell_id:
            # NOTE(review): INTERNAL_NODE_ENCODING_START is not defined or
            # imported in any visible cell, so this line raises NameError on a
            # fresh kernel unless it is defined elsewhere -- confirm its source.
            # Presumably internal nodes are encoded starting at that offset,
            # after the len(leaf_values) leaf columns -- TODO confirm.
            cell_id = reverse_mapping_dict.get(i - len(leaf_values) + INTERNAL_NODE_ENCODING_START)  # internal nodes

        if cell_id in blood_cell_subclass_ids:
            term = cl[cell_id]
            # Column i of the mask, i.e. one value per mask row for this cell type.
            parent_mask = cell_parent_mask[:, i]
            print(f"\nSubclass: {term.name} ({term.id})")
            print(f"Parent mask:\n{parent_mask}")
else:
    print("Blood cell not found in the ontology.")

Found blood cell term: CL:0000081
Found 42 blood cell subclasses.

Subclass: neutrophil (CL:0000775)
Parent mask:
tensor([1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1.])

Subclass: erythrocyte (CL:0000232)
Parent mask:
tensor([1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
        0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        