In [None]:
!pip install obonet

In [1]:
import os
from os.path import join

import pandas as pd
import numpy as np

In [2]:
# Root directory of the dataset: the categorical lookup tables are read from
# here and the cell type hierarchy matrices are written beneath it.
# Alternative location (commented out):
# DATA_PATH = '/lustre/scratch/users/felix.fischer/merlin_cxg_simple_norm_parquet'
DATA_PATH = '/mnt/dssmcmlfs01/merlin_cxg_norm_parquet'

# Compute lookup matrices 

In [3]:
# Table of cell type labels, one row per cell type.
cell_type_path = join(DATA_PATH, 'categorical_lookup/cell_type.parquet')
cell_type_mapping = pd.read_parquet(cell_type_path)

# Reverse lookup: cell type label -> row position in `cell_type_mapping`.
row_positions = range(len(cell_type_mapping))
inverse_mapping = cell_type_mapping.assign(idx=row_positions).set_index('label')
inverse_mapping.head()

Unnamed: 0_level_0,idx
label,Unnamed: 1_level_1
B cell,0
Bergmann glial cell,1
"CD14-low, CD16-positive monocyte",2
CD14-positive monocyte,3
"CD14-positive, CD16-negative classical monocyte",4


In [None]:
!wget https://github.com/obophenotype/cell-ontology/archive/refs/tags/v2022-09-15.zip
!unzip v2022-09-15.zip

In [5]:
from typing import List

import obonet
import networkx


url = 'cell-ontology-2022-09-15/cl-simple.obo'  # use same version as cellxgene data
graph = obonet.read_obo(url, ignore_obsolete=True)

# only use "is_a" edges
# The graph is a multigraph whose edge key is the relationship type. Edges are
# removed by (u, v, key): without the key, networkx removes whichever parallel
# edge between u and v was added last, which can be the "is_a" edge that has to
# stay.
edges_to_delete = [
    (u, v, key) for u, v, key in graph.edges(keys=True) if key != 'is_a'
]
graph.remove_edges_from(edges_to_delete)

# define mapping from id to name
id_to_name = {id_: data.get('name') for id_, data in graph.nodes(data=True)}
# define inverse mapping from name to id
name_to_id = {v: k for k, v in id_to_name.items()}

# lookup child and parent nodes
celltypes = cell_type_mapping.label.tolist()
# set for O(1) membership tests inside the comprehensions below
celltype_set = set(celltypes)

# fail early, naming every label that has no term in the ontology
missing_celltypes = [celltype for celltype in celltypes if celltype not in name_to_id]
if missing_celltypes:
    raise KeyError(f'cell types not found in ontology: {missing_celltypes}')

# obonet edges point from a term to its parent term (child -> parent). Hence
# networkx.ancestors yields the subtypes (children) of a term and
# networkx.descendants yields its supertypes (parents).
child_nodes_dict = {}
parent_nodes_dict = {}
for celltype in celltypes:
    node_id = name_to_id[celltype]
    # find child nodes (restricted to cell types present in the data)
    child_nodes_dict[celltype] = [
        id_to_name[node]
        for node in networkx.ancestors(graph, node_id)
        if id_to_name[node] in celltype_set
    ]
    # find parent nodes (restricted to cell types present in the data)
    parent_nodes_dict[celltype] = [
        id_to_name[node]
        for node in networkx.descendants(graph, node_id)
        if id_to_name[node] in celltype_set
    ]


In [6]:
def _sorted_idx(labels):
    """Return the row positions of the given cell type labels in ascending order."""
    return inverse_mapping.loc[labels].idx.sort_values().tolist()


# Translate the per-label name lists into sorted row positions, following the
# row order of `cell_type_mapping`.
parents_idx = [
    _sorted_idx(parent_nodes_dict[label]) for label in cell_type_mapping.label
]
children_idx = [
    _sorted_idx(child_nodes_dict[label]) for label in cell_type_mapping.label
]

cell_type_mapping['children'] = children_idx
cell_type_mapping['parents'] = parents_idx

In [7]:
# Preview the first rows, now including the `children` and `parents` columns.
cell_type_mapping.head(n=5)

Unnamed: 0,label,children,parents
0,B cell,"[46, 80, 102, 108, 116, 131, 134, 151]","[95, 98]"
1,Bergmann glial cell,[],[32]
2,"CD14-low, CD16-positive monocyte",[],"[3, 95, 112, 125]"
3,CD14-positive monocyte,"[2, 4, 5]","[95, 112]"
4,"CD14-positive, CD16-negative classical monocyte",[],"[3, 47, 95, 112]"


In [8]:
# Output directory for the hierarchy matrices; no error if it already exists.
hierarchy_dir = join(DATA_PATH, 'cell_type_hierarchy')
os.makedirs(hierarchy_dir, exist_ok=True)

In [9]:
# Square indicator matrix: entry (r, c) is 1 when c is r itself (identity
# diagonal) or one of the positions listed in row r's `parents` column.
n_cell_types = len(cell_type_mapping)
parent_matrix = np.eye(n_cell_types)

for row, parent_positions in enumerate(cell_type_mapping.parents):
    parent_matrix[row, parent_positions] = 1.

parent_matrix_path = join(DATA_PATH, 'cell_type_hierarchy/parent_matrix.npy')
np.save(parent_matrix_path, parent_matrix)


In [10]:
# Square indicator matrix: entry (r, c) is 1 when c is r itself (identity
# diagonal) or one of the positions listed in row r's `children` column.
n_cell_types = len(cell_type_mapping)
child_matrix = np.eye(n_cell_types)

for row, child_positions in enumerate(cell_type_mapping.children):
    child_matrix[row, child_positions] = 1.

child_matrix_path = join(DATA_PATH, 'cell_type_hierarchy/child_matrix.npy')
np.save(child_matrix_path, child_matrix)


# Sanity check lookup matrices

In [11]:
# Labels flagged in row 0 of the child matrix (the row's own label plus its children).
child_hits = np.flatnonzero(child_matrix[0] == 1.)
cell_type_mapping.loc[child_hits, 'label'].tolist()

['B cell',
 'class switched memory B cell',
 'immature B cell',
 'mature B cell',
 'memory B cell',
 'naive B cell',
 'plasmablast',
 'precursor B cell',
 'transitional stage B cell']

In [12]:
# Labels flagged in row 0 of the parent matrix (the row's own label plus its parents).
parent_hits = np.flatnonzero(parent_matrix[0] == 1.)
cell_type_mapping.loc[parent_hits, 'label'].tolist()

['B cell', 'leukocyte', 'lymphocyte']