# HuBMAP Azimuth Datasets

Use the HuBMAP Cells API to find all HuBMAP datasets whose cells have been annotated with Cell Ontology terms by Azimuth, then compute some statistics.

# Install libraries

In [1]:
!pip install --upgrade hubmap_api_py_client



# Part 1: Get Cell Populations from HuBMAP Datasets using the Cells API

Imports / functions

In [2]:
from hubmap_api_py_client import Client
from collections import Counter

Set up the HuBMAP Cells API client

In [3]:

# Base URL of the HuBMAP Cells API; the client built from it is used for every query below.
endpoint_url = "https://cells.api.hubmapconsortium.org/api/"
client = Client(endpoint_url)

Find cell types that have annotated datasets

In [4]:
# Select all cell types from the Cells API and stop early if none come back.
all_celltypes = client.select_celltypes()
assert len(all_celltypes) > 0

# Keep only each entry's "grouping_name"; these names are the filter values used later.
celltype_records = all_celltypes.get_list()
celltypes = [record["grouping_name"] for record in celltype_records]
print('cell types:', len(celltypes))

cell types: 63


Find all datasets that have been annotated with cell types

In [5]:
# Select datasets from the Cells API, filtering on the `celltypes` names.
datasets_with_celltypes = client.select_datasets(where='celltype', has=celltypes)
datasets = datasets_with_celltypes.get_list()
assert len(datasets) > 0

# Only the dataset UUIDs are needed for the per-dataset cell queries.
uuids = [entry['uuid'] for entry in datasets]
print('annotated datasets with cell types:', len(datasets))

annotated datasets with cell types: 45


Get cells for each annotated dataset

In [6]:
# Per-dataset results, each keyed by dataset UUID:
#   dataset_cells    -> Counter of cell_type -> number of cells
#   dataset_organ    -> lower-cased organ name
#   dataset_modality -> modality string
dataset_cells, dataset_organ, dataset_modality = {}, {}, {}

for uuid in uuids:
    # NOTE(review): `.results_set.get_list()` goes through the list wrapper's
    # `results_set` attribute; presumably this fetches all cells of the dataset
    # in one go -- confirm against the client library.
    cell_records = client.select_cells(where='dataset', has=[uuid]).get_list().results_set.get_list()

    type_counts = Counter()
    for record in cell_records:
        type_counts[record['cell_type']] += 1
        # Overwritten on every cell, so the values from the last cell are kept.
        dataset_organ[uuid] = record['organ'].lower()
        dataset_modality[uuid] = record['modality']

    dataset_cells[uuid] = type_counts

Show raw data results for one dataset

In [7]:
# Spot-check the collected results using the first dataset in `uuids`.
first_uuid = uuids[0]
print(first_uuid, 'top cell types:', dataset_cells[first_uuid].most_common(5))
print(first_uuid, 'organ:', dataset_organ[first_uuid])
print(first_uuid, 'modality:', dataset_modality[first_uuid])

007f3dfaaa287d5c7c227651f61a9c5b top cell types: [('CL:0000057', 1868), ('CL:0000235', 621), ('CL:0002131', 532), ('CL:0002144', 310), ('CL:0000763', 273)]
007f3dfaaa287d5c7c227651f61a9c5b organ: heart
007f3dfaaa287d5c7c227651f61a9c5b modality: rna


In [11]:
# Total number of cells counted across all datasets.
sum(count for type_counts in dataset_cells.values() for count in type_counts.values())


367400

In [12]:
# Distinct modalities seen across the datasets.
{modality for modality in dataset_modality.values()}

{'rna'}