# Diagnostic: Cell Count Investigation
Progressively applies stricter query filters to the CellXGene Census to find the stage at which cells are being lost.

In [1]:
import cellxgene_census
import pandas as pd
from src.utils.paths import get_data_folder
import pickle

# Pin the Census release so the cell counts below are reproducible across runs.
# An unpinned open_soma() follows the moving "stable" tag, which resolved to
# this release when the notebook was last executed.
CENSUS_VERSION = "2025-01-30"

print("Opening CellXGene Census...")
census = cellxgene_census.open_soma(census_version=CENSUS_VERSION)

Opening CellXGene Census...


The "stable" release is currently 2025-01-30. Specify 'census_version="2025-01-30"' in future calls to open_soma() to ensure data consistency.


In [2]:
# Read the mapping table written by preprocessing for this run date and
# collect the cell type IDs (its index labels) that later queries filter on.
DATE = '2025-10-24'
PROCESSED_DATA_DIR = get_data_folder(DATE)

mapping_csv_path = PROCESSED_DATA_DIR / f"{DATE}_mapping_dict_df.csv"
mapping_dict_df = pd.read_csv(mapping_csv_path, index_col=0)

# index label -> value of the first data column
mapping_series = pd.Series(
    data=mapping_dict_df.iloc[:, 0].values,
    index=mapping_dict_df.index,
)
mapping_dict = mapping_series.to_dict()
all_cell_values = list(mapping_dict)

n_loaded_cell_types = len(all_cell_values)
print(f"Loaded {n_loaded_cell_types} cell types from preprocessed data")
print(f"First 10: {all_cell_values[:10]}")

Loaded 80 cell types from preprocessed data
First 10: ['CL:0000233', 'CL:0000895', 'CL:0000900', 'CL:0000904', 'CL:0000905', 'CL:0000910', 'CL:0000912', 'CL:0000913', 'CL:0000938', 'CL:0000939']


## Test 1: No filters at all

In [3]:
# Human experiment handle; the later query cells reuse `experiment`.
experiment = census["census_data"]["homo_sapiens"]

print("Query 1: No filters (all human cells)")

# Baseline: every human cell, restricted to the columns the diagnostics use.
baseline_columns = ["cell_type_ontology_term_id", "assay", "is_primary_data"]
obs_all = (
    experiment.obs
    .read(column_names=baseline_columns)
    .concat()
    .to_pandas()
)

n_cell_types_all = obs_all['cell_type_ontology_term_id'].nunique()
top_assays = obs_all['assay'].value_counts().head(10)

print(f"Total cells in census: {len(obs_all):,}")
print(f"Unique cell types: {n_cell_types_all}")
print("\nAssay distribution (top 10):")
print(top_assays)

Query 1: No filters (all human cells)
Total cells in census: 106,118,167
Unique cell types: 819

Assay distribution (top 10):
assay
10x 3' v3                              59668147
10x 3' v2                              22750589
10x 5' v1                               7448617
sci-RNA-seq3                            5064268
10x 5' v2                               4363694
10x 5' transcription profiling          1968545
Drop-seq                                1048377
ScaleBio single cell RNA sequencing      700524
10x 3' transcription profiling           665642
microwell-seq                            642559
Name: count, dtype: int64


## Test 2: Filter by assay only

In [4]:
print("Query 2: Only 10x 3' v3 assay")

# Restrict to a single assay; keep is_primary_data so its split can be inspected.
assay_only_filter = 'assay == "10x 3\' v3"'
assay_query_columns = ["cell_type_ontology_term_id", "assay", "is_primary_data"]
obs_10x = (
    experiment.obs
    .read(value_filter=assay_only_filter, column_names=assay_query_columns)
    .concat()
    .to_pandas()
)

n_cell_types_10x = obs_10x['cell_type_ontology_term_id'].nunique()
primary_flag_counts = obs_10x['is_primary_data'].value_counts()

print(f"Cells with 10x 3' v3: {len(obs_10x):,}")
print(f"Unique cell types: {n_cell_types_10x}")
print("\nis_primary_data distribution:")
print(primary_flag_counts)

Query 2: Only 10x 3' v3 assay
Cells with 10x 3' v3: 59,668,147
Unique cell types: 647

is_primary_data distribution:
is_primary_data
True     31806345
False    27861802
Name: count, dtype: int64


## Test 3: Filter by assay + primary data

In [5]:
print("Query 3: 10x 3' v3 + primary data")

# Same assay restriction, additionally keeping only rows flagged as primary data.
assay_primary_filter = 'assay == "10x 3\' v3" and is_primary_data == True'
obs_10x_primary = (
    experiment.obs
    .read(
        value_filter=assay_primary_filter,
        column_names=["cell_type_ontology_term_id"],
    )
    .concat()
    .to_pandas()
)

primary_cell_types = obs_10x_primary['cell_type_ontology_term_id']

print(f"Cells with 10x 3' v3 + primary: {len(obs_10x_primary):,}")
print(f"Unique cell types: {primary_cell_types.nunique()}")

# Per-type cell counts, largest first
cell_type_counts = primary_cell_types.value_counts()
print("\nTop 20 cell types by count:")
print(cell_type_counts.head(20))

Query 3: 10x 3' v3 + primary data
Cells with 10x 3' v3 + primary: 31,806,345
Unique cell types: 625

Top 20 cell types by count:
cell_type_ontology_term_id
CL:0000540    3118306
CL:0000128    2632680
CL:4023040    1734499
CL:0000604    1141897
CL:0000057     920759
CL:0000127     622607
unknown        619217
CL:0000084     611902
CL:0000235     569955
CL:0000679     562975
CL:0001064     494267
CL:0002453     436454
CL:0000837     401866
CL:0008001     397378
CL:0000236     396007
CL:0000681     386723
CL:0000746     367438
CL:0000576     366274
CL:0000115     362173
CL:0000129     333041
Name: count, dtype: int64


## Test 4: Filter by our specific cell types

In [6]:
n_target_cell_types = len(all_cell_values)
print(f"Query 4: 10x 3' v3 + primary + our {n_target_cell_types} cell types")

# Full filter: assay + primary data + membership in our cell type list.
# The list is interpolated via its Python repr, which the filter syntax accepts.
filter_clauses = [
    'assay == "10x 3\' v3"',
    'is_primary_data == True',
    f'cell_type_ontology_term_id in {all_cell_values}',
]
obs_value_filter = " and ".join(filter_clauses)

obs_filtered = (
    experiment.obs
    .read(
        value_filter=obs_value_filter,
        column_names=["cell_type_ontology_term_id"],
    )
    .concat()
    .to_pandas()
)

matched_cell_types = obs_filtered['cell_type_ontology_term_id']

print(f"Cells matching our cell types: {len(obs_filtered):,}")
print(f"Unique cell types found: {matched_cell_types.nunique()}")

filtered_counts = matched_cell_types.value_counts()
print("\nTop 20 of our cell types by count:")
print(filtered_counts.head(20))

Query 4: 10x 3' v3 + primary + our 80 cell types
Cells matching our cell types: 6,292,764
Unique cell types found: 80

Top 20 of our cell types by count:
cell_type_ontology_term_id
CL:0000084    611902
CL:0000235    569955
CL:0000837    401866
CL:0008001    397378
CL:0000236    396007
CL:0000576    366274
CL:0000129    333041
CL:0000624    222571
CL:0000878    209648
CL:0000625    200909
CL:0000623    183013
CL:0000786    155558
CL:0000763    153171
CL:0000583    129513
CL:0000860    117822
CL:1001603    115074
CL:0000895    109263
CL:0000775    106218
CL:0000542    100359
CL:0000814     82147
Name: count, dtype: int64


## Test 5: Check which of our cell types have >5000 cells

In [7]:
# Minimum number of cells a cell type must exceed to count as usable.
MIN_CELLS_PER_TYPE = 5000

# Every cell type loaded from preprocessing is expected to clear the threshold,
# so derive the expectation from the loaded list instead of hard-coding it.
expected_n_cell_types = len(all_cell_values)

print(f"Checking which of our cell types meet the {MIN_CELLS_PER_TYPE} cell threshold...")

# From Test 3 results (before filtering to our cell types)
cell_type_counts_all = obs_10x_primary['cell_type_ontology_term_id'].value_counts()

# Keep only our cell types, then only those strictly above the threshold
our_cell_counts = cell_type_counts_all[cell_type_counts_all.index.isin(all_cell_values)]
over_5k = our_cell_counts[our_cell_counts > MIN_CELLS_PER_TYPE]

print(f"\nOur cell types with >{MIN_CELLS_PER_TYPE} cells: {len(over_5k)}")
print(f"Expected from preprocessing: {expected_n_cell_types}")
print("\nTop 20:")
print(over_5k.head(20))

if len(over_5k) != expected_n_cell_types:
    print(f"\n⚠️ MISMATCH: Expected {expected_n_cell_types} but found {len(over_5k)}!")
    # Name the offenders so the discrepancy can be traced without another query
    below_threshold = [ct for ct in all_cell_values if ct not in over_5k.index]
    print(f"Cell types absent or at/below the threshold: {below_threshold}")

Checking which of our cell types meet the 5000 cell threshold...

Our cell types with >5000 cells: 80
Expected from preprocessing: 80

Top 20:
cell_type_ontology_term_id
CL:0000084    611902
CL:0000235    569955
CL:0000837    401866
CL:0008001    397378
CL:0000236    396007
CL:0000576    366274
CL:0000129    333041
CL:0000624    222571
CL:0000878    209648
CL:0000625    200909
CL:0000623    183013
CL:0000786    155558
CL:0000763    153171
CL:0000583    129513
CL:0000860    117822
CL:1001603    115074
CL:0000895    109263
CL:0000775    106218
CL:0000542    100359
CL:0000814     82147
Name: count, dtype: int64


## Summary: Where are the cells going?

In [8]:
# Recap the row count surviving each successive filter stage.
banner = "=" * 60
print(banner)
print("CELL COUNT SUMMARY")
print(banner)

stage_frames = [
    ("1. All human cells", obs_all),
    ("2. After 10x 3' v3 filter", obs_10x),
    ("3. After + primary data", obs_10x_primary),
    ("4. After + our cell types", obs_filtered),
]
for stage_label, stage_frame in stage_frames:
    print(f"{stage_label}: {len(stage_frame):,}")

# Compare the expected yield against the count observed in the main notebook
expected_total_cells = over_5k.sum()
print(f"\nOur {len(all_cell_values)} cell types should give us ~{expected_total_cells:,} cells")
print("But the notebook shows only ~24,000 cells")
print("\n⚠️ The issue is likely in the notebook's gene filtering or dataloader!")

CELL COUNT SUMMARY
1. All human cells: 106,118,167
2. After 10x 3' v3 filter: 59,668,147
3. After + primary data: 31,806,345
4. After + our cell types: 6,292,764

Our 80 cell types should give us ~6,292,764 cells
But the notebook shows only ~24,000 cells

⚠️ The issue is likely in the notebook's gene filtering or dataloader!


In [9]:
# Release the Census handle opened at the top of the notebook now that all queries are done.
census.close()