# Diagnose Why Training Gets Only 24k Cells

This notebook tests whether the gene filter is causing the low cell count.

In [1]:
import tiledbsoma as soma
import pandas as pd
from pathlib import Path

SOMA_PATH = "/scratch/sigbio_project_root/sigbio_project25/jingqiao/mccell-single/soma_db_homo_sapiens"

print("Opening SOMA database...")
experiment = soma.open(SOMA_PATH, mode="r")
print("✓ Database opened successfully")

Opening SOMA database...
✓ Database opened successfully


## Test 1: Total Cells in Database

In [2]:
print("Counting total cells...")
obs_count_df = experiment.obs.read(column_names=["soma_joinid"]).concat().to_pandas()
total_cells = len(obs_count_df)
print(f"Total cells in database: {total_cells:,}")
del obs_count_df

Counting total cells...
Total cells in database: 106,118,167


## Test 2: Sample Data Structure

In [3]:
print("Reading first 10 cells...")
obs_sample = experiment.obs.read(coords=(slice(10),)).concat().to_pandas()
print(f"\nColumns available:")
print(obs_sample.columns.tolist())
print(f"\nFirst 3 cells:")
obs_sample.head(3)

Reading first 10 cells...

Columns available:
['soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars']

First 3 cells:


Unnamed: 0,soma_joinid,dataset_id,assay,assay_ontology_term_id,cell_type,cell_type_ontology_term_id,development_stage,development_stage_ontology_term_id,disease,disease_ontology_term_id,...,tissue,tissue_ontology_term_id,tissue_type,tissue_general,tissue_general_ontology_term_id,raw_sum,nnz,raw_mean_nnz,raw_variance_nnz,n_measured_vars
0,0,d7476ae2-e320-4703-8304-da5c42627e71,10x 3' v3,EFO:0009922,endothelial cell,CL:0000115,29-year-old stage,HsapDv:0000123,breast cancer,MONDO:0007254,...,liver,UBERON:0002107,tissue,liver,UBERON:0002107,19641.0,7157,2.744306,696.131649,12641
1,1,d7476ae2-e320-4703-8304-da5c42627e71,10x 3' v3,EFO:0009922,malignant cell,CL:0001064,29-year-old stage,HsapDv:0000123,breast cancer,MONDO:0007254,...,liver,UBERON:0002107,tissue,liver,UBERON:0002107,17251.0,5388,3.201745,394.135085,12641
2,2,d7476ae2-e320-4703-8304-da5c42627e71,10x 3' v3,EFO:0009922,fibroblast,CL:0000057,29-year-old stage,HsapDv:0000123,breast cancer,MONDO:0007254,...,liver,UBERON:0002107,tissue,liver,UBERON:0002107,14631.0,3942,3.711568,1602.371239,12641


## Test 3: Assay Distribution

In [4]:
print("Reading assay column...")
obs_assay = experiment.obs.read(column_names=["assay"]).concat().to_pandas()
assay_counts = obs_assay['assay'].value_counts()

print(f"\nAssay distribution (top 10):")
print(assay_counts.head(10))

if "10x 3' v3" in assay_counts.index:
    v3_count = assay_counts["10x 3' v3"]
    print(f"\n✓ Found '10x 3' v3' assay: {v3_count:,} cells")
else:
    print(f"\n❌ WARNING: '10x 3' v3' assay NOT found!")

del obs_assay

Reading assay column...

Assay distribution (top 10):
assay
10x 3' v3                              59668147
10x 3' v2                              22750589
10x 5' v1                               7448617
sci-RNA-seq3                            5064268
10x 5' v2                               4363694
10x 5' transcription profiling          1968545
Drop-seq                                1048377
ScaleBio single cell RNA sequencing      700524
10x 3' transcription profiling           665642
microwell-seq                            642559
Name: count, dtype: int64

✓ Found '10x 3' v3' assay: 59,668,147 cells


## Test 4: Primary Data Distribution

In [5]:
print("Reading is_primary_data column...")
obs_primary = experiment.obs.read(column_names=["is_primary_data"]).concat().to_pandas()
primary_counts = obs_primary['is_primary_data'].value_counts()

print(f"\nis_primary_data distribution:")
print(primary_counts)

if True in primary_counts.index:
    primary_true_count = primary_counts[True]
    print(f"\n✓ Primary data cells: {primary_true_count:,}")

del obs_primary

Reading is_primary_data column...

is_primary_data distribution:
is_primary_data
True     62634126
False    43484041
Name: count, dtype: int64

✓ Primary data cells: 62,634,126


## Test 5: Query Cells WITHOUT Gene Filter

In [6]:
print("="*80)
print("QUERYING CELLS WITHOUT GENE FILTER")
print("="*80)

obs_filter = 'assay == "10x 3\' v3" and is_primary_data == True'
print(f"Cell filter: {obs_filter}")

with experiment.axis_query(
    measurement_name="RNA",
    obs_query=soma.AxisQuery(value_filter=obs_filter),
) as query:
    obs_df = query.obs(column_names=["cell_type_ontology_term_id"]).concat().to_pandas()
    count_without_gene_filter = len(obs_df)
    
    print(f"\n✓ Cells found: {count_without_gene_filter:,}")
    
    # Check cell type distribution
    unique_cell_types = obs_df['cell_type_ontology_term_id'].nunique()
    print(f"  Unique cell types: {unique_cell_types}")
    
    cell_type_counts = obs_df['cell_type_ontology_term_id'].value_counts()
    print(f"\n  Top 10 cell types:")
    print(cell_type_counts.head(10))
    
    # Cell types with >5000 cells
    over_5k = cell_type_counts[cell_type_counts > 5000]
    print(f"\n  Cell types with >5000 cells: {len(over_5k)}")

QUERYING CELLS WITHOUT GENE FILTER
Cell filter: assay == "10x 3' v3" and is_primary_data == True

✓ Cells found: 31,806,345
  Unique cell types: 625

  Top 10 cell types:
cell_type_ontology_term_id
CL:0000540    3118306
CL:0000128    2632680
CL:4023040    1734499
CL:0000604    1141897
CL:0000057     920759
CL:0000127     622607
unknown        619217
CL:0000084     611902
CL:0000235     569955
CL:0000679     562975
Name: count, dtype: int64

  Cell types with >5000 cells: 320


## Test 6: Query Cells WITH Gene Filter (Like Training Notebook)

In [7]:
print("="*80)
print("QUERYING CELLS WITH GENE FILTER (protein-coding genes)")
print("="*80)

# Load the gene list
biomart_path = Path.home() / "real_McCell/hpc_workaround/data/mart_export.txt"
biomart = pd.read_csv(biomart_path)
coding_only = biomart[biomart['Gene type'] == 'protein_coding']
gene_list = coding_only['Gene stable ID'].tolist()

print(f"Filtering to {len(gene_list)} protein-coding genes")

var_filter = f"feature_id in {gene_list}"

with experiment.axis_query(
    measurement_name="RNA",
    obs_query=soma.AxisQuery(value_filter=obs_filter),
    var_query=soma.AxisQuery(value_filter=var_filter),
) as query:
    obs_df = query.obs(column_names=["cell_type_ontology_term_id"]).concat().to_pandas()
    count_with_gene_filter = len(obs_df)
    
    print(f"\n✓ Cells found: {count_with_gene_filter:,}")
    
    unique_cell_types = obs_df['cell_type_ontology_term_id'].nunique()
    print(f"  Unique cell types: {unique_cell_types}")

QUERYING CELLS WITH GENE FILTER (protein-coding genes)
Filtering to 23262 protein-coding genes

✓ Cells found: 31,806,345
  Unique cell types: 625


## Summary: Is the Gene Filter the Problem?

In [8]:
print("="*80)
print("SUMMARY")
print("="*80)
print(f"Cells WITHOUT gene filter: {count_without_gene_filter:,}")
print(f"Cells WITH gene filter:    {count_with_gene_filter:,}")
print(f"Difference:                {count_without_gene_filter - count_with_gene_filter:,}")
print(f"Reduction:                 {(1 - count_with_gene_filter/count_without_gene_filter)*100:.1f}%")

if count_with_gene_filter < count_without_gene_filter * 0.1:
    print("\n⚠️  CONFIRMED: The gene filter is drastically reducing cell count!")
    print("   The SOMA database contains sparse data.")
    print("   Not all cells have measurements for all protein-coding genes.")
    print("\n💡 SOLUTION:")
    print("   - Remove var_query filter when querying cells")
    print("   - Let the dataloader handle gene filtering/selection")
    print("   - Or use only highly variable genes (HVGs) that are commonly measured")
else:
    print("\n✓ Gene filter does not significantly reduce cell count")
    print("   The issue must be elsewhere.")

SUMMARY
Cells WITHOUT gene filter: 31,806,345
Cells WITH gene filter:    31,806,345
Difference:                0
Reduction:                 0.0%

✓ Gene filter does not significantly reduce cell count
   The issue must be elsewhere.


In [16]:
from pathlib import Path
import pickle
import sys

# Add project root to path so we can import from src
PROJECT_ROOT = Path.home() / "real_McCell"
sys.path.insert(0, str(PROJECT_ROOT))

from src.utils.paths import get_data_folder

  # Load the preprocessing artifacts
DATE = '2025-10-24'
PROCESSED_DATA_DIR = get_data_folder(DATE)

mapping_dict_df = pd.read_csv(PROCESSED_DATA_DIR / f"{DATE}_mapping_dict_df.csv", index_col=0)
all_cell_values = list(mapping_dict_df.index)

print(f"Preprocessing has {len(all_cell_values)} cell types")

# Now query with the cell type filter
obs_filter_with_cells = f'assay == "10x 3\' v3" and is_primary_data == True and cell_type_ontology_term_id in {all_cell_values}'

with experiment.axis_query(
  measurement_name="RNA",
  obs_query=soma.AxisQuery(value_filter=obs_filter_with_cells),
) as query:
  obs_df = query.obs(column_names=["cell_type_ontology_term_id"]).concat().to_pandas()

  print(f"\nCells matching those {len(all_cell_values)} cell types: {len(obs_df):,}")
  print(f"Unique cell types found: {obs_df['cell_type_ontology_term_id'].nunique()}")

  # Which cell types are missing?
  found_types = set(obs_df['cell_type_ontology_term_id'].unique())
  expected_types = set(all_cell_values)
  missing_types = expected_types - found_types

  if missing_types:
      print(f"\n⚠️  {len(missing_types)} cell types from preprocessing NOT found in local database!")
      print(f"   Missing: {list(missing_types)[:10]}")

Preprocessing has 80 cell types

Cells matching those 80 cell types: 6,292,764
Unique cell types found: 80


In [18]:
# Test: Query with BOTH cell type AND gene filters
print("="*80)
print("TEST: Query with 80 cell types AND gene filter")
print("="*80)

# Load cell types
from pathlib import Path
import sys
PROJECT_ROOT = Path.home() / "real_McCell"
sys.path.insert(0, str(PROJECT_ROOT))
from src.utils.paths import get_data_folder

DATE = '2025-10-24'
PROCESSED_DATA_DIR = get_data_folder(DATE)
mapping_dict_df = pd.read_csv(PROCESSED_DATA_DIR / f"{DATE}_mapping_dict_df.csv", index_col=0)
all_cell_values = list(mapping_dict_df.index)

# Build filter with cell types
obs_filter_with_cells = f'assay == "10x 3\' v3" and is_primary_data == True and cell_type_ontology_term_id in {all_cell_values}'

print(f"Cell types: {len(all_cell_values)}")
print(f"Genes: {len(gene_list)}")

with experiment.axis_query(
  measurement_name="RNA",
  obs_query=soma.AxisQuery(value_filter=obs_filter_with_cells),
  var_query=soma.AxisQuery(value_filter=var_filter),
) as query:
  obs_df = query.obs(column_names=["cell_type_ontology_term_id"]).concat().to_pandas()
  count_both_filters = len(obs_df)

  print(f"\nCells with BOTH filters: {count_both_filters:,}")
  print(f"Unique cell types found: {obs_df['cell_type_ontology_term_id'].nunique()}")

  # Compare to previous results
  print("\n" + "="*80)
  print("COMPARISON:")
  print("="*80)
  print(f"Cells with NO filters:              {count_without_gene_filter:,}")
  print(f"Cells with gene filter only:        {count_with_gene_filter:,}")
  print(f"Cells with BOTH filters:            {count_both_filters:,}")
  print(f"\nReduction from gene filter only:    {count_with_gene_filter - count_both_filters:,}")

TEST: Query with 80 cell types AND gene filter
Cell types: 80
Genes: 23262

Cells with BOTH filters: 6,292,764
Unique cell types found: 80

COMPARISON:
Cells with NO filters:              31,806,345
Cells with gene filter only:        31,806,345
Cells with BOTH filters:            6,292,764

Reduction from gene filter only:    25,513,581


In [None]:
# Cleanup
experiment.close()
print("\n✓ Database closed")