# Diagnose Why Training Gets Only 24k Cells

This notebook tests whether the gene filter is causing the low cell count.

In [None]:
import tiledbsoma as soma
import pandas as pd
from pathlib import Path

# Filesystem location of the SOMA experiment inspected by the cells below.
SOMA_PATH = "/scratch/sigbio_project_root/sigbio_project25/jingqiao/mccell-single/soma_db_homo_sapiens"

# Open in read mode; the handle is reused by every later cell and is closed
# explicitly in the final cleanup cell.
print("Opening SOMA database...")
experiment = soma.open(SOMA_PATH, mode="r")
print("✓ Database opened successfully")

## Test 1: Total Cells in Database

In [None]:
# Report how many cells (obs rows) the experiment holds.
# The obs dataframe exposes its row count directly, so there is no need to
# pull every soma_joinid into pandas just to take len() of the result.
print("Counting total cells...")
total_cells = experiment.obs.count
print(f"Total cells in database: {total_cells:,}")

## Test 2: Sample Data Structure

In [None]:
# Peek at the obs schema and a few rows.
# SOMA coordinate slices include BOTH endpoints, so rows 0 through 9 are
# exactly ten cells; slice(10) would also return row 10 (eleven cells).
# NOTE(review): presumably soma_joinid starts at 0 and is contiguous; if not,
# fewer than ten rows come back — confirm against the database.
print("Reading first 10 cells...")
obs_sample = experiment.obs.read(coords=(slice(0, 9),)).concat().to_pandas()
print("\nColumns available:")
print(obs_sample.columns.tolist())
print("\nFirst 3 cells:")
# Kept as the cell's last expression so the notebook renders the preview.
obs_sample.head(3)

## Test 3: Assay Distribution

In [None]:
# Tally cells per assay and check that the assay used by the training cell
# filter is actually present in the database.
print("Reading assay column...")
obs_assay = (
    experiment.obs
    .read(column_names=["assay"])
    .concat()
    .to_pandas()
)
assay_counts = obs_assay["assay"].value_counts()

print("\nAssay distribution (top 10):")
print(assay_counts.head(10))

# Handle the missing-assay case first; otherwise report its cell count.
if "10x 3' v3" not in assay_counts.index:
    print("\n❌ WARNING: '10x 3' v3' assay NOT found!")
else:
    v3_count = assay_counts["10x 3' v3"]
    print(f"\n✓ Found '10x 3' v3' assay: {v3_count:,} cells")

# Drop the full-length column once the counts have been extracted.
del obs_assay

## Test 4: Primary Data Distribution

In [None]:
# Tally the is_primary_data flag to see how many cells survive the
# "primary data only" half of the cell filter.
print("Reading is_primary_data column...")
obs_primary = (
    experiment.obs
    .read(column_names=["is_primary_data"])
    .concat()
    .to_pandas()
)
primary_counts = obs_primary["is_primary_data"].value_counts()

print("\nis_primary_data distribution:")
print(primary_counts)

# Only report the True bucket when it exists in the tally.
if True in primary_counts.index:
    primary_true_count = primary_counts[True]
    print(f"\n✓ Primary data cells: {primary_true_count:,}")

# Drop the full-length column once the counts have been extracted.
del obs_primary

## Test 5: Query Cells WITHOUT Gene Filter

In [None]:
# Baseline: count the cells selected by the cell (obs) filter alone, with no
# restriction on genes, and summarise their cell-type labels.
print("=" * 80, "QUERYING CELLS WITHOUT GENE FILTER", "=" * 80, sep="\n")

# Same cell filter string is reused by the gene-filtered query in the next test.
obs_filter = "assay == \"10x 3' v3\" and is_primary_data == True"
print(f"Cell filter: {obs_filter}")

# Only the read needs the open query; the result is a plain pandas frame.
with experiment.axis_query(
    measurement_name="RNA",
    obs_query=soma.AxisQuery(value_filter=obs_filter),
) as query:
    obs_df = (
        query.obs(column_names=["cell_type_ontology_term_id"])
        .concat()
        .to_pandas()
    )

count_without_gene_filter = len(obs_df)
print(f"\n✓ Cells found: {count_without_gene_filter:,}")

# Check cell type distribution
unique_cell_types = obs_df["cell_type_ontology_term_id"].nunique()
print(f"  Unique cell types: {unique_cell_types}")

cell_type_counts = obs_df["cell_type_ontology_term_id"].value_counts()
print("\n  Top 10 cell types:")
print(cell_type_counts.head(10))

# Cell types with >5000 cells
over_5k = cell_type_counts.loc[cell_type_counts > 5000]
print(f"\n  Cell types with >5000 cells: {len(over_5k)}")

## Test 6: Query Cells WITH Gene Filter (Like Training Notebook)

In [None]:
# Repeat the cell query with a var (gene) filter restricted to protein-coding
# genes, mirroring what the training notebook does.
# NOTE(review): the obs and var axis queries look independent, so query.obs()
# presumably returns the same rows whether or not var_query is set. If so,
# this count will always equal the unfiltered one — confirm against the
# tiledbsoma ExperimentAxisQuery documentation before trusting the comparison.
print(
    "=" * 80,
    "QUERYING CELLS WITH GENE FILTER (protein-coding genes)",
    "=" * 80,
    sep="\n",
)

# Load the gene list
biomart_path = Path.home() / "real_McCell/hpc_workaround/data/mart_export.txt"
biomart = pd.read_csv(biomart_path)
coding_only = biomart.loc[biomart["Gene type"] == "protein_coding"]
gene_list = coding_only["Gene stable ID"].tolist()

print(f"Filtering to {len(gene_list)} protein-coding genes")

# The Python list repr is embedded verbatim as the right-hand side of `in`.
var_filter = "feature_id in " + str(gene_list)

# Only the read needs the open query; the result is a plain pandas frame.
with experiment.axis_query(
    measurement_name="RNA",
    obs_query=soma.AxisQuery(value_filter=obs_filter),
    var_query=soma.AxisQuery(value_filter=var_filter),
) as query:
    obs_df = (
        query.obs(column_names=["cell_type_ontology_term_id"])
        .concat()
        .to_pandas()
    )

count_with_gene_filter = len(obs_df)
print(f"\n✓ Cells found: {count_with_gene_filter:,}")

unique_cell_types = obs_df["cell_type_ontology_term_id"].nunique()
print(f"  Unique cell types: {unique_cell_types}")

## Summary: Is the Gene Filter the Problem?

In [None]:
# Compare the two cell counts gathered by the previous tests and state whether
# the gene filter explains the low cell count.
print("=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Cells WITHOUT gene filter: {count_without_gene_filter:,}")
print(f"Cells WITH gene filter:    {count_with_gene_filter:,}")
print(f"Difference:                {count_without_gene_filter - count_with_gene_filter:,}")

if count_without_gene_filter == 0:
    # With an empty baseline the percentage is undefined (division by zero),
    # and the gene filter cannot be blamed for anything: the cell filter
    # itself selected nothing.
    print("Reduction:                 n/a (no cells matched the cell filter)")
    print("\n❌ WARNING: The cell filter matched no cells, so the gene filter cannot be evaluated.")
    print("   Check the assay / is_primary_data values used in the cell filter.")
else:
    reduction_pct = (1 - count_with_gene_filter / count_without_gene_filter) * 100
    print(f"Reduction:                 {reduction_pct:.1f}%")

    # "Drastic" means fewer than 10% of the baseline cells survive.
    if count_with_gene_filter < count_without_gene_filter * 0.1:
        print("\n⚠️  CONFIRMED: The gene filter is drastically reducing cell count!")
        print("   The SOMA database contains sparse data.")
        print("   Not all cells have measurements for all protein-coding genes.")
        print("\n💡 SOLUTION:")
        print("   - Remove var_query filter when querying cells")
        print("   - Let the dataloader handle gene filtering/selection")
        print("   - Or use only highly variable genes (HVGs) that are commonly measured")
    else:
        print("\n✓ Gene filter does not significantly reduce cell count")
        print("   The issue must be elsewhere.")

In [None]:
# Cleanup: close the experiment handle opened in the first code cell.
# Cells that read from `experiment` cannot be re-run after this without
# re-opening it first.
experiment.close()
print("\n✓ Database closed")