# Quality Control (GSE131907)

In [1]:
# Colab only: uncomment the line below to uninstall the jax, jaxlib and ml-dtypes
# packages already present in the runtime.
# NOTE(review): presumably needed so requirements.txt can install compatible versions — confirm.
#%pip uninstall -y jax jaxlib ml-dtypes

Found existing installation: jax 0.7.2
Uninstalling jax-0.7.2:
  Successfully uninstalled jax-0.7.2
Found existing installation: jaxlib 0.7.2
Uninstalling jaxlib-0.7.2:
  Successfully uninstalled jaxlib-0.7.2
Found existing installation: ml_dtypes 0.5.4
Uninstalling ml_dtypes-0.5.4:
  Successfully uninstalled ml_dtypes-0.5.4


In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/singlecell-tumor-classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/geneformer-tumor-classification


In [2]:
# Dependencies
# Run inside the notebook so packages land in the active kernel
!pip install --quiet -r requirements.txt



In [13]:
# Workspace paths
from pathlib import Path

# Dataset root, relative to the current working directory, with one
# sub-folder for downloads and one for derived outputs.
DATA_DIR = Path('gse131907')
RAW_DIR, PROCESSED_DIR = (DATA_DIR / sub for sub in ('raw', 'processed'))

# Create both folders (and any missing parents); existing folders are left alone.
for workspace_dir in (RAW_DIR, PROCESSED_DIR):
    workspace_dir.mkdir(parents=True, exist_ok=True)

print(f'Raw dir: {RAW_DIR}')
print(f'Processed dir: {PROCESSED_DIR}')


Raw dir: gse131907/raw
Processed dir: gse131907/processed


In [14]:
import pooch
from pathlib import Path

# Supplementary-file directory of GEO series GSE131907 and the files to fetch,
# keyed by the short label used to look them up later.
BASE_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE131nnn/GSE131907/suppl/"
FILES = {
    "counts": "GSE131907_Lung_Cancer_raw_UMI_matrix.txt.gz",
    "annotation": "GSE131907_Lung_Cancer_cell_annotation.txt.gz",
    # "log_tpm": "GSE131907_Lung_Cancer_normalized_log2TPM_matrix.txt.gz",
}

# Fetch each file into RAW_DIR and remember its local path under its label.
# known_hash=None means no checksum is supplied for the download.
downloaded_paths = {}
for label, filename in FILES.items():
    url = BASE_URL + filename
    path = pooch.retrieve(
        url=url,
        fname=filename,
        path=RAW_DIR,
        known_hash=None,
        progressbar=True,
    )
    print(f"{label}: {path}")
    downloaded_paths[label] = Path(path)


counts: /content/drive/MyDrive/geneformer-tumor-classification/gse131907/raw/GSE131907_Lung_Cancer_raw_UMI_matrix.txt.gz
annotation: /content/drive/MyDrive/geneformer-tumor-classification/gse131907/raw/GSE131907_Lung_Cancer_cell_annotation.txt.gz


In [15]:
import pandas as pd
import numpy as np
from scipy import sparse
import scanpy as sc

# Load annotation (small); its first column becomes the index used to align cells.
meta_df = pd.read_csv(downloaded_paths["annotation"], sep="\t", index_col=0)

# Chunked load of counts to stay memory-safe (no forced dtype casting).
# Each chunk holds up to `chunksize` rows of the matrix file; rows are genes and
# columns are cells, so every chunk is stored as a sparse genes-x-cells block.
blocks = []
gene_names = []
cell_names = None
for chunk in pd.read_csv(
    downloaded_paths["counts"],
    sep="\t",
    index_col=0,
    chunksize=2000,  # lower if memory is still tight
    dtype=None,
    low_memory=False,
):
    if cell_names is None:
        # Column labels are identical for every chunk, so capture them once.
        cell_names = chunk.columns.tolist()
    blocks.append(sparse.csr_matrix(chunk.to_numpy()))
    gene_names.extend(chunk.index.tolist())

counts_csr = sparse.vstack(blocks)  # genes x cells
counts_csr = counts_csr.T  # cells x genes

# Guard against silent misalignment: reindex() below would give all-NaN rows to
# cells missing from the annotation, and .astype(str) would then turn those into
# a literal "nan" class/patient label.
missing_annotation = pd.Index(cell_names).difference(meta_df.index)
if len(missing_annotation) > 0:
    raise ValueError(
        f"{len(missing_annotation)} cells in the count matrix have no row in the "
        f"annotation table (e.g. {missing_annotation[:5].tolist()}); "
        "check that both files come from the same release."
    )

# Build AnnData aligned to annotation
adata = sc.AnnData(X=counts_csr)
adata.obs_names = cell_names
adata.var_names = gene_names
adata.obs = meta_df.reindex(cell_names)
# Standardise label columns for downstream steps
adata.obs["Class"] = adata.obs["Sample_Origin"].astype(str)
adata.obs["Patient"] = adata.obs["Sample"].astype(str)
# Keep an untouched copy of the raw counts before any normalisation of X.
adata.layers["counts"] = adata.X.copy()

print(adata)


AnnData object with n_obs × n_vars = 208506 × 29634
    obs: 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'Class', 'Patient'
    layers: 'counts'


In [16]:
# Basic QC and filtering choices
# Thresholds: a cell must express at least `min_genes` genes and a gene must be
# detected in at least `min_cells` cells to be kept.
min_genes = 200
min_cells = 3

sc.pp.calculate_qc_metrics(adata, inplace=True)

# Cell filter: boolean mask on the per-cell detected-gene count, materialised as
# a new AnnData (copy) rather than a view.
enough_genes = adata.obs["n_genes_by_counts"] >= min_genes
adata = adata[enough_genes].copy()

# Gene filter, applied in place on the already cell-filtered object.
sc.pp.filter_genes(adata, min_cells=min_cells)

print(f"Cells after filtering: {adata.n_obs}")
print(f"Genes after filtering: {adata.n_vars}")

Cells after filtering: 208506
Genes after filtering: 27578


In [17]:
# Normalisation and highly variable gene selection for downstream modelling
# Scale every cell to a total of 10,000 counts, log1p-transform X in place, and
# keep a copy of the transformed matrix in its own layer.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
adata.layers["log1p_norm"] = adata.X.copy()

# Flag the top 5000 highly variable genes without subsetting the object,
# then show how many genes carry the flag.
hvg_options = dict(flavor="cell_ranger", n_top_genes=5000, subset=False)
sc.pp.highly_variable_genes(adata, **hvg_options)
print(adata.var["highly_variable"].value_counts())

highly_variable
False    22578
True      5000
Name: count, dtype: int64


  disp_grouped = df.groupby("mean_bin")["dispersions"]


In [18]:
# NOTE(review): this cell repeats the earlier load cell and rebinds `adata` to a
# freshly loaded object, so the cell/gene filtering, the "log1p_norm" layer and
# the highly_variable flags computed above are no longer present on `adata`
# afterwards (the "Class"/"Patient" columns are not recreated here either).
# Confirm that this reset is intended; it also parses the full matrix a second time.
import pandas as pd
import numpy as np
from scipy import sparse
import scanpy as sc

# Load annotation (small); its first column becomes the index used to align cells.
meta_df = pd.read_csv(downloaded_paths["annotation"], sep="\t", index_col=0)

# Chunked load of counts to stay memory-safe (no forced dtype casting).
# Rows of the file are genes and columns are cells, so each chunk is stored as a
# sparse genes-x-cells block.
blocks = []
gene_names = []
cell_names = None
for chunk in pd.read_csv(
    downloaded_paths["counts"],
    sep="\t",
    index_col=0,
    chunksize=2000,  # lower if memory is still tight
    dtype=None,
    low_memory=False,
):
    if cell_names is None:
        # Column labels are identical for every chunk, so capture them once.
        cell_names = chunk.columns.tolist()
    blocks.append(sparse.csr_matrix(chunk.to_numpy()))
    gene_names.extend(chunk.index.tolist())

counts_csr = sparse.vstack(blocks)  # genes x cells
counts_csr = counts_csr.T  # cells x genes

# Guard against silent misalignment: reindex() below would give all-NaN rows to
# any cell that is missing from the annotation table.
missing_annotation = pd.Index(cell_names).difference(meta_df.index)
if len(missing_annotation) > 0:
    raise ValueError(
        f"{len(missing_annotation)} cells in the count matrix have no row in the "
        f"annotation table (e.g. {missing_annotation[:5].tolist()}); "
        "check that both files come from the same release."
    )

# Build AnnData aligned to annotation
adata = sc.AnnData(X=counts_csr)
adata.obs_names = cell_names
adata.var_names = gene_names
adata.obs = meta_df.reindex(cell_names)
# Keep an untouched copy of the raw counts alongside X.
adata.layers["counts"] = adata.X.copy()

print(adata)


AnnData object with n_obs × n_vars = 208506 × 29634
    obs: 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype'
    layers: 'counts'


In [20]:
# Sanity check: class balance and per-patient sample sizes
# Derive the standard label columns from their source columns when the current
# `adata` does not carry them yet; existing columns are left untouched.
for label_col, source_col in (("Class", "Sample_Origin"), ("Patient", "Sample")):
    if label_col not in adata.obs.columns:
        adata.obs[label_col] = adata.obs[source_col].astype(str)

# Cells per class, then cells per (patient, class) pair.
print(adata.obs["Class"].value_counts())
print(adata.obs.groupby(["Patient", "Class"]).size())

Class
tLung     45149
nLung     42995
nLN       37446
mBrain    29060
mLN       21479
PE        20304
tL/B      12073
Name: count, dtype: int64
Patient      Class 
BRONCHO_11   mLN       3178
BRONCHO_58   tL/B      2813
EBUS_06      tL/B      2303
EBUS_10      mLN       5144
EBUS_12      mLN       3115
EBUS_13      mLN       3550
EBUS_15      mLN       1105
EBUS_19      mLN       2112
EBUS_28      tL/B      5182
EBUS_49      tL/B      1775
EBUS_51      mLN       3275
EFFUSION_06  PE        4884
EFFUSION_11  PE        2943
EFFUSION_12  PE        4224
EFFUSION_13  PE        4441
EFFUSION_64  PE        3812
LN_01        nLN       3626
LN_02        nLN       3178
LN_03        nLN       3013
LN_04        nLN       3092
LN_05        nLN       3087
LN_06        nLN       2936
LN_07        nLN       5713
LN_08        nLN       4395
LN_11        nLN       3788
LN_12        nLN       4618
LUNG_N01     nLung     3194
LUNG_N06     nLung     2839
LUNG_N08     nLung     3380
LUNG_N09     nLung     2

In [22]:
# Summary statistics of the per-cell detected-gene count; the QC metrics are
# computed first when the current `adata` does not have that column yet.
qc_ready = "n_genes_by_counts" in adata.obs.columns
if not qc_ready:
    sc.pp.calculate_qc_metrics(adata, inplace=True)
adata.obs["n_genes_by_counts"].describe()


Unnamed: 0,n_genes_by_counts
count,208506.0
mean,1591.61015
std,1193.04511
min,200.0
25%,843.0
50%,1103.0
75%,1977.0
max,9750.0


In [23]:
# Summary statistics of the `total_counts` column of adata.obs.
# Assumes sc.pp.calculate_qc_metrics has already been run on this `adata`
# (unlike the n_genes_by_counts cell, there is no guard here) — TODO confirm.
adata.obs["total_counts"].describe()

Unnamed: 0,total_counts
count,208506.0
mean,6690.700685
std,8742.754038
min,1000.0
25%,2319.0
50%,3489.0
75%,7228.0
max,148044.0


In [24]:
# Number of cells per "Class" label, shown as rich output; the same tally is
# printed by the class-balance sanity-check cell.
adata.obs["Class"].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
tLung,45149
nLung,42995
nLN,37446
mBrain,29060
mLN,21479
PE,20304
tL/B,12073
