# Quality Control (GSE144735)

In [16]:
# Dependencies
# Run inside the notebook so packages land in the active kernel
%pip install --quiet -r ../requirements.txt



Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Building wheel for pyarrow (pyproject.toml) did not run successfully.
  │ exit code: 1
  ╰─> [359 lines of output]
      toml section missing WindowsPath('pyproject.toml') does not contain a tool.setuptools_scm section
      toml section missing WindowsPath('pyproject.toml') does not contain a tool.setuptools_scm section
      !!
      
              ********************************************************************************
              Please consider removing the following classifiers in favor of a SPDX license expression:
      
              License :: OSI Approved :: Apache Software License
      
              See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
              ********************************************************************************
      
      !!
        self._finalize_license_expression()
      !!
      
              ********************************************

In [17]:
# Workspace paths
from pathlib import Path

# Dataset root (relative to the working directory) with one sub-folder for
# downloaded inputs and one for derived outputs.
DATA_DIR = Path('gse144735')
RAW_DIR = DATA_DIR.joinpath('raw')
PROCESSED_DIR = DATA_DIR.joinpath('processed')

# Create both folders (and the shared parent) if they are not there yet;
# existing folders are left untouched.
for _workspace_dir in (RAW_DIR, PROCESSED_DIR):
    _workspace_dir.mkdir(parents=True, exist_ok=True)

print(f'Raw dir: {RAW_DIR}', f'Processed dir: {PROCESSED_DIR}', sep='\n')


Raw dir: gse144735\raw
Processed dir: gse144735\processed


In [18]:
import pooch
from pathlib import Path

# Directory on the NCBI GEO server that holds the supplementary files of GSE144735.
BASE_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE144nnn/GSE144735/suppl/"

# Short label -> archive name; each archive is fetched from BASE_URL + name.
FILES = dict(
    counts="GSE144735_processed_KUL3_CRC_10X_raw_UMI_count_matrix.txt.gz",
    log_tpm="GSE144735_processed_KUL3_CRC_10X_natural_log_TPM_matrix.txt.gz",
    annotation="GSE144735_processed_KUL3_CRC_10X_annotation.txt.gz",
)

# Fetch every archive into RAW_DIR and remember the local path under its label.
# NOTE(review): known_hash=None presumably means no checksum verification of
# the download -- confirm against the pooch docs and pin hashes if available.
downloaded_paths = {}
for label in FILES:
    filename = FILES[label]
    url = BASE_URL + filename
    path = pooch.retrieve(
        url=url,
        known_hash=None,
        fname=filename,
        path=RAW_DIR,
        progressbar=True,
    )
    downloaded_paths[label] = Path(path)
    print(f"{label}: {path}")

counts: F:\geneformer-tumor-classification\notebooks\gse144735\raw\GSE144735_processed_KUL3_CRC_10X_raw_UMI_count_matrix.txt.gz
log_tpm: F:\geneformer-tumor-classification\notebooks\gse144735\raw\GSE144735_processed_KUL3_CRC_10X_natural_log_TPM_matrix.txt.gz
annotation: F:\geneformer-tumor-classification\notebooks\gse144735\raw\GSE144735_processed_KUL3_CRC_10X_annotation.txt.gz


In [19]:
import gzip
import pandas as pd
from scipy import sparse
import scanpy as sc

# Load expression counts and metadata into AnnData
def _read_gzipped_tsv(gz_path):
    """Read a gzip-compressed, tab-separated table, using its first column as the index."""
    with gzip.open(gz_path, "rt") as handle:
        return pd.read_csv(handle, sep="\t", index_col=0)


# Count table: the row index is labelled as gene symbols; the columns are
# matched against the annotation index below.
counts_df = _read_gzipped_tsv(downloaded_paths["counts"])
counts_df.index.name = "gene_symbol"

# Annotation table, indexed by its first column.
meta_df = _read_gzipped_tsv(downloaded_paths["annotation"])

# Put the count columns in the same order as the annotation rows, so the two
# tables line up even if the files list entries in a different order.
counts_df = counts_df.loc[:, meta_df.index]

# Transpose so that former columns become observations and former rows
# (gene symbols) become variables, then store the matrix in sparse CSR form.
adata = sc.AnnData(X=sparse.csr_matrix(counts_df.to_numpy().T))
adata.obs_names = counts_df.columns.tolist()
adata.var_names = counts_df.index.tolist()
adata.obs = meta_df

# Keep a separate copy of the matrix as loaded, under the "counts" layer.
adata.layers["counts"] = adata.X.copy()

print(adata)

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Basic QC and filtering choices
min_genes = 200  # minimum n_genes_by_counts a cell needs in order to be kept
min_cells = 3    # threshold handed to filter_genes as min_cells

# Compute the QC metrics in place; the cell filter below reads the
# "n_genes_by_counts" column this adds to adata.obs.
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Filter cells first, then genes, so gene filtering sees only the kept cells.
cell_passes_qc = adata.obs["n_genes_by_counts"] >= min_genes
adata = adata[cell_passes_qc, :].copy()
sc.pp.filter_genes(adata, min_cells=min_cells)

print(f"Cells after filtering: {adata.n_obs}")
print(f"Genes after filtering: {adata.n_vars}")

In [None]:
# Normalisation and highly variable gene selection for downstream modelling
# Normalise with a target sum of 1e4, then apply log1p. The result is read
# back from adata.X below.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Snapshot the transformed matrix under its own layer name.
adata.layers["log1p_norm"] = adata.X.copy()

# subset=False: only flag the genes in adata.var, do not drop the others.
hvg_options = {"flavor": "cell_ranger", "n_top_genes": 5000, "subset": False}
sc.pp.highly_variable_genes(adata, **hvg_options)

hvg_flags = adata.var["highly_variable"]
print(hvg_flags.value_counts())

In [None]:
# Persist AnnData snapshots for later tokenisation and model training
raw_path = PROCESSED_DIR / "gse144735_filtered_raw.h5ad"
hvg_path = PROCESSED_DIR / "gse144735_hvg5k.h5ad"

# The unnormalised matrix must already be stored as the "counts" layer.
# Falling back to adata.X would be wrong here: the normalisation cell runs
# normalize_total and log1p on X, so a fallback would silently save
# transformed values under the name "counts". Fail loudly instead.
if "counts" not in adata.layers:
    raise KeyError(
        'adata.layers["counts"] is missing; re-run the loading cell so the '
        "unnormalised counts are captured before normalisation."
    )

# NOTE(review): at this point adata.X holds the log1p-normalised values and the
# unnormalised matrix lives in layers["counts"] -- confirm that downstream
# readers of the "filtered_raw" file take counts from the layer, not from X.
adata.write(raw_path, compression="gzip")

# Second snapshot restricted to the genes flagged as highly variable.
adata_hvg = adata[:, adata.var["highly_variable"]].copy()
adata_hvg.write(hvg_path, compression="gzip")

print(f"Saved filtered AnnData: {raw_path}")
print(f"Saved HVG AnnData: {hvg_path}")

In [None]:
# Sanity check: class balance and per-patient sample sizes
# Number of observations per value of the "Class" column.
class_counts = adata.obs["Class"].value_counts()
print(class_counts)

# Number of observations for each (Patient, Class) pair.
patient_class_sizes = adata.obs.groupby(["Patient", "Class"]).size()
print(patient_class_sizes)