# Quality Control (GSE144735)

In [None]:
# Colab only: uncomment the line below to remove the preinstalled
# jax / jaxlib / ml-dtypes packages before installing the requirements.
# NOTE(review): presumably this avoids a version conflict with requirements.txt — confirm.
#%pip uninstall -y jax jaxlib ml-dtypes

In [2]:
# Colab only: uncomment to mount Google Drive and switch into the project folder,
# so that the relative paths used in the rest of the notebook resolve inside Drive.
#from google.colab import drive
#drive.mount('/content/drive')
#%cd /content/drive/MyDrive/geneformer-tumor-classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/geneformer-tumor-classification


In [3]:
# Dependencies
# Run inside the notebook so packages land in the active kernel
!pip install --quiet -r requirements.txt



In [4]:
# Workspace paths: raw downloads and processed artefacts share one dataset folder
from pathlib import Path

DATA_DIR = Path('gse144735')
RAW_DIR = DATA_DIR.joinpath('raw')
PROCESSED_DIR = DATA_DIR.joinpath('processed')

# Create both folders up front; nothing happens if they already exist
for workspace_dir in (RAW_DIR, PROCESSED_DIR):
    workspace_dir.mkdir(parents=True, exist_ok=True)

print(f'Raw dir: {RAW_DIR}')
print(f'Processed dir: {PROCESSED_DIR}')


Raw dir: gse144735/raw
Processed dir: gse144735/processed


In [5]:
import pooch
from pathlib import Path

# Supplementary files of GEO series GSE144735, keyed by the short label used
# to look them up in the rest of the notebook.
BASE_URL = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE144nnn/GSE144735/suppl/"
FILES = {
    "counts": "GSE144735_processed_KUL3_CRC_10X_raw_UMI_count_matrix.txt.gz",
    "log_tpm": "GSE144735_processed_KUL3_CRC_10X_natural_log_TPM_matrix.txt.gz",
    "annotation": "GSE144735_processed_KUL3_CRC_10X_annotation.txt.gz",
}

# Fetch every file into RAW_DIR and remember the local location under its label.
# NOTE(review): known_hash=None means no checksum is supplied for the downloads;
# consider pinning hashes once they are known.
downloaded_paths = {}
for label in FILES:
    filename = FILES[label]
    local_file = pooch.retrieve(
        url=BASE_URL + filename,
        known_hash=None,
        fname=filename,
        path=RAW_DIR,
        progressbar=True,
    )
    downloaded_paths[label] = Path(local_file)
    print(f"{label}: {local_file}")

counts: /content/drive/MyDrive/geneformer-tumor-classification/gse144735/raw/GSE144735_processed_KUL3_CRC_10X_raw_UMI_count_matrix.txt.gz
log_tpm: /content/drive/MyDrive/geneformer-tumor-classification/gse144735/raw/GSE144735_processed_KUL3_CRC_10X_natural_log_TPM_matrix.txt.gz
annotation: /content/drive/MyDrive/geneformer-tumor-classification/gse144735/raw/GSE144735_processed_KUL3_CRC_10X_annotation.txt.gz


In [6]:
import gzip
import pandas as pd
from scipy import sparse
import scanpy as sc


def _read_gz_table(gz_path):
    """Read a gzip-compressed, tab-separated table; the first column becomes the index."""
    with gzip.open(gz_path, "rt") as handle:
        return pd.read_csv(handle, sep="\t", index_col=0)


# Expression counts: rows are labelled as gene symbols, columns are matched to cells below
counts_df = _read_gz_table(downloaded_paths["counts"])
counts_df.index.name = "gene_symbol"

# Per-cell annotation table
meta_df = _read_gz_table(downloaded_paths["annotation"])

# Put the count columns in the same order as the annotation rows
counts_df = counts_df.loc[:, meta_df.index]

# AnnData wants observations (cells) on rows, hence the transpose
cells_by_genes = sparse.csr_matrix(counts_df.values.T)
adata = sc.AnnData(X=cells_by_genes)
adata.obs_names = counts_df.columns.tolist()
adata.var_names = counts_df.index.tolist()
adata.obs = meta_df

# Keep an untouched copy of the matrix in a dedicated layer
adata.layers["counts"] = adata.X.copy()

print(adata)

AnnData object with n_obs × n_vars = 27414 × 33694
    obs: 'Patient', 'Class', 'Sample', 'Cell_type', 'Cell_subtype'
    layers: 'counts'


In [7]:
# Basic QC and filtering choices
# QC metrics are stored on the object itself (inplace=True)
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Thresholds: minimum genes detected per cell, minimum cells per gene
min_genes, min_cells = 200, 3

# Cell filter first (boolean mask on the QC metric), then the gene filter
# applied to the cells that survived.
has_enough_genes = adata.obs["n_genes_by_counts"] >= min_genes
adata = adata[has_enough_genes].copy()
sc.pp.filter_genes(adata, min_cells=min_cells)

print(f"Cells after filtering: {adata.n_obs}")
print(f"Genes after filtering: {adata.n_vars}")

Cells after filtering: 27414
Genes after filtering: 24471


In [8]:
# Normalisation and highly variable gene selection for downstream modelling

# Always start from the "counts" layer so this cell is idempotent: without the
# reset, re-running it would normalise and log-transform values in adata.X
# that had already been transformed. On the first run this is a no-op.
adata.X = adata.layers["counts"].copy()
# NOTE(review): scanpy's log1p appears to record a "log1p" entry in adata.uns and
# warns when it is already present; dropping it keeps a re-run free of that
# misleading warning — confirm against the installed scanpy version.
adata.uns.pop("log1p", None)

# Scale each cell to a total of 10,000, then apply log1p
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep the normalised matrix in its own layer next to the counts
adata.layers["log1p_norm"] = adata.X.copy()

# Flag (subset=False: do not drop) the 5000 most variable genes
sc.pp.highly_variable_genes(adata, flavor="cell_ranger", n_top_genes=5000, subset=False)
print(adata.var.highly_variable.value_counts())

highly_variable
False    19471
True      5000
Name: count, dtype: int64


  disp_grouped = df.groupby("mean_bin")["dispersions"]


In [9]:
# Persist AnnData snapshots for later tokenisation and model training
raw_path = PROCESSED_DIR / "gse144735_filtered_raw.h5ad"
hvg_path = PROCESSED_DIR / "gse144735_hvg5k.h5ad"

# adata.X holds log1p-normalised values at this point, so it must not be used
# as a stand-in for the counts. Fail loudly if the counts layer is missing
# instead of silently saving normalised values under the name "counts".
if "counts" not in adata.layers:
    raise KeyError(
        "adata.layers['counts'] is missing; re-run the loading cell so the "
        "count matrix is available before saving."
    )

# Filtered object with all genes (X: log1p-normalised, layers: counts + log1p_norm)
adata.write(raw_path, compression="gzip")

# Same cells, restricted to the genes flagged as highly variable
adata_hvg = adata[:, adata.var["highly_variable"]].copy()
adata_hvg.write(hvg_path, compression="gzip")

print(f"Saved filtered AnnData: {raw_path}")
print(f"Saved HVG AnnData: {hvg_path}")

Saved filtered AnnData: gse144735/processed/gse144735_filtered_raw.h5ad
Saved HVG AnnData: gse144735/processed/gse144735_hvg5k.h5ad


In [10]:
# Sanity check: class balance and per-patient sample sizes
print(adata.obs["Class"].value_counts())
# observed=False is passed explicitly: it keeps the current result (every
# Patient x Class combination is listed) and avoids the pandas FutureWarning
# about the changing default for categorical group keys.
# NOTE(review): assumes the warning captured under this cell is that deprecation — confirm.
print(adata.obs.groupby(["Patient", "Class"], observed=False).size())

Class
Normal    9736
Border    9424
Tumor     8254
Name: count, dtype: int64
Patient  Class 
KUL01    Border    2129
         Normal    2012
         Tumor     1922
KUL19    Border    3263
         Normal    1875
         Tumor     3128
KUL21    Border    1741
         Normal    1340
         Tumor     2149
KUL28    Border     406
         Normal     908
         Tumor      428
KUL30    Border     766
         Normal    2092
         Tumor      552
KUL31    Border    1119
         Normal    1509
         Tumor       75
dtype: int64


  print(adata.obs.groupby(["Patient", "Class"]).size())


In [None]:
# Distribution of detected genes per cell. The metric was computed by
# calculate_qc_metrics before the gene filter was applied.
adata.obs["n_genes_by_counts"].describe()


Unnamed: 0,n_genes_by_counts
count,27414.0
mean,1744.139345
std,1201.347218
min,201.0
25%,899.25
50%,1306.0
75%,2260.75
max,5998.0


In [14]:
# Distribution of total counts per cell (column added by calculate_qc_metrics)
adata.obs["total_counts"].describe()

Unnamed: 0,total_counts
count,27414.0
mean,7698.133399
std,8000.894467
min,1001.0
25%,2492.0
50%,4276.0
75%,10015.75
max,58449.0


In [15]:
# Number of cells per Class label, displayed as a rich table
adata.obs["Class"].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
Normal,9736
Border,9424
Tumor,8254
