# Tokenisation (GSE131907)


Convert the QC-filtered AnnData into ranked gene tokens ready for Geneformer fine-tuning.


In [None]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/singlecell-tumor-classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/geneformer-tumor-classification


In [2]:
#%pip uninstall -y jax jaxlib ml-dtypes

In [3]:
# Dependencies
!pip install --quiet -r requirements.txt

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse
from tqdm import trange


In [5]:
PROCESSED_DIR = Path("gse131907/processed")
TOKEN_DIR = PROCESSED_DIR / "tokens"
TOKEN_DIR.mkdir(parents=True, exist_ok=True)

RAW_FILENAME = PROCESSED_DIR / "gse131907_filtered_raw.h5ad"
HVG_FILENAME = PROCESSED_DIR / "gse131907_hvg5k.h5ad"

print(f"Using filtered AnnData: {RAW_FILENAME}")
print(f"Exists? {RAW_FILENAME.exists()}")


Using filtered AnnData: gse131907/processed/gse131907_filtered_raw.h5ad
Exists? True


In [7]:
 # Ensure standardized columns exist even if they weren't saved in the H5AD
adata = sc.read_h5ad(RAW_FILENAME)
if "Class" not in adata.obs.columns:
    adata.obs["Class"] = adata.obs["Sample_Origin"].astype(str)
if "Patient" not in adata.obs.columns:
    adata.obs["Patient"] = adata.obs["Sample"].astype(str)

In [8]:
print(adata)
print(adata.obs["Class"].value_counts())


AnnData object with n_obs × n_vars = 208506 × 27578
    obs: 'Barcode', 'Sample', 'Sample_Origin', 'Cell_type', 'Cell_type.refined', 'Cell_subtype', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'Class', 'Patient'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'log1p'
    layers: 'counts', 'log1p_norm'
Class
tLung     45149
nLung     42995
nLN       37446
mBrain    29060
mLN       21479
PE        20304
tL/B      12073
Name: count, dtype: int64


In [9]:
counts = adata.layers.get("counts", adata.X)
if not sparse.issparse(counts):
    counts = sparse.csr_matrix(counts)
else:
    counts = counts.tocsr()

print(f"Counts CSR shape: {counts.shape}")
print(f"Non-zero entries: {counts.nnz}")


Counts CSR shape: (208506, 27578)
Non-zero entries: 331857603


In [10]:
gene_vocab = pd.Series(
    data=np.arange(adata.n_vars, dtype=np.int32),
    index=adata.var_names,
    name="token_id",
)
vocab_path = TOKEN_DIR / "gene_vocab.tsv"
gene_vocab.to_csv(vocab_path, sep="\t", header=True)
print(f"Wrote gene vocabulary: {vocab_path}")
gene_vocab.head()


Wrote gene vocabulary: gse131907/processed/tokens/gene_vocab.tsv


Unnamed: 0,token_id
A1BG,0
A1BG-AS1,1
A1CF,2
A2M,3
A2M-AS1,4


In [11]:
MAX_GENES = 2048  # truncate ranked list to the top-N expressed genes per cell

token_matrix = np.full((adata.n_obs, MAX_GENES), fill_value=-1, dtype=np.int32)
token_lengths = np.zeros(adata.n_obs, dtype=np.int32)

indptr = counts.indptr
indices = counts.indices
data = counts.data

for cell_idx in trange(adata.n_obs, desc="Ranking genes"):
    start = indptr[cell_idx]
    end = indptr[cell_idx + 1]
    cell_gene_idx = indices[start:end]
    cell_expr = data[start:end]
    if cell_expr.size == 0:
        continue
    order = np.argsort(cell_expr)[::-1]
    ranked = cell_gene_idx[order]
    if ranked.size > MAX_GENES:
        ranked = ranked[:MAX_GENES]
    token_matrix[cell_idx, : ranked.size] = ranked
    token_lengths[cell_idx] = ranked.size

print(f"Token matrix shape: {token_matrix.shape}")
print(f"Max token length: {token_lengths.max()}")
print(f"Median token length: {np.median(token_lengths)}")


Ranking genes: 100%|██████████| 208506/208506 [00:07<00:00, 27719.84it/s]

Token matrix shape: (208506, 2048)
Max token length: 2048
Median token length: 1103.0





In [12]:
token_path = TOKEN_DIR / "gse131907_gene_rank_tokens.npz"
np.savez_compressed(
    token_path,
    tokens=token_matrix,
    lengths=token_lengths,
    max_genes=MAX_GENES,
)

# Map dataset-specific columns to standard names
# GSE131907 uses 'Sample_Origin' for tumor/normal/metastatic labels; reuse Sample as a patient proxy if no donor column exists.
metadata = adata.obs[["Sample", "Sample_Origin"]].copy()
metadata = metadata.rename(columns={"Sample_Origin": "Class"})
metadata.insert(0, "Patient", metadata["Sample"])  # replace with true patient column if available

# Collapse multiple class labels into BinaryClass
class_to_binary = {
    "tLung": "Tumor",
    "tL/B": "Tumor",
    "mBrain": "Tumor",
    "mLN": "Tumor",
    "PE": "Tumor",
    "nLung": "Normal",
    "nLN": "Normal",
}
metadata["BinaryClass"] = metadata["Class"].map(class_to_binary).fillna("Normal")

metadata["token_length"] = token_lengths
metadata_path = TOKEN_DIR / "gse131907_tokens_metadata.tsv"
metadata.to_csv(metadata_path, sep="\t", index=False)

print(f"Saved tokens to {token_path}")
print(f"Saved metadata to {metadata_path}")
metadata.groupby(["Patient", "Class"]).size().head()


Saved tokens to gse131907/processed/tokens/gse131907_gene_rank_tokens.npz
Saved metadata to gse131907/processed/tokens/gse131907_tokens_metadata.tsv


  metadata.groupby(["Patient", "Class"]).size().head()


Unnamed: 0_level_0,Unnamed: 1_level_0,0
Patient,Class,Unnamed: 2_level_1
BRONCHO_11,PE,0
BRONCHO_11,mBrain,0
BRONCHO_11,mLN,3178
BRONCHO_11,nLN,0
BRONCHO_11,nLung,0
