# Tokenisation (GSE144735)


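Convert the QC-filtered AnnData into rank-ordered gene tokens ready for Geneformer/scGPT fine-tuning: for each cell, the expressed genes are sorted by descending count and truncated to the top `MAX_GENES`. The notebook writes a gene vocabulary, a padded token matrix with per-cell lengths, and per-cell metadata under `gse144735/processed/tokens/`.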


In [1]:
# Mount Google Drive inside the Colab runtime, then switch the working directory
# to the project folder so that the relative paths used by the following cells
# (requirements.txt, gse144735/processed/...) resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/geneformer-tumor-classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/geneformer-tumor-classification


In [2]:
# NOTE(review): disabled cell. If re-enabled it would uninstall jax, jaxlib and
# ml-dtypes from the kernel environment — presumably a workaround for a
# dependency conflict; confirm it is still needed, otherwise delete this cell.
#%pip uninstall -y jax jaxlib ml-dtypes

In [3]:
# Dependencies
!pip install --quiet -r requirements.txt

In [4]:
from pathlib import Path

import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse
from tqdm import trange


In [5]:
# Input/output locations, all relative to the current working directory.
PROCESSED_DIR = Path("gse144735/processed")
TOKEN_DIR = PROCESSED_DIR.joinpath("tokens")
# Create the token output folder (and any missing parents); a pre-existing
# folder is not an error.
TOKEN_DIR.mkdir(parents=True, exist_ok=True)

RAW_FILENAME = PROCESSED_DIR.joinpath("gse144735_filtered_raw.h5ad")
HVG_FILENAME = PROCESSED_DIR.joinpath("gse144735_hvg5k.h5ad")

# Report which input file will be read and whether it is present on disk.
print(f"Using filtered AnnData: {RAW_FILENAME}")
print(f"Exists? {RAW_FILENAME.exists()}")


Using filtered AnnData: gse144735/processed/gse144735_filtered_raw.h5ad
Exists? True


In [6]:
# Load the QC-filtered dataset and print its structure, followed by the number
# of cells carrying each `Class` label.
adata = sc.read_h5ad(RAW_FILENAME)
print(adata)
class_counts = adata.obs["Class"].value_counts()
print(class_counts)


AnnData object with n_obs × n_vars = 27414 × 24471
    obs: 'Patient', 'Class', 'Sample', 'Cell_type', 'Cell_subtype', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'log1p'
    layers: 'counts', 'log1p_norm'
Class
Normal    9736
Border    9424
Tumor     8254
Name: count, dtype: int64


In [7]:
# Take the "counts" layer when the AnnData has one; otherwise fall back to X.
counts = adata.layers.get("counts", adata.X)
# Normalise the container to SciPy CSR: convert an existing sparse matrix, or
# wrap a dense array.
counts = counts.tocsr() if sparse.issparse(counts) else sparse.csr_matrix(counts)

print(f"Counts CSR shape: {counts.shape}")
print(f"Non-zero entries: {counts.nnz}")


Counts CSR shape: (27414, 24471)
Non-zero entries: 47810455


In [8]:
# Give every gene a token id equal to its column position in the AnnData
# (0 .. n_vars - 1) and persist the mapping as a tab-separated file.
vocab_path = TOKEN_DIR / "gene_vocab.tsv"
gene_vocab = pd.Series(
    np.arange(adata.n_vars, dtype=np.int32),
    index=adata.var_names,
    name="token_id",
)
gene_vocab.to_csv(vocab_path, sep="\t", header=True)
print(f"Wrote gene vocabulary: {vocab_path}")
# Last expression: show the first few vocabulary entries.
gene_vocab.head()


Wrote gene vocabulary: gse144735/processed/tokens/gene_vocab.tsv


Unnamed: 0,token_id
RP11-34P13.7,0
FO538757.2,1
AP006222.2,2
RP4-669L17.10,3
RP5-857K21.4,4


In [9]:
MAX_GENES = 2048  # truncate ranked list to the top-N expressed genes per cell

# One row per cell holding gene token ids in rank order. Slots beyond a cell's
# ranked list keep the fill value -1; token_lengths records how many leading
# slots of each row are real tokens.
token_matrix = np.full((adata.n_obs, MAX_GENES), fill_value=-1, dtype=np.int32)
token_lengths = np.zeros(adata.n_obs, dtype=np.int32)

# Raw CSR buffers: the stored entries of row i sit at indptr[i]:indptr[i + 1].
indptr = counts.indptr
indices = counts.indices
data = counts.data

for cell_idx in trange(adata.n_obs, desc="Ranking genes"):
    start = indptr[cell_idx]
    end = indptr[cell_idx + 1]
    cell_gene_idx = indices[start:end]
    cell_expr = data[start:end]

    # A sparse matrix may carry explicitly stored zeros; those genes are not
    # expressed in this cell and must not receive a token.
    expressed = cell_expr > 0
    if not expressed.all():
        cell_gene_idx = cell_gene_idx[expressed]
        cell_expr = cell_expr[expressed]
    if cell_expr.size == 0:
        continue  # row stays all -1 with length 0

    # Rank by descending expression. Equal counts are very common, so ties are
    # broken explicitly by ascending gene index: np.lexsort treats its LAST key
    # as the primary one. A plain argsort(...)[::-1] uses an unstable sort and
    # leaves the tie order (and therefore which genes survive truncation)
    # unspecified. The float64 cast makes the negation safe for unsigned dtypes.
    order = np.lexsort((cell_gene_idx, -cell_expr.astype(np.float64)))
    ranked = cell_gene_idx[order[:MAX_GENES]]
    token_matrix[cell_idx, : ranked.size] = ranked
    token_lengths[cell_idx] = ranked.size

print(f"Token matrix shape: {token_matrix.shape}")
print(f"Max token length: {token_lengths.max()}")
print(f"Median token length: {np.median(token_lengths)}")


Ranking genes: 100%|██████████| 27414/27414 [00:01<00:00, 17865.76it/s]

Token matrix shape: (27414, 2048)
Max token length: 2048
Median token length: 1305.5





In [10]:
# Persist the padded token matrix, the per-cell token counts and the truncation
# limit together in one compressed NumPy archive.
token_path = TOKEN_DIR / "gse144735_gene_rank_tokens.npz"
np.savez_compressed(
    token_path,
    tokens=token_matrix,
    lengths=token_lengths,
    max_genes=MAX_GENES,
)

# Per-cell labels, in the same row order as the token matrix. The index is not
# written (index=False), so rows pair with token rows by position only.
metadata_cols = ["Patient", "Class", "Sample"]
metadata = adata.obs[metadata_cols].copy()
metadata["token_length"] = token_lengths
metadata_path = TOKEN_DIR / "gse144735_tokens_metadata.tsv"
metadata.to_csv(metadata_path, sep="\t", index=False)

print(f"Saved tokens to {token_path}")
print(f"Saved metadata to {metadata_path}")
# Cells per (Patient, Class). `observed` is passed explicitly: False keeps the
# result this cell has always produced, and stating it avoids the pandas
# FutureWarning about the changing default for categorical group keys.
metadata.groupby(["Patient", "Class"], observed=False).size().head()


Saved tokens to gse144735/processed/tokens/gse144735_gene_rank_tokens.npz
Saved metadata to gse144735/processed/tokens/gse144735_tokens_metadata.tsv


  metadata.groupby(["Patient", "Class"]).size().head()


Unnamed: 0_level_0,Unnamed: 1_level_0,0
Patient,Class,Unnamed: 2_level_1
KUL01,Border,2129
KUL01,Normal,2012
KUL01,Tumor,1922
KUL19,Border,3263
KUL19,Normal,1875
