# Preprocessing: Domínguez Conde immune tissue (two donors)

Dieses Notebook erstellt konsistente Inputs für:
- **C2S Baseline** (CellSentences nach Expression-Ranking)
- **scGPT** (die gleichen Top‑K Gene + Werte, damit später `mlm_output` gerankt werden kann)

**Input:** `../data/dominguez_conde_immune_tissue_two_donors.h5ad` (relativ zum Arbeitsverzeichnis des Notebooks, siehe `DATA_PATH`)

**Outputs:** im Ordner `processed/`.


In [1]:
# Preprocessing Notebook für:
# - C2S Baseline: CellSentences = Gene nach Expression (Top-K) sortiert
# - scGPT Input: dieselben Top-K Gene + Werte (damit scGPT pro Gen Scores ausgeben kann)
#
# Datensatz: ../data/dominguez_conde_immune_tissue_two_donors.h5ad (siehe DATA_PATH)
#
# Outputs (im Ordner "processed/"):
# 1) processed_adata_hvg.h5ad          (mit Layern für counts/log1p_norm)
# 2) splits.csv                        (train/val/test pro cell_id)
# 3) c2s_sentences_expression.txt      (eine Sentence pro Zelle)
# 4) scgpt_inputs_topk.npz             (Arrays: topk_gene_symbols, topk_values, cell_id)
#
# Hinweis:
# - scGPT erwartet typischerweise Gene als Tokens (IDs) + Werte. Hier speichern wir Gene als Symbole.
#   Das Mapping auf scGPT-Vokabular machst du im Trainingsteil.

import os
import numpy as np
import pandas as pd

import scanpy as sc
from scipy import sparse
from sklearn.model_selection import GroupShuffleSplit

# ======================
# Configuration
# ======================
# All outputs of this notebook are written below OUTDIR; the directory is created
# on first run. Relative paths resolve against the kernel's working directory.
OUTDIR = "processed"
os.makedirs(OUTDIR, exist_ok=True)

# Input AnnData file read by the loading cell.
DATA_PATH = "../data/dominguez_conde_immune_tissue_two_donors.h5ad"

# Seed for NumPy's global generator and for the scikit-learn splitters further down.
SEED = 42
np.random.seed(SEED)

K_TOP = 512                 # length of the cell sentence / scGPT sequence (256 or 512 usually works well on a small compute budget)
N_HVG = 2000                # number of highly variable genes kept (stability/speed)
MIN_COUNTS_PER_CELL = 500   # QC threshold: minimum total counts per cell
MIN_GENES_PER_CELL = 200    # QC threshold: minimum number of detected genes per cell



# Laden

In [2]:
# Read the input AnnData file and show its summary.
adata = sc.read_h5ad(DATA_PATH)
print(adata)

# Every downstream output is keyed by "cell_id"; when the file does not ship
# such a column, derive it from the observation names.
has_cell_id = "cell_id" in adata.obs.columns
if not has_cell_id:
    adata.obs["cell_id"] = adata.obs_names.astype(str)

AnnData object with n_obs × n_vars = 29773 × 36503
    obs: 'cell_type', 'tissue', 'batch_condition', 'organism', 'assay', 'sex'
    var: 'gene_name', 'ensembl_id'


# Counts sichern

In [3]:

# Preserve the matrix as loaded in a dedicated "counts" layer before X is
# normalised further down. An existing "counts" layer is kept unchanged.
# NOTE(review): assumes adata.X holds raw counts at load time -- confirm for this file.
if "counts" not in adata.layers:
    adata.layers["counts"] = adata.X.copy()

# QC Filter

In [4]:
# QC: drop cells with too few total counts or too few detected genes,
# then drop genes that are seen in fewer than 10 of the remaining cells.
sc.pp.calculate_qc_metrics(adata, inplace=True)

enough_counts = adata.obs["total_counts"] >= MIN_COUNTS_PER_CELL
enough_genes = adata.obs["n_genes_by_counts"] >= MIN_GENES_PER_CELL
mask_cells = enough_counts & enough_genes

# .copy() turns the filtered view into an independent AnnData object.
adata = adata[mask_cells].copy()

sc.pp.filter_genes(adata, min_cells=10)

print("After QC:", adata)

After QC: AnnData object with n_obs × n_vars = 29773 × 20426
    obs: 'cell_type', 'tissue', 'batch_condition', 'organism', 'assay', 'sex', 'cell_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'gene_name', 'ensembl_id', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
    layers: 'counts'


# Normalisierung + log1p für das Ranking

In [5]:
# Reset X to the stored counts before normalising.
adata.X = adata.layers["counts"].copy()
# Scale every cell to a total of 1e4, then apply log1p.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep the normalised matrix under an explicit layer name as well.
adata.layers["log1p_norm"] = adata.X.copy()

# HVGs auswählen

In [6]:
# Select the N_HVG most variable genes and subset the data to them.
#
# flavor="seurat_v3" expects raw count data. At this point adata.X already holds
# log1p-normalised values, so the selection is pointed at the "counts" layer
# instead of X (layer=None would silently rank genes on the wrong scale).
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=N_HVG,
    flavor="seurat_v3",
    layer="counts",
)

# The subset keeps X (log1p-normalised) and all layers, restricted to the HVGs.
adata_hvg = adata[:, adata.var["highly_variable"].values].copy()
print("HVG adata:", adata_hvg)

  sc.pp.highly_variable_genes(


HVG adata: AnnData object with n_obs × n_vars = 29773 × 2000
    obs: 'cell_type', 'tissue', 'batch_condition', 'organism', 'assay', 'sex', 'cell_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'gene_name', 'ensembl_id', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'log1p', 'hvg'
    layers: 'counts', 'log1p_norm'


# Split

In [7]:
# Assign every cell to train / val / test (roughly 70 / 15 / 15 %) and write
# the labels to splits.csv.
#
# If obs contains a donor-like column, the split is group-disjoint (no donor
# appears in more than one split). Otherwise cells are split at random.
# NOTE(review): the loaded obs columns include 'batch_condition', which may
# encode the donor for this dataset -- confirm and add it to the list if so.
candidate_group_cols = [
    "donor", "donor_id", "donorID", "subject", "subject_id",
    "individual", "patient", "participant"
]
group_col = next(
    (c for c in candidate_group_cols if c in adata_hvg.obs.columns),
    None,
)

if group_col is None:
    print("WARNUNG: Keine donor/subject Spalte gefunden. Fallback: random split ohne groups.")
    groups = None
else:
    print("Group split by:", group_col)
    groups = adata_hvg.obs[group_col].astype(str).values
    # Three group-disjoint splits need at least three groups. With fewer, the
    # second GroupShuffleSplit below would be left with a single group and
    # raise because its train side would be empty.
    n_groups = len(set(groups))
    if n_groups < 3:
        print(
            f"WARNUNG: Nur {n_groups} Gruppe(n) in '{group_col}'; ein gruppendisjunkter "
            "train/val/test Split braucht mindestens 3. Fallback: random split ohne groups."
        )
        groups = None

cell_ids = adata_hvg.obs["cell_id"].values

if groups is not None:
    # Stage 1: hold out ~15 % of the groups as test.
    gss = GroupShuffleSplit(n_splits=1, test_size=0.15, random_state=SEED)
    trainval_idx, test_idx = next(gss.split(cell_ids, groups=groups))

    # Stage 2: split the remainder; 0.1765 ~= 0.15 / 0.85, i.e. ~15 % of the total.
    # The splitter returns positions within trainval_idx, hence the re-indexing.
    gss2 = GroupShuffleSplit(n_splits=1, test_size=0.1765, random_state=SEED)
    tr_idx, va_idx = next(gss2.split(trainval_idx, groups=groups[trainval_idx]))
    train_idx = trainval_idx[tr_idx]
    val_idx = trainval_idx[va_idx]
else:
    # A cell-local generator makes this cell idempotent: re-running it yields
    # the same split instead of advancing NumPy's global state. RandomState(SEED)
    # produces the same stream as np.random.seed(SEED), so the result matches a
    # clean top-to-bottom run as long as nothing upstream drew from the global
    # generator.
    rng = np.random.RandomState(SEED)

    idx = np.arange(len(cell_ids))
    rng.shuffle(idx)
    cut = int(len(idx) * 0.85)
    trainval_idx, test_idx = idx[:cut], idx[cut:]

    # Copy before the second shuffle so test_idx (a view of the first
    # permutation) is not disturbed.
    idx = np.array(trainval_idx)
    rng.shuffle(idx)
    cut = int(len(idx) * (0.70/0.85))
    train_idx, val_idx = idx[:cut], idx[cut:]

split = np.array([""] * len(cell_ids), dtype=object)
split[train_idx] = "train"
split[val_idx] = "val"
split[test_idx] = "test"

# Every cell must have received exactly one label.
assert (split != "").all(), "some cells were not assigned to any split"

splits_df = pd.DataFrame({"cell_id": cell_ids, "split": split})
splits_df.to_csv(os.path.join(OUTDIR, "splits.csv"), index=False)
print(splits_df["split"].value_counts())

WARNUNG: Keine donor/subject Spalte gefunden. Fallback: random split ohne groups.
split
train    20841
val       4466
test      4466
Name: count, dtype: int64


# Top K Genes Ranking

In [8]:
# Inputs shared by the two export cells below: the expression matrix of the HVG
# subset and the matching gene labels (one per column of X).
# NOTE(review): the labels come from var_names; var also carries 'gene_name' and
# 'ensembl_id' columns -- confirm var_names are the symbols the scGPT vocabulary expects.
X = adata_hvg.X
gene_names = adata_hvg.var_names.to_numpy()

def topk_idx(row, k):
    """Return the positions of the ``k`` largest entries of ``row``, largest first.

    ``row`` is flattened before ranking. When ``k`` is at least the number of
    entries, all positions are returned in descending order of value (so the
    result can be shorter than ``k``). The relative order of tied values is
    whatever NumPy's partition/sort yields.
    """
    flat = row.ravel()
    negated = -flat  # ascending order on the negation == descending on the values

    # argpartition needs k < len(flat); a full sort covers the remaining case.
    if k >= flat.shape[0]:
        return np.argsort(negated)[:k]

    # Pick the k largest in linear time, then order only those k.
    candidates = np.argpartition(negated, k)[:k]
    ranking = np.argsort(negated[candidates])
    return candidates[ranking]

# Expression Ranking

In [9]:
# 1) Write the cell sentences (top-K genes per cell) to a text file.
#    One line per cell: "<cell_id>\t<gene_1> <gene_2> ... <gene_K>", with genes
#    ordered by descending value in X.
# NOTE(review): a cell with fewer than K_TOP non-zero genes gets zero-valued
# genes appended (tie order unspecified) -- confirm this padding is wanted for C2S.
sent_path = os.path.join(OUTDIR, "c2s_sentences_expression.txt")

# CSR gives cheap row access for sparse input; dense input is used as is.
is_sparse = sparse.issparse(X)
X_rows = X.tocsr() if is_sparse else X

# Explicit UTF-8 so the file does not depend on the platform's locale encoding.
with open(sent_path, "w", encoding="utf-8") as f:
    for i in range(X_rows.shape[0]):
        # Densify one row at a time to keep memory use flat.
        if is_sparse:
            row = X_rows.getrow(i).toarray().ravel()
        else:
            row = np.array(X_rows[i]).ravel()
        idx = topk_idx(row, K_TOP)
        sent = " ".join(gene_names[idx])
        f.write(f"{cell_ids[i]}\t{sent}\n")

print("Wrote:", sent_path)


Wrote: processed/c2s_sentences_expression.txt


# scGPT Inputs

In [10]:
# Build the scGPT inputs: for every cell the top-K gene labels and their values,
# in the same descending order topk_idx produces.
n_cells = len(cell_ids)
topk_genes = np.empty((n_cells, K_TOP), dtype=object)
topk_vals  = np.zeros((n_cells, K_TOP), dtype=np.float32)

# Row access differs between sparse and dense matrices; everything after that
# is identical, so the branch only decides how a row is densified.
sparse_input = sparse.issparse(X)
if sparse_input:
    X_csr = X.tocsr()
    n_rows = X_csr.shape[0]
else:
    n_rows = X.shape[0]

for i in range(n_rows):
    if sparse_input:
        row = X_csr.getrow(i).toarray().ravel()
    else:
        row = np.array(X[i]).ravel()
    idx = topk_idx(row, K_TOP)
    topk_genes[i, :] = gene_names[idx]
    topk_vals[i, :] = row[idx].astype(np.float32)

# topk_genes has dtype=object, so NumPy stores it pickled inside the archive;
# reading it back requires np.load(..., allow_pickle=True).
npz_path = os.path.join(OUTDIR, "scgpt_inputs_topk.npz")
np.savez_compressed(
    npz_path,
    cell_id=cell_ids,
    topk_gene_symbols=topk_genes,
    topk_values=topk_vals,
)
print("Wrote:", npz_path)

Wrote: processed/scgpt_inputs_topk.npz


# Save processed Data

In [11]:
# Attach the split labels to adata_hvg and define where it will be written.
adata_hvg.obs["split"] = split
save_path = os.path.join(OUTDIR, "processed_adata_hvg.h5ad")

# NOTE(review): redundant -- pandas is already imported as pd in the first cell.
import pandas as pd

def _dearrow_df(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    # index
    if str(out.index.dtype).startswith("string"):
        out.index = out.index.astype(object)
    # columns
    for c in out.columns:
        if str(out[c].dtype).startswith("string"):
            out[c] = out[c].astype(object)
    return out

# Replace string-extension dtypes in obs/var with plain object dtype before writing
# (presumably so write_h5ad can serialise them -- confirm against the installed anndata).
adata_hvg.obs = _dearrow_df(adata_hvg.obs)
adata_hvg.var = _dearrow_df(adata_hvg.var)

# Write the HVG subset (X, layers, obs incl. split labels, var) to disk.
adata_hvg.write_h5ad(save_path)
print("Wrote:", save_path)

print("Done.")


Wrote: processed/processed_adata_hvg.h5ad
Done.
