# C2S Tutorial-Style 1: Finetuning On New Dataset

Dieses Notebook ist am Workflow von:
- `c2s_tutorial_3_finetuning_on_new_datasets.ipynb`

Ziel: `vandijklab/C2S-Pythia-410m-cell-type-prediction` auf deinem Datensatz feinjustieren.

In [1]:
# Optional (bei Bedarf):
# %pip install -q cell2sentence anndata scanpy transformers datasets pandas numpy scipy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
from datetime import datetime
import json
import random

import numpy as np
import anndata as ad
import scanpy as sc

import cell2sentence as cs
from transformers import TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# ---------- Config ----------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

H5AD_PATH = Path('../../data/dominguez_conde_immune_tissue_two_donors.h5ad')
BASE_MODEL = 'vandijklab/C2S-Pythia-410m-cell-type-prediction'
TRAINING_TASK = 'cell_type_prediction'
TOP_K_GENES = 200

RUN_NAME = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
RUN_DIR = Path('./runs') / RUN_NAME
CSDATA_DIR = RUN_DIR / 'csdata'
MODEL_DIR = RUN_DIR / 'model'
RUN_DIR.mkdir(parents=True, exist_ok=True)
CSDATA_DIR.mkdir(parents=True, exist_ok=True)
MODEL_DIR.mkdir(parents=True, exist_ok=True)

assert H5AD_PATH.exists(), f'Not found: {H5AD_PATH.resolve()}'
print('RUN_DIR:', RUN_DIR.resolve())

RUN_DIR: /root/AI-Biomedicine/Improving-Cell2Sentence-with-Single-Cell-Foundation-Model-Embeddings/notebooks/c2s_tutorial_style/runs/2026-02-26_13-42-01


In [4]:
# ---------- Load dataset ----------
adata = ad.read_h5ad(H5AD_PATH)
print('shape:', adata.shape)
print('obs columns:', list(adata.obs.columns))

if 'cell_type' not in adata.obs.columns:
    raise ValueError("adata.obs must contain 'cell_type' for training labels.")

shape: (29773, 36503)
obs columns: ['cell_type', 'tissue', 'batch_condition', 'organism', 'assay', 'sex']


In [5]:
# ---------- Minimal preprocessing (tutorial-style baseline) ----------
adata = adata.copy()
adata.var_names_make_unique()
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

label_cols = [c for c in ['cell_type', 'tissue', 'batch_condition', 'organism', 'sex'] if c in adata.obs.columns]
print('label columns:', label_cols)

label columns: ['cell_type', 'tissue', 'batch_condition', 'organism', 'sex']


In [6]:
# ---------- AnnData -> Arrow + vocabulary ----------
arrow_ds, vocab = cs.CSData.adata_to_arrow(
    adata,
    random_state=SEED,
    sentence_delimiter=' ',
    label_col_names=label_cols,
)
print('n samples in arrow:', len(arrow_ds))
print('vocab size:', len(vocab))

WARN: more variables (36503) than observations (29773)... did you mean to transpose the object (e.g. adata.T)?
WARN: more variables (36503) than observations (29773), did you mean to transpose the object (e.g. adata.T)?
100%|██████████| 29773/29773 [00:10<00:00, 2840.75it/s]


n samples in arrow: 29773
vocab size: 36503


In [7]:
# ---------- Train/val/test split ----------
_, split_indices = cs.utils.train_test_split_arrow_ds(arrow_ds)

split_path = RUN_DIR / 'split_indices.json'
with split_path.open('w') as f:
    json.dump(split_indices, f, indent=2)

print('saved:', split_path.resolve())
print({k: len(v) for k, v in split_indices.items()})

saved: /root/AI-Biomedicine/Improving-Cell2Sentence-with-Single-Cell-Foundation-Model-Embeddings/notebooks/c2s_tutorial_style/runs/2026-02-26_13-42-01/split_indices.json
{'train': 23847, 'val': 2948, 'test': 2978}


In [8]:
# ---------- Save CSData ----------
csdata = cs.CSData.csdata_from_arrow(
    arrow_dataset=arrow_ds,
    vocabulary=vocab,
    save_dir=str(CSDATA_DIR),
    save_name='dataset_arrow',
    dataset_backend='arrow',
)
print(csdata)

Saving the dataset (1/1 shards): 100%|██████████| 29773/29773 [00:00<00:00, 55301.64 examples/s]

CSData Object; Path=runs/2026-02-26_13-42-01/csdata/dataset_arrow, Format=arrow





In [9]:
# ---------- Init model ----------
csmodel = cs.CSModel(
    model_name_or_path=BASE_MODEL,
    save_dir=str(MODEL_DIR),
    save_name='finetuned_cell_type_prediction',
)
print('base model:', BASE_MODEL)

Using device: cpu


Loading weights: 100%|██████████| 292/292 [00:00<00:00, 362.74it/s, Materializing param=gpt_neox.layers.23.post_attention_layernorm.weight] 
Writing model shards: 100%|██████████| 1/1 [00:04<00:00,  4.14s/it]

base model: vandijklab/C2S-Pythia-410m-cell-type-prediction





In [12]:
# ---------- TrainingArguments ----------
HF_OUTPUT_DIR = MODEL_DIR / 'hf_trainer_output'
HF_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

train_args = TrainingArguments(
    output_dir=str(HF_OUTPUT_DIR),
    bf16=True,
    fp16=False,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    num_train_epochs=5,
    warmup_steps=0.05,
    lr_scheduler_type='cosine',
    logging_steps=50,
    eval_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to='none',
    use_cpu=True
)
train_args

TrainingArguments(
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=True,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=False,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
enable_jit_checkpoint=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=50,
eval_strategy=IntervalStrategy.STEPS,
eval_use_gather_object=False,

In [13]:
# ---------- Fine-tune ----------
csmodel.fine_tune(
    csdata=csdata,
    task=TRAINING_TASK,
    train_args=train_args,
    loss_on_response_only=False,
    top_k_genes=TOP_K_GENES,
    max_eval_samples=500,
    data_split_indices_dict={
        'train': split_indices['train'],
        'val': split_indices['val'],
        'test': split_indices.get('test', []),
    },
)
print('Fine-tuning finished.')

Reloading model from path on disk: runs/2026-02-26_13-42-01/model/finetuned_cell_type_prediction


Loading weights: 100%|██████████| 292/292 [00:00<00:00, 401.01it/s, Materializing param=gpt_neox.layers.23.post_attention_layernorm.weight] 
Map (num_proc=3): 100%|██████████| 29773/29773 [00:17<00:00, 1694.64 examples/s]


Starting training. Output directory: runs/2026-02-26_13-42-01/model/hf_trainer_output
Selecting 500 samples of eval dataset to shorten validation loop.


TypeError: Trainer.__init__() got an unexpected keyword argument 'tokenizer'

In [None]:
# ---------- Save run metadata for Notebook 2 ----------
run_info = {
    'h5ad_path': str(H5AD_PATH),
    'base_model': BASE_MODEL,
    'training_task': TRAINING_TASK,
    'top_k_genes': TOP_K_GENES,
    'run_dir': str(RUN_DIR.resolve()),
    'csdata_dir': str(CSDATA_DIR.resolve()),
    'model_dir': str(MODEL_DIR.resolve()),
    'finetuned_model_path': str((MODEL_DIR / 'finetuned_cell_type_prediction').resolve()),
    'split_indices_path': str(split_path.resolve()),
}

run_info_path = RUN_DIR / 'run_info.json'
with run_info_path.open('w') as f:
    json.dump(run_info, f, indent=2)

print('saved:', run_info_path.resolve())
run_info