In [2]:
import numpy as np
import scanpy as sc
from pathlib import Path
import torch

from distilled_tx1.preprocessing.pipeline import TahoePreprocessor, PreprocessingConfig
from distilled_tx1.models.modeling_distilled_tahoe import DistilledTahoeModel, DistilledTahoeConfig
from distilled_tx1.training.distillation import train_distilled_model
from distilled_tx1.data.load_h5ad_folder import load_h5ad_folder_lazy

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Load a single pre-processed chunk of the dataset into memory as an AnnData object.
chunk_path = "/data/scClassificationDatasets/data_yuto/processed_tahoe_x1/data_yuto_with_clusters_chunk_001.h5ad"
adata = sc.read_h5ad(chunk_path)

In [None]:
# Tokenization settings handed to the Tahoe preprocessor.
# NOTE(review): normalize=False is passed together with normalization_method and
# target_sum; presumably those two are ignored when normalization is off — confirm.
# NOTE(review): gene_sampling_strategy="random" is used and no seed is set in the
# visible cells, so the sampled genes may differ between runs — confirm.
config = PreprocessingConfig(
    gene_id_key="gene_id",  # or None to use var_names
    add_cls_token=True,
    gene_sampling_strategy="random",
    seq_len=2048,
    n_bins=51,
    normalize=False,
    normalization_method="log1p",
    target_sum=1e4,
)

# Build the preprocessor from the config, the model-size tag and the vocabulary file.
preprocessor = TahoePreprocessor(
    tahoe_model_size="70m",
    vocab_path="/data/scClassificationDatasets/data_yuto/vocab.json",
    config=config,
)

# Run the loaded AnnData through the preprocessor; return_dict=True yields a
# mapping that later cells index by key ("gene_ids", "expression_bins", "attention_mask").
processed = preprocessor.process_adata(adata, return_dict=True)

Gene vocabulary matching:
  Total genes in data: 36591
  Genes in vocabulary: 36391
  Coverage: 99.5%


: 

In [None]:
# Convert the three preprocessor outputs to NumPy arrays in one pass.
# Note the key for the mask is singular ("attention_mask") while the local name is plural.
gene_ids, expression_bins, attention_masks = (
    processed[key].numpy()
    for key in ("gene_ids", "expression_bins", "attention_mask")
)

# Quick sanity check on the tokenized output dimensions.
print(f"Tokenized data shape: {gene_ids.shape}")

In [None]:
# Persist the configured preprocessor under a path relative to the notebook's
# working directory. NOTE(review): the on-disk layout written by save() is not
# visible here — confirm before relying on specific file names.
preprocessor_dir = "./model_outputs/preprocessor"
preprocessor.save(preprocessor_dir)