# Extracting Embeddings with scConcept

This tutorial demonstrates how to extract embeddings from single-cell RNA-seq data using scConcept.


In [None]:
import os
from pathlib import Path
import scanpy as sc
from concept import scConcept

The directory where the pre-trained model and the sample dataset will be downloaded:

In [None]:
# Local cache folder used by the rest of the tutorial; created up front
# (including any missing parents) and left untouched if it already exists.
cache_dir = Path("./cache/")
cache_dir.mkdir(parents=True, exist_ok=True)

Download a sample dataset:

In [None]:
# Location of the sample dataset inside the cache directory and its source URL.
filename = cache_dir / "cite_gex_processed_training.h5ad"
url = "https://openproblems-bio.s3.amazonaws.com/public/explore/cite/cite_gex_processed_training.h5ad"

if filename.exists():
    print(f"{filename} already exists, skipping download.")
else:
    import urllib.request

    print(f"Downloading {filename} ...")
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer never leaves a truncated file that a later run
    # would mistake for a complete download.
    partial = filename.with_name(filename.name + ".part")
    urllib.request.urlretrieve(url, partial)
    os.replace(partial, filename)

# Load the dataset and print its summary.
adata = sc.read(filename)
print(adata)

Load a pre-trained scConcept model:


In [None]:
# Name of the pre-trained model to load.
model_name = "Corpus-30M"

# Create the scConcept wrapper pointing at the cache directory, then load
# the configuration and weights for the chosen model.
concept = scConcept(cache_dir=cache_dir)
concept.load_config_and_model(model_name=model_name)

Extract embeddings: \
Specify the name of the column in `adata.var` that contains the gene IDs in Ensembl format (ENSGXXXXXXXXX).

In [None]:
# Run the model over the dataset; `gene_id_column` names the `adata.var`
# column holding the gene identifiers.
result = concept.extract_embeddings(adata=adata, batch_size=32, gene_id_column="gene_ids")

# Report the shape of each returned embedding matrix.
for label, key in (("CLS", "cls_cell_emb"), ("Mean", "mean_cell_emb")):
    print(f"{label} embeddings: {result[key].shape}")


Compute UMAP on the embeddings:

In [None]:
# `use_rep` is looked up in `adata.obsm`, so the embeddings must be stored
# there before the neighbor graph can be built.
adata.obsm['X_scConcept'] = result['cls_cell_emb']

# Build the neighbor graph on the scConcept embeddings, compute the UMAP
# layout from that graph, then plot it colored by cell type.
sc.pp.neighbors(adata, use_rep='X_scConcept')
sc.tl.umap(adata)
sc.pl.umap(adata, color='cell_type')

Add embeddings to AnnData object for downstream analysis:

In [None]:
# Keep the CLS cell embeddings on the AnnData object under the
# 'X_scConcept' key of `adata.obsm` so later steps can reference them.
adata.obsm['X_scConcept'] = result['cls_cell_emb']