In [1]:
from mmllm.module import InstructCell
import anndata
import numpy as np
from utils import unify_gene_features

  from .autonotebook import tqdm as notebook_tqdm


# Cell type annotation

This example demonstrates the cell type annotation task using the **InstructCell** model. The objective is to annotate a single cell based on its gene expression profile and metadata, such as species, tissue, and sequencing method.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata from the dataset (e.g., species, tissue, sequencing method) is extracted for annotation.
- **Prompt**: The prompt dynamically integrates metadata and the gene expression profile to guide the model.
- **Output**: The model predicts annotations for the single cell.

Below is the implementation:

In [2]:
# Load the pre-trained InstructCell-chat model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-chat")

# Read the demo dataset (H5AD) and the gene vocabulary (.npy), then pass both
# through unify_gene_features.
adata = anndata.read_h5ad("./exp_log/demo_data/cell_type_annotation/He-2020-Liver/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random; keep its expression row (converted with .toarray())
# and its obs row as a plain dict.
k = np.random.randint(0, len(adata))
selected_cell = adata[k, :]
gene_counts = selected_cell.X.toarray()
sc_metadata = selected_cell.obs.iloc[0].to_dict()

# Prompt template: the {…} placeholders stand for the metadata fields and the
# gene expression profile ({input}).
prompt = "Can you help me annotate this single cell from a {species}?  It was sequenced using {sequencing_method} and is derived from {tissue}. The gene expression profile is {input}. Thanks!"

# Generation settings forwarded to model.predict.
generation_kwargs = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)

# Run the model and print every output that is present.
predictions = model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    **generation_kwargs,
)
for key, value in predictions.items():
    if value is None:
        continue
    print(f"{key}: \n{value}")

text: 
The cell you provided has been annotated as T Cell based on the provided sequencing method, species, and tissue origin.


In [3]:
# Load the pre-trained InstructCell-instruct model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-instruct")

# Read the demo dataset (H5AD) and the gene vocabulary (.npy), then pass both
# through unify_gene_features.
adata = anndata.read_h5ad("./exp_log/demo_data/cell_type_annotation/He-2020-Liver/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random; keep its expression row (converted with .toarray())
# and its obs row as a plain dict.
k = np.random.randint(0, len(adata))
selected_cell = adata[k, :]
gene_counts = selected_cell.X.toarray()
sc_metadata = selected_cell.obs.iloc[0].to_dict()

# Prompt template: the {…} placeholders stand for the metadata fields and the
# gene expression profile ({input}).
prompt = "Can you help me annotate this single cell from a {species}? It was sequenced using {sequencing_method} and is derived from {tissue}. The gene expression profile is {input}. Thanks!"

# Generation settings forwarded to model.predict.
generation_kwargs = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)

# Run the model and print every output that is present.
predictions = model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    **generation_kwargs,
)
for key, value in predictions.items():
    if value is None:
        continue
    print(f"{key}: \n{value}")

text: 
T Cell


# Conditional pseudo-cell generation

This example demonstrates the conditional pseudo-cell generation task using the **InstructCell** model. The goal is to generate a synthetic gene expression profile based on provided metadata, such as cell type, species, tissue, and sequencing method.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata from the dataset (e.g., cell type, species, tissue, sequencing method) is used to define the prompt.
- **Prompt**: The prompt dynamically incorporates metadata to guide the model in generating a conditional pseudo-cell profile.
- **Output**: The generated gene profile is ranked and displayed, highlighting the top 20 genes with the highest expression levels.

Below is the implementation:

In [4]:
# Load the pre-trained InstructCell-chat model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-chat")

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample; only its metadata is extracted here
k = np.random.randint(0, len(adata))
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata
prompt = (
    "Hi, I lack the expertise to generate the profiles myself. Could you please generate a single-cell gene profile for a {species}, specifically a {cell_type} from {tissue}, using the {sequencing_method} method?"
)

# Use the model to generate predictions
for key, value in model.predict(
    prompt,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    # Skip absent outputs first: the ranking below negates `value`, which would
    # raise a TypeError on None before any later None check could run.
    if value is None:
        continue
    if not isinstance(value, str):
        # Non-string output: list the 20 entries with the largest values,
        # each labelled with the gene_vocab entry at the same index.
        value = '\n'.join(
            f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[:20]
        )
    print(f"{key}: \n{value}")

  utils.warn_names_duplicates("var")


text: 
Sure, I can help with that. Here is a single-cell gene profile for a human, specifically a CD8+ Cytotoxic T from peripheral blood, using the 10xGenomics (GemCode Technology Platform) method:
cell: 
MALAT1: 146
RPL13A: 30
CCL5: 28
RPS2: 26
RPL10: 25
RPS6: 24
RPL3: 23
RPL19: 21
B2M: 21
RPS3: 20
RPS4X: 20
RPL13: 19
HLA-A: 19
RPS3A: 19
RPS9: 18
RPS19: 17
RPS27A: 17
RPL14: 15
RPLP1: 15
ACTB: 14


In [5]:
# Load the pre-trained InstructCell-instruct model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-instruct")

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample; only its metadata is extracted here
k = np.random.randint(0, len(adata))
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata
prompt = (
    "Hi, I lack the expertise to generate the profiles myself. Could you please generate a single-cell gene profile for a {species}, specifically a {cell_type} from {tissue}, using the {sequencing_method} method?"
)

# Use the model to generate predictions
for key, value in model.predict(
    prompt,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    # This cell prints only non-string outputs (the generated profile); string
    # outputs are not printed. None is skipped up front because negating it in
    # the ranking below would raise a TypeError.
    if value is None or isinstance(value, str):
        continue
    # List the 20 entries with the largest values, each labelled with the
    # gene_vocab entry at the same index.
    top_genes = '\n'.join(
        f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[:20]
    )
    print(f"{key}: \n{top_genes}")

  utils.warn_names_duplicates("var")


cell: 
FTL: 53
RPL13: 39
S100A9: 33
RPS2: 29
RPL11: 28
RPLP1: 25
RPS19: 22
RPL10: 22
TMSB4X: 22
RPL13A: 22
HLA-B: 16
RPL19: 15
FTH1: 13
RPS9: 12
RPL26: 11
RPL12: 11
RPS3: 11
RPS12: 11
TMSB10: 10
RPS6: 10


You can also write the exact values directly into the prompt instead of using placeholders, as shown in this example:

In [6]:
# Load the pre-trained InstructCell-chat model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-chat")

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its metadata.
# NOTE(review): the prompt below contains no placeholders, yet this metadata is
# still passed to predict — confirm whether predict needs it in that case.
k = np.random.randint(0, len(adata))
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with the concrete values written out
prompt = (
    "Hi, I lack the expertise to generate the profiles myself. Could you please generate a single-cell gene profile for a human, specifically a CD56+ NK from peripheral blood, using the 10xGenomics (GemCode Technology Platform) method?"
)

# Use the model to generate predictions
for key, value in model.predict(
    prompt,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    # This cell prints only non-string outputs (the generated profile); string
    # outputs are not printed. None is skipped up front because negating it in
    # the ranking below would raise a TypeError.
    if value is None or isinstance(value, str):
        continue
    # List the 20 entries with the largest values, each labelled with the
    # gene_vocab entry at the same index.
    top_genes = '\n'.join(
        f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[:20]
    )
    print(f"{key}: \n{top_genes}")

  utils.warn_names_duplicates("var")


cell: 
TMSB4X: 39
MALAT1: 35
B2M: 35
ACTG1: 20
RPL13A: 20
RPS2: 20
RPL3: 18
NKG7: 16
MT-CO1: 16
HLA-B: 15
RPS19: 14
HLA-A: 13
RPS6: 13
RPL10: 12
RPL15: 11
RPL19: 11
CD7: 11
RPL18: 10
RPL13: 10
HLA-C: 10


In [7]:
# Load the pre-trained InstructCell-instruct model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-instruct")

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its metadata.
# NOTE(review): the prompt below contains no placeholders, yet this metadata is
# still passed to predict — confirm whether predict needs it in that case.
k = np.random.randint(0, len(adata))
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with the concrete values written out
prompt = (
    "Hi, I lack the expertise to generate the profiles myself. Could you please generate a single-cell gene profile for a human, specifically a CD56+ NK from peripheral blood, using the 10xGenomics (GemCode Technology Platform) method?"
)

# Use the model to generate predictions
for key, value in model.predict(
    prompt,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    # This cell prints only non-string outputs (the generated profile); string
    # outputs are not printed. None is skipped up front because negating it in
    # the ranking below would raise a TypeError.
    if value is None or isinstance(value, str):
        continue
    # List the 20 entries with the largest values, each labelled with the
    # gene_vocab entry at the same index.
    top_genes = '\n'.join(
        f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[:20]
    )
    print(f"{key}: \n{top_genes}")

  utils.warn_names_duplicates("var")


cell: 
B2M: 23
MALAT1: 17
GNLY: 17
RPL13A: 12
RPL13: 11
RPS3A: 10
TMSB4X: 10
RPS4X: 10
ACTB: 10
RPL14: 9
RPS5: 8
RPL18A: 8
NKG7: 8
RPL11: 8
MT-CO1: 7
TMSB10: 7
RPS6: 7
RPS3: 7
RPL19: 6
RPS19: 6


# Drug sensitivity prediction

This example demonstrates the drug sensitivity prediction task using the **InstructCell** model. The goal is to predict how a single cell responds to a specific drug based on its gene expression profile and associated metadata.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata such as species, tissue, drug, and sequencing method are dynamically incorporated into the prompt.
- **Prompt**: The prompt guides the model to predict the drug sensitivity of a single cell by providing the gene expression profile and metadata.
- **Output**: The model predicts the single cell's response to a specific drug.

Below is the implementation:

In [8]:
# Load the pre-trained InstructCell-chat model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-chat")

# Read the demo dataset (H5AD) and the gene vocabulary (.npy), then pass both
# through unify_gene_features.
adata = anndata.read_h5ad("./exp_log/demo_data/drug_sensitivity_prediction/GSE110894/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random; keep its expression row (converted with .toarray())
# and its obs row as a plain dict.
k = np.random.randint(0, len(adata))
selected_cell = adata[k, :]
gene_counts = selected_cell.X.toarray()
sc_metadata = selected_cell.obs.iloc[0].to_dict()

# Prompt template: the {…} placeholders stand for the metadata fields and the
# gene expression profile ({input}).
prompt = "Given {sequencing_method}, can you predict the response of the single cell {input} from {species} when exposed to {drug} in {tissue}?"

# Generation settings forwarded to model.predict.
generation_kwargs = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)

# Run the model and print every output that is present.
predictions = model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    **generation_kwargs,
)
for key, value in predictions.items():
    if value is None:
        continue
    print(f"{key}: \n{value}")

text: 
Using Cel-Seq2 on single cells from mouse in bone marrow, the predicted response to BET inhibitor (I-BET-762) is Sensitive.


In [9]:
# Load the pre-trained InstructCell-instruct model.
# NOTE(review): the argument is an absolute filesystem-style path, presumably a
# local copy of the checkpoint — adjust it for your environment.
model = InstructCell.from_pretrained("/data2/fangyin/model/InstructCell-instruct")

# Read the demo dataset (H5AD) and the gene vocabulary (.npy), then pass both
# through unify_gene_features.
adata = anndata.read_h5ad("./exp_log/demo_data/drug_sensitivity_prediction/GSE110894/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random; keep its expression row (converted with .toarray())
# and its obs row as a plain dict.
k = np.random.randint(0, len(adata))
selected_cell = adata[k, :]
gene_counts = selected_cell.X.toarray()
sc_metadata = selected_cell.obs.iloc[0].to_dict()

# Prompt template: the {…} placeholders stand for the metadata fields and the
# gene expression profile ({input}).
prompt = "Given {sequencing_method}, can you predict the response of the single cell {input} from {species} when exposed to {drug} in {tissue}?"

# Generation settings forwarded to model.predict.
generation_kwargs = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)

# Run the model and print every output that is present.
predictions = model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    **generation_kwargs,
)
for key, value in predictions.items():
    if value is None:
        continue
    print(f"{key}: \n{value}")

text: 
Sensitive
