In [None]:
from mmllm.module import InstructCell
import anndata
import numpy as np
from utils import unify_gene_features

  from .autonotebook import tqdm as notebook_tqdm


# Cell type annotation

This example demonstrates the cell type annotation task using the **InstructCell** model. The objective is to annotate a single cell based on its gene expression profile and metadata, such as species, tissue, and sequencing method.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata from the dataset (e.g., species, tissue, sequencing method) is extracted for annotation.
- **Prompt**: The prompt dynamically integrates metadata and the gene expression profile to guide the model.
- **Output**: The model predicts annotations for the single cell.

Below is the implementation:

In [None]:
# Load the pre-trained InstructCell-chat model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat")

# Load the single-cell data (H5AD format) and the gene vocabulary (numpy format),
# then run the dataset through unify_gene_features together with that vocabulary
# (presumably this aligns the dataset's genes to the vocabulary -- confirm in utils)
adata = anndata.read_h5ad("./exp_log/demo_data/cell_type_annotation/He-2020-Liver/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random (unseeded, so a different cell may be chosen on every run)
# and slice it out once so the counts and the metadata come from the same view
k = np.random.randint(0, len(adata))
cell = adata[k, :]

# AnnData can hold X either as a SciPy sparse matrix or as a dense array, and only
# the sparse form provides .toarray(); densify conditionally so both layouts work
expression = cell.X
gene_counts = expression.toarray() if hasattr(expression, "toarray") else np.asarray(expression)
sc_metadata = cell.obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Can you help me annotate this single cell from a {species}? "
    "It was sequenced using {sequencing_method} and is derived from {tissue}. "
    "The gene expression profile is {input}. Thanks!"
)

# Run the model with sampling enabled and print every output entry that is not None
for key, value in model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:
        print(f"{key}: {value}")

text: The cell you provided has been annotated as Monocyte based on the provided sequencing method, species, and tissue origin.


In [None]:
# Load the pre-trained InstructCell-instruct model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct")

# Load the single-cell data (H5AD format) and the gene vocabulary (numpy format),
# then run the dataset through unify_gene_features together with that vocabulary
# (presumably this aligns the dataset's genes to the vocabulary -- confirm in utils)
adata = anndata.read_h5ad("./exp_log/demo_data/cell_type_annotation/He-2020-Liver/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random (unseeded, so a different cell may be chosen on every run)
# and slice it out once so the counts and the metadata come from the same view
k = np.random.randint(0, len(adata))
cell = adata[k, :]

# AnnData can hold X either as a SciPy sparse matrix or as a dense array, and only
# the sparse form provides .toarray(); densify conditionally so both layouts work
expression = cell.X
gene_counts = expression.toarray() if hasattr(expression, "toarray") else np.asarray(expression)
sc_metadata = cell.obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Can you help me annotate this single cell from a {species}? "
    "It was sequenced using {sequencing_method} and is derived from {tissue}. "
    "The gene expression profile is {input}. Thanks!"
)

# Run the model with sampling enabled and print every output entry that is not None
for key, value in model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:
        print(f"{key}: {value}")

text: Macrophage


# Conditional pseudo-cell generation

This example demonstrates the conditional pseudo-cell generation task using the **InstructCell** model. The goal is to generate a synthetic gene expression profile based on provided metadata, such as cell type, species, tissue, and sequencing method.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata from the dataset (e.g., cell type, species, tissue, sequencing method) is used to define the prompt.
- **Prompt**: The prompt dynamically incorporates metadata to guide the model in generating a conditional pseudo-cell profile.
- **Output**: The generated gene profile is ranked and displayed, highlighting the top 20 genes with the highest expression levels.

Below is the implementation:

In [4]:
# Fetch the pre-trained InstructCell-chat checkpoint from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat")

# Read the H5AD dataset and the numpy gene vocabulary, then run the dataset
# through unify_gene_features together with that vocabulary
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Draw one cell at random and keep only its metadata: no expression profile
# is passed to the model in this generation example
k = np.random.randint(0, adata.n_obs)
cell = adata[k, :]
sc_metadata = cell.obs.iloc[0].to_dict()

# Prompt template; the {…} placeholders name the metadata fields to insert
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: "
    "cell type is {cell_type}, species is {species}, tissue is {tissue}, and sequencing method is {sequencing_method}?"
)

# Sampling settings forwarded unchanged to model.predict
sampling_options = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)
outputs = model.predict(prompt, sc_metadata=sc_metadata, **sampling_options)

for key, value in outputs.items():
    if not isinstance(value, str):
        # Non-string output: list the 20 largest entries in descending order,
        # labelling each with the gene_vocab entry at the same index
        top_genes = np.argsort(-value)[:20]
        lines = [f"{gene_vocab[idx]}: {int(value[idx])}" for idx in top_genes]
        value = '\n'.join(lines)
    print(f"{key}:\n{value}")

text:
Sure, here is a single-cell gene profile based on your specifications: - Cell Type: CD8+/CD45RA+ Naive Cytotoxic - Species: human - Tissue: peripheral blood - Sequencing Method: 10xGenomics (GemCode Technology Platform) The generated profile is as follows:
cell:
RPL10: 57
MALAT1: 56
RPS6: 48
RPL11: 45
RPS18: 39
RPS2: 37
RPL13A: 32
RPL13: 31
RPL3: 31
RPS4X: 30
RPL7: 28
RPLP1: 26
RPS19: 25
RPL26: 25
RPS27: 24
TMSB4X: 24
RPS12: 23
RPL12: 22
B2M: 21
RPL21: 21


In [None]:
# Fetch the pre-trained InstructCell-instruct checkpoint from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct")

# Read the H5AD dataset and the numpy gene vocabulary, then run the dataset
# through unify_gene_features together with that vocabulary
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Draw one cell at random and keep only its metadata: no expression profile
# is passed to the model in this generation example
k = np.random.randint(0, adata.n_obs)
cell = adata[k, :]
sc_metadata = cell.obs.iloc[0].to_dict()

# Prompt template; the {…} placeholders name the metadata fields to insert
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: "
    "cell type is {cell_type}, species is {species}, tissue is {tissue}, and sequencing method is {sequencing_method}? "
)

# Sampling settings forwarded unchanged to model.predict
sampling_options = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)
outputs = model.predict(prompt, sc_metadata=sc_metadata, **sampling_options)

for key, value in outputs.items():
    if not isinstance(value, str):
        # Non-string output: list the 20 largest entries in descending order,
        # labelling each with the gene_vocab entry at the same index
        top_genes = np.argsort(-value)[:20]
        lines = [f"{gene_vocab[idx]}: {int(value[idx])}" for idx in top_genes]
        value = '\n'.join(lines)
    print(f"{key}:\n{value}")

text:

cell:
RPL13: 44
MALAT1: 34
RPL10: 27
RPS2: 24
RPS6: 23
RPL18A: 21
TMSB4X: 17
RPL13A: 15
RPS3A: 15
RPL34: 14
RPS27A: 14
RPS19: 13
RPL19: 12
RPL3: 12
RPS25: 10
RPS9: 10
RPL21: 10
RPL7A: 10
RPS5: 9
RPL15: 9


We also support directly specifying and inputting the exact values for each placeholder in the prompt, as shown in this example:

In [None]:
# Fetch the pre-trained InstructCell-chat checkpoint from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat")

# Read the H5AD dataset and the numpy gene vocabulary, then run the dataset
# through unify_gene_features together with that vocabulary
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Draw one cell at random and collect its metadata as a plain dict
k = np.random.randint(0, adata.n_obs)
cell = adata[k, :]
sc_metadata = cell.obs.iloc[0].to_dict()

# Prompt with every value written out literally (no {…} placeholders)
# NOTE(review): sc_metadata is still passed below even though this prompt has
# nothing to fill in -- confirm whether InstructCell.predict needs it here
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: "
    "cell type is CD56+ NK, species is human, tissue is peripheral blood, and sequencing method is 10xGenomics (GemCode Technology Platform)? "
)

# Sampling settings forwarded unchanged to model.predict
sampling_options = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)
outputs = model.predict(prompt, sc_metadata=sc_metadata, **sampling_options)

for key, value in outputs.items():
    if not isinstance(value, str):
        # Non-string output: list the 20 largest entries in descending order,
        # labelling each with the gene_vocab entry at the same index
        top_genes = np.argsort(-value)[:20]
        lines = [f"{gene_vocab[idx]}: {int(value[idx])}" for idx in top_genes]
        value = '\n'.join(lines)
    print(f"{key}:\n{value}")

text:
Sure, here is a single-cell gene profile based on your specifications: - Cell Type: CD56+ NK - Species: human - Tissue: peripheral blood - Sequencing Method: 10xGenomics (GemCode Technology Platform) The gene expression data aligns with the characteristics of these specific cells.
cell:
MALAT1: 47
B2M: 35
GNLY: 33
HLA-A: 25
NKG7: 23
RPL13: 22
TMSB4X: 18
RPL13A: 17
RPS3: 16
RPS18: 16
S100A4: 16
RPS6: 15
MT-CO1: 15
RPL10: 14
RPLP1: 13
RPS2: 13
HLA-B: 12
RPL15: 12
FTL: 11
GZMB: 11


In [None]:
# Fetch the pre-trained InstructCell-instruct checkpoint from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct")

# Read the H5AD dataset and the numpy gene vocabulary, then run the dataset
# through unify_gene_features together with that vocabulary
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Draw one cell at random and collect its metadata as a plain dict
k = np.random.randint(0, adata.n_obs)
cell = adata[k, :]
sc_metadata = cell.obs.iloc[0].to_dict()

# Prompt with every value written out literally (no {…} placeholders)
# NOTE(review): sc_metadata is still passed below even though this prompt has
# nothing to fill in -- confirm whether InstructCell.predict needs it here
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: "
    "cell type is CD56+ NK, species is human, tissue is peripheral blood, and sequencing method is 10xGenomics (GemCode Technology Platform)? "
)

# Sampling settings forwarded unchanged to model.predict
sampling_options = dict(do_sample=True, top_p=0.95, top_k=50, max_new_tokens=256)
outputs = model.predict(prompt, sc_metadata=sc_metadata, **sampling_options)

for key, value in outputs.items():
    if not isinstance(value, str):
        # Non-string output: list the 20 largest entries in descending order,
        # labelling each with the gene_vocab entry at the same index
        top_genes = np.argsort(-value)[:20]
        lines = [f"{gene_vocab[idx]}: {int(value[idx])}" for idx in top_genes]
        value = '\n'.join(lines)
    print(f"{key}:\n{value}")

text:

cell:
MALAT1: 58
B2M: 52
TMSB4X: 32
PFN1: 30
RPL10: 19
ACTG1: 19
CCL5: 16
HLA-B: 16
GZMH: 14
ACTB: 14
HLA-C: 12
RPL3: 12
NKG7: 11
RPS3: 10
HLA-A: 10
LGALS1: 9
RPS2: 9
RPL18: 9
RPL13A: 9
MT-CO2: 8


# Drug sensitivity prediction

This example demonstrates the drug sensitivity prediction task using the **InstructCell** model. The goal is to predict how a single cell responds to a specific drug based on its gene expression profile and associated metadata.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata such as species, tissue, drug, and sequencing method are dynamically incorporated into the prompt.
- **Prompt**: The prompt guides the model to predict drug sensitivity by providing the gene expression profile and metadata.
- **Output**: The model predicts the drug response.

Below is the implementation:

In [8]:
# Load the pre-trained InstructCell-chat model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat")

# Load the single-cell data (H5AD format) and the gene vocabulary (numpy format),
# then run the dataset through unify_gene_features together with that vocabulary
# (presumably this aligns the dataset's genes to the vocabulary -- confirm in utils)
adata = anndata.read_h5ad("./exp_log/demo_data/drug_sensitivity_prediction/GSE110894/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random (unseeded, so a different cell may be chosen on every run)
# and slice it out once so the counts and the metadata come from the same view
k = np.random.randint(0, len(adata))
cell = adata[k, :]

# AnnData can hold X either as a SciPy sparse matrix or as a dense array, and only
# the sparse form provides .toarray(); densify conditionally so both layouts work
expression = cell.X
gene_counts = expression.toarray() if hasattr(expression, "toarray") else np.asarray(expression)
sc_metadata = cell.obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Given {sequencing_method}, can you predict the response of the single cell {input} from {species} when exposed to {drug} in {tissue}?"
)

# Run the model with sampling enabled and print every output entry that is not None
for key, value in model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:
        print(f"{key}: {value}")

text: The single cell from mouse when exposed to BET inhibitor (I-BET-762) in bone marrow is predicted to have Sensitive response to BET inhibitor (I-BET-762).


In [None]:
# Load the pre-trained InstructCell-instruct model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct")

# Load the single-cell data (H5AD format) and the gene vocabulary (numpy format),
# then run the dataset through unify_gene_features together with that vocabulary
# (presumably this aligns the dataset's genes to the vocabulary -- confirm in utils)
adata = anndata.read_h5ad("./exp_log/demo_data/drug_sensitivity_prediction/GSE110894/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Pick one cell at random (unseeded, so a different cell may be chosen on every run)
# and slice it out once so the counts and the metadata come from the same view
k = np.random.randint(0, len(adata))
cell = adata[k, :]

# AnnData can hold X either as a SciPy sparse matrix or as a dense array, and only
# the sparse form provides .toarray(); densify conditionally so both layouts work
expression = cell.X
gene_counts = expression.toarray() if hasattr(expression, "toarray") else np.asarray(expression)
sc_metadata = cell.obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Given {sequencing_method}, can you predict the response of the single cell {input} from {species} when exposed to {drug} in {tissue}?"
)

# Run the model with sampling enabled and print every output entry that is not None
for key, value in model.predict(
    prompt,
    gene_counts=gene_counts,
    sc_metadata=sc_metadata,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:
        print(f"{key}: {value}")

text: Sensitive
