In [1]:
from mmllm.module import InstructCell
import anndata
import numpy as np
from utils import unify_gene_features



# Cell type annotation

This example demonstrates the cell type annotation task using the **InstructCell** model. The objective is to annotate a single cell based on its gene expression profile and metadata, such as species, tissue, and sequencing method.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata from the dataset (e.g., species, tissue, sequencing method) is extracted for annotation.
- **Prompt**: The prompt dynamically integrates metadata and the gene expression profile to guide the model.
- **Output**: The model predicts annotations for the single cell.

Below is the implementation:

In [2]:
# Load the pre-trained InstructCell-chat model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/cell_type_annotation/He-2020-Liver/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its gene counts and metadata
k = np.random.randint(0, len(adata)) 
gene_counts = adata[k, :].X.toarray()
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Can you help me annotate this single cell from a {species}?  It was sequenced using {sequencing_method} and is derived from {tissue}. The gene expression profile is {input}. Thanks!"
)

# Use the model to generate predictions
for key, value in model.predict(
    prompt, 
    gene_counts=gene_counts, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:  
        print(f"{key}: \n{value}")

text: 
Based on the provided gene expression profile, the cell from the human sequenced using HiSeq X Ten System and derived from liver has been annotated as T Cell.


In [3]:
# Load the pre-trained InstructCell-instruct model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/cell_type_annotation/He-2020-Liver/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its gene counts and metadata
k = np.random.randint(0, len(adata)) 
gene_counts = adata[k, :].X.toarray()
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Can you help me annotate this single cell from a {species}? It was sequenced using {sequencing_method} and is derived from {tissue}. The gene expression profile is {input}. Thanks!"
)

# Use the model to generate predictions
for key, value in model.predict(
    prompt, 
    gene_counts=gene_counts, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:  
        print(f"{key}: \n{value}")

text: 
Macrophage


# Conditional pseudo-cell generation

This example demonstrates the conditional pseudo-cell generation task using the **InstructCell** model. The goal is to generate a synthetic gene expression profile based on provided metadata, such as cell type, species, tissue, and sequencing method.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata from the dataset (e.g., cell type, species, tissue, sequencing method) is used to define the prompt.
- **Prompt**: The prompt dynamically incorporates metadata to guide the model in generating a conditional pseudo-cell profile.
- **Output**: The generated gene profile is ranked and displayed, highlighting the top 20 genes with the highest expression levels.

Below is the implementation:

In [4]:
# Load the pre-trained InstructCell model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its metadata
k = np.random.randint(0, len(adata)) 
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: cell type is {cell_type}, species is {species}, tissue is {tissue}, and sequencing method is {sequencing_method}?"
)


for key, value in model.predict(
    prompt, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if not isinstance(value, str): 
        value = '\n'.join(
            f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[: 20]
        )
    if value is not None:  
        print(f"{key}: \n{value}")



text: 
Sure, here is a single-cell gene profile based on your specifications: Cell Type: CD4+/CD45RA+/CD25- Naive T Species: human Tissue: peripheral blood Sequencing Method: 10xGenomics (GemCode Technology Platform)
cell: 
RPS2: 62
MALAT1: 52
RPL10: 50
RPS18: 37
RPL13A: 33
RPL13: 29
RPS4X: 26
RPS12: 24
RPS19: 21
RPS27: 21
RPS9: 20
TPT1: 20
RPL19: 19
RPL11: 18
RPS3: 18
TMSB4X: 18
RPL15: 17
RPL21: 17
ACTB: 17
B2M: 16


In [5]:
# Load the pre-trained InstructCell-instruct model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its metadata
k = np.random.randint(0, len(adata)) 
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: cell type is {cell_type}, species is {species}, tissue is {tissue}, and sequencing method is {sequencing_method}?"
)


for key, value in model.predict(
    prompt, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if not isinstance(value, str): 
        value = '\n'.join(
            f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[: 20]
        )
        if value is not None:  
            print(f"{key}: \n{value}")



cell: 
B2M: 42
MALAT1: 38
RPS2: 31
RPL10: 29
RPL13A: 24
RPL3: 24
RPS19: 22
RPL7: 18
RPL13: 18
ACTB: 16
MT-CO1: 13
RPS27: 12
RPS12: 12
HLA-C: 12
RPL11: 11
EEF1D: 11
TMSB4X: 10
RPS25: 10
GNLY: 9
RPS3A: 9


We also support directly specifying and inputting the exact values for each placeholder in the prompt, as shown in this example：

In [6]:
# Load the pre-trained InstructCell-chat model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its metadata
k = np.random.randint(0, len(adata)) 
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: cell type is CD56+ NK, species is human, tissue is peripheral blood, and sequencing method is 10xGenomics (GemCode Technology Platform)?"
)

for key, value in model.predict(
    prompt, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if not isinstance(value, str): 
        value = '\n'.join(
            f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[: 20]
        )
        if value is not None:  
            print(f"{key}: \n{value}")



cell: 
MALAT1: 36
RPL13: 23
ACTB: 17
RPL10: 16
RPL13A: 13
RPL19: 13
RPS2: 13
RPS6: 12
RPL3: 11
RPL7A: 10
RPS19: 10
RPL21: 10
PFN1: 8
RPS3: 8
B2M: 8
RPL18A: 8
RPL9: 7
RPL11: 7
RPS27: 7
RPS16: 6


In [7]:
# Load the pre-trained InstructCell-instruct model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/conditional_pseudo_cell_generation/PBMC68K/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its metadata
k = np.random.randint(0, len(adata)) 
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt
prompt = (
    "Hey, can you whip up a single-cell gene profile for the given specs: cell type is CD56+ NK, species is human, tissue is peripheral blood, and sequencing method is 10xGenomics (GemCode Technology Platform)?"
)

for key, value in model.predict(
    prompt, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if not isinstance(value, str): 
        value = '\n'.join(
            f"{gene_vocab[idx]}: {int(value[idx])}" for idx in np.argsort(-value)[: 20]
        )
        if value is not None:  
            print(f"{key}: \n{value}")



cell: 
B2M: 56
MALAT1: 48
RPS2: 26
RPL10: 25
RPL13A: 21
RPS18: 19
RPL13: 19
GNLY: 16
RPL11: 14
TMSB10: 14
HLA-B: 13
RPS6: 12
NKG7: 12
RPL7: 11
TMSB4X: 11
RPS3: 11
ACTB: 10
RPL3: 10
RPL21: 9
RPL18A: 9


# Drug sensitivity prediction

This example demonstrates the drug sensitivity prediction task using the **InstructCell** model. The goal is to predict how a single cell responds to a specific drug based on its gene expression profile and associated metadata.

- **Input Data**: A single-cell dataset in **H5AD format** and a **gene vocabulary file**.
- **Metadata**: Metadata such as species, tissue, drug, and sequencing method are dynamically incorporated into the prompt.
- **Prompt**: The prompt guides the model to predict the drug sensitivity of a single cell by providing the gene expression profile and metadata.
- **Output**: The model predicts the single cell's response to a specific drug.

Below is the implementation:

In [8]:
# Load the pre-trained InstructCell-chat model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-chat") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/drug_sensitivity_prediction/GSE110894/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its gene counts and metadata
k = np.random.randint(0, len(adata)) 
gene_counts = adata[k, :].X.toarray()
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Given {sequencing_method}, can you predict the response of the single cell {input} from {species} when exposed to {drug} in {tissue}?"
)

for key, value in model.predict(
    prompt, 
    gene_counts=gene_counts, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:  
        print(f"{key}: \n{value}")

text: 
The single cell from mouse shows a Resistant response to BET inhibitor (I-BET-762) in bone marrow based on Cel-Seq2.


In [9]:
# Load the pre-trained InstructCell-instruct model from HuggingFace
model = InstructCell.from_pretrained("zjunlp/InstructCell-instruct") 

# Load the single-cell data (H5AD format) and gene vocabulary file (numpy format)
adata = anndata.read_h5ad("./exp_log/demo_data/drug_sensitivity_prediction/GSE110894/rna.h5ad")
gene_vocab = np.load("./exp_log/gene_vocab.npy")
adata = unify_gene_features(adata, gene_vocab, force_gene_symbol_uppercase=False)

# Select a random single-cell sample and extract its gene counts and metadata
k = np.random.randint(0, len(adata)) 
gene_counts = adata[k, :].X.toarray()
sc_metadata = adata[k, :].obs.iloc[0].to_dict()

# Define the model prompt with placeholders for metadata and gene expression profile
prompt = (
    "Given {sequencing_method}, can you predict the response of the single cell {input} from {species} when exposed to {drug} in {tissue}?"
)

for key, value in model.predict(
    prompt, 
    gene_counts=gene_counts, 
    sc_metadata=sc_metadata, 
    do_sample=True, 
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
).items():
    if value is not None:  
        print(f"{key}: \n{value}")

text: 
Sensitive
