In [10]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad

In [3]:
# Switch to the project directory so the relative data paths used below resolve
work_dir = "/users/yourpath"
os.chdir(work_dir)

### 1. Filter highly variable genes

In [4]:
## Read the demo dataset from disk into an AnnData object
input_h5ad = "./data_demo.h5ad"
adata = ad.read_h5ad(input_h5ad)

In [None]:
## Library-size normalization: rescale every cell to 10,000 total counts
sc.pp.normalize_total(adata, target_sum=1e4)

## Log-transform the normalized values in place (log1p)
sc.pp.log1p(adata)

## Flag highly variable genes. n_top_genes=5000 is chosen explicitly here
## (it is not scanpy's default).
batch_key = "batch"  # name of the batch column handed to the HVG selection
sc.pp.highly_variable_genes(
    adata,
    n_top_genes=5000,
    flavor="cell_ranger",
    batch_key=batch_key,
    layer=None,
)

In [7]:
## Keep the full all-genes matrix in `.raw` before subsetting.
## Note: by this point `adata.X` has been normalized and log1p-transformed,
## so `.raw` holds log-normalized values, not raw counts.
adata.raw = adata

## Restrict the object to the highly variable genes.
## `.copy()` materialises the subset: plain slicing yields a view, and the
## in-place steps that follow would otherwise have to convert that view into
## a real object implicitly.
adata = adata[:, adata.var.highly_variable].copy()

### 2. UMAP computation

In [None]:
## Scale the expression matrix, clipping scaled values at 10
sc.pp.scale(adata, max_value=10)

## Reduce dimensionality with PCA (ARPACK solver)
sc.tl.pca(adata, svd_solver="arpack")

## Build the nearest-neighbor graph (10 neighbors, 50 principal components)
sc.pp.neighbors(
    adata,
    n_neighbors=10,
    n_pcs=50,
)

In [None]:
## Run Harmony on the PCA embedding, using 'sample_id' as the batch variable
sc.external.pp.harmony_integrate(adata, key="sample_id")

## Overwrite the PCA slot with the Harmony-adjusted embedding so that later
## steps reading 'X_pca' work on the corrected coordinates
corrected_pcs = adata.obsm["X_pca_harmony"]
adata.obsm["X_pca"] = corrected_pcs

In [None]:
## Rebuild the neighbor graph (10 neighbors, first 30 components),
## then compute the UMAP embedding from it
sc.pp.neighbors(
    adata,
    n_neighbors=10,
    n_pcs=30,
)
sc.tl.umap(adata)

### 3. Cell annotation (example: Automatic annotation by CellTypist)

In [None]:
import celltypist
from celltypist import models

In [None]:
## Read the original demo file again into a separate object used for annotation
celltypist_input = "./data_demo.h5ad"
adatas = ad.read_h5ad(celltypist_input)

In [None]:
## Rescale each cell to 10,000 total counts, then apply log1p
sc.pp.normalize_total(adatas, target_sum=1e4)
sc.pp.log1p(adatas)

## Sanity check shown as the cell output: undo the log1p transform and sum
## per cell; given the normalization above each row should total about 1e4.
## NOTE(review): `.expm1()` is called on `adatas.X` directly, so this
## presumably expects a sparse matrix — confirm for dense input.
back_transformed = adatas.X.expm1()
back_transformed.sum(axis=1)

In [None]:
## Load the pretrained CellTypist model once
model = models.Model.load(model="Healthy_COVID19_PBMC.pkl")

## Annotate the dataset with the already-loaded Model object (passing the
## file name instead would make CellTypist load the same model a second time).
## majority_voting=True refines the per-cell predictions.
predictions = celltypist.annotate(adatas, model=model, majority_voting=True)

In [None]:
## Convert the CellTypist result into an AnnData object carrying the
## prediction columns in `.obs`
adatas = predictions.to_adata()

## Copy the annotation columns from the CellTypist output (`adatas`) into the
## processed object (`adata`), which is the one saved afterwards.
## The assignment aligns on the cell index (obs_names) of the two objects.
for label_column in ("predicted_labels", "over_clustering", "majority_voting"):
    adata.obs[label_column] = adatas.obs[label_column]

In [None]:
## Write the processed AnnData object to an .h5ad file
output_h5ad = "./data_demo_anno.h5ad"
adata.write(output_h5ad)