In [None]:
import scanpy as sc
import celltypist
import time
import numpy as np

In [None]:
# Global scanpy plotting defaults for this notebook: 7x7 figures, 80 dpi on screen,
# 150 dpi when saved, and PDF as the file format for saved figures.
sc.set_figure_params(
    scanpy=True,
    dpi=80,
    dpi_save=150,
    frameon=True,
    vector_friendly=True,
    fontsize=14,
    figsize=[7, 7],
    color_map=None,
    format='pdf',
    facecolor=None,
    transparent=False,
)

In [None]:
import matplotlib as mpl
# Embed fonts as Type 42 (TrueType) in PDF output instead of matplotlib's default Type 3;
# this is typically done so that text in saved figures stays editable in vector-graphics editors.
mpl.rcParams['pdf.fonttype'] = 42

## Read in vivo and in vitro datasets

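CellTypist expects expression data that are normalised to 10,000 counts per cell and log1p-transformed, so both datasets are rebuilt from their `.raw` slot and then normalised and log-transformed below.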

In [None]:
# FETAL (in vivo) dataset: load the AnnData object from disk and display its dimensions.
fetal_h5ad = '/nfs/team292/vl6/FetalReproductiveTract/post_10pcw_females.20240326.h5ad'
females_post10pcw = sc.read(fetal_h5ad)
females_post10pcw.shape

In [None]:
# UMAP of the fetal dataset as loaded (before any subsetting), coloured by annotated cell type.
sc.pl.umap(females_post10pcw, color="celltype")

In [None]:
# Keep only the cells belonging to the fetal sample of interest.
# NOTE: this rebinds `females_post10pcw` to a subset of itself.
keep_samples = ['HD_F_GON12873752']
in_sample = females_post10pcw.obs['sample'].isin(keep_samples).to_numpy()
females_post10pcw = females_post10pcw[in_sample]
females_post10pcw.shape

In [None]:
# UMAP of the cells remaining after restricting to the selected sample, coloured by cell type.
sc.pl.umap(females_post10pcw, color="celltype")

In [None]:
# Restrict the reference to the three epithelial populations that serve as training labels.
epithelial_celltypes = [
    'Uterus/Cervix_Epithelium',
    'MüllerianVagina_Epithelium',
    'VaginalPlate_Epithelium',
]
is_epithelial = females_post10pcw.obs['celltype'].isin(epithelial_celltypes).to_numpy()
females_post10pcw = females_post10pcw[is_epithelial]

In [None]:
# UMAP of the cells remaining after restricting to the three epithelial cell types.
sc.pl.umap(females_post10pcw, color="celltype")

In [None]:
import anndata

In [None]:
# Build a fresh AnnData for the in vivo reference from the `.raw` slot of the filtered
# fetal object (expression matrix + gene table), keeping the per-cell annotations.
# NOTE(review): `.raw` is presumably un-normalised counts, since it is normalised below — confirm.
fetal_raw = females_post10pcw.raw
invivo = anndata.AnnData(
    X = fetal_raw.X,
    var = fetal_raw.var,
    obs = females_post10pcw.obs,
)
invivo.shape

In [None]:
# Drop genes detected in fewer than 50 cells of the in vivo reference (modifies `invivo` in place).
sc.pp.filter_genes(invivo, min_cells = 50)

In [None]:
# Snapshot the gene-filtered, not-yet-normalised matrix in `.raw` before it is normalised in place.
invivo.raw = invivo.copy()

In [None]:
# Scale every cell to a total of 10,000 counts, then apply log(1 + x); both steps modify `invivo.X` in place.
sc.pp.normalize_total(invivo, target_sum = 1e4, inplace=True)
sc.pp.log1p(invivo)

In [None]:
# (cells, genes) of the in vivo reference after gene filtering and normalisation.
invivo.shape

In [None]:
# Load the organoid dataset that will later be annotated with the in vivo model.
# NOTE(review): the original comment here read "Select sample used for deriving organoids",
# but no sample selection happens in this cell — it only reads the file and displays the object.
organoids_h5ad = '/nfs/team292/vl6/FetalReproductiveTract/fetal_organoids_18PCW.h5ad'
organoids = sc.read(organoids_h5ad)
organoids

In [None]:
# Rebuild the organoid object from its own `.raw` slot (expression matrix + gene table),
# keeping the per-cell annotations.
# NOTE: this rebinds `organoids` to an object built from itself, so the cell is not
# idempotent — reload the file before re-running it.
organoids_raw = organoids.raw
organoids = anndata.AnnData(
    X = organoids_raw.X,
    var = organoids_raw.var,
    obs = organoids.obs,
)
organoids.shape

In [None]:
# Drop genes detected in fewer than 50 organoid cells (modifies `organoids` in place).
sc.pp.filter_genes(organoids, min_cells = 50)

In [None]:
# Snapshot the gene-filtered, not-yet-normalised matrix in `.raw` before it is normalised in place.
organoids.raw = organoids.copy()

In [None]:
# Same normalisation as the in vivo reference: 10,000 counts per cell, then log(1 + x), in place.
sc.pp.normalize_total(organoids, target_sum = 1e4, inplace=True)
sc.pp.log1p(organoids)

In [None]:
# (cells, genes) of the organoid dataset after gene filtering and normalisation.
organoids.shape

## Downsample cells from each cell type to a given number

In [None]:
# Number of in vivo cells available per cell type (the training labels).
invivo.obs['celltype'].value_counts()

In [None]:
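# NOTE: downsampling is disabled in this run — every cell in `invivo` is used for training.
# The call below is kept for reference only; it refers to names (`adata_uterus`,
# `final_annotations_level_2`) that are not defined earlier in this notebook.
# sampled_cell_index = celltypist.samples.downsample_adata(adata_uterus, 
#                                                          mode = 'each', n_cells = 7000, by = 'final_annotations_level_2',
#                                                          return_index = True)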

In [None]:
# print(f"Number of downsampled cells for training: {len(sampled_cell_index)}")

A feature selection step restricts the number of genes used during training, and can improve both training efficiency and prediction accuracy. It is recommended in most cases (although CellTypist models have been shown to be robust when all genes are used).

One example of feature selection is `scanpy.pp.highly_variable_genes`. Depending on the dataset, you may need to take batch effects into account (e.g. by specifying `batch_key`), add or remove a specific list of genes (e.g. VDJ genes for immune cells), combine high-confidence features across zoomed-in compartments, or use any other approach tailored to your data.

In [None]:
# Quick, rough CellTypist fit on all genes (SGD, 5 iterations); its coefficients are
# only used to rank genes for feature selection.
# Setting `mini_batch = True` would enable mini-batch training instead.
t_start = time.time()
model_fs = celltypist.train(
    invivo,
    'celltype',
    n_jobs = 10,
    max_iter = 5,
    use_SGD = True,
)
t_end = time.time()
print(f"Time elapsed: {t_end - t_start} seconds")

This model is trained on all genes for only five epochs, and is therefore not accurate enough for cell type prediction. However, the information it carries about genes can still be used. Here, we take the top 200 genes for each cell type, ranked by the absolute value of their regression coefficients for that cell type. For datasets with only a few cell types, you may want to increase the number of top genes (for example to 300) in order to obtain a sufficient number of genes for final use.

In [None]:
# For every class (row of the coefficient matrix), take the column indices of the
# 200 genes with the largest absolute coefficient. `argpartition` does not sort
# within the selected 200, which is fine because only membership matters here.
n_top_genes = 200
abs_coef = np.abs(model_fs.classifier.coef_)
gene_index = np.argpartition(abs_coef, -n_top_genes, axis = 1)[:, -n_top_genes:]

In [None]:
# Union of the per-class selections: flatten to 1-D and de-duplicate (result is sorted).
gene_index = np.unique(gene_index)

In [None]:
# Size of the combined gene set.
len(gene_index)

In [None]:
# Report how many distinct genes will be used to train the final model.
print(f"Number of genes selected: {len(gene_index)}")

In [None]:
# Final model: retrain on the selected genes only, with up to 300 iterations.
# `check_expression = False` bypasses the expression check, which is needed because
# only a subset of genes is passed in.
t_start = time.time()
invivo_selected = invivo[:, gene_index]
model = celltypist.train(
    invivo_selected,
    'celltype',
    check_expression = False,
    n_jobs = 10,
    max_iter = 300,
)
t_end = time.time()
print(f"Time elapsed: {(t_end - t_start)/60} minutes")

In [None]:
# Save the trained model to disk; the annotation step reloads it from this same path.
model.write('/nfs/team292/vl6/FetalReproductiveTract/celltypist_invivo_organoids_18pcw.pkl')

In [None]:
# Annotate the organoid cells with the saved in vivo model.
# `majority_voting = True` adds over-clustering and a per-subcluster majority vote
# on top of the per-cell predictions.
t_start = time.time()
predictions = celltypist.annotate(
    organoids,
    model = '/nfs/team292/vl6/FetalReproductiveTract/celltypist_invivo_organoids_18pcw.pkl',
    majority_voting = True,
)
t_end = time.time()
print(f"Time elapsed: {t_end - t_start} seconds")

The results include the predicted cell type labels (`predicted_labels`), the over-clustering result (`over_clustering`), and the predicted labels after majority voting in local subclusters (`majority_voting`). Note that in `predicted_labels`, each query cell gets its inferred label by choosing the most probable cell type among all possible cell types in the given model.

In [None]:
# Per-cell prediction table returned by `celltypist.annotate`.
predictions.predicted_labels

In [None]:
# Number of annotated organoid cells per sample.
predictions.adata.obs['sample'].value_counts()

In [None]:
# Dot plot of the majority-voting labels against organoid sample (`sample` is the reference grouping here,
# not a pre-defined cell type annotation).
# Set `use_as_prediction = 'predicted_labels'` to plot the raw per-cell predictions instead.
celltypist.dotplot(predictions, use_as_reference = 'sample', use_as_prediction = 'majority_voting')

In [None]:
# Make the per-cell predicted labels categorical so that their category order can be set explicitly.
predictions.predicted_labels['predicted_labels'] = predictions.predicted_labels['predicted_labels'].astype('category')

In [None]:
# Fix the category order of the predicted labels.
# `set_categories` is used instead of `reorder_categories` because the latter raises a
# ValueError unless the listed categories are exactly the existing ones — which fails as
# soon as one of the three reference cell types is never predicted in the organoids.
# When all three are present the result is identical to `reorder_categories`.
# NOTE: a label missing from this list would become NaN; the model is trained on
# exactly these three cell types, so none is expected.
predictions.predicted_labels['predicted_labels'] = predictions.predicted_labels['predicted_labels'].cat.set_categories([
    'Uterus/Cervix_Epithelium', 'MüllerianVagina_Epithelium', 'VaginalPlate_Epithelium'
])

In [None]:
# Make `sample` categorical and fix the order of the two organoid samples.
# `reorder_categories` raises a ValueError if the samples present differ from this list.
sample_order = ['HD_F_GON14896471', 'HD_F_GON14896472']
predictions.adata.obs['sample'] = predictions.adata.obs['sample'].astype('category')
predictions.adata.obs['sample'] = predictions.adata.obs['sample'].cat.reorder_categories(sample_order)

In [None]:
# Dot plot of the raw per-cell predictions (`predicted_labels`) against organoid sample,
# drawn with the 'OrRd' colour map and saved using the given file-name suffix.
celltypist.dotplot(
    predictions,
    use_as_reference = 'sample',
    use_as_prediction = 'predicted_labels',
    cmap = 'OrRd',
    save = '_invivo_organoids_18pcw.pdf',
)

In [None]:
# Number of organoid cells per value of the `treated` annotation.
organoids.obs['treated'].value_counts()

In [None]:
# Fraction 577 out of 1161 + 577 cells.
# NOTE(review): hard-coded numbers — presumably copied from the `treated` counts displayed in the
# previous cell; confirm which group 577 refers to, or derive the fraction with
# `value_counts(normalize=True)` so it stays correct if the data change.
577 / (1161 + 577)