In [1]:
#celliD
#https://bioconductor.riken.jp/packages/3.14/bioc/vignettes/CelliD/inst/doc/BioconductorVignette.html#cellid-automatic-cell-type-prediction-using-pre-established-marker-lists


library(CelliD)
library(tidyverse) # general purpose library for data handling
library(ggpubr) #library for plotting





Loading required package: Seurat

Attaching SeuratObject

Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDif

In [2]:
data=read.table("data/zheng68k/zheng68k.csv",sep=",",header=T,row.names=1)

In [3]:
annotation=read.table("data/zheng68k/68k_pbmc_barcodes_annotation.tsv.txt",header=TRUE,sep="\t")

head(annotation)

Unnamed: 0_level_0,TSNE.1,TSNE.2,barcodes,celltype
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<chr>
1,7.56554,0.4413703,AAACATACACCCAA-1,CD8+ Cytotoxic T
2,2.552626,-25.7866723,AAACATACCCCTCA-1,CD8+/CD45RA+ Naive Cytotoxic
3,-5.771831,11.830846,AAACATACCGGAGA-1,CD4+/CD45RO+ Memory
4,1.762556,25.9793459,AAACATACTAACCG-1,CD19+ B
5,-16.793856,-16.5899699,AAACATACTCTTCA-1,CD4+/CD25 T Reg
6,-15.339791,-11.0882635,AAACATACTGGATC-1,CD4+/CD25 T Reg


In [5]:
data=data[1:10000,]
annotation=annotation[1:10000,]

In [4]:
extract_prediction_label = function(x){
    y=c()
    for (i in x){
        y=c(y,i)
    }
    y
    return(y)
}



calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }
  
  correct_predictions <- sum(true_labels == predicted_labels)
  total_samples <- length(true_labels)
  
  accuracy <- correct_predictions / total_samples
  return(accuracy)
}


calculate_f1_score_multiclass <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }
  
  # Get unique class labels
  classes <- unique(c(true_labels, predicted_labels))
  
  # Initialize variables to store per-class statistics
  precision <- numeric(length(classes))
  recall <- numeric(length(classes))
  f1_score <- numeric(length(classes))
  
  # Calculate precision, recall, and F1 score for each class
  for (i in 1:length(classes)) {
    class_label <- classes[i]
    true_positive <- sum(predicted_labels == class_label & true_labels == class_label)
    false_positive <- sum(predicted_labels == class_label & true_labels != class_label)
    false_negative <- sum(predicted_labels != class_label & true_labels == class_label)
    
    precision[i] <- true_positive / (true_positive + false_positive)
    recall[i] <- true_positive / (true_positive + false_negative)
    
    f1_score[i] <- 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])
  }
  
  # Calculate macro-averaged F1 score
  macro_f1_score <- mean(f1_score, na.rm = TRUE)
  
  return(macro_f1_score)
}




In [5]:
library(Seurat)

set.seed(124)


num_folds <- 5

# Generate 5-fold cross-validation indices
fold_indices <- split(sample(nrow(data)), 1:num_folds)

# Perform 5-fold cross-validation
for (i in 5:num_folds) {
  # Get indices for the current fold
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i])
  
  # Subset data and annotation based on indices
  data_train <- data[train_indices, ]
  anno_train <- annotation[train_indices, ]
  
  data_test <- data[test_indices, ]
  anno_test <- annotation[test_indices, ]
  
  ref = CreateSeuratObject(counts = t(data_train))
  ref <- NormalizeData(ref)
  ref <- ScaleData(ref, verbose = FALSE)
  ref =  FindVariableFeatures(ref, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
  ref <- RunPCA(ref, npcs = 30, verbose = FALSE)
  ref <- RunUMAP(ref, reduction = "pca", dims = 1:30, verbose = FALSE)
  
  query = CreateSeuratObject(counts = t(data_test))
  query <- NormalizeData(query)
  query <- ScaleData(query, verbose = FALSE)
  query =  FindVariableFeatures(query, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
  query <- RunPCA(query, npcs = 30, verbose = FALSE)
  query <- RunUMAP(query, reduction = "pca", dims = 1:30, verbose = FALSE)
  query <- RunMCA(query, nmcs = 50)

  #cell match
  ref=RunMCA(ref)
  # Extracting per-cell gene signatures from the Baron dataset with CelliD(c)
  ref_cell_gs <- GetCellGeneSet(ref, dims = 1:50, n.features = 200)

  ref$celltype=anno_train$celltype


  # Extracting per-group gene signatures from the Baron dataset with CelliD(g)
  ref_group_gs <- GetGroupGeneSet(ref, dims = 1:50, n.features = 200, group.by = "celltype")
  
  
  #query <- FindVariableFeatures(query)
  #query <- ScaleData(query)
  #query <- RunMCA(query, nmcs = 50)

  #query <- RunPCA(query)
  #query <- RunUMAP(query, dims = 1:30)
  #query <- RunTSNE(query, dims = 1:30)
  


  HGT_ref_cell_gs <- RunCellHGT(query, pathways = ref_cell_gs, dims = 1:50)

  ref_cell_gs_match <- rownames(HGT_ref_cell_gs)[apply(HGT_ref_cell_gs, 2, which.max)]
  ref_cell_gs_prediction <- ref$celltype[ref_cell_gs_match]


  predictions=extract_prediction_label(ref_cell_gs_prediction)
  
  acc=calculate_test_accuracy(anno_test$celltype,predictions)


  f1_score = calculate_f1_score_multiclass(anno_test$celltype,predictions)


  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc,"f1 score:",f1_score, "\n")

  
  # Perform training and testing using data_train, anno_train, data_test, and anno_test
}


“data length is not a multiple of split variable”


“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”
“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“sparse->dense coercion: allocating vector of size 3.3 GiB”
Computing Fuzzy Matrix



8.788 sec elapsed


Computing SVD



43.241 sec elapsed


Computing Coordinates



4.112 sec elapsed


“sparse->dense coercion: allocating vector of size 13.4 GiB”
Computing Fuzzy Matrix



51.031 sec elapsed


Computing SVD



234.954 sec elapsed


Computing Coordinates



12.443 sec elapsed



calculating distance



creating ranking



creating geneset



creating ranking



calculating distance


ranking genes

54864 pathways kept for hypergeometric test out of 54864, 0 filtered as less than 10 features was present in the data


calculating features overlap


“sparse->dense coercion: allocating vector of size 5.6 GiB”
performing hypergeometric test


“sparse->dense coercion: allocating vector of size 5.6 GiB”


Fold 5 : Train samples = 54864 , Test samples = 13715 Accuracy: 0.5294203 f1 score: 0.5104825 


In [5]:
set.seed(123)
train <- sample(1:nrow(data), 0.999*nrow(data))
test <- setdiff(1:nrow(data), train)

#train
data_train <- data[train,]
anno_train <- annotation[train,]

#test
data_test <- data[test,]
anno_test <- annotation[test,]

In [6]:
library(Seurat)
 
ref = CreateSeuratObject(counts = t(data_train))

ref <- ScaleData(ref, verbose = FALSE)
ref =  FindVariableFeatures(ref, selection.method = "vst", nfeatures = 2000, verbose = FALSE)

ref <- RunPCA(ref, npcs = 30, verbose = FALSE)
ref <- RunUMAP(ref, reduction = "pca", dims = 1:30, verbose = FALSE)

“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”


In [8]:
query = CreateSeuratObject(counts = t(data_test))

query <- ScaleData(query, verbose = FALSE)
query =  FindVariableFeatures(query, selection.method = "vst", nfeatures = 2000, verbose = FALSE)

query <- RunPCA(query, npcs = 30, verbose = FALSE)
query <- RunUMAP(query, reduction = "pca", dims = 1:30, verbose = FALSE)

“Feature names cannot have underscores ('_'), replacing with dashes ('-')”
“at  -1.8561”
“radius  0.00029837”
“all data on boundary of neighborhood. make span bigger”
“pseudoinverse used at -1.8561”
“neighborhood radius 0.017273”
“reciprocal condition number  1”
“There are other near singularities as well. 0.090619”
“zero-width neighborhood. make span bigger”


In [11]:
query=RunMCA(query)

Computing Fuzzy Matrix



0.021 sec elapsed


Computing SVD

“You're computing too large a percentage of total singular values, use a standard svd instead.”


0.075 sec elapsed


Computing Coordinates



0.006 sec elapsed


In [13]:
# Performing per-cell hypergeometric tests against the gene signature collection
HGT_immune_gs <- RunCellHGT(query, pathways = Immune_gs, dims = 1:50, n.features = 200)


calculating distance


ranking genes



18 pathways kept for hypergeometric test out of 25, 7 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




In [14]:
# For each cell, assess the signature with the lowest corrected p-value (max -log10 corrected p-value)
Immune_gs_prediction <- rownames(HGT_immune_gs)[apply(HGT_immune_gs, 2, which.max)]


In [15]:
Immune_gs_prediction

In [18]:
anno_test$celltype

In [26]:
#cell match
ref=RunMCA(ref)
# Extracting per-cell gene signatures from the Baron dataset with CelliD(c)
ref_cell_gs <- GetCellGeneSet(ref, dims = 1:50, n.features = 200)

“sparse->dense coercion: allocating vector of size 16.7 GiB”


Computing Fuzzy Matrix



171.34 sec elapsed


Computing SVD



239.331 sec elapsed


Computing Coordinates



22.265 sec elapsed



calculating distance



creating ranking



creating geneset




In [29]:
ref$celltype=anno_train$celltype

In [30]:
# Extracting per-group gene signatures from the Baron dataset with CelliD(g)
ref_group_gs <- GetGroupGeneSet(ref, dims = 1:50, n.features = 200, group.by = "celltype")


creating ranking




In [31]:
query <- NormalizeData(query)
query <- FindVariableFeatures(query)
query <- ScaleData(query)
query <- RunMCA(query, nmcs = 50)

“at  -1.8561”
“radius  0.00029837”
“all data on boundary of neighborhood. make span bigger”
“pseudoinverse used at -1.8561”
“neighborhood radius 0.017273”
“reciprocal condition number  1”
“There are other near singularities as well. 0.090619”
“zero-width neighborhood. make span bigger”
Centering and scaling data matrix

Computing Fuzzy Matrix



0.015 sec elapsed


Computing SVD

“You're computing too large a percentage of total singular values, use a standard svd instead.”


0.072 sec elapsed


Computing Coordinates



0.007 sec elapsed


In [32]:
query <- RunPCA(query)
query <- RunUMAP(query, dims = 1:30)
query <- RunTSNE(query, dims = 1:30)

“You're computing too large a percentage of total singular values, use a standard svd instead.”
PC_ 1 
Positive:  SDPR, PPBP, GNG11, RUFY1, ACRBP, TMEM40, ARHGAP6, GP6, PF4, SPARC 
	   CLU, PTCRA, MPP1, CMTM5, CLDN5, PLA2G12A, TUBB1, C2orf88, MAP3K7CL, TSC22D1 
	   STX11, EXOC4, NRGN, RGS18, SMOX, MEF2C, CRAT, LYPLAL1, HIST1H2AC, GP9 
Negative:  RPL13, RPS4X, RPL10, RPS18, RPS3, RPL18A, RPS3A, RPS2, RPL32, RPL10A 
	   MALAT1, RPL9, RPS12, RPL12, RPL31, RPL3, RPS23, RPS8, TMSB10, TPT1 
	   RPL13A, RPL23A, RPS6, RPL34, RPS25, RPLP1, RPL29, RPL6, RPL7, RPL26 
PC_ 2 
Positive:  RPL21, RPL11, BTG1, RPL13A, RPS25, RPS14, RPS6, GYPC, RSL1D1, RPL9 
	   RPL3, NCOA4, RPL13, PIM1, APEX1, TXNIP, KLHL24, RTN3, SP140L, TALDO1 
	   GNL3, NCK2, HERPUD2, RAB37, PRDX2, CYB5R1, LBH, B2M, RBM38, CCR7 
Negative:  WDR61, ROGDI, TBC1D23, CIR1, NAPG, PRPF38A, CPNE3, SSR3, TMEM126B, SEC13 
	   COPS7B, PCYT1A, PTRHD1, ARRDC1, CTSH, AC009506.1, SMARCB1, SH3GLB1, GAR1, CLEC10A 
	   GSN, MNDA, CPVL, FCER1A, ACTR1A

ERROR: Error in .check_tsne_params(nrow(X), dims = dims, perplexity = perplexity, : perplexity is too large for the number of samples


In [33]:
HGT_ref_cell_gs <- RunCellHGT(query, pathways = ref_cell_gs, dims = 1:50)


calculating distance


ranking genes

45035 pathways kept for hypergeometric test out of 68510, 23475 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




In [35]:
ref_cell_gs_match <- rownames(HGT_ref_cell_gs)[apply(HGT_ref_cell_gs, 2, which.max)]
ref_cell_gs_prediction <- ref$celltype[ref_cell_gs_match]

In [36]:
ref_cell_gs_prediction

In [None]:
'CD19+ B''CD8+/CD45RA+ Naive Cytotoxic''CD8+/CD45RA+ Naive Cytotoxic''CD56+ NK''CD4+/CD45RO+ Memory''CD8+/CD45RA+ Naive Cytotoxic''CD34+''CD4+/CD45RA+/CD25- Naive T''CD8+ Cytotoxic T''Dendritic''CD8+/CD45RA+ Naive Cytotoxic''CD8+ Cytotoxic T''CD4+/CD45RO+ Memory''CD8+/CD45RA+ Naive Cytotoxic''CD8+/CD45RA+ Naive Cytotoxic''CD8+ Cytotoxic T''CD56+ NK''CD8+ Cytotoxic T''CD8+/CD45RA+ Naive Cytotoxic''CD4+/CD25 T Reg''CD8+ Cytotoxic T''CD56+ NK''CD14+ Monocyte''CD8+/CD45RA+ Naive Cytotoxic''Dendritic''CD8+ Cytotoxic T''CD8+/CD45RA+ Naive Cytotoxic''CD8+/CD45RA+ Naive Cytotoxic''CD19+ B''CD56+ NK''CD8+/CD45RA+ Naive Cytotoxic''CD8+/CD45RA+ Naive Cytotoxic''CD8+/CD45RA+ Naive Cytotoxic''CD56+ NK''CD8+ Cytotoxic T''CD8+/CD45RA+ Naive Cytotoxic''CD14+ Monocyte''CD4+/CD25 T Reg''CD8+/CD45RA+ Naive Cytotoxic''CD4+/CD25 T Reg''CD8+/CD45RA+ Naive Cytotoxic''CD4+/CD25 T Reg''CD8+/CD45RA+ Naive Cytotoxic''CD56+ NK''CD4+/CD45RA+/CD25- Naive T''CD8+ Cytotoxic T''CD8+ Cytotoxic T''CD56+ NK''CD8+ Cytotoxic T''CD8+/CD45RA+ Naive Cytotoxic''CD8+/CD45RA+ Naive Cytotoxic''CD4+/CD45RO+ Memory''CD56+ NK''CD8+ Cytotoxic T''CD4+/CD25 T Reg''CD8+ Cytotoxic T''CD8+ Cytotoxic T''CD8+ Cytotoxic T''CD8+ Cytotoxic T''CD4+/CD45RA+/CD25- Naive T''CD19+ B''CD4+/CD45RA+/CD25- Naive T''CD8+ Cytotoxic T''CD8+ Cytotoxic T''CD4+/CD25 T Reg''CD8+/CD45RA+ Naive Cytotoxic''CD4+/CD45RA+/CD25- Naive T''CD8+ Cytotoxic T''CD19+ B'