In [1]:


#read data

BaronMatrix   <- readRDS(url("https://storage.googleapis.com/cellid-cbl/BaronMatrix.rds"))
BaronMetaData <- readRDS(url("https://storage.googleapis.com/cellid-cbl/BaronMetaData.rds"))

data=t(as.matrix(BaronMatrix))
annotation=BaronMetaData

annotation$celltype=annotation$cell.type

Loading required package: Matrix

“sparse->dense coercion: allocating vector of size 1.3 GiB”


In [2]:
extract_prediction_label = function(x){
    y=c()
    for (i in x){
        y=c(y,i)
    }
    y
    return(y)
}



calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }
  
  correct_predictions <- sum(true_labels == predicted_labels)
  total_samples <- length(true_labels)
  
  accuracy <- correct_predictions / total_samples
  return(accuracy)
}


calculate_f1_score_multiclass <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }
  
  # Get unique class labels
  classes <- unique(c(true_labels, predicted_labels))
  
  # Initialize variables to store per-class statistics
  precision <- numeric(length(classes))
  recall <- numeric(length(classes))
  f1_score <- numeric(length(classes))
  
  # Calculate precision, recall, and F1 score for each class
  for (i in 1:length(classes)) {
    class_label <- classes[i]
    true_positive <- sum(predicted_labels == class_label & true_labels == class_label)
    false_positive <- sum(predicted_labels == class_label & true_labels != class_label)
    false_negative <- sum(predicted_labels != class_label & true_labels == class_label)
    
    precision[i] <- true_positive / (true_positive + false_positive)
    recall[i] <- true_positive / (true_positive + false_negative)
    
    f1_score[i] <- 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])
  }
  
  # Calculate macro-averaged F1 score
  macro_f1_score <- mean(f1_score, na.rm = TRUE)
  
  return(macro_f1_score)
}




In [3]:
library(Seurat)
library(scPred)
set.seed(124)


num_folds <- 5

# Generate 5-fold cross-validation indices
fold_indices <- split(sample(nrow(data)), 1:num_folds)

# Perform 5-fold cross-validation
for (i in 1:num_folds) {
  # Get indices for the current fold
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i])
  
  # Subset data and annotation based on indices
  data_train <- data[train_indices, ]
  anno_train <- annotation[train_indices, ]
  
  data_test <- data[test_indices, ]
  anno_test <- annotation[test_indices, ]
  
  
  reference = CreateSeuratObject(counts = t(data_train))
  query = CreateSeuratObject(counts = t(data_test))
  
  reference=NormalizeData(reference, verbose = FALSE)
  reference=FindVariableFeatures(reference, selection.method = "vst", nfeatures = 2000)
  reference=ScaleData(reference, verbose = FALSE)
  reference=RunPCA(reference, verbose = FALSE)
  reference=RunUMAP(reference, reduction = "pca", dims = 1:30, verbose = FALSE)
  

  reference$celltype=anno_train$celltype
 
  reference <- getFeatureSpace(reference, "celltype")

  reference <- trainModel(reference)

  query <- NormalizeData(query)
  query <- scPredict(query, reference)

  predictions=extract_prediction_label(query$scpred_prediction)

  acc=calculate_test_accuracy(anno_test$celltype,predictions)


  f1_score = calculate_f1_score_multiclass(anno_test$celltype,predictions)


  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc,"f1 score:",f1_score, "\n")

  
  # Perform training and testing using data_train, anno_train, data_test, and anno_test
}
  

Attaching SeuratObject

“data length is not a multiple of split variable”
“The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
This message will be shown once per session”


[32m●  Extracting feature space for each cell type...
[39m[32mDONE!
[39m[32m●  Training models for each cell type...
[39m

Loading required package: ggplot2

Loading required package: lattice



[32mDONE!
[39m[32m●  Matching reference with new dataset...
[39m[36m	 ─ 2000 features present in reference loadings
[39m[36m	 ─ 2000 features shared between reference and new dataset
[39m[36m	 ─ 100% of features in the reference are present in new dataset
[39m[32m●  Aligning new data to reference...
[39m

Harmony 1/20

Harmony 2/20

Harmony 3/20

Harmony converged after 3 iterations



[32m●  Classifying cells...
[39m[32mDONE!
[39mFold 1 : Train samples = 6855 , Test samples = 1714 Accuracy: 0.9801634 f1 score: 0.9327988 
[32m●  Extracting feature space for each cell type...
[39m[32mDONE!
[39m[32m●  Training models for each cell type...
[39m[32mDONE!
[39m[32m●  Matching reference with new dataset...
[39m[36m	 ─ 2000 features present in reference loadings
[39m[36m	 ─ 2000 features shared between reference and new dataset
[39m[36m	 ─ 100% of features in the reference are present in new dataset
[39m[32m●  Aligning new data to reference...
[39m

Harmony 1/20

Harmony 2/20

Harmony 3/20

Harmony 4/20

Harmony converged after 4 iterations



[32m●  Classifying cells...
[39m[32mDONE!
[39mFold 2 : Train samples = 6855 , Test samples = 1714 Accuracy: 0.9784131 f1 score: 0.973887 
[32m●  Extracting feature space for each cell type...
[39m[32mDONE!
[39m[32m●  Training models for each cell type...
[39m

“There were missing values in resampled performance measures.”


[32mDONE!
[39m[32m●  Matching reference with new dataset...
[39m[36m	 ─ 2000 features present in reference loadings
[39m[36m	 ─ 2000 features shared between reference and new dataset
[39m[36m	 ─ 100% of features in the reference are present in new dataset
[39m[32m●  Aligning new data to reference...
[39m

Harmony 1/20

Harmony 2/20

Harmony 3/20

Harmony 4/20

Harmony converged after 4 iterations



[32m●  Classifying cells...
[39m[32mDONE!
[39mFold 3 : Train samples = 6855 , Test samples = 1714 Accuracy: 0.9848308 f1 score: 0.9034316 
[32m●  Extracting feature space for each cell type...
[39m[32mDONE!
[39m[32m●  Training models for each cell type...
[39m[32mDONE!
[39m[32m●  Matching reference with new dataset...
[39m[36m	 ─ 2000 features present in reference loadings
[39m[36m	 ─ 2000 features shared between reference and new dataset
[39m[36m	 ─ 100% of features in the reference are present in new dataset
[39m[32m●  Aligning new data to reference...
[39m

Harmony 1/20

Harmony 2/20

Harmony 3/20

Harmony 4/20

Harmony converged after 4 iterations



[32m●  Classifying cells...
[39m[32mDONE!
[39mFold 4 : Train samples = 6855 , Test samples = 1714 Accuracy: 0.9848308 f1 score: 0.9902553 
[32m●  Extracting feature space for each cell type...
[39m[32mDONE!
[39m[32m●  Training models for each cell type...
[39m

“There were missing values in resampled performance measures.”


[32mDONE!
[39m[32m●  Matching reference with new dataset...
[39m[36m	 ─ 2000 features present in reference loadings
[39m[36m	 ─ 2000 features shared between reference and new dataset
[39m[36m	 ─ 100% of features in the reference are present in new dataset
[39m[32m●  Aligning new data to reference...
[39m

Harmony 1/20

Harmony 2/20

Harmony 3/20

Harmony 4/20

Harmony converged after 4 iterations



[32m●  Classifying cells...
[39m[32mDONE!
[39mFold 5 : Train samples = 6856 , Test samples = 1713 Accuracy: 0.9801518 f1 score: 0.9678119 
