In [1]:
# Read the expression table; the first column of the CSV supplies the row names.
data <- read.table(
  "data/xin/xin_data.csv",
  sep = ",",
  header = TRUE,
  row.names = 1
)

# Transpose (this also turns the data frame into a matrix) so that the result
# is used as a cell-by-gene matrix downstream.
data <- t(data)

# Read the label table; its single column is named "celltype".
annotation <- read.table(
  "data/xin/xin_celltype.csv",
  header = TRUE,
  sep = ",",
  col.names = "celltype"
)

# Duplicate the label column under a second name. Besides providing the alias,
# this keeps `annotation` a two-column data frame, so plain row subsetting
# (`annotation[idx, ]`) still returns a data frame rather than a bare vector.
annotation$cell.type <- annotation$celltype

In [2]:

# Flatten `x` into a single vector by concatenating its elements one by one.
#
# x: a vector or list to iterate over.
# Returns the concatenation of all elements of `x` (NULL when `x` is empty).
extract_prediction_label <- function(x) {
  collected <- NULL
  for (element in x) {
    collected <- c(collected, element)
  }
  collected
}



# Fraction of positions at which the predicted label equals the true label.
#
# true_labels:      vector (or factor) of reference labels.
# predicted_labels: vector (or factor) of predicted labels, same length.
#
# Returns a single number in [0, 1]; NaN for zero-length input (0 / 0).
# Stops with an error when the two inputs differ in length.
calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # `==` on two factors with different level sets is an error in R, so compare
  # the underlying label strings instead.
  if (is.factor(true_labels)) {
    true_labels <- as.character(true_labels)
  }
  if (is.factor(predicted_labels)) {
    predicted_labels <- as.character(predicted_labels)
  }

  correct_predictions <- sum(true_labels == predicted_labels)
  total_samples <- length(true_labels)

  correct_predictions / total_samples
}


# Macro-averaged F1 score over every class seen in either label vector.
#
# true_labels:      vector (or factor) of reference labels.
# predicted_labels: vector (or factor) of predicted labels, same length.
#
# Per-class F1 is computed as 2*TP / (2*TP + FP + FN), which is algebraically
# equal to the harmonic mean of precision and recall but is well defined (0)
# when a class has no true positives. Previously such classes produced NaN and
# were silently dropped from the average, inflating the macro score.
#
# Returns a single number; NaN for zero-length input.
# Stops with an error when the two inputs differ in length.
calculate_f1_score_multiclass <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # Work on label strings: combining a factor with a character vector via c()
  # can otherwise mix integer level codes into the class list.
  if (is.factor(true_labels)) {
    true_labels <- as.character(true_labels)
  }
  if (is.factor(predicted_labels)) {
    predicted_labels <- as.character(predicted_labels)
  }

  # Get unique class labels
  classes <- unique(c(true_labels, predicted_labels))

  # Per-class F1 from the confusion counts
  f1_score <- vapply(
    classes,
    function(class_label) {
      is_pred <- predicted_labels == class_label
      is_true <- true_labels == class_label

      true_positive <- sum(is_pred & is_true)
      false_positive <- sum(is_pred & !is_true)
      false_negative <- sum(!is_pred & is_true)

      # Every class comes from the labels themselves, so the denominator is
      # at least 1 for non-missing labels.
      2 * true_positive / (2 * true_positive + false_positive + false_negative)
    },
    numeric(1),
    USE.NAMES = FALSE
  )

  # na.rm only matters when the inputs contain NA labels (their per-class
  # counts become NA); such classes are ignored, as before.
  mean(f1_score, na.rm = TRUE)
}





In [3]:
# 5-fold cross-validation: train SingleR on four folds, predict the fifth.
library(SingleCellExperiment)
library(SingleR)

set.seed(123)

# Number of folds
num_folds <- 5

# Shuffle the row indices, then deal them round-robin into the folds. The fold
# id vector is spelled out at full length so that a row count that is not a
# multiple of num_folds does not trigger a recycling warning.
shuffled_rows <- sample(nrow(data))
fold_ids <- rep_len(seq_len(num_folds), length(shuffled_rows))
fold_indices <- split(shuffled_rows, fold_ids)

# Perform 5-fold cross-validation
for (i in seq_len(num_folds)) {
  # Rows of the current fold are held out; all other folds form the training set
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i], use.names = FALSE)

  # Subset data and annotation based on indices (drop = FALSE keeps the
  # matrix / data frame shape regardless of how many rows or columns remain)
  data_train <- data[train_indices, , drop = FALSE]
  anno_train <- annotation[train_indices, , drop = FALSE]

  data_test <- data[test_indices, , drop = FALSE]
  anno_test <- annotation[test_indices, , drop = FALSE]

  # SingleCellExperiment expects features in rows, so transpose back.
  # NOTE(review): the training matrix is stored under the "logcounts" assay
  # with no visible log-transformation -- confirm the input is already
  # log-normalised.
  counts_train <- t(data_train)
  sce_train <- SingleCellExperiment(
    list(logcounts = as.matrix(counts_train))
  )
  sce_train$celltype <- as.factor(anno_train$celltype)

  counts_test <- t(data_test)
  sce_test <- SingleCellExperiment(list(counts = as.matrix(counts_test)))

  # assay.type.test = 1 selects the first (only) assay of the test object
  pred <- SingleR(
    test = sce_test,
    ref = sce_train,
    labels = sce_train$celltype,
    assay.type.test = 1
  )

  # Reference labels first, predictions second, matching the metric signatures
  acc <- calculate_test_accuracy(anno_test$celltype, pred$labels)
  f1_score <- calculate_f1_score_multiclass(anno_test$celltype, pred$labels)

  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc,"F1 score:", f1_score, "\n")
}


Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges

Fold 1 : Train samples = 1280 , Test samples = 320 Accuracy: 0.959375 F1 score: 0.8463772 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”


Fold 2 : Train samples = 1280 , Test samples = 320 Accuracy: 0.975 F1 score: 0.854476 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”


Fold 3 : Train samples = 1280 , Test samples = 320 Accuracy: 0.96875 F1 score: 0.8392649 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”


Fold 4 : Train samples = 1280 , Test samples = 320 Accuracy: 0.975 F1 score: 0.8888424 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”


Fold 5 : Train samples = 1280 , Test samples = 320 Accuracy: 0.940625 F1 score: 0.807926 
