In [1]:
# Read the gene expression table. The file is comma-separated with a header
# row; its first column is used as the row names of `data`.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
data <- read.table(
  "data/zheng68k/zheng68k.csv",
  sep = ",",
  header = TRUE,
  row.names = 1
)



In [3]:
# Load the per-barcode annotation table (tab-separated, with a header row)
# and show its first rows.
annotation <- read.table(
  file = "data/zheng68k/68k_pbmc_barcodes_annotation.tsv.txt",
  sep = "\t",
  header = TRUE
)

head(annotation)

Unnamed: 0_level_0,TSNE.1,TSNE.2,barcodes,celltype
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<chr>
1,7.56554,0.4413703,AAACATACACCCAA-1,CD8+ Cytotoxic T
2,2.552626,-25.7866723,AAACATACCCCTCA-1,CD8+/CD45RA+ Naive Cytotoxic
3,-5.771831,11.830846,AAACATACCGGAGA-1,CD4+/CD45RO+ Memory
4,1.762556,25.9793459,AAACATACTAACCG-1,CD19+ B
5,-16.793856,-16.5899699,AAACATACTCTTCA-1,CD4+/CD25 T Reg
6,-15.339791,-11.0882635,AAACATACTGGATC-1,CD4+/CD25 T Reg


In [4]:
# Keep at most the first 10000 rows of both tables.
# head() clamps to the number of available rows, whereas x[1:10000, ] would
# pad a shorter table with all-NA rows.
# NOTE(review): rows of `data` and `annotation` are paired by position here
# and below -- confirm both files list the barcodes in the same order.
data <- head(data, 10000L)
annotation <- head(annotation, 10000L)

In [4]:
#' Flatten a collection of labels into a plain vector.
#'
#' Gives the same result as concatenating the elements of `x` one by one,
#' without growing a vector inside a loop (which copies on every append).
#'
#' @param x A vector, factor or list of labels.
#' @return `NULL` when `x` is empty. A character vector when `x` is a factor.
#'   Otherwise the elements of `x` concatenated with `c()`, with the names
#'   and other attributes of `x` itself dropped.
extract_prediction_label <- function(x) {
  if (length(x) == 0L) {
    return(NULL)
  }
  # Iterating over a factor yields its labels as character strings.
  if (is.factor(x)) {
    return(as.character(x))
  }
  if (is.atomic(x)) {
    return(as.vector(unname(x)))
  }
  # List input: concatenate the elements, ignoring the list's own names.
  do.call(c, unname(as.list(x)))
}



#' Fraction of predictions that exactly match the reference labels.
#'
#' @param true_labels Vector or factor of reference labels.
#' @param predicted_labels Vector or factor of predicted labels, same length.
#' @return A single number in [0, 1]; `NaN` for empty input and `NA` when any
#'   compared label is `NA`.
calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # Compare as character: `==` on two factors whose level sets differ raises
  # an error, which happens as soon as a class is never predicted.
  matches <- as.character(true_labels) == as.character(predicted_labels)

  sum(matches) / length(matches)
}


#' Macro-averaged F1 score for multi-class labels.
#'
#' The per-class F1 is computed as 2*TP / (2*TP + FP + FN) for every class
#' that occurs in either input, and the unweighted mean is returned. A class
#' with no true positives therefore contributes 0 to the average instead of
#' producing NaN and being silently dropped, which would inflate the score.
#'
#' @param true_labels Vector or factor of reference labels.
#' @param predicted_labels Vector or factor of predicted labels, same length.
#' @return A single number in [0, 1]; `NaN` when there are no non-NA labels.
calculate_f1_score_multiclass <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # Work on character labels: c() on factors may combine integer codes
  # rather than the labels themselves.
  true_labels <- as.character(true_labels)
  predicted_labels <- as.character(predicted_labels)

  # Every class seen in either vector; NA is not a class of its own.
  classes <- unique(c(true_labels, predicted_labels))
  classes <- classes[!is.na(classes)]

  per_class_f1 <- vapply(classes, function(class_label) {
    # %in% never returns NA, so a missing label simply counts as a mismatch.
    is_true <- true_labels %in% class_label
    is_pred <- predicted_labels %in% class_label

    true_positive <- sum(is_pred & is_true)
    false_positive <- sum(is_pred & !is_true)
    false_negative <- sum(!is_pred & is_true)

    # The denominator is > 0 because the class occurs at least once.
    2 * true_positive / (2 * true_positive + false_positive + false_negative)
  }, numeric(1))

  mean(per_class_f1)
}




In [5]:
# 5-fold cross-validation of SingleR label transfer.
# For every fold the remaining folds form the labelled reference and the
# held-out fold is annotated; accuracy and macro F1 are printed per fold.
library(SingleCellExperiment)
library(SingleR)

set.seed(123)

# Number of folds
num_folds <- 5

# Shuffle the row indices and deal them round-robin into the folds.
# rep_len() gives the same assignment as recycling 1:num_folds, but without
# the "data length is not a multiple of split variable" warning.
fold_indices <- split(
  sample(nrow(data)),
  rep_len(seq_len(num_folds), nrow(data))
)

# seq_len() instead of 1:num_folds so that a fold count of 0 runs no iterations.
for (i in seq_len(num_folds)) {
  # Row indices of the held-out fold and of all remaining folds
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i], use.names = FALSE)

  # NOTE(review): data and annotation are paired by row position -- confirm
  # both tables list the cells in the same order.
  data_train <- data[train_indices, ]
  anno_train <- annotation[train_indices, ]

  data_test <- data[test_indices, ]
  anno_test <- annotation[test_indices, ]

  # The tables have one row per cell, so they are transposed before being
  # stored as assays.
  # NOTE(review): the reference assay is named "logcounts" but receives the
  # values of `data` unchanged -- confirm they are already log-normalised.
  counts_train <- t(data_train)
  sce_train <- SingleCellExperiment(list(logcounts = as.matrix(counts_train)))
  sce_train$celltype <- as.factor(anno_train$celltype)

  counts_test <- t(data_test)
  sce_test <- SingleCellExperiment(list(counts = as.matrix(counts_test)))

  # assay.type.test = 1 selects the first (only) assay of sce_test.
  pred <- SingleR(
    test = sce_test,
    ref = sce_train,
    labels = sce_train$celltype,
    assay.type.test = 1
  )

  # Reference labels first, predictions second, matching the
  # (true_labels, predicted_labels) signature of both metric functions.
  acc <- calculate_test_accuracy(anno_test$celltype, pred$labels)
  f1_score <- calculate_f1_score_multiclass(anno_test$celltype, pred$labels)

  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc, "F1 score:", f1_score, "\n")
}


Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats




Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
    rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
    rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
    rowWeightedMads, rowWeightedMeans, rowWeigh

Fold 1 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3817439 F1 score: 0.4435614 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 2 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3930446 F1 score: 0.4443934 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 3 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3920968 F1 score: 0.4967279 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 4 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3758384 F1 score: 0.4324964 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 5 : Train samples = 54864 , Test samples = 13715 Accuracy: 0.3668976 F1 score: 0.4327441 


In [22]:
# Split data and annotation into a random 80% training / 20% test partition.
# The seed is fixed so the partition is reproducible.
set.seed(123)

# sample.int()/seq_len() instead of 1:nrow(data): 1:0 would be c(1, 0) for an
# empty table. floor() makes the truncation of the 80% size explicit.
train <- sample.int(nrow(data), floor(0.8 * nrow(data)))
test <- setdiff(seq_len(nrow(data)), train)

# train
data_train <- data[train, ]
anno_train <- annotation[train, ]

# test
data_test <- data[test, ]
anno_test <- annotation[test, ]



In [23]:
# Preview the first rows of the training expression table and of the
# matching annotation rows.
head(data_train)

head(anno_train)

Unnamed: 0_level_0,MIR1302.10,FAM138A,OR4F5,RP11.34P13.7,RP11.34P13.8,AL627309.1,RP11.34P13.14,RP11.34P13.9,AP006222.2,RP4.669L17.10,⋯,KIR3DL2.1,AL590523.1,CT476828.1,PNRC2.1,SRSF10.1,AC145205.1,BAGE5,CU459201.1,AC002321.2,AC002321.1
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
TTGGGAACTAACCG-6,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
GGGATGGAAACAGA-7,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CACCCATGCGCATA-1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CTCTAATGTTCCAT-4,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
TTCAAAGAGTAAAG-8,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
ATGACGTGTCCAAG-8,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,TSNE.1,TSNE.2,barcodes,celltype
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<chr>
51663,-27.794167,-6.243935,TTGGGAACTAACCG-6,CD4+/CD25 T Reg
57870,-4.473658,11.844563,GGGATGGAAACAGA-7,CD8+ Cytotoxic T
2986,-24.091027,-5.99972,CACCCATGCGCATA-1,CD19+ B
29925,-12.443192,17.394858,CTCTAATGTTCCAT-4,CD4+/CD25 T Reg
68293,-11.840912,-10.940324,TTCAAAGAGTAAAG-8,CD4+/CD25 T Reg
62555,15.901041,-13.52371,ATGACGTGTCCAAG-8,CD56+ NK


In [24]:
# Build the reference SingleCellExperiment from the training split.
# data_train holds one row per cell, so it is transposed before being stored
# as the "logcounts" assay.
library(SingleCellExperiment)

counts_train <- t(data_train)

sce_train <- SingleCellExperiment(
  assays = list(logcounts = as.matrix(counts_train))
)

In [25]:
# Attach the training cell-type labels to the reference object as a factor.
sce_train$celltype <- as.factor(anno_train$celltype)

In [26]:
# Build the query SingleCellExperiment from the test split; the transposed
# table is stored as the "counts" assay.
counts_test <- t(data_test)

sce_test <- SingleCellExperiment(
  assays = list(counts = as.matrix(counts_test))
)



In [27]:
library(SingleR)

# Annotate the test cells against the labelled training reference.
# assay.type.test = 1 selects the first assay of sce_test.
pred <- SingleR(
  test = sce_test,
  ref = sce_train,
  labels = sce_train$celltype,
  assay.type.test = 1
)

# List the columns of the prediction result.
colnames(pred)

“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

In [28]:
# Print the label SingleR assigned to every test cell.
pred$labels

In [29]:
# Print the annotated (reference) cell type of every test cell.
anno_test$celltype

In [5]:
# Accuracy of the predicted labels against the annotated cell types.

#' Fraction of predictions that exactly match the reference labels.
#'
#' @param true_labels Vector or factor of reference labels.
#' @param predicted_labels Vector or factor of predicted labels, same length.
#' @return A single number in [0, 1]; `NaN` for empty input and `NA` when any
#'   compared label is `NA`.
calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # Compare as character: `==` on two factors whose level sets differ raises
  # an error, which happens as soon as a class is never predicted.
  matches <- as.character(true_labels) == as.character(predicted_labels)

  sum(matches) / length(matches)
}

# Hold-out accuracy. The annotated labels are passed first and the predictions
# second, matching the (true_labels, predicted_labels) signature.
acc <- calculate_test_accuracy(anno_test$celltype, pred$labels)

acc



#' F1 score for binary labels.
#'
#' Computed as 2*TP / (2*TP + FP + FN), which equals the harmonic mean of
#' precision and recall whenever both are defined. Unlike the
#' precision/recall form it yields 0, not NaN, when there are no true
#' positives but at least one false positive or false negative.
#'
#' @param true_labels Vector of reference labels.
#' @param predicted_labels Vector of predicted labels, same length.
#' @param positive Value that marks the positive class (default 1). Every
#'   other value is treated as negative.
#' @return A single number in [0, 1]; `NaN` when the positive class occurs in
#'   neither input.
calculate_f1_score <- function(true_labels, predicted_labels, positive = 1) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  truth_is_positive <- true_labels == positive
  pred_is_positive <- predicted_labels == positive

  true_positive <- sum(truth_is_positive & pred_is_positive)
  false_positive <- sum(!truth_is_positive & pred_is_positive)
  false_negative <- sum(truth_is_positive & !pred_is_positive)

  2 * true_positive / (2 * true_positive + false_positive + false_negative)
}


In [9]:
# 5-fold cross-validation of SingleR label transfer.
# For every fold the remaining folds form the labelled reference and the
# held-out fold is annotated; the accuracy is printed per fold.
library(SingleCellExperiment)
library(SingleR)

set.seed(123)

# Number of folds
num_folds <- 5

# Shuffle the row indices and deal them round-robin into the folds.
# rep_len() gives the same assignment as recycling 1:num_folds, but without
# the "data length is not a multiple of split variable" warning.
fold_indices <- split(
  sample(nrow(data)),
  rep_len(seq_len(num_folds), nrow(data))
)

# seq_len() instead of 1:num_folds so that a fold count of 0 runs no iterations.
for (i in seq_len(num_folds)) {
  # Row indices of the held-out fold and of all remaining folds
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i], use.names = FALSE)

  # NOTE(review): data and annotation are paired by row position -- confirm
  # both tables list the cells in the same order.
  data_train <- data[train_indices, ]
  anno_train <- annotation[train_indices, ]

  data_test <- data[test_indices, ]
  anno_test <- annotation[test_indices, ]

  # The tables have one row per cell, so they are transposed before being
  # stored as assays.
  # NOTE(review): the reference assay is named "logcounts" but receives the
  # values of `data` unchanged -- confirm they are already log-normalised.
  counts_train <- t(data_train)
  sce_train <- SingleCellExperiment(list(logcounts = as.matrix(counts_train)))
  sce_train$celltype <- as.factor(anno_train$celltype)

  counts_test <- t(data_test)
  sce_test <- SingleCellExperiment(list(counts = as.matrix(counts_test)))

  # assay.type.test = 1 selects the first (only) assay of sce_test.
  pred <- SingleR(
    test = sce_test,
    ref = sce_train,
    labels = sce_train$celltype,
    assay.type.test = 1
  )

  # Reference labels first, predictions second, matching the
  # (true_labels, predicted_labels) signature.
  acc <- calculate_test_accuracy(anno_test$celltype, pred$labels)

  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc, "\n")
}


“data length is not a multiple of split variable”


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 1 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3817439 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 2 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3930446 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 3 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3920968 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 4 : Train samples = 54863 , Test samples = 13716 Accuracy: 0.3758384 


“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”
“useNames = NA is deprecated. Instead, specify either useNames = TRUE or useNames = TRUE.”

Fold 5 : Train samples = 54864 , Test samples = 13715 Accuracy: 0.3668976 
