In [1]:
# Read the Segerstolpe matrix and its metadata table from remote .rds files.
SegerMatrix <- readRDS(
  url("https://storage.googleapis.com/cellid-cbl/SegerstolpeMatrix.rds")
)
SegerMetaData <- readRDS(
  url("https://storage.googleapis.com/cellid-cbl/SegerstolpeMetaData2.rds")
)

# Work on a dense, transposed copy of the matrix.
# NOTE(review): presumably the stored matrix is genes x cells, so `data` ends
# up with one row per cell -- confirm against the source file.
data <- t(as.matrix(SegerMatrix))

# Keep the metadata under a generic name and expose the `cell.type` column
# under the `celltype` name as well.
annotation <- SegerMetaData
annotation$celltype <- annotation$cell.type

Loading required package: Matrix



In [2]:
# Strip a vector of predicted labels down to a plain, unnamed vector.
#
# Args:
#   x: A vector of labels (typically a named character vector or a factor).
#      A list is flattened by concatenating its elements.
#
# Returns:
#   The values of `x` without names or other attributes. Factors are returned
#   as character labels (not integer codes). Zero-length input returns NULL.
extract_prediction_label <- function(x) {
  # Factors must yield their labels, not their underlying integer codes.
  if (is.factor(x)) {
    x <- as.character(x)
  }

  # Empty input has always produced NULL; keep that contract.
  if (length(x) == 0L) {
    return(NULL)
  }

  if (is.list(x)) {
    # Concatenate the list elements; outer names are dropped.
    return(do.call(c, unname(x)))
  }

  # as.vector() drops names and every other attribute of an atomic vector,
  # in one step instead of growing the result element by element.
  as.vector(x)
}



# Fraction of positions where the predicted label equals the true label.
#
# Args:
#   true_labels:      Vector of ground-truth labels.
#   predicted_labels: Vector of predicted labels, same length as `true_labels`.
#
# Returns:
#   A single number: (number of matching positions) / (number of positions).
#   Zero-length input gives NaN (0 / 0). An NA in either vector makes the
#   result NA, because the comparison at that position is NA.
#
# Raises:
#   An error if the two vectors differ in length.
calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # `==` on two factors fails unless both have the same level set, so compare
  # factors by their label text.
  if (is.factor(true_labels)) {
    true_labels <- as.character(true_labels)
  }
  if (is.factor(predicted_labels)) {
    predicted_labels <- as.character(predicted_labels)
  }

  correct_predictions <- sum(true_labels == predicted_labels)
  correct_predictions / length(true_labels)
}


# Macro-averaged F1 score for a multi-class labelling.
#
# Every label that occurs in either vector is treated as a class. The per-class
# F1 is computed as 2*TP / (2*TP + FP + FN), which equals the harmonic mean of
# precision and recall whenever both are defined and is 0 when the class has
# no true positives. Such classes therefore pull the macro average down
# instead of being silently dropped as NaN.
#
# Args:
#   true_labels:      Vector of ground-truth labels.
#   predicted_labels: Vector of predicted labels, same length as `true_labels`.
#
# Returns:
#   A single number: the unweighted mean of the per-class F1 scores.
#   NaN if there is no non-NA label at all (e.g. zero-length input).
#
# Raises:
#   An error if the two vectors differ in length.
calculate_f1_score_multiclass <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # Compare by label text: combining a factor with a character vector via c()
  # can leak the factor's integer codes into the set of classes.
  true_labels <- as.character(true_labels)
  predicted_labels <- as.character(predicted_labels)

  # NA is not a class; an NA at some position simply never matches any class.
  classes <- unique(c(true_labels, predicted_labels))
  classes <- classes[!is.na(classes)]
  if (length(classes) == 0L) {
    return(NaN)
  }

  f1_score <- vapply(classes, function(class_label) {
    # %in% returns FALSE (never NA) for missing labels.
    is_true <- true_labels %in% class_label
    is_predicted <- predicted_labels %in% class_label

    true_positive <- sum(is_predicted & is_true)
    false_positive <- sum(is_predicted & !is_true)
    false_negative <- sum(!is_predicted & is_true)

    # The denominator is > 0 because the class occurs in at least one vector.
    2 * true_positive / (2 * true_positive + false_positive + false_negative)
  }, numeric(1))

  mean(f1_score)
}




In [3]:
library(Seurat)
library(CelliD)
library(tidyverse) # general purpose library for data handling
library(ggpubr) #library for plotting

set.seed(124)

num_folds <- 5

# `annotation` is subset below with the same row indices as `data`, so both
# must describe the same cells; at the very least the row counts must agree.
# NOTE(review): row ORDER is assumed to match as well -- confirm upstream.
stopifnot(nrow(annotation) == nrow(data))

# Build a Seurat object from a feature x cell count matrix and run the shared
# preprocessing steps (normalise, scale, variable features, PCA, UMAP).
# NOTE(review): ScaleData() runs before FindVariableFeatures(), i.e. before
# any variable features have been selected on the new object; the usual
# Seurat order is the reverse -- confirm this is intentional.
prepare_seurat_object <- function(counts) {
  obj <- CreateSeuratObject(counts = counts)
  obj <- NormalizeData(obj)
  obj <- ScaleData(obj, verbose = FALSE)
  obj <- FindVariableFeatures(
    obj,
    selection.method = "vst", nfeatures = 2000, verbose = FALSE
  )
  obj <- RunPCA(obj, npcs = 30, verbose = FALSE)
  RunUMAP(obj, reduction = "pca", dims = 1:30, verbose = FALSE)
}

# Randomly assign every row of `data` to one of `num_folds` folds. The fold
# id vector is built at full length so split() does not have to recycle it
# (which warns when the row count is not a multiple of `num_folds`).
shuffled_rows <- sample(nrow(data))
fold_ids <- rep_len(seq_len(num_folds), length(shuffled_rows))
fold_indices <- split(shuffled_rows, fold_ids)

# Cross-validation: each fold is the query (test) set once, the remaining
# folds form the reference (training) set.
for (i in seq_len(num_folds)) {
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i], use.names = FALSE)

  # drop = FALSE keeps the subsets two-dimensional even for a one-row fold.
  data_train <- data[train_indices, , drop = FALSE]
  anno_train <- annotation[train_indices, , drop = FALSE]

  data_test <- data[test_indices, , drop = FALSE]
  anno_test <- annotation[test_indices, , drop = FALSE]

  # Seurat expects features in rows and cells in columns, hence t().
  ref <- prepare_seurat_object(t(data_train))

  query <- prepare_seurat_object(t(data_test))
  query <- RunMCA(query, nmcs = 50)

  # Per-cell gene signatures from the reference (training) cells.
  ref <- RunMCA(ref)
  ref_cell_gs <- GetCellGeneSet(ref, dims = 1:50, n.features = 200)

  # Attach the training labels to the reference cells (matched by position).
  ref$celltype <- anno_train$celltype

  # Per-cell-type gene signatures from the reference cells. Not used by the
  # cell-to-cell matching below; kept so it stays available after the loop.
  ref_group_gs <- GetGroupGeneSet(
    ref,
    dims = 1:50, n.features = 200, group.by = "celltype"
  )

  # Test every query cell against every reference-cell signature.
  HGT_ref_cell_gs <- RunCellHGT(query, pathways = ref_cell_gs, dims = 1:50)

  # For each column of the result, take the row (signature) with the largest
  # value and look up that reference cell's label.
  best_row <- apply(HGT_ref_cell_gs, 2, which.max)
  ref_cell_gs_match <- rownames(HGT_ref_cell_gs)[best_row]
  ref_cell_gs_prediction <- ref$celltype[ref_cell_gs_match]

  predictions <- extract_prediction_label(ref_cell_gs_prediction)

  acc <- calculate_test_accuracy(anno_test$celltype, predictions)
  f1_score <- calculate_f1_score_multiclass(anno_test$celltype, predictions)

  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc,"f1 score:",f1_score, "\n")
}


Attaching SeuratObject

Loading required package: SingleCellExperiment

Loading required package: SummarizedExperiment

Loading required package: MatrixGenerics

Loading required package: matrixStats


Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
    rowMadDiffs, rowMads, rowMaxs, rowMeans2, r

0.295 sec elapsed


Computing SVD



3.274 sec elapsed


Computing Coordinates



0.208 sec elapsed


Computing Fuzzy Matrix



1.352 sec elapsed


Computing SVD



10.434 sec elapsed


Computing Coordinates



0.682 sec elapsed



calculating distance



creating ranking



creating geneset



creating ranking



calculating distance


ranking genes

1734 pathways kept for hypergeometric test out of 1734, 0 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




Fold 1 : Train samples = 1734 , Test samples = 434 Accuracy: 0.9792627 f1 score: 0.9809483 


Computing Fuzzy Matrix



0.392 sec elapsed


Computing SVD



5.319 sec elapsed


Computing Coordinates



0.239 sec elapsed


Computing Fuzzy Matrix



1.486 sec elapsed


Computing SVD



10.105 sec elapsed


Computing Coordinates



0.619 sec elapsed



calculating distance



creating ranking



creating geneset



creating ranking



calculating distance


ranking genes

1734 pathways kept for hypergeometric test out of 1734, 0 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




Fold 2 : Train samples = 1734 , Test samples = 434 Accuracy: 0.9769585 f1 score: 0.9607745 


Computing Fuzzy Matrix



0.206 sec elapsed


Computing SVD



2.656 sec elapsed


Computing Coordinates



0.095 sec elapsed


Computing Fuzzy Matrix



1.363 sec elapsed


Computing SVD



7.331 sec elapsed


Computing Coordinates



0.786 sec elapsed



calculating distance



creating ranking



creating geneset



creating ranking



calculating distance


ranking genes

1734 pathways kept for hypergeometric test out of 1734, 0 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




Fold 3 : Train samples = 1734 , Test samples = 434 Accuracy: 0.9769585 f1 score: 0.9729852 


Computing Fuzzy Matrix



0.298 sec elapsed


Computing SVD



2.04 sec elapsed


Computing Coordinates



0.082 sec elapsed


Computing Fuzzy Matrix



1.046 sec elapsed


Computing SVD



8.899 sec elapsed


Computing Coordinates



0.542 sec elapsed



calculating distance



creating ranking



creating geneset



creating ranking



calculating distance


ranking genes

1735 pathways kept for hypergeometric test out of 1735, 0 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




Fold 4 : Train samples = 1735 , Test samples = 433 Accuracy: 0.9445727 f1 score: 0.8971953 


Computing Fuzzy Matrix



0.316 sec elapsed


Computing SVD



2.054 sec elapsed


Computing Coordinates



0.146 sec elapsed


Computing Fuzzy Matrix



1.287 sec elapsed


Computing SVD



9.891 sec elapsed


Computing Coordinates



0.779 sec elapsed



calculating distance



creating ranking



creating geneset



creating ranking



calculating distance


ranking genes

1735 pathways kept for hypergeometric test out of 1735, 0 filtered as less than 10 features was present in the data


calculating features overlap


performing hypergeometric test




Fold 5 : Train samples = 1735 , Test samples = 433 Accuracy: 0.9769053 f1 score: 0.957866 
