In [2]:
# Download the Xin pancreas dataset and export its expression matrix and
# per-cell labels as CSV files under data/xin/.
library(scRNAseq)

xin <- XinPancreasData()

# Expression values from the "rpkm" assay, with the rows renamed to the gene
# symbols stored in the row metadata.
data <- as.matrix(xin@assays@data$rpkm)
rownames(data) <- xin@rowRanges@elementMetadata$symbol

# Cell-type labels taken from the column (per-cell) metadata.
celltype <- xin@colData$cell.type

# write.table() does not create missing directories, so make sure the target
# directory exists before writing into it.
dir.create("data/xin", recursive = TRUE, showWarnings = FALSE)

write.table(data, file = "data/xin/xin_data.csv", sep = ",", col.names = TRUE)

write.table(
  celltype,
  file = "data/xin/xin_celltype.csv", sep = ",", col.names = TRUE
)

snapshotDate(): 2022-10-31

see ?scRNAseq and browseVignettes('scRNAseq') for documentation

loading from cache

see ?scRNAseq and browseVignettes('scRNAseq') for documentation

loading from cache

see ?scRNAseq and browseVignettes('scRNAseq') for documentation

loading from cache



In [19]:
# Read the exported expression matrix and cell-type labels back in.

# The first CSV column holds the gene names and becomes the row names.
data <- read.table(
  "data/xin/xin_data.csv",
  sep = ",", header = TRUE, row.names = 1
)

data <- t(data) # convert to cell by gene matrix

annotation <- read.table(
  "data/xin/xin_celltype.csv",
  header = TRUE, sep = ",", col.names = "celltype"
)

# Expose the labels under the `cell.type` name as well, which is the column
# the cross-validation code reads.
annotation$cell.type <- annotation$celltype

# Cells and labels are paired purely by row position, so a length mismatch
# would silently mislabel cells downstream; fail early instead.
stopifnot(nrow(data) == nrow(annotation))

In [20]:

# Walk over the elements of `x` in order and concatenate them into a single
# vector. Returns NULL when `x` has no elements.
extract_prediction_label <- function(x) {
  collected <- NULL
  for (element in x) {
    collected <- c(collected, element)
  }
  return(collected)
}



# Fraction of predictions that equal the corresponding true label.
#
# Arguments:
#   true_labels      vector of reference labels.
#   predicted_labels vector of predicted labels, same length as `true_labels`.
#
# Returns a single number: (number of matching positions) / (number of labels).
# Stops with an error when the two vectors differ in length.
calculate_test_accuracy <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # `==` refuses to compare two factors whose level sets differ, which is the
  # normal situation when a classifier never predicts some class. Comparing
  # the labels as character strings avoids that error.
  if (is.factor(true_labels)) {
    true_labels <- as.character(true_labels)
  }
  if (is.factor(predicted_labels)) {
    predicted_labels <- as.character(predicted_labels)
  }

  correct_predictions <- sum(true_labels == predicted_labels)
  total_samples <- length(true_labels)

  correct_predictions / total_samples
}


# Macro-averaged F1 score over every class that appears in either the true or
# the predicted labels.
#
# Arguments:
#   true_labels      vector of reference labels.
#   predicted_labels vector of predicted labels, same length as `true_labels`.
#
# Returns a single number: the unweighted mean of the per-class F1 scores.
# Stops with an error when the two vectors differ in length.
calculate_f1_score_multiclass <- function(true_labels, predicted_labels) {
  if (length(true_labels) != length(predicted_labels)) {
    stop("Input vectors must have the same length.")
  }

  # Work on character labels: comparing factors with different level sets
  # errors, and c() does not reliably merge a factor with a non-factor.
  if (is.factor(true_labels)) {
    true_labels <- as.character(true_labels)
  }
  if (is.factor(predicted_labels)) {
    predicted_labels <- as.character(predicted_labels)
  }

  # Get unique class labels
  classes <- unique(c(true_labels, predicted_labels))

  # Per-class F1 written as 2TP / (2TP + FP + FN). This equals
  # 2 * precision * recall / (precision + recall) whenever that is defined,
  # and evaluates to 0 (instead of NaN) for a class with no true positives.
  # The precision/recall form produced NaN for such classes, and the NaN was
  # then dropped by `na.rm = TRUE`, so completely missed classes did not
  # lower the macro average.
  f1_score <- vapply(seq_along(classes), function(i) {
    class_label <- classes[i]
    is_predicted <- predicted_labels == class_label
    is_true <- true_labels == class_label

    true_positive <- sum(is_predicted & is_true)
    false_positive <- sum(is_predicted & !is_true)
    false_negative <- sum(!is_predicted & is_true)

    2 * true_positive / (2 * true_positive + false_positive + false_negative)
  }, numeric(1))

  # na.rm is kept so that classes whose counts are NA because of missing
  # labels are skipped, as before.
  mean(f1_score, na.rm = TRUE)
}





In [21]:
library(Seurat)

set.seed(124)


num_folds <- 5

# Generate 5-fold cross-validation indices: shuffle the cell indices, then
# deal them round-robin into the folds. rep_len() yields the same assignment
# as recycling 1:num_folds, but without a warning when the number of cells is
# not a multiple of num_folds.
shuffled_cells <- sample(nrow(data))
fold_indices <- split(
  shuffled_cells,
  rep_len(seq_len(num_folds), length(shuffled_cells))
)

# Build a Seurat object from a cell-by-gene matrix and run the preprocessing
# shared by the reference and the query: scaling, variable-feature selection,
# PCA and UMAP.
# NOTE(review): there is no NormalizeData() step and ScaleData() runs before
# FindVariableFeatures(); the order is kept as-is so results do not change.
# TODO confirm this is intended for RPKM input.
prepare_seurat <- function(cell_by_gene) {
  obj <- CreateSeuratObject(counts = t(cell_by_gene))
  obj <- ScaleData(obj, verbose = FALSE)
  obj <- FindVariableFeatures(
    obj,
    selection.method = "vst", nfeatures = 2000, verbose = FALSE
  )
  obj <- RunPCA(obj, npcs = 30, verbose = FALSE)
  RunUMAP(obj, reduction = "pca", dims = 1:30, verbose = FALSE)
}

# Perform 5-fold cross-validation
for (i in seq_len(num_folds)) {
  # Current fold is the test set; all remaining folds form the training set
  test_indices <- fold_indices[[i]]
  train_indices <- unlist(fold_indices[-i])

  # Subset data and annotation based on indices. drop = FALSE keeps the
  # matrix / data frame shape even if a fold holds a single cell.
  data_train <- data[train_indices, , drop = FALSE]
  anno_train <- annotation[train_indices, , drop = FALSE]

  data_test <- data[test_indices, , drop = FALSE]
  anno_test <- annotation[test_indices, , drop = FALSE]

  # Training cells act as the reference, held-out cells as the query
  ref <- prepare_seurat(data_train)
  query <- prepare_seurat(data_test)

  ref$celltype <- anno_train$cell.type
  query$celltype <- anno_test$cell.type

  # Transfer the reference cell-type labels onto the query cells
  ref.anchors <- FindTransferAnchors(
    reference = ref, query = query, dims = 1:30,
    reference.reduction = "pca"
  )
  predictions <- TransferData(
    anchorset = ref.anchors, refdata = ref$celltype,
    dims = 1:30
  )
  query <- AddMetaData(query, metadata = predictions)

  # Keep only the predicted label of each query cell
  predictions <- query$predicted.id

  acc <- calculate_test_accuracy(anno_test$cell.type, predictions)

  f1_score <- calculate_f1_score_multiclass(anno_test$cell.type, predictions)

  cat("Fold", i, ": Train samples =", nrow(data_train), ", Test samples =", nrow(data_test), "Accuracy:", acc,"f1 score:",f1_score, "\n")
}


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 1001 anchors

Filtering anchors

	Retained 972 anchors

Finding integration vectors

Finding integration vector weights

Predicting cell labels



Fold 1 : Train samples = 1280 , Test samples = 320 Accuracy: 0.95625 f1 score: 0.8011125 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 1019 anchors

Filtering anchors

	Retained 957 anchors

Finding integration vectors

Finding integration vector weights

Predicting cell labels



Fold 2 : Train samples = 1280 , Test samples = 320 Accuracy: 0.934375 f1 score: 0.7570775 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 918 anchors

Filtering anchors

	Retained 901 anchors

Finding integration vectors

Finding integration vector weights

Predicting cell labels



Fold 3 : Train samples = 1280 , Test samples = 320 Accuracy: 0.96875 f1 score: 0.912208 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 1036 anchors

Filtering anchors

	Retained 996 anchors

Finding integration vectors

Finding integration vector weights

Predicting cell labels



Fold 4 : Train samples = 1280 , Test samples = 320 Accuracy: 0.9375 f1 score: 0.8747157 


"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
"Feature names cannot have underscores ('_'), replacing with dashes ('-')"
Projecting cell embeddings

Finding neighborhoods

Finding anchors

	Found 945 anchors

Filtering anchors

	Retained 911 anchors

Finding integration vectors

Finding integration vector weights

Predicting cell labels



Fold 5 : Train samples = 1280 , Test samples = 320 Accuracy: 0.95625 f1 score: 0.9232162 
