# Seurat integration - choosing more hvgs

I had to redo the integration for E8.5b to E9.5 since the first integration clustered Allantois and Neural crest on top of each other. By increasing the amount of highly variable genes from 2000 (default) to 3000, the integration yielded a much better result.

The following code is used to obtain the same representation as Qiu.
The code correponds to the code on github `ChengxiangQiu/tome_code/Section2_trajectory_Step1_connection.R` and 
`ChengxiangQiu/tome_code/help_code/help_code.R`
Qiu et al. used Seurat V3 in TOME, therefore Seurat V3 is used here as well.

#### seurat_object_XXX.rds can be downloaded from https://tome.gs.washington.edu/

## Import packages

In [1]:
library(Seurat)
library(future)
library(future.apply)

In [2]:
sessionInfo()

R version 4.1.3 (2022-03-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/icb/manuel.gander/miniconda3/envs/seurat4/lib/libopenblasp-r0.3.20.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] future.apply_1.9.0 future_1.26.1      Seurat_3.1.4      

loaded via a namespace (and not attached):
  [1] tsne_0.1-3.1        nlme_3.1-157        matrixStats_0.62.0 
  [4] RcppAnnoy_0.0.19    RColorBrewer_1.1-3  httr_1.4.3         
  [7] repr_1.1.4          numDeriv_2016.8-1.1 sctran

In [3]:
file_path='/home/mgander/moscot-framework_reproducibility/data/mouse_temporal/RDS'

work_path = "/home/mgander/moscot-framework_reproducibility/data/mouse_temporal/Seurat_Representations"

time_point = paste0("E", c(3.5, 4.5, 5.25, 5.5, 6.25, seq(6.5, 8.25, 0.25), "8.5a", "8.5b", seq(9.5, 13.5, 1)))

In [4]:
###############################################
### Function: doing clustering using Seurat ###
###############################################

doClusterSeurat <- function(obj, nfeatures = 2500, resolution = 1, k.filter = 200, savePath = NULL, correctCC = FALSE, n_dim = 30, min.dist = 0.75){
    
    if(length(table(obj$group))!=1){
        
        obj.list <- SplitObject(object = obj, split.by = "group")
        
        for (i in 1:length(x = obj.list)) {
            obj.list[[i]] <- NormalizeData(object = obj.list[[i]], verbose = FALSE)
            obj.list[[i]] <- FindVariableFeatures(object = obj.list[[i]], 
                                                  selection.method = "vst", nfeatures = nfeatures, verbose = FALSE)
        }
        
        reference.list <- obj.list[names(table(obj$group))]
        obj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:n_dim, k.filter = k.filter)
        obj.integrated <- IntegrateData(anchorset = obj.anchors, dims = 1:n_dim)
        
        # switch to integrated assay. The variable features of this assay are
        # automatically set during IntegrateData
        DefaultAssay(object = obj.integrated) <- "integrated"
        
        obj <- obj.integrated 
        
    } else {
        
        obj <- NormalizeData(obj, normalization.method = "LogNormalize", scale.factor = 10000)
        obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = nfeatures)
        
    }
    
    if(correctCC == TRUE){
        obj <- ScaleData(object = obj, vars.to.regress = c("S.Score", "G2M.Score"), features = rownames(obj), verbose = FALSE)
    } else {
        obj <- ScaleData(object = obj, verbose = FALSE)
    }
    
    obj <- RunPCA(object = obj, npcs = n_dim, verbose = FALSE)
    obj <- FindNeighbors(object = obj, dims = 1:n_dim)
    obj <- FindClusters(object = obj, resolution = resolution)
    obj <- RunUMAP(object = obj, reduction = "pca", dims = 1:n_dim, min.dist = min.dist)
    obj <- RunTSNE(object = obj, reduction = "pca", dims = 1:30)
    
    obj
}

# Calcualtion for time pair E8.5b to E9.5

This code is an exact copy of the code in https://github.com/ChengxiangQiu/tome_code, except for the amount of anchors used when filtering anchors. I had to adjust these due to error messages, likely due to the fact that I'm using different package versions then Qiu et al.

In [5]:
i=15
time_1=time_point[i]
print(time_1)
time_2=time_point[i+1]

[1] "E8.5b"


In [None]:
if (time_2=='E7')
    {time_2='E7.0'}
if (time_1=='E7')
    {time_1='E7.0'}

obj_1 = readRDS(paste0(file_path, "/seurat_object_", time_1, ".rds"))
obj_1$group = paste0(time_1, "_", obj_1$group)
obj_2 = readRDS(paste0(file_path, "/seurat_object_", time_2, ".rds"))
obj_2$group = paste0(time_2, "_", obj_2$group)
obj = merge(obj_1, obj_2)


if(!time_2 %in% paste0("E", c("8.5b", "9.5", "10.5", "11.5", "12.5", "13.5"))){

    ### These lines where not present in the TOME-code 
    ### (https://github.com/ChengxiangQiu/tome_code/blob/main/Section2_trajectory_Step1_connection.R)
    ### However, I encountered errors (if k.filter is bigger than the number of cells in a batch), that is
    ### why I had to use less cells as neighbors when filtering anchors
    if (i %in% c(1,  2, 3)){
        k.filter=50}
    else if (i %in% c(4,5,6)){
        k.filter=100}
    else {
        k.filter=200
    }

    obj.integrated = doClusterSeurat(obj, k.filter=k.filter)

} else {

    obj.list <- SplitObject(obj, split.by = "group")
    obj.list <- future_lapply(X = obj.list, FUN = function(x) {
        x <- NormalizeData(x, verbose = FALSE)
        x <- FindVariableFeatures(x, verbose = FALSE)
    })
    
    ### Neural crest integrated really badly for Qiu et al. parameters + my package versions.
    ### In order to get a good integration, more features are taken here (default is 2000)
    features <- SelectIntegrationFeatures(object.list = obj.list, nfeatures = 3000)
    
    obj.list <- future_lapply(X = obj.list, FUN = function(x) {
        x <- ScaleData(x, features = features, verbose = FALSE)
        x <- RunPCA(x, features = features, verbose = FALSE)
    })
    
    ### I had to add anchor.features = 3000 here so that it takes 3k features both times
    anchors <- FindIntegrationAnchors(object.list = obj.list, reduction = "rpca", 
                                      dims = 1:50, anchor.features = 3000)
    obj.integrated <- IntegrateData(anchorset = anchors, dims = 1:50)

    obj.integrated <- ScaleData(obj.integrated, verbose = FALSE)
    obj.integrated <- RunPCA(obj.integrated, npcs = 30, verbose = FALSE)
    obj.integrated <- RunUMAP(obj.integrated, dims = 1:30, n.components = 3, min.dist = 0.75)

}

# Save the 30-dim PCA space and the 2d-umap for later use in TOME and Moscot
umap = data.frame(Embeddings(object = obj.integrated, reduction = "umap"))
write.csv(x=umap, file=paste0(work_path, "/", time_1, "_", time_2, "_umap3_new.csv"))
saveRDS(umap, file=paste0(work_path, "/", time_1, "_", time_2, "_umap3_new.rds"))

pca = data.frame(Embeddings(object = obj.integrated, reduction = "pca"))
write.csv(x=pca, file=paste0(work_path, "/", time_1, "_", time_2, "_pca.csv"))
saveRDS(pca, file=paste0(work_path, "/", time_1, "_", time_2, "_pca.rds"))

“UNRELIABLE VALUE: One of the ‘future.apply’ iterations (‘future_lapply-1’) unexpectedly generated random numbers without declaring so. There is a risk that those random numbers are not statistically sound and the overall results might be invalid. To fix this, specify 'future.seed=TRUE'. This ensures that proper, parallel-safe random numbers are produced via the L'Ecuyer-CMRG method. To disable this check, use 'future.seed = NULL', or set option 'future.rng.onMisuse' to "ignore".”
Computing 3000 integration features

Scaling features for provided objects

Computing within dataset neighborhoods

