# Subsampling of each treatment group into replicates of 10 cells before differential gene expression analysis

- We make as many groups of 10 cells as possible (https://doi.org/10.1038/s41467-020-19894-4), but discard the samples that have 14 cells or fewer in total (the threshold enforced in `split_subsample_pseudobulk`)
- We obtain for each of these groups simple pseudobulk profiles

The number of samples for each treatment group will depend on the number of cells retrieved for each of them.

In [1]:
library(Seurat)
library(readxl)
library(parallel)

Attaching SeuratObject



In [2]:
# Load the clustered cell-hashing Seurat object from disk
hgsoc <- readRDS(file = "HGSOC_CellHashing_CLUSTERED.RDS")

In [3]:
# Split the full object into one Seurat object per model,
# using the `model` metadata column
PDC2 <- subset(x = hgsoc, subset = model == "PDC2")
PDC3 <- subset(x = hgsoc, subset = model == "PDC3")
JHOS2 <- subset(x = hgsoc, subset = model == "JHOS2")

In [4]:
# Make the treatment group the active identity of PDC2,
# then show the number of cells per group in ascending order
Idents(PDC2) <- "Treatment_group"
Idents(PDC2) |>
  table() |>
  sort()


  ENMD-2076      AT 101 Quisinostat  Danusertib  BMS-754807       AR-42 
        118         122         133         136         147         150 
  Dasatinib   LY3009120      AT9283   Selinexor    I-BET151  Belinostat 
        159         172         182         184         185         191 
 Pacritinib PF-03758309  Dinaciclib      OTS167  Pictilisib  Volasertib 
        196         205         208         210         216         221 
 Fedratinib    AZD-8186  Buparlisib Pracinostat     SNS-032   SCH772984 
        224         227         228         228         233         236 
    BI 2536 Gedatolisib   Milciclib         JQ1 PF-00477736  Copanlisib 
        237         239         241         246         250         250 
 NVP-BGT226 Talazoparib     AZD8055   PD0325901     TGX-221  Birinapant 
        253         261         264         265         269         269 
 Birabresib  NVP-LCL161 Ralimetinib     ZSTK474  Dactolisib      UCN-01 
        273         275         284         309   

In [5]:
# Make the treatment group the active identity of PDC3,
# then show the number of cells per group in ascending order
Idents(PDC3) <- "Treatment_group"
Idents(PDC3) |>
  table() |>
  sort()


     AT 101   ENMD-2076  Buparlisib  Fedratinib   Selinexor   Milciclib 
         29          91         161         163         164         166 
 Pacritinib PF-03758309  Volasertib     BI 2536  NVP-BGT226    AZD-8186 
        168         176         194         198         201         202 
    SNS-032   Niraparib PF-00477736      OTS167  Danusertib  Pictilisib 
        213         213         214         215         215         216 
 Dinaciclib  Belinostat  BMS-754807 Pracinostat   LY3009120     TGX-221 
        216         226         230         237         245         247 
 Copanlisib Cobimetinib      AT9283 Ipatasertib       AR-42  Dactolisib 
        251         252         253         254         255         256 
  SCH772984   PD0325901 Quisinostat Ralimetinib Talazoparib    I-BET151 
        257         275         283         293         297         301 
    AZD8055   Dasatinib         JQ1  Birabresib  NVP-LCL161     ZSTK474 
        303         304         315         316   

In [6]:
# Make the treatment group the active identity of JHOS2,
# then show the number of cells per group in ascending order
Idents(JHOS2) <- "Treatment_group"
Idents(JHOS2) |>
  table() |>
  sort()


     AT 101    AZD-8186       AR-42   LY3009120         JQ1  Birinapant 
         61         166         174         174         217         222 
PF-03758309  Pacritinib  Danusertib Talazoparib PF-00477736  Volasertib 
        242         250         253         255         256         258 
 NVP-LCL161  Fedratinib    I-BET151  Dinaciclib   ENMD-2076 Pracinostat 
        260         268         269         270         274         275 
     AT9283  Buparlisib     BI 2536     TGX-221  BMS-754807  Copanlisib 
        275         276         277         278         281         281 
 Belinostat   SCH772984      UCN-01  Pictilisib Ipatasertib      OTS167 
        282         288         289         291         293         294 
  Milciclib Quisinostat  Birabresib   PD0325901   Niraparib     ZSTK474 
        294         295         296         296         298         311 
Cobimetinib   Selinexor     SNS-032   Dasatinib Ralimetinib Gedatolisib 
        316         320         320         325   

None of these treatment groups has 14 cells or fewer, thankfully.

In [7]:
# Round to the nearest integer, sending exact halves upwards.
# Values whose fractional part is exactly 0.5 go through ceiling();
# every other value goes through round().
custom_round <- function(x) {
  fractional_part <- x - floor(x)
  rounded_up <- ceiling(x)
  nearest <- round(x, digits = 0)
  ifelse(fractional_part == 0.5, rounded_up, nearest)
}

In [8]:
# Build pseudobulk count profiles from random replicates of (up to) 10 cells
# taken from one treatment group.
#
# Arguments:
#   seurat_object  Seurat object to draw cells from. It is filtered on its
#                  `Treatment_group` metadata column and its "RNA" assay is
#                  aggregated.
#   group          Value of `Treatment_group` selecting the cells to process.
#   modelname      Label used as prefix of the output column names.
#
# Returns:
#   The `$RNA` results of AggregateExpression() for every replicate, bound
#   column-wise, with columns named "<modelname>_<group>_<replicate index>".
#
# Errors:
#   Stops with "Input vector is too small." when the treatment group has
#   14 cells or fewer.
#
# Note: set.seed(1) is called inside the function, so the RNG state of the
# R process running it is reset on every call.
split_subsample_pseudobulk <- function(seurat_object, group, modelname) {

  # Keep only the cells of the requested treatment group
  seurat_object <- subset(seurat_object, subset = Treatment_group == group)

  # Retrieving cell names
  input_vector <- Cells(seurat_object)
  n_cells <- length(input_vector)

  # Check if the input vector is empty or too small.
  # This is because we want at least 2 replicates where one is of 10 cells:
  # 15 cells is the smallest size that yields two replicates (10 + 5).
  if (n_cells <= 14) {
    stop("Input vector is too small.")
  }

  # Calculate the number of replicates needed.
  # If the fractional part of n_cells / 10 is >= 0.5, one extra (smaller)
  # replicate is made (5 cells is the minimum for a sample); otherwise the
  # leftover cells (fewer than 5) are not used.
  num_vectors <- custom_round(n_cells / 10)

  # Shuffle the input vector randomly.
  # In this way, we take sequentially 10 cells every time to get as many
  # pseudobulk replicates as the samples that we expect.
  set.seed(1) # Setting seed for reproducibility
  shuffled_vector <- sample(input_vector)

  # Split the shuffled vector into consecutive groups of 10. The last group is
  # capped at the end of the vector, so it may hold between 5 and 10 cells.
  # This is plain index arithmetic, so it is done without forking.
  replicate_cells <- lapply(seq_len(num_vectors), function(i) {
    start_index <- (i - 1) * 10 + 1
    end_index <- min(i * 10, n_cells)
    shuffled_vector[start_index:end_index]
  })

  # Column name of each replicate in the final matrix
  replicate_names <- paste0(modelname, "_", group, "_", seq_len(num_vectors))

  # For every replicate: extract its cells and aggregate them into a
  # pseudobulk profile, all within the same worker. Doing both steps in one
  # pass avoids sending the intermediate Seurat subsets back to the parent
  # process only to fork again for the aggregation.
  pseudobulk <- mclapply(seq_len(num_vectors), function(i) {
    replicate_object <- subset(seurat_object, cells = replicate_cells[[i]])
    # "counts" is the documented slot name (the previous "count" only worked
    # through partial matching)
    profile <- AggregateExpression(object = replicate_object,
                                   slot = "counts",
                                   assays = "RNA",
                                   group.by = "Treatment_group",
                                   verbose = FALSE)$RNA
    colnames(profile) <- replicate_names[i]
    profile
  })

  # Forming a unique matrix
  do.call(cbind, pseudobulk)
}

In [9]:
# Confirm that the three models share exactly the same treatment groups.
# identical() returns a single TRUE/FALSE per comparison and also reports
# FALSE when the level sets differ in length, where an elementwise `==`
# would recycle the shorter vector.
identical(sort(levels(Idents(JHOS2))), sort(levels(Idents(PDC2))))
identical(sort(levels(Idents(PDC3))), sort(levels(Idents(PDC2))))

In [10]:
# Sorted treatment groups, taken from JHOS2
# (picking one of the models, they are all the same)
all_drugs <- Idents(JHOS2) |>
  levels() |>
  sort()
all_drugs

In [11]:
# Pseudobulk replicates for every treatment group of JHOS2, one worker per drug
JHOS2_subsampled <- mclapply(
  X = all_drugs,
  FUN = function(drug) {
    split_subsample_pseudobulk(seurat_object = JHOS2,
                               group = drug,
                               modelname = "JHOS2")
  },
  mc.cores = 46
)

# mclapply() only warns when a worker fails and stores a "try-error" object in
# its place; stop here so that a failed group is never carried downstream.
stopifnot(
  "split_subsample_pseudobulk() failed for at least one JHOS2 treatment group" =
    !vapply(JHOS2_subsampled, inherits, logical(1), what = "try-error")
)

In [12]:
# Pseudobulk replicates for every treatment group of PDC2, one worker per drug
PDC2_subsampled <- mclapply(
  X = all_drugs,
  FUN = function(drug) {
    split_subsample_pseudobulk(seurat_object = PDC2,
                               group = drug,
                               modelname = "PDC2")
  },
  mc.cores = 46
)

# mclapply() only warns when a worker fails and stores a "try-error" object in
# its place; stop here so that a failed group is never carried downstream.
stopifnot(
  "split_subsample_pseudobulk() failed for at least one PDC2 treatment group" =
    !vapply(PDC2_subsampled, inherits, logical(1), what = "try-error")
)

In [13]:
# Pseudobulk replicates for every treatment group of PDC3, one worker per drug
PDC3_subsampled <- mclapply(
  X = all_drugs,
  FUN = function(drug) {
    split_subsample_pseudobulk(seurat_object = PDC3,
                               group = drug,
                               modelname = "PDC3")
  },
  mc.cores = 46
)

# mclapply() only warns when a worker fails and stores a "try-error" object in
# its place; stop here so that a failed group is never carried downstream.
stopifnot(
  "split_subsample_pseudobulk() failed for at least one PDC3 treatment group" =
    !vapply(PDC3_subsampled, inherits, logical(1), what = "try-error")
)

In [14]:
# Label each element of the per-model result lists with its treatment group
names(PDC3_subsampled) <- all_drugs
names(PDC2_subsampled) <- all_drugs
names(JHOS2_subsampled) <- all_drugs

In [15]:
# Write the pseudobulk results of each model to its own RDS file
saveRDS(JHOS2_subsampled, file = "JHOS2_pseudobulk_subsampled.RDS")
saveRDS(PDC2_subsampled, file = "PDC2_pseudobulk_subsampled.RDS")
saveRDS(PDC3_subsampled, file = "PDC3_pseudobulk_subsampled.RDS")

In [16]:
# Print the R version, platform and loaded package versions for this session
sessionInfo()

R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Rocky Linux 8.8 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /homedir01/adini22/.conda/envs/cellhashing_analyses/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] readxl_1.4.1       SeuratObject_4.1.3 Seurat_4.3.0.9002 

loaded via a namespace (and not attached):
  [1] Rtsne_0.16             colorspace_2.1-0       deldir_1.0-6          
  [4] ellipsis_0.3.2         ggridges_0.5.4         IRdisplay_1.1         
  [7]