# 3 vs 3 aggregated (pseudobulk) samples for treated vs untreated DGE analysis via edgeR

In [1]:
library(Seurat)
library(edgeR)
library(parallel)

Attaching SeuratObject

Loading required package: limma



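We need to perform a 3 vs 3 pseudobulk DGE analysis: within each treatment group, the counts of the cells belonging to each of the three models are aggregated into one pseudobulk sample, and the three pseudobulk samples of each drug are then tested against the three untreated (DMSO) pseudobulk samples.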

In [2]:
# Load the clustered cell-hashing Seurat object saved as an RDS file
sc_data <- readRDS("HGSOC_CellHashing_CLUSTERED.RDS")

For each drug, we can create a matrix having the three models' aggregated expression as columns

In [3]:
# Distinct treatment groups, stored as plain character strings.
# If the metadata column is a factor, unique() would return a factor and later
# list lookups such as aggregate_expr[[x]] would index by integer level code
# instead of by name. `[[ ]]` also matches the column name exactly, whereas `$`
# allows partial matching.
drugs <- as.character(unique(sc_data@meta.data[["Treatment_group"]]))

In [4]:
# Display the distinct treatment groups found in the metadata
drugs

In [5]:
# Pseudobulk per treatment group: subset the cells of one treatment group and
# aggregate their raw RNA counts by `model`, keeping the RNA assay matrix.
# The result is a list with one aggregated matrix per entry of `drugs`, in the
# same order as `drugs`.
aggregate_expr <- mclapply(drugs, function(d){
    cells_d <- subset(sc_data, subset = Treatment_group == d)
    # The slot name is spelled out in full ("counts") so it does not depend on
    # partial argument matching inside Seurat
    AggregateExpression(object = cells_d,
                        slot = "counts",
                        assays = "RNA",
                        group.by = "model",
                        verbose = FALSE)$RNA
}, mc.cores = 46)

# mclapply() only warns when a worker fails and returns a "try-error" object
# (or NULL) in that position, so stop here rather than carrying a broken
# element into the differential expression step
aggregation_failed <- vapply(aggregate_expr,
                             function(res) is.null(res) || inherits(res, "try-error"),
                             logical(1))
if (any(aggregation_failed)) {
    stop("Pseudobulk aggregation failed for: ",
         paste(drugs[aggregation_failed], collapse = ", "),
         call. = FALSE)
}

In [6]:
# Label each aggregated matrix with the treatment group it was computed from
aggregate_expr <- setNames(aggregate_expr, drugs)

Now we need to perform differential expression analysis between each drug and untreated (DMSO) cells

In [7]:
# Pseudobulk counts of the untreated (DMSO) cells, used as the control group.
# `[[ ]]` matches the list name exactly (`$` would partial-match), and the guard
# avoids silently continuing with a NULL control when DMSO is missing.
stopifnot("DMSO" %in% names(aggregate_expr))
control_dataset <- aggregate_expr[["DMSO"]]

In [8]:
# Differential expression of one drug's pseudobulk samples against the control
# pseudobulk samples, using the edgeR quasi-likelihood pipeline.
#
# Arguments:
#   drug_dataset    genes x samples count matrix of the treated group.
#   control         genes x samples count matrix of the control group. Defaults
#                   to the global `control_dataset`, so existing one-argument
#                   calls keep working. Must have the same row names, in the
#                   same order, as `drug_dataset`.
#   min_avg_logcpm  genes whose aveLogCPM() value is not above this threshold
#                   are removed before normalization (default 1).
#
# Returns:
#   The table from topTags(n = Inf), i.e. every gene that passed filtering.
#   Positive logFC = higher in treatment.
edgeR_diffExpress_PBRs <- function(drug_dataset,
                                   control = control_dataset,
                                   min_avg_logcpm = 1){

    # cbind() joins the matrices by position, not by row name, so the genes
    # must already be aligned
    stopifnot(identical(rownames(control), rownames(drug_dataset)))

    # Binding control and drug dataset (control columns first)
    dataset <- cbind(control, drug_dataset)

    # Groups identification, sized from the inputs instead of assuming 3 vs 3
    groups <- rep(c("Control", "Treatment"),
                  times = c(ncol(control), ncol(drug_dataset)))
    groups <- factor(groups, levels = c("Control", "Treatment"))

    # Both inputs may carry the same column names; prefixing with the group
    # keeps the samples distinguishable in the DGEList
    colnames(dataset) <- make.unique(paste(groups, colnames(dataset), sep = "_"))

    # Creating a DGE list object (genes with zero counts in all samples are dropped)
    dge_obj <- DGEList(counts = dataset, group = groups, remove.zeros = TRUE)

    # Filtering on average log CPM
    avglogCPM <- aveLogCPM(dge_obj)
    keep <- avglogCPM > min_avg_logcpm
    dge_obj_filtered <- dge_obj[keep, ]

    # Normalization
    dge_obj_filtered <- calcNormFactors(dge_obj_filtered)

    # Preparing the design matrix (one column per group, no intercept)
    # and estimating the NB dispersions
    design <- model.matrix(~0 + groups)
    colnames(design) <- levels(groups)
    dge_obj_filtered <- estimateDisp(dge_obj_filtered, design = design, robust = TRUE)

    # Fitting the model
    fit <- glmQLFit(dge_obj_filtered, design = design)

    # Running the test, specifying the comparison
    # Positive logFC = higher in treatment
    contrast <- makeContrasts(DrugVsDMSO = Treatment - Control, levels = design)
    qlf <- glmQLFTest(fit, contrast = contrast)

    # Return results for all genes, BH p-value adjustment (topTags default)
    tt <- topTags(qlf, n = Inf)
    return(tt$table)
}

In [9]:
# Every treatment group except the DMSO control, as character strings so that
# aggregate_expr[[x]] looks elements up by name.
# `%in%` is used instead of drugs[-which(drugs == "DMSO")], which returns an
# empty vector when "DMSO" is absent.
treated_drugs <- as.character(drugs[!(drugs %in% "DMSO")])

# Run the edgeR comparison against DMSO for each treated group
dge_result_3vs3_pseudobulk <- mclapply(X = treated_drugs,
                                       FUN = function(x) edgeR_diffExpress_PBRs(aggregate_expr[[x]]),
                                       mc.cores = 4)
names(dge_result_3vs3_pseudobulk) <- treated_drugs

# mclapply() only warns when a worker fails and returns a "try-error" object
# (or NULL) in that position, so check explicitly
dge_failed <- vapply(dge_result_3vs3_pseudobulk,
                     function(res) is.null(res) || inherits(res, "try-error"),
                     logical(1))
if (any(dge_failed)) {
    stop("edgeR analysis failed for: ",
         paste(treated_drugs[dge_failed], collapse = ", "),
         call. = FALSE)
}

Saving results

In [10]:
# Add two columns to every result table: the comparison label
# ("<drug> vs. DMSO") and the gene name taken from the row names.
# The drug names come from the list's own names, which avoids recomputing
# drugs[-which(drugs == "DMSO")] (empty when "DMSO" is absent). A plain Map()
# is used because this step is too cheap to be worth forking workers.
# Map() keeps the names of its first list argument, so the result stays named
# by drug.
dge_result_3vs3_pseudobulk <- Map(function(result_table, drug_name){
    # rep() keeps the assignment valid even for a table with zero rows
    result_table$drug <- rep(paste0(drug_name, " vs. DMSO"), nrow(result_table))
    result_table$gene <- rownames(result_table)
    return(result_table)
}, dge_result_3vs3_pseudobulk, names(dge_result_3vs3_pseudobulk))

In [11]:
# Stack the per-drug result tables row-wise into a single table; the `drug` and
# `gene` columns identify the comparison and gene of every row
dge_result_3vs3_pseudobulk_rbound <- do.call(rbind, dge_result_3vs3_pseudobulk)

In [12]:
# Write the complete results table as a tab-separated file
write.table(x = dge_result_3vs3_pseudobulk_rbound,
            file = "3vs3_pseudobulk_aggregate_edgeR_DGE.txt",
            sep = "\t", quote = FALSE)

# Write only the rows with FDR < 0.01.
# which() drops rows whose FDR is NA; a bare logical index would emit them as
# all-NA rows.
significant_rows <- which(dge_result_3vs3_pseudobulk_rbound[["FDR"]] < 0.01)
write.table(x = dge_result_3vs3_pseudobulk_rbound[significant_rows, ],
            file = "3vs3_pseudobulk_aggregate_edgeR_DGE_FDR_0.01.txt",
            sep = "\t", quote = FALSE)

In [13]:
# Record the R version and package versions used for this analysis
sessionInfo()

R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Rocky Linux 8.8 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /homedir01/adini22/.conda/envs/cellhashing_analyses/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] edgeR_3.40.2       limma_3.54.2       SeuratObject_4.1.3 Seurat_4.3.0.9002 

loaded via a namespace (and not attached):
  [1] Rtsne_0.16             colorspace_2.1-0       deldir_1.0-6          
  [4] ellipsis_0.3.2         ggridges_0.5.4         IRdisplay