# Differential expression analysis between each treatment group and DMSO, using pseudobulk replicates subsampled without replacement

In [1]:
library(edgeR)
library(parallel)

Loading required package: limma



Loading data: each of these objects is a previously produced list whose elements are matrices of pseudobulk replicates obtained via subsampling without replacement (one matrix per treatment, including DMSO).

In [2]:
# One list of pseudobulk replicate matrices per model (JHOS2, PDC2, PDC3),
# read from the RDS files expected in the working directory
JHOS2_pseudo <- readRDS(file = "JHOS2_pseudobulk_subsampled_REVIEW.RDS")
PDC2_pseudo  <- readRDS(file = "PDC2_pseudobulk_subsampled_REVIEW.RDS")
PDC3_pseudo  <- readRDS(file = "PDC3_pseudobulk_subsampled_REVIEW.RDS")

In [3]:
# Number of pseudobulk replicates (columns) per treatment in each model.
# vapply() guarantees an integer vector and errors out if any list element
# is not a matrix-like object (ncol() would return NULL for it).
vapply(JHOS2_pseudo, ncol, integer(1))
vapply(PDC2_pseudo, ncol, integer(1))
vapply(PDC3_pseudo, ncol, integer(1))

Now we can iterate over them. The first element is always DMSO (the control), so we store it separately. It is also the largest one.

In [4]:
# Extract the DMSO (control) replicate matrix of each model.
# `[[` is used instead of `$` because `$` partial-matches list names and could
# silently pick a different element if an exact "DMSO" entry were missing.
JHOS2_pseudo_DMSO <- JHOS2_pseudo[["DMSO"]]
PDC2_pseudo_DMSO <- PDC2_pseudo[["DMSO"]]
PDC3_pseudo_DMSO <- PDC3_pseudo[["DMSO"]]

Function for differential expression analysis

In [5]:
#' Treatment-vs-control differential expression on pseudobulk replicates
#'
#' Binds the control and treatment count matrices column-wise, drops lowly
#' expressed genes (average log-CPM <= 1) and runs the edgeR quasi-likelihood
#' pipeline: calcNormFactors -> estimateDisp (robust) -> glmQLFit -> glmQLFTest.
#'
#' @param control_dataset Count matrix of the control (DMSO) replicates, one
#'   column per replicate.
#' @param drug_dataset Count matrix of the treated replicates, one column per
#'   replicate, with the same rows (in the same order) as `control_dataset`.
#' @param drug Label of the treatment. Only used to make error messages
#'   informative; it does not affect the analysis and may be omitted.
#' @return The `table` element returned by `topTags(qlf, n = Inf)` for the
#'   contrast Treatment - Control (positive logFC = higher in treatment).
edgeR_diffExpress_PBRs <- function(control_dataset, drug_dataset, drug){
    
    drug_label <- if (missing(drug)) "<unnamed>" else as.character(drug)[1]
    
    # Fail early with a clear message: cbind() would silently accept a NULL
    # treatment matrix (e.g. a drug missing from the input list) and would
    # also bind matrices whose rows refer to different genes.
    if (is.null(dim(control_dataset)) || is.null(dim(drug_dataset))) {
        stop("control_dataset and drug_dataset must both be count matrices (treatment: ",
             drug_label, ")", call. = FALSE)
    }
    if (ncol(control_dataset) < 1 || ncol(drug_dataset) < 1) {
        stop("control_dataset and drug_dataset need at least one replicate each (treatment: ",
             drug_label, ")", call. = FALSE)
    }
    control_genes <- rownames(control_dataset)
    drug_genes <- rownames(drug_dataset)
    if (!is.null(control_genes) && !is.null(drug_genes) &&
        !identical(control_genes, drug_genes)) {
        stop("Row names of control_dataset and drug_dataset do not match (treatment: ",
             drug_label, ")", call. = FALSE)
    }
    
    # Binding control and drug dataset
    dataset <- cbind(control_dataset, drug_dataset) 
    
    # Groups identification: the group of every column is known from the matrix
    # it came from, so it does not depend on the column naming scheme.
    # "Control"/"Treatment" are used instead of the drug name to keep the
    # function generalizable; "Control" is the reference level.
    groups <- factor(rep(c("Control", "Treatment"),
                         times = c(ncol(control_dataset), ncol(drug_dataset))),
                     levels = c("Control", "Treatment"))
    
    # Creating a DGE list object (genes with zero counts in all samples are dropped)
    dge_obj <- DGEList(counts = dataset, group = groups, remove.zeros = TRUE) 
                     
    # Filtering on average log CPM; the threshold of 1 was chosen upstream
    # (the original note refers to previously inspected histograms).
    # NOTE(review): library sizes are not recomputed after filtering
    # (keep.lib.sizes defaults to TRUE when subsetting a DGEList) - confirm intended.
    avglogCPM <- aveLogCPM(dge_obj)          
    keep <- (avglogCPM > 1)
    dge_obj_filtered <- dge_obj[keep, ]
                     
    # Normalization                 
    dge_obj_filtered <- calcNormFactors(dge_obj_filtered)
                                 
    # Preparing the design matrix (one column per group, no intercept)
    # and estimating the NB dispersions               
    design <- model.matrix(~0 + groups)
    colnames(design) <- levels(groups)
    dge_obj_filtered <- estimateDisp(dge_obj_filtered, design = design, robust = TRUE)
    
    # Fitting the model
    fit <- glmQLFit(dge_obj_filtered, design = design)
                     
    # Running the test, specifying the comparison
    # Positive logFC = higher in treatment
    qlf <- glmQLFTest(fit, contrast = makeContrasts(DrugVsDMSO = Treatment - Control, levels = design)) 
                     
    # Return results for all tested genes, BH p-value adjustment (topTags default)
    tt <- topTags(qlf, n = Inf)
    tt$table
}

Differential expression analysis. We use at most 4 cores, because RAM usage grows quickly and other analyses are running in parallel on the same cluster.

In [6]:
# Every treatment except the DMSO control.
# Logical subsetting is used instead of x[-which(...)]: if "DMSO" were absent,
# -which(...) would be -integer(0) and select nothing, dropping all drugs.
drug_names <- names(JHOS2_pseudo)[names(JHOS2_pseudo) != "DMSO"]
drug_names

In [7]:
# Treatment-vs-DMSO differential expression for every drug in JHOS2,
# at most 4 forked workers at a time
JHOS2_diffexpress <- mclapply(X = drug_names, 
                        FUN = function(x) edgeR_diffExpress_PBRs(JHOS2_pseudo_DMSO, JHOS2_pseudo[[x]], x), 
                        mc.cores = 4)
names(JHOS2_diffexpress) <- drug_names

# mclapply() only warns when a worker fails: the corresponding element is a
# "try-error" object (or NULL if the worker was killed, e.g. out of memory).
# Check before saving so a broken result is never written to disk.
JHOS2_failed <- vapply(JHOS2_diffexpress,
                       function(res) is.null(res) || inherits(res, "try-error"),
                       logical(1))
if (any(JHOS2_failed)) {
    stop("Differential expression failed for JHOS2 treatment(s): ",
         paste(drug_names[JHOS2_failed], collapse = ", "), call. = FALSE)
}
saveRDS(object = JHOS2_diffexpress, file = "JHOS2_PBRs_diffexpress_subsampled.RDS")

In [8]:
# drug_names was derived from another list, so make sure every drug is
# actually present here (a missing one would be passed on as NULL).
PDC2_missing <- setdiff(drug_names, names(PDC2_pseudo))
if (length(PDC2_missing) > 0) {
    stop("Treatment(s) not found in PDC2_pseudo: ",
         paste(PDC2_missing, collapse = ", "), call. = FALSE)
}

# Treatment-vs-DMSO differential expression for every drug in PDC2,
# at most 4 forked workers at a time
PDC2_diffexpress <- mclapply(X = drug_names,
                        FUN = function(x) edgeR_diffExpress_PBRs(PDC2_pseudo_DMSO, PDC2_pseudo[[x]], x), 
                        mc.cores = 4)
names(PDC2_diffexpress) <- drug_names

# mclapply() only warns when a worker fails: the corresponding element is a
# "try-error" object (or NULL if the worker was killed, e.g. out of memory).
# Check before saving so a broken result is never written to disk.
PDC2_failed <- vapply(PDC2_diffexpress,
                      function(res) is.null(res) || inherits(res, "try-error"),
                      logical(1))
if (any(PDC2_failed)) {
    stop("Differential expression failed for PDC2 treatment(s): ",
         paste(drug_names[PDC2_failed], collapse = ", "), call. = FALSE)
}
saveRDS(object = PDC2_diffexpress, file = "PDC2_PBRs_diffexpress_subsampled.RDS")

In [9]:
# drug_names was derived from another list, so make sure every drug is
# actually present here (a missing one would be passed on as NULL).
PDC3_missing <- setdiff(drug_names, names(PDC3_pseudo))
if (length(PDC3_missing) > 0) {
    stop("Treatment(s) not found in PDC3_pseudo: ",
         paste(PDC3_missing, collapse = ", "), call. = FALSE)
}

# Treatment-vs-DMSO differential expression for every drug in PDC3,
# at most 4 forked workers at a time
PDC3_diffexpress <- mclapply(X = drug_names,
                        FUN = function(x) edgeR_diffExpress_PBRs(PDC3_pseudo_DMSO, PDC3_pseudo[[x]], x), 
                        mc.cores = 4)
names(PDC3_diffexpress) <- drug_names

# mclapply() only warns when a worker fails: the corresponding element is a
# "try-error" object (or NULL if the worker was killed, e.g. out of memory).
# Check before saving so a broken result is never written to disk.
PDC3_failed <- vapply(PDC3_diffexpress,
                      function(res) is.null(res) || inherits(res, "try-error"),
                      logical(1))
if (any(PDC3_failed)) {
    stop("Differential expression failed for PDC3 treatment(s): ",
         paste(drug_names[PDC3_failed], collapse = ", "), call. = FALSE)
}
saveRDS(object = PDC3_diffexpress, file = "PDC3_PBRs_diffexpress_subsampled.RDS")

In [10]:
# Print the R version, platform and attached/loaded package versions used for this run
sessionInfo()

R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Rocky Linux 8.8 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /homedir01/adini22/.conda/envs/cellhashing_analyses/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] edgeR_3.40.2 limma_3.54.2

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.10     uuid_1.1-0      lattice_0.20-45 rlang_1.1.0    
 [5] fastmap_1.1.1   fansi_1.0.4     tools_4.2.2     grid_4.2.2     
 [9] utf8_1.2.3      cli_3.6.0       htmltools_0