# Performing differential expression analysis between each treatment group and DMSO using pseudobulk replicates

In [1]:
suppressWarnings({suppressPackageStartupMessages({
    library(edgeR)
    library(parallel)
})})    

Loading data: each of these objects is a previously generated list whose elements are matrices of pseudobulk replicates obtained via bootstrapping.

In [2]:
# One list of pseudobulk replicate matrices per sample, read from the working directory
JHOS2_pseudo <- readRDS(file = "JHOS2_pseudobulkreplicates.RDS")
PDC3_pseudo  <- readRDS(file = "PDC3_pseudobulkreplicates.RDS")
PDC2_pseudo  <- readRDS(file = "PDC2_pseudobulkreplicates.RDS")

Setting the drug names as the element names of each list to ease iteration and downstream analyses.

In [3]:
# Drug label of one replicate matrix: the second "_"-separated token of its
# first column name (e.g. "JHOS2_DMSO_1_1" -> "DMSO"). Only the first column
# name is split, since only that one is used.
first_drug_token <- function(pseudo_mat) {
    strsplit(colnames(pseudo_mat)[1], split = "_", fixed = TRUE)[[1]][2]
}

# Name every list from its OWN matrices, so that a different element order in
# one of the samples cannot silently attach the wrong drug name to a matrix.
names(JHOS2_pseudo) <- vapply(JHOS2_pseudo, first_drug_token, character(1), USE.NAMES = FALSE)
names(PDC3_pseudo)  <- vapply(PDC3_pseudo, first_drug_token, character(1), USE.NAMES = FALSE)
names(PDC2_pseudo)  <- vapply(PDC2_pseudo, first_drug_token, character(1), USE.NAMES = FALSE)

# Reference vector used for iteration downstream (kept in JHOS2 order)
drug_names <- names(JHOS2_pseudo)

# The three samples are iterated with the same drug names, so they must hold
# exactly the same set of uniquely named conditions.
stopifnot(
    !anyNA(drug_names),
    anyDuplicated(drug_names) == 0L,
    length(PDC3_pseudo) == length(drug_names),
    length(PDC2_pseudo) == length(drug_names),
    setequal(names(PDC3_pseudo), drug_names),
    setequal(names(PDC2_pseudo), drug_names)
)

In [4]:
# First six rows of the JHOS2 DMSO pseudobulk matrix
head(x = JHOS2_pseudo[["DMSO"]], n = 6L)

Unnamed: 0,JHOS2_DMSO_1_1,JHOS2_DMSO_1_2,JHOS2_DMSO_1_3,JHOS2_DMSO_1_4,JHOS2_DMSO_1_5,JHOS2_DMSO_1_6,JHOS2_DMSO_1_7,JHOS2_DMSO_1_8,JHOS2_DMSO_1_9,JHOS2_DMSO_1_10,⋯,JHOS2_DMSO_6_991,JHOS2_DMSO_6_992,JHOS2_DMSO_6_993,JHOS2_DMSO_6_994,JHOS2_DMSO_6_995,JHOS2_DMSO_6_996,JHOS2_DMSO_6_997,JHOS2_DMSO_6_998,JHOS2_DMSO_6_999,JHOS2_DMSO_6_1000
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL627309.1,0,1,2,3,0,1,1,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL627309.5,2,1,1,2,1,1,1,0,1,1,⋯,3,3,1,3,2,3,4,3,1,2
AP006222.2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL669831.2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
LINC01409,33,29,28,30,19,30,34,19,29,30,⋯,20,22,20,21,15,17,14,16,12,15


In [5]:
# First six rows of the PDC3 DMSO pseudobulk matrix
# NOTE(review): the printed column names carry a "PDC1_" prefix although the
# object is called PDC3 - confirm the sample labelling is intentional.
head(x = PDC3_pseudo[["DMSO"]], n = 6L)

Unnamed: 0,PDC1_DMSO_1_1,PDC1_DMSO_1_2,PDC1_DMSO_1_3,PDC1_DMSO_1_4,PDC1_DMSO_1_5,PDC1_DMSO_1_6,PDC1_DMSO_1_7,PDC1_DMSO_1_8,PDC1_DMSO_1_9,PDC1_DMSO_1_10,⋯,PDC1_DMSO_6_991,PDC1_DMSO_6_992,PDC1_DMSO_6_993,PDC1_DMSO_6_994,PDC1_DMSO_6_995,PDC1_DMSO_6_996,PDC1_DMSO_6_997,PDC1_DMSO_6_998,PDC1_DMSO_6_999,PDC1_DMSO_6_1000
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL627309.1,6,8,4,10,7,9,11,6,5,8,⋯,7,4,4,2,3,4,3,3,3,4
AL627309.5,4,2,3,1,2,7,1,1,4,3,⋯,2,11,10,6,7,4,5,9,10,10
AP006222.2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL669831.2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
LINC01409,25,28,42,42,27,32,37,42,22,28,⋯,27,26,22,31,19,29,22,18,20,15


In [6]:
# First six rows of the PDC2 DMSO pseudobulk matrix
head(x = PDC2_pseudo[["DMSO"]], n = 6L)

Unnamed: 0,PDC2_DMSO_1_1,PDC2_DMSO_1_2,PDC2_DMSO_1_3,PDC2_DMSO_1_4,PDC2_DMSO_1_5,PDC2_DMSO_1_6,PDC2_DMSO_1_7,PDC2_DMSO_1_8,PDC2_DMSO_1_9,PDC2_DMSO_1_10,⋯,PDC2_DMSO_6_991,PDC2_DMSO_6_992,PDC2_DMSO_6_993,PDC2_DMSO_6_994,PDC2_DMSO_6_995,PDC2_DMSO_6_996,PDC2_DMSO_6_997,PDC2_DMSO_6_998,PDC2_DMSO_6_999,PDC2_DMSO_6_1000
MIR1302-2HG,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AL627309.1,1,2,2,0,2,0,4,0,1,3,⋯,0,0,0,0,0,0,0,0,0,0
AL627309.5,0,0,0,0,0,0,0,0,0,0,⋯,1,1,1,1,2,0,2,1,1,0
AP006222.2,7,7,6,4,4,3,4,7,3,9,⋯,0,0,0,0,0,0,0,0,0,0
AL669831.2,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
LINC01409,41,31,47,48,54,41,37,44,47,42,⋯,23,35,31,30,31,32,35,26,30,37


In [7]:
# Number of entries in drug_names (this count includes the DMSO control)
length(drug_names)

Now we can iterate over them. The first element is always DMSO, so we store it separately. It is also the largest one.

In [8]:
# Keep the DMSO (control) matrix of each sample in its own variable so it can
# be reused as the reference in every drug-vs-control comparison
JHOS2_pseudo_DMSO <- JHOS2_pseudo[["DMSO"]]
PDC3_pseudo_DMSO  <- PDC3_pseudo[["DMSO"]]
PDC2_pseudo_DMSO  <- PDC2_pseudo[["DMSO"]]

We devise a function for differential expression analysis.

In [9]:
#' Differential expression of one drug versus DMSO on pseudobulk replicates.
#'
#' Column-binds the control and drug count matrices, labels every column as
#' "Control" or "Treatment" from the second "_"-separated token of its name
#' ("DMSO" -> Control, anything else -> Treatment) and runs the edgeR
#' quasi-likelihood pipeline: low-expression filter, calcNormFactors,
#' estimateDisp, glmQLFit and glmQLFTest on the contrast Treatment - Control.
#'
#' NOTE(review): per the notebook text the columns are bootstrap resamples, so
#' they are presumably not independent replicates; p-values from this model may
#' be anticonservative - confirm this is accounted for downstream.
#'
#' @param control_dataset Count matrix (genes in rows, replicates in columns)
#'   holding the DMSO columns.
#' @param drug_dataset Count matrix holding the treated columns; must have the
#'   same row names, in the same order, as `control_dataset`.
#' @param drug Drug label. Not used in the computation; only used to make
#'   error messages identifiable. May be omitted.
#' @param min_avg_logcpm Genes whose `aveLogCPM()` is not strictly above this
#'   value are dropped before normalization. Defaults to 1.
#' @return The `table` element of `topTags(..., n = Inf)`, i.e. one row per
#'   tested gene with BH-adjusted p-values. Positive logFC = higher in
#'   treatment.
edgeR_diffExpress_PBRs <- function(control_dataset, drug_dataset, drug,
                                   min_avg_logcpm = 1) {

    # `drug` only labels error messages, so tolerate callers that omit it
    drug_label <- if (missing(drug)) "<unnamed drug>" else as.character(drug)[1]

    if (is.null(control_dataset) || is.null(drug_dataset)) {
        stop("Missing control or drug count matrix for ", drug_label,
             call. = FALSE)
    }

    # cbind() joins by position, not by row name: refuse mismatched gene rows
    # instead of silently comparing different genes against each other
    if (!identical(rownames(control_dataset), rownames(drug_dataset))) {
        stop("Control and drug matrices do not share identical row names for ",
             drug_label, call. = FALSE)
    }

    # Binding control and drug dataset
    dataset <- cbind(control_dataset, drug_dataset)

    # Groups identification: second token of each column name
    condition <- vapply(
        colnames(dataset),
        function(sample_id) strsplit(sample_id, split = "_", fixed = TRUE)[[1]][2],
        character(1)
    )
    if (anyNA(condition)) {
        stop("Column names without a '<sample>_<drug>_...' pattern found for ",
             drug_label, call. = FALSE)
    }

    # Instead of the drug name, we use "Treatment" to keep the function general
    is_control <- condition == "DMSO"
    if (!any(is_control) || all(is_control)) {
        stop("Both DMSO and treatment columns are required for ", drug_label,
             call. = FALSE)
    }
    groups <- factor(ifelse(is_control, "Control", "Treatment"),
                     levels = c("Control", "Treatment"))

    # Creating a DGE list object (genes with all-zero counts are removed)
    dge_obj <- DGEList(counts = dataset, group = groups, remove.zeros = TRUE)

    # Low-expression filter on the average log CPM.
    # NOTE(review): the original comment says the cutoff was chosen from
    # histograms; those are not part of this notebook.
    keep <- aveLogCPM(dge_obj) > min_avg_logcpm
    dge_obj_filtered <- dge_obj[keep, ]

    # Normalization
    dge_obj_filtered <- calcNormFactors(dge_obj_filtered)

    # Preparing the design matrix (no intercept, one column per group) and
    # estimating the NB dispersions
    design <- model.matrix(~0 + groups)
    colnames(design) <- levels(groups)
    dge_obj_filtered <- estimateDisp(dge_obj_filtered, design = design, robust = TRUE)

    # Fitting the model
    fit <- glmQLFit(dge_obj_filtered, design = design)

    # Running the test, specifying the comparison
    # Positive logFC = higher in treatment
    drug_vs_dmso <- makeContrasts(DrugVsDMSO = Treatment - Control, levels = design)
    qlf <- glmQLFTest(fit, contrast = drug_vs_dmso)

    # Return results for all genes, BH p-value adjustment
    topTags(qlf, n = Inf)$table
}

Running the differential expression analysis: use 3 or 4 cores at most, as RAM usage grows quickly.

In [10]:
# Every condition except the DMSO control is tested against the stored DMSO
# matrix. Selecting by name does not rely on DMSO being the first element.
treatment_drugs <- setdiff(drug_names, "DMSO")

JHOS2_diffexpress <- mclapply(X = treatment_drugs,
                              FUN = function(x) edgeR_diffExpress_PBRs(JHOS2_pseudo_DMSO, JHOS2_pseudo[[x]], x),
                              mc.cores = 4)
names(JHOS2_diffexpress) <- treatment_drugs

# mclapply() does not stop when a worker fails: an R error comes back as a
# "try-error" object and a killed worker (e.g. out of memory) as NULL.
# Check before saving so that failures never end up silently in the RDS file;
# JHOS2_diffexpress stays in the session for inspection if this stops.
JHOS2_failed <- vapply(JHOS2_diffexpress,
                       function(res) is.null(res) || inherits(res, "try-error"),
                       logical(1))
if (any(JHOS2_failed)) {
    stop("JHOS2 differential expression failed for: ",
         paste(treatment_drugs[JHOS2_failed], collapse = ", "), call. = FALSE)
}

saveRDS(object = JHOS2_diffexpress, file = "JHOS2_PBRs_diffexpress.RDS")

In [11]:
# Every condition except the DMSO control is tested against the stored DMSO
# matrix. Selecting by name does not rely on DMSO being the first element.
treatment_drugs <- setdiff(drug_names, "DMSO")

PDC3_diffexpress <- mclapply(X = treatment_drugs,
                             FUN = function(x) edgeR_diffExpress_PBRs(PDC3_pseudo_DMSO, PDC3_pseudo[[x]], x),
                             mc.cores = 4)
names(PDC3_diffexpress) <- treatment_drugs

# mclapply() does not stop when a worker fails: an R error comes back as a
# "try-error" object and a killed worker (e.g. out of memory) as NULL.
# Check before saving so that failures never end up silently in the RDS file;
# PDC3_diffexpress stays in the session for inspection if this stops.
PDC3_failed <- vapply(PDC3_diffexpress,
                      function(res) is.null(res) || inherits(res, "try-error"),
                      logical(1))
if (any(PDC3_failed)) {
    stop("PDC3 differential expression failed for: ",
         paste(treatment_drugs[PDC3_failed], collapse = ", "), call. = FALSE)
}

saveRDS(object = PDC3_diffexpress, file = "PDC3_PBRs_diffexpress.RDS")

In [12]:
# Every condition except the DMSO control is tested against the stored DMSO
# matrix. Selecting by name does not rely on DMSO being the first element.
treatment_drugs <- setdiff(drug_names, "DMSO")

PDC2_diffexpress <- mclapply(X = treatment_drugs,
                             FUN = function(x) edgeR_diffExpress_PBRs(PDC2_pseudo_DMSO, PDC2_pseudo[[x]], x),
                             mc.cores = 4)
names(PDC2_diffexpress) <- treatment_drugs

# mclapply() does not stop when a worker fails: an R error comes back as a
# "try-error" object and a killed worker (e.g. out of memory) as NULL.
# Check before saving so that failures never end up silently in the RDS file;
# PDC2_diffexpress stays in the session for inspection if this stops.
PDC2_failed <- vapply(PDC2_diffexpress,
                      function(res) is.null(res) || inherits(res, "try-error"),
                      logical(1))
if (any(PDC2_failed)) {
    stop("PDC2 differential expression failed for: ",
         paste(treatment_drugs[PDC2_failed], collapse = ", "), call. = FALSE)
}

saveRDS(object = PDC2_diffexpress, file = "PDC2_PBRs_diffexpress.RDS")

In [13]:
# Print the R version, platform and attached/loaded package versions of this session
sessionInfo()

R version 4.2.2 (2022-10-31)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Rocky Linux 8.8 (Green Obsidian)

Matrix products: default
BLAS/LAPACK: /homedir01/adini22/.conda/envs/cellhashing_analyses/lib/libopenblasp-r0.3.21.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] edgeR_3.40.2 limma_3.54.2

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.10     uuid_1.1-0      lattice_0.20-45 rlang_1.1.0    
 [5] fastmap_1.1.1   fansi_1.0.4     tools_4.2.2     grid_4.2.2     
 [9] utf8_1.2.3      cli_3.6.0       htmltools_0