In [1]:
## CUTRUN DE and assign to nearest genes

library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)

# Labels used to build sample/column names throughout the analysis.
cell_list <- c("N", "CM", "EM", "EMRA")
# Donors HD1-HD3 and HD5-HD7 (HD4 is left out here); note that the
# per-contrast loops further down re-assign hd_list.
hd_list <- paste0("HD", setdiff(1:7, 4))
expr_list <- c("Input", "Product", "Stim1", "Stim2", "Stim3")
cell_comp_list <- c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA")
hist_list <- c("H3K27me3", "H3K4me2")

# RNA-seq (RSEM) input/output roots.
in_path <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
out_path <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"
# CUT&RUN input/output roots. Both end in "/", and later paste0() calls
# prepend another "/" to the file name, so paths contain a harmless "//".
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"


In [None]:
## output DP with nearest genes
## Restore the saved CUT&RUN workspaces into the current environment, in the
## same order as before. The code below uses `mPeak`, `countMat` and
## `results` without defining them, so they presumably come from these files
## (NOTE(review): confirm which file provides which object).
for(rdata_file in c(
    "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "/RData/IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData" ## voomDDS results
)){
    load(file = paste0(outPath, rdata_file))
}

## Gene ranges from the hg38 knownGene TxDb, keeping the gene_id column.
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

In [None]:
fig_path <- "CART_CUTRUN_Project/results/paper_figure/CUTRUN_DP_nearestGene/"
## write.table() does not create directories, so make sure the target exists.
dir.create(fig_path, recursive = TRUE, showWarnings = FALSE)

## For each histone mark:
##   1. annotate every master peak with its nearest gene(s);
##   2. for every pairwise contrast in results[[hist]], write one CSV holding
##      the limma statistics, peak coordinates, nearest genes, and the raw,
##      per-million scaled and log2(scaled + 1) counts of the samples in that
##      contrast, plus a per-donor log2 fold change.
for(hist in c("H3K27me3", "H3K4me2")){
    ## select = "all" keeps ties, so one peak can map to several genes.
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    ## Values are not auto-printed inside a for loop, hence the explicit print.
    dp_match_gene %>% data.frame %$% distance %>% summary %>% print
    ## One row per peak; tied gene symbols are collapsed with ";".
    peak_match_gene <- data.frame(
        peak_id = dp_match_gene@from,
        gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector
    ) %>%
        group_by(peak_id) %>%
        summarize(nearest_genes = paste(gene_name, collapse = ";"))

    ## Peak coordinates, computed once per mark instead of once per contrast.
    peak_coords <- data.frame(mPeak[[hist]])[, c("seqnames", "start", "end")]

    ## Counts per million, sample by sample. `mat / colSums(mat)` would recycle
    ## the column sums down the rows (column-major order) and divide most
    ## entries by the wrong sample's total; transposing first lines each
    ## sample up with its own column sum.
    data_norm <- t(t(countMat[[hist]]) / colSums(countMat[[hist]])) * 1000000

    for(sample in names(results[[hist]])){
        ## info[1] is compared against "Input"; info[2] and info[3] are the
        ## two labels used in the sample column names (presumably the names
        ## look like "<condition>_<cellA>_<cellB>" -- verify against results).
        info <- sample %>% strsplit("_") %>% unlist
        if(info[1] == "Input"){
            hd_list <- paste0("HD", 4:7)
        }else{
            hd_list <- paste0("HD", 1:7)
        }
        limma_res <- results[[hist]][[sample]]
        ## NOTE(review): GeneIndex is used as a row index into mPeak[[hist]]
        ## and countMat[[hist]]; confirm it was not computed on a filtered
        ## matrix.
        limma_res <- cbind(peak_coords[limma_res$GeneIndex, ], limma_res)
        dp_res <- right_join(peak_match_gene, limma_res, by = c("peak_id" = "GeneIndex")) %>% arrange(adj.P.Val)

        ## Columns of the two groups being compared, all donors of this contrast.
        sample_cols <- c(
            paste0(hist, "_", info[2], "_", info[1], "_", hd_list),
            paste0(hist, "_", info[3], "_", info[1], "_", hd_list)
        )

        ## raw
        data_raw <- countMat[[hist]][dp_res$peak_id, sample_cols, drop = FALSE]
        colnames(data_raw) <- paste0(sample_cols, "_raw")
        tmp <- cbind(dp_res, data_raw)

        ## scaled
        data_norm_scaled <- data_norm[dp_res$peak_id, sample_cols, drop = FALSE]
        colnames(data_norm_scaled) <- paste0(sample_cols, "_scaled")
        tmp <- cbind(tmp, data_norm_scaled)

        ## log2
        data_norm_log2 <- log2(data_norm[dp_res$peak_id, sample_cols, drop = FALSE] + 1)
        colnames(data_norm_log2) <- paste0(sample_cols, "_log2")
        tmp <- cbind(tmp, data_norm_log2) %>% data.frame

        ## log2 FC per donor: info[3] over info[2], on the scaled values.
        ## No pseudocount is added, so zero counts give Inf/-Inf/NaN.
        for(hd in hd_list){
            scaled_alt <- tmp[, paste0(hist, "_", info[3], "_", info[1], "_", hd, "_scaled")]
            scaled_ref <- tmp[, paste0(hist, "_", info[2], "_", info[1], "_", hd, "_scaled")]
            tmp[, paste0(hist, "_log2(", info[3], "/", info[2], ")_", info[1], "_", hd, "_scaled")] <- log2(scaled_alt / scaled_ref)
        }

        write.table(tmp, file = paste0(fig_path, "CUTRUN_DP_", hist, "_", sample, "_pairwise_withNearestGenes.csv"), sep = ",", quote = FALSE, row.names = FALSE, col.names = TRUE)
    }
}

In [None]:
## Gene TSS neighborhood DP

library(chromVAR)
library(SummarizedExperiment)
library(Matrix)
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

# TSS-neighbourhood count matrices. From here on `countMat` refers to this
# object, replacing any `countMat` loaded earlier in the session.
countMat <- readRDS(
  file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_countMat.rds")
)
# Normalised counterpart. Per the original author's note it was built as
#   countMat[[hist]][, k] / seqDepth[[hist]][k] * 16294327
# i.e. divided by the sequencing depth and multiplied by the largest depth.
normMat <- readRDS(
  file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds")
)


In [4]:
# CUTRUN pairwise DP
# 1. pairwise
# inPath = "CART_CUTRUN_Project/results/CUTANDRUN/process/"
# outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
# load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
# load(file = paste0(outPath, "/RData/IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
# load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))

# From this cell on, outPath points at the paper-figure folder for the
# TSS-neighbourhood differential analysis.
outPath <- "CART_CUTRUN_Project/results/paper_figure/CUTRUN_DP_TSS/"

# Labels used to build the design and the contrasts below.
cellList <- c("N", "CM", "EM", "EMRA")
hdList <- paste0("HD", seq_len(7))
exprList <- c("Input", "Product", "Stim1", "Stim2", "Stim3")
cellCompList <- c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA")
histList <- c("H3K27me3", "H3K4me2")

# Return the replicate label for one sample: "rep2" for the two samples
# listed below, "rep1" for everything else.
checkRep <- function(hist, cell, expr, hd){
  rep2Samples <- c("H3K27me3_CM_Input_HD1", "H3K27me3_N_Input_HD2")
  sampleKey <- paste(hist, cell, expr, hd, sep = "_")
  if(sampleKey %in% rep2Samples){
    return("rep2")
  }
  "rep1"
}

# Return the duplicate-handling label for a sample: "stringent" for Input
# samples of donors HD1-HD3, "normLibSize" otherwise. (The original code
# carried "norm" as a commented-out alternative to "normLibSize".)
checkDup <- function(expr, hd){
  useStringent <- expr == "Input" && hd %in% paste0("HD", 1:3)
  if(useStringent) "stringent" else "normLibSize"
}

# Return the BAM file suffix for a sample: the name-sorted, duplicate-removed
# file for Input samples of donors HD1-HD3, the plain alignment otherwise.
checkBam <- function(expr, hd){
  useRmDup <- expr == "Input" && hd %in% paste0("HD", 1:3)
  if(useRmDup){
    return("bowtie2_align.sorted.rmDup.sortName.bam")
  }
  "bowtie2_align.bam"
}

# Return the donor labels used for a condition: HD4-HD7 for "Input",
# HD1-HD7 for every other condition.
checkHD <- function(expr){
  firstDonor <- if(expr == "Input") 4 else 1
  paste0("HD", firstDonor:7)
}

# Sample sheet: one row per (cell, condition, donor), with cell varying
# slowest and donor fastest. EMRA/Input/HD2 is excluded; with checkHD() as
# defined in this file, Input only yields HD4-HD7, so that exclusion never
# actually removes a row.
target <- do.call(rbind, lapply(cellList, function(cell){
  do.call(rbind, lapply(exprList, function(expr){
    donors <- checkHD(expr)
    excluded <- cell == "EMRA" & expr == "Input" & donors == "HD2"
    data.frame(cell = cell, expr = expr, humanDonor = donors[!excluded])
  }))
}))
target

## Consider human donor as random variable.
## Experimental design: one coefficient per cell.condition group, no intercept.
treat <- factor(paste(target$cell, target$expr, sep = "."))
design <- model.matrix(~0 + treat)
colnames(design) <- levels(treat)

## Pairwise contrasts: for every condition in exprList and every pair "A_B"
## in cellCompList, the contrast "<condition>_A_B" is B.<condition> -
## A.<condition>. expand.grid() varies its first factor fastest, so the
## columns come out condition by condition, pairs in cellCompList order.
contrastGrid <- expand.grid(comp = cellCompList, expr = exprList, stringsAsFactors = FALSE)
compCells <- strsplit(contrastGrid$comp, "_", fixed = TRUE)
refCell <- vapply(compCells, function(x) x[1], character(1))
altCell <- vapply(compCells, function(x) x[2], character(1))
contrast <- makeContrasts(
    contrasts = paste0(altCell, ".", contrastGrid$expr, " - ", refCell, ".", contrastGrid$expr),
    levels = design
)
## makeContrasts(contrasts = ...) names columns after the formula text;
## replace them with the "<condition>_A_B" labels used downstream.
colnames(contrast) <- paste(contrastGrid$expr, contrastGrid$comp, sep = "_")

## Differential analysis of TSS-neighbourhood counts with limma-voom, run
## separately per histone mark. Donor is handled as a blocking factor via
## duplicateCorrelation(). For every contrast the full table is stored in
## results[[hist]][[contrast]] and written to a CSV.
## Lists are named up front so that filling them by name does not leave
## unnamed NULL placeholders in front of the real entries.
results <- setNames(vector("list", length(histList)), histList)
voomDDS <- setNames(vector("list", length(histList)), histList)

## write.table() does not create directories, so make sure the target exists.
limmaTablePath <- paste0(outPath, '/CUTRUN_limma_tables/')
dir.create(limmaTablePath, recursive = TRUE, showWarnings = FALSE)

for(hist in histList){
    ## Filter and delete low expressed genes
    selectR <- which(rowSums(countMat[[hist]]) > 5) ## remove low count genes
    dataS <- countMat[[hist]][selectR, , drop = FALSE]
    ## NOTE(review): columns of countMat[[hist]] are assumed to be in the same
    ## order as the rows of `target`; only the count is checked here.
    stopifnot(ncol(dataS) == nrow(design))
    ## Names of the retained rows. The previous `rownames(data)` pointed at an
    ## object this script never defines (it resolves to utils::data), so the
    ## GeneName column was silently dropped.
    geneNameList <- rownames(dataS)
    if(is.null(geneNameList)){
        geneNameList <- rep(NA_character_, nrow(dataS))
    }

    voomDDS[[hist]] <- voom(counts = dataS, design = design, normalize.method = "cyclicloess", plot = FALSE)
    ## option 1 using voomDDS option2 using normDDS as normalized input.
    inputDDS <- voomDDS[[hist]]
    corfit <- duplicateCorrelation(inputDDS, design, block = target$humanDonor)
    ## corfit$consensus
    fit <- lmFit(inputDDS, design, block = target$humanDonor, correlation = corfit$consensus)
    fitContrast <- contrasts.fit(fit, contrast)
    fitBayes <- eBayes(fitContrast, robust = TRUE)

    histResults <- setNames(vector("list", ncol(contrast)), colnames(contrast))
    for(i in seq_len(ncol(contrast))){
        ## Results: fetch every row in fit order, then sort by p-value here.
        ## The row index into dataS is then known directly instead of being
        ## parsed back out of row names (NA whenever rows have real names).
        tab <- topTable(fit = fitBayes, adjust.method = 'fdr', coef = i, number = Inf, sort.by = 'none')
        stopifnot(nrow(tab) == nrow(dataS))
        ord <- order(tab$P.Value)

        res <- data.table(GeneName = geneNameList[ord], GeneIndex = as.numeric(ord), tab[ord, , drop = FALSE])
        ## Up/Down: FDR <= 0.1 and |logFC| >= 1; everything else is notDE.
        res[, Significance := ifelse(adj.P.Val <= 0.1 & logFC >= 1, 'Up',
                              ifelse(adj.P.Val <= 0.1 & logFC <= -1, 'Down', 'notDE'))]
        histResults[[i]] <- res

        ## Output. The histone mark is part of the file name; without it the
        ## second mark overwrote the tables written for the first.
        write.table(res, file = paste0(limmaTablePath, 'TSS_DE_', hist, '_', colnames(contrast)[i], '_HD1-7_adj0.1_logFC1.csv'), quote = FALSE, row.names = FALSE, sep = ",")
    }
    results[[hist]] <- histResults

}
saveRDS(list(results, voomDDS), file = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_limmaDEresults.rds")




In [4]:
fig_path <- "CART_CUTRUN_Project/results/paper_figure/CUTRUN_DP_TSS/"
## write.table() does not create directories, so make sure the target exists.
dir.create(fig_path, recursive = TRUE, showWarnings = FALSE)
## The saved object is list(results, voomDDS); keep only the limma tables.
results <- readRDS( file = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_limmaDEresults.rds")[[1]]

## For every histone mark and pairwise contrast, write one CSV holding the
## limma statistics together with the raw, per-million scaled and
## log2(scaled + 1) TSS-neighbourhood counts of the samples in that contrast,
## plus a per-donor log2 fold change.
for(hist in c("H3K27me3", "H3K4me2")){

    ## Counts per million, sample by sample. `mat / colSums(mat)` would recycle
    ## the column sums down the rows (column-major order) and divide most
    ## entries by the wrong sample's total; transposing first lines each
    ## sample up with its own column sum. Computed once per mark because it
    ## does not depend on the contrast.
    data_norm <- t(t(countMat[[hist]]) / colSums(countMat[[hist]])) * 1000000

    for(sample in names(results[[hist]])){
        ## info[1] is the condition; info[2] and info[3] are the two cell
        ## subsets of the contrast (names look like "Input_N_CM").
        info <- sample %>% strsplit("_") %>% unlist
        if(info[1] == "Input"){
            hd_list <- paste0("HD", 4:7)
        }else{
            hd_list <- paste0("HD", 1:7)
        }
        limma_res <- results[[hist]][[sample]]
        ## NOTE(review): rows are looked up through limma_res$ID, a column the
        ## limma cell does not create itself; it is presumably added by
        ## topTable() -- confirm it exists and matches rownames(countMat).
        row_key <- limma_res$ID

        ## Columns of the two groups being compared, all donors of this contrast.
        sample_cols <- c(
            paste0(hist, "_", info[2], "_", info[1], "_", hd_list),
            paste0(hist, "_", info[3], "_", info[1], "_", hd_list)
        )

        ## raw
        data_raw <- countMat[[hist]][row_key, sample_cols, drop = FALSE]
        colnames(data_raw) <- paste0(sample_cols, "_raw")
        tmp <- cbind(limma_res, data_raw)

        ## scaled
        data_norm_scaled <- data_norm[row_key, sample_cols, drop = FALSE]
        colnames(data_norm_scaled) <- paste0(sample_cols, "_scaled")
        tmp <- cbind(tmp, data_norm_scaled)

        ## log2
        data_norm_log2 <- log2(data_norm[row_key, sample_cols, drop = FALSE] + 1)
        colnames(data_norm_log2) <- paste0(sample_cols, "_log2")
        tmp <- cbind(tmp, data_norm_log2) %>% data.frame

        ## log2 FC per donor: info[3] over info[2], on the scaled values.
        ## No pseudocount is added, so zero counts give Inf/-Inf/NaN.
        for(hd in hd_list){
            scaled_alt <- tmp[, paste0(hist, "_", info[3], "_", info[1], "_", hd, "_scaled")]
            scaled_ref <- tmp[, paste0(hist, "_", info[2], "_", info[1], "_", hd, "_scaled")]
            tmp[, paste0(hist, "_log2(", info[3], "/", info[2], ")_", info[1], "_", hd, "_scaled")] <- log2(scaled_alt / scaled_ref)
        }

        write.table(tmp, file = paste0(fig_path, "CUTRUN_DP_", hist, "_", sample, "_pairwise_TSSneighborhood.csv"), sep = ",", quote = FALSE, row.names = FALSE, col.names = TRUE)
    }
}