In [1]:
library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)
## CUT&RUN locations (processed alignments in, analysis results out).
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
## RNA-seq (RSEM) locations and the shared figure directory.
in_path <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
out_path <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"
fig_path <- "CART_CUTRUN_Project/results/paper_figure/"


### Method 1: Compare gene expression (GEX) with histone signals around the TSS

In [None]:
## Build two GRanges objects from the GENCODE annotation:
##   tss_gr  - a +/-1 kb window around the TSS of every gene
##   gene_gr - the full gene body of every gene
## chrM, chrX and chrY are excluded from both.
gtf <- rtracklayer::import("/fh/fast/newell_e/yezheng_working/SupplementaryData/hg38/GENCODE/gencode.v21.annotation.gtf")
gtf_df <- as.data.frame(gtf)
head(gtf_df)

## The TSS is the gene start on the "+" strand and the gene end on the "-" strand.
## "+" genes are stacked before "-" genes; this row order defines the order of tss_gr.
plus_tss <- gtf_df %>%
  data.frame %>%
  dplyr::filter(type == "gene", strand == "+") %>%
  dplyr::select(seqnames, TSS = start, strand, source, gene_id, gene_name, gene_type)
minus_tss <- gtf_df %>%
  data.frame %>%
  dplyr::filter(type == "gene", strand == "-") %>%
  dplyr::select(seqnames, TSS = end, strand, source, gene_id, gene_name, gene_type)
tss_df <- rbind(plus_tss, minus_tss)

tss_filter_df <- dplyr::filter(tss_df, !(seqnames %in% c("chrM", "chrX", "chrY")))
gtf_filter_df <- dplyr::filter(gtf_df, type == "gene", !(seqnames %in% c("chrM", "chrX", "chrY")))

## 1 kb upstream and downstream of each TSS.
tss_gr <- GRanges(
  seqnames = tss_filter_df$seqnames,
  ranges = IRanges(start = tss_filter_df$TSS - 1000, end = tss_filter_df$TSS + 1000),
  strand = tss_filter_df$strand,
  gene_id = tss_filter_df$gene_id,
  gene_name = tss_filter_df$gene_name,
  gene_type = tss_filter_df$gene_type
)
tss_gr

## Whole gene bodies, same metadata columns.
gene_gr <- GRanges(
  seqnames = gtf_filter_df$seqnames,
  ranges = IRanges(start = gtf_filter_df$start, end = gtf_filter_df$end),
  strand = gtf_filter_df$strand,
  gene_id = gtf_filter_df$gene_id,
  gene_name = gtf_filter_df$gene_name,
  gene_type = gtf_filter_df$gene_type
)
gene_gr

In [None]:
## Overlap the TSS windows with the BAM files to get counts.
## !!! The counting step itself should be run directly in a terminal.
## Sample design: histone mark x cell type x condition x healthy donor.
histList <- c("H3K27me3", "H3K4me2")
cellList <- c("N", "CM", "EM", "EMRA")
exprList <- c("Input", "Product", "Stim1", "Stim2", "Stim3")
hdList <- paste0("HD", seq_len(7))
cellCompList <- c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA")
bamDir <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"


## Return the replicate label ("rep1" or "rep2") for one sample.
## Every sample uses "rep1" except the two listed below, which use "rep2".
checkRep <- function(hist, cell, expr, hd) {
  rep2_samples <- c("H3K27me3_CM_Input_HD1", "H3K27me3_N_Input_HD2")
  sample_key <- paste(hist, cell, expr, hd, sep = "_")
  if (sample_key %in% rep2_samples) {
    return("rep2")
  }
  "rep1"
}

## Return the duplicate-handling label for one sample:
## "stringent" for Input samples of donors HD1-HD3, "norm" for everything else.
checkDup <- function(expr, hd) {
  early_donors <- paste0("HD", 1:3)
  if (expr == "Input" && hd %in% early_donors) {
    return("stringent")
  }
  "norm"
}

## Return the BAM file name to use for one sample: the duplicate-removed,
## name-sorted alignment for Input samples of donors HD1-HD3, and the plain
## bowtie2 alignment for everything else.
checkBam <- function(expr, hd) {
  use_rmdup <- expr == "Input" && hd %in% paste0("HD", 1:3)
  if (use_rmdup) {
    "bowtie2_align.sorted.rmDup.sortName.bam"
  } else {
    "bowtie2_align.bam"
  }
}

## Return the healthy-donor ids to iterate over for a condition:
## HD4-HD7 for "Input", HD1-HD7 for any other condition.
checkHD <- function(expr) {
  first_donor <- if (expr == "Input") 4 else 1
  paste0("HD", first_donor:7)
}
 
# library(doMC)
# coreN <- 6
# registerDoMC(cores = coreN)

## Collect, per histone mark, the alignment files that exist on disk, then load
## the cached TSS-window fragment counts.
bamDir <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"

## One named slot per histone mark. The earlier `vector("list", n)` produced
## unnamed NULL slots and the by-name assignments were appended behind them,
## leaving a list with stray leading entries.
bamSelect <- setNames(lapply(histList, function(h) character(0)), histList)
# fragment_tss_counts = vector("list", length(histList))
for (hist in histList) {
  for (cell in cellList) {
    for (expr in exprList) {
      hdL <- checkHD(expr)
      for (hd in hdL) {
        ## Get the replicate number and the BAM flavour for this sample
        rep <- checkRep(hist, cell, expr, hd)
        bamType <- checkBam(expr, hd)
        bamFile <- paste0(bamDir, hist, "_CD8_", cell, "_", expr, "_", hd, "_", rep, "/alignment/", bamType)
        if (file.exists(bamFile)) {
          bamSelect[[hist]] <- c(bamSelect[[hist]], bamFile)
        } else {
          ## Missing files are reported and skipped, not treated as an error.
          print(paste0(bamFile, " does not exist!"))
        }
      }
    }
  }
  # bamSelect[[hist]]
  ## get count matrix in parallel
  # fragment_tss_counts[[hist]] <- mclapply(as.list(bamSelect[[hist]]), chromVAR::getCounts, tss_gr, paired = TRUE, by_rg = FALSE, format = "bam", mc.cores = coreN)
}


# saveRDS(fragment_tss_counts, file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM.rds"))

## The counting step above is commented out; the cached result is loaded instead.
fragment_tss_counts <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM.rds"))
head(fragment_tss_counts)


In [None]:
library(chromVAR)
library(SummarizedExperiment)
library(Matrix)

## count matrix column name
# countMat = vector("list", length(histList))
# normMat = vector("list", length(histList))
# seqDepth = vector("list", length(histList))
# designInfo = c()
# for(hist in histList){
#   sampleName = c()
#   for(cell in cellList){
#     for(expr in exprList){
#       hdL = checkHD(expr)
#       for(hd in hdL){
#         sampleName = c(sampleName, paste(hist, cell, expr, hd, sep = "_"))
#       }
#     }
#   }

#   ## Form the matrix
#   rmIndex = which(sampleName == paste0(hist, "_EMRA_Input_HD2"))
#   if(length(rmIndex) > 0){
#     sampleName = sampleName[-rmIndex]
#   }
#   countMat[[hist]] <- matrix(NA, length(tss_gr), length(bamSelect[[hist]]))
#   normMat[[hist]] <- matrix(NA, length(tss_gr), length(bamSelect[[hist]]))
#   colnames(countMat[[hist]]) <- sampleName
#   colnames(normMat[[hist]]) <- sampleName
#   seqDepth[[hist]] <- NULL
#   for (k in 1:length(bamSelect[[hist]])){
#     countMat[[hist]][, k] <- counts(fragment_tss_counts[[hist]][[k]])[,1]
#     seqDepth[[hist]][k] <- fragment_tss_counts[[hist]][[k]]@colData[1,1]
#     normMat[[hist]][, k] <- countMat[[hist]][, k]/seqDepth[[hist]][k] * 16294327 ## divide by the seqDepth and multiply by the largest seqDepth
#   }
#   designInfo <- data.frame(exps = sampleName, depth = seqDepth[[hist]], hist = hist) %>% rbind(designInfo, .)
# }
# rownames(countMat$H3K4me2) = data.frame(tss_gr)$gene_name
# rownames(countMat$H3K27me3) = data.frame(tss_gr)$gene_name
# rownames(normMat$H3K4me2) = data.frame(tss_gr)$gene_name
# rownames(normMat$H3K27me3) = data.frame(tss_gr)$gene_name


# saveRDS(countMat, file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_countMat.rds"))
# saveRDS(normMat, file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds"))
# saveRDS(designInfo, file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_designInfo.rds"))
## Point outPath at the CUT&RUN analysis directory and load the cached
## raw-count and depth-normalised TSS matrices (lists keyed by histone mark).
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

normMat <- readRDS(paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds"))
countMat <- readRDS(paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_countMat.rds"))
# designInfo = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_designInfo.rds"))


In [None]:
## PCA of the "Input" samples on TSS-window counts: H3K27me3 alone, H3K4me2
## alone, and both marks stacked row-wise. The three panels are shown side by side.
options(repr.plot.width = 22, repr.plot.height = 10)

## Keep genes whose total count over all samples exceeds `min_total`, and keep
## only columns whose name contains "Input". drop = FALSE keeps a matrix even
## if a single row or column survives.
subset_input_counts <- function(mat, min_total = 200) {
  keep_rows <- which(rowSums(mat) > min_total) ## remove low count genes
  keep_cols <- which(stringr::str_detect(colnames(mat), "Input"))
  mat[keep_rows, keep_cols, drop = FALSE]
}

## Seurat pipeline shared by all three panels: normalise, pick variable
## features, scale, then PCA on the variable features.
run_tss_pca <- function(counts) {
  obj <- CreateSeuratObject(counts = counts, project = "cart", min.cells = 0, min.features = 5) %>%
    NormalizeData(., normalization.method = "LogNormalize", scale.factor = 100000000) %>%
    FindVariableFeatures(., selection.method = "mean.var.plot", nfeatures = 1000) %>%
    ScaleData()
  RunPCA(obj, features = VariableFeatures(object = obj), npcs = 15)
}

## Scatter of PC1 vs PC2 coloured by cell type. Sample names are
## <mark>_<celltype>_..., so the cell type is the second "_"-separated token.
## Labels come from the object's own column names, so they stay aligned with
## the embedding rows even if object creation dropped a sample.
plot_tss_pca <- function(obj, title) {
  cell_labels <- colnames(obj) %>% sub("^[^_]*_", "", .) %>% sub("_.*", "", .)
  Embeddings(obj, reduction = "pca")[, 1:2] %>%
    data.frame %>%
    mutate(celltype = factor(cell_labels, levels = c("N", "CM", "EM", "EMRA"))) %>%
    ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
    geom_point() +
    theme_bw(base_size = 20) +
    scale_color_viridis(discrete = TRUE) +
    xlab("PC 1") +
    ylab("PC 2") +
    ggtitle(title) +
    rremove("legend.title")
}

k27_counts <- subset_input_counts(countMat[["H3K27me3"]])
k4_counts <- subset_input_counts(countMat[["H3K4me2"]])

k27_obj <- run_tss_pca(k27_counts)
k27_plot <- plot_tss_pca(k27_obj, "H3K27me3")

k4_obj <- run_tss_pca(k4_counts)
k4_plot <- plot_tss_pca(k4_obj, "H3K4me2")

## Stacking the two marks is only meaningful if column i is the same sample in
## both matrices; fail loudly instead of mixing samples silently.
strip_mark <- function(sample_names) sub("^[^_]*_", "", sample_names)
stopifnot(identical(strip_mark(colnames(k27_counts)), strip_mark(colnames(k4_counts))))

## NOTE(review): gene names occur in both halves of the stacked matrix, so its
## row names are not unique -- presumably Seurat de-duplicates them; confirm.
cr_obj <- run_tss_pca(rbind(k27_counts, k4_counts))
cr_plot <- plot_tss_pca(cr_obj, "H3K27me3 + H3K4me2")

# dev.off()
# pdf("CART_CUTRUN_Project/results/paper_figure/patient-related/Patient_CR_PD_PCA_on_all_peaks.pdf", width = 15, height = 6)
# ggarrange(k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 3, nrow = 1)
# dev.off()
ggarrange(k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 3, nrow = 1)


In [4]:
## Read in RNA-seq data
## Builds three gene x sample tables (rounded expected counts, TPM, FPKM) from
## the per-sample RSEM *.genes.results files. Columns are named
## <condition>_<celltype>_<donor>; rows are named by the RSEM gene_id.
cellList <- c("N", "CM", "EM", "EMRA")
hdList <- paste0("HD", c(1:3, 5:7))
exprList <- c("Input", "Product", "Stim1", "Stim2", "Stim3")

## NOTE(review): these reuse the names inPath/outPath that earlier cells set to
## the CUT&RUN directories; later cells see the RNA-seq values after this runs.
inPath <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
outPath <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"

n_sample <- length(cellList) * length(exprList) * length(hdList)
count_cols <- vector("list", n_sample)
tpm_cols <- vector("list", n_sample)
fpkm_cols <- vector("list", n_sample)
sample_names <- character(n_sample)
geneID <- NULL
sample_idx <- 0L
for (cell in cellList) {
  for (expr in exprList) {
    for (hd in hdList) {
      # print(paste(expr, cell, hd, sep = "_"))
      sample_idx <- sample_idx + 1L
      ## Parse each file once and take all three measures from it.
      rsem <- fread(paste0(inPath, "RNA_CD8_", cell, "_", expr, "_", hd, ".genes.results"))

      ## Row names come from the first file (N / Input / HD1). Columns are bound
      ## by position, so every file must list the same genes in the same order.
      if (is.null(geneID)) {
        geneID <- rsem$gene_id
      } else if (!identical(rsem$gene_id, geneID)) {
        stop("gene_id order differs from the first sample in: ", paste(cell, expr, hd, sep = "_"), call. = FALSE)
      }

      count_cols[[sample_idx]] <- round(rsem %>% dplyr::select(expected_count))
      ## NOTE(review): TPM and FPKM are rounded to whole numbers as well, which
      ## collapses low-expression values -- confirm this is intended.
      tpm_cols[[sample_idx]] <- round(rsem %>% dplyr::select(TPM))
      fpkm_cols[[sample_idx]] <- round(rsem %>% dplyr::select(FPKM))
      sample_names[sample_idx] <- paste(expr, cell, hd, sep = "_")
    }
  }
}

rna_expCount <- do.call(cbind, count_cols)
rna_TPM <- do.call(cbind, tpm_cols)
rna_FPKM <- do.call(cbind, fpkm_cols)

rownames(rna_expCount) <- geneID
colnames(rna_expCount) <- sample_names
rownames(rna_TPM) <- geneID
colnames(rna_TPM) <- sample_names
rownames(rna_FPKM) <- geneID
colnames(rna_FPKM) <- sample_names



In [None]:
## add target genes
## Functional gene lists come from Excel sheets. Each sheet stores the gene
## symbol under a different header, so a common `gene_name` column is added.
library(readxl)
library(ggrepel)
gene_path <- "CART_CUTRUN_Project/functionalGeneList/"

tf_tbl <- read_excel(paste0(gene_path, "2021_11_29_TFs.xlsx"))
tactive_tbl <- read_excel(paste0(gene_path, "2021_11_30_TCellActivationFunction.xlsx"))
cellcycle_tbl <- read_excel(paste0(gene_path, "2021_22_30_CellCycleAPoptosis.xlsx"))
metabolism_tbl <- read_excel(paste0(gene_path, "2021_22_30_MetabolismGeneList_FinalList.xlsx"))

select_gene <- list(
  TF = mutate(tf_tbl, gene_name = `HGNC symbol`),
  Tactive = mutate(tactive_tbl, gene_name = `Official Symbol`),
  CellCycle = mutate(cellcycle_tbl, gene_name = `Symbol`),
  Metabolism = mutate(metabolism_tbl, gene_name = `Gene`)
)

## Small curated list used for highlighting genes in later cells.
small_list <- read_excel(paste0(gene_path, "SmallCuratedGeneList.xlsx"))
head(small_list)

In [54]:
## mean across healthy donors
## For every curated gene group, write one PDF with two pages per cell type:
## RNA vs H3K4me2 (coloured by H3K27me3) and RNA vs H3K27me3 (coloured by
## H3K4me2). Genes of the group are highlighted in red and labelled.
cell_types <- c("N", "CM", "EM", "EMRA")

rna_select_data <- rna_expCount
cr_select_data <- normMat$H3K27me3
rna_df <- data.frame(rna_select_data)

## Pair RNA rows with CUT&RUN rows through the lower-cased gene symbol; the
## symbol is whatever follows the last "_" of the RNA row name. This pairing
## does not depend on gene group or cell type, so it is computed once.
rna_gene <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)
rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[rna_row_index]

## Per cell type: mean over donors of log2(value + 1) for both marks and RNA.
cr_rna_match_list <- lapply(setNames(cell_types, cell_types), function(cT) {
  cr_select_cond <- paste0(cT, "_Input")
  ## The trailing "_" stops "Input_EM" from also selecting "Input_EMRA" columns.
  rna_col_index <- which(stringr::str_detect(colnames(rna_select_data), paste0("Input_", cT, "_")))
  ## Columns are looked up separately in each mark's matrix rather than reusing
  ## the H3K27me3 positions for H3K4me2.
  k27_col_index <- which(stringr::str_detect(colnames(normMat$H3K27me3), cr_select_cond))
  k4_col_index <- which(stringr::str_detect(colnames(normMat$H3K4me2), cr_select_cond))
  data.frame(
    H3K27me3 = log2(normMat$H3K27me3[cr_row_index, k27_col_index, drop = FALSE] + 1) %>% rowMeans,
    H3K4me2 = log2(normMat$H3K4me2[cr_row_index, k4_col_index, drop = FALSE] + 1) %>% rowMeans,
    RNA = log2(rna_df[rna_row_index, rna_col_index, drop = FALSE] + 1) %>% rowMeans,
    gene_name = rna_gene[rna_row_index] %>% toupper
  )
})

## One scatter page: `x_mark` on the x axis, RNA on the y axis, points coloured
## by `color_mark`. Dashed lines split the plane into four quadrants whose gene
## counts are printed in the corners.
plot_signal_vs_rna <- function(match_df, x_mark, color_mark, x_thres, y_thres, target_rows, title) {
  high_mark <- match_df[[x_mark]] > x_thres
  high_rna <- match_df$RNA >= y_thres
  panel_n <- c(
    sum(!high_mark & high_rna, na.rm = TRUE),
    sum(high_mark & high_rna, na.rm = TRUE),
    sum(!high_mark & !high_rna, na.rm = TRUE),
    sum(high_mark & !high_rna, na.rm = TRUE)
  )
  target_df <- match_df[target_rows, ]
  ggplot(match_df, aes(x = .data[[x_mark]], y = RNA, color = .data[[color_mark]])) +
    geom_point(size = 0.5) +
    geom_point(data = target_df, size = 2, color = "#ff1c1c") +
    geom_text_repel(aes(label = gene_name), data = target_df, size = 5, color = "#ff1c1c") +
    theme_bw(base_size = 25) +
    xlab(paste0(x_mark, ": log2(scaled count + 1)")) +
    ylab("RNA-seq: log2(expected count + 1)") +
    scale_color_viridis() +
    labs(color = color_mark) +
    geom_vline(xintercept = x_thres, linetype = "dashed", color = "grey10") +
    geom_hline(yintercept = y_thres, linetype = "dashed", color = "grey10") +
    annotate("text", x = c(0, 10, 0, 10), y = c(20, 20, -1, -1), label = paste0(panel_n, " genes")) +
    ggtitle(title)
}

for (gene_group in unique(small_list$Group)) {
  target_gene <- small_list %>% dplyr::filter(Group == gene_group) %$% Gene %>% tolower
  target_rows <- which(rna_gene[rna_row_index] %in% target_gene)

  pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_SmallCuratedGeneList_", gsub(" ", "_", gene_group), ".pdf"), width = 13, height = 11)
  ## Close the device even when a page fails, so no PDF handle is left open.
  tryCatch({
    for (cT in cell_types) {
      rna_select_cond <- paste0("Input_", cT)
      cr_rna_match <- cr_rna_match_list[[cT]]
      print(plot_signal_vs_rna(cr_rna_match, "H3K4me2", "H3K27me3", x_thres = 4.5, y_thres = 2.5, target_rows = target_rows, title = rna_select_cond))
      print(plot_signal_vs_rna(cr_rna_match, "H3K27me3", "H3K4me2", x_thres = 2.5, y_thres = 2.5, target_rows = target_rows, title = rna_select_cond))
    }
  }, finally = dev.off())
}

In [None]:
## mean across healthy donors summarize gene number per category
## For each curated gene group and cell type, count how many of the group's
## genes fall into each quadrant of the (histone signal, RNA) plane. Also saves
## the per-cell-type matched table that later cells read back.
cell_types <- c("N", "CM", "EM", "EMRA")

rna_select_data <- rna_expCount
cr_select_data <- normMat$H3K27me3
rna_df <- data.frame(rna_select_data)

## Gene pairing between RNA and CUT&RUN rows (lower-cased symbol after the last
## "_" of the RNA row name); independent of group and cell type.
rna_gene <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)
rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[rna_row_index]

## Later cells read cr_rna_match_<cT>.rds from this CUT&RUN directory. Writing
## through `outPath` is unreliable because the RNA-seq import cell repoints it.
cr_rna_match_dir <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/"

cr_rna_match_list <- list()
for (cT in cell_types) {
  cr_select_cond <- paste0(cT, "_Input")
  ## The trailing "_" stops "Input_EM" from also selecting "Input_EMRA" columns.
  rna_col_index <- which(stringr::str_detect(colnames(rna_select_data), paste0("Input_", cT, "_")))
  ## Look the columns up in each mark's own matrix.
  k27_col_index <- which(stringr::str_detect(colnames(normMat$H3K27me3), cr_select_cond))
  k4_col_index <- which(stringr::str_detect(colnames(normMat$H3K4me2), cr_select_cond))

  cr_rna_match <- data.frame(
    H3K27me3 = log2(normMat$H3K27me3[cr_row_index, k27_col_index, drop = FALSE] + 1) %>% rowMeans,
    H3K4me2 = log2(normMat$H3K4me2[cr_row_index, k4_col_index, drop = FALSE] + 1) %>% rowMeans,
    RNA = log2(rna_df[rna_row_index, rna_col_index, drop = FALSE] + 1) %>% rowMeans,
    gene_name = rna_gene[rna_row_index] %>% toupper
  )
  ## The table does not depend on the gene group, so it is saved once per cell type.
  saveRDS(cr_rna_match, file = paste0(cr_rna_match_dir, "cr_rna_match_", cT, ".rds"))
  cr_rna_match_list[[cT]] <- cr_rna_match
}

## Gene counts per quadrant: panel1 = low mark / high RNA, panel2 = high mark /
## high RNA, panel3 = low mark / low RNA, panel4 = high mark / low RNA.
count_panels <- function(df, mark, x_thres, y_thres) {
  high_mark <- df[[mark]] > x_thres
  high_rna <- df$RNA >= y_thres
  c(
    sum(!high_mark & high_rna, na.rm = TRUE),
    sum(high_mark & high_rna, na.rm = TRUE),
    sum(!high_mark & !high_rna, na.rm = TRUE),
    sum(high_mark & !high_rna, na.rm = TRUE)
  )
}

summary_rows <- list()
for (gene_group in unique(small_list$Group)) {
  target_gene <- small_list %>% dplyr::filter(Group == gene_group) %$% Gene %>% tolower
  target_rows <- which(rna_gene[rna_row_index] %in% target_gene)

  for (cT in cell_types) {
    cr_rna_match_subset <- cr_rna_match_list[[cT]][target_rows, ]

    summary_rows[[length(summary_rows) + 1L]] <- data.frame(
      gene_group = gene_group,
      cell_type = cT,
      histone = "H3K4me2",
      panel = paste0("panel", 1:4),
      gene_num = count_panels(cr_rna_match_subset, "H3K4me2", x_thres = 4.5, y_thres = 2.5),
      total_gene_num = nrow(cr_rna_match_subset)
    )
    summary_rows[[length(summary_rows) + 1L]] <- data.frame(
      gene_group = gene_group,
      cell_type = cT,
      histone = "H3K27me3",
      panel = paste0("panel", 1:4),
      gene_num = count_panels(cr_rna_match_subset, "H3K27me3", x_thres = 2.5, y_thres = 2.5),
      total_gene_num = nrow(cr_rna_match_subset)
    )
  }
}
gene_num_summary <- do.call(rbind, summary_rows)
gene_num_summary

gene_num_summary %>% write.csv(., file = paste0(fig_path, "/CSV/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_gene_group_panel_summary.csv"), row.names = FALSE)


In [35]:
#  for(cT in c("N", "CM", "EM", "EMRA")){
#         rna_select_cond = paste0("Input_", cT)
#         cr_select_cond = paste0(cT, "_Input")

#         rna_select_data = rna_expCount
#         cr_select_data = normMat$H3K27me3

#         rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
#         cr_gene = rownames(cr_select_data) %>% tolower
#         rna_gene_in_cr = match(rna_gene, cr_gene)
#         rna_gene[which(!is.na(rna_gene_in_cr))] %>% head
#         cr_gene[rna_gene_in_cr[which(!is.na(rna_gene_in_cr))]] %>% head

#         rna_row_index = which(!is.na(rna_gene_in_cr))
#         cr_row_index = rna_gene_in_cr[which(!is.na(rna_gene_in_cr))]
        
#         target_gene = small_list %>% dplyr::filter(Group == gene_group) %$% Gene %>% tolower

#         cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
#         rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

#         cr_rna_match = data.frame(
#             H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
#             H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
#             RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans,
#             gene_name = rna_gene[rna_row_index] %>% toupper
#         )
#         saveRDS(cr_rna_match, file = paste0(outPath, "/RDS/cr_rna_match_", cT, ".rds"))
#  }

## add hd value
## Same matched table as above (donor means of log2(value + 1) for H3K27me3,
## H3K4me2 and RNA), extended with the individual per-donor columns. Genes at
## or above both histone thresholds are exported as "bivalent".
rna_select_data <- rna_expCount
colnames(rna_select_data) <- paste0("RNA_", colnames(rna_select_data))
cr_select_data <- normMat$H3K27me3
rna_df <- data.frame(rna_select_data)

## Gene pairing between RNA and CUT&RUN rows (lower-cased symbol after the last
## "_" of the RNA row name); identical for every cell type.
rna_gene <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)
rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[rna_row_index]

k4_thres <- 4.5
k27_thres <- 2.5

for (cT in c("N", "CM", "EM", "EMRA")) {
  rna_select_cond <- paste0("Input_", cT)
  cr_select_cond <- paste0(cT, "_Input")

  ## The trailing "_" keeps "Input_EM" from also selecting "Input_EMRA" columns.
  ## The previous attempt to drop EMRA columns ran before the indices were
  ## computed and was overwritten straight away, so it never took effect.
  rna_col_index <- which(stringr::str_detect(colnames(rna_select_data), paste0(rna_select_cond, "_")))
  ## Look the columns up in each mark's own matrix.
  k27_col_index <- which(stringr::str_detect(colnames(normMat$H3K27me3), cr_select_cond))
  k4_col_index <- which(stringr::str_detect(colnames(normMat$H3K4me2), cr_select_cond))
  #target_gene = small_list %>% dplyr::filter(Group == gene_group) %$% Gene %>% tolower

  ## Per-donor log2 values; their row means form the summary columns.
  k27_log <- log2(normMat$H3K27me3[cr_row_index, k27_col_index, drop = FALSE] + 1)
  k4_log <- log2(normMat$H3K4me2[cr_row_index, k4_col_index, drop = FALSE] + 1)
  rna_log <- log2(rna_df[rna_row_index, rna_col_index, drop = FALSE] + 1)

  cr_rna_match <- data.frame(
    gene_name = rna_gene[rna_row_index] %>% toupper,
    H3K27me3 = rowMeans(k27_log),
    H3K4me2 = rowMeans(k4_log),
    RNA = rowMeans(rna_log)
  ) %>%
    cbind(., k27_log) %>%
    cbind(., k4_log) %>%
    cbind(., rna_log)

  bivalent_cr_rna <- cr_rna_match %>% dplyr::filter(H3K27me3 >= k27_thres, H3K4me2 >= k4_thres)
  write.csv(bivalent_cr_rna, file = paste0("CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_", cT, "_withHDcounts.csv"))
  ## NOTE(review): outPath is whatever the most recently run cell set it to
  ## (CUT&RUN or RNA-seq analysis directory) -- confirm the intended location.
  saveRDS(cr_rna_match, file = paste0(outPath, "/RDS/cr_rna_match_", cT, "_withHDcounts.rds"))
}

In [None]:
## H3K4me2 vs H3K27me3 TSS signal for one cell type, coloured by RNA level.
## Dashed lines mark the two histone thresholds.
cT <- "N"
x_thres <- 4.5
y_thres <- 2.5
cr_rna_match <- readRDS(paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/cr_rna_match_", cT, ".rds"))
ggplot(cr_rna_match, aes(x = H3K4me2, y = H3K27me3, color = RNA)) +
  geom_point(size = 0.5) +
  geom_vline(xintercept = x_thres, linetype = "dashed", color = "grey10") +
  geom_hline(yintercept = y_thres, linetype = "dashed", color = "grey10") +
  scale_color_viridis() +
  theme_bw(base_size = 25) +
  labs(
    x = "H3K4me2: log2(scaled count + 1)",
    y = "H3K27me3: log2(scaled count + 1)",
    title = cT
  )


In [None]:
## Two-component Gaussian mixture fits of the TSS histone signal for cell type "N".
## Writes one PDF with two pages: the H3K4me2 histogram and the H3K27me3 histogram,
## each overlaid with the two mixture component curves and a dashed threshold line.
options(repr.plot.width=18, repr.plot.height=8)

library(plotGMM)
library(magrittr)
library(ggplot2)
library(mixtools)
## Positions of the dashed lines: x_thres on the H3K4me2 page, y_thres on the H3K27me3 page.
## NOTE(review): x_thres is 4.25 here while other cells in this notebook use 4.5 -- confirm which is intended.
## The CM and EM cells below do not set these values themselves and reuse whatever is assigned here.
x_thres = 4.25
y_thres = 2.5
# Fit a GMM using EM
## The seed is set once here; the fits in the CM and EM cells continue from the resulting RNG state.
set.seed(576)

cT = "N"
# for(cT in c("N")){ #, "CM", "EM"

    pdf(paste0("CART_CUTRUN_Project/results/paper_figure_take3/gaussianMixture_histone_signal_TSS_", cT, ".pdf"), width = 14, height = 6)
    cr_rna_match = readRDS(paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/cr_rna_match_", cT, ".rds"))
    ## k = 2 components fitted to the H3K4me2 signal
    mixmdl <- normalmixEM(cr_rna_match$H3K4me2, k = 2)

    ## NOTE(review): the component curves are drawn with hand-shifted means (+0.1 / +0.2),
    ## so they do not show the fitted parameters exactly -- confirm this is intended.
    print(data.frame(x = cr_rna_match$H3K4me2) %>%
    ggplot() +
    geom_histogram(aes(x, ..density..), binwidth = 0.3, colour = "black",
                    fill = "white") +
    stat_function(geom = "line", fun = plot_mix_comps,
                    args = list(mixmdl$mu[1] + 0.1, mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                    colour = "red", lwd = 1.5) +
    stat_function(geom = "line", fun = plot_mix_comps,
                    args = list(mixmdl$mu[2] + 0.2, mixmdl$sigma[2], lam = mixmdl$lambda[2]),
                    colour = "blue", lwd = 1.5) +
    theme_bw(base_size = 35) +
    xlab("H3K4me2 Enrichment Signal around TSS") +
    ylab("Density") +
    geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
    ggtitle(cT))

    ## mixmdl is overwritten with the k = 2 fit to the H3K27me3 signal
    mixmdl <- normalmixEM(cr_rna_match$H3K27me3, k = 2)
    ## NOTE(review): both the means and the standard deviations are hand-adjusted here
    ## (+0.2 / +0.4 on mu, +0.2 / -0.2 on sigma) -- confirm this is intended.
    print(data.frame(x = cr_rna_match$H3K27me3) %>%
    ggplot() +
    geom_histogram(aes(x, ..density..), binwidth = 0.3, colour = "black",
                    fill = "white") +
    stat_function(geom = "line", fun = plot_mix_comps,
                    args = list(mixmdl$mu[1] + 0.2, mixmdl$sigma[1] + 0.2, lam = mixmdl$lambda[1]),
                    colour = "red", lwd = 1.5) +
    stat_function(geom = "line", fun = plot_mix_comps,
                    args = list(mixmdl$mu[2] + 0.4, mixmdl$sigma[2] - 0.2, lam = mixmdl$lambda[2]),
                    colour = "blue", lwd = 1.5) +
    theme_bw(base_size = 35) +
    xlab("H3K27me3 Enrichment Signal around TSS") +
    ylab("Density") +
    geom_vline(xintercept = y_thres, linetype = "dashed", color = 'grey10') +
    ggtitle(cT))
# }
## Closes the PDF opened above.
dev.off()

In [None]:
## Two-component Gaussian mixture fits of the TSS histone signal for cell type "CM".
## Same layout as the "N" cell: one PDF, an H3K4me2 page and an H3K27me3 page.
## x_thres and y_thres are not assigned in this cell; the values come from whichever
## cell set them last. No seed is set here either, so the fit depends on the current RNG state.
cT = "CM"

pdf(paste0("CART_CUTRUN_Project/results/paper_figure_take3/gaussianMixture_histone_signal_TSS_", cT, ".pdf"), width = 14, height = 6)
cr_rna_match = readRDS(paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/cr_rna_match_", cT, ".rds"))
## k = 2 components fitted to the H3K4me2 signal
mixmdl <- normalmixEM(cr_rna_match$H3K4me2, k = 2)

## NOTE(review): the second component is drawn with its mean shifted by +0.2 -- confirm this is intended.
print(data.frame(x = cr_rna_match$H3K4me2) %>%
ggplot() +
geom_histogram(aes(x, ..density..), binwidth = 0.3, colour = "black",
                fill = "white") +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5) +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[2] + 0.2, mixmdl$sigma[2], lam = mixmdl$lambda[2]),
                colour = "blue", lwd = 1.5) +
theme_bw(base_size = 35) +
xlab("H3K4me2 Enrichment Signal around TSS") +
ylab("Density") +
geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
ggtitle(cT))

## mixmdl is overwritten with the k = 2 fit to the H3K27me3 signal
mixmdl <- normalmixEM(cr_rna_match$H3K27me3, k = 2)
## NOTE(review): the second component is drawn with mu + 0.3 and sigma - 0.1 -- confirm this is intended.
print(data.frame(x = cr_rna_match$H3K27me3) %>%
ggplot() +
geom_histogram(aes(x, ..density..), binwidth = 0.3, colour = "black",
                fill = "white") +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5) +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[2] + 0.3, mixmdl$sigma[2] - 0.1, lam = mixmdl$lambda[2]),
                colour = "blue", lwd = 1.5) +
theme_bw(base_size = 35) +
xlab("H3K27me3 Enrichment Signal around TSS") +
ylab("Density") +
geom_vline(xintercept = y_thres, linetype = "dashed", color = 'grey10') +
ggtitle(cT))

## Closes the PDF opened above.
dev.off()

In [None]:
## Two-component Gaussian mixture fits of the TSS histone signal for cell type "EM".
## Same layout as the "N" cell: one PDF, an H3K4me2 page and an H3K27me3 page.
## x_thres and y_thres are not assigned in this cell; the values come from whichever
## cell set them last. No seed is set here either, so the fit depends on the current RNG state.
cT = "EM"

pdf(paste0("CART_CUTRUN_Project/results/paper_figure_take3/gaussianMixture_histone_signal_TSS_", cT, ".pdf"), width = 14, height = 6)
cr_rna_match = readRDS(paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/cr_rna_match_", cT, ".rds"))
## k = 2 components fitted to the H3K4me2 signal
mixmdl <- normalmixEM(cr_rna_match$H3K4me2, k = 2)

## Both components are drawn with the fitted parameters unchanged on this page.
print(data.frame(x = cr_rna_match$H3K4me2) %>%
ggplot() +
geom_histogram(aes(x, ..density..), binwidth = 0.3, colour = "black",
                fill = "white") +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5) +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[2], mixmdl$sigma[2], lam = mixmdl$lambda[2]),
                colour = "blue", lwd = 1.5) +
theme_bw(base_size = 35) +
xlab("H3K4me2 Enrichment Signal around TSS") +
ylab("Density") +
geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
ggtitle(cT))

## mixmdl is overwritten with the k = 2 fit to the H3K27me3 signal
mixmdl <- normalmixEM(cr_rna_match$H3K27me3, k = 2)
## NOTE(review): the second component is drawn with mu + 0.1 and sigma - 0.1 -- confirm this is intended.
print(data.frame(x = cr_rna_match$H3K27me3) %>%
ggplot() +
geom_histogram(aes(x, ..density..), binwidth = 0.3, colour = "black",
                fill = "white") +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[1], mixmdl$sigma[1], lam = mixmdl$lambda[1]),
                colour = "red", lwd = 1.5) +
stat_function(geom = "line", fun = plot_mix_comps,
                args = list(mixmdl$mu[2] + 0.1, mixmdl$sigma[2] - 0.1, lam = mixmdl$lambda[2]),
                colour = "blue", lwd = 1.5) +
theme_bw(base_size = 35) +
xlab("H3K27me3 Enrichment Signal around TSS") +
ylab("Density") +
geom_vline(xintercept = y_thres, linetype = "dashed", color = 'grey10') +
ggtitle(cT))

## Closes the PDF opened above.
dev.off()

In [None]:
## Percentage (and number) of genes that are bivalent -- H3K4me2 above x_thres
## and H3K27me3 above y_thres -- in each of N, CM and EM, as a stacked bar chart.
x_thres <- 4.5
y_thres <- 2.5

biv_cell_types <- c("N", "CM", "EM")
biv_rows <- vector("list", length(biv_cell_types))
names(biv_rows) <- biv_cell_types
for (cT in biv_cell_types) {
  cr_rna_match <- readRDS(paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/cr_rna_match_", cT, ".rds"))
  n_total <- nrow(cr_rna_match)
  n_bivalent <- cr_rna_match %>% dplyr::filter(H3K4me2 > x_thres, H3K27me3 > y_thres) %>% nrow
  biv_pct <- n_bivalent / n_total * 100
  ## `num` of the non-bivalent row is the complement count; it used to repeat
  ## the bivalent count.
  biv_rows[[cT]] <- data.frame(
    prop = c(100 - biv_pct, biv_pct),
    num = c(n_total - n_bivalent, n_bivalent),
    celltype = cT,
    type = c("Non-bivalent", "Bivalent")
  )
}
## Latest cell type first, matching the row order of the earlier table.
bivProp <- do.call(rbind, unname(rev(biv_rows)))
bivProp$celltype <- factor(bivProp$celltype, levels = c("N", "CM", "EM"))
bivProp

biv_plot <- bivProp %>% ggplot(aes(x = celltype, y = prop, fill = type, label = round(prop, 2))) +
  geom_bar(stat = "identity") +
  theme_bw(base_size = 25) +
  scale_fill_brewer(palette = "Set1") +
  xlab("") +
  ylab("Percentage of Genes") +
  rremove("legend.title") +
  theme(legend.position = "top")
biv_plot

## Save this plot explicitly instead of relying on the last plot displayed.
ggsave("CART_CUTRUN_Project/results/paper_figure_take3/Percentage_gene_bivalency.pdf", plot = biv_plot, width = 8, height = 8)

In [None]:
## Get the gene regions falling in the K4+K27-, K4+K27+ and K4-K27+ categories
## and write one BED file per category and cell type (input for the
## computeMatrix heatmaps of the three categories, +/-1kb or +/-5kb).
head(cr_rna_match)
colnames(gtf_df)
gene_full_list = gtf_df %>% data.frame %>% dplyr::filter(type == "gene") %>% dplyr::select(seqnames, start, end, gene_name, score, strand)

# GTF coordinates are 1-based inclusive whereas BED is 0-based half-open, so
# the start has to be shifted by one before writing. gene_full_list itself is
# left untouched in case other cells use it.
gene_bed = gene_full_list %>% dplyr::mutate(start = start - 1L)

for(cT in c("N", "CM", "EM")){
    cr_rna_match = readRDS(paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/cr_rna_match_", cT, ".rds"))

    # NOTE(review): the K4-negative class uses H3K4me2 < 4 while the
    # K4-positive classes use > 4.5, so genes between 4 and 4.5 (and genes
    # exactly on a cutoff) land in no category -- confirm this gap is intended.
    k4p_k27n = cr_rna_match %>% dplyr::filter(H3K4me2 > 4.5, H3K27me3 < 2.5) %$% gene_name
    k4p_k27p = cr_rna_match %>% dplyr::filter(H3K4me2 > 4.5, H3K27me3 > 2.5) %$% gene_name
    k4n_k27p = cr_rna_match %>% dplyr::filter(H3K4me2 < 4, H3K27me3 > 2.5) %$% gene_name

    # File-name tag -> gene names of that category.
    bed_sets = list(
        K4pos_K27neg = k4p_k27n,
        K4pos_K27pos = k4p_k27p,
        K4neg_K27pos = k4n_k27p
    )
    for(bed_tag in names(bed_sets)){
        gene_bed %>%
            dplyr::filter(gene_name %in% bed_sets[[bed_tag]]) %>%
            write.table(paste0("CART_CUTRUN_Project/results/paper_figure_take3/", bed_tag, "_gene_list_", cT, ".bed"), sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
    }
}


In [55]:
## for healthy donors
## One PDF per curated gene group and cell type. Each PDF holds, for every
## donor (HD5-HD7), an H3K4me2-vs-RNA and an H3K27me3-vs-RNA scatterplot with
## the genes of the curated group highlighted in red and the number of genes
## in each quadrant annotated.

# Matching RNA-seq rows to CUT&RUN rows by lower-cased gene name does not
# depend on any loop variable, so it is done once.
rna_select_data <- rna_expCount
cr_select_data <- normMat$H3K27me3

rna_gene <- tolower(gsub(".*_", "", rownames(rna_select_data)))
cr_gene <- tolower(rownames(cr_select_data))
rna_gene_in_cr <- match(rna_gene, cr_gene)

has_match <- !is.na(rna_gene_in_cr)
rna_row_index <- which(has_match)
cr_row_index <- rna_gene_in_cr[has_match]

for (gene_group in unique(small_list$Group)) {
    # Rows (within the matched set) that belong to the current curated group.
    target_gene <- small_list %>% dplyr::filter(Group == gene_group) %$% Gene %>% tolower
    target_rows <- which(rna_gene[rna_row_index] %in% target_gene)

    for (cT in c("N", "CM", "EM", "EMRA")) {

        pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_SmallCuratedGeneList_", gsub(" ", "_", gene_group), "_perDonor_celltype", cT, ".pdf"), width = 13, height =  11)

        for (hd in paste0("HD", 5:7)) {
            rna_select_cond <- paste0("Input_", cT, "_", hd)
            cr_select_cond <- paste0(cT, "_Input_", hd)
            cr_col_index <- which(stringr::str_detect(colnames(cr_select_data), cr_select_cond))
            rna_col_index <- which(stringr::str_detect(colnames(rna_select_data), rna_select_cond))

            cr_rna_match <- data.frame(
                H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1),
                H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1),
                RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1),
                gene_name = toupper(rna_gene[rna_row_index])
            )
            highlighted <- cr_rna_match[target_rows, ]

            ## H3K4me2 vs RNA, coloured by H3K27me3
            x_thres <- 4.5
            y_thres <- 2.5
            panel1 <- nrow(dplyr::filter(cr_rna_match, H3K4me2 <= x_thres, RNA >= y_thres))
            panel2 <- nrow(dplyr::filter(cr_rna_match, H3K4me2 > x_thres, RNA >= y_thres))
            panel3 <- nrow(dplyr::filter(cr_rna_match, H3K4me2 <= x_thres, RNA < y_thres))
            panel4 <- nrow(dplyr::filter(cr_rna_match, H3K4me2 > x_thres, RNA < y_thres))

            k4_plot <- ggplot(cr_rna_match, aes(x = H3K4me2, y = RNA, color = H3K27me3)) +
                geom_point(size = 0.5) +
                geom_point(aes(x = H3K4me2, y = RNA), data = highlighted, size = 2, color = "#ff1c1c") +
                geom_text_repel(aes(x = H3K4me2, y = RNA, label = gene_name), data = highlighted, size = 5, color = "#ff1c1c") +
                theme_bw(base_size = 25) +
                xlab("H3K4me2: log2(scaled count + 1)") +
                ylab("RNA-seq: log2(expected count + 1)") +
                scale_color_viridis() +
                geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
                geom_hline(yintercept = y_thres, linetype = "dashed", color = "grey10") +
                annotate("text", x = c(0, 10, 0, 10), y = c(20, 20, -1, -1), label = paste0(c(panel1, panel2, panel3, panel4), " genes")) +
                ggtitle(rna_select_cond)
            print(k4_plot)

            ## H3K27me3 vs RNA, coloured by H3K4me2
            x_thres <- 2.5
            y_thres <- 2.5
            panel1 <- nrow(dplyr::filter(cr_rna_match, H3K27me3 <= x_thres, RNA >= y_thres))
            panel2 <- nrow(dplyr::filter(cr_rna_match, H3K27me3 > x_thres, RNA >= y_thres))
            panel3 <- nrow(dplyr::filter(cr_rna_match, H3K27me3 <= x_thres, RNA < y_thres))
            panel4 <- nrow(dplyr::filter(cr_rna_match, H3K27me3 > x_thres, RNA < y_thres))

            k27_plot <- ggplot(cr_rna_match, aes(x = H3K27me3, y = RNA, color = H3K4me2)) +
                geom_point(size = 0.5) +
                geom_point(aes(x = H3K27me3, y = RNA), data = highlighted, size = 2, color = "#ff1c1c") +
                geom_text_repel(aes(x = H3K27me3, y = RNA, label = gene_name), data = highlighted, size = 5, color = "#ff1c1c") +
                theme_bw(base_size = 25) +
                xlab("H3K27me3: log2(scaled count + 1)") +
                ylab("RNA-seq: log2(expected count + 1)") +
                scale_color_viridis() +
                geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
                geom_hline(yintercept = y_thres, linetype = "dashed", color = "grey10") +
                annotate("text", x = c(0, 10, 0, 10), y = c(20, 20, -1, -1), label = paste0(c(panel1, panel2, panel3, panel4), " genes")) +
                ggtitle(rna_select_cond)
            print(k27_plot)
        }
        dev.off()
    }
}

In [None]:
## for healthy donors summary gene number in each panel
## For every cell type and donor, count the genes in each quadrant of the
## histone-signal-vs-RNA plane (Panel1: low signal / high RNA, Panel2: high
## signal / high RNA, Panel3: low signal / low RNA, Panel4: high signal / low
## RNA), once for H3K4me2 and once for H3K27me3, then plot the counts.

# Number of genes per quadrant; rows with NA in either vector are not counted.
count_quadrants <- function(signal, rna, x_thres, y_thres) {
    c(
        sum(signal <= x_thres & rna >= y_thres, na.rm = TRUE),
        sum(signal > x_thres & rna >= y_thres, na.rm = TRUE),
        sum(signal <= x_thres & rna < y_thres, na.rm = TRUE),
        sum(signal > x_thres & rna < y_thres, na.rm = TRUE)
    )
}

# Matching RNA-seq rows to CUT&RUN rows by lower-cased gene name is the same
# for every cell type and donor, so it is done once. (The curated gene list is
# not needed for counting, so this cell no longer depends on small_list or on
# a gene_group left over from another cell.)
rna_select_data <- rna_expCount
cr_select_data <- normMat$H3K27me3

rna_gene <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)

rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[which(!is.na(rna_gene_in_cr))]

# One data frame per (cell type, donor, histone mark); bound once at the end.
summary_rows <- list()
for (cT in c("N", "CM", "EM", "EMRA")) {
    for (hd in paste0("HD", 5:7)) {
        rna_select_cond <- paste0("Input_", cT, "_", hd)
        cr_select_cond <- paste0(cT, "_Input_", hd)
        cr_col_index <- which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
        rna_col_index <- which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

        cr_rna_match <- data.frame(
            H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1),
            H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1),
            RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1),
            gene_name = rna_gene[rna_row_index] %>% toupper
        )

        x_thres <- 4.5
        y_thres <- 2.5
        summary_rows[[length(summary_rows) + 1]] <- data.frame(
            gene_num = count_quadrants(cr_rna_match$H3K4me2, cr_rna_match$RNA, x_thres, y_thres),
            panel = paste0("Panel", 1:4), celltype = cT, histone = "H3K4me2", hd = hd
        )

        x_thres <- 2.5
        y_thres <- 2.5
        summary_rows[[length(summary_rows) + 1]] <- data.frame(
            gene_num = count_quadrants(cr_rna_match$H3K27me3, cr_rna_match$RNA, x_thres, y_thres),
            panel = paste0("Panel", 1:4), celltype = cT, histone = "H3K27me3", hd = hd
        )
    }
}
gene_num_panel_summary <- do.call(rbind, summary_rows)
gene_num_panel_summary$celltype <- factor(gene_num_panel_summary$celltype, levels = c("N", "CM", "EM", "EMRA"))

# Stacked bars of quadrant counts per donor, faceted by histone mark and cell type.
gene_num_plot <- gene_num_panel_summary %>% ggplot(aes(hd, y = gene_num, fill = panel)) +
    geom_bar(stat = "identity", position =  "stack") +
    facet_grid(histone~celltype) +
    scale_fill_brewer(palette = "Set1") +
    theme_bw(base_size = 25) +
    xlab("") +
    ylab("# of Genes")

# Same figure twice: once into the PDF, once for inline display.
pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_geneNum_per_quadrant_per_donor.pdf"), width = 13, height =  11)
print(gene_num_plot)
dev.off()

gene_num_plot


In [65]:
## get the gene name in each panel
## For every cell type, average the log2 signals over its samples, split the
## genes into the four quadrants of the histone-signal-vs-RNA plane (once for
## H3K4me2, once for H3K27me3) and write each quadrant to its own CSV together
## with the per-sample raw values.

# Write the four quadrant tables of one histone mark / cell type.
write_panel_tables <- function(panel_tables, histone, cT) {
    for (panel in names(panel_tables)) {
        write.csv(panel_tables[[panel]], file = paste0(fig_path, "/CSV/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_geneInfo_", histone, "_celltype", cT, "_", panel, ".csv"), row.names = FALSE)
    }
}

# Matching RNA-seq rows to CUT&RUN rows by lower-cased gene name is identical
# for every cell type, so it is done once. (This cell neither needs the curated
# gene list nor resets gene_num_panel_summary any more.)
rna_select_data <- rna_expCount
cr_select_data <- normMat$H3K27me3

rna_gene <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)

rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[which(!is.na(rna_gene_in_cr))]

for (cT in c("N", "CM", "EM", "EMRA")) {
    rna_select_cond <- paste0("Input_", cT)
    cr_select_cond <- paste0(cT, "_Input")
    cr_col_index <- which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index <- which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))
    if (cT == "EM") {
        # "Input_EM" is a prefix of "Input_EMRA", so the EM pattern would also
        # pick up the EMRA samples; drop them (same guard as in the bivalent
        # gene category cell).
        cr_col_index_rm <- which(cr_select_data %>% colnames %>% stringr::str_detect("EMRA_Input"))
        rna_col_index_rm <- which(rna_select_data %>% colnames %>% stringr::str_detect("Input_EMRA"))
        cr_col_index <- setdiff(cr_col_index, cr_col_index_rm)
        rna_col_index <- setdiff(rna_col_index, rna_col_index_rm)
    }

    # Per-gene mean of the log2 values over the samples of this cell type.
    cr_rna_match <- data.frame(
        H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans,
        gene_name = rna_gene[rna_row_index] %>% toupper
    )

    # Means first, then the per-sample RNA, H3K4me2 and H3K27me3 values.
    cr_rna_match_full <- cr_rna_match %>% dplyr::select(gene_name, RNA, H3K4me2, H3K27me3) %>%
        cbind(., data.frame(rna_select_data)[rna_row_index, rna_col_index]) %>%
        cbind(., normMat$H3K4me2[cr_row_index, cr_col_index]) %>%
        cbind(., normMat$H3K27me3[cr_row_index, cr_col_index])

    x_thres <- 4.5
    y_thres <- 2.5
    write_panel_tables(list(
        panel1 = cr_rna_match_full %>% dplyr::filter(H3K4me2 <= x_thres, RNA >= y_thres),
        panel2 = cr_rna_match_full %>% dplyr::filter(H3K4me2 > x_thres, RNA >= y_thres),
        panel3 = cr_rna_match_full %>% dplyr::filter(H3K4me2 <= x_thres, RNA < y_thres),
        panel4 = cr_rna_match_full %>% dplyr::filter(H3K4me2 > x_thres, RNA < y_thres)
    ), "H3K4me2", cT)

    x_thres <- 2.5
    y_thres <- 2.5
    write_panel_tables(list(
        panel1 = cr_rna_match_full %>% dplyr::filter(H3K27me3 <= x_thres, RNA >= y_thres),
        panel2 = cr_rna_match_full %>% dplyr::filter(H3K27me3 > x_thres, RNA >= y_thres),
        panel3 = cr_rna_match_full %>% dplyr::filter(H3K27me3 <= x_thres, RNA < y_thres),
        panel4 = cr_rna_match_full %>% dplyr::filter(H3K27me3 > x_thres, RNA < y_thres)
    ), "H3K27me3", cT)
}


In [None]:
## bivalent gene categories
## Build gene_hist_group: one row per gene found in both data sets and, for
## each cell type, the mean log2 H3K27me3 / H3K4me2 / RNA values, a 0/1 call
## per histone mark (H3K27me3 > 2.5, H3K4me2 > 4.5) and the per-sample log2
## values, all bound column-wise.

# Inputs and gene matching are the same for every cell type.
rna_select_data <- rna_expCount
colnames(rna_select_data) <- paste0("RNA_", colnames(rna_select_data))
cr_select_data <- normMat$H3K27me3

rna_gene <- tolower(gsub(".*_", "", rownames(rna_select_data)))
cr_gene <- tolower(rownames(cr_select_data))
rna_gene_in_cr <- match(rna_gene, cr_gene)

rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[which(!is.na(rna_gene_in_cr))]

gene_hist_group <- c()
for (cT in c("N", "CM", "EM", "EMRA")) {
    rna_select_cond <- paste0("Input_", cT)
    cr_select_cond <- paste0(cT, "_Input")

    cr_col_index <- which(stringr::str_detect(colnames(cr_select_data), cr_select_cond))
    rna_col_index <- which(stringr::str_detect(colnames(rna_select_data), rna_select_cond))
    if (cT == "EM") {
        # Remove any EMRA columns from the EM selection.
        cr_col_index_rm <- which(stringr::str_detect(colnames(cr_select_data), "EMRA_Input"))
        rna_col_index_rm <- which(stringr::str_detect(colnames(rna_select_data), "Input_EMRA"))
        cr_col_index <- setdiff(cr_col_index, cr_col_index_rm)
        rna_col_index <- setdiff(rna_col_index, rna_col_index_rm)
    }

    # Per-sample log2 values, reused for the means and for the appended columns.
    k27_log <- log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1)
    k4_log <- log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1)
    rna_log <- log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1)

    cr_rna_match <- data.frame(
        gene_name = toupper(rna_gene[rna_row_index]),
        H3K27me3 = rowMeans(k27_log),
        H3K4me2 = rowMeans(k4_log),
        RNA = rowMeans(rna_log)
    ) %>%
        mutate(H3K27me3_group = 0, H3K4me2_group = 0) %>%
        cbind(., k27_log) %>%
        cbind(., k4_log) %>%
        cbind(., rna_log)

    # 0/1 call per mark: 1 when the mean log2 signal is above the cutoff.
    cr_rna_match$H3K27me3_group[which(cr_rna_match$H3K27me3 > 2.5)] <- 1
    cr_rna_match$H3K4me2_group[which(cr_rna_match$H3K4me2 > 4.5)] <- 1

    # Tag the three mean columns and the two call columns with the cell type.
    summary_cols <- 2:6
    colnames(cr_rna_match)[summary_cols] <- paste0(colnames(cr_rna_match)[summary_cols], "_", cT)

    if (length(gene_hist_group) == 0) {
        gene_hist_group <- cr_rna_match
    } else {
        # gene_name is already present from the first cell type.
        gene_hist_group <- cbind(gene_hist_group, cr_rna_match[, 2:ncol(cr_rna_match)])
    }
    print(colnames(gene_hist_group))
}


In [None]:
## Genes whose H3K27me3 call never increases (N >= CM >= EM >= EMRA) and whose
## H3K4me2 call never decreases (N <= CM <= EM <= EMRA), with both marks
## changing somewhere along the way (call sums neither 0 nor 4).
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_active_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum != 0, H3K27me3_group_sum != 4,
        H3K4me2_group_sum != 0, H3K4me2_group_sum != 4
    ) %>%
    dplyr::filter(
        H3K27me3_group_N >= H3K27me3_group_CM,
        H3K27me3_group_CM >= H3K27me3_group_EM,
        H3K27me3_group_EM >= H3K27me3_group_EMRA
    ) %>%
    dplyr::filter(
        H3K4me2_group_N <= H3K4me2_group_CM,
        H3K4me2_group_CM <= H3K4me2_group_EM,
        H3K4me2_group_EM <= H3K4me2_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_activation_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )

In [None]:
## Genes whose H3K27me3 call never decreases (N <= CM <= EM <= EMRA) and whose
## H3K4me2 call never increases (N >= CM >= EM >= EMRA), with both marks
## changing somewhere along the way (call sums neither 0 nor 4).
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_repress_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum != 0, H3K27me3_group_sum != 4,
        H3K4me2_group_sum != 0, H3K4me2_group_sum != 4
    ) %>%
    dplyr::filter(
        H3K27me3_group_N <= H3K27me3_group_CM,
        H3K27me3_group_CM <= H3K27me3_group_EM,
        H3K27me3_group_EM <= H3K27me3_group_EMRA
    ) %>%
    dplyr::filter(
        H3K4me2_group_N >= H3K4me2_group_CM,
        H3K4me2_group_CM >= H3K4me2_group_EM,
        H3K4me2_group_EM >= H3K4me2_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_repression_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )

In [None]:
## Poised to activated
## H3K4me2 is called in all four cell types (sum == 4) while the H3K27me3 call
## is present in some but not all of them and never increases from N to EMRA.
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_posie_active_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum != 0, H3K27me3_group_sum != 4,
        H3K4me2_group_sum == 4
    ) %>%
    dplyr::filter(
        H3K27me3_group_N >= H3K27me3_group_CM,
        H3K27me3_group_CM >= H3K27me3_group_EM,
        H3K27me3_group_EM >= H3K27me3_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_poised_to_activated_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )
head(bivalent_posie_active_gene)

In [None]:
## Poised to further repressed
## H3K27me3 is called in all four cell types (sum == 4) while the H3K4me2 call
## is present in some but not all of them and never increases from N to EMRA.
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_posie_repress_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum == 4,
        H3K4me2_group_sum != 0, H3K4me2_group_sum != 4
    ) %>%
    dplyr::filter(
        H3K27me3_group_N >= H3K27me3_group_CM,
        H3K27me3_group_CM >= H3K27me3_group_EM,
        H3K27me3_group_EM >= H3K27me3_group_EMRA
    ) %>%
    dplyr::filter(
        H3K4me2_group_N >= H3K4me2_group_CM,
        H3K4me2_group_CM >= H3K4me2_group_EM,
        H3K4me2_group_EM >= H3K4me2_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_poised_to_further_repression_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )

In [None]:
## No mark to a single active mark
## H3K27me3 is never called (sum == 0) while the H3K4me2 call is present in
## some but not all cell types and never decreases from N to EMRA.
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_none_active_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum == 0,
        H3K4me2_group_sum != 0, H3K4me2_group_sum != 4
    ) %>%
    dplyr::filter(
        H3K4me2_group_N <= H3K4me2_group_CM,
        H3K4me2_group_CM <= H3K4me2_group_EM,
        H3K4me2_group_EM <= H3K4me2_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_none_to_activation_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )
dim(bivalent_none_active_gene)
head(bivalent_none_active_gene)

In [None]:
## None to only repress marker
## H3K4me2 is never called (sum == 0) while the H3K27me3 call is present in
## some but not all cell types and never decreases from N to EMRA.
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_none_repress_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum != 0, H3K27me3_group_sum != 4,
        H3K4me2_group_sum == 0
    ) %>%
    dplyr::filter(
        H3K27me3_group_N <= H3K27me3_group_CM,
        H3K27me3_group_CM <= H3K27me3_group_EM,
        H3K27me3_group_EM <= H3K27me3_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_none_to_repression_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )
dim(bivalent_none_repress_gene)
head(bivalent_none_repress_gene)

In [None]:
## None to both active and repress marker
## Both the H3K27me3 and the H3K4me2 call are present in some but not all cell
## types and neither call ever decreases from N to EMRA.
## The full filtered table goes to CSV; only the 0/1 calls are kept in memory.
bivalent_none_both_gene <- gene_hist_group %>%
    mutate(
        H3K27me3_group_sum = H3K27me3_group_N + H3K27me3_group_CM + H3K27me3_group_EM + H3K27me3_group_EMRA,
        H3K4me2_group_sum = H3K4me2_group_N + H3K4me2_group_CM + H3K4me2_group_EM + H3K4me2_group_EMRA
    ) %>%
    dplyr::filter(
        H3K27me3_group_sum != 0, H3K27me3_group_sum != 4,
        H3K4me2_group_sum != 0, H3K4me2_group_sum != 4
    ) %>%
    dplyr::filter(
        H3K27me3_group_N <= H3K27me3_group_CM,
        H3K27me3_group_CM <= H3K27me3_group_EM,
        H3K27me3_group_EM <= H3K27me3_group_EMRA
    ) %>%
    dplyr::filter(
        H3K4me2_group_N <= H3K4me2_group_CM,
        H3K4me2_group_CM <= H3K4me2_group_EM,
        H3K4me2_group_EM <= H3K4me2_group_EMRA
    ) %T>%
    write.csv(file = "CART_CUTRUN_Project/results/paper_figure/RNA_vs_CUTRUN/Bivalent_Genes_none_to_both_from_N_to_EMRA_withHDcounts.csv") %>%
    dplyr::select(
        gene_name,
        H3K27me3_group_N, H3K27me3_group_CM, H3K27me3_group_EM, H3K27me3_group_EMRA,
        H3K4me2_group_N, H3K4me2_group_CM, H3K4me2_group_EM, H3K4me2_group_EMRA
    )
dim(bivalent_none_both_gene)
head(bivalent_none_both_gene)

In [None]:
## match the gene name -- original figure
## Per cell type: mean log2 H3K4me2 / H3K27me3 signal vs. mean log2 RNA-seq
## expected count, coloured by the other histone mark and by log gene length.
## All plots go into one PDF; the last cell type is also shown inline.

# Scatterplot of one histone signal (x_var) against RNA, coloured by color_var.
plot_signal_vs_rna <- function(df, x_var, color_var, y_lab, title, point_size = 0.5) {
    ggplot(df, aes(x = .data[[x_var]], y = RNA, color = .data[[color_var]])) +
        geom_point(size = point_size) +
        theme_bw(base_size = 25) +
        labs(x = paste0(x_var, ": log2(scaled count + 1)"), y = y_lab, color = color_var) +
        scale_color_viridis() +
        ggtitle(title)
}
rna_y_lab <- "RNA-seq: log2(expected count + 1)"

rna_select_data <- rna_expCount
cr_select_data <- normMat$H3K27me3

# De-duplicate gene names but remember which row of rna_select_data each kept
# name came from. Indexing the rows with positions inside the de-duplicated
# vector would select the wrong rows as soon as a gene name occurs twice.
rna_gene_all <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
rna_keep_rows <- which(!duplicated(rna_gene_all))
rna_gene <- rna_gene_all[rna_keep_rows]
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)

has_match <- !is.na(rna_gene_in_cr)
rna_row_index <- rna_keep_rows[has_match]
cr_row_index <- rna_gene_in_cr[has_match]

pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_expectedCounts.pdf"), width = 13, height =  11)
for (cT in c("N", "CM", "EM", "EMRA")) {
    rna_select_cond <- paste0("Input_", cT)
    cr_select_cond <- paste0(cT, "_Input")

    cr_col_index <- which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index <- which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))
    if (cT == "EM") {
        # "Input_EM" is a prefix of "Input_EMRA", so the EM pattern would also
        # pick up the EMRA samples; drop them (same guard as in the bivalent
        # gene category cell).
        cr_col_index <- setdiff(cr_col_index, which(cr_select_data %>% colnames %>% stringr::str_detect("EMRA_Input")))
        rna_col_index <- setdiff(rna_col_index, which(rna_select_data %>% colnames %>% stringr::str_detect("Input_EMRA")))
    }

    # NOTE(review): gene_length assumes common_gene has one row per matched
    # gene, in the same order -- confirm against where common_gene is built.
    cr_rna_match <- data.frame(
        H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans,
        gene_length = log(common_gene$width_uniq)
    )

    print(plot_signal_vs_rna(cr_rna_match, "H3K4me2", "H3K27me3", rna_y_lab, rna_select_cond))
    print(plot_signal_vs_rna(cr_rna_match, "H3K27me3", "H3K4me2", rna_y_lab, rna_select_cond))
    print(plot_signal_vs_rna(cr_rna_match, "H3K4me2", "gene_length", rna_y_lab, rna_select_cond))
    print(plot_signal_vs_rna(cr_rna_match, "H3K27me3", "gene_length", rna_y_lab, rna_select_cond))
}
dev.off()

# Inline preview of the last cell type processed above.
options(repr.plot.width=12, repr.plot.height=9)
plot_signal_vs_rna(cr_rna_match, "H3K4me2", "H3K27me3", rna_y_lab, rna_select_cond, point_size = 0.1)

plot_signal_vs_rna(cr_rna_match, "H3K27me3", "H3K4me2", rna_y_lab, rna_select_cond, point_size = 0.1)



In [None]:
## match the gene name -- original figure --TPM version
## Per cell type: mean log2 H3K4me2 / H3K27me3 signal vs. mean log2 RNA-seq
## TPM, coloured by the other histone mark. All plots go into one PDF; the
## last cell type is also shown inline.

# Scatterplot of one histone signal (x_var) against RNA, coloured by color_var.
plot_signal_vs_rna <- function(df, x_var, color_var, y_lab, title, point_size = 0.5) {
    ggplot(df, aes(x = .data[[x_var]], y = RNA, color = .data[[color_var]])) +
        geom_point(size = point_size) +
        theme_bw(base_size = 25) +
        labs(x = paste0(x_var, ": log2(scaled count + 1)"), y = y_lab, color = color_var) +
        scale_color_viridis() +
        ggtitle(title)
}
# The RNA values in this cell are TPM, so the axis must not say "expected count".
rna_y_lab <- "RNA-seq: log2(TPM + 1)"

# Matching RNA-seq rows to CUT&RUN rows by lower-cased gene name is identical
# for every cell type, so it is done once.
rna_select_data <- rna_TPM
cr_select_data <- normMat$H3K27me3

rna_gene <- rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
cr_gene <- rownames(cr_select_data) %>% tolower
rna_gene_in_cr <- match(rna_gene, cr_gene)

rna_row_index <- which(!is.na(rna_gene_in_cr))
cr_row_index <- rna_gene_in_cr[which(!is.na(rna_gene_in_cr))]

pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_TPMversion.pdf"), width = 13, height =  11)
for (cT in c("N", "CM", "EM", "EMRA")) {
    rna_select_cond <- paste0("Input_", cT)
    cr_select_cond <- paste0(cT, "_Input")

    cr_col_index <- which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index <- which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))
    if (cT == "EM") {
        # "Input_EM" is a prefix of "Input_EMRA", so the EM pattern would also
        # pick up the EMRA samples; drop them (same guard as in the bivalent
        # gene category cell).
        cr_col_index <- setdiff(cr_col_index, which(cr_select_data %>% colnames %>% stringr::str_detect("EMRA_Input")))
        rna_col_index <- setdiff(rna_col_index, which(rna_select_data %>% colnames %>% stringr::str_detect("Input_EMRA")))
    }

    # Per-gene mean of the log2 values over the samples of this cell type.
    cr_rna_match <- data.frame(
        H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans
    )

    print(plot_signal_vs_rna(cr_rna_match, "H3K4me2", "H3K27me3", rna_y_lab, rna_select_cond))
    print(plot_signal_vs_rna(cr_rna_match, "H3K27me3", "H3K4me2", rna_y_lab, rna_select_cond))
}
dev.off()

# Inline preview of the last cell type processed above.
options(repr.plot.width=12, repr.plot.height=9)
plot_signal_vs_rna(cr_rna_match, "H3K4me2", "H3K27me3", rna_y_lab, rna_select_cond, point_size = 0.1)

plot_signal_vs_rna(cr_rna_match, "H3K27me3", "H3K4me2", rna_y_lab, rna_select_cond, point_size = 0.1)


In [None]:
## match the gene name -- original figure --FPKM version
## For each cell type: pair RNA-seq rows (rna_FPKM) with histone-signal rows (normMat)
## by gene name, average log2(value + 1) over the columns of that cell type, and write
## two scatter plots (RNA vs H3K4me2, RNA vs H3K27me3) to one PDF under fig_path.
pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_FPKMversion.pdf"), width = 13, height =  11)
for(cT in c("N", "CM", "EM", "EMRA")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")
    ## "Input_EM" is also a prefix of "Input_EMRA": without the negative lookahead the
    ## EM iteration would average the EMRA RNA-seq columns in as well.
    rna_col_pattern = if(cT == "EM") "Input_EM(?!RA)" else rna_select_cond

    rna_select_data = rna_FPKM
    cr_select_data = normMat$H3K27me3

    ## RNA row names: drop everything up to the last "_" and compare lower-cased
    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    cr_gene = rownames(cr_select_data) %>% tolower
    rna_gene_in_cr = match(rna_gene, cr_gene)

    ## rows of the RNA table that have a partner, and the partner row in the histone table
    rna_row_index = which(!is.na(rna_gene_in_cr))
    cr_row_index = rna_gene_in_cr[rna_row_index]

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_col_pattern))

    ## NOTE(review): the row/column indices are derived from normMat$H3K27me3 and reused
    ## for normMat$H3K4me2 -- assumes both matrices share row and column order; confirm.
    cr_rna_match = data.frame(
        H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans
    )

    ## explicit print(): ggplot objects are not auto-printed inside a for loop
    print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = RNA, color = H3K27me3)) +
    geom_point(size = 0.5) +
    # geom_hex(bins = 300) +
    theme_bw(base_size = 25) +
    xlab("H3K4me2: log2(scaled count + 1)") +
    ylab("RNA-seq: log2(FPKM + 1)") +
    scale_color_viridis() +
    ggtitle(rna_select_cond))

    print(cr_rna_match %>% ggplot(aes(x = H3K27me3, y = RNA, color = H3K4me2)) +
    geom_point(size = 0.5) +
    # geom_hex(bins = 300) +
    theme_bw(base_size = 25) +
    xlab("H3K27me3: log2(scaled count + 1)") +
    ylab("RNA-seq: log2(FPKM + 1)") +
    scale_color_viridis() +
    ggtitle(rna_select_cond))

}
dev.off()

## Inline copies for the notebook: after the loop cr_rna_match / rna_select_cond hold the
## values of the last iteration ("EMRA").
options(repr.plot.width=12, repr.plot.height=9)
cr_rna_match %>% ggplot(aes(x = H3K4me2, y = RNA, color = H3K27me3)) +
geom_point(size = 0.1) +
# geom_hex(bins = 300) +
theme_bw(base_size = 25) +
xlab("H3K4me2: log2(scaled count + 1)") +
ylab("RNA-seq: log2(FPKM + 1)") +
scale_color_viridis() +
ggtitle(rna_select_cond)

cr_rna_match %>% ggplot(aes(x = H3K27me3, y = RNA, color = H3K4me2)) +
geom_point(size = 0.1) +
# geom_hex(bins = 300) +
theme_bw(base_size = 25) +
xlab("H3K27me3: log2(scaled count + 1)") +
ylab("RNA-seq: log2(FPKM + 1)") +
scale_color_viridis() +
ggtitle(rna_select_cond)



In [29]:
options(repr.plot.width=18, repr.plot.height=11)

## check the bivalent peaks
## For every gene group in small_list, write one PDF with four pages per cell type:
## H3K4me2 vs H3K27me3 scatter plots coloured by a four-way panel label (once unfaceted,
## once faceted by panel), first with panels defined from H3K4me2 and RNA, then from
## H3K27me3 and RNA. The genes of the group are drawn in red and labelled.
for(gene_group in unique(small_list$Group)){
## mean across healthy donors
    pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_TSSneighborHistoneSignal_scatterplot_SmallCuratedGeneList_", gsub(" ", "_", gene_group), "_BivalentDecipher.pdf"), width = 18, height =  16)
    for(cT in c("N", "CM", "EM", "EMRA")){
        rna_select_cond = paste0("Input_", cT)
        cr_select_cond = paste0(cT, "_Input")
        ## "Input_EM" is also a prefix of "Input_EMRA": without the negative lookahead the
        ## EM iteration would average the EMRA RNA-seq columns in as well.
        rna_col_pattern = if(cT == "EM") "Input_EM(?!RA)" else rna_select_cond

        rna_select_data = rna_expCount
        cr_select_data = normMat$H3K27me3

        ## RNA row names: drop everything up to the last "_" and compare lower-cased
        rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
        cr_gene = rownames(cr_select_data) %>% tolower
        rna_gene_in_cr = match(rna_gene, cr_gene)

        ## rows of the RNA table that have a partner, and the partner row in the histone table
        rna_row_index = which(!is.na(rna_gene_in_cr))
        cr_row_index = rna_gene_in_cr[rna_row_index]
        
        ## lower-cased gene names of the current group
        target_gene = small_list %>% dplyr::filter(Group == gene_group) %$% Gene %>% tolower

        cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
        rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_col_pattern))

        ## NOTE(review): the row/column indices are derived from normMat$H3K27me3 and reused
        ## for normMat$H3K4me2 -- assumes both matrices share row and column order; confirm.
        cr_rna_match = data.frame(
            H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
            H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
            RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans,
            gene_name = rna_gene[rna_row_index] %>% toupper
        )

        ## Panels from H3K4me2 (x_thres) and RNA (y_thres). "I" is the default and is what
        ## remains after the three assignments below (low H3K4me2, high RNA).
        x_thres = 4.5
        y_thres = 2.5
        cr_rna_match$H3K4me2_panel = "I"
        cr_rna_match$H3K4me2_panel[which(cr_rna_match$H3K4me2 > x_thres & cr_rna_match$RNA >= y_thres)] = "II"
        cr_rna_match$H3K4me2_panel[which(cr_rna_match$H3K4me2 <= x_thres & cr_rna_match$RNA < y_thres)] = "III"
        cr_rna_match$H3K4me2_panel[which(cr_rna_match$H3K4me2 > x_thres & cr_rna_match$RNA < y_thres)] = "IV"

        ## rows of the curated genes; taken after the panel column exists so the
        ## highlighted points land in the right facet
        target_df = cr_rna_match[which(rna_gene[rna_row_index] %in% target_gene), ]

        ## coord_cartesian() has to be inside print(): added to the value returned by
        ## print() it would never reach the page that is drawn.
        print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = H3K27me3, color = H3K4me2_panel)) +
        geom_point(size = 0.5) +
        geom_point(aes(x = H3K4me2, y = H3K27me3), data = target_df, size = 2, color = "#ff1c1c") +
        geom_text_repel(aes(x = H3K4me2, y = H3K27me3, label = gene_name), data = target_df, size = 5, color = "#ff1c1c") +
        theme_bw(base_size = 25) +
        xlab("H3K4me2: log2(scaled count + 1)") +
        ylab("H3K27me3: log2(scaled count + 1)") +
        scale_color_brewer(palette = "Set1") +
        geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
        geom_hline(yintercept = 2.5, linetype = "dashed", color = "grey10") +
        guides(colour = guide_legend(override.aes = list(size=10))) +
        ggtitle(rna_select_cond) +
        coord_cartesian(ylim = c(0, 8), xlim = c(0, 10)))
        
        print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = H3K27me3, color = H3K4me2_panel)) +
        geom_point(size = 0.5) +
        geom_point(aes(x = H3K4me2, y = H3K27me3), data = target_df, size = 2, color = "#ff1c1c") +
        geom_text_repel(aes(x = H3K4me2, y = H3K27me3, label = gene_name), data = target_df, size = 5, color = "#ff1c1c") +
        theme_bw(base_size = 25) +
        facet_wrap(~H3K4me2_panel) +
        xlab("H3K4me2: log2(scaled count + 1)") +
        ylab("H3K27me3: log2(scaled count + 1)") +
        scale_color_brewer(palette = "Set1") +
        geom_vline(xintercept = x_thres, linetype = "dashed", color = 'grey10') +
        geom_hline(yintercept = 2.5, linetype = "dashed", color = "grey10") +
        guides(colour = guide_legend(override.aes = list(size=10))) +
        ggtitle(rna_select_cond) +
        coord_cartesian(ylim = c(0, 8), xlim = c(0, 10)))

        ## Panels from H3K27me3 (x_thres) and RNA (y_thres), same labelling scheme.
        x_thres = 2.5
        y_thres = 2.5
        cr_rna_match$H3K27me3_panel = "I"
        cr_rna_match$H3K27me3_panel[which(cr_rna_match$H3K27me3 > x_thres & cr_rna_match$RNA >= y_thres)] = "II"
        cr_rna_match$H3K27me3_panel[which(cr_rna_match$H3K27me3 <= x_thres & cr_rna_match$RNA < y_thres)] = "III"
        cr_rna_match$H3K27me3_panel[which(cr_rna_match$H3K27me3 > x_thres & cr_rna_match$RNA < y_thres)] = "IV"

        ## refresh the highlight rows so they carry the H3K27me3_panel column too
        target_df = cr_rna_match[which(rna_gene[rna_row_index] %in% target_gene), ]

        print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = H3K27me3, color = H3K27me3_panel)) +
        geom_point(size = 0.5) +
        geom_point(aes(x = H3K4me2, y = H3K27me3), data = target_df, size = 2, color = "#ff1c1c") +
        geom_text_repel(aes(x = H3K4me2, y = H3K27me3, label = gene_name), data = target_df, size = 5, color = "#ff1c1c") +
        theme_bw(base_size = 25) +
        xlab("H3K4me2: log2(scaled count + 1)") +
        ylab("H3K27me3: log2(scaled count + 1)") +
        scale_color_brewer(palette = "Set1") +
        geom_vline(xintercept = 4.5, linetype = "dashed", color = 'grey10') +
        geom_hline(yintercept = y_thres, linetype = "dashed", color = "grey10") +
        guides(colour = guide_legend(override.aes = list(size=10))) +
        ggtitle(rna_select_cond) +
        coord_cartesian(ylim = c(0, 8), xlim = c(0, 10)))

        print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = H3K27me3, color = H3K27me3_panel)) +
        geom_point(size = 0.5) +
        geom_point(aes(x = H3K4me2, y = H3K27me3), data = target_df, size = 2, color = "#ff1c1c") +
        geom_text_repel(aes(x = H3K4me2, y = H3K27me3, label = gene_name), data = target_df, size = 5, color = "#ff1c1c") +
        theme_bw(base_size = 25) +
        facet_wrap(~H3K27me3_panel) +
        xlab("H3K4me2: log2(scaled count + 1)") +
        ylab("H3K27me3: log2(scaled count + 1)") +
        scale_color_brewer(palette = "Set1") +
        geom_vline(xintercept = 4.5, linetype = "dashed", color = 'grey10') +
        geom_hline(yintercept = y_thres, linetype = "dashed", color = "grey10") +
        guides(colour = guide_legend(override.aes = list(size=10))) +
        ggtitle(rna_select_cond) +
        coord_cartesian(ylim = c(0, 8), xlim = c(0, 10)))
    }
    dev.off()
}

### METHOD 2: Consider all peaks that are near a TSS and normalize by peak length

In [None]:
## load peak information and match with nearest genes
outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))

## Keep only peaks narrower than the 75th percentile of peak width, and drop the same
## rows from the count matrix. Filtering mPeak alone would leave peak_id (an index into
## the filtered mPeak) and the peak widths used below pointing at the wrong count rows.
for(hist in c("H3K4me2", "H3K27me3")){
    peak_width_all = width(mPeak[[hist]])
    keep_index = which(peak_width_all < quantile(peak_width_all, 0.75))
    ## NOTE(review): assumes the rows of countMat[[hist]] are in the same order as the
    ## peaks of mPeak[[hist]] as loaded -- confirm against the code that wrote the RData.
    if(nrow(countMat[[hist]]) == length(peak_width_all)){
        countMat[[hist]] = countMat[[hist]][keep_index, , drop = FALSE]
    }
    mPeak[[hist]] = mPeak[[hist]][keep_index]
    if(nrow(countMat[[hist]]) != length(mPeak[[hist]])){
        stop("countMat$", hist, " has ", nrow(countMat[[hist]]), " rows but ", length(mPeak[[hist]]),
             " peaks remain after the width filter; rows and peaks cannot be paired", call. = FALSE)
    }
}

## pair every peak with its nearest gene(s) in gene_gr, keeping pairs within max_distance bp;
## peak_id is the index of the peak in mPeak[[hist]]
max_distance = 5000
peak_match_gene = list()
for(hist in c("H3K4me2", "H3K27me3")){
    dp_match_gene = distanceToNearest(mPeak[[hist]], gene_gr, select = "all") %>% data.frame %>% dplyr::filter(distance <= max_distance)
    peak_match_gene[[hist]] = data.frame(peak_id = dp_match_gene$queryHits, gene_name = gene_gr$gene_name[dp_match_gene$subjectHits]) #%>% group_by(peak_id) %>% summarize(nearest_genes = paste(gene_name, collapse = ";"))

}

## normalize the peak count by sequencing depth
## (count / sample depth * largest depth, then log1p)
normMat = countMat
for(hist in c("H3K4me2", "H3K27me3")){
    for(sample_each in colnames(countMat[[hist]])){
        normMat[[hist]][, sample_each] = log1p(countMat[[hist]][, sample_each]/designInfo$depth[which(designInfo$exps == sample_each)] * max(designInfo$depth))
    }
}

head(countMat$H3K4me2)
head(normMat$H3K4me2)

## Method I.
## normalize peak count scaled by peak length 
## Row i is divided by the width of peak i and multiplied by the median width. The
## width vector has one entry per row, so the division recycles down each column.
for(hist in c("H3K4me2", "H3K27me3")){
    peak_width = width(mPeak[[hist]])
    normMat[[hist]] = normMat[[hist]] / peak_width * median(peak_width)
}
head(normMat$H3K4me2)

## Method II.
## normalize peak count standardized by peak length 
# for(hist in c("H3K4me2", "H3K27me3")){
    
#     for(i in 1:nrow(normMat[[hist]])){
#         tmp = normMat[[hist]][i, ]
#         normMat[[hist]][i,] =  (tmp - mean(tmp))/sd(tmp)
#     }
# }
# head(normMat$H3K4me2)


In [None]:
## get the matching genes
## For each cell type: take every (peak, gene) pair of `hist` whose gene is present in
## rna_expCount, average log2(value + 1) over the columns of that cell type, and write
## two scatter plots (RNA vs H3K4me2, RNA vs H3K27me3) to one PDF under fig_path.
hist = "H3K4me2"
pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_HistonePeakScalePeakLength_scatterplot.pdf"), width = 13, height =  11)
# pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_HistonePeakStandardizePeakLength_scatterplot.pdf"), width = 13, height =  11)
for(cT in c("N", "CM", "EM", "EMRA")){


        ## tmp[k] = row of rna_expCount whose gene name equals the gene of pair k (NA if none)
        tmp = match(peak_match_gene[[hist]]$gene_name %>% tolower, rownames(rna_expCount) %>% gsub(".*_", "", .) %>% tolower)
        ## Rows of the signal matrix are peaks, so go through peak_id: the position of a
        ## pair inside peak_match_gene is not a peak index.
        cr_row_index = peak_match_gene[[hist]]$peak_id[which(!is.na(tmp))]
        rna_row_index = tmp[which(!is.na(tmp))]

        rna_select_cond = paste0("Input_", cT)
        cr_select_cond = paste0(cT, "_Input")
        ## "Input_EM" is also a prefix of "Input_EMRA": without the negative lookahead the
        ## EM iteration would average the EMRA RNA-seq columns in as well.
        rna_col_pattern = if(cT == "EM") "Input_EM(?!RA)" else rna_select_cond

        rna_select_data = rna_expCount
        cr_select_data = normMat[[hist]]

        cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
        rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_col_pattern))

        ## for scalePeakLength method
        ## NOTE(review): cr_row_index / cr_col_index come from `hist` (H3K4me2) but are also
        ## applied to normMat$H3K27me3. If the two marks have separate peak sets, the
        ## H3K27me3 values are not the signal at these peaks -- confirm before interpreting
        ## that column.
        cr_rna_match = data.frame(
                H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
                H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
                RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans
        )

        ## for standardizePeakLength method
        # cr_rna_match = data.frame(
        #         H3K27me3 = normMat$H3K27me3[cr_row_index, cr_col_index]  %>% rowMeans,
        #         H3K4me2 = normMat$H3K4me2[cr_row_index, cr_col_index] %>% rowMeans,
        #         RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans
        # )

    ## explicit print(): ggplot objects are not auto-printed inside a for loop
    print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = RNA, color = H3K27me3)) +
    geom_point(size = 0.5) +
    # geom_hex(bins = 300) +
    theme_bw(base_size = 25) +
    xlab("H3K4me2: log2(scaled count + 1)") +
    ylab("RNA-seq: log2(expected count + 1)") +
    scale_color_viridis() +
    ggtitle(rna_select_cond))

    print(cr_rna_match %>% ggplot(aes(x = H3K27me3, y = RNA, color = H3K4me2)) +
    geom_point(size = 0.5) +
    # geom_hex(bins = 300) +
    theme_bw(base_size = 25) +
    xlab("H3K27me3: log2(scaled count + 1)") +
    ylab("RNA-seq: log2(expected count + 1)") +
    scale_color_viridis() +
    ggtitle(rna_select_cond))
}
dev.off()

In [None]:
## For each cell type: pair RNA-seq rows (rna_expCount) with rows of normMat by gene
## name, average log2(value + 1) over the columns of that cell type, and write two
## scatter plots (RNA vs H3K4me2, RNA vs H3K27me3) to one PDF under fig_path.
## NOTE(review): this is the same PDF path that the preceding cell ("get the matching
## genes") opens, so running both leaves only this cell's pages -- confirm intended.
## NOTE(review): the pairing uses rownames(normMat$H3K27me3) as gene names; normMat is
## re-assigned from countMat in the "load peak information" cell -- confirm its row
## names are still gene names at this point, otherwise nothing is matched.
pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_HistonePeakScalePeakLength_scatterplot.pdf"), width = 13, height =  11)
for(cT in c("N", "CM", "EM", "EMRA")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")
    ## "Input_EM" is also a prefix of "Input_EMRA": without the negative lookahead the
    ## EM iteration would average the EMRA RNA-seq columns in as well.
    rna_col_pattern = if(cT == "EM") "Input_EM(?!RA)" else rna_select_cond

    rna_select_data = rna_expCount
    cr_select_data = normMat$H3K27me3

    ## RNA row names: drop everything up to the last "_" and compare lower-cased
    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    cr_gene = rownames(cr_select_data) %>% tolower
    rna_gene_in_cr = match(rna_gene, cr_gene)

    ## rows of the RNA table that have a partner, and the partner row in the histone table
    rna_row_index = which(!is.na(rna_gene_in_cr))
    cr_row_index = rna_gene_in_cr[rna_row_index]

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_col_pattern))

    ## NOTE(review): the row/column indices are derived from normMat$H3K27me3 and reused
    ## for normMat$H3K4me2 -- assumes both matrices share row and column order; confirm.
    cr_rna_match = data.frame(
        H3K27me3 = log2(normMat$H3K27me3[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        H3K4me2 = log2(normMat$H3K4me2[cr_row_index, cr_col_index] + 1) %>% rowMeans,
        RNA = log2(data.frame(rna_select_data)[rna_row_index, rna_col_index] + 1) %>% rowMeans
    )

    ## explicit print(): ggplot objects are not auto-printed inside a for loop
    print(cr_rna_match %>% ggplot(aes(x = H3K4me2, y = RNA, color = H3K27me3)) +
    geom_point(size = 0.5) +
    # geom_hex(bins = 300) +
    theme_bw(base_size = 25) +
    xlab("H3K4me2: log2(scaled count + 1)") +
    ylab("RNA-seq: log2(expected count + 1)") +
    scale_color_viridis() +
    ggtitle(rna_select_cond))

    print(cr_rna_match %>% ggplot(aes(x = H3K27me3, y = RNA, color = H3K4me2)) +
    geom_point(size = 0.5) +
    # geom_hex(bins = 300) +
    theme_bw(base_size = 25) +
    xlab("H3K27me3: log2(scaled count + 1)") +
    ylab("RNA-seq: log2(expected count + 1)") +
    scale_color_viridis() +
    ggtitle(rna_select_cond))

}
dev.off()

## Inline copies for the notebook: after the loop cr_rna_match / rna_select_cond hold the
## values of the last iteration ("EMRA").
options(repr.plot.width=12, repr.plot.height=9)
cr_rna_match %>% ggplot(aes(x = H3K4me2, y = RNA, color = H3K27me3)) +
geom_point(size = 0.1) +
# geom_hex(bins = 300) +
theme_bw(base_size = 25) +
xlab("H3K4me2: log2(scaled count + 1)") +
ylab("RNA-seq: log2(expected count + 1)") +
scale_color_viridis() +
ggtitle(rna_select_cond)

cr_rna_match %>% ggplot(aes(x = H3K27me3, y = RNA, color = H3K4me2)) +
geom_point(size = 0.1) +
# geom_hex(bins = 300) +
theme_bw(base_size = 25) +
xlab("H3K27me3: log2(scaled count + 1)") +
ylab("RNA-seq: log2(expected count + 1)") +
scale_color_viridis() +
ggtitle(rna_select_cond)