In [1]:
## CUTRUN DE and assign to nearest genes

library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)

## Sample annotation vocabularies shared by the RNA-seq cells.
cell_list <- c("N", "CM", "EM", "EMRA")
hd_list <- sprintf("HD%d", c(1:3, 5:7)) # HD4 is deliberately absent from this list
expr_list <- c("Input", "Product", "Stim1", "Stim2", "Stim3")
## All unordered pairs of cell types, in the order N_CM, N_EM, N_EMRA, CM_EM, ...
cell_comp_list <- combn(cell_list, 2, FUN = paste, collapse = "_")
hist_list <- c("H3K27me3", "H3K4me2")

## Input/output locations, all relative to the project results directory.
results_root <- "CART_CUTRUN_Project/results/"
in_path <- paste0(results_root, "RNAseq/process/RSEM/")
out_path <- paste0(results_root, "RNAseq/analysis/RSEM/")
fig_path <- paste0(results_root, "paper_figure/")
inPath <- paste0(results_root, "CUTANDRUN/process/")
outPath <- paste0(results_root, "CUTANDRUN/analysis/")


In [8]:
## CUTRUN pairwise DP
## 1. pairwise
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

## Restore the saved workspace objects this cell works on. Later code in this
## cell reads `countMat`; the other objects come from whatever the files hold.
for (rdata_name in c(
    "masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"
)) {
    load(file = paste0(outPath, "/RData/", rdata_name))
}

## Sample annotation vocabularies for the CUT&RUN cells (camelCase variants).
cellList <- c("N", "CM", "EM", "EMRA")
hdList <- sprintf("HD%d", 1:7)
exprList <- c("Input", "Product", "Stim1", "Stim2", "Stim3")
cellCompList <- combn(cellList, 2, FUN = paste, collapse = "_")
histList <- c("H3K27me3", "H3K4me2")

## Return the replicate label to use for one sample.
##
## The sample is identified by joining `hist`, `cell`, `expr` and `hd` with "_".
## Two samples use "rep2"; every other sample uses "rep1".
checkRep <- function(hist, cell, expr, hd) {
  sample_key <- paste(hist, cell, expr, hd, sep = "_")
  if (sample_key %in% c("H3K27me3_CM_Input_HD1", "H3K27me3_N_Input_HD2")) {
    return("rep2")
  }
  "rep1"
}

## Return the duplicate-handling label for a sample.
##
## Input samples from donors HD1-HD3 get "stringent"; everything else gets
## "normLibSize".
checkDup <- function(expr, hd) {
  is_early_input <- expr == "Input" && hd %in% c("HD1", "HD2", "HD3")
  if (is_early_input) "stringent" else "normLibSize"
}

## Return the BAM file suffix for a sample.
##
## Input samples from donors HD1-HD3 use the sorted, duplicate-removed,
## name-sorted BAM; everything else uses the plain bowtie2 alignment BAM.
checkBam <- function(expr, hd) {
  is_early_input <- expr == "Input" && hd %in% c("HD1", "HD2", "HD3")
  if (is_early_input) {
    return("bowtie2_align.sorted.rmDup.sortName.bam")
  }
  "bowtie2_align.bam"
}

## Return the donor labels that are used for an experimental condition.
##
## "Input" is restricted to HD4-HD7; every other condition uses HD1-HD7.
checkHD <- function(expr) {
  first_donor <- if (expr == "Input") 4 else 1
  paste0("HD", first_donor:7)
}

## Build the sample sheet: one row per (cell type, condition, donor), with the
## donors for each condition given by checkHD().
##
## Rows are collected in a list and bound once at the end instead of growing a
## data.frame with rbind() on every iteration.
target_rows <- list()
for (cell in cellList) {
  for (expr in exprList) {
    for (hd in checkHD(expr)) {
      ## Scalar test, so use `&&` throughout (the original mixed `&` and `&&`).
      ## NOTE(review): checkHD("Input") never yields "HD2", so with the helper
      ## as defined in this file this exclusion cannot trigger - confirm whether
      ## it is still needed.
      is_excluded <- cell == "EMRA" && expr == "Input" && hd == "HD2"
      if (!is_excluded) {
        target_rows[[length(target_rows) + 1L]] <-
          data.frame(cell = cell, expr = expr, humanDonor = hd)
      }
    }
  }
}
## do.call(rbind, list()) is NULL, matching the original's empty result.
target <- do.call(rbind, target_rows)
target

## Consider human donor as random variable.
## Experimental design
## One design column per cell-type/condition group ("<cell>.<expr>"), no intercept.
treat <- factor(paste(target$cell, target$expr, sep = "."))
design <- model.matrix(~ 0 + treat)
colnames(design) <- levels(treat)

## Pairwise cell-type comparisons. Each entry is c(numerator, denominator), so
## contrast "<expr>_N_CM" is CM.<expr> - N.<expr>.
pair_members <- list(
    N_CM = c("CM", "N"),
    N_EM = c("EM", "N"),
    N_EMRA = c("EMRA", "N"),
    CM_EM = c("EM", "CM"),
    CM_EMRA = c("EMRA", "CM"),
    EM_EMRA = c("EMRA", "EM")
)

## Build the 30 contrast formulas (5 conditions x 6 pairs), condition-major so
## the column order is Input_N_CM, Input_N_EM, ..., Stim3_EM_EMRA.
contrast_formulas <- character(0)
for (cond in c("Input", "Product", "Stim1", "Stim2", "Stim3")) {
    for (pair in names(pair_members)) {
        members <- pair_members[[pair]]
        contrast_formulas[paste(cond, pair, sep = "_")] <-
            paste0(members[1], ".", cond, " - ", members[2], ".", cond)
    }
}
contrast <- makeContrasts(contrasts = contrast_formulas, levels = design)
## makeContrasts(contrasts = ) labels columns with the formula text; restore the
## short "<expr>_<pair>" names used throughout the rest of the analysis.
colnames(contrast) <- names(contrast_formulas)

## Differential peak analysis with limma-voom, run separately per histone mark.
##
## For each mark: drop low-count peaks, voom-transform with cyclic loess
## normalisation, fit the group-means model with donor as a blocking factor
## (duplicateCorrelation), then extract every contrast with topTable().
## `results[[hist]][[contrast]]` holds the annotated table; `voomDDS[[hist]]`
## holds the voom object. Both are saved to one RData file at the end.
##
## The lists are named up front so they contain exactly one slot per mark
## (preallocating unnamed slots and then assigning by name left extra NULL
## entries at the front of the saved lists).
results <- setNames(vector("list", length(histList)), histList)
voomDDS <- setNames(vector("list", length(histList)), histList)
for (hist in histList) {
    ## Filter and delete low expressed genes
    selectR <- which(rowSums(countMat[[hist]]) > 10) ## remove low count genes
    dataS <- countMat[[hist]][selectR, , drop = FALSE]
    ## NOTE(review): `data` is not defined anywhere in the visible code. Unless
    ## one of the loaded RData files provides it, this resolves to the function
    ## utils::data and GeneName ends up empty. Probably meant
    ## rownames(countMat[[hist]])[selectR] - confirm before changing.
    geneNameList <- rownames(data)[selectR]

    voomDDS[[hist]] <- voom(counts = dataS, design = design, normalize.method = "cyclicloess", plot = FALSE)
    ## option 1 using voomDDS option2 using normDDS as normalized input.
    inputDDS <- voomDDS[[hist]]
    corfit <- duplicateCorrelation(inputDDS, design, block = target$humanDonor)
    fit <- lmFit(inputDDS, design, block = target$humanDonor, correlation = corfit$consensus)
    fitContrast <- contrasts.fit(fit, contrast)
    fitBayes <- eBayes(fitContrast, robust = TRUE)

    hist_results <- vector("list", ncol(contrast))
    names(hist_results) <- colnames(contrast)
    for (i in seq_len(ncol(contrast))) {
        ## Results: all peaks, sorted by p-value. `sort.by` is spelled out
        ## instead of relying on partial matching of `sort`.
        res <- topTable(fit = fitBayes, adjust.method = "fdr", coef = i, number = nrow(inputDDS), sort.by = "P")

        ## NOTE(review): GeneIndex is as.numeric(rownames(res)); later cells use
        ## it to index mPeak[[hist]], which is only valid if these row names are
        ## positions in the unfiltered peak set - confirm.
        res <- data.table(GeneName = geneNameList[as.numeric(rownames(res))], GeneIndex = rownames(res) %>% as.numeric, res)
        res[, Significance := ifelse((adj.P.Val <= 0.05 & sign(logFC) == 1 & abs(logFC) >= 1), "Up",
                              ifelse((adj.P.Val <= 0.05 & sign(logFC) == -1 & abs(logFC) >= 1), "Down", "notDE"))]
        hist_results[[i]] <- res

        ## Output. The histone mark is part of the file name; without it the
        ## tables of the second mark silently overwrote those of the first.
        write.table(res, file = paste0(outPath, "/CUTRUN_limma_tables/DE_", hist, "_", colnames(contrast)[i], "_HD1-7_adj0.05_logFC1.csv"), quote = FALSE, row.names = FALSE, sep = ",")
    }
    results[[hist]] <- hist_results
}
save(results, voomDDS, file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))




In [None]:
## heatmap
## 1. pairwise
## For each histone mark, draw a heatmap of the voom expression matrix
## (voomDDS[[hist]]$E) over the Input samples of HD4-HD7, restricted to peaks
## called Up/Down in at least one pairwise Input contrast. The clustered matrix
## is also written next to the PDF as CSV.
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))

## The gene annotation does not depend on the mark, so it is built once.
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")
input_contrasts <- paste0("Input_", c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA"))

for (hist in histList) {
    norm_cr <- voomDDS[[hist]]$E

    ## Nearest gene(s) per master peak; ties are kept and joined with ";".
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    peak_match_gene <- data.frame(
        peak_id = dp_match_gene@from,
        gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector
    ) %>%
        group_by(peak_id) %>%
        summarize(nearest_genes = paste(gene_name, collapse = ";"))

    ## Row labels: nearest gene symbol(s), "notAssigned" where none was found.
    ## NOTE(review): peak_id indexes mPeak[[hist]] while norm_cr rows come from
    ## the voom matrix; this assumes both have the same row order - confirm.
    rownames(norm_cr) <- rep("notAssigned", nrow(norm_cr))
    rownames(norm_cr)[peak_match_gene$peak_id] <- peak_match_gene$nearest_genes

    ## Union of peak indices that are Up or Down in any pairwise Input contrast.
    de_gene_list <- unique(unlist(lapply(input_contrasts, function(contrast_name) {
        results[[hist]][[contrast_name]] %>% filter(Significance != "notDE") %$% GeneIndex
    })))
    sample_select <- paste0(hist, "_", rep(cell_list, each = 4), rep(c("_Input_HD4", "_Input_HD5", "_Input_HD6", "_Input_HD7"), length(cell_list)))

    annotation_col <- data.frame(CellType = rep(c("N", "CM", "EM", "EMRA"), each = 4))
    rownames(annotation_col) <- sample_select
    annotation_color <- list(CellType = c("N" = "#440154", "CM" = "#31688e", "EM" = "#35b779", "EMRA" = "#fde725"))

    test <- pheatmap(
        norm_cr[de_gene_list, sample_select],
        cluster_rows = TRUE, cluster_cols = FALSE, show_rownames = FALSE, scale = "none",
        annotation_col = annotation_col, color = BlueAndRed(), annotation_colors = annotation_color,
        filename = paste0(fig_path, "CUTRUN_DP_", hist, "_pairwise_comparison_heatmap.pdf"),
        width = 7, height = 7
    )

    ## Export the matrix in the row order produced by the clustering.
    write.table(norm_cr[de_gene_list[test$tree_row$order], sample_select], file = paste0(fig_path, "CUTRUN_DP_", hist, "_pairwise_comparison_heatmap.csv"), quote = FALSE, row.names = TRUE, col.names = TRUE, sep = ",")
}

# 2. one vs rest
## Same heatmap as the pairwise version, but the peak set is the union of peaks
## called Up/Down in the "Input_<cell>_Others" contrast of each cell type.

load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))

## The gene annotation does not depend on the mark, so it is built once.
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

for (hist in histList) {
    norm_cr <- voomDDS[[hist]]$E

    ## Nearest gene(s) per master peak; ties are kept and joined with ";".
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    peak_match_gene <- data.frame(
        peak_id = dp_match_gene@from,
        gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector
    ) %>%
        group_by(peak_id) %>%
        summarize(nearest_genes = paste(gene_name, collapse = ";"))

    rownames(norm_cr) <- rep("notAssigned", nrow(norm_cr))
    rownames(norm_cr)[peak_match_gene$peak_id] <- peak_match_gene$nearest_genes

    ## Each one-vs-rest RData file is loaded in turn and replaces `results`.
    ## NOTE(review): if those files also contain `voomDDS`, the next mark's
    ## norm_cr is taken from the last file loaded here - confirm their contents.
    de_gene_list <- c()
    for (target_cell in c("N", "CM", "EM", "EMRA")) {
        load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM_one_vs_rest_", target_cell, ".RData"))
        de_gene_list <- c(de_gene_list, results[[hist]][[paste0("Input_", target_cell, "_Others")]] %>% filter(Significance != "notDE") %$% GeneIndex)
    }
    de_gene_list <- unique(de_gene_list)

    sample_select <- paste0(hist, "_", rep(cell_list, each = 4), rep(c("_Input_HD4", "_Input_HD5", "_Input_HD6", "_Input_HD7"), length(cell_list)))

    annotation_col <- data.frame(CellType = rep(c("N", "CM", "EM", "EMRA"), each = 4))
    rownames(annotation_col) <- sample_select
    annotation_color <- list(CellType = c("N" = "#440154", "CM" = "#31688e", "EM" = "#35b779", "EMRA" = "#fde725"))

    test <- pheatmap(
        norm_cr[de_gene_list, sample_select],
        cluster_rows = TRUE, cluster_cols = FALSE, show_rownames = FALSE, scale = "none",
        annotation_col = annotation_col, color = BlueAndRed(), annotation_colors = annotation_color,
        filename = paste0(fig_path, "CUTRUN_DP_", hist, "_one_vs_rest_heatmap.pdf"),
        width = 7, height = 7
    )

    ## Export the matrix in the row order produced by the clustering.
    write.table(norm_cr[de_gene_list[test$tree_row$order], sample_select], file = paste0(fig_path, "CUTRUN_DP_", hist, "_one_vs_rest_heatmap.csv"), quote = FALSE, row.names = TRUE, col.names = TRUE, sep = ",")
}

In [None]:
## GSEA of CUTRUN
## 1. pairwise
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
# load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
# load(file = paste0(outPath, "/RData/IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
# load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))

## GMT files and the short label used for each one in output file names.
## The two vectors are parallel: element i of one belongs to element i of the other.
pathway_list <- c(
    "msigdb.v7.4.symbols.gmt",
    "h.all.v7.4.symbols.gmt",
    "c2.cp.v7.4.symbols.gmt",
    "c3.tft.v7.4.symbols.gmt",
    "c5.all.v7.4.entrez.gmt",
    "c7.all.v7.4.symbols.gmt"
)
pathway_name_list <- c(
    "All_Gene_Sets",
    "Hallmark_All",
    "C2_noCGP",
    "C3_TFTonly",
    "C5_All",
    "C7_All"
)
## All unordered pairs of the four cell types, N_CM ... EM_EMRA.
cell_pair_list <- combn(c("N", "CM", "EM", "EMRA"), 2, FUN = paste, collapse = "_")

## Return the element of `x` with the largest absolute value, sign preserved.
##
## Always returns at most one value: when several elements tie for the largest
## magnitude (e.g. c(2, -2)), the first one is returned. The previous version
## returned all tied elements, which breaks per-group summarize() calls that
## expect a single value per gene.
## As before, the result is zero-length when `x` contains NA (the maximum is
## then NA and nothing matches it).
get_abs_max <- function(x){
    magnitude <- abs(x)
    x[head(which(magnitude == max(magnitude)), 1L)]
}

gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

## Read every GMT file once, instead of once per mark x contrast.
## The ranked statistics below are named by gene SYMBOL, so a collection that
## is distributed with Entrez IDs (file name contains "entrez") is translated
## to symbols first; otherwise none of its genes could match the statistics.
gmt_dir <- "/fh/fast/gottardo_r/yezheng_working/SupplementaryData/GSEA/"
gene_pathway_list <- lapply(pathway_list, function(gmt_name) {
    gene_sets <- geneIds(getGmt(paste0(gmt_dir, gmt_name)))
    if (grepl("entrez", gmt_name, fixed = TRUE)) {
        entrez_ids <- unique(unlist(gene_sets, use.names = FALSE))
        entrez_to_symbol <- mapIds(org.Hs.eg.db, entrez_ids, "SYMBOL", "ENTREZID")
        gene_sets <- lapply(gene_sets, function(ids) {
            symbols <- unname(entrez_to_symbol[ids])
            unique(symbols[!is.na(symbols)])
        })
    }
    gene_sets
})

## Pre-ranked GSEA per histone mark and pairwise Input contrast.
## Every tested peak is assigned to its nearest gene(s); each gene is ranked by
## the logFC of its peak with the largest absolute logFC.
for (hist in histList) {
    for (cell_pair in cell_pair_list) {
        contrast_name <- paste0("Input_", cell_pair)
        dp_ind <- results[[hist]][[contrast_name]] %$% GeneIndex ## DiffPeak Peak ID
        dp_match_gene <- distanceToNearest(mPeak[[hist]][dp_ind], gene_gr, select = "all") ## get the nearest gene for each peak
        peak_match_gene <- data.frame(peak_id = dp_ind[dp_match_gene@from], gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector) ## get the peak id and matching gene name
        peak_match_gene <- left_join(peak_match_gene, results[[hist]][[contrast_name]] %>% dplyr::select(GeneIndex, logFC), by = c("peak_id" = "GeneIndex")) ## get the peak id, peak logFC, and matching gene name
        gene_peak_logFC <- peak_match_gene %>% group_by(gene_name) %>% summarize(logFC_abs_max = get_abs_max(logFC)) ## for each gene get the abs max logFC peak

        geneList <- gene_peak_logFC$logFC_abs_max
        names(geneList) <- gene_peak_logFC$gene_name
        geneList <- sort(geneList, decreasing = TRUE)

        for (p in seq_along(pathway_list)) {
            pathway_name <- pathway_name_list[p]
            gene_pathway <- gene_pathway_list[[p]]

            gsea_res <- fgsea(pathways = gene_pathway, stats = geneList, minSize = 10, maxSize = 500, eps = 0)
            ## Ascending adjusted p-value (the misspelled `decreaing` argument
            ## was only ever an extra constant sort key and has been removed).
            gsea_res <- gsea_res %>% arrange(padj)
            fwrite(gsea_res, file = paste0(fig_path, "/CUTRUN_", hist, "_nearest_gene_DP_pairwise_", cell_pair, "_", pathway_name, "_GSEA.csv"), sep = ",")

            ## Ten most significant pathways in each direction. `pathway` in
            ## the j position is the column of the fgsea result table.
            topPathwaysUp <- gsea_res[ES > 0][head(order(pval), n = 10), pathway]
            topPathwaysDown <- gsea_res[ES < 0][head(order(pval), n = 10), pathway]
            topPathways <- c(topPathwaysUp, rev(topPathwaysDown)) %>% unique

            pdf(paste0(fig_path, "/CUTRUN_", hist, "_nearest_gene_DP_", cell_pair, "_", pathway_name, "_GSEA_topEnrichmentScore.pdf"), width = 15, height = 10)
            plotGseaTable(
                gene_pathway[topPathways],
                geneList,
                gsea_res,
                gseaParam = 0.5
            )
            dev.off()
        }
    }
}


In [None]:
## GSEA of CUTRUN
## 2. one vs rest
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

## GMT files and the short label used for each one in output file names.
## The two vectors are parallel: element i of one belongs to element i of the other.
pathway_list <- c(
    "msigdb.v7.4.symbols.gmt",
    "h.all.v7.4.symbols.gmt",
    "c2.cp.v7.4.symbols.gmt",
    "c3.tft.v7.4.symbols.gmt",
    "c5.all.v7.4.entrez.gmt",
    "c7.all.v7.4.symbols.gmt"
)
pathway_name_list <- c(
    "All_Gene_Sets",
    "Hallmark_All",
    "C2_noCGP",
    "C3_TFTonly",
    "C5_All",
    "C7_All"
)
## All unordered pairs of the four cell types, N_CM ... EM_EMRA.
cell_pair_list <- combn(c("N", "CM", "EM", "EMRA"), 2, FUN = paste, collapse = "_")

## Return the element of `x` with the largest absolute value, sign preserved.
##
## Always returns at most one value: when several elements tie for the largest
## magnitude (e.g. c(2, -2)), the first one is returned. The previous version
## returned all tied elements, which breaks per-group summarize() calls that
## expect a single value per gene.
## As before, the result is zero-length when `x` contains NA (the maximum is
## then NA and nothing matches it).
get_abs_max <- function(x){
    magnitude <- abs(x)
    x[head(which(magnitude == max(magnitude)), 1L)]
}

gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

## Read every GMT file once, instead of once per mark x cell type.
## The ranked statistics below are named by gene SYMBOL, so a collection that
## is distributed with Entrez IDs (file name contains "entrez") is translated
## to symbols first; otherwise none of its genes could match the statistics.
gmt_dir <- "/fh/fast/gottardo_r/yezheng_working/SupplementaryData/GSEA/"
gene_pathway_list <- lapply(pathway_list, function(gmt_name) {
    gene_sets <- geneIds(getGmt(paste0(gmt_dir, gmt_name)))
    if (grepl("entrez", gmt_name, fixed = TRUE)) {
        entrez_ids <- unique(unlist(gene_sets, use.names = FALSE))
        entrez_to_symbol <- mapIds(org.Hs.eg.db, entrez_ids, "SYMBOL", "ENTREZID")
        gene_sets <- lapply(gene_sets, function(ids) {
            symbols <- unname(entrez_to_symbol[ids])
            unique(symbols[!is.na(symbols)])
        })
    }
    gene_sets
})

## Pre-ranked GSEA per histone mark and one-vs-rest Input contrast.
## Every tested peak is assigned to its nearest gene(s); each gene is ranked by
## the logFC of its peak with the largest absolute logFC.
for (hist in histList) {
    for (target_cell in c("N", "CM", "EM", "EMRA")) {
        ## Loading replaces `results` with the one-vs-rest tables of this cell type.
        load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM_one_vs_rest_", target_cell, ".RData"))
        contrast_name <- paste0("Input_", target_cell, "_Others")
        dp_ind <- results[[hist]][[contrast_name]] %$% GeneIndex ## DiffPeak Peak ID
        dp_match_gene <- distanceToNearest(mPeak[[hist]][dp_ind], gene_gr, select = "all") ## get the nearest gene for each peak
        peak_match_gene <- data.frame(peak_id = dp_ind[dp_match_gene@from], gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector) ## get the peak id and matching gene name
        peak_match_gene <- left_join(peak_match_gene, results[[hist]][[contrast_name]] %>% dplyr::select(GeneIndex, logFC), by = c("peak_id" = "GeneIndex")) ## get the peak id, peak logFC, and matching gene name
        gene_peak_logFC <- peak_match_gene %>% group_by(gene_name) %>% summarize(logFC_abs_max = get_abs_max(logFC)) ## for each gene get the abs max logFC peak

        geneList <- gene_peak_logFC$logFC_abs_max
        names(geneList) <- gene_peak_logFC$gene_name
        geneList <- sort(geneList, decreasing = TRUE)

        for (p in seq_along(pathway_list)) {
            pathway_name <- pathway_name_list[p]
            gene_pathway <- gene_pathway_list[[p]]

            gsea_res <- fgsea(pathways = gene_pathway, stats = geneList, minSize = 10, maxSize = 500, eps = 0)
            ## Ascending adjusted p-value (the misspelled `decreaing` argument
            ## was only ever an extra constant sort key and has been removed).
            gsea_res <- gsea_res %>% arrange(padj)
            fwrite(gsea_res, file = paste0(fig_path, "/CUTRUN_", hist, "_nearest_gene_DP_one_vs_rest_", target_cell, "_", pathway_name, "_GSEA.csv"), sep = ",")

            ## Ten most significant pathways in each direction. `pathway` in
            ## the j position is the column of the fgsea result table.
            topPathwaysUp <- gsea_res[ES > 0][head(order(pval), n = 10), pathway]
            topPathwaysDown <- gsea_res[ES < 0][head(order(pval), n = 10), pathway]
            topPathways <- c(topPathwaysUp, rev(topPathwaysDown)) %>% unique

            pdf(paste0(fig_path, "/CUTRUN_", hist, "_nearest_gene_DP_one_vs_rest_", target_cell, "_", pathway_name, "_GSEA_topEnrichmentScore.pdf"), width = 15, height = 10)
            plotGseaTable(
                gene_pathway[topPathways],
                geneList,
                gsea_res,
                gseaParam = 0.5
            )
            dev.off()
        }
    }
}


In [2]:
## Figure 2 - Differences between RNAseq and CUT&RUN identified gene sets
##  - Displayed as scatter plot:
##    RNAseq expression on Y vs log(CPM) for H3K4me2 on X for N, CM, EM and EMRA
##  - Full list of genes that lie in Q1 vs Q3
##  - Gene ontology for genes that lie in Q1 vs Q3
##
## This cell only prepares the two normalised matrices used by the plots:
## `cr_voomDDS` (CUT&RUN, per histone mark) and `rna_voomDDS` (RNA-seq).
## Statement order matters: both RData files are read into the same workspace,
## so the CUT&RUN `voomDDS` must be copied before the RNA-seq file is loaded.
inPath = "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData")) ## voomDDS results
cr_voomDDS <- voomDDS
## NOTE(review): presumably this file provides its own `voomDDS` (and the
## `selectR` used below) for the RNA-seq data - confirm its contents.
load(file = paste0(out_path, "/RData/HD1-3_5-7_normLimma_perExprCondition.RData"))
rna_voomDDS <- voomDDS$E
## Gene IDs are taken from one RSEM result file and applied to the whole matrix.
geneID <- fread(paste0(in_path, "RNA_CD8_N_Input_HD1.genes.results"))$gene_id
## Keep only the part of each ID after the last underscore as the row name.
rownames(rna_voomDDS) <- geneID[selectR] %>% gsub(".*_", "", .)



In [None]:
## Curated gene lists; each table gets a common `gene_name` column.
## read_excel() is called through its namespace because readxl is not among the
## packages attached at the top of this file.
## NOTE(review): `gene_path` is not defined in the visible cells - confirm it is
## set before this cell runs.
select_gene <- list(
    TF = readxl::read_excel(paste0(gene_path, "2021_11_29_TFs.xlsx")) %>% mutate(gene_name = `HGNC symbol`),
    Tactive = readxl::read_excel(paste0(gene_path, "2021_11_30_TCellActivationFunction.xlsx")) %>% mutate(gene_name = `Official Symbol`),
    CellCycle = readxl::read_excel(paste0(gene_path, "2021_22_30_CellCycleAPoptosis.xlsx")) %>% mutate(gene_name = `Symbol`),
    Metabolism = readxl::read_excel(paste0(gene_path, "2021_22_30_MetabolismGeneList_FinalList.xlsx")) %>% mutate(gene_name = `Gene`)
)

## The peak -> nearest-gene table depends only on the histone mark, so it is
## computed once per mark here rather than for every gene list and cell type.
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")
peak_gene_by_hist <- list()
for (hist in c("H3K27me3", "H3K4me2")) {
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    peak_gene_by_hist[[hist]] <- data.frame(peak_id = dp_match_gene@from, gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector)
}

## One PDF per gene list, one page per mark x cell type: mean RNA-seq expression
## against the strongest mean CUT&RUN signal among the gene's assigned peaks.
for (gene_type in names(select_gene)) {
    pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_PeakSignal_scatterplot_", gene_type, "_maxCRsignal.pdf"), width = 11, height = 11)

    for (hist in c("H3K27me3", "H3K4me2")) {
        peak_match_gene <- peak_gene_by_hist[[hist]]

        ## Reference lines differ between the two marks.
        if (hist == "H3K4me2") {
            line_slope <- 1
            line_intercept <- 0
            line_x <- 5
            line_y <- 5
        } else {
            line_slope <- -1
            line_intercept <- 7
            line_x <- 3.5
            line_y <- 3.5
        }

        for (target_cell in c("N", "CM", "EM", "EMRA")) {
            ## Mean over the six RNA-seq Input donors / four CUT&RUN Input donors.
            rna_gene_exp <- data.frame(gene_name = rownames(rna_voomDDS), GEX = rna_voomDDS[, paste0("Input_", target_cell, c("_HD1", "_HD2", "_HD3", "_HD5", "_HD6", "_HD7"))] %>% rowMeans)
            cr_mean <- cr_voomDDS[[hist]]$E[, paste0(hist, "_", target_cell, c("_Input_HD4", "_Input_HD5", "_Input_HD6", "_Input_HD7"))] %>% rowMeans
            cr_gene_cr <- data.frame(gene_name = peak_match_gene$gene_name, CUTRUN_signal = cr_mean[peak_match_gene$peak_id])

            print(
                inner_join(cr_gene_cr, rna_gene_exp, by = "gene_name") %>%
                    filter(gene_name %in% select_gene[[gene_type]]$gene_name) %>%
                    group_by(gene_name, GEX) %>%
                    summarize(maxCR = max(CUTRUN_signal)) %>%
                    ggplot(aes(x = maxCR, y = GEX)) +
                    geom_hex(bins = 200) +
                    theme_bw(base_size = 20) +
                    geom_abline(slope = line_slope, intercept = line_intercept, linetype = "dashed", color = "red", size = 1.2) +
                    geom_vline(xintercept = line_x, linetype = "dashed", color = "red", size = 1.2) +
                    geom_hline(yintercept = line_y, linetype = "dashed", color = "red", size = 1.2) +
                    xlab(paste0(hist, " Normalized Peak Signals")) +
                    ylab("Normalized Gene Expression") +
                    ggtitle(paste0(target_cell, " Cells ", gene_type))
            )
        }
    }
    dev.off()
}


In [None]:
## hexbin plot comparing normalized gex and peak signal
## One page per histone mark x cell type: mean RNA-seq expression of a gene
## against the mean CUT&RUN signal of every peak assigned to that gene.
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

pdf(paste0(fig_path, "/RNAseq_CUTRUN_GEX_PeakSignal_scatterplot.pdf"), width = 11, height = 11)
for (hist in c("H3K27me3", "H3K4me2")) {
    ## The peak -> nearest-gene table and the reference lines depend only on the
    ## mark, so they are set up once per mark instead of once per cell type.
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    peak_match_gene <- data.frame(peak_id = dp_match_gene@from, gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector)

    if (hist == "H3K4me2") {
        line_slope <- 1
        line_intercept <- 0
        line_x <- 5
        line_y <- 5
    } else {
        line_slope <- -1
        line_intercept <- 7
        line_x <- 3.5
        line_y <- 3.5
    }

    for (target_cell in c("N", "CM", "EM", "EMRA")) {
        ## Mean over the six RNA-seq Input donors / four CUT&RUN Input donors.
        rna_gene_exp <- data.frame(gene_name = rownames(rna_voomDDS), GEX = rna_voomDDS[, paste0("Input_", target_cell, c("_HD1", "_HD2", "_HD3", "_HD5", "_HD6", "_HD7"))] %>% rowMeans)
        cr_mean <- cr_voomDDS[[hist]]$E[, paste0(hist, "_", target_cell, c("_Input_HD4", "_Input_HD5", "_Input_HD6", "_Input_HD7"))] %>% rowMeans
        cr_gene_cr <- data.frame(gene_name = peak_match_gene$gene_name, CUTRUN_signal = cr_mean[peak_match_gene$peak_id])

        print(
            inner_join(cr_gene_cr, rna_gene_exp, by = "gene_name") %>%
                ggplot(aes(x = CUTRUN_signal, y = GEX)) +
                geom_hex(bins = 200) +
                theme_bw(base_size = 20) +
                geom_abline(slope = line_slope, intercept = line_intercept, linetype = "dashed", color = "red", size = 1.2) +
                geom_vline(xintercept = line_x, linetype = "dashed", color = "red", size = 1.2) +
                geom_hline(yintercept = line_y, linetype = "dashed", color = "red", size = 1.2) +
                xlab(paste0(hist, " Normalized Peak Signals")) +
                ylab("Normalized Gene Expression") +
                ggtitle(paste0(target_cell, " Cells"))
        )
    }
}
dev.off()

In [None]:
## output DP with nearest genes
## For every histone mark and contrast, write one CSV with the limma result,
## peak coordinates, nearest gene(s), per-sample CPM for donors HD4-HD7 of the
## two compared cell types, and the per-donor log2 CPM ratio.
load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData")) ## voomDDS results
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

donor_list <- c("HD4", "HD5", "HD6", "HD7")

for (hist in c("H3K27me3", "H3K4me2")) {
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    peak_match_gene <- data.frame(peak_id = dp_match_gene@from, gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector) %>% group_by(peak_id) %>% summarize(nearest_genes = paste(gene_name, collapse = ";"))

    ## Counts per million: every column (sample) is divided by its own total.
    ## `counts / colSums(counts)` recycles the totals down the rows and would
    ## divide most cells by another sample's library size, so sweep() is used.
    ## Computed once per mark because it does not depend on the contrast.
    data_norm <- sweep(countMat[[hist]], 2, colSums(countMat[[hist]]), "/") * 1000000
    peak_coord <- data.frame(mPeak[[hist]])

    for (contrast_name in names(results[[hist]])) {
        ## Contrast names look like "<expr>_<cellA>_<cellB>".
        info <- unlist(strsplit(contrast_name, "_"))
        limma_res <- results[[hist]][[contrast_name]]
        limma_res <- cbind(peak_coord[limma_res$GeneIndex, c("seqnames", "start", "end")], limma_res)
        dp_res <- right_join(peak_match_gene, limma_res, by = c("peak_id" = "GeneIndex")) %>% arrange(adj.P.Val)

        ref_cols <- paste0(hist, "_", info[2], "_", info[1], "_", donor_list)
        alt_cols <- paste0(hist, "_", info[3], "_", info[1], "_", donor_list)
        tmp <- cbind(dp_res, data_norm[dp_res$peak_id, c(ref_cols, alt_cols)])

        ## Per-donor log2(cellB / cellA) of the CPM values.
        for (k in seq_along(donor_list)) {
            ratio_col <- paste0(hist, "_log2(", info[3], "/", info[2], ")_", info[1], "_", donor_list[k])
            tmp[[ratio_col]] <- log2(tmp[[alt_cols[k]]] / tmp[[ref_cols[k]]])
        }

        write.table(tmp, file = paste0(fig_path, "CUTRUN_DP_", hist, "_", contrast_name, "_pairwise_withNearestGenes.csv"), sep = ",", quote = FALSE, row.names = FALSE, col.names = TRUE)
    }
}

In [None]:
## hexbin plot comparing logFC of GEX and Peak signal
## One page per histone mark x pairwise Input contrast: RNA-seq logFC of a gene
## against the CUT&RUN logFC of its assigned peak with the largest |logFC|.
## Both RData files define `results`, so each is copied right after loading.
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData")) ## voomDDS results
cr_results <- results
load(file = paste0(out_path, "/RData/HD1-3_5-7_normLimma_perExprCondition.RData"))
rna_results <- results

gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

pdf(paste0(fig_path, "/RNAseq_CUTRUN_logFC_scatterplot.pdf"), width = 11, height = 11)
for (hist in c("H3K27me3", "H3K4me2")) {
    ## The peak -> nearest-gene table depends only on the mark, so it is built
    ## once per mark rather than once per contrast.
    dp_match_gene <- distanceToNearest(mPeak[[hist]], gene_gr, select = "all")
    peak_match_gene <- data.frame(peak_id = dp_match_gene@from, gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[dp_match_gene@to], "SYMBOL", "ENTREZID") %>% as.vector)

    for (cell_pair in c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA")) {
        contrast_name <- paste0("Input_", cell_pair)

        ## RNA-seq gene names keep only the part after the last underscore.
        rna_gene_exp <- data.frame(gene_name = rna_results[[contrast_name]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[contrast_name]]$logFC)
        cr_gene_cr <- left_join(peak_match_gene, cr_results[[hist]][[contrast_name]], by = c("peak_id" = "GeneIndex")) %>% group_by(gene_name) %>% summarize(logFC_abs_max = get_abs_max(logFC))

        ## No reference lines are drawn on this plot (they were already
        ## commented out), so the unused line_* settings have been dropped.
        print(
            inner_join(cr_gene_cr, rna_gene_exp, by = "gene_name") %>%
                ggplot(aes(x = logFC_abs_max, y = GEX)) +
                geom_point(size = 0.1, alpha = 0.3) +
                theme_bw(base_size = 20) +
                xlab(paste0(hist, " logFC")) +
                ylab("Gene Expression logFC") +
                ggtitle(cell_pair)
        )
    }
}
dev.off()

In [None]:
## Bivalent methylation marks
## Restore the saved CUT&RUN objects, then list the entries of `peakAll`.
for (rdata_name in c(
    "masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData",
    "results_histList_hd1-7_adjustPeak_noChrXYM.RData"
)) {
    load(file = paste0(outPath, "/RData/", rdata_name))
}

names(peakAll)

In [None]:
## Overlap H3K4me2 and H3K27me3 peaks (requiring at least 50 bp of overlap) and
## count the number of regions after merging.
## Output directory for the per-sample bivalent gene lists.
bi_path <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/BivalentGenes/"
## hg38 known-gene ranges, used to assign regions to their nearest gene.
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

## Assign each region to its nearest gene(s).
##
## gene_gr:   gene ranges carrying a `gene_id` column that is looked up as an
##            ENTREZID key in org.Hs.eg.db.
## region_gr: ranges to annotate.
## Returns the data.frame form of `region_gr`, one row per region-gene hit
## (ties are kept because of select = "all"), with two extra columns:
## `peak_id` (index of the region in `region_gr`) and `gene_name` (gene symbol).
## No distance cut-off is applied.
get_nearest_gene <- function(gene_gr, region_gr){
    nearest_hits <- distanceToNearest(region_gr, gene_gr, select = "all")
    region_idx <- nearest_hits@from
    gene_symbols <- as.vector(mapIds(org.Hs.eg.db, gene_gr$gene_id[nearest_hits@to], "SYMBOL", "ENTREZID"))
    region_df <- data.frame(region_gr)[region_idx, ]
    cbind(region_df, data.frame(peak_id = region_idx, gene_name = gene_symbols))
}
## Count bivalent regions and their nearest genes for every cell type x donor
## (Input samples, HD4-HD7), and write each sample's bivalent gene list to
## `bi_path`. A region is bivalent when an H3K27me3 peak and an H3K4me2 peak of
## the same sample overlap by at least 50 bp; overlapping peaks are merged.
## Rows are collected in a list and bound once instead of growing `bi_num`
## with rbind() on every iteration.
bi_rows <- list()
for (cT in c("N", "CM", "EM", "EMRA")) {
    for (pt in paste0("HD", 4:7)) {
        k27_peaks <- peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]]
        k4_peaks <- peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]]

        bi_olp <- findOverlaps(k27_peaks, k4_peaks, minoverlap = 50)
        bi_peak.gr <- reduce(append(k27_peaks[bi_olp@from], k4_peaks[bi_olp@to]))
        bi_gene <- get_nearest_gene(gene_gr, bi_peak.gr)
        write.table(unique(bi_gene$gene_name), paste0(bi_path, "Bivalent_peaks_nearest_genes_", cT, "_", pt, ".csv"), quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ",")

        ## Denominators: all merged regions carrying either mark in this sample.
        total_peak.gr <- append(k27_peaks, k4_peaks) %>% reduce
        total_gene <- get_nearest_gene(gene_gr, total_peak.gr)

        bi_rows[[length(bi_rows) + 1L]] <- data.frame(
            bi_peak_num = length(bi_peak.gr),
            bi_gene_num = length(unique(bi_gene$gene_name)),
            total_peak_num = length(total_peak.gr),
            total_gene_num = length(unique(total_gene$gene_name)),
            cT = cT,
            pt = pt
        )
    }
}
bi_num <- do.call(rbind, bi_rows)
## Fix the display order of the cell types.
bi_num$cT <- factor(bi_num$cT, levels = c("N", "CM", "EM", "EMRA"))

## Dodged bar chart of one column of `plot_data` per donor, coloured by cell
## type. All four panels below share this layout and differ only in the
## plotted column and the y-axis label.
plot_bivalent_bar <- function(plot_data, y_col, y_label) {
    ggplot(plot_data, aes(x = pt, y = .data[[y_col]], fill = cT)) +
        geom_bar(stat = "identity", position = position_dodge()) +
        scale_fill_viridis(discrete = TRUE) +
        theme_bw(base_size = 20) +
        xlab("") +
        ylab(y_label) +
        rremove("legend.title")
}

## Absolute number and percentage of bivalent peak regions.
p1 = plot_bivalent_bar(bi_num, "bi_peak_num", "# of Peak Regions")
p2 = plot_bivalent_bar(
    bi_num %>% mutate(bi_peak_prop = bi_peak_num/total_peak_num * 100),
    "bi_peak_prop", "% of Peak Regions"
)

## Absolute number and percentage of genes nearest to bivalent regions.
p3 = plot_bivalent_bar(bi_num, "bi_gene_num", "# of Genes")
p4 = plot_bivalent_bar(
    bi_num %>% mutate(bi_gene_prop = bi_gene_num/total_gene_num * 100),
    "bi_gene_prop", "% of Genes"
)

ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = TRUE)

## One PDF with four pages: boxplots across donors, per cell type, of the bivalent region
## count, region percentage, gene count and gene percentage.
bi_box_layers <- list(
    geom_boxplot(),
    scale_fill_viridis(discrete = TRUE),
    theme_bw(base_size = 20),
    xlab(""),
    rremove("legend")
)

pdf(paste0(fig_path, "/bivalent_peak_gene_num_prop_boxplot.pdf"), width = 7, height = 7)
ggplot(bi_num, aes(x = cT, y = bi_peak_num, fill = cT)) +
    bi_box_layers +
    ylab("# of Peak Regions")

ggplot(
    mutate(bi_num, bi_peak_prop = bi_peak_num/total_peak_num * 100),
    aes(x = cT, y = bi_peak_prop, fill = cT)
) +
    bi_box_layers +
    ylab("% of Peak Regions")

ggplot(bi_num, aes(x = cT, y = bi_gene_num, fill = cT)) +
    bi_box_layers +
    ylab("# of Genes")

ggplot(
    mutate(bi_num, bi_gene_prop = bi_gene_num/total_gene_num * 100),
    aes(x = cT, y = bi_gene_prop, fill = cT)
) +
    bi_box_layers +
    ylab("% of Genes")
dev.off()

# pdf(paste0(fig_path, "/bivalent_peak_gene_num_prop.pdf"), width = 10, height = 10)
# ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = TRUE) 
# dev.off()


In [None]:
## Fate of bivalent regions between cell types.
## For every ordered pair of cell types (cT1, cT2) and every donor, the bivalent regions of
## cT1 are classified by what overlaps them (>= 50bp) in cT2 of the same donor:
##   bi_again : a bivalent region of cT2
##   bi_k27   : an H3K27me3 peak only
##   bi_k4    : an H3K4me2 peak (and not bi_again)
##   bi_none  : nothing
## Outputs: one gene list per class (csv in bi_path), `bi_change` (region / gene counts and
## percentages) and `bi_change_gene_lfc` (RNA-seq logFC of the genes in each class).
load(file = paste0(out_path, "/RData/HD1-3_5-7_normLimma_perExprCondition.RData"))
rna_results <- results

ct_list <- c("N", "CM", "EM", "EMRA")
pt_list <- paste0("HD", 4:7)

# file name fragment used for each class of regions
bi_file_tag <- c(
    bi_again = "_BivalentPeakAgainIn",
    bi_k27 = "_H3K27me3In",
    bi_k4 = "_H3K4me2In",
    bi_none = "_NoPeakIn"
)

bi_change_rows <- list()
bi_change_gene_lfc_rows <- list()
for(i in seq_along(ct_list)){
    cT <- ct_list[i]
    cT1 <- cT
    for(j in seq_along(ct_list)){
        if(i == j){
            next
        }
        cT2 <- ct_list[j]

        # The RNA-seq table is looked up with the cell type that comes first in ct_list
        # leading the key, whichever direction is being analysed.
        # NOTE(review): the logFC sign is therefore not flipped for the reversed pairs
        # (i > j) - confirm this is intended.
        rna_key <- if(i < j) paste0("Input_", cT1, "_", cT2) else paste0("Input_", cT2, "_", cT1)
        # gene symbol = part of GeneName after the last underscore
        rna_lfc <- data.frame(
            gene_name = rna_results[[rna_key]]$GeneName %>% gsub(".*_", "", .),
            GEX = rna_results[[rna_key]]$logFC
        )

        for(pt in pt_list){
            k27_ct1 <- peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]]
            k4_ct1 <- peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]]
            k27_ct2 <- peakAll[[paste0("H3K27me3_", cT2, "_Input_", pt)]]
            k4_ct2 <- peakAll[[paste0("H3K4me2_", cT2, "_Input_", pt)]]

            bi_olp <- findOverlaps(k27_ct1, k4_ct1, minoverlap = 50)
            bi_olp2 <- findOverlaps(k27_ct2, k4_ct2, minoverlap = 50)

            bi_peak.gr <- reduce(append(k27_ct1[queryHits(bi_olp)], k4_ct1[subjectHits(bi_olp)]))
            bi_peak.gr2 <- reduce(append(k27_ct2[queryHits(bi_olp2)], k4_ct2[subjectHits(bi_olp2)]))

            bi_again <- findOverlaps(bi_peak.gr, bi_peak.gr2, minoverlap = 50) %>% queryHits %>% unique
            bi_k27 <- findOverlaps(bi_peak.gr, k27_ct2, minoverlap = 50) %>% queryHits %>% unique
            bi_k4 <- findOverlaps(bi_peak.gr, k4_ct2, minoverlap = 50) %>% queryHits %>% unique
            # NOTE(review): bi_k27 is narrowed before bi_k4 is computed, so a region that
            # overlaps both marks in cT2 without being bi_again ends up in bi_k4.
            # Kept as in the original analysis - confirm this is the intended class.
            bi_k27 <- setdiff(setdiff(bi_k27, bi_k4), bi_again)
            bi_k4 <- setdiff(setdiff(bi_k4, bi_k27), bi_again)
            # seq_along() keeps this empty when there is no bivalent region at all
            bi_none <- setdiff(seq_along(bi_peak.gr), c(bi_again, bi_k27, bi_k4))

            bi_gene <- get_nearest_gene(gene_gr, bi_peak.gr)
            bi_again_gene <- bi_gene %>% filter(peak_id %in% bi_again) %$% gene_name %>% unique
            bi_k27_gene <- bi_gene %>% filter(peak_id %in% bi_k27) %$% gene_name %>% unique
            bi_k4_gene <- bi_gene %>% filter(peak_id %in% bi_k4) %$% gene_name %>% unique
            # genes not reached by any of the three classes above
            bi_none_gene <- setdiff(unique(bi_gene$gene_name), c(bi_again_gene, bi_k27_gene, bi_k4_gene)) %>% unique

            gene_by_type <- list(
                bi_again = bi_again_gene,
                bi_k27 = bi_k27_gene,
                bi_k4 = bi_k4_gene,
                bi_none = bi_none_gene
            )
            for(change_type in names(gene_by_type)){
                write.table(
                    gene_by_type[[change_type]],
                    paste0(bi_path, "BivalentPeaksOf", cT1, bi_file_tag[[change_type]], cT2, "_nearest_genes_", pt, ".csv"),
                    quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ","
                )

                gene_lfc <- rna_lfc %>% filter(gene_name %in% gene_by_type[[change_type]])
                # data.frame() cannot combine the scalar labels with zero-length columns,
                # so classes without any gene in the RNA-seq table are skipped.
                if(nrow(gene_lfc) > 0){
                    bi_change_gene_lfc_rows[[length(bi_change_gene_lfc_rows) + 1]] <- data.frame(
                        cT = cT, cT_compare = cT2, pt = pt,
                        logFC = gene_lfc$GEX, gene_name = gene_lfc$gene_name, type = change_type
                    )
                }
            }

            bi_change_rows[[length(bi_change_rows) + 1]] <- data.frame(
                cT = cT, cT_compare = cT2, pt = pt,
                bi_original_num = length(bi_peak.gr), bi_again_num = length(bi_again), bi_k27_num = length(bi_k27), bi_k4_num = length(bi_k4), bi_none_num = length(bi_none),
                bi_original_gene_num = length(unique(bi_gene$gene_name)), bi_again_gene_num = length(bi_again_gene), bi_k27_gene_num = length(bi_k27_gene), bi_k4_gene_num = length(bi_k4_gene), bi_none_gene_num = length(bi_none_gene)
            ) %>%
                mutate(
                    bi_again_prop = bi_again_num/bi_original_num * 100,
                    bi_k27_prop = bi_k27_num/bi_original_num * 100,
                    bi_k4_prop = bi_k4_num/bi_original_num * 100,
                    bi_none_prop = bi_none_num/bi_original_num * 100
                ) %>%
                mutate(
                    bi_again_gene_prop = bi_again_gene_num/bi_original_gene_num * 100,
                    bi_k27_gene_prop = bi_k27_gene_num/bi_original_gene_num * 100,
                    bi_k4_gene_prop = bi_k4_gene_num/bi_original_gene_num * 100,
                    bi_none_gene_prop = bi_none_gene_num/bi_original_gene_num * 100
                )
        }
    }
}
bi_change <- do.call(rbind, bi_change_rows)
bi_change_gene_lfc <- do.call(rbind, bi_change_gene_lfc_rows)


bi_change %>% head
bi_change_gene_lfc %>% head

In [None]:
library(tidyr)
bi_change$cT <- factor(bi_change$cT, levels = ct_list)
bi_change$cT_compare <- factor(bi_change$cT_compare, levels = ct_list)

## Stacked bar chart of the four bivalent-fate classes, one panel per donor (rows) and
## original cell type (columns), one bar per compared cell type.
##
## df           : bi_change-like data.frame.
## value_cols   : names of the four columns to stack.
## y_label      : y axis title.
## facet_scales, facet_space : passed to facet_grid(scales = , space = ).
plot_bi_change_stack <- function(df, value_cols, y_label, facet_scales, facet_space){
    df %>%
        pivot_longer(cols = all_of(value_cols)) %>%
        dplyr::select(name, value, cT, cT_compare, pt) %>%
        group_by(cT, pt, cT_compare) %>%
        # Bars are stacked with the last class at the bottom, so the running sum in
        # descending class order gives the top edge of each segment for its label.
        # (arrange(rev(name)) produced this order only because of the row layout.)
        arrange(desc(name)) %>%
        mutate(label_y = cumsum(value)) %>%
        ungroup() %>%
        ggplot(aes(cT_compare, y = value, fill = name, label = round(value, 1))) +
        geom_bar(stat = "identity") +
        geom_text(aes(y = label_y), vjust = 1, stat = "identity") +
        facet_grid(pt~cT, scales = facet_scales, space = facet_space) +
        theme_bw(base_size = 20) +
        scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8) +
        xlab("") +
        ylab(y_label) +
        rremove("legend.title")
}

p1 = plot_bi_change_stack(
    bi_change, c("bi_again_prop", "bi_k27_prop", "bi_k4_prop", "bi_none_prop"),
    "% of Peaks", facet_scales = "free", facet_space = "free"
)

p2 = plot_bi_change_stack(
    bi_change, c("bi_again_num", "bi_k27_num", "bi_k4_num", "bi_none_num"),
    "# of Peaks", facet_scales = "free", facet_space = "free_x"
)

p3 = plot_bi_change_stack(
    bi_change, c("bi_again_gene_prop", "bi_k27_gene_prop", "bi_k4_gene_prop", "bi_none_gene_prop"),
    "% of Genes", facet_scales = "free_x", facet_space = "free"
)

p4 = plot_bi_change_stack(
    bi_change, c("bi_again_gene_num", "bi_k27_gene_num", "bi_k4_gene_num", "bi_none_gene_num"),
    "# of Genes", facet_scales = "free", facet_space = "free_x"
)


pdf(paste0(fig_path, "bivalent_peak_gene_change_between_celltypes_perHD.pdf"), width = 13, height = 13)
p1
p2
p3
p4
dev.off()

ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2, common.legend = TRUE)



In [None]:
## Boxplots across donors of the percentage of bivalent regions (p1) and genes (p2) that
## fall in each fate class, per original cell type (panel) and compared cell type (x axis).
bi_change_box_layers <- list(
    geom_boxplot(),
    facet_grid(~cT, scales = "free_x", space = "free"),
    theme_bw(base_size = 20),
    scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8),
    xlab(""),
    rremove("legend.title"),
    theme(legend.position = "top")
)

bi_peak_prop_long <- bi_change %>%
    pivot_longer(., cols = c("bi_again_prop", "bi_k27_prop", "bi_k4_prop", "bi_none_prop")) %>%
    dplyr::select(name, value, cT, cT_compare, pt)
p1 <- ggplot(bi_peak_prop_long, aes(cT_compare, y = value, fill = name, label = round(value, 1))) +
    bi_change_box_layers +
    ylab("% of Peaks")

bi_gene_prop_long <- bi_change %>%
    pivot_longer(., cols = c("bi_again_gene_prop", "bi_k27_gene_prop", "bi_k4_gene_prop", "bi_none_gene_prop")) %>%
    dplyr::select(name, value, cT, cT_compare, pt)
p2 <- ggplot(bi_gene_prop_long, aes(cT_compare, y = value, fill = name, label = round(value, 1))) +
    bi_change_box_layers +
    ylab("% of Genes")

pdf(paste0(fig_path, "bivalent_peak_gene_change_between_celltypes.pdf"), width = 11, height = 17)
ggarrange(p1, p2, nrow = 2, ncol = 1, common.legend = TRUE)
dev.off()

ggarrange(p1, p2, nrow = 2, ncol = 1, common.legend = TRUE)

In [None]:
## Expression change (logFC) of the genes in each bivalent-fate class:
## p1 is split per donor (rows) and original cell type (columns), p2 pools the donors.
head(bi_change_gene_lfc)
bi_change_gene_lfc$cT <- factor(bi_change_gene_lfc$cT, levels = ct_list)
bi_change_gene_lfc$cT_compare <- factor(bi_change_gene_lfc$cT_compare, levels = ct_list)

lfc_box_layers <- list(
    geom_boxplot(),
    theme_bw(base_size = 20),
    scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8),
    labs(x = "", y = "logFC of GEX"),
    rremove("legend.title"),
    theme(legend.position = "top")
)
lfc_base <- ggplot(bi_change_gene_lfc, aes(cT_compare, y = logFC, fill = type))

p1 <- lfc_base +
    facet_grid(pt~cT, scales = "free_x", space = "free") +
    lfc_box_layers

p2 <- lfc_base +
    facet_grid(~cT, scales = "free_x", space = "free") +
    lfc_box_layers

pdf(paste0(fig_path, "bivalent_peak_gene_change_between_celltypes_gex_logfc_perHD.pdf"), width = 11, height = 20)
p1
dev.off()
pdf(paste0(fig_path, "bivalent_peak_gene_change_between_celltypes_gex_logfc.pdf"), width = 11, height = 8)
p2
dev.off()
p1
p2


In [None]:
## Mark switch: a peak of one mark in the earlier cell type overlapping a peak of the other
## mark in the later cell type, using only peaks that are not bivalent in their own cell type.
load(file = paste0(out_path, "/RData/HD1-3_5-7_normLimma_perExprCondition.RData"))
rna_results <- results

gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

## Assign each region to its nearest gene(s).
##
## gene_gr   : GRanges of genes carrying a `gene_id` column (Entrez IDs of the TxDb).
## region_gr : GRanges of regions to annotate.
##
## Returns a data.frame with one row per (region, nearest gene) pair: the region
## coordinates, `peak_id` (index of the region in `region_gr`) and `gene_name`
## (gene symbol looked up in org.Hs.eg.db; NA when no symbol is found).
## Because select = "all" is used, a region tied between several genes yields several rows.
get_nearest_gene <- function(gene_gr, region_gr){
    nearest_hits <- distanceToNearest(region_gr, gene_gr, select = "all")
    region_idx <- queryHits(nearest_hits)
    gene_idx <- subjectHits(nearest_hits)

    # mapIds() stops when given no keys, so an empty region set (or one without any
    # nearest gene) is answered with an empty symbol vector instead of an error.
    if(length(gene_idx) > 0){
        gene_symbol <- mapIds(org.Hs.eg.db, gene_gr$gene_id[gene_idx], "SYMBOL", "ENTREZID") %>% as.vector
    }else{
        gene_symbol <- character(0)
    }

    cbind(
        data.frame(region_gr)[region_idx, ],
        data.frame(peak_id = region_idx, gene_name = gene_symbol)
    )
}

## Mark-switch summary, computed in both directions (H3K4me2 -> H3K27me3, then
## H3K27me3 -> H3K4me2) for every earlier/later cell type pair and every donor.
##   switch_summary     : number of switching regions / genes next to the number of
##                        non-bivalent mark1 regions / genes of the earlier cell type.
##   gex_switch_summary : RNA-seq logFC (earlier vs later cell type table) of the switch genes.
## The switch gene list of every sample is also written to bi_path.
switch_summary_rows <- list()
gex_switch_summary_rows <- list()
ct_list <- c("N", "CM", "EM", "EMRA")
pt_list <- paste0("HD", 4:7)

for(mark_pair in list(c("H3K4me2", "H3K27me3"), c("H3K27me3", "H3K4me2"))){
    mark1 <- mark_pair[1]
    mark2 <- mark_pair[2]
    # NOTE(review): the two directions have always used slightly different file names
    # ("...H3K27me3OfCM..." vs "...H3K4me2_OfCM..."). The difference is kept so existing
    # output paths do not change - confirm before unifying.
    of_sep <- if(mark1 == "H3K4me2") "Of" else "_Of"

    for(i in seq_len(length(ct_list) - 1)){
        cT1 <- ct_list[i]
        for(j in (i+1):length(ct_list)){
            cT2 <- ct_list[j]
            rna_tab <- rna_results[[paste0("Input_", cT1, "_", cT2)]]
            for(pt in pt_list){
                mark1_ct1 <- peakAll[[paste0(mark1, "_", cT1, "_Input_", pt)]]
                mark2_ct1 <- peakAll[[paste0(mark2, "_", cT1, "_Input_", pt)]]
                mark1_ct2 <- peakAll[[paste0(mark1, "_", cT2, "_Input_", pt)]]
                mark2_ct2 <- peakAll[[paste0(mark2, "_", cT2, "_Input_", pt)]]

                bi_olp <- findOverlaps(mark1_ct1, mark2_ct1, minoverlap = 50)
                bi_olp2 <- findOverlaps(mark1_ct2, mark2_ct2, minoverlap = 50)

                # Drop the bivalent peaks. setdiff() on positions is used instead of a
                # negative index because x[-integer(0)] would select nothing when there
                # is no bivalent overlap at all.
                pure_mark1 <- mark1_ct1[setdiff(seq_along(mark1_ct1), queryHits(bi_olp))]
                pure_mark2 <- mark2_ct2[setdiff(seq_along(mark2_ct2), subjectHits(bi_olp2))]
                pure_mark1_gene <- get_nearest_gene(gene_gr, pure_mark1)

                switch_mark <- findOverlaps(pure_mark1, pure_mark2, minoverlap = 50)
                switch_region <- reduce(append(pure_mark1[queryHits(switch_mark)], pure_mark2[subjectHits(switch_mark)]))
                switch_gene <- get_nearest_gene(gene_gr, switch_region)
                switch_gene_name <- unique(switch_gene$gene_name)
                # gene symbol = part of GeneName after the last underscore
                rna_gene_exp <- data.frame(
                    gene_name = rna_tab$GeneName %>% gsub(".*_", "", .),
                    GEX = rna_tab$logFC
                ) %>% filter(gene_name %in% switch_gene_name)

                write.table(
                    switch_gene_name,
                    paste0(bi_path, mark1, "Of", cT1, "_", mark2, of_sep, cT2, "_nearest_genes_", pt, ".csv"),
                    quote = FALSE, row.names = FALSE, col.names = FALSE, sep = ","
                )

                switch_summary_rows[[length(switch_summary_rows) + 1]] <- data.frame(
                    cT1 = cT1, cT2 = cT2, mark1 = mark1, mark2 = mark2, pt = pt,
                    switch_region_num = length(switch_region), switch_gene_num = length(switch_gene_name),
                    pure_region_num = length(pure_mark1), pure_gene_num = length(unique(pure_mark1_gene$gene_name))
                )

                # data.frame() cannot combine the scalar labels with zero-length columns
                if(nrow(rna_gene_exp) > 0){
                    gex_switch_summary_rows[[length(gex_switch_summary_rows) + 1]] <- data.frame(
                        cT1 = cT1, cT2 = cT2, mark1 = mark1, mark2 = mark2, pt = pt,
                        logFC = rna_gene_exp$GEX, gene_name = rna_gene_exp$gene_name
                    )
                }
            }
        }
    }
}
switch_summary <- do.call(rbind, switch_summary_rows)
gex_switch_summary <- do.call(rbind, gex_switch_summary_rows)
head(switch_summary)
head(gex_switch_summary)

In [None]:
## H3K4me2 -> H3K27me3 switch, per donor (x axis), earlier cell type (columns) and later
## cell type (rows): p1/p3 = % of non-bivalent H3K4me2 regions / genes that switch,
## p2/p4 = number of switching regions / genes.
switch_summary$cT1 <- factor(switch_summary$cT1, levels = ct_list)
switch_summary$cT2 <- factor(switch_summary$cT2, levels = ct_list)

k4_to_k27 <- switch_summary %>% filter(mark1 == "H3K4me2")
k4_to_k27_layers <- list(
    geom_col(),
    facet_grid(cT2~cT1),
    theme_bw(base_size = 20),
    scale_fill_viridis(discrete = TRUE),
    xlab(""),
    rotate_x_text(angle = 90),
    rremove("legend"),
    ggtitle("H3K4me2 -> H3K27me3")
)

p1 <- ggplot(
    mutate(k4_to_k27, switch_region_prop = switch_region_num/pure_region_num * 100),
    aes(x = pt, y = switch_region_prop, fill = cT1)
) +
    k4_to_k27_layers +
    ylab("% of Peaks")

p2 <- ggplot(k4_to_k27, aes(x = pt, y = switch_region_num, fill = cT1)) +
    k4_to_k27_layers +
    ylab("# of Peaks")

p3 <- ggplot(
    mutate(k4_to_k27, switch_gene_prop = switch_gene_num/pure_gene_num * 100),
    aes(x = pt, y = switch_gene_prop, fill = cT1)
) +
    k4_to_k27_layers +
    ylab("% of Genes")

p4 <- ggplot(k4_to_k27, aes(x = pt, y = switch_gene_num, fill = cT1)) +
    k4_to_k27_layers +
    ylab("# of Genes")

pdf(paste0(fig_path, "mark_switch_peak_gene_num_prop_K4ToK27.pdf"), width = 13, height = 13)
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
dev.off()

ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

In [None]:
## H3K27me3 -> H3K4me2 switch, per donor (x axis), earlier cell type (columns) and later
## cell type (rows): p1/p3 = % of non-bivalent H3K27me3 regions / genes that switch,
## p2/p4 = number of switching regions / genes.
switch_summary$cT1 <- factor(switch_summary$cT1, levels = ct_list)
switch_summary$cT2 <- factor(switch_summary$cT2, levels = ct_list)

k27_to_k4 <- switch_summary %>% filter(mark1 == "H3K27me3")
k27_to_k4_layers <- list(
    geom_col(),
    facet_grid(cT2~cT1),
    theme_bw(base_size = 20),
    scale_fill_viridis(discrete = TRUE),
    xlab(""),
    rotate_x_text(angle = 90),
    rremove("legend"),
    ggtitle("H3K27me3 -> H3K4me2")
)

p1 <- ggplot(
    mutate(k27_to_k4, switch_region_prop = switch_region_num/pure_region_num * 100),
    aes(x = pt, y = switch_region_prop, fill = cT1)
) +
    k27_to_k4_layers +
    ylab("% of Peaks")

p2 <- ggplot(k27_to_k4, aes(x = pt, y = switch_region_num, fill = cT1)) +
    k27_to_k4_layers +
    ylab("# of Peaks")

p3 <- ggplot(
    mutate(k27_to_k4, switch_gene_prop = switch_gene_num/pure_gene_num * 100),
    aes(x = pt, y = switch_gene_prop, fill = cT1)
) +
    k27_to_k4_layers +
    ylab("% of Genes")

p4 <- ggplot(k27_to_k4, aes(x = pt, y = switch_gene_num, fill = cT1)) +
    k27_to_k4_layers +
    ylab("# of Genes")

pdf(paste0(fig_path, "mark_switch_peak_gene_num_prop_K27ToK4.pdf"), width = 13, height = 13)
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
dev.off()

ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2)

In [None]:
## Boxplot across donors of the % of genes that switch mark, per cell type transition
## (x axis) and switch direction (panel).
switch_summary$cT1 <- factor(switch_summary$cT1, levels = ct_list)
switch_level <- c("N->CM", "N->EM", "N->EMRA", "CM->EM", "CM->EMRA", "EM->EMRA")

switch_gene_prop_df <- mutate(switch_summary, switch_gene_prop = switch_gene_num/pure_gene_num * 100)
# NOTE(review): ylim() discards values outside 0-7 before the boxes are computed.
p1 <- ggplot(
    switch_gene_prop_df,
    aes(x = factor(paste0(cT1, "->", cT2), switch_level), y = switch_gene_prop, fill = cT1)
) +
    geom_boxplot() +
    facet_grid(~paste0(mark1, "->", mark2)) +
    theme_bw(base_size = 20) +
    scale_fill_viridis(discrete = TRUE) +
    labs(x = "", y = "% of Genes") +
    rotate_x_text(angle = 90) +
    rremove("legend") +
    ylim(0, 7)

# Former single-direction variant, kept for reference:
# p2 = switch_summary  %>% filter(mark1 == "H3K27me3") %>% mutate(switch_gene_prop = switch_gene_num/pure_gene_num * 100) %>%
# ggplot(aes(x = factor(paste0(cT1, "->", cT2), switch_level), y = switch_gene_prop, fill = cT1)) +
# geom_boxplot() +
# theme_bw(base_size = 20) +
# scale_fill_viridis(discrete = TRUE) +
# xlab("") +
# ylab("% of Genes") +
# rotate_x_text(angle = 90) +
# rremove("legend") +
# ggtitle(paste0("H3K27me3 -> H3K4me2")) +
# ylim(0, 7)
pdf(paste0(fig_path, "mark_switch_gene_prop.pdf"), width = 13, height = 8)
# ggarrange(p1, p2, nrow = 1, ncol = 2)
p1
dev.off()

p1


In [None]:
## Expression change (logFC) of the mark-switch genes per cell type transition (x axis)
## and switch direction (panel): p1 pools the donors, p2 adds one row of panels per donor.
switch_level <- c("N->CM", "N->EM", "N->EMRA", "CM->EM", "CM->EMRA", "EM->EMRA")
gex_switch_summary$cT1 <- factor(gex_switch_summary$cT1, levels = ct_list)

gex_switch_layers <- list(
    geom_boxplot(),
    theme_bw(base_size = 20),
    labs(x = "", y = "logFC of GEX"),
    rremove("legend"),
    rotate_x_text(angle = 90),
    scale_fill_viridis(discrete = TRUE)
)
gex_switch_base <- ggplot(
    gex_switch_summary,
    aes(x = factor(paste0(cT1, "->", cT2), levels = switch_level), y = logFC, fill = cT1)
)

p1 <- gex_switch_base +
    facet_grid(~paste0(mark1, "->", mark2)) +
    gex_switch_layers

p2 <- gex_switch_base +
    facet_grid(pt~paste0(mark1, "->", mark2)) +
    gex_switch_layers

pdf(paste0(fig_path, "mark_switch_gene_expression_change_logFC.pdf"), width = 13, height = 8)
p1
dev.off()

pdf(paste0(fig_path, "mark_switch_gene_expression_change_logFC_perHD.pdf"), width = 13, height = 20)
p2
dev.off()

p1

In [None]:
## Correlation between donors (HD) across cell types, from fragment counts in
## 1000bp genome bins (the *.bin1000.bed files read by the disabled code below).
library(corrplot)
# reprod = c()
# fragCount = NULL
# projPath = "CART_CUTRUN_Project/results/CUTANDRUN/process/"
# for(hist in c("H3K27me3", "H3K4me2")){
#     for(ct in c("N", "CM", "EM", "EMRA")){
#         for(hd in paste0("HD", 4:7)){
#             if(is.null(fragCount)){

#                 fragCount = read.table(paste0(projPath, hist, "_CD8_", ct, "_Input_", hd, "_rep1/alignment/bowtie2_align.fragmentsCount.bin1000.bed"), header = FALSE) 
#                 colnames(fragCount) = c("chrom", "bin", paste0(hist, "_", ct, "_", hd))

#             }else{

#                 fragCountTmp = read.table(paste0(projPath, hist, "_CD8_", ct, "_Input_", hd, "_rep1/alignment/bowtie2_align.fragmentsCount.bin1000.bed"), header = FALSE) 
#                 colnames(fragCountTmp) = c("chrom", "bin", paste0(hist, "_", ct, "_", hd))
#                 fragCount = full_join(fragCount, fragCountTmp, by = c("chrom", "bin"))

#             }
#         }
#     }  
# }

# M = cor(fragCount %>% dplyr::select(-c("chrom", "bin")) %>% log2(), use = "complete.obs") 
# saveRDS(M, file = paste0("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/HD_bin1000_corr_M.rds"))

## The computation above is disabled. When no correlation matrix M is present in the
## session, restore the one cached by the saveRDS() call above so the plot can be drawn.
if(!exists("M")){
    M <- readRDS("CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/HD_bin1000_corr_M.rds")
}
pdf("CART_CUTRUN_Project/results/paper_figure/SuppFig-QC/HD_bin1000_reprod_corr.pdf", width = 20, height = 20)
corrplot(M, method = "color", outline = TRUE, addgrid.col = "darkgray", order="hclust", hclust.method = 'ward.D2', addrect = 2, rect.col = "black", rect.lwd = 3,cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1, addCoef.col = "black", number.digits = 2, number.cex = 1, col = colorRampPalette(c("midnightblue","white","darkred"))(100))
dev.off()

In [46]:
## Build one table of fragment counts in 5000bp bins: rows are (chrom, bin), one count
## column per histone mark / cell type / donor, joined with a full join so bins missing
## from a sample become NA. The table is cached as RDS.
library(corrplot)
reprod = c()
fragCount = NULL
projPath = "CART_CUTRUN_Project/results/CUTANDRUN/process/"
for(hist in c("H3K27me3", "H3K4me2")){
    for(ct in c("N", "CM", "EM", "EMRA")){
        for(hd in paste0("HD", 4:7)){
            sample_label <- paste0(hist, "_", ct, "_", hd)
            count_file <- paste0(projPath, hist, "_CD8_", ct, "_Input_", hd, "_rep1/alignment/bowtie2_align.fragmentsCount.bin5000.bed")

            fragCountTmp <- read.table(count_file, header = FALSE)
            colnames(fragCountTmp) <- c("chrom", "bin", sample_label)

            # the first sample starts the table, every later one is joined onto it
            if(is.null(fragCount)){
                fragCount <- fragCountTmp
            }else{
                fragCount <- full_join(fragCount, fragCountTmp, by = c("chrom", "bin"))
            }
        }
    }
}
saveRDS(fragCount, file = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/RDS/HD_bin5000_fragCount.rds")

In [None]:
## Correlation of log2 bin counts between samples (bins with a missing value in any
## sample are dropped), shown as a hierarchically clustered correlation plot.
frag_log2 <- log2(dplyr::select(fragCount, -c("chrom", "bin")))
M <- cor(frag_log2, use = "complete.obs")
corrplot(
    M, method = "color", outline = TRUE, addgrid.col = "darkgray",
    order = "hclust", hclust.method = 'ward.D2', addrect = 2, rect.col = "black", rect.lwd = 3,
    cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1,
    addCoef.col = "black", number.digits = 2, number.cex = 1,
    col = colorRampPalette(c("midnightblue","white","darkred"))(100)
)


In [None]:
## Correlation of raw bin counts between samples after setting missing bins to 0 and
## keeping the bins whose total count is at or above the median total.
options(repr.plot.width=20, repr.plot.height=20)

fragCount[is.na(fragCount)] <- 0
# head(fragCount)
bin_total <- rowSums(fragCount %>% dplyr::select(-c("chrom", "bin")))
fragCountFilter <- fragCount[bin_total >= quantile(bin_total, 0.5), ]
# head(fragCountFilter)

# dim(fragCount)
# dim(fragCountFilter)
M <- cor(dplyr::select(fragCountFilter, -c("chrom", "bin")), use = "complete.obs")
# pdf("CART_CUTRUN_Project/results/paper_figure/SuppFig-QC/HD_bin1000_reprod_corr_fillNA0_0.25.pdf", width = 20, height = 20)

corrplot(
    M, method = "color", outline = TRUE, addgrid.col = "darkgray",
    order = "hclust", hclust.method = 'ward.D2', addrect = 2, rect.col = "black", rect.lwd = 3,
    cl.pos = "b", tl.col = "indianred4", tl.cex = 1, cl.cex = 1,
    addCoef.col = "black", number.digits = 2, number.cex = 1,
    col = colorRampPalette(c("midnightblue","white","darkred"))(100)
)
# dev.off()