In [1]:
## CUTRUN DE and assign to nearest genes

library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)

cell_list <- c("N", "CM", "EM", "EMRA")
hd_list <- paste0("HD", c(1:3, 5:7))
expr_list = c("Input", "Product", "Stim1", "Stim2", "Stim3")
cell_comp_list = c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA")
hist_list <- c("H3K27me3", "H3K4me2")

in_path <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
out_path <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"
fig_path <- "CART_CUTRUN_Project/results/paper_figure/"
inPath = "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"


In [None]:
## PCA for RNA and CUTRUN
load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData")) ## 'cell_comp_list''cell_list''countMat''designInfo''expr_list''fig_path''hd_list''hist_list''in_path''inPath''out_path''outPath'
load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/RData/HD1-3_5-7_normLimma_perExprCondition.RData") ## data, selectR, dataS, voomDDS, results, 

geneID = fread(paste0("CART_CUTRUN_Project/results/RNAseq/process/RSEM/", "RNA_CD8_N_Input_HD1.genes.results"))$gene_id
data = data.frame(data)
select_exp = "Input"

rna_m = data[, colnames(data) %>% stringr::str_detect(select_exp)]
k27_m = countMat[["H3K27me3"]][, colnames(countMat[["H3K27me3"]]) %>% stringr::str_detect(select_exp)]
k4_m = countMat[["H3K4me2"]][, colnames(countMat[["H3K4me2"]]) %>% stringr::str_detect(select_exp)]

rownames(rna_m) = geneID
rownames(k27_m) = mPeak[["H3K27me3"]] %>% data.frame %>% mutate(region = paste0(seqnames, ":", start, "-", end)) %$% region
rownames(k4_m) = mPeak[["H3K4me2"]] %>% data.frame %>% mutate(region = paste0(seqnames, ":", start, "-", end)) %$% region

dim(rna_m)
dim(k27_m)
dim(k4_m)
head(rna_m)
head(k27_m)
head(k4_m)

rna_voom = voomDDS$E[, colnames(voomDDS$E) %>% stringr::str_detect(select_exp)]
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))
k27_voom = voomDDS[["H3K27me3"]]$E[, colnames(voomDDS[["H3K27me3"]]$E) %>% stringr::str_detect(select_exp)]
k4_voom = voomDDS[["H3K4me2"]]$E[, colnames(voomDDS[["H3K4me2"]]$E) %>% stringr::str_detect(select_exp)]
rownames(rna_voom) = 1:nrow(rna_voom)
rownames(k27_voom) = paste0("K27me3_", 1:nrow(k27_voom))
rownames(k4_voom) = paste0("K4me2_", 1:nrow(k4_voom))

dim(rna_voom)
dim(k27_voom)
dim(k4_voom)

In [None]:
rna_obj = CreateSeuratObject(counts = rna_m, project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 2000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = 10000000) %>% 
ScaleData()
rna_obj = RunPCA(rna_obj, features = VariableFeatures(object = rna_obj), npcs = 20)

k27_obj = CreateSeuratObject(counts = k27_m, project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = round(nrow(k27_m)/10*8)) %>% 
ScaleData()
k27_obj = RunPCA(k27_obj, features = VariableFeatures(object = k27_obj), npcs = 15)

k4_obj = CreateSeuratObject(counts = k4_m, project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = round(nrow(k4_m)/10*8)) %>% 
ScaleData()
k4_obj = RunPCA(k4_obj, features = VariableFeatures(object = k4_obj), npcs = 15)

cr_obj = CreateSeuratObject(counts = rbind(k27_m, k4_m), project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = round((nrow(k27_m) + nrow(k4_m))/10*8)) %>% 
ScaleData()
cr_obj = RunPCA(cr_obj, features = VariableFeatures(object = cr_obj), npcs = 15)


In [None]:
options(repr.plot.width=15, repr.plot.height=5)
# DimPlot(rna_obj, reduction = "pca")
ct_list = c("N", "CM", "EM", "EMRA")
p1 = Embeddings(rna_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 6), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("RNA-seq") +
rremove("legend.title")

p2 = Embeddings(k27_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3") +
rremove("legend.title")

p3 = Embeddings(k4_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K4me2") +
rremove("legend.title")

p4 = Embeddings(cr_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3+H3K4me2") +
rremove("legend.title")

ggarrange(p1, p2, p3, p4, nrow = 1, ncol = 4, common.legend = TRUE)
ggsave(filename = "CART_CUTRUN_Project/results/paper_figure/fig1/PCA_seurat_procedure.pdf", width = 20, height = 6)

In [41]:
anosim(t(rna_m[, 1:18]), grouping = factor(rep(ct_list[1:3], 6), levels = ct_list[1:3]))
anosim(t(k27_m[, 1:12]), grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3]))
anosim(t(k4_m[, 1:12]), grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3]))
anosim(t(rbind(k27_m, k4_m)[, 1:12]), grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3]))


Call:
anosim(x = t(rna_m[, 1:18]), grouping = factor(rep(ct_list[1:3],      6), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: 0.1432 
      Significance: 0.04 

Permutation: free
Number of permutations: 999



Call:
anosim(x = t(k27_m[, 1:12]), grouping = factor(rep(ct_list[1:3],      4), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: -0.1852 
      Significance: 0.898 

Permutation: free
Number of permutations: 999



Call:
anosim(x = t(k4_m[, 1:12]), grouping = factor(rep(ct_list[1:3],      4), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: -0.169 
      Significance: 0.891 

Permutation: free
Number of permutations: 999



Call:
anosim(x = t(rbind(k27_m, k4_m)[, 1:12]), grouping = factor(rep(ct_list[1:3],      4), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: -0.2083 
      Significance: 0.944 

Permutation: free
Number of permutations: 999


In [144]:
anosim(Embeddings(rna_obj, reduction = "pca")[1:18, 1:2] %>% data.frame, grouping = factor(rep(ct_list[1:3], 6), levels = ct_list[1:3]))
anosim(Embeddings(k27_obj, reduction = "pca")[1:12, 1:2] %>% data.frame, grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3]))
anosim(Embeddings(k4_obj, reduction = "pca")[1:12, 1:2] %>% data.frame, grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3]))
anosim(Embeddings(cr_obj, reduction = "pca")[1:12, 1:2] %>% data.frame, grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3]))

“results may be meaningless because data have negative entries in method “bray””



Call:
anosim(x = Embeddings(rna_obj, reduction = "pca")[1:18, 1:2] %>%      data.frame, grouping = factor(rep(ct_list[1:3], 6), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: -0.0858 
      Significance: 0.858 

Permutation: free
Number of permutations: 999


“results may be meaningless because data have negative entries in method “bray””



Call:
anosim(x = Embeddings(k27_obj, reduction = "pca")[1:12, 1:2] %>%      data.frame, grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R:    -0 
      Significance: 0.463 

Permutation: free
Number of permutations: 999


“results may be meaningless because data have negative entries in method “bray””



Call:
anosim(x = Embeddings(k4_obj, reduction = "pca")[1:12, 1:2] %>%      data.frame, grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: -0.1667 
      Significance: 0.847 

Permutation: free
Number of permutations: 999


“results may be meaningless because data have negative entries in method “bray””



Call:
anosim(x = Embeddings(cr_obj, reduction = "pca")[1:12, 1:2] %>%      data.frame, grouping = factor(rep(ct_list[1:3], 4), levels = ct_list[1:3])) 
Dissimilarity: bray 

ANOSIM statistic R: -0.1273 
      Significance: 0.86 

Permutation: free
Number of permutations: 999


In [None]:
#!/bin/bash
ml SAMtools/1.11-GCC-10.2.0 deepTools/3.3.1-foss-2019b-Python-3.7.4 libpciaccess/0.14-GCCcore-8.3.0
outPath="CART_CUTRUN_Project/results/CUTANDRUN/bigwig"
genePath="CART_CUTRUN_Project/results/RNAseq/analysis/TSV"
analysisOut="CART_CUTRUN_Project/results/paper_figure/fig1/Tornado"
# for hist in H3K27me3 #H3K4me2
# do
#     for ct in N #CM EM EMRA
#     do
#         echo $hist
#         echo $ct
#     	  file1="${hist}_CD8_${ct}_Input_HD4_rep1"
#         file2="${hist}_CD8_${ct}_Input_HD5_rep1"
#         file3="${hist}_CD8_${ct}_Input_HD6_rep1"
#         file4="${hist}_CD8_${ct}_Input_HD7_rep1"
#         inFile1="CART_CUTRUN_Project/results/CUTANDRUN/process/${file1}/alignment/bowtie2_align.bam"
#         inFile2="CART_CUTRUN_Project/results/CUTANDRUN/process/${file2}/alignment/bowtie2_align.bam"
#         inFile3="CART_CUTRUN_Project/results/CUTANDRUN/process/${file3}/alignment/bowtie2_align.bam"
#         inFile4="CART_CUTRUN_Project/results/CUTANDRUN/process/${file4}/alignment/bowtie2_align.bam"

#         samtools merge $outPath/${hist}_${ct}_bowtie2_align.bam $inFile1 $inFile2 $inFile3 $inFile4
#         samtools sort -o $outPath/${hist}_${ct}_bowtie2_align.sorted.bam $outPath/${hist}_${ct}_bowtie2_align.bam
#         samtools index $outPath/${hist}_${ct}_bowtie2_align.sorted.bam
#         bamCoverage -b $outPath/${hist}_${ct}_bowtie2_align.sorted.bam -o $outPath/${hist}_${ct}_bowtie2_align.bw
#     done
# done
cores=6
for hist in H3K4me2 H3K27me3
do
    for ct in N CM EM # EMRA
    do
        echo $hist
        echo $ct
        computeMatrix reference-point -S $outPath/${hist}_${ct}_bowtie2_align.bw \
                              -R $genePath/Input_${ct}_ordered_gene_region.tsv\
                              --skipZeros -o $analysisOut/${hist}_${ct}.mat.gz -p $cores -a 1000 -b 1000 --referencePoint TSS --samplesLabel ${hist}_${ct}

        plotHeatmap -m $analysisOut/${hist}_${ct}.mat.gz -out $analysisOut/${hist}_${ct}_sort_gex.png --sortRegions keep --regionsLabel "" --colorMap Reds
        plotHeatmap -m $analysisOut/${hist}_${ct}.mat.gz -out $analysisOut/${hist}_${ct}_sort_peak_signal.png --sortUsing sum --regionsLabel "" --colorMap Reds
    done
done

In [None]:
library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db, ggrepel)
fig_path <- "CART_CUTRUN_Project/results/paper_figure_paper1/"

outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

## Read in RNA-seq data
cellList <- c("N", "CM", "EM", "EMRA")
hdList <- paste0("HD", c(1:3, 5:7))
exprList = c("Input", "Product", "Stim1", "Stim2", "Stim3")


inPath <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
outPath <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"

rna_expCount = c()
rna_TPM = c()
rna_FPKM = c()
for(cell in cellList){
  for(expr in exprList){
    for(hd in hdList){
      # print(paste(expr, cell, hd, sep = "_"))
      dataTmp <- fread(paste0(inPath, "RNA_CD8_", cell, "_", expr, "_", hd, ".genes.results")) %>% dplyr::select(expected_count)
      rna_expCount <- cbind(rna_expCount, round(dataTmp))
      dataTmp <- fread(paste0(inPath, "RNA_CD8_", cell, "_", expr, "_", hd, ".genes.results")) %>% dplyr::select(TPM)
      rna_TPM <- cbind(rna_TPM, round(dataTmp))
      dataTmp <- fread(paste0(inPath, "RNA_CD8_", cell, "_", expr, "_", hd, ".genes.results")) %>% dplyr::select(FPKM)
      rna_FPKM <- cbind(rna_FPKM, round(dataTmp))

    }
  }

}
geneID = fread(paste0(inPath, "RNA_CD8_N_Input_HD1.genes.results"))$gene_id
rownames(rna_expCount) <- geneID
colnames(rna_expCount) <- paste(rep(rep(exprList, each = length(hdList)), length(cellList)), rep(cellList, each = length(hdList)*length(exprList)), rep(hdList, length(exprList)*length(cellList)), sep = "_")
rownames(rna_TPM) <- geneID
colnames(rna_TPM) <- paste(rep(rep(exprList, each = length(hdList)), length(cellList)), rep(cellList, each = length(hdList)*length(exprList)), rep(hdList, length(exprList)*length(cellList)), sep = "_")
rownames(rna_FPKM) <- geneID
colnames(rna_FPKM) <- paste(rep(rep(exprList, each = length(hdList)), length(cellList)), rep(cellList, each = length(hdList)*length(exprList)), rep(hdList, length(exprList)*length(cellList)), sep = "_")

library(chromVAR)
library(SummarizedExperiment)
library(Matrix)
outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

countMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_countMat.rds"))
normMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds"))


In [6]:
target_gene_list = c("Tim3", "LAG", "CCL3", "CCL4") 

In [None]:
## mean across healthy donors
fig_path = "CART_CUTRUN_Project/results/paper_figure/fig1/"
pdf(paste0(fig_path, "/K4_VS_K27_pointColorSizeRNA_TRgenes_nolabel.pdf"), width = 20, height = 20)

for(cT in c("N", "CM", "EM", "EMRA")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")

    rna_select_data = rna_expCount
    cr_select_data = normMat$H3K27me3

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

    k27_data = log2(normMat$H3K27me3[, cr_col_index] + 1) %>% rowMeans %>% t %>% t %>% data.frame
    colnames(k27_data) = c("K27")
    k27_data$gene_name = rownames(k27_data) %>% tolower

    k4_data = log2(normMat$H3K4me2[, cr_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(k4_data) = c("K4")
    k4_data$gene_name = rownames(k4_data) %>% tolower

    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    RNA_data = log2(data.frame(rna_select_data)[, rna_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(RNA_data) = c("RNA")
    RNA_data$gene_name = rna_gene

    cr_data = cbind(k27_data, K4 = k4_data$K4)
    cr_rna_full = full_join(RNA_data, cr_data, by = "gene_name") %>% dplyr::select(gene_name, K4, K27, RNA)
    cr_rna_full[is.na(cr_rna_full)] = 0



    print(cr_rna_full %>% ggplot() +
        geom_point(aes(x = K4, y = K27, color = RNA, size = RNA), alpha = 0.5) +
        theme_bw(base_size = 50) +
        xlab("H3K4me2: log2(scaled count + 1)") +
        ylab("H3K27me3: log2(scaled count + 1)") +
        scale_color_viridis() +
        ggtitle(cT) +
        geom_vline(xintercept = 4.5, linetype = "dashed") +
        geom_hline(yintercept = 2.5, linetype = "dashed"))



}
dev.off()



In [None]:
## mean across healthy donors
fig_path = "CART_CUTRUN_Project/results/fig1/"
pdf(paste0(fig_path, "/K4_VS_K27_pointColorSizeRNA_TRgenes_withlabel.pdf"), width = 20, height = 20)

for(cT in c("N", "CM", "EM", "EMRA")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")

    rna_select_data = rna_expCount
    cr_select_data = normMat$H3K27me3

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

    k27_data = log2(normMat$H3K27me3[, cr_col_index] + 1) %>% rowMeans %>% t %>% t %>% data.frame
    colnames(k27_data) = c("K27")
    k27_data$gene_name = rownames(k27_data) %>% tolower

    k4_data = log2(normMat$H3K4me2[, cr_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(k4_data) = c("K4")
    k4_data$gene_name = rownames(k4_data) %>% tolower

    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    RNA_data = log2(data.frame(rna_select_data)[, rna_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(RNA_data) = c("RNA")
    RNA_data$gene_name = rna_gene

    cr_data = cbind(k27_data, K4 = k4_data$K4)
    cr_rna_full = full_join(RNA_data, cr_data, by = "gene_name") %>% dplyr::select(gene_name, K4, K27, RNA)
    cr_rna_full[is.na(cr_rna_full)] = 0



    print(cr_rna_full %>% ggplot() +
        geom_point(aes(x = K4, y = K27, color = RNA, size = RNA), alpha = 0.5) +
        geom_point(aes(x = K4, y = K27, size = RNA), data = cr_rna_full %>% dplyr::filter(gene_name %in% tolower(target_gene_list)), color = "black", size = 8) +
        geom_text_repel(aes(x = K4, y = K27, label = gene_name), data = cr_rna_full %>% dplyr::filter(gene_name %in% tolower(target_gene_list)), color = "black", size = 15) +
        theme_bw(base_size = 50) +
        xlab("H3K4me2: log2(scaled count + 1)") +
        ylab("H3K27me3: log2(scaled count + 1)") +
        scale_color_viridis() +
        ggtitle(cT) +
        geom_vline(xintercept = 4.5, linetype = "dashed") +
        geom_hline(yintercept = 2.5, linetype = "dashed"))



}
dev.off()



In [8]:
## PCA of cordant or discordant genes and see how well the cell types are separated
cr_rna_full$K4_group = "-"
cr_rna_full$K4_group[which(cr_rna_full$K4 > 4.5)] = "+"
cr_rna_full$K27_group = "-"
cr_rna_full$K27_group[which(cr_rna_full$K27 > 2.5)] = "+"
cr_rna_full$RNA_group = "-"
cr_rna_full$RNA_group[which(cr_rna_full$RNA > 2.5)] = "+"

In [None]:
## full gene
rna_col_index = which(rna_expCount %>% colnames %>% stringr::str_detect("Input"))
cr_col_index = which(normMat$H3K27me %>% colnames %>% stringr::str_detect("Input"))

rna_m = rna_expCount %>% data.frame
rownames(rna_m) = rownames(rna_expCount)
rna_m = rna_m[, rna_col_index]

rna_obj = CreateSeuratObject(counts = rna_m[, 1:18], project = "cart", min.cells = 0, min.features = 50)  %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = 1000000) %>% 
ScaleData()
rna_obj = RunPCA(rna_obj, features = VariableFeatures(object = rna_obj), npcs = 10)

k27_obj = CreateSeuratObject(counts = normMat$H3K27me[, cr_col_index][, 1:12], project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = round(nrow(normMat$H3K27me)/10*8)) %>% 
ScaleData()
k27_obj = RunPCA(k27_obj, features = VariableFeatures(object = k27_obj), npcs = 10)

k4_obj = CreateSeuratObject(counts = normMat$H3K4me2[, cr_col_index][, 1:12], project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = round(nrow(normMat$H3K4me2)/10*8)) %>% 
ScaleData()
k4_obj = RunPCA(k4_obj, features = VariableFeatures(object = k4_obj), npcs = 10)

cr_obj = CreateSeuratObject(counts = rbind(normMat$H3K27me, normMat$H3K4me2)[, cr_col_index][, 1:12], project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = round((nrow(normMat$H3K27me) + nrow(normMat$H3K4me2))/10*8)) %>% 
ScaleData()
cr_obj = RunPCA(cr_obj, features = VariableFeatures(object = cr_obj), npcs = 10)


In [None]:
options(repr.plot.width=15, repr.plot.height=5)
# DimPlot(rna_obj, reduction = "pca")
ct_list = c("N", "CM", "EM") #, "EMRA")
p1 = Embeddings(rna_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 6), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("RNA-seq") +
rremove("legend.title")

p2 = Embeddings(k27_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3") +
rremove("legend.title")

p3 = Embeddings(k4_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K4me2") +
rremove("legend.title")

p4 = Embeddings(cr_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3+H3K4me2") +
rremove("legend.title")

ggarrange(p1, p2, p3, p4, nrow = 1, ncol = 4, common.legend = TRUE)
ggsave(filename = "CART_CUTRUN_Project/results/paper_figure/fig1/PCA_seurat_procedure_usingReadsAroundGenes.pdf", width = 20, height = 6)

In [None]:
rna_col_index = which(rna_expCount %>% colnames %>% stringr::str_detect("Input"))
cr_col_index = which(normMat$H3K27me %>% colnames %>% stringr::str_detect("Input"))
rna_m = rna_expCount %>% data.frame
rownames(rna_m) = rownames(rna_expCount)


rna_obj = CreateSeuratObject(counts = rna_m[, rna_col_index][, 1:18], project = "cart", min.cells = 0, min.features = 50)  %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = nrow(rna_m)) %>% 
ScaleData()

k27_obj = CreateSeuratObject(counts = normMat$H3K27me[, cr_col_index][, 1:12], project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = nrow(normMat$H3K27me)) %>% 
ScaleData()

k4_obj = CreateSeuratObject(counts = normMat$H3K4me2[, cr_col_index][, 1:12], project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = nrow(normMat$H3K4me2)) %>% 
ScaleData()

cr_obj = CreateSeuratObject(counts = rbind(normMat$H3K27me, normMat$H3K4me2)[, cr_col_index][, 1:12], project = "cart", min.cells = 0, min.features = 50) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = nrow(normMat$H3K27me)*2) %>% 
ScaleData()

In [None]:
# gene_category = "RNA+K4+K27-_CE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "-", RNA_group == "+") %$% gene_name

gene_category = "RNA-K4+K27-_DCNE"
target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "-", RNA_group == "-") %$% gene_name

# gene_category = "RNA+K4-K27+_DE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "+", RNA_group == "+") %$% gene_name

# gene_category = "RNA-K4-K27+_CNE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "+", RNA_group == "-") %$% gene_name

# gene_category = "RNA+K4+K27+_BE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "+", RNA_group == "+") %$% gene_name

# gene_category = "RNA-K4+K27+_BNE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "+", RNA_group == "-") %$% gene_name

# gene_category = "RNA+K4-K27-_NME"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "-", RNA_group == "+") %$% gene_name

# gene_category = "RNA-K4-K27-_NMNE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "-", RNA_group == "-") %$% gene_name

rna_gene_name_list = rownames(rna_m) %>% gsub(".*_", "", .) %>% tolower 
rna_row_index = which(rna_gene_name_list %in% target_gene)

cr_gene_name_list = rownames(normMat$H3K27me3) %>% tolower
cr_row_index = which(cr_gene_name_list %in% target_gene)

rna_obj_subset = rna_obj[rna_row_index, ]
rna_obj_subset = RunPCA(rna_obj_subset, features = VariableFeatures(object = rna_obj_subset), npcs = 10)

k27_obj_subset = k27_obj[cr_row_index, ]
k27_obj_subset = RunPCA(k27_obj_subset, features = VariableFeatures(object = k27_obj_subset), npcs = 10)

k4_obj_subset = k4_obj[cr_row_index, ]
k4_obj_subset = RunPCA(k4_obj_subset, features = VariableFeatures(object = k4_obj_subset), npcs = 10)

cr_obj_subset = cr_obj[c(cr_row_index, nrow(normMat$H3K27me) + cr_row_index), ]
cr_obj_subset = RunPCA(cr_obj_subset, features = VariableFeatures(object = cr_obj_subset), npcs = 10)

options(repr.plot.width=15, repr.plot.height=5)
# DimPlot(rna_obj, reduction = "pca")
ct_list = c("N", "CM", "EM") #, "EMRA")
p12 = Embeddings(rna_obj_subset, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 6), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("RNA-seq") +
rremove("legend.title")

p22 = Embeddings(k27_obj_subset, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3") +
rremove("legend.title")

p32 = Embeddings(k4_obj_subset, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K4me2") +
rremove("legend.title")

p42 = Embeddings(cr_obj_subset, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, each = 4), levels = ct_list)) %>% filter(celltype != "EMRA") %>%
ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0, end = 0.75) +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3+H3K4me2") +
rremove("legend.title")

ggarrange(p12, p22, p32, p42, nrow = 1, ncol = 4, common.legend = TRUE)
ggsave(filename = paste0("CART_CUTRUN_Project/results/paper_figure/fig1/PCA_seurat_procedure_", gene_category, ".pdf"), width = 20, height = 6)


In [None]:
## patient related

## Patient data infusion product of CM-derived cells 
patientRaw = read_xlsx("CART_CUTRUN_Project/report/CART_CUTRUN/analysis/20201028_NHL_CUTandRUN_PatientData.xlsx")
cm_ind = which(patientRaw$`CD8+ T cell isolation` == "CD8TCM")
patient_id_list = patientRaw$Xnumber[cm_ind] %>% tolower
patient_id_list

## Patient data infusion product of CM-derived cells 
inPath = "CART_CUTRUN_Project/results/RNAseq/process/NHL_RSEM_gencode_v21/"

pt_expCount = c()
pt_name_tmp_list = c()
for(sample in paste0("Sample_", patient_id_list, "_IP_CD8-CAR")){
  if(file.exists(paste0(inPath, sample, "/RSEM.genes.results"))){
    print(sample)
    tmp = fread(paste0(inPath, sample, "/RSEM.genes.results")) %>% dplyr::select(expected_count)
    pt_expCount = cbind(pt_expCount, round(tmp))
    pt_name_tmp_list = c(pt_name_tmp_list, sample)
  }
}
pt_expCount = data.frame(pt_expCount)
colnames(pt_expCount) = paste0(pt_name_tmp_list, "_PT") #paste0(patient_id_list, "_PT")

geneID = fread(paste0(inPath, sample, "/RSEM.genes.results"))$gene_id
pt_expCount$gene_name = geneID

head(pt_expCount)
dim(pt_expCount)

## healthy donor data
## Read in RNA-seq data
cellList <- "CM" #c("N", "CM", "EM", "EMRA")
hdList <- paste0("HD", c(1:3, 5:7))
exprList = "Product" #c("Input", "Product", "Stim1", "Stim2", "Stim3")

inPath <- "CART_CUTRUN_Project/results/RNAseq/process/HD_RSEM_gencode_v21/" #"CART_CUTRUN_Project/results/RNAseq/process/RSEM/"

hd_expCount = c()
for(cell in cellList){
  for(expr in exprList){
    for(hd in hdList){
      # print(paste(expr, cell, hd, sep = "_"))
      sample = paste0("RNA_CD8_", cell, "_", expr, "_", hd)
      if(file.exists(paste0(inPath, sample, "/RSEM.genes.results"))){
        print(sample)
        dataTmp = fread(paste0(inPath, sample, "/RSEM.genes.results")) %>% dplyr::select(expected_count)
        # dataTmp <- fread(paste0(inPath, "RNA_CD8_", cell, "_", expr, "_", hd, ".genes.results")) %>% dplyr::select(expected_count)
        hd_expCount <- cbind(hd_expCount, round(dataTmp))
      }

    }
  }

}
hd_expCount = data.frame(hd_expCount)
colnames(hd_expCount) <- paste(rep(rep(exprList, each = length(hdList)), length(cellList)), rep(cellList, each = length(hdList)*length(exprList)), rep(hdList, length(exprList)*length(cellList)), sep = "_")

geneID = fread(paste0(inPath, sample, "/RSEM.genes.results"))$gene_id
hd_expCount$gene_name <- geneID
head(hd_expCount)
dim(hd_expCount)

dim(pt_expCount)
dim(hd_expCount)
countMat = inner_join(pt_expCount, hd_expCount, by = "gene_name")
rownames(countMat) = countMat$gene_name
countMat = countMat %>% dplyr::select(-gene_name)
head(countMat)
dim(countMat)

countMat_rna = countMat

In [None]:
options(repr.plot.width=7, repr.plot.height=6)
selectR = which(rowSums(countMat_rna) > 10) ## remove low count genes
length(selectR)

rna_obj = CreateSeuratObject(counts = countMat_rna[selectR, ], project = "cart", min.cells = 0, min.features = 10) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR)) %>%  # length(selectR)
ScaleData()

rna_obj = RunPCA(rna_obj, features = VariableFeatures(object = rna_obj), npcs = 10)
rna_plot = Embeddings(rna_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", ncol(pt_expCount) - 1), rep("HD", ncol(hd_expCount) - 1))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("RNA-seq") +
rremove("legend.title")

pdf("CART_CUTRUN_Project/results/paper_figure/fig1/CMpatient_CMhealthyDonor_PCA_on_RNA-seq_allfeatures.pdf", width = 7, height = 6)
rna_plot
dev.off()
rna_plot

In [None]:
patientRaw = read_xlsx("CART_CUTRUN_Project/report/CART_CUTRUN/analysis/20201028_NHL_CUTandRUN_PatientData.xlsx")
cm_ind = which(patientRaw$`CD8+ T cell isolation` == "CD8TCM")
patientRaw$Xnumber[cm_ind]

## 1. TSS neighbor regions
## use TSS region for differential detection
## Get the TSS location and +/-1kb region around TSS
gtf <- rtracklayer::import('/fh/fast/gottardo_r/yezheng_working/SupplementaryData/hg38/transcriptomeHumanReference/gencode.v33.annotation.gtf') #rtracklayer::import('/fh/fast/gottardo_r/yezheng_working/SupplementaryData/hg38/GENCODE/gencode.v21.annotation.gtf')
gtf_df=as.data.frame(gtf)


## get the TSS location and extend by 1kb up and downstream
tss_df = rbind(
    gtf_df %>% data.frame %>% dplyr::filter(type == "gene", strand == "+") %>% dplyr::select(seqnames, TSS = start, strand, source, gene_id, gene_name, gene_name, gene_type) ,
    gtf_df %>% data.frame %>% dplyr::filter(type == "gene", strand == "-") %>% dplyr::select(seqnames, TSS = end, strand, source, gene_id, gene_name, gene_name, gene_type) 
)
tss_filter_df = tss_df %>% dplyr::filter(!(seqnames %in% c("chrM", "chrX", "chrY")))

tss_gr = GRanges(seqnames = tss_filter_df$seqnames, ranges = IRanges(start = tss_filter_df$TSS - 1000, end = tss_filter_df$TSS + 1000), strand = tss_filter_df$strand, gene_id = tss_filter_df$gene_id, gene_name = tss_filter_df$gene_name, gene_type = tss_filter_df$gene_type)
tss_gr 

## patient data

outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
histList = c("H3K27me3", "H3K4me2")

countMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_NHLpatient_noChrXYM_countMat.rds"))
normMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_NHLpatient_noChrXYM_normMat.rds"))
designInfo = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_NHLpatient_noChrXYM_designInfo.rds"))

pt_countMat = countMat
pt_countMat$H3K27me3 = countMat$H3K27me3[, cm_ind]
pt_countMat$H3K4me2 = countMat$H3K4me2[, cm_ind]
colnames(pt_countMat$H3K27me3) = paste0(colnames(pt_countMat$H3K27me3), "_PT")
colnames(pt_countMat$H3K4me2) = paste0(colnames(pt_countMat$H3K4me2), "_PT")

pt_designInfo = designInfo %>% dplyr::filter(exps %in% patientRaw$Xnumber[cm_ind])
head(pt_countMat$H3K27me3)
pt_designInfo %>% arrange(hist, exps)

## healthy donor data
countMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_countMat.rds"))
normMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds"))
designInfo = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_designInfo.rds"))

selectC = colnames(countMat$H3K27me3) %>% stringr::str_detect("CM_Product")
head(countMat$H3K4me2[, selectC])

hd_countMat = countMat
hd_countMat$H3K27me3 = countMat$H3K27me3[, selectC]
hd_countMat$H3K4me2 = countMat$H3K4me2[, selectC]

hd_designInfo = designInfo[which(stringr::str_detect(designInfo$exps, "CM_Product")), ]
hd_designInfo

for(hist in histList){
    tmp = c()

    for(j in 1:ncol(pt_countMat[[hist]])){
        pt_designInfo_tmp = pt_designInfo %>% dplyr::filter(hist == hist)
        tmp = cbind(tmp, round(pt_countMat[[hist]][, j]/pt_designInfo_tmp$depth[j] * 13240732)) ## remove sequencing depth effect
    }
    for(j in 1:ncol(hd_countMat[[hist]])){
        hd_designInfo_tmp = hd_designInfo %>% dplyr::filter(hist == hist)
        tmp = cbind(tmp, round(hd_countMat[[hist]][, j]/hd_designInfo_tmp$depth[j] * 13240732)) ## remove sequencing depth effect
    }
    countMat[[hist]] = tmp
    colnames(countMat[[hist]]) = c(colnames(pt_countMat[[hist]]), colnames(hd_countMat[[hist]]))
}


In [None]:
options(repr.plot.width=22, repr.plot.height=10)
hist = "H3K27me3"
selectR = which(rowSums(countMat[[hist]]) > 50) ## remove low count genes

tmp = countMat[[hist]][selectR, ]

k27_obj = CreateSeuratObject(counts = countMat[[hist]][selectR, ], project = "cart", min.cells = 0, min.features = 50) %>%  
# NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR)) %>% 
ScaleData()
k27_obj = RunPCA(k27_obj, features = VariableFeatures(object = k27_obj), npcs = 15)
k27_plot = Embeddings(k27_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", 15), rep("HD", 7))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3") +
rremove("legend.title")

hist = "H3K4me2"
selectR = which(rowSums(countMat[[hist]]) > 1000) ## remove low count genes
length(selectR)
k4_obj = CreateSeuratObject(counts = countMat[[hist]][selectR, ], project = "cart", min.cells = 0, min.features = 50) %>%  
# NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR)) %>% 
ScaleData()
k4_obj = RunPCA(k4_obj, features = VariableFeatures(object = k4_obj), npcs = 15)

k4_plot = Embeddings(k4_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", 15), rep("HD", 7))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K4me2") +
rremove("legend.title")

cr_obj = CreateSeuratObject(counts = rbind(tmp, countMat[[hist]][selectR, ]), project = "cart", min.cells = 0, min.features = 50) %>%  
# NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR) + nrow(tmp)) %>% 
ScaleData()
cr_obj = RunPCA(cr_obj, features = VariableFeatures(object = cr_obj), npcs = 15)
cr_plot = Embeddings(cr_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", 15), rep("HD", 7))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3 + H3K4me2") +
rremove("legend.title")


pdf("CART_CUTRUN_Project/results/paper_figure/fig1/CMpatient_CMhealthyDonor_PCA_on_all_TSSneighbors.pdf", width = 15, height = 6)
ggarrange(k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 3, nrow = 1)
dev.off()
ggarrange(k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 3, nrow = 1)


In [None]:
pdf("CART_CUTRUN_Project/results/paper_figure/fig1/CMpatient_CMhealthyDonor_PCA_on_all_TSSneighbors_RNA_K27_K4.pdf", width = 20, height = 6)
ggarrange(rna_plot, k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 4, nrow = 1)
dev.off()

In [None]:
options(repr.plot.width=21, repr.plot.height=6)
## select cordant and discordant genes

# gene_category = "RNA+K4+K27-_CE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "-", RNA_group == "+") %$% gene_name

# gene_category = "RNA-K4+K27-_DCNE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "-", RNA_group == "-") %$% gene_name

# gene_category = "RNA+K4-K27+_DE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "+", RNA_group == "+") %$% gene_name

# gene_category = "RNA-K4-K27+_CNE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "+", RNA_group == "-") %$% gene_name

# gene_category = "RNA+K4+K27+_BE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "+", RNA_group == "+") %$% gene_name

# gene_category = "RNA-K4+K27+_BNE"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "+", K27_group == "+", RNA_group == "-") %$% gene_name

# gene_category = "RNA+K4-K27-_NME"
# target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "-", RNA_group == "+") %$% gene_name

gene_category = "RNA-K4-K27-_NMNE"
target_gene = cr_rna_full %>% dplyr::filter(K4_group == "-", K27_group == "-", RNA_group == "-") %$% gene_name

rna_gene_name_list = rownames(countMat_rna) %>% gsub(".*_", "", .) %>% tolower 
rna_row_index = which(rna_gene_name_list %in% target_gene)

cr_gene_name_list = rownames(countMat[["H3K4me2"]]) %>% tolower
cr_row_index = which(cr_gene_name_list %in% target_gene)


selectR = rna_row_index
length(selectR)

rna_obj = CreateSeuratObject(counts = countMat_rna[selectR, ], project = "cart", min.cells = 0, min.features = 10) %>%  
NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR)) %>%  # length(selectR)
ScaleData()

rna_obj = RunPCA(rna_obj, features = VariableFeatures(object = rna_obj), npcs = 10)
rna_plot = Embeddings(rna_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", ncol(pt_expCount) - 1), rep("HD", ncol(hd_expCount) - 1))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("RNA-seq") +
rremove("legend.title")


hist = "H3K27me3"
selectR = cr_row_index

tmp = countMat[[hist]][selectR, ]

k27_obj = CreateSeuratObject(counts = countMat[[hist]][selectR, ], project = "cart", min.cells = 0, min.features = 50) %>%  
# NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR)) %>% 
ScaleData()
k27_obj = RunPCA(k27_obj, features = VariableFeatures(object = k27_obj), npcs = 15)
k27_plot = Embeddings(k27_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", 15), rep("HD", 7))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3") +
rremove("legend.title")

hist = "H3K4me2"
selectR = cr_row_index
length(selectR)
k4_obj = CreateSeuratObject(counts = countMat[[hist]][selectR, ], project = "cart", min.cells = 0, min.features = 50) %>%  
# NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR)) %>% 
ScaleData()
k4_obj = RunPCA(k4_obj, features = VariableFeatures(object = k4_obj), npcs = 15)

k4_plot = Embeddings(k4_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", 15), rep("HD", 7))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K4me2") +
rremove("legend.title")

cr_obj = CreateSeuratObject(counts = rbind(tmp, countMat[[hist]][selectR, ]), project = "cart", min.cells = 0, min.features = 50) %>%  
# NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>% 
FindVariableFeatures(., selection.method = "vst", nfeatures = length(selectR) + nrow(tmp)) %>% 
ScaleData()
cr_obj = RunPCA(cr_obj, features = VariableFeatures(object = cr_obj), npcs = 15)
cr_plot = Embeddings(cr_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(group = c(rep("PT", 15), rep("HD", 7))) %>% 
ggplot(aes(x = PC_1, y = PC_2, color = group)) +
geom_point(size = 5) +
theme_bw(base_size = 20) +
scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
xlab("PC 1") +
ylab("PC 2") +
ggtitle("H3K27me3 + H3K4me2") +
rremove("legend.title")


ggarrange(rna_plot, k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 4, nrow = 1)
ggsave(paste0("CART_CUTRUN_Project/results/paper_figure/fig1/CMpatient_CMhealthyDonor_PCA_on_all_TSSneighbors_RNA_K27_K4_", gene_category, ".pdf"), width = 20, height = 6)


In [None]:
## Bivalent genes
## CUTRUN DE and assign to nearest genes

library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)

cell_list <- c("N", "CM", "EM", "EMRA")
hd_list <- paste0("HD", c(1:3, 5:7))
expr_list = c("Input", "Product", "Stim1", "Stim2", "Stim3")
cell_comp_list = c("N_CM", "N_EM", "N_EMRA", "CM_EM", "CM_EMRA", "EM_EMRA")
hist_list <- c("H3K27me3", "H3K4me2")

in_path <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
out_path <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"
fig_path <- "CART_CUTRUN_Project/results/paper_figure/"
inPath = "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"

## Bivalent methylation marks

load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/IDR_master_peak_list_chromVar_count_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))

names(peakAll)

In [None]:

## overlap K4me2 and K27me3 ensuring at least 50bp overlap, count number of regions after merging
fig_path = "CART_CUTRUN_Project/results/paper_figure/fig1"
bi_path <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/BivalentGenes/"
gene_gr <- genes(TxDb.Hsapiens.UCSC.hg38.knownGene, columns = "gene_id")

get_nearest_gene <- function(gene_gr, region_gr){
    region_assign_gene <- distanceToNearest(region_gr, gene_gr, select = "all")
    # dist_tmp <- region_assign_gene %>% data.frame %$% distance
    # region_assign_gene <- region_assign_gene[dist_tmp < 1000]
    region_match_gene <- cbind(data.frame(region_gr)[region_assign_gene@from, ], data.frame(peak_id = region_assign_gene@from, gene_name = mapIds(org.Hs.eg.db, gene_gr$gene_id[region_assign_gene@to], "SYMBOL", "ENTREZID") %>% as.vector) )
    return(region_match_gene)

}
bi_num <- c()
for(cT in c("N", "CM", "EM", "EMRA")){
    for(pt in paste0("HD", 4:7)){
        bi_olp <- findOverlaps(peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]], peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]], minoverlap = 50)
        bi_peak.gr <- reduce(append(peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]][bi_olp@from], peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]][bi_olp@to]))
        bi_gene <- get_nearest_gene(gene_gr, bi_peak.gr)
        write.table(unique(bi_gene$gene_name), paste0(bi_path, "Bivalent_peaks_nearest_genes_", cT, "_", pt, ".csv"), quote = F, row.names = F, col.names = F, sep = ",")

        total_peak.gr <- append(peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]], peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]]) %>% reduce
        total_gene <- get_nearest_gene(gene_gr, total_peak.gr)

        bi_num <- data.frame(
            bi_peak_num = length(bi_peak.gr),
            bi_gene_num = length(unique(bi_gene$gene_name)),
            total_peak_num = length(total_peak.gr),
            total_gene_num = length(unique(total_gene$gene_name)),
            cT = cT,
            pt = pt
        ) %>% rbind(bi_num, .)
    }
}
bi_num$cT <- factor(bi_num$cT, levels = c("N", "CM", "EM", "EMRA"))

# p1 = bi_num %>% ggplot(aes(x = pt, y = bi_peak_num, fill = cT)) + 
# geom_bar(stat = "identity", position = position_dodge()) +
# scale_fill_viridis(discrete = TRUE) +
# theme_bw(base_size = 20) +
# xlab("") +
# ylab("# of Peak Regions") +
# rremove("legend.title")

# p2 = bi_num %>% mutate(bi_peak_prop = bi_peak_num/total_peak_num * 100) %>% ggplot(aes(x = pt, y = bi_peak_prop, fill = cT)) + 
# geom_bar(stat = "identity", position = position_dodge()) +
# scale_fill_viridis(discrete = TRUE) +
# theme_bw(base_size = 20) +
# xlab("") +
# ylab("% of Peak Regions") +
# rremove("legend.title")

# p3 = bi_num %>% ggplot(aes(x = pt, y = bi_gene_num, fill = cT)) + 
# geom_bar(stat = "identity", position = position_dodge()) +
# scale_fill_viridis(discrete = TRUE) +
# theme_bw(base_size = 20) +
# xlab("") +
# ylab("# of Genes") +
# rremove("legend.title")

# p4 = bi_num %>% mutate(bi_gene_prop = bi_gene_num/total_gene_num * 100) %>% ggplot(aes(x = pt, y = bi_gene_prop, fill = cT)) + 
# geom_bar(stat = "identity", position = position_dodge()) +
# scale_fill_viridis(discrete = TRUE) +
# theme_bw(base_size = 20) +
# xlab("") +
# ylab("% of Genes") +
# rremove("legend.title")

# ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = TRUE) 


b1 = bi_num %>% filter(cT != "EMRA") %>% ggplot(aes(x = cT, y = bi_peak_num, fill = cT)) + 
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, begin = 0, end = 0.75, alpha = 0.8) +
theme_bw(base_size = 20) +
xlab("") +
ylab("# of Peak Regions") +
rremove("legend")

b2 = bi_num %>% filter(cT != "EMRA") %>% mutate(bi_peak_prop = bi_peak_num/total_peak_num * 100) %>% ggplot(aes(x = cT, y = bi_peak_prop, fill = cT)) + 
geom_boxplot() +
scale_fill_viridis(discrete = TRUE, begin = 0, end = 0.75, alpha = 0.8) +
theme_bw(base_size = 20) +
xlab("") +
ylab("% of Peak Regions") +
rremove("legend")

b3 = bi_num %>% filter(cT != "EMRA") %>% ggplot() + 
geom_boxplot(aes(x = cT, y = bi_gene_num, fill = cT)) +
scale_fill_viridis(discrete = TRUE, begin = 0, end = 0.75, alpha = 0.8) +
theme_bw(base_size = 20) +
xlab("") +
ylab("# of Genes") +
rremove("legend")


b4 = bi_num %>% filter(cT != "EMRA") %>% mutate(bi_gene_prop = bi_gene_num/total_gene_num * 100) %>% ggplot() + 
geom_boxplot(aes(x = cT, y = bi_gene_prop, fill = cT)) +
scale_fill_viridis(discrete = TRUE, begin = 0, end = 0.75, alpha = 0.8) +
theme_bw(base_size = 20) +
xlab("") +
ylab("% of Genes") +
rremove("legend") 

ggarrange(b1, b2, b3, b4, ncol = 4, nrow = 1)
ggsave(paste0(fig_path, "/bivalent_peak_gene_num_prop_boxplot.pdf"), width = 28, height = 7)


# pdf(paste0(fig_path, "/bivalent_peak_gene_num_prop.pdf"), width = 10, height = 10)
# ggarrange(p1, p2, p3, p4, ncol = 2, nrow = 2, common.legend = TRUE) 
# dev.off()


In [None]:
load(file = paste0(out_path, "/RData/HD1-3_5-7_normLimma_perExprCondition.RData"))
rna_results <- results

bi_change <- c()
bi_change_gene_lfc <- c()
ct_list <- c("N", "CM", "EM", "EMRA")
pt_list <- paste0("HD", 4:7)
for(i in 1:length(ct_list)){
    cT <- ct_list[i]
    cT1 <- cT
    for(j in 1:length(ct_list)){
        if(i != j){
            cT2 <- ct_list[j]
            for(pt in pt_list){
                bi_olp <- findOverlaps(peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]], peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]], minoverlap = 50)
                bi_olp2 <- findOverlaps(peakAll[[paste0("H3K27me3_", cT2, "_Input_", pt)]], peakAll[[paste0("H3K4me2_", cT2, "_Input_", pt)]], minoverlap = 50)

                bi_peak.gr <- reduce(append(peakAll[[paste0("H3K27me3_", cT, "_Input_", pt)]][bi_olp@from], peakAll[[paste0("H3K4me2_", cT, "_Input_", pt)]][bi_olp@to]))
                bi_peak.gr2 <- reduce(append(peakAll[[paste0("H3K27me3_", cT2, "_Input_", pt)]][bi_olp2@from], peakAll[[paste0("H3K4me2_", cT2, "_Input_", pt)]][bi_olp2@to]))

                bi_again <- findOverlaps(bi_peak.gr, bi_peak.gr2, minoverlap = 50)@from %>% unique
                bi_k27 <- findOverlaps(bi_peak.gr, peakAll[[paste0("H3K27me3_", cT2, "_Input_", pt)]], minoverlap = 50)@from %>% unique
                bi_k4 <- findOverlaps(bi_peak.gr, peakAll[[paste0("H3K4me2_", cT2, "_Input_", pt)]], minoverlap = 50)@from %>% unique
                bi_k27 <- setdiff(setdiff(bi_k27, bi_k4), bi_again)
                bi_k4 <- setdiff(setdiff(bi_k4, bi_k27), bi_again)
                bi_none <- setdiff(1:length(bi_peak.gr), unique(c(bi_again, bi_k27, bi_k4)) %>% sort)

                bi_gene <- get_nearest_gene(gene_gr, bi_peak.gr)
                bi_again_gene <- bi_gene %>% filter(peak_id %in% bi_again) %$% gene_name %>% unique
                bi_k27_gene <- bi_gene %>% filter(peak_id %in% bi_k27) %$% gene_name %>% unique
                bi_k4_gene <- bi_gene %>% filter(peak_id %in% bi_k4) %$% gene_name %>% unique
                bi_none_gene <- setdiff(unique(bi_gene$gene_name), c(bi_again_gene, bi_k27_gene, bi_k4_gene)) %>% unique
                write.table(bi_again_gene, paste0(bi_path, "BivalentPeaksOf", cT1, "_BivalentPeakAgainIn", cT2, "_nearest_genes_", pt, ".csv"), quote = F, row.names = F, col.names = F, sep = ",")
                write.table(bi_k27_gene, paste0(bi_path, "BivalentPeaksOf", cT1, "_H3K27me3In", cT2, "_nearest_genes_", pt, ".csv"), quote = F, row.names = F, col.names = F, sep = ",")
                write.table(bi_k4_gene, paste0(bi_path, "BivalentPeaksOf", cT1, "_H3K4me2In", cT2, "_nearest_genes_", pt, ".csv"), quote = F, row.names = F, col.names = F, sep = ",")
                write.table(bi_none_gene, paste0(bi_path, "BivalentPeaksOf", cT1, "_NoPeakIn", cT2, "_nearest_genes_", pt, ".csv"), quote = F, row.names = F, col.names = F, sep = ",")
                
                if(i < j){
                    bi_again_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT1, "_", cT2)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT1, "_", cT2)]]$logFC) %>% filter(gene_name %in% bi_again_gene)
                    bi_k27_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT1, "_", cT2)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT1, "_", cT2)]]$logFC) %>% filter(gene_name %in% bi_k27_gene)
                    bi_k4_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT1, "_", cT2)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT1, "_", cT2)]]$logFC) %>% filter(gene_name %in% bi_k4_gene)
                    bi_none_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT1, "_", cT2)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT1, "_", cT2)]]$logFC) %>% filter(gene_name %in% bi_none_gene)
                }else{
                    bi_again_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT2, "_", cT1)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT2, "_", cT1)]]$logFC) %>% filter(gene_name %in% bi_again_gene)
                    bi_k27_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT2, "_", cT1)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT2, "_", cT1)]]$logFC) %>% filter(gene_name %in% bi_k27_gene)
                    bi_k4_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT2, "_", cT1)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT2, "_", cT1)]]$logFC) %>% filter(gene_name %in% bi_k4_gene)
                    bi_none_gene_lfc <- data.frame(gene_name = rna_results[[paste0("Input_", cT2, "_", cT1)]]$GeneName %>% gsub(".*_", "", .), GEX = rna_results[[paste0("Input_", cT2, "_", cT1)]]$logFC) %>% filter(gene_name %in% bi_none_gene)

                }
                
                bi_change <- data.frame(
                    cT = cT, cT_compare = cT2, pt = pt, 
                    bi_original_num = length(bi_peak.gr), bi_again_num = length(bi_again), bi_k27_num = length(bi_k27), bi_k4_num = length(bi_k4), bi_none_num = length(bi_none),
                    bi_original_gene_num = length(unique(bi_gene$gene_name)), bi_again_gene_num = length(bi_again_gene), bi_k27_gene_num = length(bi_k27_gene), bi_k4_gene_num = length(bi_k4_gene), bi_none_gene_num = length(bi_none_gene)) %>% 
                    mutate(bi_again_prop = bi_again_num/bi_original_num * 100, bi_k27_prop = bi_k27_num/bi_original_num * 100, bi_k4_prop = bi_k4_num/bi_original_num * 100, bi_none_prop = bi_none_num/bi_original_num * 100) %>% 
                    mutate(bi_again_gene_prop = bi_again_gene_num/bi_original_gene_num * 100, bi_k27_gene_prop = bi_k27_gene_num/bi_original_gene_num * 100, bi_k4_gene_prop = bi_k4_gene_num/bi_original_gene_num * 100, bi_none_gene_prop = bi_none_gene_num/bi_original_gene_num * 100) %>% 
                    rbind(bi_change, .)
                
                bi_change_gene_lfc <- data.frame(
                    cT = cT, cT_compare = cT2, pt = pt, 
                    logFC = bi_again_gene_lfc$GEX, gene_name = bi_again_gene_lfc$gene_name, type = "bi_again"
                ) %>% rbind(bi_change_gene_lfc, .)
                bi_change_gene_lfc <- data.frame(
                    cT = cT, cT_compare = cT2, pt = pt, 
                    logFC = bi_k27_gene_lfc$GEX, gene_name = bi_k27_gene_lfc$gene_name, type = "bi_k27"
                ) %>% rbind(bi_change_gene_lfc, .)
                bi_change_gene_lfc <- data.frame(
                    cT = cT, cT_compare = cT2, pt = pt, 
                    logFC = bi_k4_gene_lfc$GEX, gene_name = bi_k4_gene_lfc$gene_name, type = "bi_k4"
                ) %>% rbind(bi_change_gene_lfc, .)
                if(nrow(bi_none_gene_lfc) > 0){
                    bi_change_gene_lfc <- data.frame(
                    cT = cT, cT_compare = cT2, pt = pt, 
                    logFC = bi_none_gene_lfc$GEX, gene_name = bi_none_gene_lfc$gene_name, type = "bi_none"
                ) %>% rbind(bi_change_gene_lfc, .)
                }
                
            }
        }
        
    }
}


bi_change %>% head
bi_change_gene_lfc %>% head

In [6]:
library(tidyr)
bi_change$cT <- factor(bi_change$cT, levels = ct_list)
bi_change$cT_compare <- factor(bi_change$cT_compare, levels = ct_list)


p1 = bi_change %>% pivot_longer(., cols = c("bi_again_prop", "bi_k27_prop", "bi_k4_prop", "bi_none_prop")) %>% dplyr::select(name, value, cT, cT_compare, pt) %>%
filter(cT != "EMRA", cT_compare != "EMRA", name != "bi_none_prop") %>%
ggplot(aes(cT_compare, y = value, fill = factor(name, levels = c("bi_again_prop", "bi_k27_prop", "bi_k4_prop"), labels = c("Bivalent->again", "Bivalent->H3K27me3", "Bivalent->H3K4me2")), label = round(value, 1))) +
geom_boxplot(position = position_dodge(width = 0.9)) +
stat_summary(
    fun.y = median,
    geom = 'line',
    aes(group = name, colour = name),
    position = position_dodge(width = 0.9) #this has to be added
  ) +
facet_grid(~cT, scale = "free_x", space = "free") +
theme_bw(base_size = 20) +
scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8) +
scale_color_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1) +
xlab("") +
ylab("% of Peaks") +
rremove("legend.title") +
theme(legend.position = "top")

p12 = bi_change %>% pivot_longer(., cols = c("bi_again_prop", "bi_k27_prop", "bi_k4_prop", "bi_none_prop")) %>% dplyr::select(name, value, cT, cT_compare, pt) %>%
filter(cT != "EMRA", cT_compare != "EMRA", name != "bi_none_prop") %>%
ggplot(aes(cT_compare, y = value, fill = factor(name, levels = c("bi_again_prop", "bi_k27_prop", "bi_k4_prop"), labels = c("Bivalent->again", "Bivalent->H3K27me3", "Bivalent->H3K4me2")), label = round(value, 1))) +
geom_boxplot(position = position_dodge(width = 0.9)) +
facet_grid(~cT, scale = "free_x", space = "free") +
theme_bw(base_size = 20) +
scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8) +
scale_color_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1) +
xlab("") +
ylab("% of Peaks") +
rremove("legend.title") +
theme(legend.position = "top")

p2 = bi_change %>% pivot_longer(., cols = c("bi_again_gene_prop", "bi_k27_gene_prop", "bi_k4_gene_prop", "bi_none_gene_prop")) %>% dplyr::select(name, value, cT, cT_compare, pt) %>%
filter(cT != "EMRA", cT_compare != "EMRA", name != "bi_none_gene_prop") %>%
ggplot(aes(cT_compare, y = value, fill = factor(name, levels = c("bi_again_gene_prop", "bi_k27_gene_prop", "bi_k4_gene_prop"), labels = c("Bivalent->again", "Bivalent->H3K27me3", "Bivalent->H3K4me2")), label = round(value, 1))) +
geom_boxplot(position = position_dodge(width = 0.9)) +
stat_summary(
    fun.y = median,
    geom = 'line',
    aes(group = name, colour = name),
    position = position_dodge(width = 0.9) #this has to be added
  ) +
facet_grid(~cT, scale = "free_x", space = "free") +
theme_bw(base_size = 20) +
scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8) +
scale_color_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1) +
xlab("") +
ylab("% of Genes") +
rremove("legend.title") +
theme(legend.position = "top")

p22 = bi_change %>% pivot_longer(., cols = c("bi_again_gene_prop", "bi_k27_gene_prop", "bi_k4_gene_prop", "bi_none_gene_prop")) %>% dplyr::select(name, value, cT, cT_compare, pt) %>%
filter(cT != "EMRA", cT_compare != "EMRA", name != "bi_none_gene_prop") %>%
ggplot(aes(cT_compare, y = value, fill = factor(name, levels = c("bi_again_gene_prop", "bi_k27_gene_prop", "bi_k4_gene_prop"), labels = c("Bivalent->again", "Bivalent->H3K27me3", "Bivalent->H3K4me2")), label = round(value, 1))) +
geom_boxplot(position = position_dodge(width = 0.9)) +
facet_grid(~cT, scale = "free_x", space = "free") +
theme_bw(base_size = 20) +
scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8) +
scale_color_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1) +
xlab("") +
ylab("% of Genes") +
rremove("legend.title") +
theme(legend.position = "top")

pdf(paste0(fig_path, "/bivalent_peak_gene_change_between_celltypes.pdf"), width = 10, height = 8)
p1
p12
p2
p22
dev.off()

ggarrange(p1, p2, nrow = 2, ncol = 1, common.legend = TRUE)


In [None]:
bi_change_gene_lfc %>% head
bi_change_gene_lfc$cT <- factor(bi_change_gene_lfc$cT, levels = ct_list)
bi_change_gene_lfc$cT_compare <- factor(bi_change_gene_lfc$cT_compare, levels = ct_list)

bi_change_gene_lfc %>% filter(cT != "EMRA", cT_compare != "EMRA", type != "bi_none") %>%
ggplot(aes(cT_compare, y = logFC, fill = factor(type, levels = c("bi_again", "bi_k27", "bi_k4"), labels = c("Bivalent->again", "Bivalent->H3K27me3", "Bivalent->H3K4me2")))) +
geom_boxplot() +
facet_grid(~cT, scale = "free_x", space = "free") +
theme_bw(base_size = 20) +
scale_fill_viridis(discrete = TRUE, option = "magma", begin = 0.2, end = 1, alpha = 0.8) +
xlab("") +
ylab("logFC of GEX") +
rremove("legend.title") +
theme(legend.position = "top")


ggsave(paste0(fig_path, "/bivalent_peak_gene_change_between_celltypes_gex_logfc.pdf"), width = 10, height = 8)


In [None]:
cordant_gene_df = c()
for(cT in c("N", "CM", "EM", "EMRA")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")

    rna_select_data = rna_expCount
    cr_select_data = normMat$H3K27me3

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

    k27_data = log2(normMat$H3K27me3[, cr_col_index] + 1) %>% rowMeans %>% t %>% t %>% data.frame
    colnames(k27_data) = c("K27")
    k27_data$gene_name = rownames(k27_data) %>% tolower

    k4_data = log2(normMat$H3K4me2[, cr_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(k4_data) = c("K4")
    k4_data$gene_name = rownames(k4_data) %>% tolower

    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    RNA_data = log2(data.frame(rna_select_data)[, rna_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(RNA_data) = c("RNA")
    RNA_data$gene_name = rna_gene

    cr_data = cbind(k27_data, K4 = k4_data$K4)
    cr_rna_full = full_join(RNA_data, cr_data, by = "gene_name") %>% dplyr::select(gene_name, K4, K27, RNA)
    cr_rna_full[is.na(cr_rna_full)] = 0

    ## PCA of cordant or discordant genes and see how well the cell types are separated
    cr_rna_full$K4_group = "-"
    cr_rna_full$K4_group[which(cr_rna_full$K4 > 4.5)] = "+"
    cr_rna_full$K27_group = "-"
    cr_rna_full$K27_group[which(cr_rna_full$K27 > 2.5)] = "+"
    cr_rna_full$RNA_group = "-"
    cr_rna_full$RNA_group[which(cr_rna_full$RNA > 2.5)] = "+"

    cr_rna_full = cr_rna_full %>% mutate(gene_type = paste0("RNA", RNA_group, "K4", K4_group, "K27", K27_group)) %>% data.frame
        

    tmp = cr_rna_full %>% group_by(gene_type) %>% summarize(N = n()) %>% 
        mutate(cell_type = cT)


    cordant_gene_df = rbind(cordant_gene_df, left_join(cr_rna_full %>% mutate(gene_name_index = paste0(gene_name, "_", 1:nrow(cr_rna_full))), tmp, by = "gene_type"))
}
cordant_gene_df$gene_type_name = factor(cordant_gene_df$gene_type, levels = c("RNA+K4-K27-", "RNA-K4-K27-", "RNA+K4+K27-", "RNA-K4+K27+", 
"RNA+K4-K27+", "RNA-K4+K27-", "RNA-K4-K27+", "RNA+K4+K27+"), labels = c("RNA+K4-K27-(NME)", "RNA-K4-K27-(NMNE)", "RNA+K4+K27-(CE)", "RNA-K4+K27+(BNE)", 
"RNA+K4-K27+(DE)", "RNA-K4+K27-(DCNE)", "RNA-K4-K27+(CNE)", "RNA+K4+K27+(BE)"))

cordant_gene_df$cell_type = factor(cordant_gene_df$cell_type, levels = c("N", "CM", "EM"))
cordant_gene_df$gene_num = 1
head(cordant_gene_df)

In [None]:
cordant_gene_df %>% head
cordant_gene_df %>% write.table("CART_CUTRUN_Project/results/paper_figure/fig1/concordant_discordant_summary_N_CM_EM_EMRA.csv", quote = F, row.names = F, sep = ",")

In [None]:
target_start_gene = cordant_gene_df %>% dplyr::filter(cell_type == "N", gene_type %in% c("RNA-K4+K27+", "RNA+K4+K27+")) %$% gene_name_index
head(target_start_gene)
length(target_start_gene)
length(unique(target_start_gene))
target_start_gene_cm = cordant_gene_df %>% dplyr::filter(cell_type == "CM", gene_type %in% c("RNA-K4+K27+", "RNA+K4+K27+")) %$% gene_name_index
head(target_start_gene_cm)
length(target_start_gene_cm)
length(unique(target_start_gene_cm))

In [None]:
target_start_gene = cordant_gene_df %>% dplyr::filter(cell_type == "N", gene_type %in% c("RNA-K4+K27+", "RNA+K4+K27+")) %$% gene_name
head(target_start_gene)
length(target_start_gene)
length(unique(target_start_gene))

In [None]:
options(repr.plot.width=15, repr.plot.height=5)
pdf(paste0(fig_path, "/FlowChart_allGenes.pdf"), width =11, height = 11)
ggplot(cordant_gene_df %>% filter(cell_type != "EMRA"),
       aes(x = cell_type, stratum = gene_type_name, alluvium = gene_name_index,
           y = gene_num,
           fill = gene_type_name, label = N)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_fit_text(stat = "stratum", size = 10, min.size = 6, formatter = comma) +
  theme(legend.position = "bottom") +
  ggtitle("")
dev.off()

In [None]:
options(repr.plot.width=15, repr.plot.height=5)
pdf(paste0(fig_path, "/FlowChart_BE_BNE.pdf"), width =9, height = 9)
ggplot(cordant_gene_df %>% filter(cell_type != "EMRA", gene_name_index %in% target_start_gene),
       aes(x = cell_type, stratum = gene_type_name, alluvium = gene_name_index,
           y = gene_num,
           fill = gene_type_name, label = N)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_fit_text(stat = "stratum", size = 10, min.size = 6, formatter = comma) +
  theme(legend.position = "bottom") +
  ggtitle("")
dev.off()

In [None]:
## target gene for N

cordant_gene_df = c()
for(cT in c("N", "CM", "EM")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")

    rna_select_data = rna_expCount
    cr_select_data = normMat$H3K27me3

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

    k27_data = log2(normMat$H3K27me3[, cr_col_index] + 1) %>% rowMeans %>% t %>% t %>% data.frame
    colnames(k27_data) = c("K27")
    k27_data$gene_name = rownames(k27_data) %>% tolower

    k4_data = log2(normMat$H3K4me2[, cr_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(k4_data) = c("K4")
    k4_data$gene_name = rownames(k4_data) %>% tolower

    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    RNA_data = log2(data.frame(rna_select_data)[, rna_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(RNA_data) = c("RNA")
    RNA_data$gene_name = rna_gene

    cr_data = cbind(k27_data, K4 = k4_data$K4)
    cr_rna_full = full_join(RNA_data, cr_data, by = "gene_name") %>% dplyr::select(gene_name, K4, K27, RNA)
    cr_rna_full[is.na(cr_rna_full)] = 0

    ## PCA of cordant or discordant genes and see how well the cell types are separated
    cr_rna_full$K4_group = "-"
    cr_rna_full$K4_group[which(cr_rna_full$K4 > 4.5)] = "+"
    cr_rna_full$K27_group = "-"
    cr_rna_full$K27_group[which(cr_rna_full$K27 > 2.5)] = "+"
    cr_rna_full$RNA_group = "-"
    cr_rna_full$RNA_group[which(cr_rna_full$RNA > 2.5)] = "+"

    cr_rna_full = cr_rna_full %>% mutate(gene_type = paste0("RNA", RNA_group, "K4", K4_group, "K27", K27_group)) %>% 
    mutate(gene_name_index = paste0(gene_name, "_", 1:nrow(cr_rna_full))) %>% data.frame %>% dplyr::filter(gene_name_index %in% target_start_gene)
    print(dim(cr_rna_full))
    tmp = cr_rna_full %>% group_by(gene_type) %>% summarize(N = n()) %>% 
        mutate(cell_type = cT)
    print(dim(tmp))
    print(tmp$N %>% sum)
    cordant_gene_df = rbind(cordant_gene_df, left_join(cr_rna_full, tmp, by = "gene_type"))
    print(dim(left_join(cr_rna_full, tmp, by = "gene_type")))
}
cordant_gene_df$gene_type_name = factor(cordant_gene_df$gene_type, levels = c("RNA+K4-K27-", "RNA-K4-K27-", "RNA+K4+K27-", "RNA-K4+K27+", 
"RNA+K4-K27+", "RNA-K4+K27-", "RNA-K4-K27+", "RNA+K4+K27+"), labels = c("RNA+K4-K27-(NME)", "RNA-K4-K27-(NMNE)", "RNA+K4+K27-(CE)", "RNA-K4+K27+(BNE)", 
"RNA+K4-K27+(DE)", "RNA-K4+K27-(DCNE)", "RNA-K4-K27+(CNE)", "RNA+K4+K27+(BE)"))

cordant_gene_df$cell_type = factor(cordant_gene_df$cell_type, levels = c("N", "CM", "EM"))
cordant_gene_df$gene_num = 1
head(cordant_gene_df)

In [None]:
options(repr.plot.width=13, repr.plot.height=10)
pdf(paste0(fig_path, "/FlowChart_BE_BNE_targetgeneonly_for_N.pdf"), width =9, height = 9)

ggplot(cordant_gene_df, # %>% filter(cell_type != "EMRA", gene_name_index %in% target_start_gene),
       aes(x = cell_type, stratum = gene_type_name, alluvium = gene_name_index,
           y = gene_num,
           fill = gene_type_name, label = N)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_fit_text(stat = "stratum", size = 10, min.size = 6, formatter = comma) +
  theme(legend.position = "bottom") +
  ggtitle("") +
  ylab("Number of Genes")
dev.off()

In [None]:
## target gene for CM
cordant_gene_df = c()
for(cT in c("N", "CM", "EM")){
    rna_select_cond = paste0("Input_", cT)
    cr_select_cond = paste0(cT, "_Input")

    rna_select_data = rna_expCount
    cr_select_data = normMat$H3K27me3

    cr_col_index = which(cr_select_data %>% colnames %>% stringr::str_detect(cr_select_cond))
    rna_col_index = which(rna_select_data %>% colnames %>% stringr::str_detect(rna_select_cond))

    k27_data = log2(normMat$H3K27me3[, cr_col_index] + 1) %>% rowMeans %>% t %>% t %>% data.frame
    colnames(k27_data) = c("K27")
    k27_data$gene_name = rownames(k27_data) %>% tolower

    k4_data = log2(normMat$H3K4me2[, cr_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(k4_data) = c("K4")
    k4_data$gene_name = rownames(k4_data) %>% tolower

    rna_gene = rownames(rna_select_data) %>% gsub(".*_", "", .) %>% tolower
    RNA_data = log2(data.frame(rna_select_data)[, rna_col_index] + 1) %>% rowMeans %>% t %>%  t %>% data.frame
    colnames(RNA_data) = c("RNA")
    RNA_data$gene_name = rna_gene

    cr_data = cbind(k27_data, K4 = k4_data$K4)
    cr_rna_full = full_join(RNA_data, cr_data, by = "gene_name") %>% dplyr::select(gene_name, K4, K27, RNA)
    cr_rna_full[is.na(cr_rna_full)] = 0

    ## PCA of cordant or discordant genes and see how well the cell types are separated
    cr_rna_full$K4_group = "-"
    cr_rna_full$K4_group[which(cr_rna_full$K4 > 4.5)] = "+"
    cr_rna_full$K27_group = "-"
    cr_rna_full$K27_group[which(cr_rna_full$K27 > 2.5)] = "+"
    cr_rna_full$RNA_group = "-"
    cr_rna_full$RNA_group[which(cr_rna_full$RNA > 2.5)] = "+"

    cr_rna_full = cr_rna_full %>% mutate(gene_type = paste0("RNA", RNA_group, "K4", K4_group, "K27", K27_group)) %>% 
    mutate(gene_name_index = paste0(gene_name, "_", 1:nrow(cr_rna_full))) %>% data.frame %>% dplyr::filter(gene_name_index %in% target_start_gene_cm)
    print(dim(cr_rna_full))
    tmp = cr_rna_full %>% group_by(gene_type) %>% summarize(N = n()) %>% 
        mutate(cell_type = cT)
    print(dim(tmp))
    print(tmp$N %>% sum)
    cordant_gene_df = rbind(cordant_gene_df, left_join(cr_rna_full, tmp, by = "gene_type"))
    print(dim(left_join(cr_rna_full, tmp, by = "gene_type")))
}
cordant_gene_df$gene_type_name = factor(cordant_gene_df$gene_type, levels = c("RNA+K4-K27-", "RNA-K4-K27-", "RNA+K4+K27-", "RNA-K4+K27+", 
"RNA+K4-K27+", "RNA-K4+K27-", "RNA-K4-K27+", "RNA+K4+K27+"), labels = c("RNA+K4-K27-(NME)", "RNA-K4-K27-(NMNE)", "RNA+K4+K27-(CE)", "RNA-K4+K27+(BNE)", 
"RNA+K4-K27+(DE)", "RNA-K4+K27-(DCNE)", "RNA-K4-K27+(CNE)", "RNA+K4+K27+(BE)"))

cordant_gene_df$cell_type = factor(cordant_gene_df$cell_type, levels = c("CM", "N", "EM"))
cordant_gene_df$gene_num = 1
head(cordant_gene_df)

In [None]:
options(repr.plot.width=13, repr.plot.height=10)
# pdf(paste0(fig_path, "/FlowChart_BE_BNE_targetgeneonly_forCM.pdf"), width =9, height = 9)

ggplot(cordant_gene_df %>% filter(cell_type != "N"),
       aes(x = cell_type, stratum = gene_type_name, alluvium = gene_name_index,
           y = gene_num,
           fill = gene_type_name, label = N)) +
  scale_x_discrete(expand = c(.1, .1)) +
  geom_flow() +
  geom_stratum(alpha = .5) +
  geom_fit_text(stat = "stratum", size = 10, min.size = 6, formatter = comma) +
  theme(legend.position = "bottom") +
  ggtitle("") +
  ylab("Number of Genes")
# dev.off()