In [1]:
## CUTRUN DE and assign to nearest genes

library(pacman)
p_load(data.table, dplyr, ggplot2, viridis, magrittr, VennDiagram, ggpubr, limma, edgeR, tidyr, GenomicRanges, RColorBrewer, pheatmap, Seurat, fgsea, GSEABase, limma, TxDb.Hsapiens.UCSC.hg38.knownGene, org.Hs.eg.db)

# Label vocabularies shared by the cells below.
cell_list <- c("N", "CM", "EM", "EMRA")
# Donor labels HD1-HD3 and HD5-HD7 (there is no HD4 in this set).
hd_list <- sprintf("HD%d", c(1:3, 5:7))
expr_list <- c("Input", "Product", "Stim1", "Stim2", "Stim3")
# Every unordered pair of cell_list entries, written "<first>_<second>".
cell_comp_list <- combn(cell_list, 2, FUN = paste, collapse = "_")
hist_list <- c("H3K27me3", "H3K4me2")

# RNA-seq input / output / figure directories.
in_path <- "CART_CUTRUN_Project/results/RNAseq/process/RSEM/"
out_path <- "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/"
fig_path <- "CART_CUTRUN_Project/results/paper_figure/"
# CUT&RUN input / output directories (camelCase names are used by later cells).
inPath <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"


In [None]:
## PCA for RNA and CUTRUN
# Loads the CUT&RUN count matrices / master peaks and the RNA-seq limma
# results, then builds, for one experimental condition (`select_exp`):
#   rna_m, k27_m, k4_m          raw count matrices (features x samples)
#   rna_voom, k27_voom, k4_voom  the `$E` matrices of the voom objects
#
# NOTE: statement order matters. `voomDDS` is first used as a single voom
# object (RNA-seq, `voomDDS$E`) and, after the last load() below, as a list
# indexed by histone mark. The RNA-seq slice therefore has to be taken
# BEFORE that load() overwrites `voomDDS`.
load(file = paste0(outPath, "/RData/countMat_designInfo_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
load(file = "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/RData/HD1-3_5-7_normLimma_perExprCondition.RData")
geneID <- fread(paste0("CART_CUTRUN_Project/results/RNAseq/process/RSEM/", "RNA_CD8_N_Input_HD1.genes.results"))$gene_id
data <- data.frame(data)
select_exp <- "Input"

# Keep only the columns whose name contains `select_exp`.
select_condition_cols <- function(mat) {
  mat[, stringr::str_detect(colnames(mat), select_exp)]
}

# "<seqnames>:<start>-<end>" label for every peak of a master-peak object.
peak_region_names <- function(peaks) {
  peak_df <- data.frame(peaks)
  paste0(peak_df[["seqnames"]], ":", peak_df[["start"]], "-", peak_df[["end"]])
}

rna_m <- select_condition_cols(data)
k27_m <- select_condition_cols(countMat[["H3K27me3"]])
k4_m <- select_condition_cols(countMat[["H3K4me2"]])

rownames(rna_m) <- geneID
rownames(k27_m) <- peak_region_names(mPeak[["H3K27me3"]])
rownames(k4_m) <- peak_region_names(mPeak[["H3K4me2"]])

# RNA-seq voom values: must be read before `voomDDS` is replaced below.
rna_voom <- select_condition_cols(voomDDS$E)
load(file = paste0(outPath, "/RData/results_histList_hd1-7_adjustPeak_noChrXYM.RData"))
k27_voom <- select_condition_cols(voomDDS[["H3K27me3"]]$E)
k4_voom <- select_condition_cols(voomDDS[["H3K4me2"]]$E)

# seq_len() instead of 1:nrow() so an empty matrix yields zero names rather
# than the two-element vector c(1, 0).
rownames(rna_voom) <- seq_len(nrow(rna_voom))
rownames(k27_voom) <- paste0("K27me3_", seq_len(nrow(k27_voom)))
rownames(k4_voom) <- paste0("K4me2_", seq_len(nrow(k4_voom)))


In [None]:
# Shared recipe for the four objects below: create a Seurat object from a
# count matrix, log-normalise, pick `n_variable` variable features with the
# "vst" method, scale, and run PCA on those variable features.
seurat_pca <- function(counts, n_variable, n_pcs) {
  obj <- CreateSeuratObject(counts = counts, project = "cart", min.cells = 0, min.features = 50)
  obj <- NormalizeData(obj, normalization.method = "LogNormalize", scale.factor = 1000000)
  obj <- FindVariableFeatures(obj, selection.method = "vst", nfeatures = n_variable)
  obj <- ScaleData(obj)
  RunPCA(obj, features = VariableFeatures(object = obj), npcs = n_pcs)
}

# RNA-seq: 5000 variable genes, 20 PCs.
rna_obj <- seurat_pca(rna_m, n_variable = 5000, n_pcs = 20)

# CUT&RUN: 80% of the peaks as variable features, 15 PCs.
k27_obj <- seurat_pca(k27_m, n_variable = round(nrow(k27_m)/10*8), n_pcs = 15)
k4_obj <- seurat_pca(k4_m, n_variable = round(nrow(k4_m)/10*8), n_pcs = 15)

# Both histone marks stacked row-wise into one feature space.
cr_obj <- seurat_pca(
  rbind(k27_m, k4_m),
  n_variable = round((nrow(k27_m) + nrow(k4_m))/10*8),
  n_pcs = 15
)


In [None]:
# DimPlot(rna_obj, reduction = "pca")
ct_list <- c("N", "CM", "EM", "EMRA")

# Scatter of the first two PCs of a Seurat object, coloured by cell type.
# Labels are assigned by POSITION: ct_list is repeated `n_cycles` times along
# the samples, i.e. samples are assumed to cycle N, CM, EM, EMRA
# -- TODO confirm against the column order of the count matrices.
pca_panel <- function(obj, n_cycles, panel_title) {
  Embeddings(obj, reduction = "pca")[, 1:2] %>%
    data.frame() %>%
    mutate(celltype = factor(rep(ct_list, n_cycles), levels = ct_list)) %>%
    ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
    geom_point() +
    theme_bw(base_size = 20) +
    scale_color_viridis(discrete = TRUE) +
    xlab("PC 1") +
    ylab("PC 2") +
    ggtitle(panel_title) +
    rremove("legend.title")
}

p1 <- pca_panel(rna_obj, 6, "RNA-seq")
p2 <- pca_panel(k27_obj, 4, "H3K27me3")
p3 <- pca_panel(k4_obj, 4, "H3K4me2")
p4 <- pca_panel(cr_obj, 4, "H3K27me3 + H3K4me2")

# 2 x 2 panel with one shared legend; ggsave() writes the last plot.
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2, common.legend = TRUE)
ggsave(filename = "CART_CUTRUN_Project/results/paper_figure/fig4/PCA_seurat_procedure.pdf", width = 10, height = 10)

In [None]:
## PCA on raw count all features

# Build a Seurat object from `counts`, log-normalise, mark ALL features as
# variable, scale, then run PCA followed by UMAP.
#
# counts  feature x sample count matrix / data frame
# n_dims  number of PCs to compute; also used as the UMAP `dims` range and as
#         `n.neighbors` (every call below uses the same number for all three)
#
# The feature count is taken from `counts` itself. Previously it was read via
# nrow(<obj>) from the Seurat object left over from an earlier cell, which made
# this cell fail in a fresh session and silently depend on stale state.
# NOTE(review): this assumes CreateSeuratObject() keeps every row when
# min.cells = 0, so nrow(counts) equals the object's feature count -- confirm.
all_feature_embedding <- function(counts, n_dims) {
  obj <- CreateSeuratObject(counts = counts, project = "cart", min.cells = 0, min.features = 50) %>%
    NormalizeData(normalization.method = "LogNormalize", scale.factor = 1000000) %>%
    FindVariableFeatures(selection.method = "vst", nfeatures = nrow(counts)) %>%
    ScaleData()
  obj <- RunPCA(obj, features = VariableFeatures(object = obj), npcs = n_dims)
  RunUMAP(obj, dims = seq_len(n_dims), n.neighbors = n_dims)
}

rna_obj <- all_feature_embedding(rna_m, n_dims = 20)
k27_obj <- all_feature_embedding(k27_m, n_dims = 15)
k4_obj <- all_feature_embedding(k4_m, n_dims = 15)
# Both histone marks stacked row-wise into one feature space.
cr_obj <- all_feature_embedding(rbind(k27_m, k4_m), n_dims = 15)


In [None]:
# # DimPlot(rna_obj, reduction = "pca")
# ct_list = c("N", "CM", "EM", "EMRA")
# p1 = Embeddings(rna_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, 6), levels = ct_list)) %>% 
# ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
# geom_point() +
# theme_bw(base_size = 20) +
# scale_color_viridis(discrete = TRUE) +
# xlab("PC 1") +
# ylab("PC 2") +
# ggtitle("RNA-seq") +
# rremove("legend.title")

# p2 = Embeddings(k27_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, 4), levels = ct_list)) %>% 
# ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
# geom_point() +
# theme_bw(base_size = 20) +
# scale_color_viridis(discrete = TRUE) +
# xlab("PC 1") +
# ylab("PC 2") +
# ggtitle("H3K27me3") +
# rremove("legend.title")

# p3 = Embeddings(k4_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, 4), levels = ct_list)) %>% 
# ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
# geom_point() +
# theme_bw(base_size = 20) +
# scale_color_viridis(discrete = TRUE) +
# xlab("PC 1") +
# ylab("PC 2") +
# ggtitle("H3K4me2") +
# rremove("legend.title")

# p4 = Embeddings(cr_obj, reduction = "pca")[, 1:2] %>% data.frame %>% mutate(celltype = factor(rep(ct_list, 4), levels = ct_list)) %>% 
# ggplot(aes(x = PC_1, y = PC_2, color = celltype)) +
# geom_point() +
# theme_bw(base_size = 20) +
# scale_color_viridis(discrete = TRUE) +
# xlab("PC 1") +
# ylab("PC 2") +
# ggtitle("H3K27me3 + H3K4me2") +
# rremove("legend.title")

# ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2, common.legend = TRUE)
# ggsave(filename = "CART_CUTRUN_Project/results/paper_figure/fig4/PCA_seurat_procedure_allFeatures.pdf", width = 10, height = 10)

# DimPlot(rna_obj, reduction = "pca")
ct_list <- c("N", "CM", "EM", "EMRA")

# Scatter of the UMAP embedding of a Seurat object, coloured by cell type.
# Labels are assigned by POSITION: ct_list is repeated `n_cycles` times along
# the samples, i.e. samples are assumed to cycle N, CM, EM, EMRA
# -- TODO confirm against the column order of the count matrices.
umap_panel <- function(obj, n_cycles, panel_title) {
  Embeddings(obj, reduction = "umap") %>%
    data.frame() %>%
    mutate(celltype = factor(rep(ct_list, n_cycles), levels = ct_list)) %>%
    ggplot(aes(x = UMAP_1, y = UMAP_2, color = celltype)) +
    geom_point() +
    theme_bw(base_size = 20) +
    scale_color_viridis(discrete = TRUE) +
    xlab("UMAP 1") +
    ylab("UMAP 2") +
    ggtitle(panel_title) +
    rremove("legend.title")
}

p1 <- umap_panel(rna_obj, 6, "RNA-seq")
p2 <- umap_panel(k27_obj, 4, "H3K27me3")
p3 <- umap_panel(k4_obj, 4, "H3K4me2")
p4 <- umap_panel(cr_obj, 4, "H3K27me3 + H3K4me2")

# 2 x 2 panel with one shared legend; ggsave() writes the last plot.
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2, common.legend = TRUE)
ggsave(filename = "CART_CUTRUN_Project/results/paper_figure/fig4/UMAP_seurat_procedure_allFeatures.pdf", width = 10, height = 10)

In [None]:
# PCA (prcomp) on voom-normalised values, samples as rows, first two PCs
# plotted and coloured by cell type.
# Labels are assigned by POSITION with rep(cellList, each = n_each), i.e. the
# samples are assumed to be grouped by cell type -- TODO confirm.
# NOTE(review): `cellList` is not defined in the visible cells; presumably it
# is restored by one of the load() calls -- verify.
voom_pca_panel <- function(voom_mat, n_each, panel_title) {
  samples_by_feature <- t(data.frame(voom_mat)) %>% data.frame()
  prcomp(samples_by_feature)$x %>%
    data.frame() %>%
    mutate(label = factor(rep(cellList, each = n_each), levels = cellList)) %>%
    ggplot(aes(x = PC1, y = PC2, color = label)) +
    geom_point() +
    theme_bw(base_size = 20) +
    scale_color_viridis(discrete = TRUE) +
    xlab("PC 1") +
    ylab("PC 2") +
    ggtitle(panel_title) +
    rremove("legend.title")
}

p1 <- voom_pca_panel(rna_voom, 6, "RNA-seq")
p2 <- voom_pca_panel(k27_voom, 4, "H3K27me3")
p3 <- voom_pca_panel(k4_voom, 4, "H3K4me2")
p4 <- voom_pca_panel(rbind(k27_voom, k4_voom), 4, "H3K27me3 + H3K4me2")

# 2 x 2 panel with one shared legend; ggsave() writes the last plot.
ggarrange(p1, p2, p3, p4, nrow = 2, ncol = 2, common.legend = TRUE)
ggsave(filename = "CART_CUTRUN_Project/results/paper_figure/fig4/PCA_voom_norm_pca.pdf", width = 10, height = 10)

In [None]:
## Tornado plot
## NOTE: this cell is shell (samtools + deepTools commands), not R.
## It expects $projPath, $histName and $repName to be set by the caller.

# Sort and index the mapped BAM, then turn it into a bigWig coverage track.
samtools sort -o "$projPath/alignment/bam/${histName}.sorted.bam" "$projPath/alignment/bam/${histName}_bowtie2.mapped.bam"
samtools index "$projPath/alignment/bam/${histName}.sorted.bam"
bamCoverage -b "$projPath/alignment/bam/${histName}.sorted.bam" -o "$projPath/alignment/bigwig/${histName}_raw.bw"

# Signal over gene bodies scaled to 5 kb, with 3 kb flanks on each side.
# NOTE(review): the tracks are hard-coded to K27me3 / K4me3 rep1-2 -- confirm
# these are the intended samples for this project.
cores=8
computeMatrix scale-regions -S "$projPath/alignment/bigwig/K27me3_rep1_raw.bw" \
                               "$projPath/alignment/bigwig/K27me3_rep2_raw.bw" \
                               "$projPath/alignment/bigwig/K4me3_rep1_raw.bw" \
                               "$projPath/alignment/bigwig/K4me3_rep2_raw.bw" \
                              -R "$projPath/data/hg38_gene/hg38_gene.tsv" \
                              --beforeRegionStartLength 3000 \
                              --regionBodyLength 5000 \
                              --afterRegionStartLength 3000 \
                              --skipZeros -o "$projPath/data/hg38_gene/matrix_gene.mat.gz" -p "$cores"

plotHeatmap -m "$projPath/data/hg38_gene/matrix_gene.mat.gz" -out "$projPath/data/hg38_gene/Histone_gene.png" --sortUsing sum


# Column 6 of the SEACR bed is parsed as "chrom:start-end" and rewritten as a
# 3-column bed (chrom, start, end).
awk '{split($6, summit, ":"); split(summit[2], region, "-"); print summit[1]"\t"region[1]"\t"region[2]}' \
    "$projPath/peakCalling/SEACR/${histName}_${repName}_seacr_control.peaks.stringent.bed" \
    > "$projPath/peakCalling/SEACR/${histName}_${repName}_seacr_control.peaks.summitRegion.bed"

# Signal +/- 3 kb around the centre of each region written above.
computeMatrix reference-point -S "$projPath/alignment/bigwig/${histName}_${repName}_raw.bw" \
              -R "$projPath/peakCalling/SEACR/${histName}_${repName}_seacr_control.peaks.summitRegion.bed" \
              --skipZeros -o "$projPath/peakCalling/SEACR/${histName}_${repName}_SEACR.mat.gz" -p "$cores" -a 3000 -b 3000 --referencePoint center

# Read the matrix that computeMatrix just wrote. The file name must include
# ${repName}; previously this pointed at ${histName}_SEACR.mat.gz, which the
# command above never creates.
# NOTE(review): the output PNG name has no ${repName}, so successive
# replicates overwrite each other -- confirm whether that is intended.
plotHeatmap -m "$projPath/peakCalling/SEACR/${histName}_${repName}_SEACR.mat.gz" \
            -out "$projPath/peakCalling/SEACR/${histName}_SEACR_heatmap.png" \
            --sortUsing sum --startLabel "Peak Start" --endLabel "Peak End" \
            --xAxisLabel "" --regionsLabel "Peaks" --samplesLabel "${histName} ${repName}"

In [None]:
## Tornado plot
## get the gene order based on gene expression for each cell type
# Produces, per cell type in ct_list:
#   rna_lcpm_median[[ct]]  per-gene median log-CPM across that cell type's samples
#   rna_lcpm_order[[ct]]   row indices sorting genes by that median, highest first
load(file = "CART_CUTRUN_Project/results/RNAseq/analysis/RSEM/RData/HD1-3_5-7_normLimma_perExprCondition.RData") ## data, selectR, dataS, voomDDS, results, 
geneID <- fread(paste0("CART_CUTRUN_Project/results/RNAseq/process/RSEM/", "RNA_CD8_N_Input_HD1.genes.results"))$gene_id
select_exp <- "Input"
data <- data.frame(data)
rna_m <- data[, stringr::str_detect(colnames(data), select_exp)]

# Label rows with the RSEM gene ids, exactly as the PCA cell does. The next
# cell uses rownames(rna_m) as Ensembl ids, and `geneID` was previously loaded
# here but never applied. Skipped with a warning if the lengths disagree.
if (length(geneID) == nrow(rna_m)) {
  rownames(rna_m) <- geneID
} else {
  warning("geneID has ", length(geneID), " entries but rna_m has ", nrow(rna_m),
          " rows; keeping the existing row names", call. = FALSE)
}

# log(CPM + 1). Each COLUMN has to be divided by its own column sum. The
# previous `rna_m / colSums(rna_m)` recycled the column sums down the rows
# instead, so counts were divided by other samples' library sizes.
rna_lcpm <- log(sweep(rna_m, 2, colSums(rna_m), "/") * 1000000 + 1)
head(rna_m)

head(rna_lcpm)

ct_list <- c("N", "CM", "EM", "EMRA")
rna_lcpm_median <- list()
rna_lcpm_order <- list()
for (ct in ct_list) {
  # Match the cell type as a whole token of the column name. A plain substring
  # match makes "EM" also select the EMRA samples and "N" select any column
  # containing a capital N.
  # NOTE(review): assumes name tokens are separated by non-alphanumeric
  # characters such as "_" or "." -- confirm against colnames(data).
  ct_pattern <- paste0("(^|[^A-Za-z0-9])", ct, "([^A-Za-z0-9]|$)")
  ct_cols <- stringr::str_detect(colnames(rna_lcpm), ct_pattern)
  if (!any(ct_cols)) {
    stop("no '", select_exp, "' column matches cell type '", ct, "'", call. = FALSE)
  }
  # drop = FALSE keeps a one-sample selection two-dimensional for apply().
  ct_values <- as.matrix(rna_lcpm[, ct_cols, drop = FALSE])
  rna_lcpm_median[[ct]] <- apply(ct_values, 1, median)
  rna_lcpm_order[[ct]] <- order(rna_lcpm_median[[ct]], decreasing = TRUE)
}

In [None]:
## get the ordered gene region
# For every cell type, writes a header-less, tab-separated file with columns
#   chrom, start_position, end_position, ensembl_gene_id, hgnc_symbol, strand
# whose rows follow that cell type's expression order (rna_lcpm_order[[ct]]).
library(biomaRt)
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))

# Drop everything from the first "." onwards (the version suffix of the id).
strip_id_version <- function(ids) {
  gsub("\\..*", "", ids)
}

# rna_lcpm_order[[ct]] is a permutation of the same rows for every cell type,
# so the set of ids is identical across iterations: query biomaRt once instead
# of once per cell type, and only redo the ordering inside the loop.
gene_region <- getBM(
  filters = "ensembl_gene_id",
  attributes = c("ensembl_gene_id", "hgnc_symbol", "chromosome_name", "start_position", "end_position", "strand"),
  values = unique(strip_id_version(rownames(rna_m))),
  mart = mart,
  uniqueRows = TRUE
)

# Per-cell-type tables are kept here as well, because `gene_order_region`
# itself is overwritten on every iteration and ends up holding only the last
# cell type.
gene_order_region_list <- list()
for (ct in ct_list) {
  print(ct)
  ensembl <- strip_id_version(rownames(rna_m)[rna_lcpm_order[[ct]]])

  # left_join keeps the expression order of `ensembl`; ids without a biomaRt
  # hit have NA strand and are dropped.
  gene_order_region <- left_join(data.frame(ensembl_gene_id = ensembl), gene_region, by = "ensembl_gene_id") %>%
    unique() %>%
    filter(!is.na(strand)) %>%
    mutate(
      chrom = paste0("chr", chromosome_name),
      # Recode strand: -1 becomes "-", everything else "+".
      strand = ifelse(strand == -1, "-", "+")
    )
  gene_order_region_list[[ct]] <- gene_order_region

  gene_order_region %>%
    dplyr::select(chrom, start_position, end_position, ensembl_gene_id, hgnc_symbol, strand) %>%
    write.table(
      paste0("CART_CUTRUN_Project/results/RNAseq/analysis/TSV/Input_", ct, "_ordered_gene_region.tsv"),
      quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
    )
}


In [None]:
library(readxl)
gene_path <- "CART_CUTRUN_Project/functionalGeneList/"

# Read one gene-list workbook from `gene_path` and copy the column named
# `symbol_col` into a new `gene_name` column.
read_gene_list <- function(workbook, symbol_col) {
  read_excel(paste0(gene_path, workbook)) %>%
    mutate(gene_name = .data[[symbol_col]])
}

# One table per functional category; each workbook names its symbol column
# differently.
select_gene <- list(
  TF = read_gene_list("2021_11_29_TFs.xlsx", "HGNC symbol"),
  Tactive = read_gene_list("2021_11_30_TCellActivationFunction.xlsx", "Official Symbol"),
  CellCycle = read_gene_list("2021_22_30_CellCycleAPoptosis.xlsx", "Symbol"),
  Metabolism = read_gene_list("2021_22_30_MetabolismGeneList_FinalList.xlsx", "Gene")
)
head(select_gene[["TF"]])

In [None]:
## subset by gene types
# For every cell type and every gene list in `select_gene`, writes the rows of
# that cell type's ordered gene-region table whose hgnc_symbol is in the list.
#
# The table is re-read from the per-cell-type TSV written by the previous
# cell. Previously the loop filtered the single `gene_order_region` object
# left in the session, which only holds the LAST cell type processed, so every
# cell type's subset files carried the same gene order.
region_cols <- c("chrom", "start_position", "end_position", "ensembl_gene_id", "hgnc_symbol", "strand")
tsv_dir <- "CART_CUTRUN_Project/results/RNAseq/analysis/TSV/"

for (ct in ct_list) {
  print(ct)
  ordered_file <- paste0(tsv_dir, "Input_", ct, "_ordered_gene_region.tsv")
  if (!file.exists(ordered_file)) {
    stop("ordered gene region file not found: ", ordered_file, call. = FALSE)
  }
  # quote = "" and na.strings = character(0): read fields literally, so empty
  # gene symbols stay "" rather than becoming NA. Integer positions are written
  # back as plain digits.
  ct_gene_region <- read.delim(
    ordered_file,
    header = FALSE, sep = "\t", quote = "", comment.char = "",
    na.strings = character(0), stringsAsFactors = FALSE,
    col.names = region_cols,
    colClasses = c("character", "integer", "integer", "character", "character", "character")
  )

  for (type in names(select_gene)) {
    print(type)
    ct_gene_region %>%
      filter(hgnc_symbol %in% select_gene[[type]]$gene_name) %>%
      write.table(
        paste0(tsv_dir, "Input_", ct, "_ordered_gene_region_subset_", type, ".tsv"),
        quote = FALSE, col.names = FALSE, row.names = FALSE, sep = "\t"
      )
  }
}
