In [3]:
library(GenomicRanges)
library(dplyr)
library(data.table)
library(ggplot2)
library(viridis)
library(readxl)
library(pheatmap)
library(RColorBrewer)
library(tidyr)
library(gridExtra)
library(corrplot)
library(tibble)
library(ggrepel)
library(ggpubr)
library(DEFormats)
library(BiocParallel)
register(MulticoreParam(4))
library(chromVAR)
library(GenomicFeatures)
library(BSgenome.Hsapiens.UCSC.hg38)
library(ChIPseeker)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(org.Hs.eg.db)
library(pheatmap)
library(RColorBrewer)
library(tidyr)
library(gridExtra)
library(ggrepel)
library(limma)
library(ggdendro)
library(ggfortify)
library(ggpubr)
library(magrittr)
library(Seurat)

In [None]:
## Load the patient annotation sheet and locate the rows whose CD8+ T cell
## isolation is recorded as "CD8TCM".
patientRaw <- read_xlsx("CART_CUTRUN_Project/report/CART_CUTRUN/analysis/20201028_NHL_CUTandRUN_PatientData.xlsx")
cm_ind <- which(patientRaw[["CD8+ T cell isolation"]] == "CD8TCM")
## Show the Xnumber identifiers of the selected patients.
patientRaw[["Xnumber"]][cm_ind]

In [5]:
## 1. TSS neighbour regions
## The +/-1kb windows around each gene's TSS are the regions used for
## differential detection.
gtf <- rtracklayer::import('SupplementaryData/hg38/transcriptomeHumanReference/gencode.v33.annotation.gtf')
gtf_df <- as.data.frame(gtf)

## The TSS is the gene start on the "+" strand and the gene end on the "-"
## strand. Row order (all "+" genes, then all "-" genes) is kept on purpose:
## later code indexes tss_gr by row position.
plus_tss <- gtf_df %>%
    dplyr::filter(type == "gene" & strand == "+") %>%
    dplyr::select(seqnames, TSS = start, strand, source, gene_id, gene_name, gene_type)
minus_tss <- gtf_df %>%
    dplyr::filter(type == "gene" & strand == "-") %>%
    dplyr::select(seqnames, TSS = end, strand, source, gene_id, gene_name, gene_type)
tss_df <- rbind(plus_tss, minus_tss)

## Drop mitochondrial and sex chromosomes.
tss_filter_df <- tss_df %>%
    dplyr::filter(!(seqnames %in% c("chrM", "chrX", "chrY")))

## Extend every TSS by 1kb upstream and downstream.
tss_gr <- with(
    tss_filter_df,
    GRanges(
        seqnames = seqnames,
        ranges = IRanges(start = TSS - 1000, end = TSS + 1000),
        strand = strand,
        gene_id = gene_id,
        gene_name = gene_name,
        gene_type = gene_type
    )
)
tss_gr


In [None]:
## Patient data: load the cached TSS-neighbourhood counts and keep only the
## CM patients selected by cm_ind.

outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
histList <- c("H3K27me3", "H3K4me2")

countMat <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_NHLpatient_noChrXYM_countMat.rds"))
normMat <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_NHLpatient_noChrXYM_normMat.rds"))
designInfo <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_NHLpatient_noChrXYM_designInfo.rds"))

## Column subset per mark, tagged with a "_PT" suffix.
## NOTE(review): cm_ind indexes rows of patientRaw; using it on the columns
## assumes the count matrices are ordered like the patient sheet -- confirm.
pt_countMat <- countMat
for (mark in c("H3K27me3", "H3K4me2")) {
    cm_counts <- countMat[[mark]][, cm_ind]
    colnames(cm_counts) <- paste0(colnames(cm_counts), "_PT")
    pt_countMat[[mark]] <- cm_counts
}

cm_patients <- patientRaw$Xnumber[cm_ind]
pt_designInfo <- dplyr::filter(designInfo, exps %in% cm_patients)
head(pt_countMat$H3K27me3)
arrange(pt_designInfo, hist, exps)


In [None]:
## Healthy donor data: load the cached TSS-neighbourhood counts and keep the
## samples whose name contains "CM_Product".
countMat <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_countMat.rds"))
normMat <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds"))
designInfo <- readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_designInfo.rds"))

## The column mask is derived from the H3K27me3 matrix and reused for H3K4me2.
## NOTE(review): this assumes both matrices share the same column order -- confirm.
selectC <- stringr::str_detect(colnames(countMat$H3K27me3), "CM_Product")
head(countMat$H3K4me2[, selectC])

hd_countMat <- countMat
for (mark in c("H3K27me3", "H3K4me2")) {
    hd_countMat[[mark]] <- countMat[[mark]][, selectC]
}

is_cm_product <- stringr::str_detect(designInfo$exps, "CM_Product")
hd_designInfo <- designInfo[which(is_cm_product), ]
hd_designInfo



In [6]:
## Rescale every sample's raw counts to a common sequencing depth and combine
## the patient (PT) and healthy-donor (HD) matrices column-wise, per mark.
## The result overwrites countMat[[hist]].

## Common depth that every sample is scaled to.
depth_target <- 13240732

## Scale the columns of `mat` by the depths recorded in `design_df` for `mark`.
##   mat       : region-by-sample count matrix
##   design_df : table with `hist` and `depth` columns, one row per sample/mark
##   mark      : histone mark whose depths should be used
## Returns a matrix of rounded, depth-scaled counts with one column per sample.
scale_to_depth <- function(mat, design_df, mark) {
    ## Base subsetting is used deliberately: inside dplyr::filter(),
    ## `hist == hist` compares the `hist` column with itself, is TRUE for every
    ## row, and therefore never restricts the table to the current mark.
    depth <- design_df$depth[which(design_df$hist == mark)]
    ## Depths are matched to columns by position, so the counts must agree.
    ## NOTE(review): this still assumes the design rows are in column order.
    if (length(depth) != ncol(mat)) {
        stop("Found ", length(depth), " depth value(s) for ", mark,
             " but the count matrix has ", ncol(mat), " column(s).",
             call. = FALSE)
    }
    scaled <- lapply(seq_len(ncol(mat)), function(j) {
        round(mat[, j] / depth[j] * depth_target) ## remove sequencing depth effect
    })
    do.call(cbind, scaled)
}

for (hist in histList) {
    countMat[[hist]] <- cbind(
        scale_to_depth(pt_countMat[[hist]], pt_designInfo, hist),
        scale_to_depth(hd_countMat[[hist]], hd_designInfo, hist)
    )
    colnames(countMat[[hist]]) <- c(colnames(pt_countMat[[hist]]), colnames(hd_countMat[[hist]]))
}


In [None]:
## Sample grouping for the PT-vs-HD comparison: patient columns come first,
## followed by the healthy-donor columns.
groupList <- c("PT", "HD")
# hdList <- infoData$H3K4me2$Xnumber
histList <- c("H3K27me3", "H3K4me2")
target <- data.frame(
    group = rep(
        c("PT", "HD"),
        times = c(ncol(pt_countMat$H3K4me2), ncol(hd_countMat$H3K4me2))
    )
)
target$group <- factor(target$group, levels = groupList)

## Experimental design: one coefficient per group, no intercept.
treat <- factor(target$group)
design <- model.matrix(~0 + treat)
colnames(design) <- levels(treat)
## Single contrast: patient minus healthy donor.
contrast <- makeContrasts(PT_HD = PT - HD, levels = design)


In [64]:
## limma-voom differential analysis of the TSS-neighbourhood counts,
## patient (PT) vs healthy donor (HD), run separately for each histone mark.
## Writes one CSV per mark and saves `results` and `voomDDS` to an RData file.
min_dist_peak_to_gene <- 5000
## Lists are created with names up front: assigning by name into an unnamed
## vector("list", n) appends new elements and leaves n stray NULL slots.
results <- setNames(vector("list", length(histList)), histList)
results_fullInfo <- setNames(vector("list", length(histList)), histList)
voomDDS <- setNames(vector("list", length(histList)), histList)
for (hist in histList) {
    ## Filter and delete low-count regions
    selectR <- which(rowSums(countMat[[hist]]) > 200) ## remove low count genes

    dataS <- countMat[[hist]][selectR, ]
    ## Label each region as "chr:start-end(gene_name)". This relies on the rows
    ## of countMat being in the same order as tss_gr.
    peakRegionList <- tss_gr[selectR] %>% data.frame %>% mutate(region = paste0(seqnames, ":", start, "-", end, "(", gene_name, ")")) %$% region
    rownames(dataS) <- peakRegionList
    voomDDS[[hist]] <- voom(counts = dataS, design = design, normalize.method = "cyclicloess", plot = FALSE)
    ## option 1 using voomDDS option2 using normDDS as normalized input.
    inputDDS <- voomDDS[[hist]]
    ## corfit$consensus
    fit <- lmFit(inputDDS, design)
    fitContrast <- contrasts.fit(fit, contrast)
    fitBayes <- eBayes(fitContrast, robust = TRUE)


    results[[hist]] <- list()
    results_fullInfo[[hist]] <- list()
    for (i in seq_len(ncol(contrast))) {
      ## Results
      ## The table is requested unsorted so that its rows line up one-to-one
      ## with dataS. Region labels and counts are then attached by position.
      ## Joining on the label instead is fragile: topTable() only emits an `ID`
      ## column when row names are duplicated, and a join on duplicated labels
      ## multiplies the affected rows.
      res <- topTable(fit = fitBayes, adjust.method = 'fdr', coef = i, number = nrow(inputDDS), sort.by = 'none')
      stat_cols <- setdiff(colnames(res), "ID")
      res <- data.table(ID = rownames(dataS), res[, stat_cols, drop = FALSE])

      ## Up/Down needs adjusted P <= 0.1 and |logFC| >= 1.
      res[, Significance := ifelse((adj.P.Val <= 0.1 & sign(logFC) == 1 & abs(logFC) >= 1), 'Up',
                              ifelse((adj.P.Val <= 0.1 & sign(logFC) == -1 & abs(logFC) >= 1), 'Down', 'notDE'))]
      res <- data.table(res, data.frame(dataS))
      ## Restore the most-significant-first ordering of the output table.
      results[[hist]][[i]] <- res[order(P.Value)]

      ## overlap with genes
      # results_peakRegion = results[[hist]][[i]] %$% PeakRegion
      # results_chrom = gsub(":.*", "", results_peakRegion)
      # results_start = gsub(".*:", "", results_peakRegion) %>% gsub("-.*", "", .) %>% as.numeric
      # results_end = gsub(".*:", "", results_peakRegion) %>% gsub(".*-", "", .) %>% as.numeric
      # results_peakRegion_gr = GRanges(seqnames = results_chrom, IRanges(start = results_start, end = results_end), strand = "*")
      
      # dp_match_gene = distanceToNearest(results_peakRegion_gr, gene_gr, select = "all") ## get the nearest gene for each peak
      # dp_match_gene = dp_match_gene[which(data.frame(dp_match_gene)$distance <= min_dist_peak_to_gene)]

      # results_fullInfo[[hist]][[i]] = cbind(
      #           results[[hist]][[i]][dp_match_gene@from, ], 
      #           gene_gr[dp_match_gene@to] %>% data.frame %>% dplyr::select(gene_name, gene_id, gene_type)
      # )
      # results_fullInfo[[hist]][[i]]$gene_region = paste0(data.frame(gene_gr[dp_match_gene@to])$seqnames, ":", data.frame(gene_gr[dp_match_gene@to])$start, "-", data.frame(gene_gr[dp_match_gene@to])$end)
      # results_fullInfo[[hist]][[i]]$peak_distance_to_gene = data.frame(dp_match_gene)$distance
      # ## Output
      ## NOTE(review): the file name does not include the contrast, so with
      ## more than one contrast each iteration would overwrite the previous file.
      write.table(results[[hist]][[i]], file = paste0(outPath, '/CUTRUN_limma_tables/DE_', hist, '_TSSregions_cmPatient_vs_cmHealthyDonor_InfusionProduct_adj0.1_logFC2.csv'), quote = FALSE, row.names = FALSE, sep = ",")
      # write.table(results_fullInfo[[hist]][[i]], file = paste0(outPath, '/CUTRUN_limma_tables/DE_', hist, '_patient_TSSregions_CR_VS_PD_adj0.1_logFC2_link_nearest_genes.csv'), quote = FALSE, row.names = FALSE, sep = ",")
    }
    names(results[[hist]]) <- colnames(contrast)
    # names(results_fullInfo[[hist]]) <- colnames(contrast)


}
save(results, voomDDS, file = paste0(outPath, "/RData/results_histList_noChrXYM_TSSregions_cmPatient_vs_cmHealthyDonor_InfusionProduct.RData"))


In [None]:
## PCA of the samples on all TSS-neighbourhood regions: one panel per histone
## mark plus one panel on both marks stacked together. The figure is written
## to a PDF and also shown in the notebook.
options(repr.plot.width=22, repr.plot.height=10)

## Keep the rows of `mat` whose total count over all samples exceeds `min_total`.
filter_low_count_rows <- function(mat, min_total = 200) {
    mat[which(rowSums(mat) > min_total), ]
}

## Build a Seurat object from a region-by-sample count matrix, mark every
## region as a variable feature, scale, and run a 15-component PCA.
## Returns the Seurat object with the "pca" reduction attached.
run_sample_pca <- function(counts) {
    obj <- CreateSeuratObject(counts = counts, project = "cart", min.cells = 0, min.features = 50) %>%
        # NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>%
        FindVariableFeatures(., selection.method = "vst", nfeatures = nrow(counts)) %>%
        ScaleData()
    RunPCA(obj, features = VariableFeatures(object = obj), npcs = 15)
}

## Scatter plot of PC1 vs PC2 coloured by `sample_group` (one label per
## sample, in column order of the count matrix).
plot_sample_pca <- function(obj, sample_group, title) {
    pcs <- Embeddings(obj, reduction = "pca")[, 1:2] %>% data.frame
    ## Labels are attached by position, so fail loudly on any mismatch
    ## instead of mislabelling samples.
    if (nrow(pcs) != length(sample_group)) {
        stop("PCA has ", nrow(pcs), " sample(s) but ", length(sample_group),
             " group label(s) were supplied.", call. = FALSE)
    }
    pcs$group <- sample_group
    ggplot(pcs, aes(x = PC_1, y = PC_2, color = group)) +
        geom_point() +
        theme_bw(base_size = 20) +
        scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
        xlab("PC 1") +
        ylab("PC 2") +
        ggtitle(title) +
        rremove("legend.title")
}

## Group labels come from the design table used for the differential analysis
## rather than from hard-coded sample counts. They are kept as character so
## the colour assignment follows alphabetical order, as with plain strings.
sample_group <- as.character(target$group)

k27_counts <- filter_low_count_rows(countMat[["H3K27me3"]])
k4_counts <- filter_low_count_rows(countMat[["H3K4me2"]])

k27_obj <- run_sample_pca(k27_counts)
k27_plot <- plot_sample_pca(k27_obj, sample_group, "H3K27me3")

k4_obj <- run_sample_pca(k4_counts)
k4_plot <- plot_sample_pca(k4_obj, sample_group, "H3K4me2")

## Both marks stacked row-wise: every sample is described by the regions of
## both histone marks.
cr_obj <- run_sample_pca(rbind(k27_counts, k4_counts))
cr_plot <- plot_sample_pca(cr_obj, sample_group, "H3K27me3 + H3K4me2")

pca_panel <- ggarrange(k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 3, nrow = 1)

pdf("CART_CUTRUN_Project/results/paper_figure/patient-related/CMpatient_CMhealthyDonor_PCA_on_all_TSSneighbors.pdf", width = 15, height = 6)
## Explicit print() so the figure reaches the PDF device even when the code is
## not run at top level (e.g. via source()).
print(pca_panel)
dev.off()
pca_panel


In [None]:
## Build one master peak set per histone mark by merging the peaks of the
## selected CM patients with the peaks of the CM_Product healthy-donor samples.
outPath <- "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
histList <- c("H3K27me3", "H3K4me2")

topN <- 20
## Both RData files define an object called `peakAll`, so each is copied to
## its own name immediately after loading.
load(file = paste0(outPath, "/RData/PatientData_peakOverlap_withDuplicates_seacrOnly_top0_", topN, ".RData"))
peakAll_pt <- peakAll
load(file = paste0(outPath, "/RData/masterPeak_peakAnno_histList_hd1-7_SEACRcontrolTop10_noChrXYM.RData"))
peakAll_hd <- peakAll

## Named up front: assigning by name into an unnamed vector("list", 2) would
## append the marks after two stray NULL slots.
mPeak <- setNames(vector("list", length(histList)), histList)
for (hist in histList) {
    ## Patient peaks are stored per mark, then per patient.
    pt_peaks <- lapply(patientRaw$Xnumber[cm_ind], function(pt) peakAll_pt[[hist]][[pt]])

    ## Healthy-donor peaks are stored in a flat list; pick the entries whose
    ## name mentions both "CM_Product" and the current mark.
    hd_names <- names(peakAll_hd)
    selectC <- hd_names[stringr::str_detect(hd_names, "CM_Product") & stringr::str_detect(hd_names, hist)]
    hd_peaks <- lapply(selectC, function(hd) peakAll_hd[[hd]])

    ## Concatenate all peak sets, then merge overlapping ranges.
    all_peaks <- Reduce(append, c(pt_peaks, hd_peaks), GRanges())
    mPeak[[hist]] <- reduce(all_peaks)
}
mPeak$H3K4me2 %>% length
mPeak$H3K27me3 %>% length

In [81]:
## overlap with bam file to get count -- !!! Run it in terminal directly
# library(doMC)
# coreN <- 6
# registerDoMC(cores = coreN)

# outPath = "CART_CUTRUN_Project/results/CUTANDRUN/analysis/"
# histList = c("H3K27me3", "H3K4me2")
# bamSelect = vector("list", length(histList))
# patient_ordered = vector("list", length(histList))
# fragment_tss_counts = vector("list", length(histList))

# patientRaw = read_xlsx("CART_CUTRUN_Project/report/CART_CUTRUN/analysis/20201028_NHL_CUTandRUN_PatientData.xlsx")
# cm_ind = which(patientRaw$`CD8+ T cell isolation` == "CD8TCM")
# patientRaw$Xnumber[cm_ind]

# ## dataset 1
# inPath = "/shared/ngs/illumina/sfiorenz/201014_D00300_1082_AHHYJJBCX3/analysis/"
# index = grep("IP", list.files(inPath), fixed = TRUE)[!grep("IP", list.files(inPath), fixed = TRUE) %in% grep("IgG", list.files(inPath), fixed = TRUE)]
# sampleL = list.files(inPath)[index]
# sampleLtmp = sampleL %>% strsplit("_") %>% unlist
# patientL = sampleLtmp[seq(4, length(sampleLtmp), 7)]
# histL = sampleLtmp[seq(6, length(sampleLtmp), 7)]

# for(i in 1:length(sampleL)){
#   sample = sampleL[i]
#   hist = histL[i]
#   pt = patientL[i]
#   if(pt %in% patientRaw$Xnumber[cm_ind]){
#     bamFile = paste0(inPath, sample, "/bowtie2_align.bam")
#     if(file.exists(bamFile)){
#         bamSelect[[hist]] = c(bamSelect[[hist]], bamFile)
#         patient_ordered[[hist]] = c(patient_ordered[[hist]], pt)
#     }else{
#         print(paste0(bamFile, " does not exist!"))
#     }
#   }
# }

# ## dataset 2

# inPath = "/shared/ngs/illumina/tphi/200924_D00300_1063_AHHLGCBCX3/analysis/"
# index = grep("IP", list.files(inPath), fixed = TRUE)[!grep("IP", list.files(inPath), fixed = TRUE) %in% grep("IgG", list.files(inPath), fixed = TRUE)]
# sampleL = list.files(inPath)[index]
# sampleLtmp = sampleL %>% strsplit("_") %>% unlist
# patientL = sampleLtmp[seq(3, length(sampleLtmp), 6)]
# histL = sampleLtmp[seq(5, length(sampleLtmp), 6)]

# for(i in 1:length(sampleL)){
#   sample = sampleL[i]
#   hist = histL[i]
#   pt = patientL[i]
#   if(pt %in% patientRaw$Xnumber[cm_ind]){
#     bamFile = paste0(inPath, sample, "/bowtie2_align.bam")
#     if(file.exists(bamFile)){
#         bamSelect[[hist]] = c(bamSelect[[hist]], bamFile)
#         patient_ordered[[hist]] = c(patient_ordered[[hist]], pt)
#     }else{
#         print(paste0(bamFile, " does not exist!"))
#     }
#   }
# }

# ## healthy donor
# bamDir <- "CART_CUTRUN_Project/results/CUTANDRUN/process/"
# # fragment_tss_counts = vector("list", length(histList))
# for(hist in histList){
#    for(cell in "CM"){
#     for(expr in "Product"){
#       hdL = paste0("HD", 1:7)
#       for(hd in hdL){
#         ## Get the replicate number and peak calling type
#         rep = "rep1"
#         bamType = "bowtie2_align.bam"
#         bamFile = paste0(bamDir, hist, "_CD8_", cell, "_", expr, "_", hd, "_", rep, "/alignment/", bamType)
#         if(file.exists(bamFile)){
#           bamSelect[[hist]] = c(bamSelect[[hist]], bamFile)
#           patient_ordered[[hist]] = c(patient_ordered[[hist]], hd)
#         }else{
#           print(paste0(bamFile, " does not exist!"))
#         }
#       }
#     }
#    }
# }


# ## read in the reads
# for(hist in histList){
#   fragment_tss_counts[[hist]] <- mclapply(as.list(bamSelect[[hist]]), chromVAR::getCounts, mPeak[[hist]], paired = TRUE, by_rg = FALSE, format = "bam", mc.cores = coreN)
# }

# saveRDS(fragment_tss_counts, file = paste0(outPath, "/RData/peakRegions_chromVar_count_CMpatients_CMhealthydonor_noChrXYM.rds"))
# saveRDS(list(bamSelect, patient_ordered), file = paste0(outPath, "/RData/peakRegions_chromVar_count_CMpatients_CMhealthydonor_noChrXYM_file_pt_info.rds"))



## Reload the cached per-sample counts over the master peaks, together with
## the BAM paths and sample identifiers saved alongside them.
counts_rds <- paste0(outPath, "/RData/peakRegions_chromVar_count_CMpatients_CMhealthydonor_noChrXYM.rds")
info_rds <- paste0(outPath, "/RData/peakRegions_chromVar_count_CMpatients_CMhealthydonor_noChrXYM_file_pt_info.rds")

fragment_tss_counts <- readRDS(file = counts_rds)
## The info file holds a two-element list: BAM paths first, sample ids second.
tmp <- readRDS(file = info_rds)
bamSelect <- tmp[[1]]
patient_ordered <- tmp[[2]]


In [None]:
library(chromVAR)
library(SummarizedExperiment)
library(Matrix)

## Assemble, for each histone mark, the raw and depth-normalised peak-by-sample
## count matrices from the per-sample count objects, record every sample's
## sequencing depth in designInfo, and save all three to disk.

## Common depth that normalised counts are scaled to.
seq_depth_ref <- 13240732

## Lists are named up front: assigning by name into an unnamed
## vector("list", n) appends new elements and leaves n stray NULL slots.
countMat <- setNames(vector("list", length(histList)), histList)
normMat <- setNames(vector("list", length(histList)), histList)
seqDepth <- setNames(vector("list", length(histList)), histList)
designInfo <- c()

## "chr:start-end" label for every range in `gr`.
region_label <- function(gr) {
  gr %>% data.frame %>% mutate(region = paste0(seqnames, ":", start, "-", end)) %$% region
}

for (hist in histList) {
  n_sample <- length(bamSelect[[hist]])
  sample_counts <- fragment_tss_counts[[hist]]

  ## Raw counts: first column of each sample's count object.
  counts_mark <- matrix(
    NA, length(mPeak[[hist]]), n_sample,
    dimnames = list(region_label(mPeak[[hist]]), patient_ordered[[hist]])
  )
  for (k in seq_len(n_sample)) {
    counts_mark[, k] <- counts(sample_counts[[k]])[, 1]
  }

  ## Sequencing depth: first entry of each sample's colData.
  depth <- unlist(lapply(seq_len(n_sample), function(k) sample_counts[[k]]@colData[1, 1]))

  countMat[[hist]] <- counts_mark
  seqDepth[[hist]] <- depth
  ## divide by the seqDepth and multiply by the largest seqDepth
  normMat[[hist]] <- sweep(counts_mark, 2, depth, "/") * seq_depth_ref

  designInfo <- rbind(
    designInfo,
    data.frame(exps = patient_ordered[[hist]], depth = depth, hist = hist)
  )
}

# ## read in patient info
# patientRaw = read_xlsx("CART_CUTRUN_Project/report/CART_CUTRUN/analysis/20201028_NHL_CUTandRUN_PatientData.xlsx")
# info = patientRaw %>% data.frame %>% dplyr::select(Xnumber, candr_day, age, sex, bestresp) 

# countMat$H3K27me3 = countMat$H3K27me3[, patientRaw$Xnumber]
# countMat$H3K4me2 = countMat$H3K4me2[, patientRaw$Xnumber]

# infoData = list(H3K27me3 = data.frame(Xnumber = colnames(countMat[[histList[1]]])) %>% left_join(., info, by = "Xnumber") %>% mutate(candr_day = factor(candr_day), age = factor(age), sex = factor(sex), bestresp = factor(bestresp, levels = c("CR", "PR", "SD", "PD"))), 
#                 H3K4me2 = data.frame(Xnumber = colnames(countMat[[histList[2]]])) %>% left_join(., info, by = "Xnumber") %>% mutate(candr_day = factor(candr_day), age = factor(age), sex = factor(sex), bestresp = factor(bestresp, levels = c("CR", "PR", "SD", "PD"))))

# colnames(countMat$H3K27me3) = paste0(infoData$H3K27me3$Xnumber, "_", infoData$H3K27me3$bestresp)
# colnames(countMat$H3K4me2) = paste0(infoData$H3K4me2$Xnumber, "_", infoData$H3K4me2$bestresp)


saveRDS(countMat, file = paste0(outPath, "/RData/peakRegion_chromVar_count_CMpatient_CMhealthydonor_noChrXYM_countMat.rds"))
saveRDS(normMat, file = paste0(outPath, "/RData/peakRegion_chromVar_count_CMpatient_CMhealthydonor_noChrXYM_normMat.rds"))
saveRDS(designInfo, file = paste0(outPath, "/RData/peakRegion_chromVar_count_CMpatient_CMhealthydonor_noChrXYM_designInfo.rds"))
# normMat = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_normMat.rds"))
# designInfo = readRDS(file = paste0(outPath, "/RData/TSS_neighbors_chromVar_count_histList_hd1-7_noChrXYM_designInfo.rds"))


In [None]:
## Sample grouping for the peak-region comparison: 15 patient samples followed
## by 7 healthy-donor samples.
## NOTE(review): the group sizes are hard-coded; confirm they match the column
## order of the peak count matrices.
groupList <- c("PT", "HD")
# hdList <- infoData$H3K4me2$Xnumber
histList <- c("H3K27me3", "H3K4me2")
target <- data.frame(group = rep(c("PT", "HD"), times = c(15, 7)))
target$group <- factor(target$group, levels = groupList)

## Experimental design: one coefficient per group, no intercept.
treat <- factor(target$group)
design <- model.matrix(~0 + treat)
colnames(design) <- levels(treat)
## Single contrast: patient minus healthy donor.
contrast <- makeContrasts(PT_HD = PT - HD, levels = design)

target
design
contrast


## Gene locations (gene-level records only, without chrM/chrX/chrY), used to
## link peaks to their nearest gene.
## NOTE(review): this reads GENCODE v21 while the TSS windows earlier in the
## notebook are built from GENCODE v33 -- confirm the mix is intended.
gtf_filter_df <- rtracklayer::import('SupplementaryData/hg38/GENCODE/gencode.v21.annotation.gtf') %>%
    as.data.frame %>%
    dplyr::filter(type == "gene" & !(seqnames %in% c("chrM", "chrX", "chrY")))
gene_gr <- with(
    gtf_filter_df,
    GRanges(
        seqnames = seqnames,
        ranges = IRanges(start = start, end = end),
        strand = strand,
        gene_id = gene_id,
        gene_name = gene_name,
        gene_type = gene_type
    )
)



In [96]:
## limma-voom differential analysis of the master-peak counts, patient (PT) vs
## healthy donor (HD), run separately for each histone mark. Every peak is
## also linked to its nearest gene(s) within `min_dist_peak_to_gene` bp.
## Writes two CSVs per mark and saves the result lists to an RData file.
min_dist_peak_to_gene <- 5000
## Lists are created with names up front: assigning by name into an unnamed
## vector("list", n) appends new elements and leaves n stray NULL slots.
results <- setNames(vector("list", length(histList)), histList)
results_fullInfo <- setNames(vector("list", length(histList)), histList)
voomDDS <- setNames(vector("list", length(histList)), histList)
for (hist in histList) {
    ## Filter and delete low-count peaks
    selectR <- which(rowSums(countMat[[hist]]) > 200) ## remove low count genes

    dataS <- countMat[[hist]][selectR, ]
    # rownames(dataS) = peakRegionList
    voomDDS[[hist]] <- voom(counts = dataS, design = design, normalize.method = "cyclicloess", plot = FALSE)
    ## option 1 using voomDDS option2 using normDDS as normalized input.
    inputDDS <- voomDDS[[hist]]
    ## corfit$consensus
    fit <- lmFit(inputDDS, design)
    fitContrast <- contrasts.fit(fit, contrast)
    fitBayes <- eBayes(fitContrast, robust = TRUE)


    results[[hist]] <- list()
    results_fullInfo[[hist]] <- list()
    for (i in seq_len(ncol(contrast))) {
      ## Results, most significant first.
      res <- topTable(fit = fitBayes, adjust.method = 'fdr', coef = i, number = nrow(inputDDS), sort.by = 'P')

      res <- data.table(PeakRegion = rownames(res), res)
      ## Up/Down needs adjusted P <= 0.1 and |logFC| >= 1.
      res[, Significance := ifelse((adj.P.Val <= 0.1 & sign(logFC) == 1 & abs(logFC) >= 1), 'Up',
                              ifelse((adj.P.Val <= 0.1 & sign(logFC) == -1 & abs(logFC) >= 1), 'Down', 'notDE'))]
      ## Attach the raw counts of every sample to the statistics.
      results[[hist]][[i]] <- left_join(res, dataS %>% data.frame %>% dplyr::mutate(PeakRegion = rownames(dataS)), by = "PeakRegion")

      ## overlap with genes: parse the "chr:start-end" labels back into ranges
      results_peakRegion <- results[[hist]][[i]] %$% PeakRegion
      results_chrom <- gsub(":.*", "", results_peakRegion)
      results_start <- gsub(".*:", "", results_peakRegion) %>% gsub("-.*", "", .) %>% as.numeric
      results_end <- gsub(".*:", "", results_peakRegion) %>% gsub(".*-", "", .) %>% as.numeric
      results_peakRegion_gr <- GRanges(seqnames = results_chrom, IRanges(start = results_start, end = results_end), strand = "*")

      ## get the nearest gene for each peak; select = "all" keeps ties, so a
      ## peak can appear more than once in the linked table
      dp_match_gene <- distanceToNearest(results_peakRegion_gr, gene_gr, select = "all")
      dp_match_gene <- dp_match_gene[which(mcols(dp_match_gene)$distance <= min_dist_peak_to_gene)]

      ## Accessor functions are used instead of reaching into the @from/@to slots.
      peak_idx <- queryHits(dp_match_gene)
      matched_genes <- data.frame(gene_gr[subjectHits(dp_match_gene)])

      results_fullInfo[[hist]][[i]] <- cbind(
                results[[hist]][[i]][peak_idx, ],
                matched_genes %>% dplyr::select(gene_name, gene_id, gene_type)
      )
      results_fullInfo[[hist]][[i]]$gene_region <- paste0(matched_genes$seqnames, ":", matched_genes$start, "-", matched_genes$end)
      results_fullInfo[[hist]][[i]]$peak_distance_to_gene <- mcols(dp_match_gene)$distance
      ## Output
      ## NOTE(review): the file names do not include the contrast, so with
      ## more than one contrast each iteration would overwrite the previous files.
      write.table(results[[hist]][[i]], file = paste0(outPath, '/CUTRUN_limma_tables/DE_', hist, '_peakRegions_cmPatient_vs_cmHealthyDonor_InfusionProduct_adj0.1_logFC2.csv'), quote = FALSE, row.names = FALSE, sep = ",")
      write.table(results_fullInfo[[hist]][[i]], file = paste0(outPath, '/CUTRUN_limma_tables/DE_', hist, '_peakRegions_cmPatient_vs_cmHealthyDonor_InfusionProduct_adj0.1_logFC2_link_nearest_genes.csv'), quote = FALSE, row.names = FALSE, sep = ",")
    }
    names(results[[hist]]) <- colnames(contrast)
    names(results_fullInfo[[hist]]) <- colnames(contrast)


}
save(results, results_fullInfo, voomDDS, file = paste0(outPath, "/RData/results_histList_noChrXYM_peakRegions_cmPatient_vs_cmHealthyDonor_InfusionProduct.RData"))


In [None]:
## PCA of the samples on all master-peak regions: one panel per histone mark
## plus one panel on both marks stacked together. The figure is written to a
## PDF and also shown in the notebook.
options(repr.plot.width=22, repr.plot.height=10)

## Keep the rows of `mat` whose total count over all samples exceeds `min_total`.
filter_low_count_rows <- function(mat, min_total = 200) {
    mat[which(rowSums(mat) > min_total), ]
}

## Build a Seurat object from a region-by-sample count matrix, mark every
## region as a variable feature, scale, and run a 15-component PCA.
## Returns the Seurat object with the "pca" reduction attached.
run_sample_pca <- function(counts) {
    obj <- CreateSeuratObject(counts = counts, project = "cart", min.cells = 0, min.features = 50) %>%
        # NormalizeData(., normalization.method = "LogNormalize", scale.factor = 1000000) %>%
        FindVariableFeatures(., selection.method = "vst", nfeatures = nrow(counts)) %>%
        ScaleData()
    RunPCA(obj, features = VariableFeatures(object = obj), npcs = 15)
}

## Scatter plot of PC1 vs PC2 coloured by `sample_group` (one label per
## sample, in column order of the count matrix).
plot_sample_pca <- function(obj, sample_group, title) {
    pcs <- Embeddings(obj, reduction = "pca")[, 1:2] %>% data.frame
    ## Labels are attached by position, so fail loudly on any mismatch
    ## instead of mislabelling samples.
    if (nrow(pcs) != length(sample_group)) {
        stop("PCA has ", nrow(pcs), " sample(s) but ", length(sample_group),
             " group label(s) were supplied.", call. = FALSE)
    }
    pcs$group <- sample_group
    ggplot(pcs, aes(x = PC_1, y = PC_2, color = group)) +
        geom_point() +
        theme_bw(base_size = 20) +
        scale_color_viridis(discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
        xlab("PC 1") +
        ylab("PC 2") +
        ggtitle(title) +
        rremove("legend.title")
}

## Group labels come from the design table used for the differential analysis
## rather than from a second hard-coded copy of the sample counts. They are
## kept as character so the colour assignment follows alphabetical order, as
## with plain strings.
sample_group <- as.character(target$group)

k27_counts <- filter_low_count_rows(countMat[["H3K27me3"]])
k4_counts <- filter_low_count_rows(countMat[["H3K4me2"]])

k27_obj <- run_sample_pca(k27_counts)
k27_plot <- plot_sample_pca(k27_obj, sample_group, "H3K27me3")

k4_obj <- run_sample_pca(k4_counts)
k4_plot <- plot_sample_pca(k4_obj, sample_group, "H3K4me2")

## Both marks stacked row-wise: every sample is described by the peaks of both
## histone marks.
cr_obj <- run_sample_pca(rbind(k27_counts, k4_counts))
cr_plot <- plot_sample_pca(cr_obj, sample_group, "H3K27me3 + H3K4me2")

pca_panel <- ggarrange(k27_plot, k4_plot, cr_plot, common.legend = TRUE, ncol = 3, nrow = 1)

pdf("CART_CUTRUN_Project/results/paper_figure/patient-related/CMpatient_CMhealthyDonor_PCA_on_all_peakRegions.pdf", width = 15, height = 6)
## Explicit print() so the figure reaches the PDF device even when the code is
## not run at top level (e.g. via source()).
print(pca_panel)
dev.off()
pca_panel
