In [1]:
suppressPackageStartupMessages({
    library(chromVAR)
    library(motifmatchr)
    library(SummarizedExperiment)
    library(Matrix)
    library('cicero')
})

In [2]:
library(BiocParallel)

In [3]:
register(MulticoreParam(20))

# chromVAR

In [None]:
## Rscript
# Rscript chromVAR.R -d data/Hematopoiesis-All/data_filtered -p data/Hematopoiesis-All/Hematopoiesis-All_filtered_peak.bed -g hg19 -n hema_all_f -o result/Hematopoiesis-All/chromVAR_f

In [5]:
library(BSgenome.Hsapiens.UCSC.hg19)
genome = BSgenome.Hsapiens.UCSC.hg19
species = 'Homo sapiens'

Loading required package: BSgenome
Loading required package: Biostrings
Loading required package: XVector

Attaching package: 'Biostrings'

The following object is masked from 'package:DelayedArray':

    type

The following object is masked from 'package:base':

    strsplit

Loading required package: rtracklayer


In [7]:
motifs <- getJasparMotifs(species=species)

In [6]:
#data_file = 'data//Hematopoiesis-All/data'
#peak_file = 'data//Hematopoiesis-All/Hematopoiesis-All_peak.bed'
data_file = 'data//Hematopoiesis-All/data_filtered'
peak_file = 'data//Hematopoiesis-All/Hematopoiesis-All_filtered_peak.bed'

dir.create(outdir, recursive=TRUE, showWarnings=FALSE)

#counts = read.table(data_file, row.names=1, header=T)
counts = read_count(data_file)
peaks = getPeaks(peak_file, sort_peaks=T) 

peak_index = as.data.frame(peaks)
peak_index = paste0(peak_index$seqnames, ':' ,peak_index$start-1, '-', peak_index$end)

index = row.names(counts)%in%peak_index
counts = counts[index,]

In [None]:
dev = apply_chromVAR(counts, peaks, motifs, genome)

In [10]:
outdir = 'result/Hematopoiesis-All/chromVAR_f'
name = 'hema_all_f'
write.table(deviationScores(dev), paste0(outdir, '/', name, '_dev.txt'), quote=F, sep='\t')
var = computeVariability(dev)
write.table(var, paste0(outdir, '/',name, '_var.txt'), quote=F, sep='\t')

In [12]:
saveRDS(dev, "result/Hematopoiesis-All/chromVAR_f/dev.rds")

# Cicero

In [3]:
suppressPackageStartupMessages({
    library(BSgenome.Hsapiens.UCSC.hg19)
    library(TxDb.Hsapiens.UCSC.hg19.knownGene)
    library(org.Hs.eg.db)
    library(cicero)
    library(magrittr)
    })

txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
orgdb <- org.Hs.eg.db
bsgenome <- BSgenome.Hsapiens.UCSC.hg19

In [17]:
# read in matrix data using the Matrix package
indata <- Matrix::readMM("data/Hematopoiesis-All/data/count.mtx") 
# binarize the matrix
indata@x[indata@x > 0] <- 1

# format cell info
cellinfo <- read.table("data/Hematopoiesis-All/data/barcodes.txt",comment.char = "")
row.names(cellinfo) <- cellinfo$V1
names(cellinfo) <- "cells"

# format peak info
peakinfo <- read.table("data/Hematopoiesis-All/Hematopoiesis-All_peak.bed")
names(peakinfo) <- c("chr", "bp1", "bp2")
peakinfo$site_name <- paste(peakinfo$chr, peakinfo$bp1, peakinfo$bp2, sep="_")
row.names(peakinfo) <- peakinfo$site_name

row.names(indata) <- row.names(peakinfo)
colnames(indata) <- row.names(cellinfo)

# make CDS
fd <- methods::new("AnnotatedDataFrame", data = peakinfo)
pd <- methods::new("AnnotatedDataFrame", data = cellinfo)
input_cds <-  suppressWarnings(newCellDataSet(indata,
                            phenoData = pd,
                            featureData = fd,
                            expressionFamily=VGAM::binomialff(),
                            lowerDetectionLimit=0))
input_cds@expressionFamily@vfamily <- "binomialff"
input_cds <- monocle::detectGenes(input_cds)

#Ensure there are no peaks included with zero reads
input_cds <- input_cds[Matrix::rowSums(exprs(input_cds)) != 0,] 


In [None]:
cells = read.table('data/Hematopoiesis-All/GSE129785_scATAC-Hematopoiesis-All.cell_barcodes.txt', 
                   comment.char = "", head=1)
dimred <- data.frame(row.names = cells$Group_Barcode, 
                     cells$UMAP1, 
                     cells$UMAP2)

In [None]:
set.seed(2017)
input_cds <- detectGenes(input_cds)
input_cds <- estimateSizeFactors(input_cds)

In [19]:
saveRDS(input_cds, "result/Hematopoiesis-All/hema_all_input.rds")

In [2]:
input_cds <- readRDS("result/Hematopoiesis-All/hema_all_input.rds")

In [None]:
cicero_cds  <- make_cicero_cds(input_cds, k = 50, reduced_coordinates = dimred[colnames(input_cds),])

In [5]:
object.size(input_cds)

129578392 bytes

In [18]:
object.size(input_cds)

129578392 bytes

In [33]:
cicero_cds  <- make_cicero_cds(input_cds, k = 50, reduced_coordinates = dimred[colnames(input_cds),])

Overlap QC metrics:
Cells per bin: 50
Maximum shared cells bin-bin: 44
Mean shared cells bin-bin: 0.0384606816936211
Median shared cells bin-bin: 0


ERROR: Error in asMethod(object): Cholmod error 'problem too large' at file ../Core/cholmod_dense.c, line 105


In [6]:
input_cds_f=input_cds[Matrix::rowSums(exprs(input_cds)) >= 1000

In [7]:
object.size(input_cds_f)

38079080 bytes

In [8]:
saveRDS(input_cds_f, "result/Hematopoiesis-All/hema_all_input_f.rds")

In [43]:
set.seed(2017)
input_cds_f <- detectGenes(input_cds_f)
input_cds_f <- estimateSizeFactors(input_cds_f)

In [44]:
cicero_cds_f  <- make_cicero_cds(input_cds_f, k = 50, reduced_coordinates = dimred[colnames(input_cds_f),])

Overlap QC metrics:
Cells per bin: 50
Maximum shared cells bin-bin: 44
Mean shared cells bin-bin: 0.0384606816936211
Median shared cells bin-bin: 0


In [45]:
saveRDS(cicero_cds_f, "result/Hematopoiesis-All/hema_all_cicero_f.rds")

In [3]:
cicero_cds_f <- readRDS("result/Hematopoiesis-All/hema_all_cicero_f.rds")

In [54]:
# choose genome region
bsgenome <- BSgenome.Hsapiens.UCSC.hg19
chromSizes <- seqlengths(bsgenome)[paste0("chr",c(1:22,"X"))]
genome <- data.frame(names(chromSizes),chromSizes)
rownames(genome) <- NULL

In [None]:
data("human.hg19.genome")
sample_genome <- subset(human.hg19.genome, V1 == "chr1")
conns <- run_cicero(cicero_cds_f, sample_genome) # Takes a few minutes to run

In [11]:
write.table(conns, 'result//Hematopoiesis-All/hema_all_f_conns.txt',quote = FALSE, sep='\t', row.names = FALSE)

In [4]:
conns = read.table( 'result//Hematopoiesis-All/hema_all_f_conns.txt', sep='\t', header = 1)
input_cds_f = readRDS('result//Hematopoiesis-All/hema_all_input_f.rds')

In [8]:
tssWindow <- 2500
flank <- 250*10^3
corCutOff <- 0.35

#Annotate CDS
message("Annotating Cell Data Set...")
genes <- getTxDbGenes(txdb=txdb,orgdb=orgdb)
names(genes) <- genes$symbol
genes <- resize(genes, 1, "start") %>% resize(tssWindow * 2 + 1, "center")
geneDF <- data.frame(chromosome=seqnames(genes),start=start(genes),end=end(genes), gene=genes$symbol)
obj <- annotate_cds_by_site(input_cds_f, geneDF)

#Prepare for Co-Accessibility
nSites <- Matrix::colSums(assayData(obj)$exprs)
names(nSites) <- row.names(pData(obj))


Annotating Cell Data Set...


In [9]:
#Cicero with Correlations
message("Calculating normalized gene activities...")
ciceroGA <- normalize_gene_activities(build_gene_activity_matrix(obj, conns, coaccess_cutoff = corCutOff), nSites)

Calculating normalized gene activities...


In [None]:
library('Matrix')
writeMM(ciceroGA,file='result//Hematopoiesis-All/hema_all_f_ciceroGA.txt')

In [None]:
write.table(rownames(ciceroGA),'result//Hematopoiesis-All/hema_all_f_ciceroGA_genes.txt',
           quote = FALSE)
write.table(colnames(ciceroGA),'result//Hematopoiesis-All/hema_all_f_ciceroGA_barcodes.txt',
           quote = FALSE)

In [None]:
write.table(ciceroGA, 'result//Hematopoiesis-All/hema_all_f_ciceroGA.txt',
            quote = FALSE, sep='\t', row.names = FALSE)

In [11]:
dim(ciceroGA)

In [None]:
head(ciceroGA)

In [20]:
library(SummarizedExperiment)
seCicero <- SummarizedExperiment(
	assays = SimpleList(gA = ciceroGA),
	rowRanges = genes[rownames(ciceroGA),],
	colData = mdata
)

seCiceroLog <- SummarizedExperiment(
	assays = SimpleList(logGA = log2(10^6 * ciceroGA + 1)),
	rowRanges = genes[rownames(ciceroGA),],
	colData = mdata
)

#Save Output
#saveRDS(connections, "results/Peaks-Co-Accessibility.rds")
#saveRDS(seCicero, "results/Cicero-Gene-Activity.rds")
#saveRDS(seCiceroLog, "results/Cicero-Log2-Gene-Activity.rds")

Loading required package: DelayedArray
Loading required package: matrixStats

Attaching package: 'matrixStats'

The following objects are masked from 'package:Biobase':

    anyMissing, rowMedians

Loading required package: BiocParallel

Attaching package: 'DelayedArray'

The following objects are masked from 'package:matrixStats':

    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

The following object is masked from 'package:Biostrings':

    type

The following objects are masked from 'package:base':

    aperm, apply



ERROR: Error in is(colData, "DataFrame"): object 'mdata' not found


In [7]:
getTxDbGenes <- function(txdb = NULL, orgdb = NULL, gr = NULL, ignore.strand = TRUE){
    
    if (is.null(genome)) {
        if (is.null(txdb) | is.null(orgdb)) {
            stop("If no provided genome then you need txdb and orgdb!")
        }
    }
        
    if (is.null(gr)) {
        genes <- GenomicFeatures::genes(txdb)
    }else {
        genes <- suppressWarnings(subsetByOverlaps(GenomicFeatures::genes(txdb), gr, ignore.strand = ignore.strand))
    }
    
    if (length(genes) > 1) {
        mcols(genes)$symbol <- suppressMessages(mapIds(orgdb, 
            keys = mcols(genes)$gene_id, column = "SYMBOL", keytype = "ENTREZID", 
            multiVals = "first"))
        genes <- sort(sortSeqlevels(genes), ignore.strand = TRUE)
        names(genes) <- NULL
        out <- genes
    }else {
        out <- GRanges(seqnames(gr), ranges = IRanges(0, 0), gene_id = 0, symbol = "none")[-1]
    }

    return(out)

}