In [105]:
# Pseudobulk methods for cell type comparison in lupus data

library(edgeR)
library(DESeq2)

# Directory holding the input count tables; results are written here too.
data_path <- '/data_volume/memento/method_comparison/lupus/'

### Read the data

# Both tables use the first row as header and the first column as row names.
bulk <- read.table(
    paste0(data_path, 'T4_vs_cM.bulk.csv'),
    sep = ',', header = TRUE, row.names = 1
)
pseudobulk <- read.table(
    paste0(data_path, 'T4_vs_cM.pseudobulk.csv'),
    sep = ',', header = TRUE, row.names = 1
)

### Create the metadata

num_inds <- 4

# One metadata row per column of `bulk`.
#   name: the column name itself
#   ct:   'CD4', 'CD14' repeated num_inds times
#         (assumes the columns alternate CD4, CD14 -- TODO confirm)
#   ind:  the last '_'-separated token of the column name
metadata <- data.frame(
    name = colnames(bulk),
    ct = rep(c('CD4', 'CD14'), num_inds),
    ind = sapply(strsplit(colnames(bulk), '_'), tail, 1)
)


### Run edgeR

# Additive design used by the edgeR fits: individual (`subject`) plus cell
# type (`ct`), both taken from `metadata` as factors.
subject <- factor(metadata[['ind']])
ct <- factor(metadata[['ct']])
design <- model.matrix(~ subject + ct)

# Build a filtered, normalised DGEList with dispersion estimates.
#
# data:   count table handed to DGEList(counts = ...).
# design: design matrix forwarded to estimateDisp().
#
# Returns the DGEList after filterByExpr() filtering, calcNormFactors() and
# estimateDisp().
edger_preprocess <- function(data, design) {
    dge <- DGEList(counts = data)

    # NOTE(review): filterByExpr() is called with min.count = 1 and without
    # the design or a group -- confirm that is intended.
    expressed <- filterByExpr(dge, min.count = 1)

    # Library sizes are recomputed after dropping the filtered rows.
    dge <- dge[expressed, , keep.lib.sizes = FALSE]

    estimateDisp(calcNormFactors(dge), design)
}

# Likelihood-ratio test on the last coefficient of `design`.
#
# y:      DGEList with dispersion estimates (e.g. from edger_preprocess()).
# design: design matrix used for the GLM fit; its last column is the tested
#         coefficient.
#
# Returns the topTags() result for the test with n = Inf (no row limit).
edger_get_lrt <- function(y, design) {
    fit <- glmFit(y, design)

    # Index the tested coefficient from the design that was passed in rather
    # than from the global `num_inds`. For an intercept + subject + ct design
    # with `num_inds` distinct subjects this is the same column, num_inds + 1.
    lrt <- glmLRT(fit, coef = ncol(design))

    topTags(lrt, n = Inf)
}
# Quasi-likelihood F-test on the last coefficient of `design`.
#
# y:      DGEList with dispersion estimates (e.g. from edger_preprocess()).
# design: design matrix used for the quasi-likelihood fit; its last column is
#         the tested coefficient.
#
# Returns the topTags() result for the test with n = Inf (no row limit).
edger_get_qlft <- function(y, design) {
    fit <- glmQLFit(y, design)

    # Index the tested coefficient from the design that was passed in rather
    # than from the global `num_inds`. For an intercept + subject + ct design
    # with `num_inds` distinct subjects this is the same column, num_inds + 1.
    qlft <- glmQLFTest(fit, coef = ncol(design))

    topTags(qlft, n = Inf)
}

# edgeR on the bulk counts: likelihood-ratio and quasi-likelihood F tests.
bulk_y <- edger_preprocess(bulk, design)
bulk_result_lrt <- edger_get_lrt(bulk_y, design)
bulk_result_qlft <- edger_get_qlft(bulk_y, design)

write.csv(
    bulk_result_lrt,
    paste0(data_path, 'T4_vs_cM.bulk.edger_lrt.csv')
)
write.csv(
    bulk_result_qlft,
    paste0(data_path, 'T4_vs_cM.bulk.edger_qlft.csv')
)

# edgeR on the pseudobulk counts (likelihood-ratio test only), with the same
# design matrix as the bulk run.
pseudobulk_y <- edger_preprocess(pseudobulk, design)
pseudobulk_lrt <- edger_get_lrt(pseudobulk_y, design)
write.csv(
    pseudobulk_lrt,
    paste0(data_path, 'T4_vs_cM.pseudobulk.edger_lrt.csv')
)

# Tagwise dispersion of every row retained in the pseudobulk DGEList.
dispersion_df <- data.frame(
    gene = rownames(pseudobulk_y),
    dispersion = pseudobulk_y$tagwise.dispersion
)

write.csv(dispersion_df, paste0(data_path, 'T4_vs_cM.dispersions.csv'))

### Run DESeq2

# DESeq2 with its default test for the cell-type effect.
#
# data: count table; values are rounded before being handed to DESeq2.
# info: per-sample table with the `ind` and `ct` columns used by the design.
#
# Returns the results() table for the "ct_CD4_vs_CD14" coefficient.
run_deseq2_wald <- function(data, info) {
    dds <- DESeqDataSetFromMatrix(countData = round(data),
                                  colData = info,
                                  design = ~ ind + ct)

    # Drop the first "." from each individual label.
    levels(dds$ind) <- sub("\\.", "", levels(dds$ind))

    dds <- DESeq(dds)

    # The bare resultsNames(dds) call that used to sit here was evaluated and
    # discarded, so it has been removed. Call resultsNames() on the fitted
    # object interactively to list the available coefficient names.
    results(dds, name = "ct_CD4_vs_CD14")
}

# DESeq2 likelihood-ratio test for the cell-type effect.
#
# data: count table; values are rounded before being handed to DESeq2.
# info: per-sample table with the `ind` and `ct` columns used by the design.
#
# The full model `~ ind + ct` is compared against the reduced model `~ ind`.
# Returns the results() table for the "ct_CD4_vs_CD14" coefficient.
run_deseq2_lrt <- function(data, info) {
    rounded_counts <- round(data)
    dds <- DESeqDataSetFromMatrix(
        countData = rounded_counts,
        colData = info,
        design = ~ ind + ct
    )

    # Drop the first "." from each individual label.
    ind_labels <- levels(dds$ind)
    levels(dds$ind) <- sub("\\.", "", ind_labels)

    fitted <- DESeq(dds, test = "LRT", reduced = ~ ind)
    results(fitted, name = "ct_CD4_vs_CD14")
}

# DESeq2 on the bulk counts: Wald and likelihood-ratio tests.
bulk_deseq2_wald <- run_deseq2_wald(bulk, metadata)
bulk_deseq2_lrt <- run_deseq2_lrt(bulk, metadata)
write.csv(
    bulk_deseq2_lrt,
    paste0(data_path, 'T4_vs_cM.bulk.deseq2_lrt.csv')
)
write.csv(
    bulk_deseq2_wald,
    paste0(data_path, 'T4_vs_cM.bulk.deseq2_wald.csv')
)

# DESeq2 on the pseudobulk counts (Wald test only), with the same metadata.
pseudobulk_deseq2_wald <- run_deseq2_wald(pseudobulk, metadata)
write.csv(
    pseudobulk_deseq2_wald,
    paste0(data_path, 'T4_vs_cM.pseudobulk.deseq2_wald.csv')
)

In [19]:
# R
suppressPackageStartupMessages({
    library(ggplot2)
    library(limma)
    library(reshape2)
    library(data.table)

    library(MAST)
})

# Set the `mc.cores` option for this session.
options(mc.cores = 12)

# Directory holding the single-cell inputs; the MAST output goes here too.
data_path <- '/data_volume/memento/method_comparison/lupus/'

# `numcell` and `trial` select which set of single-cell files is used; both
# are substituted into every file name below.
numcell <- 50
trial <- 0

expr_fname <- sprintf(
    '%sT4_vs_cM.single_cell.%s.%s.expr.csv', data_path, numcell, trial
)
obs_fname <- sprintf(
    '%sT4_vs_cM.single_cell.%s.%s.obs.csv', data_path, numcell, trial
)
var_fname <- sprintf(
    '%sT4_vs_cM.single_cell.%s.%s.var.csv', data_path, numcell, trial
)
output_fname <- sprintf(
    '%sT4_vs_cM.sc.MAST.%s.%s.csv', data_path, numcell, trial
)

# Each table takes its row names from the first column.
expr <- read.csv(expr_fname, row.names = 1)
obs <- read.csv(obs_fname, row.names = 1)
var <- read.csv(var_fname, row.names = 1)

# Scale every row of `expr` to a total of 10000, then take log(x + 1).
# apply() over rows returns one column per input row, so the result is
# transposed back to the orientation of `expr`.
expr_norm <- log(
    t(apply(expr, 1, function(row_counts) row_counts / sum(row_counts) * 10000)) + 1
)

# FromMatrix() receives the transposed matrix, with `obs` and `var` as the
# accompanying annotation tables.
scaRaw <- FromMatrix(t(expr_norm), obs, var)

# Keep only the rows whose freq() value exceeds the threshold.
freq_expressed <- 0.005
expressed_genes <- freq(scaRaw) > freq_expressed
sca <- scaRaw[expressed_genes, ]

# Model covariates: individual as a factor, and condition with "cM" as the
# reference level.
colData(sca)$ind <- factor(colData(sca)$ind)
cond <- relevel(factor(colData(sca)$cg_cov), "cM")
colData(sca)$condition <- cond

zlmCond <- zlm(~ condition + ind, sca)

# Summarise the fitted model, requesting a likelihood-ratio test for the
# 'conditionT4' contrast.
summaryCond <- summary(zlmCond, doLRT = 'conditionT4')

# Join, per primerid, the P value from component 'H' with the coefficient and
# confidence bounds from component 'logFC'.
summaryDt <- summaryCond$datatable
fcHurdle <- merge(
    # hurdle P values
    summaryDt[contrast == 'conditionT4' & component == 'H',
              .(primerid, `Pr(>Chisq)`)],
    # logFC coefficients
    summaryDt[contrast == 'conditionT4' & component == 'logFC',
              .(primerid, coef, ci.hi, ci.lo)],
    by = 'primerid'
)

# Add an FDR-adjusted P value column in place.
fcHurdle[, fdr := p.adjust(`Pr(>Chisq)`, 'fdr')]
# fcHurdleSig <- merge(fcHurdle[fdr<.05 & abs(coef)>FCTHRESHOLD], as.data.table(mcols(sca)), by='primerid')
# setorder(fcHurdleSig, fdr)

# The output path is held in `output_fname`; `out_fname` is not defined
# anywhere in this script. Row names are omitted because fcHurdle already
# carries `primerid` as a column.
write.csv(fcHurdle, output_fname, row.names = FALSE)

In [34]:
# Write the merged result table to the output path, without row names.
write.csv(x = fcHurdle, file = output_fname, row.names = FALSE)

In [None]:
# Print the model summary with n = 4 and by = 'C'.
print(summaryCond, n = 4, by = 'C')