In [7]:
suppressMessages(library(edgeR))
suppressMessages(library(DESeq2))

# Root directory (with trailing slash) that file names are appended to below.
data_path <- "/data_volume/memento/tfko140/"

# guide_df = read.csv('guides.csv', header=TRUE)
# mapping = lapply(with(unique(guide_df), split(guide1_cov, target)), as.list)

### Run edgeR

In [158]:
# Build an edgeR DGEList from a count table, filter lowly expressed rows,
# compute normalization factors and estimate dispersions.
#
# data:   count table handed to DGEList(counts = ...).
# design: design matrix handed to estimateDisp().
# Returns the DGEList produced by estimateDisp().
edger_preprocess <- function (data, design) {
    dge <- DGEList(counts = data)

    # NOTE(review): `design` is not forwarded to filterByExpr(), so the filter
    # falls back on its default grouping -- confirm that this is intended.
    expressed <- filterByExpr(dge, min.count = 1)

    # Keep only the rows that passed the filter; keep.lib.sizes = FALSE asks
    # edgeR not to retain the pre-filter library sizes.
    dge <- dge[expressed, , keep.lib.sizes = FALSE]

    estimateDisp(calcNormFactors(dge), design)
}

# Fit the edgeR GLM and run a likelihood-ratio test on the last column of
# `design`.
#
# y:      object passed to glmFit() (here, the output of edger_preprocess()).
# design: design matrix; its last coefficient is the one tested.
# Returns the full (n = Inf) topTags() table for that test.
edger_get_lrt <- function (y, design) {
    last_coef <- ncol(design)
    glm_fit <- glmFit(y, design)
    lrt_res <- glmLRT(glm_fit, coef = last_coef)
    topTags(lrt_res, n = Inf)
}
# Fit the edgeR quasi-likelihood GLM and run a quasi-likelihood F-test on the
# last column of `design`.
#
# y:      object passed to glmQLFit() (here, the output of edger_preprocess()).
# design: design matrix; its last coefficient is the one tested.
# Returns the full (n = Inf) topTags() table for that test.
edger_get_qlft <- function (y, design) {
    last_coef <- ncol(design)
    ql_fit <- glmQLFit(y, design)
    qlf_res <- glmQLFTest(ql_fit, coef = last_coef)
    topTags(qlf_res, n = Inf)
}


In [159]:
# Load the pseudobulk table: the first CSV column becomes the row names and
# the column headers are kept verbatim (check.names = FALSE).
pseudobulks <- read.csv(
    paste0(data_path, "pseudobulks.csv"),
    row.names = 1,
    check.names = FALSE
)

In [160]:
# Load the table of guide pairs; each row is read below through its
# target1_guide1/2 and target2_guide1/2 columns.
guides_to_test <- read.csv(
    paste0(data_path, "benchmarking/guide_pairs_to_test.csv")
)

In [None]:
# Run edgeR (quasi-likelihood F-test and likelihood-ratio test) for every
# guide pair listed in `guides_to_test`.
#
# For each row, guide k of target 1 is compared against guide k of target 2
# (k = 1, 2). Pseudobulk columns are selected by matching the guide name
# against the column names, and the sample metadata is parsed from the
# '^'-separated column names (field 2 = individual, field 3 = knockout).
# Results are written under `data_path`benchmarking/edger_results/.
pseudobulk_cols <- colnames(pseudobulks)

# seq_len() so that an empty guide table runs zero iterations instead of
# iterating over c(1, 0).
for (i in seq_len(nrow(guides_to_test))) {
    row <- guides_to_test[i, ]

    t1_guides <- c(row$target1_guide1, row$target1_guide2)
    t2_guides <- c(row$target2_guide1, row$target2_guide2)

    for (pair_idx in 1:2) {
        t1 <- t1_guides[pair_idx]
        t2 <- t2_guides[pair_idx]

        # drop = FALSE keeps a data frame (and its column name) even when a
        # guide matches exactly one pseudobulk column.
        subset_pseudobulk <- cbind(
            pseudobulks[, grepl(t1, pseudobulk_cols), drop = FALSE],
            pseudobulks[, grepl(t2, pseudobulk_cols), drop = FALSE])

        # Split each column name once and reuse the pieces for both fields.
        name_fields <- strsplit(colnames(subset_pseudobulk), "\\^")
        metadata <- data.frame(
            name = colnames(subset_pseudobulk),
            ko = vapply(name_fields, `[`, character(1), 3),
            ind = vapply(name_fields, `[`, character(1), 2))

        subject <- factor(metadata$ind)
        # Code the knockout as 1 for t1 and 0 otherwise, exactly as the DESeq2
        # loop does, so the tested coefficient always means "t1 relative to t2".
        # factor(metadata$ko) would order the levels alphabetically and make
        # the sign of logFC depend on the guide names.
        ko <- as.numeric(metadata$ko == t1)
        design <- model.matrix(~subject + ko)

        subset_pseudobulk_y <- edger_preprocess(subset_pseudobulk, design)
        subset_pseudobulk_qlft <- edger_get_qlft(subset_pseudobulk_y, design)
        subset_pseudobulk_lrt <- edger_get_lrt(subset_pseudobulk_y, design)

        write.csv(
            subset_pseudobulk_qlft,
            paste0(data_path, "benchmarking/edger_results/qlft_", t1, "_", t2, ".csv"))
        write.csv(
            subset_pseudobulk_lrt,
            paste0(data_path, "benchmarking/edger_results/lrt_", t1, "_", t2, ".csv"))
    }
}

### Run DESeq2

In [217]:
# Fit DESeq2 with the design ~ ind + ko and return the results for the `ko`
# coefficient using DESeq()'s default test.
#
# data: count table; values are rounded before being handed to DESeq2.
# info: per-sample table used as colData; must provide columns `ind` and `ko`.
# NOTE(review): results(name = "ko") presumably only matches when `ko` is
# numeric (a factor would yield a differently named coefficient) -- confirm
# before changing the type of `ko` in the caller.
run_deseq2_wald <- function(data, info) {
    rounded_counts <- round(data)
    deseq_data <- DESeqDataSetFromMatrix(
        countData = rounded_counts,
        colData = info,
        design = ~ ind + ko)

    # Remove the first "." from every level name of `ind`.
    ind_levels <- levels(deseq_data$ind)
    levels(deseq_data$ind) <- sub("\\.", "", ind_levels)

    fitted <- DESeq(deseq_data)
    results(fitted, name = "ko")
}

# Fit DESeq2 with the design ~ ind + ko and return the results for the `ko`
# coefficient from a likelihood-ratio test against the reduced model ~ ind.
#
# data: count table; values are rounded before being handed to DESeq2.
# info: per-sample table used as colData; must provide columns `ind` and `ko`.
# NOTE(review): results(name = "ko") presumably only matches when `ko` is
# numeric (a factor would yield a differently named coefficient) -- confirm
# before changing the type of `ko` in the caller.
run_deseq2_lrt <- function(data, info) {
    rounded_counts <- round(data)
    deseq_data <- DESeqDataSetFromMatrix(
        countData = rounded_counts,
        colData = info,
        design = ~ ind + ko)

    # Remove the first "." from every level name of `ind`.
    ind_levels <- levels(deseq_data$ind)
    levels(deseq_data$ind) <- sub("\\.", "", ind_levels)

    fitted <- DESeq(deseq_data, test = "LRT", reduced = ~ ind)
    results(fitted, name = "ko")
}

In [218]:
# Run DESeq2 (Wald test and likelihood-ratio test) for every guide pair listed
# in `guides_to_test`.
#
# For each row, guide k of target 1 is compared against guide k of target 2
# (k = 1, 2). Pseudobulk columns are selected by matching the guide name
# against the column names, and the sample metadata is parsed from the
# '^'-separated column names (field 2 = individual, field 3 = knockout).
# Results are written under `data_path`benchmarking/deseq2_results/.
pseudobulk_cols <- colnames(pseudobulks)

# seq_len() so that an empty guide table runs zero iterations instead of
# iterating over c(1, 0).
for (i in seq_len(nrow(guides_to_test))) {
    row <- guides_to_test[i, ]

    t1_guides <- c(row$target1_guide1, row$target1_guide2)
    t2_guides <- c(row$target2_guide1, row$target2_guide2)

    for (pair_idx in 1:2) {
        t1 <- t1_guides[pair_idx]
        t2 <- t2_guides[pair_idx]

        # drop = FALSE keeps a data frame (and its column name) even when a
        # guide matches exactly one pseudobulk column.
        subset_pseudobulk <- cbind(
            pseudobulks[, grepl(t1, pseudobulk_cols), drop = FALSE],
            pseudobulks[, grepl(t2, pseudobulk_cols), drop = FALSE])

        # Split each column name once and reuse the pieces for both fields.
        name_fields <- strsplit(colnames(subset_pseudobulk), "\\^")
        metadata <- data.frame(
            name = colnames(subset_pseudobulk),
            ko = vapply(name_fields, `[`, character(1), 3),
            ind = vapply(name_fields, `[`, character(1), 2))
        # 1 for samples carrying t1, 0 otherwise. Kept numeric on purpose:
        # the run_deseq2_* helpers request the coefficient by the name 'ko'.
        metadata$ko <- as.numeric(metadata$ko == t1)

        pseudobulk_deseq2_wald <- run_deseq2_wald(subset_pseudobulk, metadata)
        write.csv(
            pseudobulk_deseq2_wald,
            paste0(data_path, "benchmarking/deseq2_results/wald_", t1, "_", t2, ".csv"))

        pseudobulk_deseq2_lrt <- run_deseq2_lrt(subset_pseudobulk, metadata)
        write.csv(
            pseudobulk_deseq2_lrt,
            paste0(data_path, "benchmarking/deseq2_results/lrt_", t1, "_", t2, ".csv"))
    }
}

converting counts to integer mode

“some variables in design formula are characters, converting to factors”
  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

converting counts to integer mode

“some variables in design formula are characters, converting to factors”
  the design formula contains one or more numeric variables with integer values,
  specifying a model with increasing fold change for higher values.
  did you mean for this to be a factor? if so, first convert
  this variable to a factor using the factor() function

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mea

In [None]:
# Inspect the sample metadata table left in the workspace by the most recently
# executed guide-pair loop (it is overwritten on every iteration).
metadata