In [103]:
#library(Seurat)
#library(SCeQTL)
library(doParallel)
library(ggplot2)
library(pscl)
library(lmtest)

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




In [2]:
# Locations of the processed expression (CPM) table and the SNV table.
gene.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_cpm_final.csv"
snv.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_snv_on_filtered_genes.csv"

# Both CSVs carry a header line; their first column supplies the row names.
gene.df <- read.csv(file = gene.file, header = TRUE, row.names = 1)
snv.df <- read.csv(file = snv.file, header = TRUE, row.names = 1)

In [3]:
# Show the dimensions (rows, columns) of the loaded expression table.
dim(gene.df)

In [4]:
# Show the dimensions (rows, columns) of the loaded SNV table.
dim(snv.df)

In [None]:
# Likelihood-ratio p-values for every (gene, snp) pair using zero-inflated
# count regression (pscl::zeroinfl).
#
# Arguments:
#   gene           two-dimensional object, one row per gene; row i is the
#                  response for gene i.
#   snp            two-dimensional object, one row per SNP; row j is the
#                  predictor for SNP j.
#   thread         value passed to registerDoParallel().
#   remove_outlier if TRUE, each pair's data is passed through removeoutlier()
#                  before fitting.
#   EM, dist       forwarded to zeroinfl().
#   type           selects the null model of the test:
#                    0 -> expression ~ snp | 1   (df = 1)
#                    1 -> expression ~ 1 | snp   (df = 1)
#                    anything else -> expression ~ 1 (df = 2)
#
# Returns a data.frame with columns "gene", "snp", "pvalue": one row per pair,
# genes in the outer position and SNPs in the inner position. A pair whose
# full or null model cannot be fitted gets NA.
cal.pvalue <- function(gene, snp, thread = 8, remove_outlier = TRUE,EM = FALSE, dist = 'negbin', type = 0){

    # Fit full and null zero-inflated models for a single pair and return the
    # likelihood-ratio test p-value (NA when a fit fails).
    zeroinfl_model <- function(sample_gene, sample_snp, remove_outlier=TRUE, dist='negbin', EM = TRUE, type = 0){
        sample.data <- data.frame(unlist(sample_gene), unlist(sample_snp))
        colnames(sample.data) <- c('expression','snp')
        if (remove_outlier) {
          # NOTE(review): removeoutlier() is not defined in the visible part of
          # this file and library(SCeQTL) is commented out above; confirm it is
          # in scope before calling with remove_outlier = TRUE.
          sample.data <- removeoutlier(sample.data)
        }
        m1 <- try(zeroinfl(expression ~ snp, data = sample.data, dist = dist, EM = EM), silent=TRUE)
        if (inherits(m1, "try-error")) {
          return(NA)
        }

        # The null fit is guarded the same way as the full fit so that a single
        # failing pair yields NA instead of aborting the whole foreach call.
        if (type == 0) {
          m0 <- try(zeroinfl(expression ~ snp|1, data = sample.data, dist = dist, EM = EM), silent=TRUE)
          .df <- 1
        } else if (type == 1) {
          m0 <- try(zeroinfl(expression ~ 1|snp, data = sample.data, dist = dist, EM = EM), silent=TRUE)
          .df <- 1
        } else {
          m0 <- try(zeroinfl(expression ~ 1, data = sample.data, dist = dist, EM = EM), silent=TRUE)
          .df <- 2
        }
        if (inherits(m0, "try-error")) {
          return(NA)
        }
        return(pchisq(2 * (logLik(m1) - logLik(m0)), df = .df, lower.tail=FALSE))
    }

    # These counts were previously read from undefined (global) variables.
    gene.count <- nrow(gene)
    snp.count <- nrow(snp)
    if (is.null(gene.count) || is.null(snp.count)) {
        stop("'gene' and 'snp' must be two-dimensional (one row per gene / snp)", call. = FALSE)
    }

    registerDoParallel(thread)

    # Preallocated in gene-major order: entry (i - 1) * snp.count + j holds the
    # p-value for gene i and snp j, matching the label vectors built below.
    pvalue <- rep(NA_real_, gene.count * snp.count)
    message("Start calculating p value...\n")
    if (snp.count > 0L) {
        for (i in seq_len(gene.count)) {
            message(paste0("calculate pvalue for gene: ", i, "\n"))
            result <- foreach(j = seq_len(snp.count)) %dopar% {
                zeroinfl_model(gene[i,], snp[j,], remove_outlier=remove_outlier, dist=dist, EM=EM, type=type)
            }
            slots <- (i - 1L) * snp.count + seq_len(snp.count)
            # as.numeric() drops the logLik attributes carried by the p-value.
            pvalue[slots] <- vapply(result, function(p) as.numeric(p), numeric(1))
        }
    }

    # Fall back to positional indices when the inputs carry no row names.
    if (is.null(row.names(gene))) {
        gene.raw.name <- seq_len(gene.count)
    } else {
        gene.raw.name <- row.names(gene)
    }
    if (is.null(row.names(snp))) {
        snp.raw.name <- seq_len(snp.count)
    } else {
        snp.raw.name <- row.names(snp)
    }
    gene.name <- rep(gene.raw.name, each = snp.count)
    snp.name <- rep(snp.raw.name, times = gene.count)

    result <- data.frame(gene.name, snp.name, pvalue)
    colnames(result) <- c("gene","snp","pvalue")
    return(result)
}


In [70]:
# Wald-test p-value for the association between one gene and one SNV.
#
# Arguments:
#   gene.vector  data frame with a single gene row (columns = samples).
#   snv.vector   data frame with a single SNV row (same columns).
#
# If at least one expression value is zero, a zero-inflated negative binomial
# model `gene ~ snv` is compared against `gene ~ 1 | snv`; otherwise a Poisson
# GLM `gene ~ snv` is compared against the intercept-only GLM.
#
# Returns the p-value taken from row 2 of the waldtest() table, or NA when
# either model cannot be fitted.
calculate.pvalue.one.pair <- function(gene.vector,snv.vector){
  # Merge the one-row gene and one-row SNV data frames into a two-column
  # data frame with one row per sample.
  df <- as.data.frame(t(rbind(gene.vector,snv.vector)))
  colnames(df) <- c('gene','snv')

  # na.rm keeps a missing expression value from turning the condition into NA,
  # which would make `if` fail.
  has.zero <- any(df$gene == 0, na.rm = TRUE)

  if (has.zero) {
    zinb.model <- try(zeroinfl(formula = gene ~ snv, data = df, dist = "negbin"),silent = TRUE)
    m0 <- try(zeroinfl(formula = gene ~ 1|snv , data = df, dist = "negbin"),silent = TRUE)
    if (inherits(zinb.model, 'try-error') || inherits(m0, 'try-error')) {
      pvalue <- NA
    } else {
      pvalue <- waldtest(m0,zinb.model)[2,'Pr(>Chisq)']
    }
    return(pvalue)
  }

  # No zeros present, so the zero-inflation component is dropped.
  poisson.model <- try(glm(formula = gene ~ snv, family="poisson", data=df),silent = TRUE)
  m0 <- try(glm(formula = gene ~ 1, family="poisson", data=df),silent = TRUE)
  message("Since all genes are non-zero, back to Poisson regression.")
  if (inherits(poisson.model, 'try-error') || inherits(m0, 'try-error')) {
    pvalue <- NA
  } else {
    pvalue <- waldtest(m0,poisson.model)[2,'Pr(>F)']
  }
  return(pvalue)
}

In [220]:
# Compute the gene x SNV matrix of p-values by calling
# calculate.pvalue.one.pair() on every (gene row, SNV row) combination.
#
# Arguments:
#   gene.df  gene * sample data frame.
#   snv.df   snv * sample data frame.
#   thread   value passed to registerDoParallel().
#
# Returns a data frame of numeric p-values with one row per gene (row names
# taken from gene.df) and one column per SNV (column names taken from snv.df).
calculate.pvalue.from.df <- function(gene.df,snv.df,thread = 16){
    registerDoParallel(thread)
    gene.count <- dim(gene.df)[1]
    snv.count <- dim(snv.df)[1]
    gene.name <- row.names(gene.df)
    snv.name <- row.names(snv.df)

    # Preallocate a numeric matrix instead of growing a list with rbind(); this
    # also keeps the one-row-per-gene shape when only a single gene is passed.
    pvalue <- matrix(NA_real_, nrow = gene.count, ncol = snv.count)
    message("Start calculating p value...\n")
    if (snv.count > 0L) {
        for (i in seq_len(gene.count)) {
            message(paste0("calculate pvalue for gene: ", i, "\n"))
            # SNVs for the current gene are evaluated in parallel.
            result <- foreach(j = seq_len(snv.count)) %dopar% {
                calculate.pvalue.one.pair(gene.df[i,],snv.df[j,])
            }
            pvalue[i, ] <- vapply(result, function(p) as.numeric(p), numeric(1))
        }
    }
    pvalue.df <- as.data.frame(pvalue)
    row.names(pvalue.df) <- gene.name
    colnames(pvalue.df) <- snv.name
    return(pvalue.df)
}

In [225]:
# Time a trial run on the first 2 rows of gene.df against the first 3500 rows
# of snv.df, with thread = 24. The elapsed wall-clock time is kept in
# time.taken as the difference of the two Sys.time() readings.
start.time <- Sys.time()
ttt <- calculate.pvalue.from.df(gene.df[1:2,],snv.df[1:3500,],thread = 24)
end.time <- Sys.time()
time.taken <- end.time - start.time

Start calculating p value...


calculate pvalue for gene: 1


calculate pvalue for gene: 2




In [226]:
# Display the p-value table returned by the trial run.
ttt

Unnamed: 0_level_0,chr1__881627,chr1__888639,chr1__1247494,chr1__7901995,chr1__11148759,chr1__13940864,chr1__13942731,chr1__20826685,chr1__20826686,chr1__20826910,⋯,chr5__63630680,chr5__63665861,chr5__63666279,chr5__63666280,chr5__64273018,chr5__64748681,chr5__64854624,chr5__64854714,chr5__64854715,chr5__64857063
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000064886.9,0.60257,0.6926937,0.5699269,0.8216961,0.01089003,3.830646e-06,0.0003757341,1.219441e-05,1.219441e-05,1.221242e-05,⋯,1.353992e-06,1.35563e-08,,,1.0,1.0,0.73210933,0.8053735,0.8053735,
ENSG00000175899.10,0.8481158,0.851796,0.1737132,0.1173339,0.03361019,3.649012e-14,1.016172e-10,0.079057,0.079057,0.006570018,⋯,0.5401174,0.08509915,,,0.9089293,0.03038904,0.01018914,0.4501856,0.4501856,0.2762313


In [229]:
# Display the elapsed time of the trial run.
time.taken

Time difference of 2.370075 mins

In [224]:
# NOTE(review): scratch arithmetic; presumably a seconds-to-minutes runtime
# estimate, but the meaning of 1000 is not stated here. TODO confirm or remove.
1000 /60

In [231]:
# NOTE(review): scratch arithmetic; presumably a full-run estimate in hours,
# but the meaning of 70 and 1000 is not stated here. TODO confirm or remove.
70 * 1000 / 60 /60

In [232]:
# Serialize the trial-run result to an RDS file in the working directory.
saveRDS(ttt, "delete.rds")

In [237]:
# Read the RDS file back and display its contents.
readRDS("delete.rds")

Unnamed: 0_level_0,chr1__881627,chr1__888639,chr1__1247494,chr1__7901995,chr1__11148759,chr1__13940864,chr1__13942731,chr1__20826685,chr1__20826686,chr1__20826910,⋯,chr5__63630680,chr5__63665861,chr5__63666279,chr5__63666280,chr5__64273018,chr5__64748681,chr5__64854624,chr5__64854714,chr5__64854715,chr5__64857063
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
ENSG00000064886.9,0.60257,0.6926937,0.5699269,0.8216961,0.01089003,3.830646e-06,0.0003757341,1.219441e-05,1.219441e-05,1.221242e-05,⋯,1.353992e-06,1.35563e-08,,,1.0,1.0,0.73210933,0.8053735,0.8053735,
ENSG00000175899.10,0.8481158,0.851796,0.1737132,0.1173339,0.03361019,3.649012e-14,1.016172e-10,0.079057,0.079057,0.006570018,⋯,0.5401174,0.08509915,,,0.9089293,0.03038904,0.01018914,0.4501856,0.4501856,0.2762313
