In [103]:
#library(Seurat)
#library(SCeQTL)
library(doParallel)
library(ggplot2)
library(pscl)
library(lmtest)

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




In [249]:
# Input locations: the expression table and the SNV table for GSE57872.
gene.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_cpm_final_1000genes.csv"
snv.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_snv_on_filtered_genes_pass_40_cells.csv"

# Both CSVs have a header row; the first column supplies the row names.
gene.df <- read.csv(file = gene.file, header = TRUE, row.names = 1)
snv.df <- read.csv(file = snv.file, header = TRUE, row.names = 1)

In [250]:
# Show the number of rows and columns of the expression table.
dim(gene.df)

In [251]:
# Show the number of rows and columns of the SNV table.
dim(snv.df)

In [252]:
calculate.pvalue.one.pair <- function(gene.vector, snv.vector) {
  # Test the association between one gene's expression and one SNV.
  #
  # Args:
  #   gene.vector: data frame with a single gene row (columns are samples).
  #   snv.vector:  data frame with a single SNV row (columns are samples).
  #
  # Returns:
  #   The Wald-test p-value comparing the model with the SNV effect against
  #   the null model, or NA when either model cannot be fitted or the test
  #   itself fails.
  #
  # When the gene has at least one zero value, a zero-inflated negative
  # binomial model is used; the null model keeps `snv` only in the
  # zero-inflation part (`gene ~ 1 | snv`). Otherwise the function falls back
  # to Poisson regression.

  # Stack the two rows and transpose so that each sample becomes one
  # observation with columns `gene` and `snv`.
  df <- as.data.frame(t(rbind(gene.vector, snv.vector)))
  colnames(df) <- c("gene", "snv")

  # Compare two fitted models with a Wald test. Any failure (a model that
  # did not fit, or an error inside waldtest itself) yields NA so that a
  # single bad pair cannot abort a whole scan.
  wald.pvalue <- function(null.fit, full.fit, test) {
    if (inherits(null.fit, "try-error") || inherits(full.fit, "try-error")) {
      return(NA_real_)
    }
    tryCatch(
      waldtest(null.fit, full.fit, test = test)[2, paste0("Pr(>", test, ")")],
      error = function(e) NA_real_
    )
  }

  # na.rm = TRUE keeps the condition a definite TRUE/FALSE even when some
  # expression values are missing.
  if (any(df$gene == 0, na.rm = TRUE)) {
    zinb.model <- try(
      zeroinfl(formula = gene ~ snv, data = df, dist = "negbin"),
      silent = TRUE
    )
    m0 <- try(
      zeroinfl(formula = gene ~ 1 | snv, data = df, dist = "negbin"),
      silent = TRUE
    )
    pvalue <- wald.pvalue(m0, zinb.model, test = "Chisq")
  } else {
    message("Since all genes are non-zero, back to Poisson regression.")
    poisson.model <- try(
      glm(formula = gene ~ snv, family = "poisson", data = df),
      silent = TRUE
    )
    m0 <- try(
      glm(formula = gene ~ 1, family = "poisson", data = df),
      silent = TRUE
    )
    # The F test is requested explicitly so the result column is always
    # 'Pr(>F)', matching what the original code read for glm fits.
    pvalue <- wald.pvalue(m0, poisson.model, test = "F")
  }
  pvalue
}

In [290]:
calculate.pvalue.from.df <- function(gene.df, snv.df, thread = 16) {
  # Compute a p-value for every (gene, SNV) pair.
  #
  # Args:
  #   gene.df: data frame, gene * sample (one row per gene).
  #   snv.df:  data frame, snv * sample (one row per SNV).
  #   thread:  value passed to registerDoParallel() to set up the parallel
  #            backend used by foreach.
  #
  # Returns:
  #   A numeric data frame with one row per gene and one column per SNV;
  #   row names come from gene.df and column names from the row names of
  #   snv.df. Pairs whose test failed are NA.
  registerDoParallel(thread)
  gene.count <- nrow(gene.df)
  snv.count <- nrow(snv.df)
  gene.name <- row.names(gene.df)
  snv.name <- row.names(snv.df)

  # Preallocate the full result instead of growing it with rbind() on every
  # iteration; this also keeps the gene x snv shape when there is only one
  # gene or one SNV.
  pvalue <- matrix(NA_real_, nrow = gene.count, ncol = snv.count)

  message("Start calculating p value...\n")
  if (snv.count > 0) {
    # seq_len() so that an empty table yields no iterations (1:0 would not).
    for (i in seq_len(gene.count)) {
      if (i %% 100 == 0) {
        message(paste0("calculate pvalue for gene: ", i, "\n"))
      }
      # SNVs are processed in parallel for the current gene.
      result <- foreach(j = seq_len(snv.count)) %dopar% {
        calculate.pvalue.one.pair(gene.df[i, ], snv.df[j, ])
      }
      # Each worker result must be a single number; anything else is
      # recorded as missing.
      pvalue[i, ] <- vapply(
        result,
        function(p) {
          if (length(p) == 1L) as.numeric(p) else NA_real_
        },
        numeric(1)
      )
    }
  }

  pvalue.df <- as.data.frame(pvalue)
  row.names(pvalue.df) <- gene.name
  colnames(pvalue.df) <- snv.name
  pvalue.df
}

In [291]:
# Run the full gene x SNV scan on 28 workers, record how long it took,
# and store the resulting p-value table on disk.
start.time <- Sys.time()
result <- calculate.pvalue.from.df(gene.df = gene.df, snv.df = snv.df, thread = 28)
end.time <- Sys.time()
time.taken <- difftime(end.time, start.time)
saveRDS(object = result, file = "all_gene_snv_test.rds")

Start calculating p value...


