In [12]:
library(doParallel)
library(ggplot2)
library(pscl)
library(lmtest)
library(qvalue)

In [2]:
# Input tables for this run. Judging by the file names these are the CPM
# expression profile and the SNV profile of the "Cla" sub-cell-type.
# In both CSVs the first column supplies the row names.
snv.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_real_snv_subcelltype_Cla.csv"
gene.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_real_cpm_subcelltype_Cla.csv"
snv.df <- read.csv(file = snv.file, row.names = 1, header = TRUE)
gene.df <- read.csv(file = gene.file, row.names = 1, header = TRUE)

In [3]:
# Show the number of rows and columns of the expression table.
dim(gene.df)

In [4]:
# Show the number of rows and columns of the SNV table.
dim(snv.df)

In [5]:
calculate.pvalue.one.pair <- function(gene.vector,snv.vector){
  # Test the association between one gene and one SNV across samples.
  #
  # Args:
  #   gene.vector: a data frame with a single gene row (one value per sample).
  #   snv.vector:  a data frame with a single snv row, same samples in the
  #                same order.
  #
  # Returns:
  #   The Wald-test p-value comparing the model that includes `snv` against
  #   the model that omits it, or NA when a model fit or the test fails.
  #
  # Model choice:
  #   * at least one zero expression value -> zero-inflated negative binomial
  #     (zeroinfl); the null model `gene ~ 1 | snv` keeps snv in the
  #     zero-inflation part, so the test targets the count part only.
  #   * no zero expression values -> Poisson GLM.

  # Merge the two one-row inputs into a sample x (gene, snv) data frame.
  df <- as.data.frame(t(rbind(gene.vector, snv.vector)))
  colnames(df) <- c('gene', 'snv')

  failed <- function(x) inherits(x, 'try-error')

  # Shared tail of both branches: NA if either fit failed, otherwise the
  # p-value found in row 2 of the waldtest() table under `column`.
  wald.pvalue <- function(null.model, full.model, column) {
    if (failed(null.model) || failed(full.model)) {
      return(NA)
    }
    pvalue <- try(waldtest(null.model, full.model)[2, column], silent = TRUE)
    if (failed(pvalue)) NA else pvalue
  }

  # na.rm = TRUE: a missing expression value must not abort the whole scan
  # (the previous sum(...) > 0 test evaluated to NA and made `if` fail).
  if (any(df$gene == 0, na.rm = TRUE)) {
    zinb.model <- try(zeroinfl(formula = gene ~ snv, data = df, dist = "negbin"), silent = TRUE)
    m0 <- try(zeroinfl(formula = gene ~ 1 | snv, data = df, dist = "negbin"), silent = TRUE)
    wald.pvalue(m0, zinb.model, 'Pr(>Chisq)')
  } else {
    poisson.model <- try(glm(formula = gene ~ snv, family = "poisson", data = df), silent = TRUE)
    m0 <- try(glm(formula = gene ~ 1, family = "poisson", data = df), silent = TRUE)
    message("Since all genes are non-zero, back to Poisson regression.")
    # For glm fits waldtest() reports an F test, hence the different column.
    wald.pvalue(m0, poisson.model, 'Pr(>F)')
  }
}

In [6]:
calculate.pvalue.from.df <- function(gene.df,snv.df,thread = 16){
    # Compute a p-value for every (gene, snv) pair.
    #
    # Args:
    #   gene.df: gene * sample data frame (one row per gene).
    #   snv.df:  snv * sample data frame (one row per snv).
    #   thread:  value passed to registerDoParallel() to size the backend.
    #
    # Returns:
    #   A data frame with one row per gene and one column per snv, labelled
    #   with the row names of the inputs. Each cell holds whatever
    #   calculate.pvalue.one.pair() returned for that pair; because the table
    #   is built from foreach's list output, cells are stored as list elements
    #   rather than as plain numeric columns.
    registerDoParallel(thread)
    # Release any implicit worker cluster registerDoParallel() created,
    # including when a worker raises an error.
    on.exit(stopImplicitCluster(), add = TRUE)

    gene.count <- nrow(gene.df)
    snv.count <- nrow(snv.df)
    gene.name <- row.names(gene.df)
    snv.name <- row.names(snv.df)

    # Preallocate the gene x snv list matrix instead of growing it with
    # rbind() on every iteration.
    pvalue <- matrix(vector("list", gene.count * snv.count),
                     nrow = gene.count, ncol = snv.count)

    message("Start calculating p value...\n")
    if (snv.count > 0) {
        for (i in seq_len(gene.count)) {
            if (i %% 30 == 0) {
                message(paste0("calculate pvalue for gene: ", i, "\n"))
            }
            # Extract the gene row once so the parallel body only refers to
            # this single row rather than to the whole expression table.
            gene.row <- gene.df[i, ]
            result <- foreach(j = seq_len(snv.count)) %dopar% {
                calculate.pvalue.one.pair(gene.row, snv.df[j, ])
            }
            pvalue[i, ] <- result
        }
    }

    pvalue.df <- as.data.frame(pvalue)
    #pvalue.df <- as.data.frame(sapply(pvalue.df, as.numeric))
    row.names(pvalue.df) <- gene.name
    colnames(pvalue.df) <- snv.name
    return(pvalue.df)
}

In [7]:
# Run the full gene-by-SNV scan with thread = 28, record how long it took,
# and cache the resulting table on disk.
start.time <- Sys.time()
result <- calculate.pvalue.from.df(gene.df = gene.df, snv.df = snv.df, thread = 28)
end.time <- Sys.time()
time.taken <- difftime(end.time, start.time)
saveRDS(object = result, file = "Cla.rds")

Start calculating p value...


calculate pvalue for gene: 30


calculate pvalue for gene: 60


calculate pvalue for gene: 90


calculate pvalue for gene: 120


calculate pvalue for gene: 150


calculate pvalue for gene: 180


calculate pvalue for gene: 210


calculate pvalue for gene: 240


calculate pvalue for gene: 270


calculate pvalue for gene: 300


calculate pvalue for gene: 330


calculate pvalue for gene: 360


calculate pvalue for gene: 390


calculate pvalue for gene: 420


calculate pvalue for gene: 450


calculate pvalue for gene: 480


calculate pvalue for gene: 510


calculate pvalue for gene: 540


calculate pvalue for gene: 570


calculate pvalue for gene: 600


calculate pvalue for gene: 630


calculate pvalue for gene: 660




In [109]:
# Round-trip the table through stack()/unstack(); the printed result2 below
# shows plain <dbl> columns.
# NOTE(review): presumably done to flatten the list-valued cells of `result`
# into numeric columns - confirm.
result2 <- unstack(stack(result))

In [110]:
# Display the converted table for a visual check.
result2

chr1__888639,chr1__1247494,chr1__13940864,chr1__13942731,chr1__20982631,chr1__25168124,chr1__36690047,chr1__36752433,chr1__52290984,chr1__93308798,⋯,chr6__25343823,chr6__25344086,chr11__85698207,chr7__55250026,chr20__330408,chr7__55312340,chr7__32957661,chr3__187927165,chr1__66628829,chr17__41381981
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
3.096973e-36,1.263549e-16,4.806330e-18,3.090474e-28,5.155779e-08,6.398310e-17,3.451988e-40,5.919559e-19,1.011333e-15,2.275882e-20,⋯,4.873532e-38,4.172485e-33,7.245796e-18,8.207206e-14,,2.378934e-27,9.563248e-27,0.658107633,6.896780e-33,2.975224e-40
8.193532e-01,2.662594e-01,2.538426e-06,1.491656e-01,3.594033e-01,8.444987e-02,9.841823e-01,4.009570e-04,8.566107e-01,5.676287e-01,⋯,1.612239e-01,1.651460e-01,7.194296e-01,2.038152e-01,,1.702463e-01,8.529022e-01,0.954140833,8.058207e-01,6.153431e-01
3.882790e-02,8.597296e-01,2.260985e-01,2.787175e-01,7.579316e-01,3.365294e-01,3.163379e-02,9.959027e-01,3.702196e-01,7.192178e-01,⋯,4.914753e-01,6.006377e-01,4.142539e-01,2.641238e-01,,,5.154858e-13,0.327965954,3.685586e-01,4.914788e-01
3.000226e-01,6.982141e-01,5.071286e-02,1.113814e-01,2.938630e-01,7.231655e-01,1.100842e-01,3.719158e-01,3.825843e-01,4.289994e-01,⋯,4.434310e-01,2.637972e-01,8.831738e-01,9.425777e-01,,1.152800e-01,5.671645e-01,0.093432342,5.670939e-01,2.080845e-01
5.294417e-01,3.396275e-01,9.013493e-01,8.579209e-01,8.495310e-02,9.669472e-01,6.635397e-01,4.961612e-01,3.659266e-01,2.004078e-01,⋯,4.879583e-01,1.972963e-01,8.216648e-01,5.781019e-01,,3.251520e-01,2.999283e-01,0.943158776,1.017991e-01,4.740046e-02
7.696805e-01,7.459654e-01,6.743239e-01,9.838674e-01,4.308079e-01,6.747624e-01,8.434179e-01,3.202779e-01,2.309515e-01,1.861601e-01,⋯,8.058697e-01,8.282315e-01,1.822314e-01,9.161196e-01,,9.730882e-02,9.730703e-02,0.130284312,1.889350e-05,
2.638619e-01,9.639955e-01,7.958156e-01,8.584633e-01,1.846291e-01,3.141579e-01,7.909904e-02,2.887711e-02,7.829748e-01,2.035381e-01,⋯,4.151339e-01,4.151339e-01,4.675543e-01,1.435428e-01,,1.705062e-01,9.833734e-01,0.309473703,7.007388e-01,1.341028e-01
2.630614e-01,8.885112e-01,8.192352e-01,4.663786e-01,3.920990e-01,4.619162e-01,3.698730e-01,2.854331e-01,6.208550e-01,2.833169e-01,⋯,3.650028e-02,1.181256e-02,2.058376e-02,7.170727e-01,,5.928889e-03,7.918372e-01,0.117803154,1.609646e-01,8.275052e-01
2.934825e-01,6.033940e-01,1.950487e-01,3.317641e-01,6.988163e-01,9.421343e-01,9.524009e-01,6.739293e-01,5.317788e-01,5.552943e-01,⋯,2.629359e-01,3.281474e-01,3.856605e-01,1.800073e-01,,8.428139e-01,6.562385e-01,0.808458647,6.696375e-01,5.881041e-01
,7.735490e-01,2.621539e-01,5.304386e-01,6.118271e-01,6.479073e-01,4.706945e-01,7.355447e-01,4.041768e-01,7.735586e-01,⋯,2.621570e-01,5.304371e-01,,5.339868e-01,,5.304374e-01,4.405850e-01,0.006840567,5.591773e-01,2.384240e-01


In [112]:
# Carry the row names of `result` over to result2.
rownames(result2) <- rownames(result)

In [114]:
# Export the table to CSV, writing the row names as well.
write.csv(result2,"Cla.csv", row.names = TRUE)