In [1]:
library(doParallel)
library(ggplot2)
library(pscl)
library(lmtest)

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel

Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




In [2]:
gene.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_real_cpm_subcelltype_Neu.csv"
gene.df <- read.csv(gene.file, header=TRUE,row.names = 1)
snv.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_real_snv_subcelltype_Neu.csv"
snv.df <- read.csv(snv.file, header=TRUE,row.names = 1)

In [3]:
dim(gene.df)

In [4]:
dim(snv.df)

In [5]:
calculate.pvalue.one.pair <- function(gene.vector,snv.vector){
  # Here I need to merge gene vector(a dataframe with only one gene row) and 
  # snv vector(a dataframe with only one snv row) into a dataframe
  df <- as.data.frame(t(rbind(gene.vector,snv.vector)))
  colnames(df) <- c('gene','snv')
  if(sum(df$gene==0)>0){
    #message('zinb')
    zinb.model <- try(zeroinfl(formula = gene ~ snv, data = df, dist = "negbin"),silent = TRUE)
    m0 <- try(zeroinfl(formula = gene ~ 1|snv , data = df, dist = "negbin"),silent = TRUE)
    if (class(zinb.model)=='try-error' | class(m0)=='try-error'){
      pvalue <- NA
    }else{
      pvalue <- try(waldtest(m0,zinb.model)[2,'Pr(>Chisq)'],silent = TRUE)
      if ('try-error' %in% class(pvalue)){pvalue <- NA}
    }
    return(pvalue)
  }else{
    #message('poisson')
    poisson.model <- try(glm(formula = gene ~ snv, family="poisson", data=df),silent = TRUE)
    m0 <- try(glm(formula = gene ~ 1, family="poisson", data=df),silent = TRUE)
    message("Since all genes are non-zero, back to Poisson regression.")
    if ('try-error' %in% class(poisson.model)| 'try-error' %in% class(m0)){
      pvalue <- NA
    }else{
      pvalue <- try(waldtest(m0,poisson.model)[2,'Pr(>F)'],silent = TRUE)
      if ('try-error' %in% class(pvalue)){pvalue <- NA}
    }
    return(pvalue)
  }
}

In [6]:
calculate.pvalue.from.df <- function(gene.df,snv.df,thread = 16){
    # gene.df is gene * sample
    # snv.df is snv * sample
    registerDoParallel(thread)
    gene.count = dim(gene.df)[1]
    snv.count = dim(snv.df)[1]
    gene.name <- row.names(gene.df)
    snv.name <- row.names(snv.df)
    pvalue = list()
    j = 0
    message("Start calculating p value...\n")
    for(i in 1:gene.count){
        if(i%%30==0){
            message(paste0("calculate pvalue for gene: ", i, "\n"))
        }
        result = foreach(j=1:snv.count) %dopar% {calculate.pvalue.one.pair(gene.df[i,],snv.df[j,])}
        pvalue = rbind(pvalue,result)
    }
    pvalue.df <- as.data.frame(pvalue)
    #pvalue.df <- as.data.frame(sapply(pvalue.df, as.numeric))
    row.names(pvalue.df) <- gene.name
    colnames(pvalue.df) <- snv.name
    return(pvalue.df)
}

In [7]:
start.time <- Sys.time()
result <- calculate.pvalue.from.df(gene.df,snv.df,thread = 28)
end.time <- Sys.time()
time.taken <- end.time - start.time
saveRDS(result, "Neu.rds")

Start calculating p value...


calculate pvalue for gene: 30


calculate pvalue for gene: 60


calculate pvalue for gene: 90


calculate pvalue for gene: 120


calculate pvalue for gene: 150


calculate pvalue for gene: 180


calculate pvalue for gene: 210


calculate pvalue for gene: 240


calculate pvalue for gene: 270


calculate pvalue for gene: 300


calculate pvalue for gene: 330


calculate pvalue for gene: 360


calculate pvalue for gene: 390


calculate pvalue for gene: 420


calculate pvalue for gene: 450


calculate pvalue for gene: 480


calculate pvalue for gene: 510


calculate pvalue for gene: 540


calculate pvalue for gene: 570


calculate pvalue for gene: 600


calculate pvalue for gene: 630


calculate pvalue for gene: 660




In [9]:
time.taken

Time difference of 3.220046 hours

In [10]:
result

Unnamed: 0_level_0,chr1__888639,chr1__1247494,chr1__13940864,chr1__13942731,chr1__20982631,chr1__25168124,chr1__36690047,chr1__36752433,chr1__52290984,chr1__93308798,⋯,chr6__25343823,chr6__25344086,chr11__85698207,chr7__55250026,chr20__330408,chr7__55312340,chr7__32957661,chr3__187927165,chr1__66628829,chr17__41381981
Unnamed: 0_level_1,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,⋯,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>
ENSG00000074800.9,0.02070899,2.381102e-24,8.125351e-23,2.14066e-06,0.0003610579,2.768269e-16,4.971926e-10,7.100113e-18,7.798396e-18,6.751943e-20,⋯,5.358442e-22,5.564536e-21,0.00134307,7.724048e-19,3.241355e-17,4.487253e-17,1.491131e-23,3.103287e-20,1.654096e-21,1.826028e-18
ENSG00000162493.12,0.2027333,0.341169,2.15457e-08,0.1017314,0.4069575,0.3181632,0.3294766,0.4595788,0.1080998,0.9319825,⋯,0.4348386,0.3903849,0.4558535,0.4639168,0.6047619,0.1170199,0.1748031,0.06673889,0.9344902,0.4229495
ENSG00000127472.6,,0.7800172,0.1758499,0.05032647,0.06659893,0.1639803,0.6363687,0.8851305,0.2821129,0.161941,⋯,0.5707431,0.6926011,0.9723207,0.1733254,0.7707444,0.0661643,0.8231665,,0.7774344,
ENSG00000070831.11,1.536469e-07,0.0005917184,0.1784357,3.12949e-07,2.070305e-07,0.0004699909,0.001377009,0.02131506,2.244284e-07,0.02989066,⋯,1.167182e-06,2.432681e-07,0.626691,0.03401414,0.002062733,4.503802e-05,1.317033e-06,0.5184948,1.99417e-09,4.303265e-05
ENSG00000117632.16,0.9803833,0.1261347,0.2692333,0.5202297,0.1487659,0.1859865,0.6790774,0.3471047,0.3979347,0.3226273,⋯,0.4324917,0.1374648,0.1304334,0.7096227,0.6886443,0.1467574,0.2147943,0.003157569,0.9203688,0.513502
ENSG00000159023.14,,0.7179509,0.007098278,0.001655586,0.7232196,4.4312e-06,,0.7764982,0.01765916,0.001388459,⋯,0.7132936,0.3184207,,0.2701655,0.10821,0.7038436,0.1289416,0.1479766,0.10821,0.10821
ENSG00000175130.6,,0.9952241,0.6950648,0.6997862,0.2408774,0.7905786,,0.2722731,0.9196235,0.5376725,⋯,0.4139541,0.4139583,0.6946141,,0.6197063,0.04561221,0.001764686,0.12983,0.6197063,0.02445722
ENSG00000134686.12,0.03346882,0.0002145295,0.4238745,0.440386,0.7104001,0.3466821,0.5062812,0.3654238,0.2279898,0.5083511,⋯,0.6997401,0.9984513,0.1248656,0.1248656,0.09794063,0.1641737,0.5954396,0.6367953,,0.1560621
ENSG00000134697.8,,0.2788845,0.4536319,0.9819064,0.5200478,0.4750239,0.3360953,0.5299199,0.6342504,0.8320594,⋯,0.8053955,0.2960234,0.1677189,0.3353486,0.3336366,0.2590207,0.9427416,0.01384251,0.7925107,0.01221774
ENSG00000196517.7,0.7793364,0.4670715,0.6302241,0.6654889,0.3900305,0.1214142,0.8822645,0.04308558,0.8436767,0.8326814,⋯,0.2139413,0.2266117,0.0008630043,0.6407204,0.1370402,0.5591969,0.4817085,1,0.1370402,0.5158356


In [11]:
result2 <- unstack(stack(result))

In [12]:
rownames(result2) <- rownames(result)

In [13]:
write.csv(result2,"Neu.csv", row.names = TRUE)