In [1]:
library(doParallel)
library(ggplot2)
library(pscl)
library(lmtest)

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel

Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




In [3]:
gene.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_real_cpm_subcelltype_Mes_and_Pro.csv"
gene.df <- read.csv(gene.file, header=TRUE,row.names = 1)
snv.file <- "/data8t/mtx/scSNV/dataset_v2/GSE57872/data/processed_profile/GSE57872_real_snv_subcelltype_Mes_and_Pro.csv"
snv.df <- read.csv(snv.file, header=TRUE,row.names = 1)

In [4]:
dim(gene.df)

In [5]:
dim(snv.df)

In [8]:
calculate.pvalue.one.pair <- function(gene.vector,snv.vector){
  # Here I need to merge gene vector(a dataframe with only one gene row) and 
  # snv vector(a dataframe with only one snv row) into a dataframe
  df <- as.data.frame(t(rbind(gene.vector,snv.vector)))
  colnames(df) <- c('gene','snv')
  if(sum(df$gene==0)>0){
    #message('zinb')
    zinb.model <- try(zeroinfl(formula = gene ~ snv, data = df, dist = "negbin"),silent = TRUE)
    m0 <- try(zeroinfl(formula = gene ~ 1|snv , data = df, dist = "negbin"),silent = TRUE)
    if (class(zinb.model)=='try-error' | class(m0)=='try-error'){
      pvalue <- NA
    }else{
      pvalue <- try(waldtest(m0,zinb.model)[2,'Pr(>Chisq)'],silent = TRUE)
      if ('try-error' %in% class(pvalue)){pvalue <- NA}
    }
    return(pvalue)
  }else{
    #message('poisson')
    poisson.model <- try(glm(formula = gene ~ snv, family="poisson", data=df),silent = TRUE)
    m0 <- try(glm(formula = gene ~ 1, family="poisson", data=df),silent = TRUE)
    message("Since all genes are non-zero, back to Poisson regression.")
    if ('try-error' %in% class(poisson.model)| 'try-error' %in% class(m0)){
      pvalue <- NA
    }else{
      pvalue <- try(waldtest(m0,poisson.model)[2,'Pr(>F)'],silent = TRUE)
      if ('try-error' %in% class(pvalue)){pvalue <- NA}
    }
    return(pvalue)
  }
}

In [9]:
calculate.pvalue.from.df <- function(gene.df,snv.df,thread = 16){
    # gene.df is gene * sample
    # snv.df is snv * sample
    registerDoParallel(thread)
    gene.count = dim(gene.df)[1]
    snv.count = dim(snv.df)[1]
    gene.name <- row.names(gene.df)
    snv.name <- row.names(snv.df)
    pvalue = list()
    j = 0
    message("Start calculating p value...\n")
    for(i in 1:gene.count){
        if(i%%30==0){
            message(paste0("calculate pvalue for gene: ", i, "\n"))
        }
        result = foreach(j=1:snv.count) %dopar% {calculate.pvalue.one.pair(gene.df[i,],snv.df[j,])}
        pvalue = rbind(pvalue,result)
    }
    pvalue.df <- as.data.frame(pvalue)
    #pvalue.df <- as.data.frame(sapply(pvalue.df, as.numeric))
    row.names(pvalue.df) <- gene.name
    colnames(pvalue.df) <- snv.name
    return(pvalue.df)
}

In [10]:
start.time <- Sys.time()
result <- calculate.pvalue.from.df(gene.df,snv.df,thread = 28)
end.time <- Sys.time()
time.taken <- end.time - start.time
saveRDS(result, "Mes_Pro.rds")

Start calculating p value...


calculate pvalue for gene: 30


calculate pvalue for gene: 60


calculate pvalue for gene: 90


calculate pvalue for gene: 120


calculate pvalue for gene: 150


calculate pvalue for gene: 180


calculate pvalue for gene: 210


calculate pvalue for gene: 240


calculate pvalue for gene: 270


calculate pvalue for gene: 300


calculate pvalue for gene: 330


calculate pvalue for gene: 360


calculate pvalue for gene: 390


calculate pvalue for gene: 420


calculate pvalue for gene: 450


calculate pvalue for gene: 480


calculate pvalue for gene: 510


calculate pvalue for gene: 540


calculate pvalue for gene: 570


calculate pvalue for gene: 600


calculate pvalue for gene: 630


calculate pvalue for gene: 660




In [11]:
result

Unnamed: 0_level_0,chr1__888639,chr1__1247494,chr1__13940864,chr1__13942731,chr1__20982631,chr1__25168124,chr1__36690047,chr1__36752433,chr1__52290984,chr1__93308798,⋯,chr6__25343823,chr6__25344086,chr11__85698207,chr7__55250026,chr20__330408,chr7__55312340,chr7__32957661,chr3__187927165,chr1__66628829,chr17__41381981
Unnamed: 0_level_1,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,⋯,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>,<named list>
ENSG00000074800.9,2.330528e-49,3.313805e-47,1.403154e-37,0.001106996,2.009732e-112,6.185035e-84,5.343557e-52,3.396393e-55,4.927158e-91,1.849933e-20,⋯,2.473089e-37,3.509537e-99,1.750917e-52,4.340858e-118,9.801478e-120,1.224076e-68,1.354554e-33,2.567887e-80,4.411057e-44,1.553164e-34
ENSG00000162493.12,0.4675077,0.7775024,4.792192e-13,8.088276e-05,0.7326287,0.2947917,0.895111,0.867552,0.1785569,0.5550263,⋯,0.3495338,0.5342428,0.469253,0.1529435,0.791475,0.2551306,0.8147411,0.7250273,0.3750047,0.7901924
ENSG00000127472.6,0.622147,0.914172,0.08980167,0.2037401,0.9933402,0.6806859,0.6760352,0.622679,0.752433,0.01885181,⋯,0.4105535,0.6029395,0.3671395,0.1258018,0.3868052,0.4720525,0.01405507,0.09792487,0.9615857,0.8583903
ENSG00000070831.11,0.5962978,0.365316,0.370284,0.287853,0.8493712,0.09280314,0.3175592,0.2916475,0.965887,0.946087,⋯,0.9242434,0.2924711,0.2418567,0.8066599,0.09001334,0.5162583,0.8906463,0.4141917,0.8253836,0.8391089
ENSG00000117632.16,0.3821005,0.7279257,7.333014e-05,0.02571059,0.4087877,7.236687e-07,0.002743321,0.718792,0.06541847,0.8083274,⋯,0.9216786,0.718164,9.96008e-05,0.9432067,0.8206228,0.3663664,0.4846742,0.05857925,0.7478453,0.7153719
ENSG00000159023.14,0.2154961,0.9705083,0.7450976,0.8930686,0.09288116,0.8487917,0.3644637,0.06454189,0.6249408,0.1923528,⋯,0.2974615,0.8188207,0.3393943,0.1132451,0.1863971,0.09339115,0.7314238,0.563068,0.02545702,0.1900801
ENSG00000175130.6,0.7632047,0.2260263,0.02558022,0.007569902,0.2356684,6.007541e-05,0.9464921,0.9576764,0.1090829,0.1337562,⋯,0.1595299,0.9480775,0.02478989,0.8992864,0.0005971514,0.2461033,0.6597323,0.1836073,0.7578701,0.7791791
ENSG00000134686.12,0.6941045,0.2567934,0.7028194,0.8463327,0.3339687,0.1552604,0.694569,0.09443257,0.3094709,0.5151011,⋯,0.02763994,0.2183899,0.704453,0.3265786,0.4601265,0.308226,0.1930572,0.8480101,0.9823218,0.02797778
ENSG00000134697.8,0.8786088,0.9740072,0.2640335,0.06919354,0.8092202,0.3398551,0.6792971,0.5368306,0.5987183,0.9949019,⋯,0.8328535,0.970627,0.2615747,0.5144353,0.5929303,0.6411978,0.7604287,0.3069733,0.6513808,0.3991912
ENSG00000196517.7,0.4013879,0.07029972,0.6622683,0.3785582,0.3606027,0.301171,0.9217439,0.3090704,0.3532472,0.5195826,⋯,0.5960473,0.447434,0.0839011,0.6939806,,0.9406767,0.5650621,0.007271921,0.8786235,0.9998043


In [12]:
result2 <- unstack(stack(result))

In [13]:
rownames(result2) <- rownames(result)

In [14]:
write.csv(result2,"Mes_Pro.csv", row.names = TRUE)