In [1]:
library(pscl)
library(lmtest)
library(ggplot2)

Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




# Simulation Functions

In [24]:
# Simulate a shuffled vector of 0/1 SNV calls for one group of samples.
#
# Arguments:
#   sample.size  number of samples to generate.
#   zero.rate    fraction of samples forced to 0; ceiling(sample.size * zero.rate)
#                entries are zeros regardless of the random draws.
#   gene.mu      Poisson mean of the per-sample count used to derive the
#                error probability.
#   is.snv       if TRUE (default) simulate calls; otherwise every call is 0.
#
# Returns a numeric vector of 0s and 1s in random order (length sample.size
# when zero.rate lies in [0, 1]).
SNV.simulation <- function(sample.size, zero.rate, gene.mu, is.snv=TRUE){
    if (is.snv) {
        n.forced.zero <- ceiling(sample.size * zero.rate)
        n.random <- sample.size - n.forced.zero

        # One Poisson count per remaining sample; a call is treated as an
        # error with probability 1 / count. Counts of 0 (1/0 = Inf) and 1
        # therefore always give an error, i.e. a call of 0.
        counts <- rpois(n.random, gene.mu)
        is.error <- runif(n.random) < 1 / counts

        calls <- c(rep(0, n.forced.zero), as.integer(!is.error))
    } else {
        calls <- rep(0, sample.size)
    }
    # Random permutation so the forced zeros are not grouped at the front.
    calls[sample.int(length(calls))]
}
# Simulate a shuffled vector of zero-inflated Poisson expression counts.
#
# Arguments:
#   sample.size  number of samples to generate.
#   zero.rate    fraction of samples forced to 0; ceiling(sample.size * zero.rate)
#                entries are zeros in addition to any zeros drawn from the
#                Poisson distribution.
#   gene.mu      Poisson mean of the remaining counts.
#
# Returns a numeric vector of counts in random order (length sample.size when
# zero.rate lies in [0, 1]).
gene.simulation <- function(sample.size,zero.rate, gene.mu){
    zero.size <- ceiling(sample.size * zero.rate)
    nonzero.size <- sample.size - zero.size
    gene <- c(rep(0, zero.size), rpois(nonzero.size, gene.mu))
    # Shuffle by index rather than sample(gene): for a single count k >= 2,
    # sample(gene) would return a permutation of 1:k instead of the one value.
    # For longer vectors this draws exactly the same permutation as sample().
    gene[sample.int(length(gene))]
}
# Build one simulated gene/SNV data set made of two groups of samples.
#
# The first sample.issnv.size rows are the group simulated with the SNV
# (label 1); the following sample.notsnv.size rows are the group without it
# (label 0, SNV calls all 0). Gene counts for the two groups are drawn with
# means gene.with.snv.mu and gene.without.snv.mu respectively.
#
# Arguments:
#   sample.issnv.size    number of samples in the SNV group.
#   sample.notsnv.size   number of samples in the non-SNV group.
#   zero.snv.rate        zero rate passed to SNV.simulation.
#   zero.gene.rate       zero rate passed to gene.simulation.
#   snv.gene.mu          Poisson mean passed to SNV.simulation.
#   gene.with.snv.mu     Poisson mean of gene counts in the SNV group.
#   gene.without.snv.mu  Poisson mean of gene counts in the non-SNV group.
#
# Returns a data.frame with columns gene, snv and label.
related.pair.simulation <- function(sample.issnv.size, sample.notsnv.size, zero.snv.rate, zero.gene.rate, 
                                    snv.gene.mu,gene.with.snv.mu,gene.without.snv.mu){
    # SNV calls are drawn before gene counts so the random stream is consumed
    # in a fixed order: SNV group, non-SNV group, then the two gene vectors.
    snv.calls <- c(
        SNV.simulation(sample.issnv.size, zero.snv.rate, snv.gene.mu, is.snv = TRUE),
        SNV.simulation(sample.notsnv.size, zero.snv.rate, snv.gene.mu, is.snv = FALSE)
    )
    gene.counts <- c(
        gene.simulation(sample.issnv.size, zero.gene.rate, gene.with.snv.mu),
        gene.simulation(sample.notsnv.size, zero.gene.rate, gene.without.snv.mu)
    )
    group.label <- rep(c(1, 0), times = c(sample.issnv.size, sample.notsnv.size))

    data.frame(gene = gene.counts, snv = snv.calls, label = group.label)
}

In [39]:
#simulation.data <- related.pair.simulation(10,10,0.3,0.3,10,10,20)
#simulation.data

# Different Sample Sizes and Zero Rates

In [45]:
# Grid experiment: for every group size and every SNV zero rate, simulate a
# gene/SNV pair and print the Wald-test p-value comparing a zero-inflated
# negative binomial fit with and without snv in the count component.
set.seed(123)
sample.size.container <- c(10, 30, 50, 100, 200, 500)
zero.rate.container <- seq(from = 0.1, to = 0.6, by = 0.1)
for (sample.size in sample.size.container) {
    for (zero.rate in zero.rate.container) {
        print(sample.size)
        print(zero.rate)
        # Two equally sized groups; gene zero rate fixed at 0.3; gene means
        # 10 (SNV group) versus 20 (non-SNV group).
        simulation.data <- related.pair.simulation(
            sample.issnv.size = sample.size,
            sample.notsnv.size = sample.size,
            zero.snv.rate = zero.rate,
            zero.gene.rate = 0.3,
            snv.gene.mu = 10,
            gene.with.snv.mu = 10,
            gene.without.snv.mu = 20
        )
        # Full model: snv as regressor in both the count and zero components.
        zinb.model <- zeroinfl(formula = gene ~ snv,
                               data = simulation.data,
                               dist = "negbin")
        # Reduced model: intercept-only count component, snv kept in the
        # zero component.
        m0 <- zeroinfl(formula = gene ~ 1 | snv,
                       data = simulation.data,
                       dist = "negbin")
        # Second row of the Wald table holds the full-vs-reduced comparison.
        pvalue <- waldtest(m0, zinb.model)[["Pr(>Chisq)"]][2]
        print(pvalue)
        print("---------")
    }
}

[1] 10
[1] 0.1
[1] 7.724292e-05
[1] "---------"
[1] 10
[1] 0.2
[1] 5.809631e-06
[1] "---------"
[1] 10
[1] 0.3
[1] 0.1674427
[1] "---------"
[1] 10
[1] 0.4
[1] 0.02054369
[1] "---------"
[1] 10
[1] 0.5
[1] 0.07384288
[1] "---------"
[1] 10
[1] 0.6
[1] 0.5056763
[1] "---------"
[1] 30
[1] 0.1
[1] 8.374164e-08
[1] "---------"
[1] 30
[1] 0.2
[1] 0.0002238546
[1] "---------"
[1] 30
[1] 0.3
[1] 3.247582e-05
[1] "---------"
[1] 30
[1] 0.4
[1] 0.0001423782
[1] "---------"
[1] 30
[1] 0.5
[1] 0.004756767
[1] "---------"
[1] 30
[1] 0.6
[1] 2.640308e-05
[1] "---------"
[1] 50
[1] 0.1
[1] 5.474898e-10
[1] "---------"
[1] 50
[1] 0.2
[1] 8.939413e-09
[1] "---------"
[1] 50
[1] 0.3
[1] 1.254503e-06
[1] "---------"
[1] 50
[1] 0.4
[1] 2.191676e-05
[1] "---------"
[1] 50
[1] 0.5
[1] 2.680762e-05
[1] "---------"
[1] 50
[1] 0.6
[1] 4.761372e-05
[1] "---------"
[1] 100
[1] 0.1
[1] 1.529755e-14
[1] "---------"
[1] 100
[1] 0.2
[1] 4.225627e-16
[1] "---------"
[1] 100
[1] 0.3
[1] 1.193818e-24
[1] "---------"


In [51]:
# Repeat the simulation `times` times at a fixed group size and SNV zero rate
# and print the Wald-test p-value of each replicate (gene means 10 vs 15).
# NOTE(review): no seed is set in this cell, so the printed p-values depend on
# the state of the random number generator when it is run.
sample.size  <- 50
zero.rate <- 0.3
times <- 50
# seq_len() instead of seq(1:times): the latter iterates twice when times is 0.
for (i in seq_len(times)){
    simulation.data <- related.pair.simulation(sample.size,sample.size,zero.rate,0.3,10,10,15)
    # Full model: snv as regressor in both the count and zero components.
    zinb.model <- zeroinfl(formula = gene ~ snv, data = simulation.data, dist = "negbin")
    # Reduced model: intercept-only count component, snv kept in the zero component.
    m0 <- zeroinfl(formula = gene ~ 1|snv , data = simulation.data, dist = "negbin")
    pvalue <- waldtest(m0,zinb.model)[2,'Pr(>Chisq)']
    print(pvalue)
}

[1] 0.0005120984
[1] 0.01421623
[1] 0.0003719247
[1] 0.3509467
[1] 4.862767e-05
[1] 0.002571117
[1] 0.009422045


“产生了NaNs”


[1] 0.0003622411
[1] 0.002057379
[1] 5.140049e-08
[1] 0.01385002
[1] 0.1045205
[1] 1.713958e-05
[1] 6.649585e-05
[1] 0.0008247739
[1] 0.0005991903
[1] 1.865264e-06
[1] 0.00305771
[1] 5.602686e-05
[1] 7.338911e-07
[1] 0.0006193912
[1] 0.007876453
[1] 2.53636e-05
[1] 0.0009291236
[1] 0.0001249087
[1] 0.003589458
[1] 3.536104e-05
[1] 0.0001447342
[1] 0.0009178621
[1] 4.276584e-05
[1] 2.616106e-10
[1] 0.02433917
[1] 0.0002150042
[1] 1.006195e-06
[1] 0.001544038
[1] 0.05492859
[1] 6.20588e-05
[1] 0.0002704517
[1] 6.212867e-05
[1] 6.214323e-05
[1] 0.02008365
[1] 0.00128953
[1] 0.02192658
[1] 5.189968e-06
[1] 4.236051e-09
[1] 0.04093332
[1] 0.001349519
[1] 4.062071e-05
[1] 9.15632e-06
[1] 9.674721e-08


In [42]:
# Grid experiment with a large expression shift: gene counts have mean
# gene.mu.1 in the SNV group and gene.mu.2 in the non-SNV group. For every
# group size and zero rate, print the Wald-test p-value comparing a
# zero-inflated negative binomial fit with and without snv in the count
# component.
# NOTE(review): this cell originally called SNV.simulated()/gene.simulated(),
# which are not defined in this file; it now uses SNV.simulation() and
# gene.simulation(), which take the same arguments. Previously recorded
# output may not reproduce exactly.
set.seed(123)
sample.size.container <- c(10,30,50,100,200,500,1000)
zero.rate.container <- seq(0.1,0.5,by=0.1)
gene.mu.1 <- 10
gene.mu.2 <- 100
for (sample.size in sample.size.container){
    for (zero.rate in zero.rate.container){
        print(sample.size)
        print(zero.rate)
        snv.yes <- SNV.simulation(sample.size, zero.rate, gene.mu.1, is.snv=TRUE)
        snv.no <- SNV.simulation(sample.size, zero.rate, gene.mu.1, is.snv=FALSE)
        gene.yes <- gene.simulation(sample.size, zero.rate, gene.mu.1)
        gene.no <- gene.simulation(sample.size, zero.rate, gene.mu.2)
        snv <- c(snv.yes,snv.no)
        gene <- c(gene.yes,gene.no)
        df <- data.frame(gene,snv)
        # Full model: snv as regressor in both the count and zero components.
        zinb.model <- zeroinfl(formula = gene ~ snv, data = df, dist = "negbin")
        # Reduced model: intercept-only count component, snv kept in the zero component.
        m0 <- zeroinfl(formula = gene ~ 1|snv , data = df, dist = "negbin")
        pvalue <- waldtest(m0,zinb.model)[2,'Pr(>Chisq)']
        print(pvalue)
        print('---------')
    }
    
}

[1] 10
[1] 0.1
[1] 3.038713e-95
[1] "---------"
[1] 10
[1] 0.2
[1] 4.917919e-10
[1] "---------"
[1] 10
[1] 0.3
[1] 9.857762e-05
[1] "---------"
[1] 10
[1] 0.4
[1] 0.0001577951
[1] "---------"
[1] 10
[1] 0.5
[1] 0.02385838
[1] "---------"
[1] 30
[1] 0.1
[1] 2.595645e-33
[1] "---------"
[1] 30
[1] 0.2
[1] 6.276839e-19
[1] "---------"
[1] 30
[1] 0.3
[1] 8.07993e-17
[1] "---------"
[1] 30
[1] 0.4
[1] 4.802287e-11
[1] "---------"
[1] 30
[1] 0.5
[1] 5.60091e-06
[1] "---------"
[1] 50
[1] 0.1
[1] 5.458844e-76
[1] "---------"
[1] 50
[1] 0.2
[1] 1.174596e-34
[1] "---------"
[1] 50
[1] 0.3
[1] 1.986263e-28
[1] "---------"
[1] 50
[1] 0.4
[1] 9.306489e-13
[1] "---------"
[1] 50
[1] 0.5
[1] 5.124101e-08
[1] "---------"
[1] 100
[1] 0.1
[1] 6.826869e-125
[1] "---------"
[1] 100
[1] 0.2
[1] 6.428092e-65
[1] "---------"
[1] 100
[1] 0.3
[1] 2.79126e-40
[1] "---------"
[1] 100
[1] 0.4
[1] 1.322515e-34
[1] "---------"
[1] 100
[1] 0.5
[1] 2.74643e-28
[1] "---------"
[1] 200
[1] 0.1
[1] 2.275628e-199
[1] "-

In [22]:
# Grid experiment with a large expression shift (same code as the preceding
# cell): gene counts have mean gene.mu.1 in the SNV group and gene.mu.2 in the
# non-SNV group. For every group size and zero rate, print the Wald-test
# p-value comparing a zero-inflated negative binomial fit with and without snv
# in the count component.
# NOTE(review): this cell originally called SNV.simulated()/gene.simulated(),
# which are not defined in this file; it now uses SNV.simulation() and
# gene.simulation(), which take the same arguments. Previously recorded
# output may not reproduce exactly.
set.seed(123)
sample.size.container <- c(10,30,50,100,200,500,1000)
zero.rate.container <- seq(0.1,0.5,by=0.1)
gene.mu.1 <- 10
gene.mu.2 <- 100
for (sample.size in sample.size.container){
    for (zero.rate in zero.rate.container){
        print(sample.size)
        print(zero.rate)
        snv.yes <- SNV.simulation(sample.size, zero.rate, gene.mu.1, is.snv=TRUE)
        snv.no <- SNV.simulation(sample.size, zero.rate, gene.mu.1, is.snv=FALSE)
        gene.yes <- gene.simulation(sample.size, zero.rate, gene.mu.1)
        gene.no <- gene.simulation(sample.size, zero.rate, gene.mu.2)
        snv <- c(snv.yes,snv.no)
        gene <- c(gene.yes,gene.no)
        df <- data.frame(gene,snv)
        # Full model: snv as regressor in both the count and zero components.
        zinb.model <- zeroinfl(formula = gene ~ snv, data = df, dist = "negbin")
        # Reduced model: intercept-only count component, snv kept in the zero component.
        m0 <- zeroinfl(formula = gene ~ 1|snv , data = df, dist = "negbin")
        pvalue <- waldtest(m0,zinb.model)[2,'Pr(>Chisq)']
        print(pvalue)
        print('---------')
    }
    
}

[1] 10
[1] 0.1
[1] 3.038713e-95
[1] "---------"
[1] 10
[1] 0.2
[1] 4.917919e-10
[1] "---------"
[1] 10
[1] 0.3
[1] 9.857762e-05
[1] "---------"
[1] 10
[1] 0.4
[1] 0.0001577951
[1] "---------"
[1] 10
[1] 0.5
[1] 0.02385838
[1] "---------"
[1] 30
[1] 0.1
[1] 2.595645e-33
[1] "---------"
[1] 30
[1] 0.2
[1] 6.276839e-19
[1] "---------"
[1] 30
[1] 0.3
[1] 8.07993e-17
[1] "---------"
[1] 30
[1] 0.4
[1] 4.802287e-11
[1] "---------"
[1] 30
[1] 0.5
[1] 5.60091e-06
[1] "---------"
[1] 50
[1] 0.1
[1] 5.458844e-76
[1] "---------"
[1] 50
[1] 0.2
[1] 1.174596e-34
[1] "---------"
[1] 50
[1] 0.3
[1] 1.986263e-28
[1] "---------"
[1] 50
[1] 0.4
[1] 9.306489e-13
[1] "---------"
[1] 50
[1] 0.5
[1] 5.124101e-08
[1] "---------"
[1] 100
[1] 0.1
[1] 6.826869e-125
[1] "---------"
[1] 100
[1] 0.2
[1] 6.428092e-65
[1] "---------"
[1] 100
[1] 0.3
[1] 2.79126e-40
[1] "---------"
[1] 100
[1] 0.4
[1] 1.322515e-34
[1] "---------"
[1] 100
[1] 0.5
[1] 2.74643e-28
[1] "---------"
[1] 200
[1] 0.1
[1] 2.275628e-199
[1] "-

In [20]:
# Inspect gene.no, the non-SNV group's gene counts assigned inside the
# simulation loop (the value shown depends on which cell ran last).
gene.no

In [8]:
# Inspect the grid of zero rates iterated over by the simulation loops.
zero.rate.container