In [1]:
library(pscl)
library(lmtest)
library(ggplot2)

Classes and Methods for R developed in the
Political Science Computational Laboratory
Department of Political Science
Stanford University
Simon Jackman
hurdle and zeroinfl functions by Achim Zeileis

Loading required package: zoo


Attaching package: ‘zoo’


The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric




# Simulation Function

In [92]:
# Simulate a shuffled 0/1 SNV indicator vector of length `sample.size`.
#
# Arguments:
#   sample.size: number of samples to simulate.
#   zero.rate:   fraction of samples forced to 0; ceiling(sample.size * zero.rate)
#                entries are set to 0 (only used when is.snv is TRUE).
#   gene.mu:     Poisson mean of the counts drawn for the remaining samples
#                (only used when is.snv is TRUE).
#   is.snv:      when FALSE, every entry of the result is 0.
#
# Returns: a numeric vector of 0s and 1s in random order.
SNV.simulation <- function(sample.size, zero.rate, gene.mu, is.snv=TRUE){
    if(is.snv){
        zero.size <- ceiling(sample.size * zero.rate)
        nonzero.size <- sample.size - zero.size

        # Counts for the non-zero samples; they only drive the error rate below.
        gene.nonzero <- rpois(nonzero.size, gene.mu)

        # A sample with count k loses its SNV call with probability 1/k.
        # A count of 0 gives 1/0 = Inf, so that call is always lost.
        error.rate <- 1/gene.nonzero
        error <- (runif(nonzero.size) < error.rate)
        snv <- c(rep(0, zero.size), as.integer(!error))
    }
    else{
        snv <- rep(0, sample.size)
    }
    # Shuffle by index rather than with sample(snv): sample() treats a
    # length-one numeric x >= 1 as 1:x, which is not a shuffle of the vector.
    return(snv[sample.int(length(snv))])
}


# Simulate a shuffled vector of zero-inflated Poisson gene counts.
#
# Arguments:
#   sample.size: number of samples to simulate.
#   zero.rate:   fraction of samples forced to 0; ceiling(sample.size * zero.rate)
#                entries are set to 0.
#   gene.mu:     Poisson mean for the remaining entries.
#
# Returns: a numeric vector of length `sample.size` in random order.
gene.simulation <- function(sample.size,zero.rate, gene.mu){
    zero.size <- ceiling(sample.size * zero.rate)
    nonzero.size <- sample.size - zero.size
    gene <- c(rep(0, zero.size), rpois(nonzero.size, gene.mu))
    # Shuffle by index rather than with sample(gene): sample() treats a
    # length-one numeric x >= 1 as 1:x, so a single simulated count k would
    # come back as a permutation of 1:k instead of the count itself.
    return(gene[sample.int(length(gene))])
}


# Build one simulated gene/SNV data set made of two groups of samples.
#
# The first `sample.issnv.size` rows are simulated with is.snv = TRUE and
# labelled 1; the next `sample.notsnv.size` rows are simulated with
# is.snv = FALSE and labelled 0. Gene counts are drawn separately for each
# group with their own Poisson mean.
#
# Returns: a data.frame with columns `gene`, `snv` and `label`.
related.pair.simulation <- function(sample.issnv.size, sample.notsnv.size, zero.snv.rate, zero.gene.rate,
                                    snv.gene.mu, gene.with.snv.mu, gene.without.snv.mu){
    # The four simulation calls stay in this order so that the random-number
    # stream is consumed in the same sequence.
    snv.group1 <- SNV.simulation(sample.issnv.size, zero.snv.rate, snv.gene.mu, is.snv=TRUE)
    snv.group0 <- SNV.simulation(sample.notsnv.size, zero.snv.rate, snv.gene.mu, is.snv=FALSE)
    gene.group1 <- gene.simulation(sample.issnv.size, zero.gene.rate, gene.with.snv.mu)
    gene.group0 <- gene.simulation(sample.notsnv.size, zero.gene.rate, gene.without.snv.mu)

    data.frame(
        gene = c(gene.group1, gene.group0),
        snv = c(snv.group1, snv.group0),
        label = rep(c(1, 0), times = c(sample.issnv.size, sample.notsnv.size))
    )
}

# Wald-test p-value for the effect of `snv` in the count part of a
# zero-inflated negative binomial model of `gene`.
#
# Arguments:
#   df: data.frame with (at least) the columns `gene` and `snv`.
#
# Returns: the p-value from waldtest() comparing the model without `snv` in
#   the count component (gene ~ 1 | snv) against the model with it
#   (gene ~ snv), or NA when either model cannot be fitted.
calculate.pvalue <- function(df){
    zinb.model <- try(zeroinfl(formula = gene ~ snv, data = df, dist = "negbin"), silent = TRUE)
    m0 <- try(zeroinfl(formula = gene ~ 1|snv, data = df, dist = "negbin"), silent = TRUE)

    # inherits() is safe when class() has more than one element, and `||` is
    # the scalar operator that an `if` condition requires.
    if(inherits(zinb.model, "try-error") || inherits(m0, "try-error")){
        return(NA)
    }
    waldtest(m0, zinb.model)[2, 'Pr(>Chisq)']
}

In [39]:
#simulation.data <- related.pair.simulation(10,10,0.3,0.3,10,10,20)
#simulation.data

# Different Sample Size

In [100]:
# Run the simulation over a grid of sample sizes and zero rates, repeating
# each setting `times` times and printing the resulting p-value.
set.seed(123)
#sample.size.container <- c(10,30,50,100,200,500,1000)
sample.size.container <- c(10)
zero.rate.container <- c(0.5)
times <- 5
#zero.rate.container <- seq(0.1,0.7,by=0.1)
gene.mu.1 <- 10
gene.mu.2 <- 100
for (sample.size in sample.size.container){
    for (zero.rate in zero.rate.container){
        # seq_len() is safe when times is 0, where 1:times would give c(1, 0).
        for (t in seq_len(times)){
            print(sample.size)
            print(zero.rate)
            # Use the simulators as defined in this file (SNV.simulation /
            # gene.simulation); the previous *.simulated names are not defined.
            snv.yes <- SNV.simulation(sample.size, zero.rate, gene.mu.1, is.snv=TRUE)
            snv.no <- SNV.simulation(sample.size, zero.rate, gene.mu.1, is.snv=FALSE)
            gene.yes <- gene.simulation(sample.size, zero.rate, gene.mu.1)
            gene.no <- gene.simulation(sample.size, zero.rate, gene.mu.2)
            snv <- c(snv.yes,snv.no)
            gene <- c(gene.yes,gene.no)
            df <- data.frame(gene,snv)
            pvalue <- calculate.pvalue(df)
            print(pvalue)
            print('---------')
        }
    }
    
}

[1] 10
[1] 0.5
[1] 8.711926e-05
[1] "---------"
[1] 10
[1] 0.5
[1] 0.003377671
[1] "---------"
[1] 10
[1] 0.5
[1] 0.0001856629
[1] "---------"
[1] 10
[1] 0.5
[1] 0.006284519
[1] "---------"
[1] 10
[1] 0.5
[1] 0.01641659
[1] "---------"
