## Tables
* Assume that the [set4delta.csv] file is located in the same directory as this notebook.
* The "datacall.r" file preprocesses the [set4delta.csv] dataset (running source("datacall.r") takes a few minutes).
* Functions for replicating the results of the simulation studies are coded in "simulation_functions.r".

In [1]:
# Read the raw set4delta table and keep its `realdata` column as `lfc`
# (passed to FDRreg() further down).
myfile <- read.csv("set4delta.csv")
lfc <- myfile$realdata

In [36]:
# Run the preprocessing script first, then load the simulation helpers.
# source() evaluates in the global environment by default, so the objects
# both scripts define end up in the workspace.
invisible(lapply(c("datacall.r", "simulation_functions.r"), source))

#### Simulation for **Table 1** 
* with sample size (n) 8000
* varying the correlation between the primary and auxiliary p-values over [0.0, 0.2, 0.4, 0.6, 0.8]
* varying the distance between the null ($N(0,1)$) and alternative ($N(mm, 1)$) distributions, with $mm \in \{2.0, 2.5, 3.0, 3.5, 4.0\}$
* This will take a few hours.

In [3]:
# Settings for the Table 1 simulations.
n <- 8000                              # sample size per replicate
taus <- c(0, 0.2, 0.4, 0.6, 0.8)       # dependence levels (Kendall's tau)
mms <- c(2.0, 2.5, 3.0, 3.5, 4.0)      # alternative means

In [None]:
# Simulation study behind Table 1.
#
# Part 1 varies Kendall's tau of the data-generating copula; part 2 keeps
# tau = 0.4 and varies the alternative mean `mm`. Each setting leaves one 3-d
# array in the workspace (`tau_<tau>` / `mm_<mm>`) holding the output of
# test_all() for every replicate along the third dimension.
n <- 8000
n_rep <- 1000

for (tau in c(0, 0.2, 0.4, 0.6, 0.8)) {
  print(paste("now the tau is", tau))

  # tau = 0 uses family 0; every other tau uses family 3 with that tau.
  cop <- if (tau == 0) BiCop(0) else BiCop(3, tau = tau)

  simres <- NULL
  for (i in seq_len(n_rep)) {
    simdat <- simgen(i, n, cop)
    res <- test_all(simdat)
    # Size the result array from the first replicate so it always matches
    # the shape test_all() actually returns.
    if (is.null(simres)) {
      simres <- array(0.0, c(dim(res), n_rep))
    }
    simres[, , i] <- res
    if (i %% 10 == 0) print(res[1, ])
  }
  assign(paste0("tau_", tau), simres)
  #saveRDS(simres, paste0("cop23_tau",tau,"_mm2_sd1.rds"))
}

for (mm in c(2.0, 2.5, 3.0, 3.5, 4.0)) {
  print(paste("now the mu is", mm))

  # The dependence is fixed at tau = 0.4 for every mm. It must not depend on
  # the `tau` variable left over from the loop above.
  cop <- BiCop(3, tau = 0.4)

  simres <- NULL
  for (i in seq_len(n_rep)) {
    simdat <- simgen(i, n, cop, mm = mm)
    res <- test_all(simdat)
    if (is.null(simres)) {
      simres <- array(0.0, c(dim(res), n_rep))
    }
    simres[, , i] <- res
    if (i %% 100 == 0) print(res[1, ])
  }
  assign(paste0("mm_", mm), simres)
  #saveRDS(simres, paste0("cop23_tau0.4_mm", mm, "_sd1.rds"))
}

In [None]:
# Collapse the Table 1 simulation arrays into FDR / power tables.
# Every slice along the third dimension is one setting (one tau, or one mu);
# the 7 x 4 layout of a slice is whatever fdr_and_power() returns.
tau_grid <- c(0, 0.2, 0.4, 0.6, 0.8)
fdr_tbl <- array(NA, dim = c(7, 4, length(tau_grid)))
power_tbl <- fdr_tbl

for (i in seq_along(tau_grid)) {
  # Summarise once per setting and reuse both components.
  summ <- fdr_and_power(get(paste0("tau_", tau_grid[i])))
  fdr_tbl[, , i] <- summ$fdr
  power_tbl[, , i] <- summ$power
}

saveRDS(fdr_tbl, "fdr_diff_tau.rds")
saveRDS(power_tbl, "power_diff_tau.rds")

mu_grid <- c(2, 2.5, 3, 3.5, 4)
fdr_tbl2 <- array(NA, dim = c(7, 4, length(mu_grid)))
# Start from the empty template, not from the already-filled tau table.
power_tbl2 <- fdr_tbl2

for (i in seq_along(mu_grid)) {
  summ <- fdr_and_power(get(paste0("mm_", mu_grid[i])))
  fdr_tbl2[, , i] <- summ$fdr
  power_tbl2[, , i] <- summ$power
}

saveRDS(fdr_tbl2, "fdr_diff_mu.rds")
saveRDS(power_tbl2, "power_diff_mu.rds")

In [None]:
# FDR table for the tau experiment (third dimension indexes tau)
fdr_tbl

In [None]:
# Power table for the tau experiment
power_tbl

In [None]:
# FDR table for the mu experiment (third dimension indexes mu)
fdr_tbl2

In [None]:
# Power table for the mu experiment
power_tbl2

#### Simulation for **Table 2** 
* with sample size(n) 8000
* the correlation between the primary and auxiliary p-values is set to 0.4
* the null distribution is $N(0,1)$ and the alternative distribution is $N(3, 1)$
* compared under misspecified copulas: BB7, BB6 and Joe
* This will take a few hours.

In [None]:
# Table 2, misspecified fit: data are always generated from the family-3
# copula with tau = 0.4, while test_all() is told to work with a different
# copula family (`copnum`). Results are stored as `misfit_<family>`.
# Relies on `n` from the simulation settings cell.
miscop <- c(39, 38, 36) # BB7, BB6, Joe
n_rep <- 1000

# The data-generating copula does not change across families; build it once.
cop <- BiCop(3, tau = 0.4)

# Loop variable deliberately not called `c`, which would mask base::c.
for (fit_family in miscop) {
  print(paste("now the cop is", fit_family))

  simres <- NULL
  for (i in seq_len(n_rep)) {
    simdat <- simgen(i, n, cop)
    res <- test_all(simdat, copnum = fit_family)
    # Size the result array from the first replicate so it matches the shape
    # test_all() actually returns.
    if (is.null(simres)) {
      simres <- array(0.0, c(dim(res), n_rep))
    }
    simres[, , i] <- res
    if (i %% 100 == 0) print(res[1, 1:21])
  }
  assign(paste0("misfit_", fit_family), simres)
}

In [None]:
# Table 2, misspecified generator: for every replicate a sample is drawn from
# the family-3 copula (tau = 0.4), a copula of the misspecified family is
# estimated from that sample, and the simulated data are then generated from
# the estimated copula. Results are stored as `misgen_<family>`.
# Relies on `n` from the simulation settings cell.
miscop <- c(39, 38, 36) # BB7, BB6, Joe
n_rep <- 1000

# Reference copula used to draw the sample each misspecified family is
# fitted to; identical for all families, so build it once.
testcop <- BiCop(3, tau = 0.4)

# Loop variable deliberately not called `c`, which would mask base::c.
for (gen_family in miscop) {
  print(paste("now the cop is", gen_family))

  simres <- NULL
  for (i in seq_len(n_rep)) {
    set.seed(i)
    rr <- BiCopSim(n, obj = testcop)
    cop <- BiCopEst(rr[, 1], rr[, 2], gen_family)
    simdat <- simgen(i, n, cop)
    res <- test_all(simdat)
    # Size the result array from the first replicate so it matches the shape
    # test_all() actually returns.
    if (is.null(simres)) {
      simres <- array(0.0, c(dim(res), n_rep))
    }
    simres[, , i] <- res
    if (i %% 100 == 0) print(res[1, 1:21])
  }
  assign(paste0("misgen_", gen_family), simres)
}

In [None]:
# Collapse the Table 2 simulation arrays into FDR / power tables, one slice
# per misspecified copula family in `miscop`.
#
# The tables are sized by length(miscop) and the power tables start from an
# empty template. Copying the Table 1 `fdr_tbl` here would leave unrelated
# FDR values in every slice that the loop does not overwrite.
fdr_tbl3 <- array(NA, dim = c(7, 4, length(miscop)))
power_tbl3 <- fdr_tbl3

for (i in seq_along(miscop)) {
  summ <- fdr_and_power(get(paste0("misfit_", miscop[i])))
  fdr_tbl3[, , i] <- summ$fdr
  power_tbl3[, , i] <- summ$power
}

saveRDS(fdr_tbl3, "fdr_misfit.rds")
saveRDS(power_tbl3, "power_misfit.rds")

fdr_tbl4 <- array(NA, dim = c(7, 4, length(miscop)))
power_tbl4 <- fdr_tbl4

for (i in seq_along(miscop)) {
  summ <- fdr_and_power(get(paste0("misgen_", miscop[i])))
  fdr_tbl4[, , i] <- summ$fdr
  power_tbl4[, , i] <- summ$power
}

saveRDS(fdr_tbl4, "fdr_misgen.rds")
saveRDS(power_tbl4, "power_misgen.rds")

In [None]:
# FDR, misspecified fit (misfit_* runs)
fdr_tbl3

In [None]:
# Power, misspecified fit
power_tbl3

In [None]:
# FDR, misspecified generator (misgen_* runs)
fdr_tbl4

In [None]:
# Power, misspecified generator
power_tbl4

#### **Table 3** The log-likelihood, AIC and BIC of the Gaussian, Frank, Clayton, Gumbel and Joe copulas fitted to the set4∆ dataset

In [3]:
# Kendall's tau between the two p-value vectors (displayed), then a
# family-23 copula parameterised by that tau.
tau <- wdm(pval1, pval2, method = "kendall")
tau
cop <- BiCop(family = 23, tau = tau)

In [4]:
# Keep only the pairs with pval1 + pval2 >= 1 ...
pval11 <- pval1[(pval1 + pval2) >= 1]
pval22 <- pval2[(pval1 + pval2) >= 1]
# ... and build the extended vectors by appending the complements of the
# other coordinate (1 - pval22 after pval11, 1 - pval11 after pval22).
pval111 <- append(pval11, 1 - pval22)
pval222 <- append(pval22, 1 - pval11)

In [5]:
# Fit the five candidate families to the retained pairs and show the
# logLik / AIC / BIC summary. `rotations = FALSE` is spelled out because `F`
# is an ordinary, reassignable variable.
aa <- BiCopEstList(pval11, pval22, c(1, 5, 23, 24, 26),
                   rotations = FALSE)$summary
# NOTE(review): assumes the summary rows come back in the order the families
# were requested -- confirm against BiCopEstList's documentation.
aa$family <- c("Gaussian", "Frank", "Clayton", "Gumbel", "Joe")
aa

family,logLik,AIC,BIC
<chr>,<dbl>,<dbl>,<dbl>
Gaussian,306.45,-610.91,-604.8
Frank,332.84,-663.68,-657.57
Clayton,393.08,-784.16,-778.05
Gumbel,201.79,-401.57,-395.46
Joe,96.29,-190.57,-184.46


#### **Table 4** Comparison of total and HXT family gene rejections using one-stage (locfdr, Storey), covariate-assisted (IHW, Boca and Leek, AdaFDR) and two-stage (types H and S) FDR methods at α = 0.05 and α = 0.10.

In [7]:
## Rejection indicators for every method in Table 4.
## `rej_<method>` is the decision at alpha = 0.05, `rej_<method>2` at 0.10.

## locfdr
# NOTE(review): `p0hat`, `f0` and `f` are not defined in this cell;
# presumably datacall.r creates them -- confirm. Nothing below may overwrite
# them, otherwise re-running this cell changes the locfdr result.
rej_locfdr <- (p0hat * f0 / f < 0.05)
rej_locfdr2 <- (p0hat * f0 / f < 0.1)


## Storey
p0 <- pi_01(pval2)
rej_Storey <- (p0 * n * pval2 / rank(pval2) < 0.05)
rej_Storey2 <- (p0 * n * pval2 / rank(pval2) < 0.1)

## Two-stage (H)
gamma_seq <- seq(0.5, 1, 0.001)
tau <- wdm(pval1, pval2, "kendall")
cop <- BiCop(23, tau = tau)

# Number of rejections at level 0.10 for each candidate threshold in
# gamma_seq. vapply() pins the result to one integer per candidate.
nrej <- vapply(gamma_seq, function(x) {
  pv <- BiCopCDF(rep(x, n), pval2, obj = cop)
  pv <- ifelse(pval1 > x, pval1, pv)
  p0 <- pi_01(pv)
  sum(p0 * n * pv / rank(pv) < 0.10)
}, integer(1))


# which.max() returns the first maximiser, so ties resolve to the smallest
# threshold.
gamma <- gamma_seq[which.max(nrej)]
gamma

pv <- BiCopCDF(rep(gamma, n), pval2, obj = cop)
pval_H <- ifelse(pval1 > gamma, pval1, pv)
p0 <- pi_01(pval_H)
rej_H <- (p0 * n * pval_H / rank(pval_H) < 0.05)
rej_H2 <- (p0 * n * pval_H / rank(pval_H) < 0.10)

## Two-stage (S)
pval_S <- BiCopHfunc1(pval1, pval2, obj = cop)
# Stored under its own name so the `p0hat` read by the locfdr section at the
# top of this cell is left untouched.
p0_S <- pi_01(pval_S)
rej_S2 <- (p0_S * n * pval_S / rank(pval_S) < 0.10)
rej_S <- (p0_S * n * pval_S / rank(pval_S) < 0.05)

## IHW
# NOTE(review): the 0.10 decision below reuses the adjusted p-values from the
# alpha = 0.05 fit -- confirm this is intended and no refit is needed.
res_ihw <- ihw(pval2 ~ pval1, alpha = 0.05)
pval_ihw <- adj_pvalues(res_ihw)

rej_ihw <- pval_ihw < 0.05
rej_ihw2 <- pval_ihw < 0.10

## BL
# NOTE(review): BH is applied on top of the q-values returned by
# lm_qvalue() -- confirm this second adjustment is intended.
qvalues <- lm_qvalue(pval2, X = pval1)
newp <- qvalues$qvalues
test <- p.adjust(newp, "BH")

rej_BL <- test < 0.05
rej_BL2 <- test < 0.10


## adaFDR
rej_ada <- adafdr_test(pval2, pval1, alpha = 0.05)$decision
rej_ada2 <- adafdr_test(pval2, pval1, alpha = 0.10)$decision


## FDRreg
# NOTE(review): `x1` is not defined in this cell; presumably from
# datacall.r -- confirm.
rejs <- FDRreg(x1, lfc)
rej_FDRreg <- rejs$d1
rej_FDRreg2 <- rejs$d2

## SIM

# NOTE(review): `p1` / `p2` are not defined anywhere in this notebook (every
# other method uses pval1 / pval2) -- confirm where they come from.
rejs <- SIM(p1, p2)
rej_SIM <- rejs$d1
rej_SIM2 <- rejs$d2

“no observations informative at iteration 1”
“glm.fit: algorithm did not converge”


ERROR: Error in glm.fit(x = structure(numeric(0), dim = c(0L, 2L), dimnames = list(: object 'fit' not found


In [37]:
## FDRreg
# Re-run of the FDRreg step on its own; `d1` / `d2` of the result are stored
# as the decisions for the two alpha levels.
rejs <- FDRreg(x1, lfc)
rej_FDRreg <- rejs$d1
rej_FDRreg2 <- rejs$d2

In [39]:
# One column of rejection indicators per method; rej1 holds the alpha = 0.05
# decisions, rej2 the alpha = 0.10 decisions.
rej1 <- data.frame(
  locfdr = rej_locfdr, storey = rej_Storey, ihw = rej_ihw, BL = rej_BL,
  adaFDR = rej_ada, FDRreg = rej_FDRreg, SIM = rej_SIM,
  typeH = rej_H, typeS = rej_S
)
rej2 <- data.frame(
  locfdr = rej_locfdr2, storey = rej_Storey2, ihw = rej_ihw2, BL = rej_BL2,
  adaFDR = rej_ada2, FDRreg = rej_FDRreg2, SIM = rej_SIM2,
  typeH = rej_H2, typeS = rej_S2
)

# colSums() gives the per-method counts directly, without routing the data
# frame through apply().
colSums(rej1) # # of rejected genes with alpha = 0.05
colSums(rej2) # # of rejected genes with alpha = 0.10
# Same counts restricted to the rows selected by `hxtloc`.
colSums(rej1[hxtloc, , drop = FALSE]) # # of rejected HXTfamily with alpha = 0.05
colSums(rej2[hxtloc, , drop = FALSE]) # # of rejected HXTfamily with alpha = 0.10

#### Table S1: The number of times each copula is selected based on LogLik, AIC and BIC when generating random variables from the Clayton copula, together with the mean and standard deviation of each criterion.

In [8]:
# Candidate family codes, named by copula, and the data-generating copula
# (family 23 with tau = -0.4).
fam <- c(Gaussian = 1, Frank = 5, Clayton = 23, Gumbel = 24, Joe = 26)
cop <- BiCop(family = 23, tau = -0.4)

In [9]:
# For each of n_sim seeded samples drawn from `cop`, fit every candidate
# family and record its log-likelihood, AIC and BIC.
# Rows = replicates, columns = families in the order of `fam`.
n_sim <- 100
lik <- matrix(0, n_sim, length(fam))
aic <- matrix(0, n_sim, length(fam))
bic <- matrix(0, n_sim, length(fam))

for (i in seq_len(n_sim)) {
  set.seed(i)
  r <- BiCopSim(8000, obj = cop)
  # The family index is `k`, not `f`: this loop runs at top level, and `f` is
  # a workspace object the locfdr computation reads.
  for (k in seq_along(fam)) {
    estcop <- BiCopEst(r[, 1], r[, 2], fam[k])
    lik[i, k] <- estcop$logLik
    aic[i, k] <- estcop$AIC
    bic[i, k] <- estcop$BIC
  }
}

In [10]:
# Mean and standard deviation of each criterion per family
# (one row per family: column 1 = mean, column 2 = sd), rounded to 3 digits.
round(t(vapply(
  seq_len(ncol(lik)),
  function(j) c(mean(lik[, j]), sd(lik[, j])),
  numeric(2)
)), 3)
round(t(vapply(
  seq_len(ncol(aic)),
  function(j) c(mean(aic[, j]), sd(aic[, j])),
  numeric(2)
)), 3)
round(t(vapply(
  seq_len(ncol(bic)),
  function(j) c(mean(bic[, j]), sd(bic[, j])),
  numeric(2)
)), 3)

0,1
1615.157,57.301
1524.409,53.864
2194.153,71.345
1090.186,50.598
501.012,35.078


0,1
-3228.315,114.602
-3046.818,107.729
-4386.307,142.69
-2178.372,101.197
-1000.024,70.156


0,1
-3221.328,114.602
-3039.83,107.729
-4379.32,142.69
-2171.385,101.197
-993.037,70.156


In [11]:
# How often each family (column index) wins per replicate:
# largest log-likelihood, smallest AIC, smallest BIC.
table(vapply(
  seq_len(nrow(lik)), function(i) which.max(lik[i, ]), integer(1)
))
table(vapply(
  seq_len(nrow(aic)), function(i) which.min(aic[i, ]), integer(1)
))
table(vapply(
  seq_len(nrow(bic)), function(i) which.min(bic[i, ]), integer(1)
))


  3 
100 


  3 
100 


  3 
100 

#### **Table S2** The Log-likelihood, AIC, BIC of Gaussian, Frank, Clayton, Gumbel, BB6, BB7 and Joe copulas with set4∆ dataset

In [12]:
# Fit the five candidate families to (pval1_deseq, pval2) and show the
# logLik / AIC / BIC summary. The argument is written out as `familyset`
# (`family =` only worked through partial matching) and `rotations = FALSE`
# avoids the reassignable `F`.
aa <- BiCopEstList(pval1_deseq, pval2,
                   familyset = c(1, 5, 23, 24, 26),
                   rotations = FALSE)$summary
# NOTE(review): assumes the summary rows come back in the order the families
# were requested -- confirm against BiCopEstList's documentation.
aa$family <- c("Gaussian", "Frank", "Clayton", "Gumbel", "Joe")
aa

family,logLik,AIC,BIC
<chr>,<dbl>,<dbl>,<dbl>
Gaussian,1578.79,-3155.57,-3148.56
Frank,1486.99,-2971.97,-2964.96
Clayton,836.13,-1670.25,-1663.24
Gumbel,1199.29,-2396.58,-2389.56
Joe,486.19,-970.38,-963.37
