0.44.6.2

weizhouUMICH · Aug 2, 2021 · 648850f · 648850f
1 parent 2b63a20
commit 648850f
Show file tree

Hide file tree

Showing 20 changed files with 1,009,064 additions and 14 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
diff --git a/R/SAIGE_extractNeff.R b/R/SAIGE_extractNeff.R
diff --git a/R/SAIGE_fitGLMM_fast.R b/R/SAIGE_fitGLMM_fast.R
@@ -1072,8 +1072,9 @@ fitNULLGLMM = function(plinkFile = "",
             save(modglmm, file = modelOut)
             tau = modglmm$theta
             varAll = tau[2] + (pi^2)/3
-            tauVec_ss = c(((pi^2)/3)/varAll, (tau[2])/varAll)
-            wVec_ss = rep(1, length(modglmm$y))
+            #tauVec_ss = c(((pi^2)/3)/varAll, (tau[2])/varAll)
+	    tauVec_ss = c(0,1)
+	    wVec_ss = rep(1, length(modglmm$y))
             bVec_ss = rep(1, length(modglmm$y))
             Rinv_1 = getPCG1ofSigmaAndVector(wVec_ss, tauVec_ss, 
                 bVec_ss, maxiterPCG, tolPCG)
@@ -1100,7 +1101,8 @@ fitNULLGLMM = function(plinkFile = "",
             }
             tau = modglmm$theta
             varAll = tau[2] + (pi^2)/3
-            tauVec_ss = c(((pi^2)/3)/varAll, (tau[2])/varAll)
+            #tauVec_ss = c(((pi^2)/3)/varAll, (tau[2])/varAll)
+	    tauVec_ss = c(0, 1)
             wVec_ss = rep(1, length(modglmm$y))
             bVec_ss = rep(1, length(modglmm$y))
             t_getPCG1ofSigmaAndVector = proc.time()
@@ -1943,6 +1945,8 @@ pcg<-function (A, b, M=NULL, maxiter = 1e+05, tol = 1e-06){
     #print(dA)
     Minv = 1/dA
   } else Minv = solve(M)
+  print("R")
+  print(Minv)
   x = rep(0, length(b))
   r = b 
   if(is.null(M)){

diff --git a/README.md b/README.md
@@ -12,7 +12,9 @@ Table of Contents
 
 # Introduction
 
-## Current version is 0.44.6.1 (Updated on July 16, 2021) - SAIGE-GENE+: for group tests, collpasing ultra-rare variants with MAC <= 10. Set --method_to_CollapseUltraRare="absence_or_presence" as default to collpase ultra-rare varaints with MAC <= 10. SAIGE-GENE+ has well controlled type I error rates when the maximum MAF cutoff (maxMAFforGroupTest) is lower than 1%, e.g. 0.01% or 0.1%. Tests with multiple MAF cutoffs and variant annotations can be combined using the Cauchy combination (function CCT)
+##Current version is 0.44.6.2 (Updated on August 2, 2021) - 0.44.6.2 add extdata/extractNglmm.R to extract the effective sample size without running Step 1. extdata/cmd_extractNeff.sh has the pipeline. The effective sample size (Nglmm) is differently calculated than the previous versions.
+
+## Previous version is 0.44.6.1 (Updated on July 16, 2021) - SAIGE-GENE+: for group tests, collpasing ultra-rare variants with MAC <= 10. Set --method_to_CollapseUltraRare="absence_or_presence" as default to collpase ultra-rare varaints with MAC <= 10. SAIGE-GENE+ has well controlled type I error rates when the maximum MAF cutoff (maxMAFforGroupTest) is lower than 1%, e.g. 0.01% or 0.1%. Tests with multiple MAF cutoffs and variant annotations can be combined using the Cauchy combination (function CCT)
 
 ## Please re-install 0.44.2 if you installed this verion on March 31. 
 
@@ -124,7 +126,7 @@ Thanks to Juha Karjalainen for sharing the Dockerfile.
 The docker image can be pulled
 
 ```
-docker pull wzhou88/saige:0.44.6
+docker pull wzhou88/saige:0.44.6.2
 ```
 
 Functions can be called
@@ -149,6 +151,13 @@ Example data and script can be found in ./extdata. Run
 to run single-variant and gene-based association tests
 
 
+## extract effectize sample size v0.44.6.2)
+```
+  SAIGE_extractNeff.R --help
+  bash cmd_extractNeff.sh
+```    
+
+
 # Notes before running jobs
 
 ### FAQ can be found  [here](https://github.com/weizhouUMICH/SAIGE/wiki/Genetic-association-tests-using-SAIGE#Frequently-asked-questions)
@@ -183,6 +192,8 @@ https://www.leelabsg.org/resources
 
 
 # Log for fixing bugs
+* 0.44.6.2 (August-2-2021). add extdata/extractNglmm.R to extract the effective sample size without running Step 1. extdata/cmd_extractNeff.sh has the pipeline. The effective sample size (Nglmm) is differently calculated than the previous versions. 
+
 * 0.44.6.1 (July-16-2021). add the function CCT to perform Cauchy combination to combine multipel tests
 
 * 0.44.6 (July-13-2021). Set --method_to_CollapseUltraRare="absence_or_presence" as default to collpase ultra-rare varaints with MAC <= 10. We call this version SAIGE-GENE+. SAIGE-GENE+ has well controlled type I error rates when the maximum MAF cutoff (maxMAFforGroupTest) is lower than 1%, e.g. 0.01% or 0.1%.  

diff --git a/extdata/.nfs0000014b94a26e0800000005 b/extdata/.nfs0000014b94a26e0800000005
diff --git a/extdata/Nglmm.log b/extdata/Nglmm.log
@@ -0,0 +1,220 @@
+Loading required package: optparse
+R version 4.1.0 (2021-05-18)
+Platform: x86_64-pc-linux-gnu (64-bit)
+Running under: Ubuntu 18.04.5 LTS
+
+Matrix products: default
+BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
+LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
+
+locale:
+ [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+ [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
+ [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
+ [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
+ [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
+
+attached base packages:
+[1] stats     graphics  grDevices utils     datasets  methods   base     
+
+other attached packages:
+[1] optparse_1.6.6 SAIGE_0.44.6.1
+
+loaded via a namespace (and not attached):
+[1] compiler_4.1.0     Matrix_1.3-2       Rcpp_1.0.6         getopt_1.20.3     
+[5] grid_4.1.0         RcppParallel_5.0.2 lattice_0.20-44   
+$plinkFile
+[1] "./input/nfam_100_nindep_0_step1_includeMoreRareVariants_poly"
+
+$phenoFile
+[1] "./input/pheno_1000samples.txt_withdosages_withBothTraitTypes.txt"
+
+$phenoCol
+[1] "y_binary"
+
+$traitType
+[1] "binary"
+
+$invNormalize
+[1] FALSE
+
+$covarColList
+[1] "x1,x2"
+
+$sampleIDColinphenoFile
+[1] "IID"
+
+$tol
+[1] 0.02
+
+$maxiter
+[1] 20
+
+$tolPCG
+[1] 1e-05
+
+$maxiterPCG
+[1] 500
+
+$nThreads
+[1] 4
+
+$memoryChunk
+[1] 2
+
+$tauInit
+[1] "0,0"
+
+$outputPrefix
+[1] "~/"
+
+$IsSparseKin
+[1] FALSE
+
+$sparseGRMFile
+[1] "./output/sparseGRM_relatednessCutoff_0.05_2000_randomMarkersUsed.sparseGRM.mtx"
+
+$sparseGRMSampleIDFile
+[1] "./output/sparseGRM_relatednessCutoff_0.05_2000_randomMarkersUsed.sparseGRM.mtx.sampleIDs.txt"
+
+$numRandomMarkerforSparseKin
+[1] 2000
+
+$relatednessCutoff
+[1] 0.125
+
+$isCovariateTransform
+[1] TRUE
+
+$useSparseSigmaConditionerforPCG
+[1] FALSE
+
+$useSparseSigmaforInitTau
+[1] FALSE
+
+$useSparseGRMtoFitNULL
+[1] TRUE
+
+$minMAFforGRM
+[1] 0.01
+
+$minCovariateCount
+[1] -1
+
+$FemaleOnly
+[1] FALSE
+
+$MaleOnly
+[1] FALSE
+
+$sexCol
+[1] ""
+
+$FemaleCode
+[1] "1"
+
+$MaleCode
+[1] "0"
+
+$noEstFixedEff
+[1] FALSE
+
+$help
+[1] FALSE
+
+tauInit  is  0 0 
+Markers in the Plink file with MAF >=  0.01  will be used to construct GRM
+sparse GRM will be used to fit the NULL model
+4  threads are set to be used  
+[1] "a sparse GRM will be used to fit the null model and the variance ratio estimation will be skipped, so plink files are not required"
+1000  samples are in the sparse GRM
+formula is  y_binary~x1+x2 
+1000  samples have non-missing phenotypes
+1000  samples will be used for analysis
+qr transformation has been performed on covariates
+colnames(data.new) is  Y minus1 x1 x2 
+out.transform$Param.transform$qrr:  3 3 
+extract sparse GRM
+[1] 7192
+set elements in the sparse GRN <=  0.125  to zero
+[1] 6484
+1000  samples have been used to fit the glmm null model
+[1] "print m4"
+[1] 1000 1000
+2 locationMat.n_rows 
+6484 locationMat.n_cols 
+6484 valueVec.n_elem 
+0.975952
+0
+0
+0.490757
+2
+0
+0.48145
+3
+0
+0.240555
+6
+0
+0.248245
+7
+0
+0.237988
+8
+0
+0.247572
+9
+0
+1.00071
+1
+1
+0.497641
+2
+1
+0.505352
+3
+1
+y_binary  is a binary trait
+nbyte: 250
+nbyte: 250
+reserve: 32474736
+
+M: 128868, N: 1000
+size of genoVecofPointers: 1
+setgeno mark1
+setgeno mark2
+111305 markers with MAF >= 0.01 are used for GRM.
+setgeno mark5
+setgeno mark6
+time: 1573.7
+glm:
+
+Call:
+glm(formula = formula.new, family = binomial, data = data.new)
+
+Deviance Residuals: 
+    Min       1Q   Median       3Q      Max  
+-1.1519  -0.4858  -0.3551  -0.2545   2.8436  
+
+Coefficients:
+       Estimate Std. Error z value Pr(>|z|)    
+minus1   2.5205     0.1338  18.836  < 2e-16 ***
+x1      -0.7667     0.1158  -6.621 3.57e-11 ***
+x2      -0.4558     0.1183  -3.852 0.000117 ***
+---
+Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
+
+(Dispersion parameter for binomial family taken to be 1)
+
+    Null deviance: 1386.29  on 1000  degrees of freedom
+Residual deviance:  577.99  on  997  degrees of freedom
+AIC: 583.99
+
+Number of Fisher Scoring iterations: 5
+
+[1] 1000
+use sparse kinship to fit the model 
+t1_Rinv_1 is  398.9558 
+Nglmm  141.0644 
+closed the plinkFile!
diff --git a/extdata/cmd_extractNeff.sh b/extdata/cmd_extractNeff.sh
@@ -0,0 +1,28 @@
+##This script will create two files, which will be specified in the extractNeff.R with --sparseGRMFile and --sparseGRMSampleIDFile
+
+#--sparseGRMFile=sparseGRM_relatednessCutoff_0.05_2000_randomMarkersUsed.sparseGRM.mtx
+#--sparseGRMSampleIDFile=sparseGRM_relatednessCutoff_0.05_2000_randomMarkersUsed.sparseGRM.mtx.sampleIDs.txt
+
+
+Rscript createSparseGRM.R       \
+        --plinkFile=./input/nfam_100_nindep_0_step1_includeMoreRareVariants_poly \
+        --nThreads=4  \
+        --outputPrefix=./output/sparseGRM       \
+        --numRandomMarkerforSparseKin=2000      \
+        --relatednessCutoff=0.05
+
+#Next extract Nglmm using extractNeff.R
+#This script does not generate any output file. The screen outputcan be redirected to the log file. Here I use Nglmm.log. At the end of the Nglmm.log, the value of Nglmm can be found.   
+
+Rscript extractNeff.R	\
+	--useSparseGRMtoFitNULL=TRUE	\
+        --plinkFile=./input/nfam_100_nindep_0_step1_includeMoreRareVariants_poly \
+        --phenoFile=./input/pheno_1000samples.txt_withdosages_withBothTraitTypes.txt \
+        --phenoCol=y_binary \
+        --covarColList=x1,x2 \
+        --sampleIDColinphenoFile=IID \
+        --traitType=binary        \
+        --nThreads=4    \
+        --minMAFforGRM=0.01	\
+	--sparseGRMFile=./output/sparseGRM_relatednessCutoff_0.05_2000_randomMarkersUsed.sparseGRM.mtx	\
+	--sparseGRMSampleIDFile=./output/sparseGRM_relatednessCutoff_0.05_2000_randomMarkersUsed.sparseGRM.mtx.sampleIDs.txt &> Nglmm.log