diff --git a/DESCRIPTION b/DESCRIPTION index 5262b2d..b79dceb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: PCMFit Type: Package Title: Maximum likelihood fit and selection of phylogenetic comparative models -Version: 0.6.0 +Version: 1.0.0 Authors@R: c(person("Venelin", "Mitov", email = "vmitov@gmail.com", role = c("aut", "cre", "cph"))) Maintainer: Venelin Mitov @@ -26,8 +26,7 @@ Description: The goal of PCMFit is to provide a generic tool for inference and perform model selection of the best MGPM for a given tree and data according to an information loss function such as the Akaike information criterion (AIC). The package has been thoroughly tested and applied to real data in the - related research article entitled "Automatic Generation of Evolutionary - Hypotheses using Mixed Gaussian Phylogenetic Models" (currently in review). + related research article Mitov et al. 2019 . Currently, the package is available from . The web-page provides access to documentation and related resources. diff --git a/R/PCMFitMixed.R b/R/PCMFitMixed.R index 7b2eb2b..d750288 100644 --- a/R/PCMFitMixed.R +++ b/R/PCMFitMixed.R @@ -1,14 +1,19 @@ -#' Maximum likelihood based search for an optimal mixed Gaussian phylogenetic -#' model, given a tree, trait measurements at its tips and a score function. +#' Optimal information score search for a mixed Gaussian phylogenetic +#' model, given a tree, trait measurements at its tips and a score function. #' #' @inheritParams PCMFit #' -#' @description A mixed Gaussian phylogenetic model (MGPM) represents -#' Fit regime-assignments to (sub-)trees in a tree with different assigned model -#' types to each regime.This function performs multiple model fits of mixed regime models -#' (MixedGaussian) mapping different model-types (e.g. BM and OU) to different -#' regimes (colors) in a tree and testing different regime assignments to the -#' branches in the tree. +#' @description A mixed Gaussian phylogenetic model (MGPM) represents a Gaussian +#' phylogenetic model with shifts in the underlying parameters and, optionally, +#' type of Gaussian stochastic process (e.g. shifts from a BM to an OU model of +#' evolution). Formally, an MGPM consists of the following components: +#' \itemize{ +#' \item A shift-point configuration: this is a subset of the nodes in the tree +#' including at least the root node +#' \item b +#' } +#' +#' #' @importFrom foreach foreach when %do% %dopar% %:% #' @importFrom data.table data.table rbindlist is.data.table setkey := #' @importFrom PCMBase PCMTree PCMTreeSetLabels PCMTreeSetPartition PCMTreeEvalNestedEDxOnTree PCMTreeNumTips PCMTreeListCladePartitions PCMTreeListAllPartitions PCMTreeToString MixedGaussian PCMOptions PCMTreeTableAncestors PCMTreeSplitAtNode PCMGetVecParamsRegimesAndModels MGPMDefaultModelTypes PCMGenerateModelTypes is.Transformable diff --git a/README.Rmd b/README.Rmd index 6fe280f..1dd2c45 100644 --- a/README.Rmd +++ b/README.Rmd @@ -43,8 +43,7 @@ The goal of PCMFit is to provide a generic tool for inference and perform model selection of the best MGPM for a given tree and data according to an information loss function such as the Akaike information criterion (AIC). The package has been thoroughly tested and applied to real data in the - related research article entitled "Automatic Generation of Evolutionary - Hypotheses using Mixed Gaussian Phylogenetic Models" (currently in review). + related research article [@Mitov:2019agh] . Currently, the package is available from . The web-page provides access to documentation and related resources. @@ -68,6 +67,9 @@ module load interconnect/ethernet module load new gcc/6.3.0 open_mpi/1.6.5 r/3.4.0 intel/2017.5 ``` +```{sh, eval = FALSE} +xcode-select --install +``` Upon validating the availability of a C++ compiler, PCMBaseCpp can be installed using the commands: ```{r, eval=FALSE} # These two packages are available on CRAN @@ -182,55 +184,16 @@ PCMFit builds on top of a stack of three tools enabling fast likelihood calculat # Citing PCMFit -To acknowledge the PCMFit package in a publication or other presentation, please cite one of the following: +To acknowledge the PCMFit package in a publication or other presentation, please cite: -* Mitov, V. (2018). Phylogenetic Comparative Methods in the Era of Big Data, Doctoral Thesis No. [25428](https://doi.org/10.3929/ethz-b-000315296) ETH Zurich. -``` -@phdthesis{Mitov:2018hh, -author = {Mitov, Venelin}, -title = {{Phylogenetic Comparative Methods in the Era of Big Data}}, -school = {ETH Zurich}, -year = {2018}, -address = {Zurich}, -url = {https://doi.org/10.3929/ethz-b-000315296}, -} -``` -Mitov, V., Bartoszek, K., & Stadler, T. (2018). Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models. Article in Review, preprint available in Chapter 7, Doctoral Thesis No. [25428](https://doi.org/10.3929/ethz-b-000315296) ETH Zurich. +Mitov, V., Bartoszek, K., & Stadler, T. (2019). Automatic generation of evolutionary hypotheses using mixed Gaussian phylogenetic models. Proceedings of the National Academy of Sciences of the United States of America, http://doi.org/10.1073/pnas.1813823116. ``` @article{Mitov:2019agh, -author = {Mitov, Venelin and Bartoszek, Krzysztof and Stadler, Tanja}, -title = {{Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models}}, -journal = {article in review, preprint in Ch. 7 of Thesis No. 25428, ETH Zurich}, -year = {2019}, -pages = {}, -month = mai, -url = {https://doi.org/10.3929/ethz-b-000315296} -} -``` -* Mitov, V., Bartoszek, K., Asimomitis, G., & Stadler, T. (2018, September 24). Fast likelihood evaluation for multivariate phylogenetic comparative methods: the PCMBase R package. arXiv.org. https://arxiv.org/abs/1809.09014. -``` -@article{Mitov:2018fl, -author = {Mitov, Venelin and Bartoszek, Krzysztof and Asimomitis, Georgios and Stadler, Tanja}, -title = {{Fast likelihood evaluation for multivariate phylogenetic comparative methods: the PCMBase R package}}, -journal = {arXiv.org}, -year = {2018}, -eprint = {1809.09014}, -eprinttype = {arxiv}, -eprintclass = {q-bio.PE}, -pages = {arXiv:1809.09014}, -month = sep, -annote = {34 pages, 6 figures} -} -``` -* Mitov, V., & Stadler, T. (2018). Parallel Likelihood Calculation for Phylogenetic Comparative Models: the SPLITT C++ Library. Methods in Ecology and Evolution, 2041--210X.13136. http://doi.org/10.1101/235739 -``` -@article{Mitov:2018dqa, -author = {Mitov, Venelin and Stadler, Tanja}, -title = {{Parallel likelihood calculation for phylogenetic comparative models: The SPLITT C++ library}}, -journal = {Methods in Ecology and Evolution}, -year = {2018}, -pages = {2041--210X.13136}, -month = dec +title = {Automatic generation of evolutionary hypotheses using mixed Gaussian phylogenetic models}, + author = {Venelin Mitov and Krzysztof Bartoszek and Tanja Stadler}, + journal = {Proceedings of the National Academy of Sciences of the United States of America}, + year = {2019}, + url = {https://www.pnas.org/lookup/doi/10.1073/pnas.1813823116}, } ``` diff --git a/README.md b/README.md index c8a9bba..68f3844 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,97 @@ -[![Travis build status](https://travis-ci.org/venelin/PCMFit.svg?branch=master)](https://travis-ci.org/venelin/PCMFit) [![Coverage status](https://codecov.io/gh/venelin/PCMFit/branch/master/graph/badge.svg)](https://codecov.io/github/venelin/PCMFit?branch=master) [![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/PCMFit?color=blue)](https://cran.r-project.org/package=PCMFit) [![Downloads](http://cranlogs.r-pkg.org/badges/PCMFit?color=blue)](https://cran.r-project.org/package=PCMFit) + +[![Travis build +status](https://travis-ci.org/venelin/PCMFit.svg?branch=master)](https://travis-ci.org/venelin/PCMFit) +[![Coverage +status](https://codecov.io/gh/venelin/PCMFit/branch/master/graph/badge.svg)](https://codecov.io/github/venelin/PCMFit?branch=master) +[![CRAN\_Status\_Badge](http://www.r-pkg.org/badges/version/PCMFit?color=blue)](https://cran.r-project.org/package=PCMFit) +[![Downloads](http://cranlogs.r-pkg.org/badges/PCMFit?color=blue)](https://cran.r-project.org/package=PCMFit)

+ PCMFit: Statistical inference of phylogenetic comparative models +

-The goal of PCMFit is to provide a generic tool for inference and selection of phylogenetic comparative models (PCMs). Currently, the package implements Gaussian and mixed Gaussian phylogenetic models (MGPM) over all tree types (including non-ultrametric and polytomic trees). The package supports non-existing traits or missing measurements for some of the traits on some of the species. The package supports specifying measurement error associated with each tip of the tree or inferring a measurement error parameter for a group of tips. The Gaussian phylogenetic models include various parametrizations of Brownian motion (BM) and Ornstein-Uhlenbeck (OU) multivariate branching processes. The mixed Gaussian models represent models with shifts in the model parameters as well as the type of model at points of the tree. Each shift-point is described as a pair of a shift-node and associated type of model (e.g. OU or BM) driving the trait evolution from the beginning of the branch leading to the shift-node toward the shift-node and its descendants until reaching a tip or another shift-point. The function PCMFit is used to fit a given PCM or a MGPM for a given tree with specified shift-points. The function PCMFitMixed is used to fit an ensemble of possible MGPMs over a tree for which the shift-points are unknown. This function can perform model selection of the best MGPM for a given tree and data according to an information loss function such as the Akaike information criterion (AIC). The package has been thoroughly tested and applied to real data in the related research article entitled "Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models" (currently in review). Currently, the package is available from . The web-page provides access to documentation and related resources. + +The goal of PCMFit is to provide a generic tool for inference and +selection of phylogenetic comparative models (PCMs). Currently, the +package implements Gaussian and mixed Gaussian phylogenetic models +(MGPM) over all tree types (including non-ultrametric and polytomic +trees). The package supports non-existing traits or missing measurements +for some of the traits on some of the species. The package supports +specifying measurement error associated with each tip of the tree or +inferring a measurement error parameter for a group of tips. The +Gaussian phylogenetic models include various parametrizations of +Brownian motion (BM) and Ornstein-Uhlenbeck (OU) multivariate branching +processes. The mixed Gaussian models represent models with shifts in the +model parameters as well as the type of model at points of the tree. +Each shift-point is described as a pair of a shift-node and associated +type of model (e.g. OU or BM) driving the trait evolution from the +beginning of the branch leading to the shift-node toward the shift-node +and its descendants until reaching a tip or another shift-point. The +function PCMFit is used to fit a given PCM or a MGPM for a given tree +with specified shift-points. The function PCMFitMixed is used to fit an +ensemble of possible MGPMs over a tree for which the shift-points are +unknown. This function can perform model selection of the best MGPM for +a given tree and data according to an information loss function such as +the Akaike information criterion (AIC). The package has been thoroughly +tested and applied to real data in the related research article (Mitov, +Bartoszek, and Stadler 2019) . Currently, +the package is available from . The +web-page provides access to +documentation and related +resources. -Prerequisites -============= -Before installing PCMFit, it is necessary to ensure that several R-packages are installed or can be installed from CRAN. These are listed below: +# Prerequisites -- [PCMBase](https://venelin.github.io/PCMBase). The PCMBase package is available on CRAN and should be installed automatically during the installation of PCMFit. If this does not happen, try the command: +Before installing PCMFit, it is necessary to ensure that several +R-packages are installed or can be installed from CRAN. These are listed +below: + + - [PCMBase](https://venelin.github.io/PCMBase). The PCMBase package is + available on CRAN and should be installed automatically during the + installation of PCMFit. If this does not happen, try the command: + + ``` r install.packages("PCMBase") ``` -- \[[PCMBaseCpp](https://github.com/venelin/PCMBaseCpp)\]. This package contains Rcpp modules for likelihood calculation of the model types implemented in PCMBase. PCMBaseCpp can be used as a companion and not a substitute of PCMBase. The sole purpose of PCMBaseCpp is to accelerate the likelihood calculation by implementing the most computationally intensive algorithm in C++, which has shown a dramatic speed-up (in the order of 100 times) (Mitov et al. 2018). Hence, installing this package is optional but highly recommended, in particular, if the goal is to infer models with shifts and/or to infer models on trees bigger than 100 tips. Installing PCMBaseCpp requires a C++ compiler to be installed on the system. This installation has been tested on two systems: - - - On Mac OS X, it has been tested using the default (clang) and the Intel (icpc) compiler. - - On Linux (Euler ETH cluster), it has been tested with the following modules loaded (commands in the file `.bashrc`): + - \[[PCMBaseCpp](https://github.com/venelin/PCMBaseCpp)\]. This + package contains Rcpp modules for likelihood calculation of the + model types implemented in PCMBase. PCMBaseCpp can be used as a + companion and not a substitute of PCMBase. The sole purpose of + PCMBaseCpp is to accelerate the likelihood calculation by + implementing the most computationally intensive algorithm in C++, + which has shown a dramatic speed-up (in the order of 100 times) + (Mitov et al. 2018). Hence, installing this package is optional but + highly recommended, in particular, if the goal is to infer models + with shifts and/or to infer models on trees bigger than 100 tips. + Installing PCMBaseCpp requires a C++ compiler to be installed on the + system. This installation has been tested on two systems: + + - On Mac OS X, it has been tested using the default (clang) and + the Intel (icpc) compiler. + - On Linux (Euler ETH cluster), it has been tested with the + following modules loaded (commands in the file `.bashrc`): + + ``` sh module load interconnect/ethernet module load new gcc/6.3.0 open_mpi/1.6.5 r/3.4.0 intel/2017.5 ``` -Upon validating the availability of a C++ compiler, PCMBaseCpp can be installed using the commands: +``` sh +xcode-select --install +``` + +Upon validating the availability of a C++ compiler, PCMBaseCpp can be +installed using the commands: ``` r # These two packages are available on CRAN @@ -39,9 +101,28 @@ install.packages("RcppArmadillo") devtools::install_github("venelin/PCMBaseCpp") ``` -- other third party dependencies include the packages [`data.table`](https://CRAN.R-project.org/package=data.table), [`foreach`](https://CRAN.R-project.org/package=foreach), [`iterators`](https://CRAN.R-project.org/package=iterators), [`ape`](https://CRAN.R-project.org/package=ape) and [`digest`](https://CRAN.R-project.org/package=digest). These packages should be installed automatically from CRAN when installing PCMFit. If this does not happen, consult the packages' web-pages (links above). - -- An optional but highly recommended dependency. The function `PCMTreePlot` in the package PCMBase uses the R-package ggtree, which is not on CRAN. It is highly recommended to install this package in order to be able to visualize trees with colored parts corresponding to defferent evolutionary regimes. If `ggtree` is not installed, the figures in the vignettes and coding examples cannot be generated. At the time of writing this documentation, `ggtree` can be installed from bioconductor through the following code (if that does not work, check the [ggtree home page](https://guangchuangyu.github.io/software/ggtree/)): + - other third party dependencies include the packages + [`data.table`](https://CRAN.R-project.org/package=data.table), + [`foreach`](https://CRAN.R-project.org/package=foreach), + [`iterators`](https://CRAN.R-project.org/package=iterators), + [`ape`](https://CRAN.R-project.org/package=ape) and + [`digest`](https://CRAN.R-project.org/package=digest). These + packages should be installed automatically from CRAN when installing + PCMFit. If this does not happen, consult the packages’ web-pages + (links above). + + - An optional but highly recommended dependency. The function + `PCMTreePlot` in the package PCMBase uses the R-package ggtree, + which is not on CRAN. It is highly recommended to install this + package in order to be able to visualize trees with colored parts + corresponding to defferent evolutionary regimes. If `ggtree` is not + installed, the figures in the vignettes and coding examples cannot + be generated. At the time of writing this documentation, `ggtree` + can be installed from bioconductor through the following code (if + that does not work, check the [ggtree home + page](https://guangchuangyu.github.io/software/ggtree/)): + + ``` r if (!requireNamespace("BiocManager", quietly = TRUE)) @@ -49,11 +130,9 @@ if (!requireNamespace("BiocManager", quietly = TRUE)) BiocManager::install("ggtree", version = "3.8") ``` -Installing PCMFit -================= +# Installing PCMFit -From Github ------------ +## From Github Currently the package can be installed from github using the command: @@ -61,20 +140,40 @@ Currently the package can be installed from github using the command: devtools::install_github("venelin/PCMFit") ``` -From CRAN ---------- - -Publishing PCMFit on CRAN is planned after release of the first stable and documented version. - -Parallel execution -================== - -Currently PCMFit implements parallel execution for the inference of mixed Gaussian phylogenetic models with unknown shifts. This is optional but highly recommended. To enable parallel execution, it is necessary to run PCMFit on a computer equipped with a multiple core processor or on multiple node computing cluster. In its current implementation, PCMFit uses the function `%dopar%` from the R-package [`foreach`](https://CRAN.R-project.org/package=foreach) to parallelize the execution of (nested) `foreach` loops. This parallelization has been tested using two parallel backends for the `%dopar%` function: - -- Using the R packages [`doMPI`](https://CRAN.R-project.org/package=doMPI) and [`Rmpi`](https://CRAN.R-project.org/package=Rmpi) on a multiple node cluster with `open_mpi/1.6.5` installed. In particular, MGPM inference has been run using up to 250 cores on the [ETH scientific computing cluster Euler](https://scicomp.ethz.ch/wiki/Euler). -- Using the R package [`doParallel`](https://CRAN.R-project.org/package=doParallel) on a MacBook Pro (Retina, 15-inch, Late 2013), 2.3 GHz Intel Core i7 processor (4 physical cores, 8 logical cores), running macOS Sierra 10.12.6. - -To install the above packages, follow the most recent instructions in their documentation (links to the packages web-pages provided above). Once you have installed the parallel backend of choice, you can paste/edit the following code snippet in the beginning of the R-script for running PCMFit model inference: +## From CRAN + +Publishing PCMFit on CRAN is planned after release of the first stable +and documented version. + +# Parallel execution + +Currently PCMFit implements parallel execution for the inference of +mixed Gaussian phylogenetic models with unknown shifts. This is optional +but highly recommended. To enable parallel execution, it is necessary to +run PCMFit on a computer equipped with a multiple core processor or on +multiple node computing cluster. In its current implementation, PCMFit +uses the function `%dopar%` from the R-package +[`foreach`](https://CRAN.R-project.org/package=foreach) to parallelize +the execution of (nested) `foreach` loops. This parallelization has been +tested using two parallel backends for the `%dopar%` function: + + - Using the R packages + [`doMPI`](https://CRAN.R-project.org/package=doMPI) and + [`Rmpi`](https://CRAN.R-project.org/package=Rmpi) on a multiple node + cluster with `open_mpi/1.6.5` installed. In particular, MGPM + inference has been run using up to 250 cores on the [ETH scientific + computing cluster Euler](https://scicomp.ethz.ch/wiki/Euler). + - Using the R package + [`doParallel`](https://CRAN.R-project.org/package=doParallel) on a + MacBook Pro (Retina, 15-inch, Late 2013), 2.3 GHz Intel Core i7 + processor (4 physical cores, 8 logical cores), running macOS Sierra + 10.12.6. + +To install the above packages, follow the most recent instructions in +their documentation (links to the packages web-pages provided above). +Once you have installed the parallel backend of choice, you can +paste/edit the following code snippet in the beginning of the R-script +for running PCMFit model inference: ``` r library(PCMBase) @@ -113,143 +212,287 @@ if(!exists("cluster") || is.null(cluster)) { } ``` -Finally, to tell PCMFit that it should run the inference in parallel, specify the argument `doParallel=TRUE` in calls to the function `PCMFitMixed`. A full example for this is provided in the user guide [Inferring an MGPM with Unknown Shifts](https://venelin.github.io/PCMFit/articles/pcmfitmixed.html). +Finally, to tell PCMFit that it should run the inference in parallel, +specify the argument `doParallel=TRUE` in calls to the function +`PCMFitMixed`. A full example for this is provided in the user guide +[Inferring an MGPM with Unknown +Shifts](https://venelin.github.io/PCMFit/articles/pcmfitmixed.html). + +# Resources + +A brief historical background and theoretical overview of PCMs can be +found in Chapter 1 of (Mitov 2018a). A more general introduction can be +found in (Harmon 2018). The research article “Automatic Generation of +Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models” +provides a general introduction to MGPMs and reports a real data example +and a simulation based comparison of MGPMs versus other implementations +of phylogenetic comparative models with shifts. The article is currently +undergoing peer review for a publication. + +The user guides and technical reference for the library are available on +the [PCMFit web-page](https://venelin.github.io/PCMFit/): + +**Note:** *The writing of the user gudes and [help +articles](https://venelin.github.io/PCMFit/reference/index.html) for +this package is in progress. Please, contact the author for assistance, +in case you need to use PCMFit right away and need help with the coding +examples. Thanks for your understanding.* + + - The [Getting + started](https://venelin.github.io/PCMFit/articles/pcmfit.html) + guide introduces mixed Gaussian phylogenetic models (MGPMs) and + provides an example how to use the function `PCMFit()` to infer such + models on a given tree and trait data. + - The [Inferring an MGPM with Unknown + Shifts](https://venelin.github.io/PCMFit/articles/pcmfitmixed.html) + guide shows how to use the function `PCMFitMixed()` to select the + best MGPM for a given tree and trait data, based on an information + loss function such as the Akaike information criterion (AIC) *(in + preparation)*. + - The [Performing Parametric Bootstrap of an + MGPM](https://venelin.github.io/PCMFit/articles/parambootstrap.html) + guide shows how to simulate and perform MGPM inference on parametric + bootstrap datasets in order to assess the uncertainty of a given + MGPM *(in preparation)*. + +The PCMFit source code is located in the [PCMFit github +repository](https://github.com/venelin/PCMFit). + +Feature requests, bugs, etc can be reported in the [PCMFit issues +list](https://github.com/venelin/PCMFit/issues). + +# Related tools + +PCMFit builds on top of a stack of three tools enabling fast likelihood +calculation and simulation of MGPMs: + + - The R-package [PCMBase](https://venelin.github.io/PCMBase/) + implements the specification, likelihood calculation and simulation + of MGPMs (Mitov et al. 2018). + - The auxiliary package + [PCMBaseCpp](https://github.com/venelin/PCMBaseCpp) provides a fast + C++ implementation of the likelihood calculation as described in + (Mitov et al. 2018). + - PCMBaseCpp relies on the C++ library + [SPLITT](https://github.com/venelin/SPLITT) implementing fast + traversal of phylogenetic trees (Mitov and Stadler 2018). + +# Citing PCMFit + +To acknowledge the PCMFit package in a publication or other +presentation, please cite: + +Mitov, V., Bartoszek, K., & Stadler, T. (2019). Automatic generation of +evolutionary hypotheses using mixed Gaussian phylogenetic models. +Proceedings of the National Academy of Sciences of the United States of +America, . + + @article{Mitov:2019agh, + title = {Automatic generation of evolutionary hypotheses using mixed Gaussian phylogenetic models}, + author = {Venelin Mitov and Krzysztof Bartoszek and Tanja Stadler}, + journal = {Proceedings of the National Academy of Sciences of the United States of America}, + year = {2019}, + url = {https://www.pnas.org/lookup/doi/10.1073/pnas.1813823116}, + } -Resources -========= +# Used software packages -A brief historical background and theoretical overview of PCMs can be found in Chapter 1 of (Mitov 2018a). A more general introduction can be found in (Harmon 2018). The research article "Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models" provides a general introduction to MGPMs and reports a real data example and a simulation based comparison of MGPMs versus other implementations of phylogenetic comparative models with shifts. The article is currently undergoing peer review for a publication. +Although, I have been consistent in my effort to update the following +list with any new package I have used in developing and testing PCMFit, +chances are that I have omitted some of these tools. I apologise to +their authors. -The user guides and technical reference for the library are available on the [PCMFit web-page](https://venelin.github.io/PCMFit/): +The PCMFit R-package uses the following 3rd party R-packages: -**Note:** *The writing of the user gudes and [help articles](https://venelin.github.io/PCMFit/reference/index.html) for this package is in progress. Please, contact the author for assistance, in case you need to use PCMFit right away and need help with the coding examples. Thanks for your understanding.* + - For tree processing in R: ape v5.3 (Paradis et al. 2019), data.table + v1.12.2 (Dowle and Srinivasan 2019), PCMBase v1.2.10 (Mitov 2019b); + - For specification and manipulation of models in R: PCMBase v1.2.10 + (Mitov 2019b), PCMBaseCpp v0.1.4 (Mitov 2019a), SPLITT v1.2.1 (Mitov + 2018b); + - For data processing in R: data.table v1.12.2 (Dowle and Srinivasan + 2019); + - For parallel execution: iterators v1.0.10 (Analytics and Weston + 2018), foreach v1.4.4 (Revolution Analytics and Weston, n.d.), + doParallel v1.0.14 (Corporation and Weston 2018); + - For algebraic computation: expm v0.999.4 (Goulet et al. 2019), + mvtnorm v1.0.11 (Genz et al. 2019); + - For plotting: ggtree v1.15.3 (Yu and Lam 2019), ggplot2 v3.2.0 + (Wickham et al. 2018), cowplot v1.0.0 (Wilke 2019); + - For unit-testing: testthat v2.1.1 (Wickham 2018); + - For documentation and web-site generation: roxygen2 v6.1.1 (Wickham, + Danenberg, and Eugster 2018), pkgdown v1.3.0 (Wickham and + Hesselberth 2018); -- The [Getting started](https://venelin.github.io/PCMFit/articles/pcmfit.html) guide introduces mixed Gaussian phylogenetic models (MGPMs) and provides an example how to use the function `PCMFit()` to infer such models on a given tree and trait data. -- The [Inferring an MGPM with Unknown Shifts](https://venelin.github.io/PCMFit/articles/pcmfitmixed.html) guide shows how to use the function `PCMFitMixed()` to select the best MGPM for a given tree and trait data, based on an information loss function such as the Akaike information criterion (AIC) *(in preparation)*. -- The [Performing Parametric Bootstrap of an MGPM](https://venelin.github.io/PCMFit/articles/parambootstrap.html) guide shows how to simulate and perform MGPM inference on parametric bootstrap datasets in order to assess the uncertainty of a given MGPM *(in preparation)*. +# References -The PCMFit source code is located in the [PCMFit github repository](https://github.com/venelin/PCMFit). +
-Feature requests, bugs, etc can be reported in the [PCMFit issues list](https://github.com/venelin/PCMFit/issues). +
-Related tools -============= +Analytics, Revolution, and Steve Weston. 2018. *Iterators: Provides +Iterator Construct for R*. +. -PCMFit builds on top of a stack of three tools enabling fast likelihood calculation and simulation of MGPMs: +
-- The R-package [PCMBase](https://venelin.github.io/PCMBase/) implements the specification, likelihood calculation and simulation of MGPMs (Mitov et al. 2018). -- The auxiliary package [PCMBaseCpp](https://github.com/venelin/PCMBaseCpp) provides a fast C++ implementation of the likelihood calculation as described in (Mitov et al. 2018). -- PCMBaseCpp relies on the C++ library [SPLITT](https://github.com/venelin/SPLITT) implementing fast traversal of phylogenetic trees (Mitov and Stadler 2018). +
-Citing PCMFit -============= +Corporation, Microsoft, and Steve Weston. 2018. *DoParallel: Foreach +Parallel Adaptor for the ’Parallel’ Package*. +. -To acknowledge the PCMFit package in a publication or other presentation, please cite one of the following: +
-- Mitov, V. (2018). Phylogenetic Comparative Methods in the Era of Big Data, Doctoral Thesis No. [25428](https://doi.org/10.3929/ethz-b-000315296) ETH Zurich. +
- @phdthesis{Mitov:2018hh, - author = {Mitov, Venelin}, - title = {{Phylogenetic Comparative Methods in the Era of Big Data}}, - school = {ETH Zurich}, - year = {2018}, - address = {Zurich}, - url = {https://doi.org/10.3929/ethz-b-000315296}, - } +Dowle, Matt, and Arun Srinivasan. 2019. *Data.table: Extension of +‘Data.frame‘*. . - Mitov, V., Bartoszek, K., & Stadler, T. (2018). Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models. Article in Review, preprint available in Chapter 7, Doctoral Thesis No. [25428](https://doi.org/10.3929/ethz-b-000315296) ETH Zurich. +
- @article{Mitov:2019agh, - author = {Mitov, Venelin and Bartoszek, Krzysztof and Stadler, Tanja}, - title = {{Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models}}, - journal = {article in review, preprint in Ch. 7 of Thesis No. 25428, ETH Zurich}, - year = {2019}, - pages = {}, - month = mai, - url = {https://doi.org/10.3929/ethz-b-000315296} - } - -- Mitov, V., Bartoszek, K., Asimomitis, G., & Stadler, T. (2018, September 24). Fast likelihood evaluation for multivariate phylogenetic comparative methods: the PCMBase R package. arXiv.org. . - - @article{Mitov:2018fl, - author = {Mitov, Venelin and Bartoszek, Krzysztof and Asimomitis, Georgios and Stadler, Tanja}, - title = {{Fast likelihood evaluation for multivariate phylogenetic comparative methods: the PCMBase R package}}, - journal = {arXiv.org}, - year = {2018}, - eprint = {1809.09014}, - eprinttype = {arxiv}, - eprintclass = {q-bio.PE}, - pages = {arXiv:1809.09014}, - month = sep, - annote = {34 pages, 6 figures} - } - -- Mitov, V., & Stadler, T. (2018). Parallel Likelihood Calculation for Phylogenetic Comparative Models: the SPLITT C++ Library. Methods in Ecology and Evolution, 2041--210X.13136. - - @article{Mitov:2018dqa, - author = {Mitov, Venelin and Stadler, Tanja}, - title = {{Parallel likelihood calculation for phylogenetic comparative models: The SPLITT C++ library}}, - journal = {Methods in Ecology and Evolution}, - year = {2018}, - pages = {2041--210X.13136}, - month = dec - } - -Used software packages -====================== - -Although, I have been consistent in my effort to update the following list with any new package I have used in developing and testing PCMFit, chances are that I have omitted some of these tools. I apologise to their authors. +
-The PCMFit R-package uses the following 3rd party R-packages: +Genz, Alan, Frank Bretz, Tetsuhisa Miwa, Xuefei Mi, and Torsten Hothorn. +2019. *Mvtnorm: Multivariate Normal and T Distributions*. +. + +
+ +
+ +Goulet, Vincent, Christophe Dutang, Martin Maechler, David Firth, Marina +Shapira, and Michael Stadelmann. 2019. *Expm: Matrix Exponential, Log, +’Etc’*. . + +
+ +
+ +Harmon, Luke J. 2018. *Phylogenetic Comparative Methods*. Learning from +Trees. . + +
+ +
+ +Mitov, Venelin. 2018a. “Phylogenetic Comparative Methods in the Era of +Big Data.” PhD thesis, Zurich: ETH Zurich. +. + +
+ +
+ +———. 2018b. *SPLITT: A Generic Library for Serial and Parallel Lineage +Traversal of Trees*. + +
+ +
+ +———. 2019a. *PCMBaseCpp: A C++ Implementation of Parallel Likelihood +Calculation for Phylogenetic Comparative Models*. + +
+ +
+ +———. 2019b. *PCMBase: Simulation and Likelihood Calculation of +Phylogenetic Comparative Models*. +. + +
+ +
+ +Mitov, Venelin, Krzysztof Bartoszek, Georgios Asimomitis, and Tanja +Stadler. 2018. “Fast likelihood evaluation for multivariate phylogenetic +comparative methods: the PCMBase R package.” *arXiv.org*, September, +arXiv:1809.09014. . + +
+ +
+ +Mitov, Venelin, Krzysztof Bartoszek, and Tanja Stadler. 2019. “Automatic +Generation of Evolutionary Hypotheses Using Mixed Gaussian Phylogenetic +Models.” *Proceedings of the National Academy of Sciences of the United +States of America*. +. + +
+ +
+ +Mitov, Venelin, and Tanja Stadler. 2018. “Parallel likelihood +calculation for phylogenetic comparative models: The SPLITT C++ +library.” *Methods in Ecology and Evolution*, December, +2041–210X.13136. + +
+ +
+ +Paradis, Emmanuel, Simon Blomberg, Ben Bolker, Joseph Brown, Julien +Claude, Hoa Sien Cuong, Richard Desper, et al. 2019. *Ape: Analyses of +Phylogenetics and Evolution*. . + +
-- For tree processing in R: ape v5.3 (Paradis et al. 2019), data.table v1.12.0 (Dowle and Srinivasan 2019), PCMBase v1.2.10 (Mitov 2019a); -- For specification and manipulation of models in R: PCMBase v1.2.10 (Mitov 2019a), PCMBaseCpp v0.1.4 (Mitov 2019b), SPLITT v1.2.1 (Mitov 2018b); -- For data processing in R: data.table v1.12.0 (Dowle and Srinivasan 2019); -- For parallel execution: iterators v1.0.10 (Analytics and Weston 2018), foreach v1.4.4 (Revolution Analytics and Weston, n.d.), doParallel v1.0.14 (Corporation and Weston 2018); -- For algebraic computation: expm v0.999.4 (Goulet et al. 2019), mvtnorm v1.0.10 (Genz et al. 2019); -- For plotting: ggtree v1.14.6 (Yu and Lam 2019), ggplot2 v3.1.0 (Wickham et al. 2018), cowplot v0.9.4 (Wilke 2019); -- For unit-testing: testthat v2.0.1 (Wickham 2018); -- For documentation and web-site generation: roxygen2 v6.1.1 (Wickham, Danenberg, and Eugster 2018), pkgdown v1.3.0 (Wickham and Hesselberth 2018); +
-References -========== +Revolution Analytics, and Steve Weston. n.d. *Foreach: Provides Foreach +Looping Construct for R*. -Analytics, Revolution, and Steve Weston. 2018. *Iterators: Provides Iterator Construct for R*. . +
-Corporation, Microsoft, and Steve Weston. 2018. *DoParallel: Foreach Parallel Adaptor for the ’Parallel’ Package*. . +
-Dowle, Matt, and Arun Srinivasan. 2019. *Data.table: Extension of ‘Data.frame‘*. . +Wickham, Hadley. 2018. *Testthat: Unit Testing for R*. +. -Genz, Alan, Frank Bretz, Tetsuhisa Miwa, Xuefei Mi, and Torsten Hothorn. 2019. *Mvtnorm: Multivariate Normal and T Distributions*. . +
-Goulet, Vincent, Christophe Dutang, Martin Maechler, David Firth, Marina Shapira, and Michael Stadelmann. 2019. *Expm: Matrix Exponential, Log, ’Etc’*. . +
-Harmon, Luke J. 2018. *Phylogenetic Comparative Methods*. Learning from Trees. . +Wickham, Hadley, Winston Chang, Lionel Henry, Thomas Lin Pedersen, +Kohske Takahashi, Claus Wilke, and Kara Woo. 2018. *Ggplot2: Create +Elegant Data Visualisations Using the Grammar of Graphics*. +. -Mitov, Venelin. 2018a. “Phylogenetic Comparative Methods in the Era of Big Data.” PhD thesis, Zurich: ETH Zurich. . +
-———. 2018b. *SPLITT: A Generic Library for Serial and Parallel Lineage Traversal of Trees*. +
-———. 2019a. *PCMBase: Simulation and Likelihood Calculation of Phylogenetic Comparative Models*. . +Wickham, Hadley, Peter Danenberg, and Manuel Eugster. 2018. *Roxygen2: +In-Line Documentation for R*. +. -———. 2019b. *PCMBaseCpp: A C++ Implementation of Parallel Likelihood Calculation for Phylogenetic Comparative Models*. +
-Mitov, Venelin, and Tanja Stadler. 2018. “Parallel likelihood calculation for phylogenetic comparative models: The SPLITT C++ library.” *Methods in Ecology and Evolution*, December, 2041–210X.13136. +
-Mitov, Venelin, Krzysztof Bartoszek, Georgios Asimomitis, and Tanja Stadler. 2018. “Fast likelihood evaluation for multivariate phylogenetic comparative methods: the PCMBase R package.” *arXiv.org*, September, arXiv:1809.09014. . +Wickham, Hadley, and Jay Hesselberth. 2018. *Pkgdown: Make Static Html +Documentation for a Package*. +. -Paradis, Emmanuel, Simon Blomberg, Ben Bolker, Joseph Brown, Julien Claude, Hoa Sien Cuong, Richard Desper, et al. 2019. *Ape: Analyses of Phylogenetics and Evolution*. . +
-Revolution Analytics, and Steve Weston. n.d. *Foreach: Provides Foreach Looping Construct for R*. +
-Wickham, Hadley. 2018. *Testthat: Unit Testing for R*. . +Wilke, Claus O. 2019. *Cowplot: Streamlined Plot Theme and Plot +Annotations for ’Ggplot2’*. +. -Wickham, Hadley, and Jay Hesselberth. 2018. *Pkgdown: Make Static Html Documentation for a Package*. . +
-Wickham, Hadley, Winston Chang, Lionel Henry, Thomas Lin Pedersen, Kohske Takahashi, Claus Wilke, and Kara Woo. 2018. *Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics*. . +
-Wickham, Hadley, Peter Danenberg, and Manuel Eugster. 2018. *Roxygen2: In-Line Documentation for R*. . +Yu, Guangchuang, and Tommy Tsan-Yuk Lam. 2019. *Ggtree: An R Package for +Visualization and Annotation of Phylogenetic Trees with Their Covariates +and Other Associated Data*. +. -Wilke, Claus O. 2019. *Cowplot: Streamlined Plot Theme and Plot Annotations for ’Ggplot2’*. . +
-Yu, Guangchuang, and Tommy Tsan-Yuk Lam. 2019. *Ggtree: An R Package for Visualization and Annotation of Phylogenetic Trees with Their Covariates and Other Associated Data*. . +
diff --git a/docs/articles/index.html b/docs/articles/index.html index 6d2a4ef..4c55607 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -70,7 +70,7 @@ PCMFit - 0.6.0 + 1.0.0 diff --git a/docs/articles/parambootstrap.html b/docs/articles/parambootstrap.html index e6d9898..1e91cbf 100644 --- a/docs/articles/parambootstrap.html +++ b/docs/articles/parambootstrap.html @@ -36,7 +36,7 @@ PCMFit - 0.6.0 + 1.0.0 @@ -88,7 +88,7 @@

Parametric Bootstrap of an MGPM

Venelin Mitov

-

2019-04-02

+

2019-07-18

Source: vignettes/parambootstrap.Rmd @@ -101,53 +101,53 @@

2019-04-02

Simulating parametric bootstrap datasets

-
library(data.table)
-library(PCMBase)
-library(PCMFit)
-
-# make results reproducible
-set.seed(12, kind = "Mersenne-Twister", normal.kind = "Inversion")
-
-numBootstraps <- 10
-
-bestModel <- 
-  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel
-
-metaI <- PCMInfo(
-  X = NULL, tree = attr(bestModel, "tree"), model = bestModel)
-
-# Simulate data with the best fit model from the mammal data with SEs
-valuesBootstrapMGPM_A_F_BC2_RR <- data.table(
-  IdGlob = seq_len(numBootstraps),
-  X = lapply(seq_len(numBootstraps), function(id) {
-    PCMSim(
-      tree = attr(bestModel, "tree"), 
-      model = bestModel, 
-      X0 = bestModel$X0,
-      metaI = metaI)
-  })
-)
-
-options(PCMBase.Value.NA = -1e20)
-options(PCMBase.Lmr.mode = 11)
-options(PCMBase.Threshold.EV = 1e-7)
-
-valuesBootstrapMGPM_A_F_BC2_RR[
-  , c("df", "logLik", "score") := {
-    resLists <- lapply(.I, function(i) {
-      attr(bestModel, "X") <- X[[i]]
-      ll <- logLik(bestModel)
-      aic <- AIC(bestModel)
-      df <- attr(ll, "df")
-      c(df, ll, aic)
-    })
-    list(df = sapply(resLists, function(.) .[1]),
-         logLik = sapply(resLists, function(.) .[2]),
-         score = sapply(resLists, function(.) .[3]))
-  }]
-
-PCMFitDemoObjects$valuesBootstrapMGPM_A_F_BC2_RR <- valuesBootstrapMGPM_A_F_BC2_RR
-usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)
+
library(data.table)
+library(PCMBase)
+library(PCMFit)
+
+# make results reproducible
+set.seed(12, kind = "Mersenne-Twister", normal.kind = "Inversion")
+
+numBootstraps <- 10
+
+bestModel <- 
+  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel
+
+metaI <- PCMInfo(
+  X = NULL, tree = attr(bestModel, "tree"), model = bestModel)
+
+# Simulate data with the best fit model from the mammal data with SEs
+valuesBootstrapMGPM_A_F_BC2_RR <- data.table(
+  IdGlob = seq_len(numBootstraps),
+  X = lapply(seq_len(numBootstraps), function(id) {
+    PCMSim(
+      tree = attr(bestModel, "tree"), 
+      model = bestModel, 
+      X0 = bestModel$X0,
+      metaI = metaI)
+  })
+)
+
+options(PCMBase.Value.NA = -1e20)
+options(PCMBase.Lmr.mode = 11)
+options(PCMBase.Threshold.EV = 1e-7)
+
+valuesBootstrapMGPM_A_F_BC2_RR[
+  , c("df", "logLik", "score") := {
+    resLists <- lapply(.I, function(i) {
+      attr(bestModel, "X") <- X[[i]]
+      ll <- logLik(bestModel)
+      aic <- AIC(bestModel)
+      df <- attr(ll, "df")
+      c(df, ll, aic)
+    })
+    list(df = sapply(resLists, function(.) .[1]),
+         logLik = sapply(resLists, function(.) .[2]),
+         score = sapply(resLists, function(.) .[3]))
+  }]
+
+PCMFitDemoObjects$valuesBootstrapMGPM_A_F_BC2_RR <- valuesBootstrapMGPM_A_F_BC2_RR
+usethis::use_data(PCMFitDemoObjects, overwrite = TRUE)

@@ -155,96 +155,96 @@

Writing a parametric bootstrap inference R-script

-
# File: FitBootstrap_MGPM_A_F_BC2_RR.R
-# Usage: R --vanilla --slave -f ../../FitBootstrap_MGPM_A_F_BC2_RR.R --args 1
-library(PCMBase)
-library(PCMBaseCpp)
-library(PCMFit)
-library(data.table)
-
-# extract dataset identifier and possibly other parameters from the command line:
-args <- commandArgs(trailingOnly = TRUE)
-if(length(args) > 0) {
-  data_id <- as.integer(args[1])
-} else {
-  data_id <- 1L
-}
-
-# A character string used in filenames for a model inference on a given data:
-prefixFiles = paste0("MGPM_A_F_BC2_RR_BSID_", data_id)
-
-# creating the cluster for this PCMFit run:
-if(!exists("cluster") || is.null(cluster)) {
-  if(require(doMPI)) {
-    # using MPI cluster as distributed node cluster (possibly running on a 
-    # cluster of multiple nodes)
-    # Get the number of cores. Assume this is run in a batch job.
-    p = strtoi(Sys.getenv('LSB_DJOB_NUMPROC'))
-    cluster <- startMPIcluster(count = p-1, verbose = TRUE)
-    doMPI::registerDoMPI(cluster)
-  } else {
-    # possibly running on personal computer without mpi installation
-    cluster <- parallel::makeCluster(
-      parallel::detectCores(logical = TRUE),
-      outfile = paste0("log_", prefixFiles, ".txt"))
-    doParallel::registerDoParallel(cluster)
-  }
-}
-
-# This function is going to be executed on each worker node.
-generatePCMModelsFunction <- function() {
-  # make results reproducible
-  set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")
-
-  PCMGenerateModelTypes()
-  fileName <- '../../DefineParameterLimits.R'
-  codeDefineLimits <- readChar(fileName, file.info(fileName)$size)
-  eval(parse(text = codeDefineLimits), .GlobalEnv)
-}
-
-bestModel <- 
-  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel
-
-tree <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
-X <- PCMFitDemoObjects$valuesBootstrapMGPM_A_F_BC2_RR$X[[data_id]][
-  , seq_len(PCMTreeNumTips(tree))]
-
-currentResultFile <- paste0("CurrentResults_fits_", prefixFiles, ".RData")
-if(file.exists(currentResultFile)) {
-  load(currentResultFile)
-  tableFitsPrev <- listResults$tableFits
-} else {
-  tableFitsPrev <- NULL
-}
-
-fitMGPM_A_F_BC2_RR <- PCMFitMixed(
-    X = X, tree = tree, metaIFun = PCMInfoCpp,
-    generatePCMModelsFun = generatePCMModelsFunction, 
-    maxNumRoundRobins = 2, maxNumPartitionsInRoundRobins = 2,
-    tableFitsPrev = tableFitsPrev,
-    prefixFiles = prefixFiles,
-    doParallel = TRUE)
-
-save(fitMGPM_A_F_BC2_RR, file = paste0("Result_", prefixFiles, ".RData"))
+
# File: FitBootstrap_MGPM_A_F_BC2_RR.R
+# Usage: R --vanilla --slave -f ../../FitBootstrap_MGPM_A_F_BC2_RR.R --args 1
+library(PCMBase)
+library(PCMBaseCpp)
+library(PCMFit)
+library(data.table)
+
+# extract dataset identifier and possibly other parameters from the command line:
+args <- commandArgs(trailingOnly = TRUE)
+if(length(args) > 0) {
+  data_id <- as.integer(args[1])
+} else {
+  data_id <- 1L
+}
+
+# A character string used in filenames for a model inference on a given data:
+prefixFiles = paste0("MGPM_A_F_BC2_RR_BSID_", data_id)
+
+# creating the cluster for this PCMFit run:
+if(!exists("cluster") || is.null(cluster)) {
+  if(require(doMPI)) {
+    # using MPI cluster as distributed node cluster (possibly running on a 
+    # cluster of multiple nodes)
+    # Get the number of cores. Assume this is run in a batch job.
+    p = strtoi(Sys.getenv('LSB_DJOB_NUMPROC'))
+    cluster <- startMPIcluster(count = p-1, verbose = TRUE)
+    doMPI::registerDoMPI(cluster)
+  } else {
+    # possibly running on personal computer without mpi installation
+    cluster <- parallel::makeCluster(
+      parallel::detectCores(logical = TRUE),
+      outfile = paste0("log_", prefixFiles, ".txt"))
+    doParallel::registerDoParallel(cluster)
+  }
+}
+
+# This function is going to be executed on each worker node.
+generatePCMModelsFunction <- function() {
+  # make results reproducible
+  set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")
+
+  PCMGenerateModelTypes()
+  fileName <- '../../DefineParameterLimits.R'
+  codeDefineLimits <- readChar(fileName, file.info(fileName)$size)
+  eval(parse(text = codeDefineLimits), .GlobalEnv)
+}
+
+bestModel <- 
+  RetrieveBestFitScore(PCMFitDemoObjects$fitMGPM_A_F_BC2_RR)$inferredModel
+
+tree <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
+X <- PCMFitDemoObjects$valuesBootstrapMGPM_A_F_BC2_RR$X[[data_id]][
+  , seq_len(PCMTreeNumTips(tree))]
+
+currentResultFile <- paste0("CurrentResults_fits_", prefixFiles, ".RData")
+if(file.exists(currentResultFile)) {
+  load(currentResultFile)
+  tableFitsPrev <- listResults$tableFits
+} else {
+  tableFitsPrev <- NULL
+}
+
+fitMGPM_A_F_BC2_RR <- PCMFitMixed(
+    X = X, tree = tree, metaIFun = PCMInfoCpp,
+    generatePCMModelsFun = generatePCMModelsFunction, 
+    maxNumRoundRobins = 2, maxNumPartitionsInRoundRobins = 2,
+    tableFitsPrev = tableFitsPrev,
+    prefixFiles = prefixFiles,
+    doParallel = TRUE)
+
+save(fitMGPM_A_F_BC2_RR, file = paste0("Result_", prefixFiles, ".RData"))

Running the parametric bootstrap inference scripter on a cluster

-
# File: FitBootstrap_MGPM_A_F_BC2_RR_bsub.sh
-# Usage: sh FitBootstrap_MGPM_A_F_BC2_RR_bsub.sh
-for id in `seq 1 1 10`
-do
-mkdir -p ResultsBootstrap/MGPM_A_F_BC2_RR_BSID_$id
-cd ResultsBootstrap/MGPM_A_F_BC2_RR_BSID_$id
-if [ -f "Result_MGPM_A_F_BC2_RR_BSID_"$id".RData" ]
-then
-rm MPI*.log
-rm CurrentResults*.RData
-else
-bsub -M 10000 -n 8 -W 3:59 -R ib sh R --vanilla --slave -f ../../FitBootstrap_MGPM_A_F_BC2_RR.R --args $id
-fi
-cd ../..
-done
+

diff --git a/docs/articles/pcmfit.html b/docs/articles/pcmfit.html index 36e7169..c2ed576 100644 --- a/docs/articles/pcmfit.html +++ b/docs/articles/pcmfit.html @@ -36,7 +36,7 @@ PCMFit - 0.6.0 + 1.0.0
@@ -88,7 +88,7 @@

Getting started with the PCMFit R-package

Venelin Mitov

-

2019-04-02

+

2019-07-18

Source: vignettes/pcmfit.Rmd @@ -107,15 +107,15 @@

2019-04-02

  • Evaluating the inferred model.
  • Before we start, let’s load the packages needed for this tutorial:

    -
    library(PCMBase)
    -library(PCMFit)
    -library(data.table)
    -library(ggplot2)
    -library(ggtree)
    -library(cowplot)
    -
    -# make results reproducible
    -set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")
    +
    library(PCMBase)
    +library(PCMFit)
    +library(data.table)
    +library(ggplot2)
    +library(ggtree)
    +library(cowplot)
    +
    +# make results reproducible
    +set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")

    In the next sections, we go through each of the above steps:

    @@ -124,42 +124,42 @@

    The tree

    -
    tree <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
    -tree
    -#> 
    -#> Phylogenetic tree with 80 tips and 79 internal nodes.
    -#> 
    -#> Tip labels:
    -#>  1, 2, 3, 4, 5, 6, ...
    -#> Node labels:
    -#>  81, 82, 83, 84, 85, 86, ...
    -#> 
    -#> Rooted; includes branch lengths.
    +

    The data

    -
    X <- PCMFitDemoObjects$dtSimulated$X[[1]][, seq_len(PCMTreeNumTips(tree))]
    -dim(X)
    -#> [1]  2 80
    +

    We use the PCMBase::PCMTreePlot() and PCMBase::PCMPlotTraitData2D() functions to generate plots of the tree and the data (note that PCMBase::PCMTreePlot() requires the ggtree R-package which is currently not on CRAN - see the ggtree home page for instructions how to install this package):

    -
    plTree <- PCMTreePlot(tree, layout="fan") +
    -  geom_tiplab2(size = 2) + 
    -  geom_nodelab(size = 2, color = "black") + 
    -  geom_treescale(width = max(PCMTreeNodeTimes(tree)), x = 0, linesize = .25, fontsize = 2, offset = 79)
    -
    -plX <- PCMPlotTraitData2D(
    -  X[, seq_len(PCMTreeNumTips(tree))], 
    -  tree, 
    -  scaleSizeWithTime = FALSE) +
    -  geom_text(
    -    aes(x = x, y = y, label = id, color = regime), 
    -    size=2, 
    -    position = position_jitter(.4, .4)) +
    -  theme_bw() +
    -  theme(legend.position = "bottom")
    -
    -cowplot::plot_grid(plTree, plX, labels = LETTERS[1:2])
    +
    plTree <- PCMTreePlot(tree, layout="fan") +
    +  geom_tiplab2(size = 2) + 
    +  geom_nodelab(size = 2, color = "black") + 
    +  geom_treescale(width = max(PCMTreeNodeTimes(tree)), x = 0, linesize = .25, fontsize = 2, offset = 79)
    +
    +plX <- PCMPlotTraitData2D(
    +  X[, seq_len(PCMTreeNumTips(tree))], 
    +  tree, 
    +  scaleSizeWithTime = FALSE) +
    +  geom_text(
    +    aes(x = x, y = y, label = id, color = regime), 
    +    size=2, 
    +    position = position_jitter(.4, .4)) +
    +  theme_bw() +
    +  theme(legend.position = "bottom")
    +
    +cowplot::plot_grid(plTree, plX, labels = LETTERS[1:2])
    **Example phylogenetic comparative data.** **A**: a tree of 80 tips. **B**:  bivariate trait values at the tips of the tree.

    Example phylogenetic comparative data. A: a tree of 80 tips. B: bivariate trait values at the tips of the tree. @@ -171,21 +171,21 @@

    Step 2. Gaussian model types

    In this tutorial, we use the six model types \(BM_{A}\), \(BM_{B}\), \(OU_{C}\), \(OU_{D}\), \(OU_{E}\), \(OU_{F}\) from the \(\mathcal{G}_{LInv}\)-family as defined in (Mitov, Bartoszek, and Stadler 2019). These model types are also described in the PCMBase Getting started guide:

    -
    # scroll to the right in the following listing to see the aliases for the six 
    -# default model types:
    -PCMDefaultModelTypes()
    -#>                                                                                                                                                                                A 
    -#>                                                                                                      "BM__Global_X0__Diagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x" 
    -#>                                                                                                                                                                                B 
    -#>                                                                                   "BM__Global_X0__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x" 
    -#>                                                                                                                                                                                C 
    -#>                                       "OU__Global_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__Diagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x" 
    -#>                                                                                                                                                                                D 
    -#>                    "OU__Global_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x" 
    -#>                                                                                                                                                                                E 
    -#> "OU__Global_X0__Schur_UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x" 
    -#>                                                                                                                                                                                F 
    -#>                             "OU__Global_X0__Schur_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x"
    +

    Our purpose for this tutorial will be to select an optimal model for the data. We will compare “global” models in which one model of one of the above model types is fit to the entire tree and data, versus a mixed Gaussian phylogenetic model where different models of the above model types are assigned to different parts of the tree.

    @@ -197,57 +197,57 @@

    A global \(BM_{B}\) model

    -
    modelBM <- PCM(
    -  PCMDefaultModelTypes()["B"], modelTypes = PCMDefaultModelTypes(), k = 2)
    -
    -modelBM
    -#> Brownian motion model
    -#> S3 class: BM__Global_X0__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, BM, GaussianPCM, PCM; k=2; p=5; regimes: 1. Parameters/sub-models:
    -#> X0 (VectorParameter, _Global, numeric; trait values at the root):
    -#> [1] 0 0
    -#> Sigma_x (MatrixParameter, _UpperTriangularWithDiagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,]    0    0
    -#> [2,]    0    0
    -#> 
    -#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    -#> 
    -#> 
    +

    A global \(OU_{F}\) model

    -
    modelOU <- PCM(
    -  PCMDefaultModelTypes()["F"], modelTypes = PCMDefaultModelTypes(), k = 2)
    -
    -modelOU
    -#> Ornstein-Uhlenbeck model
    -#> S3 class: OU__Global_X0__Schur_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable; k=2; p=11; regimes: 1. Parameters/sub-models:
    -#> X0 (VectorParameter, _Global, numeric; trait values at the root):
    -#> [1] 0 0
    -#> H (MatrixParameter, _Schur, _WithNonNegativeDiagonal, _Transformable; adaptation rate matrix):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,]    0    0
    -#> [2,]    0    0
    -#> 
    -#> Theta (VectorParameter, matrix; long-term optimum):
    -#>      1
    -#> [1,] 0
    -#> [2,] 0
    -#> Sigma_x (MatrixParameter, _UpperTriangularWithDiagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,]    0    0
    -#> [2,]    0    0
    -#> 
    -#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    -#> 
    -#> 
    +

    @@ -256,140 +256,140 @@

    A MGPM with the true shift-point configuration and model type mapping

    -
    modelTrueTypeMapping <- MixedGaussian(
    -  k = 2,
    -  modelTypes = MGPMDefaultModelTypes(),
    -  mapping = c(4, 3, 2),
    -  X0 = structure(
    -    0, class = c("VectorParameter",
    -                 "_Global"), description = "trait values at the root"), 
    -  Sigmae_x = structure(
    -    0, class = c("MatrixParameter", "_Omitted",
    -                 "_Global"),
    -    description =
    -      "Upper triangular Choleski factor of the non-phylogenetic variance-covariance"))
    -
    -treeWithTrueShifts <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
    -PCMTreeSetPartRegimes(
    -  treeWithTrueShifts, 
    -  part.regime = c(`81` = 1, `105` = 2, `125` = 3), 
    -  setPartition = TRUE)
    +

    A MGPM with the true shift-point configuration, model type mapping and parameter values

    -
    modelTrue <- PCMFitDemoObjects$dtSimulated$model[[1]]
    -modelTrue
    -#> Mixed Gaussian model
    -#> S3 class: MixedGaussian_432, MixedGaussian, GaussianPCM, PCM, _Transformable; k=2; p=18; regimes: 1, 2, 3. Parameters/sub-models:
    -#> X0 (VectorParameter, _Global, numeric; trait values at the root):
    -#> [1]  1 -1
    -#> 1 (OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable):
    -#> Ornstein-Uhlenbeck model
    -#> S3 class: OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable; k=2; p=7; regimes: 1. Parameters/sub-models:
    -#> X0 (VectorParameter, _Omitted; trait values at the root):
    -#> 
    -#> H (MatrixParameter, _Schur, _Diagonal, _WithNonNegativeDiagonal, _Transformable; adaptation rate matrix):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,] 1.39 0.00
    -#> [2,] 0.00 0.59
    -#> 
    -#> Theta (VectorParameter, matrix; long-term optimum):
    -#>         1
    -#> [1,] 5.94
    -#> [2,] 2.58
    -#> Sigma_x (MatrixParameter, _UpperTriangularWithDiagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,] 0.29 0.02
    -#> [2,] 0.00 0.36
    -#> 
    -#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    -#> 
    -#>  
    -#> 2 (OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__Diagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable):
    -#> Ornstein-Uhlenbeck model
    -#> S3 class: OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__Diagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable; k=2; p=6; regimes: 1. Parameters/sub-models:
    -#> X0 (VectorParameter, _Omitted; trait values at the root):
    -#> 
    -#> H (MatrixParameter, _Schur, _Diagonal, _WithNonNegativeDiagonal, _Transformable; adaptation rate matrix):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,] 0.37 0.00
    -#> [2,] 0.00 1.06
    -#> 
    -#> Theta (VectorParameter, matrix; long-term optimum):
    -#>         1
    -#> [1,] 5.41
    -#> [2,] 2.63
    -#> Sigma_x (MatrixParameter, _Diagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,] 0.36  0.0
    -#> [2,] 0.00  0.4
    -#> 
    -#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    -#> 
    -#>  
    -#> 3 (BM__Omitted_X0__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, BM, GaussianPCM, PCM):
    -#> Brownian motion model
    -#> S3 class: BM__Omitted_X0__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, BM, GaussianPCM, PCM; k=2; p=3; regimes: 1. Parameters/sub-models:
    -#> X0 (VectorParameter, _Omitted; trait values at the root):
    -#> 
    -#> Sigma_x (MatrixParameter, _UpperTriangularWithDiagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    -#> , , 1
    -#> 
    -#>      [,1] [,2]
    -#> [1,] 0.17 0.14
    -#> [2,] 0.00 0.50
    -#> 
    -#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    -#> 
    -#>  
    -#> Sigmae_x (MatrixParameter, _Omitted; upper triangular Choleski factor of the non-phylogenetic variance-covariance):
    -#> 
    -#> 
    -
    -# We specify the tree and trait values for the true model in order to easily 
    -# calculate parameter count likelihood and AIC for it:
    -attr(modelTrue, "tree") <- treeWithTrueShifts
    -attr(modelTrue, "X") <- X
    -attr(modelTrue, "SE") <- X * 0.0
    +
    modelTrue <- PCMFitDemoObjects$dtSimulated$model[[1]]
    +modelTrue
    +#> Mixed Gaussian model
    +#> S3 class: MixedGaussian_432, MixedGaussian, GaussianPCM, PCM, _Transformable; k=2; p=18; regimes: 1, 2, 3. Parameters/sub-models:
    +#> X0 (VectorParameter, _Global, numeric; trait values at the root):
    +#> [1]  1 -1
    +#> 1 (OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable):
    +#> Ornstein-Uhlenbeck model
    +#> S3 class: OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable; k=2; p=7; regimes: 1. Parameters/sub-models:
    +#> X0 (VectorParameter, _Omitted; trait values at the root):
    +#> 
    +#> H (MatrixParameter, _Schur, _Diagonal, _WithNonNegativeDiagonal, _Transformable; adaptation rate matrix):
    +#> , , 1
    +#> 
    +#>      [,1] [,2]
    +#> [1,] 1.39 0.00
    +#> [2,] 0.00 0.59
    +#> 
    +#> Theta (VectorParameter, matrix; long-term optimum):
    +#>         1
    +#> [1,] 5.94
    +#> [2,] 2.58
    +#> Sigma_x (MatrixParameter, _UpperTriangularWithDiagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    +#> , , 1
    +#> 
    +#>      [,1] [,2]
    +#> [1,] 0.29 0.02
    +#> [2,] 0.00 0.36
    +#> 
    +#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    +#> 
    +#>  
    +#> 2 (OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__Diagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable):
    +#> Ornstein-Uhlenbeck model
    +#> S3 class: OU__Omitted_X0__Schur_Diagonal_WithNonNegativeDiagonal_Transformable_H__Theta__Diagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, OU, GaussianPCM, PCM, _Transformable; k=2; p=6; regimes: 1. Parameters/sub-models:
    +#> X0 (VectorParameter, _Omitted; trait values at the root):
    +#> 
    +#> H (MatrixParameter, _Schur, _Diagonal, _WithNonNegativeDiagonal, _Transformable; adaptation rate matrix):
    +#> , , 1
    +#> 
    +#>      [,1] [,2]
    +#> [1,] 0.37 0.00
    +#> [2,] 0.00 1.06
    +#> 
    +#> Theta (VectorParameter, matrix; long-term optimum):
    +#>         1
    +#> [1,] 5.41
    +#> [2,] 2.63
    +#> Sigma_x (MatrixParameter, _Diagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    +#> , , 1
    +#> 
    +#>      [,1] [,2]
    +#> [1,] 0.36  0.0
    +#> [2,] 0.00  0.4
    +#> 
    +#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    +#> 
    +#>  
    +#> 3 (BM__Omitted_X0__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, BM, GaussianPCM, PCM):
    +#> Brownian motion model
    +#> S3 class: BM__Omitted_X0__UpperTriangularWithDiagonal_WithNonNegativeDiagonal_Sigma_x__Omitted_Sigmae_x, BM, GaussianPCM, PCM; k=2; p=3; regimes: 1. Parameters/sub-models:
    +#> X0 (VectorParameter, _Omitted; trait values at the root):
    +#> 
    +#> Sigma_x (MatrixParameter, _UpperTriangularWithDiagonal, _WithNonNegativeDiagonal; Choleski factor of the unit-time variance rate):
    +#> , , 1
    +#> 
    +#>      [,1] [,2]
    +#> [1,] 0.17 0.14
    +#> [2,] 0.00 0.50
    +#> 
    +#> Sigmae_x (MatrixParameter, _Omitted; Choleski factor of the non-heritable variance or the variance of the measurement error):
    +#> 
    +#>  
    +#> Sigmae_x (MatrixParameter, _Omitted; upper triangular Choleski factor of the non-phylogenetic variance-covariance):
    +#> 
    +#> 
    +
    +# We specify the tree and trait values for the true model in order to easily 
    +# calculate parameter count likelihood and AIC for it:
    +attr(modelTrue, "tree") <- treeWithTrueShifts
    +attr(modelTrue, "X") <- X
    +attr(modelTrue, "SE") <- X * 0.0

    A view of the data according to the true shift-point configuration

    -
    
    -plTree <- PCMTreePlot(treeWithTrueShifts, layout="fan") %<+% 
    -  data.table(
    -    node = c(12, 77, 45), 
    -    part.model = c(" 1.D ", " 2.C ", " 3.B "),
    -    offset = 5) + 
    -  geom_tiplab2(size = 2) + 
    -  geom_tiplab2(aes(label = part.model), offset = 16) + 
    -  geom_nodelab(size = 2, color = "black") + 
    -  geom_treescale(
    -    width = max(PCMTreeNodeTimes(treeWithTrueShifts)), x = 0, 
    -    linesize = .25, fontsize = 2, offset = 79)
    -
    -plX <- PCMPlotTraitData2D(
    -  X[, seq_len(PCMTreeNumTips(treeWithTrueShifts))], 
    -  treeWithTrueShifts, 
    -  scaleSizeWithTime = FALSE) +
    -  geom_text(
    -    aes(x = x, y = y, label = id, color = regime), 
    -    size=2, 
    -    position = position_jitter(.4, .4)) +
    -  theme_bw() +
    -  theme(legend.position = "bottom")
    -
    -cowplot::plot_grid(plTree, plX, labels = LETTERS[1:2])
    +
    
    +plTree <- PCMTreePlot(treeWithTrueShifts, layout="fan") %<+% 
    +  data.table(
    +    node = c(12, 77, 45), 
    +    part.model = c(" 1.D ", " 2.C ", " 3.B "),
    +    offset = 5) + 
    +  geom_tiplab2(size = 2) + 
    +  geom_tiplab2(aes(label = part.model), offset = 16) + 
    +  geom_nodelab(size = 2, color = "black") + 
    +  geom_treescale(
    +    width = max(PCMTreeNodeTimes(treeWithTrueShifts)), x = 0, 
    +    linesize = .25, fontsize = 2, offset = 79)
    +
    +plX <- PCMPlotTraitData2D(
    +  X[, seq_len(PCMTreeNumTips(treeWithTrueShifts))], 
    +  treeWithTrueShifts, 
    +  scaleSizeWithTime = FALSE) +
    +  geom_text(
    +    aes(x = x, y = y, label = id, color = regime), 
    +    size=2, 
    +    position = position_jitter(.4, .4)) +
    +  theme_bw() +
    +  theme(legend.position = "bottom")
    +
    +cowplot::plot_grid(plTree, plX, labels = LETTERS[1:2])
    **Example phylogenetic comparative data.** **A**: a tree of 80 tips partitioned in three evolutionary regimes. Each evolutionary regime is denoted by #.T, where # is the regime identifier and T is the evolutionary model type associated with this regime (a model type among A, ..., F). **B**:  bivariate trait values at the tips of the tree.

    Example phylogenetic comparative data. A: a tree of 80 tips partitioned in three evolutionary regimes. Each evolutionary regime is denoted by #.T, where # is the regime identifier and T is the evolutionary model type associated with this regime (a model type among A, …, F). B: bivariate trait values at the tips of the tree. @@ -404,155 +404,155 @@

    Specifying parameter limits

    An optional but important preliminary step is to explicitly specify the limits for the model parameters. This is needed, because the default settings might not be appropriate for the data in question. Here is an example how the model parameter limits can be set. Note that we specify these limits for the base “BM” and “OU” model types, but they are inherited by their subtypes A, …, F, as specified in the comments.

    -
    ## File: DefineParameterLimits.R
    -# lower limits for models A and B
    -PCMParamLowerLimit.BM <- function(o, k, R, ...) {
    -  o <- NextMethod()
    -  k <- attr(o, "k", exact = TRUE)
    -  R <- length(attr(o, "regimes", exact = TRUE))
    -
    -  if(is.Global(o$Sigma_x)) {
    -    if(!is.Diagonal(o$Sigma_x)) {
    -      o$Sigma_x[1, 2] <- -.0
    -    }
    -  } else {
    -    if(!is.Diagonal(o$Sigma_x)) {
    -      for(r in seq_len(R)) {
    -        o$Sigma_x[1, 2, r] <- -.0
    -      }
    -    }
    -  }
    -  o
    -}
    -
    -# upper limits for models A and B
    -PCMParamUpperLimit.BM <- function(o, k, R, ...) {
    -  o <- NextMethod()
    -  k <- attr(o, "k", exact = TRUE)
    -  R <- length(attr(o, "regimes", exact = TRUE))
    -
    -  if(is.Global(o$Sigma_x)) {
    -    o$Sigma_x[1, 1] <- o$Sigma_x[2, 2] <- 1.0
    -    if(!is.Diagonal(o$Sigma_x)) {
    -      o$Sigma_x[1, 2] <- 1.0
    -    }
    -  } else {
    -    for(r in seq_len(R)) {
    -      o$Sigma_x[1, 1, r] <- o$Sigma_x[2, 2, r] <- 1.0
    -      if(!is.Diagonal(o$Sigma_x)) {
    -        o$Sigma_x[1, 2, r] <- 1.0
    -      }
    -    }
    -  }
    -  o
    -}
    -
    -# lower limits for models C, ..., F.
    -PCMParamLowerLimit.OU <- function(o, k, R, ...) {
    -  o <- NextMethod()
    -  k <- attr(o, "k", exact = TRUE)
    -  R <- length(attr(o, "regimes", exact = TRUE))
    -
    -  if(is.Global(o$Theta)) {
    -    o$Theta[1] <- 0.0
    -    o$Theta[2] <- -1.2
    -  } else {
    -    for(r in seq_len(R)) {
    -      o$Theta[1, r] <- 0.0
    -      o$Theta[2, r] <- -1.2
    -    }
    -  }
    -  if(is.Global(o$Sigma_x)) {
    -    if(!is.Diagonal(o$Sigma_x)) {
    -      o$Sigma_x[1, 2] <- -.0
    -    }
    -  } else {
    -    if(!is.Diagonal(o$Sigma_x)) {
    -      for(r in seq_len(R)) {
    -        o$Sigma_x[1, 2, r] <- -.0
    -      }
    -    }
    -  }
    -  o
    -}
    -
    -# upper limits for models C, ..., F.
    -PCMParamUpperLimit.OU <- function(o, k, R, ...) {
    -  o <- NextMethod()
    -  k <- attr(o, "k", exact = TRUE)
    -  R <- length(attr(o, "regimes", exact = TRUE))
    -
    -  if(is.Global(o$Theta)) {
    -    o$Theta[1] <- 7.8
    -    o$Theta[2] <- 4.2
    -  } else {
    -    for(r in seq_len(R)) {
    -      o$Theta[1, r] <- 7.8
    -      o$Theta[2, r] <- 4.2
    -    }
    -  }
    -  if(is.Global(o$Sigma_x)) {
    -    o$Sigma_x[1, 1] <- o$Sigma_x[2, 2] <- 1.0
    -    if(!is.Diagonal(o$Sigma_x)) {
    -      o$Sigma_x[1, 2] <- 1.0
    -    }
    -  } else {
    -    for(r in seq_len(R)) {
    -      o$Sigma_x[1, 1, r] <- o$Sigma_x[2, 2, r] <- 1.0
    -      if(!is.Diagonal(o$Sigma_x)) {
    -        o$Sigma_x[1, 2, r] <- 1.0
    -      }
    -    }
    -  }
    -  o
    -}
    +
    ## File: DefineParameterLimits.R
    +# lower limits for models A and B
    +PCMParamLowerLimit.BM <- function(o, k, R, ...) {
    +  o <- NextMethod()
    +  k <- attr(o, "k", exact = TRUE)
    +  R <- length(attr(o, "regimes", exact = TRUE))
    +
    +  if(is.Global(o$Sigma_x)) {
    +    if(!is.Diagonal(o$Sigma_x)) {
    +      o$Sigma_x[1, 2] <- -.0
    +    }
    +  } else {
    +    if(!is.Diagonal(o$Sigma_x)) {
    +      for(r in seq_len(R)) {
    +        o$Sigma_x[1, 2, r] <- -.0
    +      }
    +    }
    +  }
    +  o
    +}
    +
    +# upper limits for models A and B
    +PCMParamUpperLimit.BM <- function(o, k, R, ...) {
    +  o <- NextMethod()
    +  k <- attr(o, "k", exact = TRUE)
    +  R <- length(attr(o, "regimes", exact = TRUE))
    +
    +  if(is.Global(o$Sigma_x)) {
    +    o$Sigma_x[1, 1] <- o$Sigma_x[2, 2] <- 1.0
    +    if(!is.Diagonal(o$Sigma_x)) {
    +      o$Sigma_x[1, 2] <- 1.0
    +    }
    +  } else {
    +    for(r in seq_len(R)) {
    +      o$Sigma_x[1, 1, r] <- o$Sigma_x[2, 2, r] <- 1.0
    +      if(!is.Diagonal(o$Sigma_x)) {
    +        o$Sigma_x[1, 2, r] <- 1.0
    +      }
    +    }
    +  }
    +  o
    +}
    +
    +# lower limits for models C, ..., F.
    +PCMParamLowerLimit.OU <- function(o, k, R, ...) {
    +  o <- NextMethod()
    +  k <- attr(o, "k", exact = TRUE)
    +  R <- length(attr(o, "regimes", exact = TRUE))
    +
    +  if(is.Global(o$Theta)) {
    +    o$Theta[1] <- 0.0
    +    o$Theta[2] <- -1.2
    +  } else {
    +    for(r in seq_len(R)) {
    +      o$Theta[1, r] <- 0.0
    +      o$Theta[2, r] <- -1.2
    +    }
    +  }
    +  if(is.Global(o$Sigma_x)) {
    +    if(!is.Diagonal(o$Sigma_x)) {
    +      o$Sigma_x[1, 2] <- -.0
    +    }
    +  } else {
    +    if(!is.Diagonal(o$Sigma_x)) {
    +      for(r in seq_len(R)) {
    +        o$Sigma_x[1, 2, r] <- -.0
    +      }
    +    }
    +  }
    +  o
    +}
    +
    +# upper limits for models C, ..., F.
    +PCMParamUpperLimit.OU <- function(o, k, R, ...) {
    +  o <- NextMethod()
    +  k <- attr(o, "k", exact = TRUE)
    +  R <- length(attr(o, "regimes", exact = TRUE))
    +
    +  if(is.Global(o$Theta)) {
    +    o$Theta[1] <- 7.8
    +    o$Theta[2] <- 4.2
    +  } else {
    +    for(r in seq_len(R)) {
    +      o$Theta[1, r] <- 7.8
    +      o$Theta[2, r] <- 4.2
    +    }
    +  }
    +  if(is.Global(o$Sigma_x)) {
    +    o$Sigma_x[1, 1] <- o$Sigma_x[2, 2] <- 1.0
    +    if(!is.Diagonal(o$Sigma_x)) {
    +      o$Sigma_x[1, 2] <- 1.0
    +    }
    +  } else {
    +    for(r in seq_len(R)) {
    +      o$Sigma_x[1, 1, r] <- o$Sigma_x[2, 2, r] <- 1.0
    +      if(!is.Diagonal(o$Sigma_x)) {
    +        o$Sigma_x[1, 2, r] <- 1.0
    +      }
    +    }
    +  }
    +  o
    +}

    Fitting the global models

    -
    fitBM <- PCMFit(X, tree, modelBM, metaI = PCMBaseCpp::PCMInfoCpp)
    -fitOU <- PCMFit(X, tree, modelOU, metaI = PCMBaseCpp::PCMInfoCpp)
    +
    fitBM <- PCMFit(X, tree, modelBM, metaI = PCMBaseCpp::PCMInfoCpp)
    +fitOU <- PCMFit(X, tree, modelOU, metaI = PCMBaseCpp::PCMInfoCpp)

    Fitting an MGPM model with known shift-point configuration and model type mapping but unknown model parameters

    -
    # This can take about 5 minutes to finish
    -fitMGPMTrueTypeMapping <- PCMFit(
    -  X, treeWithTrueShifts, modelTrueTypeMapping, metaI = PCMBaseCpp::PCMInfoCpp)
    +

    Fitting an MGPM model with known shift-point configuration and model type mapping starting from the true parameter values

    -
    fitMGPMTrueTypeMappingCheat <- PCMFit(
    -  X, treeWithTrueShifts, modelTrueTypeMapping, 
    -    matParInit = matrix(PCMParamGetShortVector(modelTrue), 1L),
    -    numRunifInitVecParams = 1000L,
    -    numGuessInitVecParams = 100L,
    -  metaI = PCMBaseCpp::PCMInfoCpp,
    -  matParInit = jitterModelParams(modelTrue))
    +

    Step 5. Evaluating the model fits

    -
    listModels <- list(
    -  RetrieveBestModel(fitBM), 
    -  RetrieveBestModel(fitOU),
    -  RetrieveBestModel(fitMGPMTrueTypeMapping), 
    -  RetrieveBestModel(fitMGPMTrueTypeMappingCheat), 
    -  modelTrue)
    -
    -dtSummary <- data.table(
    -  model = c(
    -    "Global BM", 
    -    "Global OU", 
    -    "True MGPM, unknown parameters", 
    -    "True MGPM, known true parameters", 
    -    "True MGPM, true parameters"),
    -  p = sapply(listModels, PCMParamCount),
    -  logLik = sapply(listModels, logLik), 
    -  AIC = sapply(listModels, AIC))
    -knitr::kable(dtSummary)
    + @@ -599,7 +599,7 @@

    References

    -

    Mitov, Venelin, Krzysztof Bartoszek, and Tanja Stadler. 2019. “Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models.” Article in Review, Preprint in Ch. 7 of Thesis No. 25428, ETH Zurich, mai. https://doi.org/10.3929/ethz-b-000315296.

    +

    Mitov, Venelin, Krzysztof Bartoszek, and Tanja Stadler. 2019. “Automatic Generation of Evolutionary Hypotheses Using Mixed Gaussian Phylogenetic Models.” Proceedings of the National Academy of Sciences of the United States of America. https://www.pnas.org/lookup/doi/10.1073/pnas.1813823116.

    diff --git a/docs/articles/pcmfit_files/figure-html/unnamed-chunk-3-1.png b/docs/articles/pcmfit_files/figure-html/unnamed-chunk-3-1.png index 3ecb6c3..9a8f359 100644 Binary files a/docs/articles/pcmfit_files/figure-html/unnamed-chunk-3-1.png and b/docs/articles/pcmfit_files/figure-html/unnamed-chunk-3-1.png differ diff --git a/docs/articles/pcmfit_files/figure-html/unnamed-chunk-9-1.png b/docs/articles/pcmfit_files/figure-html/unnamed-chunk-9-1.png index fe3a578..b5df31e 100644 Binary files a/docs/articles/pcmfit_files/figure-html/unnamed-chunk-9-1.png and b/docs/articles/pcmfit_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/articles/pcmfitmixed.html b/docs/articles/pcmfitmixed.html index d0980f5..0d16b79 100644 --- a/docs/articles/pcmfitmixed.html +++ b/docs/articles/pcmfitmixed.html @@ -36,7 +36,7 @@ PCMFit - 0.6.0 + 1.0.0 @@ -88,7 +88,7 @@

    Inferring an MGPM with Unknown Shifts

    Venelin Mitov

    -

    2019-04-02

    +

    2019-07-18

    Source: vignettes/pcmfitmixed.Rmd @@ -101,100 +101,100 @@

    2019-04-02

    Setting up the computing cluster

    -
    library(PCMBase)
    -library(PCMBaseCpp)
    -library(PCMFit)
    -library(data.table)
    -library(ggtree)
    -library(ggplot2)
    -# other needed packages, e.g. ape, data.table etc...
    -
    -# extract dataset identifier and possibly other parameters from the command line:
    -args <- commandArgs(trailingOnly = TRUE)
    -if(length(args) > 0) {
    -  data_id <- as.integer(args[1])
    -} else {
    -  data_id <- 1L
    -}
    -
    -# A character string used in filenames for a model inference on a given data:
    -prefixFiles = paste0("MGPM_A_F_BC2_RR_DATAID_", data_id)
    -
    -
    -# creating the cluster for this PCMFit run:
    -if(!exists("cluster") || is.null(cluster)) {
    -  if(require(doMPI)) {
    -    # using MPI cluster as distributed node cluster (possibly running on a 
    -    # cluster of multiple nodes)
    -    # Get the number of cores. Assume this is run in a batch job.
    -    p = strtoi(Sys.getenv('LSB_DJOB_NUMPROC'))
    -    cluster <- startMPIcluster(count = p-1, verbose = TRUE)
    -    doMPI::registerDoMPI(cluster)
    -  } else {
    -    # possibly running on personal computer without mpi installation
    -    cluster <- parallel::makeCluster(
    -      parallel::detectCores(logical = TRUE),
    -      outfile = paste0("log_", prefixFiles, ".txt"))
    -    doParallel::registerDoParallel(cluster)
    -  }
    -}
    -
    # This function is going to be executed on each worker node.
    -generatePCMModelsFunction <- function() {
    -  # make results reproducible
    -  set.seed(4, kind = "Mersenne-Twister", normal.kind = "Inversion")
    -
    -  PCMGenerateModelTypes()
    -  fileName <- 'DefineParameterLimits.R'
    -  codeDefineLimits <- readChar(fileName, file.info(fileName)$size)
    -  eval(parse(text = codeDefineLimits), .GlobalEnv)
    -}
    + +

    Running the MGPM Inference

    -
    tree <- PCMTree(PCMFitDemoObjects$dtSimulated$tree[[1]])
    -X <- PCMFitDemoObjects$dtSimulated$X[[1]][, seq_len(PCMTreeNumTips(tree))]
    -
    currentResultFile <- paste0("CurrentResults_fits_", prefixFiles, ".RData")
    -if(file.exists(currentResultFile)) {
    -  load(currentResultFile)
    -  tableFitsPrev <- listResults$tableFits
    -} else {
    -  tableFitsPrev <- NULL
    -}
    -
    -fitMGPM_A_F_BC2_RR <- PCMFitMixed(
    -    X = X, tree = tree, metaIFun = PCMInfoCpp,
    -    generatePCMModelsFun = generatePCMModelsFunction, 
    -    maxNumRoundRobins = 2, maxNumPartitionsInRoundRobins = 2,
    -    tableFitsPrev = tableFitsPrev,
    -    prefixFiles = prefixFiles,
    -    doParallel = TRUE)
    + +

    Retrieving and analyzing the best scoring model fit

    -
    bestFit <- RetrieveBestFitScore(fitMGPM_A_F_BC2_RR)
    -
    -listModels <- list(
    -  RetrieveBestModel(PCMFitDemoObjects$fitBM), 
    -  RetrieveBestModel(PCMFitDemoObjects$fitOU),
    -  RetrieveBestModel(PCMFitDemoObjects$fitMGPMTrueTypeMapping), 
    -  RetrieveBestModel(PCMFitDemoObjects$fitMGPMTrueTypeMappingCheat), 
    -  modelTrue,
    -  bestFit$inferredModel)
    -
    -dtSummary <- data.table(
    -  model = c(
    -    "Global BM", 
    -    "Global OU", 
    -    "True MGPM, unknown parameters", 
    -    "True MGPM, known true parameters", 
    -    "True MGPM, true parameters",
    -    "Inferred MGPM, unknown shift-points and parameters"),
    -  p = sapply(listModels, PCMParamCount),
    -  logLik = sapply(listModels, logLik), 
    -  AIC = sapply(listModels, AIC))
    -knitr::kable(dtSummary)
    +
    model
    @@ -241,13 +241,13 @@

    model
    -
    PCMMapModelTypesToRegimes(modelTrue, attr(modelTrue, "tree"))
    -#>  81 105 125 
    -#>   4   3   2
    -
    PCMMapModelTypesToRegimes(
    -  bestFit$inferredModel, attr(bestFit$inferredModel, "tree"))
    -#>  81  83 126 
    -#>   3   3   2
    +
    PCMMapModelTypesToRegimes(modelTrue, attr(modelTrue, "tree"))
    +#>  81 105 125 
    +#>   4   3   2
    +
    **Example phylogenetic comparative data.** **A**: a tree of 80 tips partitioned in three evolutionary regimes. Each evolutionary regime is denoted by #.T, where # is the regime identifier and T is the evolutionary model type associated with this regime (a model type among A, ..., F). **B**:  bivariate trait values at the tips of the tree. **C** and **D**: Analogical to **A** and **B** according to the MGPM found using the AIC score as a criterion.

    Example phylogenetic comparative data. A: a tree of 80 tips partitioned in three evolutionary regimes. Each evolutionary regime is denoted by #.T, where # is the regime identifier and T is the evolutionary model type associated with this regime (a model type among A, …, F). B: bivariate trait values at the tips of the tree. C and D: Analogical to A and B according to the MGPM found using the AIC score as a criterion. diff --git a/docs/articles/pcmfitmixed_files/figure-html/unnamed-chunk-10-1.png b/docs/articles/pcmfitmixed_files/figure-html/unnamed-chunk-10-1.png index de171c3..d025d7b 100644 Binary files a/docs/articles/pcmfitmixed_files/figure-html/unnamed-chunk-10-1.png and b/docs/articles/pcmfitmixed_files/figure-html/unnamed-chunk-10-1.png differ diff --git a/docs/authors.html b/docs/authors.html index b6fde4c..a56571d 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -6,7 +6,7 @@ -Authors • PCMFit +Citation and Authors • PCMFit @@ -30,7 +30,7 @@ - + @@ -57,7 +57,7 @@ -

    +
    @@ -142,7 +142,8 @@

    Fitting a PCM model to a given tree and data

    argsPCMParamUpperLimit = NULL, matParInit = NULL, numRunifInitVecParams = if (is.null(matParInit)) 1000L else 0L, numGuessInitVecParams = if (is.null(matParInit)) 100L else 0L, - numCallsOptim = 10L, control = NULL, verbose = FALSE) + numCallsOptim = 10L, control = NULL, doParallel = FALSE, + verbose = FALSE)

    Arguments

    @@ -169,18 +170,79 @@

    Arg

    +containing meta-data such as N, M and k. Alternatively, this can be a +function object that returns such a list, e.g. the functionPCMInfo +or the function PCMInfoCpp from the PCMBaseCpp package.

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + - +
    SE

    a k x N matrix specifying the standard error for each measurement in X. Alternatively, a k x k x N cube specifying an upper triangular k x k -Choleski factor of the variance covariance matrix for the measurement error +Cholesky factor of the variance covariance matrix for the measurement error for each node i=1, ..., N. Default: matrix(0.0, PCMNumTraits(model), PCMTreeNumTips(tree)).

    metaI

    a list returned from a call to PCMInfo(X, tree, model, SE), -containing meta-data such as N, M and k.

    positiveValueGuard

    a real number (not necessarily positive) used during +the fit as a threshold for highly positive but likely incorrect log-likelihood +values. This argument is set to Inf by default and should be used only +as a last escape. The recommended way to of preventing such numerical errors +is by setting more stringent values for the runtime options +PCMBase.Threshold.EV and PCMBase.Threshold.SV (see +PCMOptions).

    argsPCMParamLowerLimit, argsPCMParamUpperLimit

    named lists with +arguments passed to the functions PCMParamLowerLimit and +PCMParamUpperLimit, respectively. Default: NULL.

    matParInit

    a matrix of any number of rows and p columns where, p is +the number of variable numerical parameters in the model +(equal to PCMParamCount(model)). Each row of this matrix specifies a +suggested starting location for the optim L-BFGS-B run. Default: NULL, +meaning that the initial parameters are to be chosen at random and/or using +calls to GuessInitVecParams function.

    numRunifInitVecParams, numGuessInitVecParams

    integers specifying how +many parameter vectors should be drawn from a uniform distribution between +PCMParamLowerLimit(model) and PCMParamUpplerLimit(model), and +how many parameter vectors should be generated by jittering the resulting +vector from a call to GuessInitVecParams. Before starting the +optimization the model likelihood is evaluated at each of these vectors and +the top numCallsOptim vectors are chosen for starting locations for +optimization. The default settings are +numRunifInitVecParams = if( is.null(matParInit) ) 1000L else 0L and +numGuessInitVecParams = if( is.null(matParInit) ) 100L else 0L.

    numCallsOptim

    integer specifying the maximum number of calls to +optim function. Default: 10. Note that this parameter would be +overwritten by a smaller nrow(matParInit) (if matParInit is +specified) or by a smaller number of generated initial parameter vectors that +satisfy the parameter limits (see also the arguments +numRunifInitVecParams,numGuessInitVecParams and +argsPCMParamLowerLimit, argsPCMParamUpperLimit).

    control

    a list passed as control argument to optim. +Default: NULL.

    doParallel

    logical indicating if optim calls should be executed in +parallel using the foreach() %dopar% {} construct. Default: FALSE.

    verbose

    logical indicating if some debug-messages should printed.

    logical indicating if information messages should be printed +to the console while running. Default: FALSE.

    @@ -188,6 +250,10 @@

    Value

    an object of class PCMFit

    +

    See also

    + +

    PCMOptions

    +
    diff --git a/docs/reference/PCMFitMixed.html b/docs/reference/PCMFitMixed.html index c61f1a3..6d35d70 100644 --- a/docs/reference/PCMFitMixed.html +++ b/docs/reference/PCMFitMixed.html @@ -6,8 +6,8 @@ -Fit regime-assignments to (sub-)trees in a tree with different assigned model -types to each regime. — PCMFitMixed • PCMFit +Optimal information score search for a mixed Gaussian phylogenetic +model, given a tree, trait measurements at its tips and a score function. — PCMFitMixed • PCMFit @@ -31,13 +31,17 @@ - + - + @@ -78,7 +82,7 @@ PCMFit - 0.6.0 + 1.0.0
    @@ -130,30 +134,35 @@
    -

    This function performs multiple model fits of mixed regime models -(MixedGaussian) mapping different model-types (e.g. BM and OU) to different -regimes (colors) in a tree and testing different regime assignments to the -branches in the tree.

    +

    A mixed Gaussian phylogenetic model (MGPM) represents a Gaussian +phylogenetic model with shifts in the underlying parameters and, optionally, +type of Gaussian stochastic process (e.g. shifts from a BM to an OU model of +evolution). Formally, an MGPM consists of the following components:

      +
    • A shift-point configuration: this is a subset of the nodes in the tree +including at least the root node

    • +
    • b

    • +
    PCMFitMixed(X, tree, modelTypes = MGPMDefaultModelTypes(),
    +  subModels = c(B = "A", C = "A", D = "B", E = "D", F = "E"),
       argsMixedGaussian = Args_MixedGaussian_MGPMDefaultModelTypes(),
       SE = matrix(0, nrow(X), PCMTreeNumTips(tree)),
       generatePCMModelsFun = PCMGenerateModelTypes, metaIFun = PCMInfo,
       positiveValueGuard = Inf, scoreFun = AIC, fitMappingsPrev = NULL,
       tableFitsPrev = fitMappingsPrev$tableFits,
       modelTypesInTableFitsPrev = NULL, listPartitions = NULL,
    -  minCladeSizes = 20L, maxCladePartitionLevel = if
    -  (is.null(listPartitions)) 8L else 1L,
    +  minCladeSizes = 20L, skipNodes = character(),
    +  maxCladePartitionLevel = if (is.null(listPartitions)) 8L else 1L,
       maxNumNodesPerCladePartition = 1L,
       listAllowedModelTypesIndices = c("best-clade-2", "best-clade", "all"),
       argsConfigOptim1 = DefaultArgsConfigOptim(numCallsOptim = 10),
    @@ -164,7 +173,52 @@ 

    Fit regime-assignments to (sub-)trees in a tree with different assigned mode doParallel = FALSE, prefixFiles = "fits_", saveTempWorkerResults = TRUE, printFitVectorsToConsole = FALSE, verbose = TRUE, debug = FALSE)

    - + +

    Arguments

    + + + + + + + + + + + + + + + + + + + + + + + + + + +
    X

    a k x N numerical matrix with possible NA and NaN entries. Each +column of X contains the measured trait values for one species (tip in tree). +Missing values can be either not-available (NA) or not existing (NaN). +Thse two values have are treated differently when calculating +likelihoods: see PCMPresentCoordinates.

    tree

    a phylo object with N tips.

    SE

    a k x N matrix specifying the standard error for each measurement in +X. Alternatively, a k x k x N cube specifying an upper triangular k x k +Cholesky factor of the variance covariance matrix for the measurement error +for each node i=1, ..., N. +Default: matrix(0.0, PCMNumTraits(model), PCMTreeNumTips(tree)).

    positiveValueGuard

    a real number (not necessarily positive) used during +the fit as a threshold for highly positive but likely incorrect log-likelihood +values. This argument is set to Inf by default and should be used only +as a last escape. The recommended way to of preventing such numerical errors +is by setting more stringent values for the runtime options +PCMBase.Threshold.EV and PCMBase.Threshold.SV (see +PCMOptions).

    doParallel

    logical indicating if optim calls should be executed in +parallel using the foreach() %dopar% {} construct. Default: FALSE.

    verbose

    logical indicating if information messages should be printed +to the console while running. Default: FALSE.

    +

    Value

    an S3 object of class PCMFitModelMappings.

    @@ -174,6 +228,7 @@

    Value

    diff --git a/docs/reference/PCMIteratorMapping2.html b/docs/reference/PCMIteratorMapping2.html index 086601d..f278d0d 100644 --- a/docs/reference/PCMIteratorMapping2.html +++ b/docs/reference/PCMIteratorMapping2.html @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0
    diff --git a/docs/reference/PCMLoadMixedGaussianFromFitVector.html b/docs/reference/PCMLoadMixedGaussianFromFitVector.html index 6ed7140..96b50b2 100644 --- a/docs/reference/PCMLoadMixedGaussianFromFitVector.html +++ b/docs/reference/PCMLoadMixedGaussianFromFitVector.html @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0
    diff --git a/docs/reference/PCMNextMapping.html b/docs/reference/PCMNextMapping.html index 6503f9a..80db5ba 100644 --- a/docs/reference/PCMNextMapping.html +++ b/docs/reference/PCMNextMapping.html @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0
    diff --git a/docs/reference/PCMNextMapping2.html b/docs/reference/PCMNextMapping2.html index 01bbfcc..3ba494f 100644 --- a/docs/reference/PCMNextMapping2.html +++ b/docs/reference/PCMNextMapping2.html @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0
    diff --git a/docs/reference/RetrieveFittedModelsFromFitVectors.html b/docs/reference/RetrieveFittedModelsFromFitVectors.html index f4bf25a..682a0e3 100644 --- a/docs/reference/RetrieveFittedModelsFromFitVectors.html +++ b/docs/reference/RetrieveFittedModelsFromFitVectors.html @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0
    diff --git a/docs/reference/hello.html b/docs/reference/UpdateCladeFitsUsingSubModels.html similarity index 77% rename from docs/reference/hello.html rename to docs/reference/UpdateCladeFitsUsingSubModels.html index dffcc6e..6f9d71a 100644 --- a/docs/reference/hello.html +++ b/docs/reference/UpdateCladeFitsUsingSubModels.html @@ -6,7 +6,7 @@ -Hello, World! — hello • PCMFit +Update with parameters from submodels where the fit of a submodel is better — UpdateCladeFitsUsingSubModels • PCMFit @@ -30,9 +30,9 @@ - + - + @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0

    @@ -125,29 +125,27 @@
    -

    Prints 'Hello, world!'.

    +

    Update with parameters from submodels where the fit of a submodel is better

    -
    hello()
    +
    UpdateCladeFitsUsingSubModels(cladeFits, modelTypes, subModels,
    +  argsMixedGaussian, metaIFun = PCMInfo, scoreFun, X, tree, SE,
    +  verbose = FALSE)
    -

    Examples

    -
    hello()
    #> Error in hello(): could not find function "hello"
    diff --git a/docs/reference/index.html b/docs/reference/index.html index 960b68e..5f0c06c 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -70,7 +70,7 @@ PCMFit - 0.6.0 + 1.0.0 @@ -148,6 +148,12 @@

    GuessInitVecParams()

    + +

    A heuristic-based guess of the optimal parameters of a model

    + +

    LookupFit()

    @@ -170,8 +176,8 @@

    PCMFitMixed()

    -

    Fit regime-assignments to (sub-)trees in a tree with different assigned model -types to each regime.

    +

    Optimal information score search for a mixed Gaussian phylogenetic +model, given a tree, trait measurements at its tips and a score function.

    @@ -211,9 +217,9 @@

    hello()

    +

    UpdateCladeFitsUsingSubModels()

    -

    Hello, World!

    +

    Update with parameters from submodels where the fit of a submodel is better

    diff --git a/docs/reference/runOptim.html b/docs/reference/runOptim.html index 8ccb631..3ba6e7d 100644 --- a/docs/reference/runOptim.html +++ b/docs/reference/runOptim.html @@ -73,7 +73,7 @@ PCMFit - 0.6.0 + 1.0.0 @@ -136,17 +136,54 @@

    Calling optim a number of times

    -
    runOptim(lik, parLower, parUpper, matParInit, control = NULL,
    -  verbose = TRUE)
    +
    runOptim(X, tree, model, SE, metaI, positiveValueGuard, parLower, parUpper,
    +  matParInit, control = NULL, doParallel = FALSE, verbose = TRUE)

    Arguments

    - - + + + + + + + + + + + + + + + + + + + + + + @@ -161,6 +198,11 @@

    Arg

    + + + +
    lik

    a function of numeric vector argument. Possible signatures are -function(par) or function(par, input) . This argument is mandatory and -doesn't have a default value.

    X

    a k x N numerical matrix with possible NA and NaN entries. Each +column of X contains the measured trait values for one species (tip in tree). +Missing values can be either not-available (NA) or not existing (NaN). +Thse two values have are treated differently when calculating +likelihoods: see PCMPresentCoordinates.

    tree

    a phylo object with N tips.

    model

    an S3 object specifying both, the model type (class, e.g. "OU") as +well as the concrete model parameter values at which the likelihood is to be +calculated (see also Details).

    SE

    a k x N matrix specifying the standard error for each measurement in +X. Alternatively, a k x k x N cube specifying an upper triangular k x k +Cholesky factor of the variance covariance matrix for the measurement error +for each node i=1, ..., N. +Default: matrix(0.0, PCMNumTraits(model), PCMTreeNumTips(tree)).

    metaI

    a list returned from a call to PCMInfo(X, tree, model, SE), +containing meta-data such as N, M and k. Alternatively, this can be a +function object that returns such a list, e.g. the functionPCMInfo +or the function PCMInfoCpp from the PCMBaseCpp package.

    positiveValueGuard

    a real number (not necessarily positive) used during +the fit as a threshold for highly positive but likely incorrect log-likelihood +values. This argument is set to Inf by default and should be used only +as a last escape. The recommended way to of preventing such numerical errors +is by setting more stringent values for the runtime options +PCMBase.Threshold.EV and PCMBase.Threshold.SV (see +PCMOptions).

    parLower, parUppercontrol

    a list passed to optim().

    doParallel

    logical indicating if optim calls should be executed in +parallel using the foreach() %dopar% {} construct. Default: FALSE.

    verbose

    logical indicating whether informative messages should be diff --git a/docs/sitemap.xml b/docs/sitemap.xml index fa55144..924ac7b 100644 --- a/docs/sitemap.xml +++ b/docs/sitemap.xml @@ -6,6 +6,9 @@ http://venelin.github.io/PCMFit/reference/ComposeMixedGaussianFromFits.html + + http://venelin.github.io/PCMFit/reference/GuessInitVecParams.html + http://venelin.github.io/PCMFit/reference/LookupFit.html @@ -37,7 +40,7 @@ http://venelin.github.io/PCMFit/reference/RetrieveFittedModelsFromFitVectors.html - http://venelin.github.io/PCMFit/reference/hello.html + http://venelin.github.io/PCMFit/reference/UpdateCladeFitsUsingSubModels.html http://venelin.github.io/PCMFit/reference/runOptim.html diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 0000000..148e5c4 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,19 @@ +citHeader("To cite PCMFit in publications use:") + +citEntry( + entry = "Article", + title = "Automatic generation of evolutionary hypotheses using mixed Gaussian phylogenetic models", + author = "Venelin Mitov, Krzysztof Bartoszek, Tanja Stadler", + journal = "Proceedings of the National Academy of Sciences of the United States of America", + year = 2019, + volume = "", + number = "", + pages = "", + url = "https://www.pnas.org/lookup/doi/10.1073/pnas.1813823116", + textVersion = paste( +"Mitov, V., Bartoszek, K., & Stadler, T. (2019). Automatic generation of evolutionary", +"hypotheses using mixed Gaussian phylogenetic models. Proceedings of the National", +"Academy of Sciences of the United States of America,", +"http://doi.org/10.1073/pnas.1813823116" + ) +) diff --git a/man/PCMFitMixed.Rd b/man/PCMFitMixed.Rd index 21d521d..a282418 100644 --- a/man/PCMFitMixed.Rd +++ b/man/PCMFitMixed.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/PCMFitMixed.R \name{PCMFitMixed} \alias{PCMFitMixed} -\title{Maximum likelihood based search for an optimal mixed Gaussian phylogenetic -model, given a tree, trait measurements at its tips and a score function.} +\title{Optimal information score search for a mixed Gaussian phylogenetic +model, given a tree, trait measurements at its tips and a score function.} \usage{ PCMFitMixed(X, tree, modelTypes = MGPMDefaultModelTypes(), subModels = c(B = "A", C = "A", D = "B", E = "D", F = "E"), @@ -59,10 +59,13 @@ to the console while running. Default: FALSE.} an S3 object of class PCMFitModelMappings. } \description{ -A mixed Gaussian phylogenetic model (MGPM) represents -Fit regime-assignments to (sub-)trees in a tree with different assigned model -types to each regime.This function performs multiple model fits of mixed regime models -(MixedGaussian) mapping different model-types (e.g. BM and OU) to different -regimes (colors) in a tree and testing different regime assignments to the -branches in the tree. +A mixed Gaussian phylogenetic model (MGPM) represents a Gaussian +phylogenetic model with shifts in the underlying parameters and, optionally, +type of Gaussian stochastic process (e.g. shifts from a BM to an OU model of +evolution). Formally, an MGPM consists of the following components: +\itemize{ +\item A shift-point configuration: this is a subset of the nodes in the tree +including at least the root node +\item b +} } diff --git a/man/hello.Rd b/man/hello.Rd deleted file mode 100644 index 0fa7c4b..0000000 --- a/man/hello.Rd +++ /dev/null @@ -1,12 +0,0 @@ -\name{hello} -\alias{hello} -\title{Hello, World!} -\usage{ -hello() -} -\description{ -Prints 'Hello, world!'. -} -\examples{ -hello() -} diff --git a/vignettes/REFERENCES-R.bib b/vignettes/REFERENCES-R.bib index e20457b..d4ad324 100644 --- a/vignettes/REFERENCES-R.bib +++ b/vignettes/REFERENCES-R.bib @@ -9,14 +9,14 @@ @Manual{R-cowplot title = {cowplot: Streamlined Plot Theme and Plot Annotations for 'ggplot2'}, author = {Claus O. Wilke}, year = {2019}, - note = {R package version 0.9.4}, + note = {R package version 1.0.0}, url = {https://CRAN.R-project.org/package=cowplot}, } @Manual{R-data.table, title = {data.table: Extension of `data.frame`}, author = {Matt Dowle and Arun Srinivasan}, year = {2019}, - note = {R package version 1.12.0}, + note = {R package version 1.12.2}, url = {https://CRAN.R-project.org/package=data.table}, } @Manual{R-doParallel, @@ -42,17 +42,17 @@ @Manual{R-foreach } @Manual{R-ggplot2, title = {ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics}, - author = {Hadley Wickham and Winston Chang and Lionel Henry and Thomas Lin Pedersen and Kohske Takahashi and Claus Wilke and Kara Woo}, - year = {2018}, - note = {R package version 3.1.0}, + author = {Hadley Wickham and Winston Chang and Lionel Henry and Thomas Lin Pedersen and Kohske Takahashi and Claus Wilke and Kara Woo and Hiroaki Yutani}, + year = {2019}, + note = {R package version 3.2.0}, url = {https://CRAN.R-project.org/package=ggplot2}, } @Manual{R-ggtree, - title = {ggtree: an R package for visualization and annotation of phylogenetic -trees with their covariates and other associated data}, + title = {ggtree: an R package for visualization and annotation of phylogenetic trees with +their covariates and other associated data}, author = {Guangchuang Yu and Tommy Tsan-Yuk Lam}, - year = {2019}, - note = {R package version 1.14.6}, + year = {2014}, + note = {R package version 1.15.3}, url = {https://guangchuangyu.github.io/software/ggtree}, } @Manual{R-iterators, @@ -66,13 +66,14 @@ @Manual{R-mvtnorm title = {mvtnorm: Multivariate Normal and t Distributions}, author = {Alan Genz and Frank Bretz and Tetsuhisa Miwa and Xuefei Mi and Torsten Hothorn}, year = {2019}, - note = {R package version 1.0-10}, + note = {R package version 1.0-11}, url = {https://CRAN.R-project.org/package=mvtnorm}, } @Manual{R-PCMBase, title = {PCMBase: Simulation and Likelihood Calculation of Phylogenetic Comparative Models}, author = {Venelin Mitov}, - note = {https://venelin.github.io/PCMBase/, https://github.com/venelin/PCMBase}, + note = {R package version 1.2.10}, + url = {https://CRAN.R-project.org/package=PCMBase}, year = {2019}, } @Manual{R-PCMBaseCpp, @@ -106,7 +107,7 @@ @Manual{R-SPLITT @Manual{R-testthat, title = {testthat: Unit Testing for R}, author = {Hadley Wickham}, - year = {2018}, - note = {R package version 2.0.1}, + year = {2019}, + note = {R package version 2.1.1}, url = {https://CRAN.R-project.org/package=testthat}, } diff --git a/vignettes/REFERENCES.bib b/vignettes/REFERENCES.bib index b52ae34..14ee315 100644 --- a/vignettes/REFERENCES.bib +++ b/vignettes/REFERENCES.bib @@ -138,13 +138,11 @@ @article{Mitov:2018dqa } @article{Mitov:2019agh, -author = {Mitov, Venelin and Bartoszek, Krzysztof and Stadler, Tanja}, -title = {{Automatic Generation of Evolutionary Hypotheses using Mixed Gaussian Phylogenetic Models}}, -journal = {article in review, preprint in Ch. 7 of Thesis No. 25428, ETH Zurich}, -year = {2019}, -pages = {}, -month = mai, -url = {https://doi.org/10.3929/ethz-b-000315296} +title = {Automatic generation of evolutionary hypotheses using mixed Gaussian phylogenetic models}, + author = {Venelin Mitov and Krzysztof Bartoszek and Tanja Stadler}, + journal = {Proceedings of the National Academy of Sciences of the United States of America}, + year = {2019}, + url = {https://www.pnas.org/lookup/doi/10.1073/pnas.1813823116}, } @phdthesis{Mitov:2018hh,