[release] Version 2.0.0
- add links to introductory articles on Bayesian data analysis
- change from `alpha` to `credible_mass`.
- add BestResults.hdi()
- clarifications in docs
treszkai committed Sep 19, 2019
2 parents b3dff61 + c3977bb; commit 2c6ec83
Showing 9 changed files with 141 additions and 64 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -42,7 +42,7 @@ The [documentation](https://best.readthedocs.io) describes the API in detail.

## Installation ##

Ensure your Python version is sufficiently up-to-date:
Ensure your Python version is sufficiently up-to-date (at least 3.5.4):

```bash
$ python --version
@@ -51,7 +51,7 @@ Python 3.5.6

Then install with Pip:
```bash
$ pip install https://github.com/treszkai/best/archive/master.tar.gz
$ pip install best
```

## Developer notes ##
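
With this release the package installs from PyPI (`pip install best`, per the README change above). As a quick smoke test of the installed package, a minimal two-group analysis could look like the sketch below; the `best.analyze_two` entry point and the toy data are assumptions for illustration and are not part of this diff.

```python
# Minimal post-install sketch; best.analyze_two and the data are assumed for illustration.
import best

drug    = [101.0, 100.0, 102.0, 104.0, 102.0,  97.0, 105.0, 105.0]
placebo = [ 99.0, 101.0, 100.0, 101.0, 102.0, 100.0,  97.0, 101.0]

best_out = best.analyze_two(drug, placebo)   # runs MCMC sampling with PyMC3
print(best_out.summary(credible_mass=0.95))  # posterior summary; keyword renamed in 2.0.0
```
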
52 changes: 32 additions & 20 deletions best/model.py
@@ -21,6 +21,11 @@
import scipy.stats as st


def check_credible_mass(credible_mass):
if not 0 < credible_mass < 1:
raise ValueError('credible_mass parameter must be between 0 and 1, non-inclusive')


class BestModel(ABC):
"""Base class for BEST models"""

@@ -43,10 +48,10 @@ def model(self) -> pm.Model:
pass

@abstractmethod
def observed_data(self, group_id):
def observed_data(self, group_id: int):
"""Return the observed data as a NumPy array
(This property is accessible primarily for internal purposes.)
(This method is accessible primarily for internal purposes.)
"""
pass

@@ -123,7 +128,7 @@ def version(self):
def model(self):
return self._model

def observed_data(self, group_id):
def observed_data(self, group_id: int):
if group_id == 1:
return self.y
else:
@@ -200,7 +205,7 @@ def version(self):
def model(self):
return self._model

def observed_data(self, group_id):
def observed_data(self, group_id: int):
if group_id == 1:
return self.y1
elif group_id == 2:
@@ -241,43 +246,50 @@ def trace(self) -> MultiTrace:
"""
return self._trace

def observed_data(self, group_id):
def observed_data(self, group_id: int):
"""Return the observed data as a NumPy array
(This method is accessible primarily for internal purposes.)
"""
return self.model.observed_data(group_id)

def summary(self, alpha=0.05):
def summary(self, credible_mass: float = 0.95):
"""Return summary statistics of the results
Parameters
----------
alpha : float
credible_mass : float
The highest posterior density intervals in the summary will cover
(1–alpha) * 100% of the probability mass.
For example, alpha=0.05 results in a 95% credible interval.
Default: 0.05.
credible_mass * 100% of the probability mass.
For example, credible_mass=0.95 results in 95% credible intervals.
Default: 0.95.
"""
return pm.summary(self.trace, alpha=alpha)
check_credible_mass(credible_mass)
return pm.summary(self.trace, alpha=(1 - credible_mass))

def hpd(self, var_name: str, alpha: float = 0.05):
"""Calculate the highest posterior density (HPD) interval
def hdi(self, var_name: str, credible_mass: float = 0.95):
"""Calculate the highest posterior density interval (HDI)
This is a 1-alpha *credible interval* which contains the
most likely values of the parameter.
This function calculates a *credible interval* which contains the
``credible_mass`` most likely values of the parameter, given the data.
Also known as an HPD interval.
Parameters
----------
var_name : str
Name of variable.
alpha : float
The HPD will cover (1–alpha) * 100% of the probability mass.
For example, alpha=0.05 results in a 95% credible interval.
Default: 0.05.
credible_mass : float
The HDI will cover credible_mass * 100% of the probability mass.
Default: 0.95, i.e. a 95% HDI.
Returns
-------
(float, float)
The endpoints of the HPD
"""
return tuple(pm.hpd(self.trace[var_name], alpha=alpha))
check_credible_mass(credible_mass)

return tuple(pm.hpd(self.trace[var_name], alpha=(1 - credible_mass)))

def posterior_prob(self, var_name: str, low: float = -np.inf, high: float = np.inf):
r"""Calculate the posterior probability that a variable is in a given interval
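
Taken together, the `best/model.py` changes above (the `alpha` to `credible_mass` rename, the `hpd()` to `hdi()` rename, and the new `check_credible_mass` validation) affect user code roughly as in the sketch below; `best.analyze_two` and the toy data are assumptions for illustration.

```python
# Sketch of the renamed interval API; best.analyze_two and the data are assumed.
import best

best_out = best.analyze_two([101., 100., 102., 104., 102.],
                            [ 99., 101., 100., 101., 102.])

# Previously: best_out.hpd('Difference of means', alpha=0.05)
# Now the same 95% interval is requested by its credible mass:
hdi_low, hdi_high = best_out.hdi('Difference of means', credible_mass=0.95)

# The summary table follows the same convention; values outside (0, 1) raise
# a ValueError via check_credible_mass().
print(best_out.summary(credible_mass=0.95))

# Posterior probability that the difference of means exceeds 0.5:
print(best_out.posterior_prob('Difference of means', low=0.5))
```
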
21 changes: 12 additions & 9 deletions best/plot.py
@@ -17,13 +17,16 @@
import numpy as np
import matplotlib.lines as mpllines
from matplotlib.transforms import blended_transform_factory
from matplotlib.ticker import LogFormatter
import scipy.stats as st

from .model import BestResults, BestResultsOne, BestResultsTwo

# Only 99.5% of the samples are displayed, to prevent the long tails
DISPLAYED_ALPHA = 0.005
# Only this ratio of the samples are displayed,
# to prevent the long tails distorting the plot.
DISPLAYED_MASS = 0.995

DEFAULT_HDI_MASS = 0.95

PRETTY_BLUE = '#89d1ea'


@@ -86,7 +89,7 @@ def plot_posterior(best_results: BestResults,
>>> plt.show()
"""
samples = best_results.trace[var_name]
samples_min, samples_max = best_results.hpd(var_name, DISPLAYED_ALPHA)
samples_min, samples_max = best_results.hdi(var_name, DISPLAYED_MASS)
samples = samples[(samples_min <= samples) * (samples <= samples_max)]

trans = blended_transform_factory(ax.transData, ax.transAxes)
@@ -118,7 +121,7 @@ def plot_posterior(best_results: BestResults,
ax.axvline(ref_val, linestyle=':')

# plot HDI
hdi_min, hdi_max = best_results.hpd(var_name, alpha=0.05)
hdi_min, hdi_max = best_results.hdi(var_name, DEFAULT_HDI_MASS)

hdi_line, = ax.plot([hdi_min, hdi_max], [0, 0],
lw=5.0, color='k')
@@ -162,7 +165,7 @@ def plot_normality_posterior(best_results, ax, bins, title):
var_name = 'Normality'
samples = best_results.trace[var_name]
norm_bins = np.logspace(np.log10(best_results.model.nu_min),
np.log10(best_results.hpd(var_name, DISPLAYED_ALPHA)[-1]),
np.log10(best_results.hdi(var_name, DISPLAYED_MASS)[-1]),
num=bins + 1)
plot_posterior(best_results,
var_name,
@@ -173,7 +176,7 @@ def plot_normality_posterior(best_results, ax, bins, title):
ax.set_xlim(2.4, norm_bins[-1] * 1.05)
ax.semilogx()
# don't use scientific notation for tick labels
tick_fmt = LogFormatter()
tick_fmt = plt.LogFormatter()
ax.xaxis.set_major_formatter(tick_fmt)
ax.xaxis.set_minor_formatter(tick_fmt)

@@ -363,8 +366,8 @@ def plot_all_two(best_results: BestResultsTwo,
posterior_std1 = trace['Group 1 SD']
posterior_std2 = trace['Group 2 SD']

std1_min, std1_max = best_results.hpd('Group 1 SD', DISPLAYED_ALPHA)
std2_min, std2_max = best_results.hpd('Group 2 SD', DISPLAYED_ALPHA)
std1_min, std1_max = best_results.hdi('Group 1 SD', DISPLAYED_MASS)
std2_min, std2_max = best_results.hdi('Group 2 SD', DISPLAYED_MASS)
std_min = min(std1_min, std2_min)
std_max = max(std1_max, std2_max)
stds = np.concatenate((posterior_std1, posterior_std2))
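
The plotting changes above mainly swap `hpd(..., alpha)` calls for `hdi(..., credible_mass)` and rename the display constants, so a plotting call keeps the same shape. A rough sketch follows; `best.analyze_two` and the `ax` keyword of `plot_posterior` are assumptions not shown in this diff.

```python
# Sketch of plotting one posterior; best.analyze_two and the ax keyword are assumed.
import matplotlib.pyplot as plt
import best
from best.plot import plot_posterior

best_out = best.analyze_two([101., 100., 102., 104., 102.],
                            [ 99., 101., 100., 101., 102.])

fig, ax = plt.subplots()
# Internally, plot_posterior now trims the displayed range with
# best_out.hdi(var_name, DISPLAYED_MASS) and marks the 95% HDI (DEFAULT_HDI_MASS).
plot_posterior(best_out, 'Difference of means', ax=ax)
plt.show()
```
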
3 changes: 2 additions & 1 deletion docs/explanations.rst
@@ -15,7 +15,8 @@ with an :ref:`additional parameter <sec-normality>` that controls the prevalence
After assigning a prior distribution to the parameters,
these distributions are updated in accordance with how well each parameter setting fits the data,
resulting in the so-called *posterior distribution*.
(The specifics of the prior are explained on the :ref:`Model history <sec-model-latest>` page.)
(The model used by this library is not exactly like the one described in (Kruschke, 2013);
the :ref:`Model history <sec-model-latest>` page gives an exact description.)

The resulting t-distributions can be plotted with the :func:`plot_data_and_prediction` function,
whose output is a figure like this:
45 changes: 36 additions & 9 deletions docs/index.rst
@@ -24,11 +24,16 @@ For example, to find the probability that the first group's mean is larger by at
>>> best_out.posterior_prob('Difference of means', low=0.5)
0.87425

The parameter estimation is described briefly in :ref:`the relevant section <brief-description>`,
The 95% highest posterior density interval (HDI) can be queried just as easily::

>>> best_out.hdi('Difference of means', 0.95)
(0.12..., 1.84...)

The parameter estimation is described briefly in :ref:`the relevant section <brief-description>` of this documentation,
or in detail in the following publication:

John K. Kruschke, 2013. Bayesian estimation supersedes the *t* test.
Journal of Experimental Psychology: General, 2013, v.142(2), pp.573-603.
Kruschke, J. K. (2013). Bayesian estimation supersedes the *t* test.
*Journal of Experimental Psychology: General* **142(2)**, pp.573-603.
(doi: `10.1037/a0029146 <https://dx.doi.org/10.1037/a0029146>`_)

The publication's `accompanying website <http://www.indiana.edu/~kruschke/BEST/>`_
@@ -44,6 +49,25 @@ while keeping the tools simple.
It is *not* intended as a comprehensive data analysis tool.
For that purpose, please refer to books like John K. Kruschke's
`Doing Bayesian Data Analysis <https://sites.google.com/site/doingbayesiandataanalysis/>`_.
Quote from chapter 1 of the book:

This book explains how to actually do Bayesian data analysis, by real people (like
you), for realistic data (like yours). The book starts at the basics, with elementary
notions of probability and programming. You do not need to already know statistics
and programming.

The following open access papers give an intuitive introduction to Bayesian data analysis:

- Kruschke, J. K. and Liddell, T. M. (2018).
The Bayesian New Statistics: Hypothesis testing, estimation, meta-analysis, and power analysis from a Bayesian perspective.
*Psychonomic Bulletin & Review* **25**, pp. 178-206.
`R code <https://osf.io/j6364/files/>`_;
see also the `Shiny App <http://www.indiana.edu/~kruschke/bayesAndFreqApp.html>`_.
(doi: `10.3758/s13423-016-1221-4 <https://doi.org/10.3758/s13423-016-1221-4>`_)
- Kruschke, J. K. and Liddell, T. M. (2018).
Bayesian data analysis for newcomers.
*Psychonomic Bulletin & Review* **25**, pp. 155-177.
(doi: `10.3758/s13423-017-1272-1 <https://doi.org/10.3758/s13423-017-1272-1>`_)

Installation
------------
@@ -53,7 +77,7 @@ and can be installed with *pip*:

.. code-block:: bash
pip install https://github.com/treszkai/best/archive/master.zip
pip install best
This command installs the following dependencies:

@@ -64,16 +88,19 @@ This command installs the following dependencies:
Get in touch
------------

If you have trouble installing or using `best`, or understanding the results, or you found an error,
If you find the documentation lacking or you have found an error,
please `open an issue <https://github.com/treszkai/best/issues>`_ at the project's GitHub page,
or open a `pull request <https://github.com/treszkai/best/pulls>`_ if you have a proposed solution.

Your feedback and contribution are welcome!

Further documentation
---------------------

.. toctree::
:maxdepth: 2
:maxdepth: 2

api
explanations
model_history
api
explanations
model_history
version_history
15 changes: 8 additions & 7 deletions docs/model_history.rst
@@ -24,6 +24,7 @@ Where :math:`\hat\mu` and :math:`\hat\sigma` are the sample mean and
sample standard deviation of all the data from the two groups.
The effect size is calculated as :math:`(\mu_1 - \mu_2) \big/ \sqrt{(\sigma_1^2 + \sigma_2^2) \,/\, 2}`.

.. _sec-model-v2:
.. _sec-model-latest:

v2
@@ -47,28 +48,28 @@ and the formula for effect size also uses :math:`\mathrm{sd}_i` instead of :math:
Setting a bound of 2.5 prevents strong outliers and extremely large standard deviations.

Another change concerns the sampling of :math:`\sigma_i`.
In the original model :math:`\sigma_i` was uniformly distributed between
:math:`\hat\sigma\, / \,1000` and :math:`1000\,\hat\sigma`,
In the original model :math:`\sigma_i \,/\, \hat\sigma` was uniformly distributed between
:math:`1 \, / \,1000` and :math:`1000`,
meaning the *prior* probability of :math:`\sigma > \hat\sigma` was 1000 times that of :math:`\sigma < \hat\sigma`,
which caused an overestimation of :math:`\sigma` with low sample sizes (around :math:`N = 5`).
To make these probabilities equal, now :math:`\log(\sigma_i)` is distributed uniformly between
:math:`\log(\hat\sigma\, / \,1000)` and :math:`\log(1000\, \hat\sigma)`.
To make these probabilities equal, now :math:`\log(\sigma_i \,/\, \hat\sigma)` is distributed uniformly between
:math:`\log(1\, / \,1000)` and :math:`\log(1000)`.
At :math:`N=25` this change in the prior does not cause a perceptible change in the posterior.

*Summary of changes*:
- Lower bound of :math:`\nu` is 2.5.
- SD is calculated as :math:`\sigma \sqrt{ \nu / (\nu - 2)}`.
- Effect size is calculated as :math:`(\mu_1 - \mu_2) \big/ \sqrt{(\mathrm{sd}_1^2 + \mathrm{sd}_2^2) \,/\, 2}`.
- :math:`\log(\sigma_i)` is uniformly distributed.
- :math:`\log(\sigma_i \,/\, \hat\sigma)` is uniformly distributed.

The model for two-group analysis is described by the following sampling statements:

.. math::
\mu_1 &\sim \text{Normal}(\hat\mu, 1000 \, \hat\sigma) \\
\mu_2 &\sim \text{Normal}(\hat\mu, 1000 \, \hat\sigma) \\
\log(\sigma_1) &\sim \text{Uniform}(\log(\hat\sigma \, / \, 1000), \log(1000\, \hat\sigma)) \\
\log(\sigma_2) &\sim \text{Uniform}(\log(\hat\sigma \, / \, 1000), \log(1000\, \hat\sigma)) \\
\log(\sigma_1 \,/\, \hat\sigma) &\sim \text{Uniform}(\log(1 \, / \, 1000), \log(1000)) \\
\log(\sigma_2 \,/\, \hat\sigma) &\sim \text{Uniform}(\log(1 \, / \, 1000), \log(1000)) \\
\nu &\sim \text{Exponential}(1\, / \, 27.5) + 2.5 \\
y_1 &\sim t_\nu(\mu_1, \sigma_1) \\
y_2 &\sim t_\nu(\mu_2, \sigma_2)
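
To make the sampling statements above concrete, here is a rough PyMC3 transcription of the v2 two-group model. This is an illustrative sketch only: the data are made up, `Exponential(1/27.5)` is read as a rate parametrization, and the variable names, pooled-SD estimate, and sampler settings are assumptions; `best/model.py` remains the authoritative implementation.

```python
# Illustrative PyMC3 transcription of the v2 sampling statements (not the library's code).
import numpy as np
import pymc3 as pm

y1 = np.array([101., 100., 102., 104., 102.])   # made-up example data
y2 = np.array([ 99., 101., 100., 101., 102.])

y_all = np.concatenate((y1, y2))
mu_hat, sigma_hat = y_all.mean(), y_all.std()    # pooled sample mean and SD

with pm.Model() as model:
    mu_1 = pm.Normal('Group 1 mean', mu=mu_hat, sd=1000 * sigma_hat)
    mu_2 = pm.Normal('Group 2 mean', mu=mu_hat, sd=1000 * sigma_hat)

    # log(sigma_i / sigma_hat) ~ Uniform(log(1/1000), log(1000))
    log_sigma_ratio_1 = pm.Uniform('log(sigma_1/sigma_hat)', np.log(1 / 1000), np.log(1000))
    log_sigma_ratio_2 = pm.Uniform('log(sigma_2/sigma_hat)', np.log(1 / 1000), np.log(1000))
    sigma_1 = sigma_hat * pm.math.exp(log_sigma_ratio_1)
    sigma_2 = sigma_hat * pm.math.exp(log_sigma_ratio_2)

    # nu ~ Exponential(1/27.5) + 2.5, so nu is bounded below by 2.5
    nu = pm.Exponential('nu - 2.5', lam=1 / 27.5) + 2.5

    pm.StudentT('Group 1 data', nu=nu, mu=mu_1, sd=sigma_1, observed=y1)
    pm.StudentT('Group 2 data', nu=nu, mu=mu_2, sd=sigma_2, observed=y2)

    trace = pm.sample(2000)   # draw posterior samples
```
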
21 changes: 21 additions & 0 deletions docs/version_history.rst
@@ -0,0 +1,21 @@
.. _ch-version-history:

.. module:: best
:noindex:

Version History
===============

1.0
---
Initial release by `Andrew Straw <https://github.com/strawlab/best>`_.
This version supported Python 2.7, Matplotlib 2.2, and PyMC.

2.0.0
-----
Changes from version 1.0:

- Upgrade to Python 3.5, PyMC3, and Matplotlib 3.0.0.
- :ref:`Model version 2 <sec-model-v2>`.
- Introduction of object-oriented API.
- Changes in plot appearance.
17 changes: 9 additions & 8 deletions setup.py
@@ -1,22 +1,23 @@
from setuptools import setup


with open("README.md", "r") as fh:
with open('README.md', 'r') as fh:
long_description = fh.read()

setup(name='best',
description='Bayesian estimation supersedes the t-test',
author='Andrew Straw and Laszlo Treszkai',
author_email='strawman@astraw.com',
author_email='laszlo.treszkai@gmail.com',
long_description=long_description,
long_description_content_type="text/markdown",
url='https://github.com/strawlab/best',
long_description_content_type='text/markdown',
url='https://github.com/treszkai/best',
version='2.0.0',
packages=['best'],
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
'Programming Language :: Python :: 3',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
],
license='MIT',
)
python_requires='>=3.5',
)
