Commit
Merge pull request #134 from gbordyugov/OCTO-1616-no-experimentdata
Octo 1616 no experimentdata
gbordyugov committed Jul 26, 2017
2 parents dfbf3ed + 1f6f080 commit c6e39e5
Showing 16 changed files with 414 additions and 3,039 deletions.
2 changes: 1 addition & 1 deletion expan/__init__.py
@@ -5,7 +5,7 @@

import logging.config

-from expan.cli import *
+# from expan.cli import *
from expan.core import *
from expan.core.version import __version__
from expan.data import *
2 changes: 1 addition & 1 deletion expan/cli/cli.py
@@ -7,7 +7,7 @@

import pandas as pd

-from expan.core.experiment import Experiment
+from expan.core.experiment import Experiment
from expan.core.experimentdata import ExperimentData


3 changes: 2 additions & 1 deletion expan/core/__init__.py
@@ -3,7 +3,8 @@

from __future__ import absolute_import

__all__ = ["binning", "experiment", "experimentdata", "results", "statistics", "util", "version"]
# __all__ = ["binning", "experiment", "experimentdata", "results", "statistics", "util", "version"]
__all__ = ["binning", "experiment", "statistics", "util", "version"]

from expan.core.version import __version__, version

129 changes: 88 additions & 41 deletions expan/core/early_stopping.py
@@ -32,10 +32,16 @@ def obrien_fleming(information_fraction, alpha=0.05):
return (1 - norm.cdf(norm.ppf(1 - alpha / 2) / np.sqrt(information_fraction))) * 2
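(A quick sanity check, not part of the diff: with the default alpha = 0.05 the spending function releases the full error budget only at information_fraction = 1.0 and is far stricter at earlier looks.)

from expan.core.early_stopping import obrien_fleming
obrien_fleming(1.0)  # 0.05    -- full alpha spent at the final analysis
obrien_fleming(0.5)  # ~0.0056 -- much stricter at the halfway look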


+def make_group_sequential(spending_function='obrien_fleming', estimated_sample_size=None, alpha=0.05, cap=8):
+    def f(x, y):
+        # pass parameters by keyword: group_sequential() also takes
+        # information_fraction before estimated_sample_size, so positional
+        # arguments here would bind to the wrong parameters
+        return group_sequential(x, y, spending_function=spending_function,
+                                estimated_sample_size=estimated_sample_size,
+                                alpha=alpha, cap=cap)
+    return f

def group_sequential(x,
y,
spending_function='obrien_fleming',
information_fraction=1,
estimated_sample_size=None,
alpha=0.05,
cap=8):
"""
@@ -46,20 +52,13 @@ def group_sequential(x,
y (array_like): sample of a control group
spending_function: name of the alpha spending function, currently
supports: 'obrien_fleming'
information_fraction: share of the information amount at the point
of evaluation, e.g. the share of the maximum sample size
estimated_sample_size: sample size to be achieved towards
the end of experiment
alpha: type-I error rate
cap: upper bound of the adapted z-score
Returns:
-        tuple:
-            - stop label
-            - effect size (delta)
-            - confidence interval of delta
-            - sample size of x
-            - sample size of y
-            - absolute mean of x
-            - absolute mean of y
+        dictionary with statistics
"""
# Checking if data was provided
if x is None or y is None:
@@ -75,6 +74,14 @@
# else:
# fraction = information_fraction

+    n_x = statx.sample_size(_x)
+    n_y = statx.sample_size(_y)
+
+    if not estimated_sample_size:
+        information_fraction = 1.0
+    else:
+        # share of the estimated final sample size, capped at 1.0;
+        # float() guards against integer division under Python 2
+        information_fraction = min(1.0, float(min(n_x, n_y)) / estimated_sample_size)

# alpha spending function
if spending_function in ('obrien_fleming',):
func = eval(spending_function)
@@ -92,19 +99,24 @@
mu_y = np.nanmean(_y)
sigma_x = np.nanstd(_x)
sigma_y = np.nanstd(_y)
-    n_x = statx.sample_size(_x)
-    n_y = statx.sample_size(_y)
z = (mu_x - mu_y) / np.sqrt(sigma_x ** 2 / n_x + sigma_y ** 2 / n_y)

if z > bound or z < -bound:
-    stop = 1
+    stop = True
else:
-    stop = 0
+    stop = False

interval = statx.normal_difference(mu_x, sigma_x, n_x, mu_y, sigma_y, n_y,
[alpha_new * 100 / 2, 100 - alpha_new * 100 / 2])

-    return stop, mu_x - mu_y, interval, n_x, n_y, mu_x, mu_y
+    # return stop, mu_x - mu_y, interval, n_x, n_y, mu_x, mu_y
+    return {'stop' : bool(stop),
+            'delta' : float(mu_x - mu_y),
+            'interval' : interval,
+            'n_x' : int(n_x),
+            'n_y' : int(n_y),
+            'mu_x' : float(mu_x),
+            'mu_y' : float(mu_y)}
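A minimal usage sketch of the new dict-returning API (the data and the estimated sample size below are made up for illustration):

import numpy as np
from expan.core.early_stopping import group_sequential, make_group_sequential

x = np.random.normal(0.1, 1.0, 1000)  # hypothetical treatment sample
y = np.random.normal(0.0, 1.0, 1000)  # hypothetical control sample

res = group_sequential(x, y, estimated_sample_size=5000)
if res['stop']:
    print(res['delta'], res['interval'])

# the factory pre-binds every parameter except the data:
same_test = make_group_sequential(estimated_sample_size=5000)
res = same_test(x, y)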


def HDI_from_MCMC(posterior_samples, credible_mass=0.95):
@@ -159,6 +171,8 @@ def get_or_compile_stan_model(model_file, distribution):
pickle.dump(sm, f)
return sm

+cache_sampling_results = False
+sampling_results = {}  # memoized sampling results

def _bayes_sampling(x, y, distribution='normal', num_iters=25000):
"""
@@ -189,6 +203,11 @@ def _bayes_sampling(x, y, distribution='normal', num_iters=25000):
_x = drop_nan(_x)
_y = drop_nan(_y)

+    key = (str(_x), str(_y), num_iters)
+
+    if cache_sampling_results and key in sampling_results:
+        return sampling_results[key]

mu_x = np.nanmean(_x)
mu_y = np.nanmean(_y)
n_x = statx.sample_size(_x)
@@ -215,9 +234,18 @@ def _bayes_sampling(x, y, distribution='normal', num_iters=25000):
control={'stepsize': 0.01, 'adapt_delta': 0.99})
traces = fit.extract()

+    if cache_sampling_results:
+        sampling_results[key] = (traces, n_x, n_y, mu_x, mu_y)

return traces, n_x, n_y, mu_x, mu_y
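The memoisation above is opt-in via the module-level flag; a sketch of the intended use (x and y as in the earlier sketch; note the cache key is built from str() of the arrays, which numpy truncates for long samples, so large inputs may collide):

from expan.core import early_stopping as es

es.cache_sampling_results = True
first = es._bayes_sampling(x, y)   # runs the Stan sampler and stores the result
second = es._bayes_sampling(x, y)  # same key, served from sampling_results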


+def make_bayes_factor(distribution='normal', num_iters=25000):
+    def f(x, y):
+        return bayes_factor(x, y, distribution, num_iters)
+    return f


def bayes_factor(x, y, distribution='normal', num_iters=25000):
"""
Args:
@@ -228,26 +256,38 @@ def bayes_factor(x, y, distribution='normal', num_iters=25000):
num_iters: number of iterations of bayes sampling
Returns:
-        tuple:
-            - stop label
-            - effect size (delta)
-            - credible interval of delta
-            - sample size of x
-            - sample size of y
-            - absolute mean of x
-            - absolute mean of y
+        dictionary with statistics
"""
traces, n_x, n_y, mu_x, mu_y = _bayes_sampling(x, y, distribution=distribution, num_iters=num_iters)
kde = gaussian_kde(traces['delta'])

prior = cauchy.pdf(0, loc=0, scale=1)
# BF_01
bf = kde.evaluate(0)[0] / prior
-    stop = int(bf > 3 or bf < 1 / 3.)
-    interval = HDI_from_MCMC(traces['delta'])
-
-    return stop, mu_x - mu_y, {'lower': interval[0], 'upper': interval[1]}, n_x, n_y, mu_x, mu_y

+    # stop = int(bf > 3 or bf < 1 / 3.)
+    stop = bf > 3 or bf < 1 / 3.
+
+    credibleMass = 0.95  # another magic number
+    leftOut = 1.0 - credibleMass
+    p1 = round(leftOut/2.0, 5)
+    p2 = round(1.0 - leftOut/2.0, 5)
+    interval = HDI_from_MCMC(traces['delta'], credibleMass)
+
+    # return stop, mu_x - mu_y, {'lower': interval[0], 'upper': interval[1]}, n_x, n_y, mu_x, mu_y
+    return {'stop' : bool(stop),
+            'delta' : float(mu_x - mu_y),
+            'interval' : {p1*100: interval[0], p2*100: interval[1]},
+            'n_x' : int(n_x),
+            'n_y' : int(n_y),
+            'mu_x' : float(mu_x),
+            'mu_y' : float(mu_y),
+            'num_iters': num_iters}
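The stop rule above is a Savage-Dickey density ratio: BF_01 compares the posterior density of delta at zero with the Cauchy(0, 1) prior density at zero, and the test stops once the evidence is at least 3:1 either way. A usage sketch (hypothetical data as before):

res = bayes_factor(x, y, distribution='normal', num_iters=25000)
res['stop']      # True once BF_01 > 3 (evidence for H0) or BF_01 < 1/3 (evidence for H1)
res['interval']  # {2.5: lower, 97.5: upper} -- the 95% HDI of delta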


+def make_bayes_precision(distribution='normal', posterior_width=0.08, num_iters=25000):
+    def f(x, y):
+        return bayes_precision(x, y, distribution, posterior_width, num_iters)
+    return f

def bayes_precision(x, y, distribution='normal', posterior_width=0.08, num_iters=25000):
"""
@@ -261,17 +301,24 @@ def bayes_precision(x, y, distribution='normal', posterior_width=0.08, num_iters=25000):
num_iters: number of iterations of bayes sampling
Returns:
-        tuple:
-            - stop label
-            - effect size (delta)
-            - credible interval of delta
-            - sample size of x
-            - sample size of y
-            - absolute mean of x
-            - absolute mean of y
+        dictionary with statistics
"""
traces, n_x, n_y, mu_x, mu_y = _bayes_sampling(x, y, distribution=distribution, num_iters=num_iters)
-    interval = HDI_from_MCMC(traces['delta'])
-    stop = int(interval[1] - interval[0] < posterior_width)
-
-    return stop, mu_x - mu_y, {'lower': interval[0], 'upper': interval[1]}, n_x, n_y, mu_x, mu_y
+    credibleMass = 0.95  # another magic number
+    leftOut = 1.0 - credibleMass
+    p1 = round(leftOut/2.0, 5)
+    p2 = round(1.0 - leftOut/2.0, 5)
+    interval = HDI_from_MCMC(traces['delta'], credibleMass)
+
+    # stop = int(interval[1] - interval[0] < posterior_width)
+    stop = interval[1] - interval[0] < posterior_width
+
+    # return stop, mu_x - mu_y, {'lower': interval[0], 'upper': interval[1]}, n_x, n_y, mu_x, mu_y
+    return {'stop' : bool(stop),
+            'delta' : float(mu_x - mu_y),
+            'interval' : {p1*100: interval[0], p2*100: interval[1]},
+            'n_x' : int(n_x),
+            'n_y' : int(n_y),
+            'mu_x' : float(mu_x),
+            'mu_y' : float(mu_y),
+            'num_iters': num_iters}
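bayes_precision stops on estimation precision rather than evidence: it reports stop as soon as the 95% HDI of delta is narrower than posterior_width. A sketch using the factory form (hypothetical data as before):

test = make_bayes_precision(posterior_width=0.08, num_iters=25000)
res = test(x, y)
width = res['interval'][97.5] - res['interval'][2.5]
assert res['stop'] == (width < 0.08)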
