In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import norm
from tqdm.notebook import tqdm

In [1]:
# Work around Python's import-path handling: put the parent directory on sys.path so the ab_testing package can be imported
import os
import sys
import inspect

# Resolve the notebook's directory from the kernel's working directory.
# inspect.getfile(inspect.currentframe()) is unreliable inside a notebook: the
# "file" of a cell is a synthetic name chosen by the kernel (recent ipykernel
# versions use a path under the system temp directory), so its parent is not
# necessarily the directory that contains the ab_testing package.
# NOTE(review): assumes the kernel was started in the notebook's own directory
# (the Jupyter default) - confirm for other launch setups.
currentdir = os.path.abspath(os.getcwd())
parentdir = os.path.dirname(currentdir)
# Guard the insert so re-running this cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

from ab_testing.stratification.stratification import stratified_ttest, get_stratified_statistics

# Effect of relaxed stopping rules on fixed horizon tests

In this notebook we investigate the following question:

When performing a fixed-horizon null hypothesis test with relaxed stopping rules (e.g. peeking and early stopping), what is the effect on the various error rates, such as false positives, false negatives, and sign errors?

In [3]:
from decision_rules import (
    FixedHorizonDecisionRule,
    FixedHorizonDecisionRuleWithPeek,
    FixedHorizonDecisionRuleExtend
)
from simulate import simulate_ab_test_data_binom, ABTestData

# simulations

In [4]:
from simulate_decisions import simulate_decision_progression

In [5]:

def simulate_n_decisions(
    decision_rule,
    decision_rule_kwargs,
    n_ab_tests,
    n_samples_per_increment,
    n_increments,
    mean_a,
    mean_b,
    plot_title=None
):
    """Run ``n_ab_tests`` independent simulated A/B tests through a decision rule.

    For every test, fresh data is generated with ``simulate_ab_test_data_binom``
    and handed, together with ``decision_rule`` and ``decision_rule_kwargs``,
    to ``simulate_decision_progression``.

    Parameters
    ----------
    decision_rule
        Forwarded unchanged as the second positional argument of
        ``simulate_decision_progression``.
    decision_rule_kwargs : dict
        Unpacked as keyword arguments of ``simulate_decision_progression``.
    n_ab_tests : int
        Number of simulated tests, i.e. the length of the returned list.
    n_samples_per_increment, n_increments, mean_a, mean_b
        Forwarded by keyword to ``simulate_ab_test_data_binom``.
    plot_title
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    list
        One ``simulate_decision_progression`` result per simulated test, in
        the order the tests were run.
    """
    def _run_single_test():
        # Data is drawn first, then evaluated by the decision rule.
        ab_test_data = simulate_ab_test_data_binom(
            n_samples_per_increment=n_samples_per_increment,
            n_increments=n_increments,
            mean_a=mean_a,
            mean_b=mean_b,
        )
        return simulate_decision_progression(
            ab_test_data,
            decision_rule,
            **decision_rule_kwargs
        )

    # tqdm only wraps the counter to display a progress bar.
    return [_run_single_test() for _ in tqdm(range(n_ab_tests))]

In [6]:
import pandas as pd

# Settings reused by the simulation cells of this notebook.
ab_test_increments = 42          # forwarded as n_increments
expected_runtime_decision = 21   # forwarded to the decision rules as expected_runtime
alpha = 0.1                      # forwarded to the decision rules as alpha

# Scenario without an effect: both variants are simulated with the same mean.
kwargs_no_effect = {
    "n_ab_tests": 1000,
    "n_samples_per_increment": 1000,
    "n_increments": ab_test_increments,
    "mean_a": 0.05,
    "mean_b": 0.05,
}

# Plain fixed-horizon rule: count how often each DECISION value occurs.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_no_effect,
        decision_rule=FixedHorizonDecisionRule,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    907
STOP REJECT NULL      93
Name: DECISION, dtype: int64

In [7]:
# No-effect scenario with the extending rule
# (extension_interval=2, extension_periods=7): tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_no_effect,
        decision_rule=FixedHorizonDecisionRuleExtend,
        decision_rule_kwargs={"expected_runtime": expected_runtime_decision, "alpha": alpha, "extension_interval": 2, "extension_periods": 7},
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    811
STOP REJECT NULL     189
Name: DECISION, dtype: int64

In [8]:
# No-effect scenario with the peeking rule; peeking_interval is not passed,
# so the rule's own default applies. Tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_no_effect,
        decision_rule=FixedHorizonDecisionRuleWithPeek,
        decision_rule_kwargs={"expected_runtime": expected_runtime_decision, "alpha": alpha},
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    587
STOP REJECT NULL     413
Name: DECISION, dtype: int64

In [9]:
# No-effect scenario with the peeking rule and peeking_interval=7:
# tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_no_effect,
        decision_rule=FixedHorizonDecisionRuleWithPeek,
        decision_rule_kwargs={"expected_runtime": expected_runtime_decision, "alpha": alpha, "peeking_interval": 7},
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    807
STOP REJECT NULL     193
Name: DECISION, dtype: int64

In [10]:
# Scenario with a small positive effect: mean_b (0.0505) is slightly above
# mean_a (0.05).
kwargs_small_positive_effect = {
    "n_ab_tests": 1000,
    "n_samples_per_increment": 1000,
    "n_increments": ab_test_increments,
    "mean_a": 0.05,
    "mean_b": 0.0505,
}

# Plain fixed-horizon rule: count how often each DECISION value occurs.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_small_positive_effect,
        decision_rule=FixedHorizonDecisionRule,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    888
STOP REJECT NULL     112
Name: DECISION, dtype: int64

In [11]:
# Small-positive-effect scenario with the extending rule
# (extension_interval=2, extension_periods=7): tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_small_positive_effect,
        decision_rule=FixedHorizonDecisionRuleExtend,
        decision_rule_kwargs={"expected_runtime": expected_runtime_decision, "alpha": alpha, "extension_interval": 2, "extension_periods": 7},
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    812
STOP REJECT NULL     188
Name: DECISION, dtype: int64

In [12]:
# Small-positive-effect scenario with the peeking rule; peeking_interval is
# not passed, so the rule's own default applies. Tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_small_positive_effect,
        decision_rule=FixedHorizonDecisionRuleWithPeek,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    566
STOP REJECT NULL     434
Name: DECISION, dtype: int64

In [13]:
# Small-positive-effect scenario with the peeking rule and peeking_interval=7:
# tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_small_positive_effect,
        decision_rule=FixedHorizonDecisionRuleWithPeek,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
            "peeking_interval": 7,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP INCONCLUSIVE    778
STOP REJECT NULL     222
Name: DECISION, dtype: int64

In [14]:
# Scenario with a larger positive effect: mean_b (0.055) versus mean_a (0.05).
kwargs_big_positive_effect = {
    "n_ab_tests": 1000,
    "n_samples_per_increment": 1000,
    "n_increments": ab_test_increments,
    "mean_a": 0.05,
    "mean_b": 0.055,
}

# Plain fixed-horizon rule: count how often each DECISION value occurs.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_big_positive_effect,
        decision_rule=FixedHorizonDecisionRule,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP REJECT NULL     719
STOP INCONCLUSIVE    281
Name: DECISION, dtype: int64

In [15]:
# Big-positive-effect scenario with the extending rule
# (extension_interval=2, extension_periods=7): tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_big_positive_effect,
        decision_rule=FixedHorizonDecisionRuleExtend,
        decision_rule_kwargs={"expected_runtime": expected_runtime_decision, "alpha": alpha, "extension_interval": 2, "extension_periods": 7},
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP REJECT NULL     923
STOP INCONCLUSIVE     77
Name: DECISION, dtype: int64

In [16]:
# Big-positive-effect scenario with the peeking rule; peeking_interval is not
# passed, so the rule's own default applies. Tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_big_positive_effect,
        decision_rule=FixedHorizonDecisionRuleWithPeek,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP REJECT NULL     860
STOP INCONCLUSIVE    140
Name: DECISION, dtype: int64

In [17]:
# Big-positive-effect scenario with the peeking rule and peeking_interval=7:
# tally the DECISION column.
pd.DataFrame(
    data=simulate_n_decisions(
        **kwargs_big_positive_effect,
        decision_rule=FixedHorizonDecisionRuleWithPeek,
        decision_rule_kwargs={
            "expected_runtime": expected_runtime_decision,
            "alpha": alpha,
            "peeking_interval": 7,
        },
        plot_title=None,
    )
)["DECISION"].value_counts()

  0%|          | 0/1000 [00:00<?, ?it/s]

STOP REJECT NULL     802
STOP INCONCLUSIVE    198
Name: DECISION, dtype: int64