In [2]:
import numpy as np
from scipy.stats import uniform, poisson, geom, ttest_ind

# NOTE(review): presumably the submitting student's identifier, read by the
# course's grading tooling; nothing else in this notebook uses it -- confirm.
uni = 'yc4153'

## Problem

An app with microtransactions offers a free but limited tier. The vast majority of customers do not spend any money on the app, a small group spends a small amount, and another, even smaller group consists of very heavy spenders. Let's suppose all spending is integer valued.

We can model the pmf of this as follows:

$$p(x) = p_1 \cdot 1_{\{x=0\}} + p_2 \cdot g_2(x) + p_3 \cdot g_3(x)$$

where $p_1 + p_2 + p_3 = 1$, $g_2$ is the pmf of a Poisson distribution with parameter $\lambda$, and $g_3$ is the pmf of $5+Y$ where $Y\sim \mathrm{Geo}(q)$.

Currently, the parameters here are:

$$p_1 = 0.98, p_2=0.019, p_3 = .001, \lambda=3, q=0.1$$

We are never able to observe which group a person is in, just their revenue.

Product management has proposed a change in the conversion funnel that they believe will increase conversions, particularly amongst the second group. They want to perform an A/B test of this with $n$ customers. As the distribution of revenues is quite skewed, they wish to explicitly compute the power of this test in the event that the test group has the same values of $\lambda$ and $q$, but $p_1 = 0.975, p_2=0.024, p_3=.001$ (i.e. the probability of getting a significant result if this particular alternative is true). Assume they are doing a one-sided two-sample t-test with level $\alpha=0.01$. 

Write the following functions.

- generate_counterfactuals(n, lam, q) which returns an n x 2 array, where the first column is a poisson random variable with parameter lam for each person, and the second column is a geometric random variable with parameter q for each person.

- generate_revenues(n, lam, q, p) which calls generate_counterfactuals, and then generates a revenue for each individual. 

- run_experiment(n, lam, q, p1, p2) which creates the two test groups of size n each, calculates their mean revenue, and performs a two-sample t-test on them. It should return 1 if we reject the null hypothesis that the groups have the same mean revenue, and 0 otherwise. Let p1 be the mixing parameters for the control group and p2 be the mixing parameters for the experimental group. For the t-test use scipy.stats.ttest_ind; do not assume equal variance between the two groups.

- calc_power(n, m, lam, q, p1, p2) which runs the experiment m times to calculate the power. 

Note: In generate_counterfactuals, the second column should generate a geometric distribution + 5

In [8]:
def generate_counterfactuals(n, lam, q):
    """Draw both candidate spend amounts for each of ``n`` customers.

    Parameters
    ----------
    n : int
        Number of customers (rows) to simulate.
    lam : float
        Mean of the Poisson distribution used for column 0.
    q : float
        Success probability of the geometric distribution used for column 1.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 2)``: column 0 holds Poisson(``lam``) draws and
        column 1 holds ``5 +`` geometric(``q``) draws.
    """
    # The Poisson draw is taken first, then the geometric one.
    draws = (
        poisson.rvs(mu=lam, size=n),
        geom.rvs(p=q, size=n) + 5,
    )
    return np.column_stack(draws)


def generate_revenues(n, lam, q, p):
    """Simulate the revenue of ``n`` customers from the three-group mixture.

    Each customer gets one uniform draw ``u`` from NumPy's global random
    state, which selects the group:

    * ``u < p[0]``                      -> revenue 0,
    * ``p[0] <= u < p[0] + p[1]``       -> the Poisson counterfactual,
    * otherwise                         -> the ``5 + geometric`` counterfactual.

    Parameters
    ----------
    n : int
        Number of customers to simulate.
    lam : float
        Poisson parameter forwarded to ``generate_counterfactuals``.
    q : float
        Geometric parameter forwarded to ``generate_counterfactuals``.
    p : sequence of float
        Mixing probabilities; only ``p[0]`` and ``p[1]`` are read, the last
        group receives the remaining probability mass.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n`` with one revenue per customer.
    """
    counterfactuals = generate_counterfactuals(n, lam, q)

    # Draw all group-selection uniforms at once instead of looping in Python;
    # the experiment calls this with very large n, where a per-row loop dominates.
    u = np.random.rand(n)
    non_spender = u < p[0]
    light_spender = ~non_spender & (u < p[0] + p[1])

    # Start from the light/heavy choice, then zero out the non-spenders.
    revenues = np.where(
        light_spender, counterfactuals[:, 0], counterfactuals[:, 1]
    ).astype(float)
    revenues[non_spender] = 0.0
    return revenues


def run_experiment(n, lam, q, p1, p2):
    """Simulate one A/B test and report whether the null hypothesis is rejected.

    Two groups of ``n`` customers are simulated with ``generate_revenues``:
    the control group with mixing parameters ``p1`` and the experimental
    group with ``p2``. They are compared with a one-sided
    (``alternative='less'``) two-sample t-test that does not assume equal
    variances.

    Returns
    -------
    int
        1 when the p-value is below 0.01, otherwise 0.
    """
    control = generate_revenues(n, lam, q, p1)
    treatment = generate_revenues(n, lam, q, p2)

    welch = ttest_ind(control, treatment, equal_var=False, alternative='less')

    return 1 if welch.pvalue < 0.01 else 0
    

def calc_power(n, m, lam, q, p1, p2):
    """Estimate the power of the test by repeating the experiment ``m`` times.

    Every repetition calls ``run_experiment(n, lam, q, p1, p2)``, which
    yields 1 on rejection and 0 otherwise.

    Returns
    -------
    float
        Fraction of the ``m`` repetitions in which the null was rejected.
    """
    outcomes = [run_experiment(n, lam, q, p1, p2) for _ in range(m)]
    return sum(outcomes) / m


In [4]:
def generate_counterfactuals(n, lam, q):
    """Return an ``(n, 2)`` array of per-customer counterfactual spends.

    Column 0 is a Poisson draw with mean ``lam``; column 1 is a geometric
    draw with success probability ``q``, shifted up by 5.
    """
    light_spend = poisson.rvs(mu=lam, size=n)
    heavy_spend = geom.rvs(p=q, size=n) + 5
    columns = [light_spend, heavy_spend]
    return np.column_stack(columns)

def generate_revenues(n, lam, q, p):
    """Generate one revenue per customer from the zero/Poisson/geometric mixture.

    A uniform draw per customer (from NumPy's global random state) picks the
    group: below ``p[0]`` the revenue is 0, below ``p[0] + p[1]`` it is the
    Poisson counterfactual, otherwise it is the ``5 + geometric``
    counterfactual.

    Parameters
    ----------
    n : int
        Number of customers.
    lam, q : float
        Parameters passed through to ``generate_counterfactuals``.
    p : sequence of float
        Mixing probabilities; only the first two entries are read.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n``.
    """
    counterfactuals = generate_counterfactuals(n, lam, q)

    # Vectorised replacement for the per-customer Python loop, which is the
    # bottleneck when n is in the millions.
    u = np.random.rand(n)
    revenues = np.select(
        # Conditions are checked in order; the first match wins.
        [u < p[0], u < p[0] + p[1]],
        [0, counterfactuals[:, 0]],
        default=counterfactuals[:, 1],
    )
    # Keep the float dtype callers previously got from np.zeros(n).
    return revenues.astype(float)

def run_experiment(n, lam, q, p1, p2):
    """Run a single simulated A/B test; return 1 on rejection, else 0.

    The control group uses mixing parameters ``p1`` and the experimental
    group uses ``p2``. The groups are compared with an unequal-variance
    t-test using ``alternative='less'``, and the null is rejected when the
    p-value is below 0.01.
    """
    control = generate_revenues(n, lam, q, p1)
    treatment = generate_revenues(n, lam, q, p2)
    result = ttest_ind(control, treatment, equal_var=False, alternative='less')
    if result.pvalue < 0.01:
        return 1
    return 0

def calc_power(n, m, lam, q, p1, p2):
    """Return the share of ``m`` simulated experiments that reject the null.

    Each experiment is one call to ``run_experiment(n, lam, q, p1, p2)``,
    which returns 1 for a rejection and 0 otherwise.
    """
    rejections = sum(run_experiment(n, lam, q, p1, p2) for _ in range(m))
    return rejections / m

In [24]:
# Mixing probabilities for the control group (p1) and the experimental
# group (p2), ordered as in the problem statement: zero spend, Poisson
# spend, 5 + geometric spend.
p1 = [0.98, 0.019, 0.001]
p2 = [0.975, 0.024, 0.001]
# Poisson parameter and geometric parameter shared by both groups.
lam = 3
q = 0.1
# Estimate power from m = 100 simulated experiments with n = 1,000,000
# customers per group.
# NOTE(review): no random seed is set, so the displayed value may not be
# reproduced exactly; the result also depends on which of the duplicated
# function definitions was executed last -- confirm under Restart & Run All.
calc_power(1000000, 100, lam, q, p1, p2)

1.0

In [15]:
# Quick smoke test with only n = 2 customers per group; the call returns 0 or 1.
run_experiment(2, lam, q, p1, p2)

0

In [7]:
# Inspect 10 rows of counterfactual draws: column 0 is the Poisson draw,
# column 1 is 5 + the geometric draw.
counterfactuals = generate_counterfactuals(10, lam, q)
counterfactuals

array([[ 1,  7],
       [ 2,  9],
       [ 5, 10],
       [ 5, 17],
       [ 0,  6],
       [ 3,  7],
       [ 3, 26],
       [ 2, 14],
       [ 2, 37],
       [ 3, 10]], dtype=int64)

In [19]:
def generate_counterfactuals(n, lam, q):
    """Simulate, for ``n`` people, what each would spend in either paying group.

    Returns
    -------
    numpy.ndarray
        ``(n, 2)`` array whose first column is a Poisson(``lam``) sample and
        whose second column is a geometric(``q``) sample plus 5.
    """
    poisson_col = poisson.rvs(lam, size=n)
    shifted_geom_col = geom.rvs(q, size=n) + 5
    return np.column_stack([poisson_col, shifted_geom_col])


def generate_revenues(n, lam, q, p):
    """Draw a revenue for each of ``n`` individuals from the mixture model.

    For every individual a uniform number ``u`` is drawn from NumPy's global
    random state. The revenue is 0 when ``u < p[0]``, the Poisson
    counterfactual when ``u < p[0] + p[1]``, and the ``5 + geometric``
    counterfactual otherwise.

    Parameters
    ----------
    n : int
        Number of individuals.
    lam : float
        Poisson parameter passed to ``generate_counterfactuals``.
    q : float
        Geometric parameter passed to ``generate_counterfactuals``.
    p : sequence of float
        Mixing probabilities; ``p[0]`` and ``p[1]`` are the only entries used.

    Returns
    -------
    numpy.ndarray
        Float array with ``n`` revenues.
    """
    counterfactuals = generate_counterfactuals(n, lam, q)

    # All uniforms are drawn in one call; a Python loop over n rows is far
    # too slow for the sample sizes used in the power calculation.
    u = np.random.rand(n)
    paying_spend = np.where(
        u < p[0] + p[1], counterfactuals[:, 0], counterfactuals[:, 1]
    )
    # The 0.0 literal makes the result float, as with the np.zeros(n) buffer.
    return np.where(u < p[0], 0.0, paying_spend)

def run_experiment(n, lam, q, p1, p2):
    """Simulate one A/B test and return 1 if the null hypothesis is rejected.

    Parameters
    ----------
    n : int
        Number of customers in each group.
    lam, q : float
        Poisson and geometric parameters shared by both groups.
    p1 : sequence of float
        Mixing parameters of the control group.
    p2 : sequence of float
        Mixing parameters of the experimental group.

    Returns
    -------
    int
        1 if the one-sided p-value is below the 0.01 level, otherwise 0.
    """
    revenue_1 = generate_revenues(n, lam, q, p1)
    revenue_2 = generate_revenues(n, lam, q, p2)
    # The problem calls for a one-sided test: the alternative is that the
    # control mean is below the experimental mean. Without `alternative`,
    # ttest_ind runs a two-sided test and understates the power.
    # equal_var=False leaves the group variances unconstrained.
    _, p_value = ttest_ind(
        revenue_1, revenue_2, equal_var=False, alternative='less'
    )
    return 1 if p_value < 0.01 else 0

def calc_power(n, m, lam, q, p1, p2):
    """Estimate power as the rejection rate over ``m`` simulated experiments.

    ``run_experiment(n, lam, q, p1, p2)`` is called ``m`` times and its 0/1
    results are accumulated.

    Returns
    -------
    float
        Accumulated rejections divided by ``m``.
    """
    rejections = 0
    trial = 0
    while trial < m:
        rejections = rejections + run_experiment(n, lam, q, p1, p2)
        trial += 1
    return rejections / m