In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pymc3 as pm
import arviz as az

# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set()


In [None]:
def logit_pure(x):
    """Map log-odds ``x`` to a probability via the sigmoid ``1 / (1 + exp(-x))``.

    NOTE: despite its name this is the *inverse* logit (logistic / sigmoid)
    function. The name is kept because other cells call it.

    The expression is written as ``1 / (1 + exp(-x))`` instead of
    ``exp(x) / (1 + exp(x))``: the latter evaluates ``inf / inf = nan`` as soon
    as ``exp(x)`` overflows, whereas this form saturates at 1.0 for very large
    ``x`` and at 0.0 for very negative ``x``.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) on the log-odds scale. Only ``np.exp`` and arithmetic
        operators are applied to ``x``; other cells also pass PyMC3 variables.

    Returns
    -------
    Same kind of object as ``np.exp(x)`` returns, holding the probabilities.
    """
    return 1 / (1 + np.exp(-x))

def logistic_pure(p):
    """Return the natural log of the odds ``p / (1 - p)`` for probability ``p``.

    NOTE: this is the logit (log-odds) transform, notwithstanding the name.
    """
    odds = p / (1 - p)
    return np.log(odds)

In [None]:
### Simulated ground-truth population; in practice it is only observable through sampling ###
# alternative seed: np.random.seed(4711)
np.random.seed(666)

POP_SIZE = 100000 # pop in each cohort #
MAX_CONV_RATE = 0.2 # true conversion rates are drawn strictly below this value #

# One cohort per (country, age index) pair: every country appears once for each age index.
cohort_countries = ['SWE','SWE','FIN','FIN','DEN','DEN','NOR','NOR']
cohort_conversions = np.random.randint(1,int(MAX_CONV_RATE * POP_SIZE),size=8)

pop_df = pd.DataFrame({'country' : cohort_countries,
                       'pop' : [POP_SIZE] * 8,
                       'conversions' : cohort_conversions,
                       'age_idx' : [0,1] * 4})

# Derived columns: the true rate per cohort and an integer code per country.
pop_df = pop_df.assign(conversion_rate=pop_df['conversions'] / pop_df['pop'],
                       country_idx=pd.factorize(pop_df['country'])[0])
pop_df

In [None]:
### create a binary representation of conversions to enable sampling ###
def create_pop(df):
    """Expand per-cohort conversion counts into one 0/1 flag per individual.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cohort with integer-valued columns ``'pop'`` (cohort size)
        and ``'conversions'`` (number of converters, at most ``'pop'``).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(df), df['pop'].max())``. Row ``k`` holds
        exactly ``conversions[k]`` ones at random positions among its first
        ``pop[k]`` entries and zeros elsewhere, so cohorts smaller than the
        largest one are zero-padded on the right.

    Notes
    -----
    Draws from the global ``np.random`` state: one ``choice`` call per row, in
    row order.
    """
    # Read the count columns directly. ``iterrows`` yields one Series per row
    # and upcasts integer counts to float on an all-numeric frame, and
    # ``np.random.choice`` rejects a float ``size``.
    pop_sizes = [int(v) for v in df['pop']]
    conv_counts = [int(v) for v in df['conversions']]

    # Size the array from the frame itself so the function does not depend on
    # a notebook-level constant matching the data.
    width = max(pop_sizes, default=0)
    pop_arr = np.zeros((len(pop_sizes), width))

    for row, (pop_size, n_conv) in enumerate(zip(pop_sizes, conv_counts)):
        converted = np.random.choice(np.arange(pop_size), replace=False, size=n_conv)
        pop_arr[row, converted] = 1
    return pop_arr

In [None]:
### one row per cohort of pop_df, one column per individual ###
### cell value 1 == conversion, 0 == no conversion ###

sim_pop = (
    pd.DataFrame(create_pop(pop_df))
    .astype(int)
    .rename_axis(columns='individual')
)
sim_pop

In [None]:
### draw one random sample, without replacement, from every cohort in sim_pop ###
MAX_SAMPLE_SIZE = int(0.01 * POP_SIZE) # sample sizes are drawn below this bound (1% of a cohort) #

sample_records = []
for cohort_idx, individuals in sim_pop.iterrows():
    # Lower bound is 1, not 0: an empty sample would produce 0 / 0 = NaN as its
    # conversion_rate below and a zero-trial row in every later computation.
    n_sampled = np.random.randint(1,MAX_SAMPLE_SIZE)
    sample = np.random.choice(individuals,n_sampled,replace=False)
    sample_records.append((sample.sum(),len(sample)))

# dtype=float stores both count columns as floats.
samples = pd.DataFrame(sample_records,columns=['conversions','sample_size'],dtype=float)
samples['conversion_rate'] = samples['conversions'] / samples['sample_size']
samples

In [None]:
import scipy.stats as sps

# Least-squares line of conversions against sample size over the cohort samples.
slope,intercept,_,_,_ = sps.linregress(samples['sample_size'], samples['conversions'])
print (slope,intercept)
print ()
print (samples.mean())
ax = samples.plot(x='sample_size',y='conversions',style='o',figsize=(18,12))

# Cover every sample size the sampling cell can draw; tied to MAX_SAMPLE_SIZE so the
# lines keep spanning the data if POP_SIZE is changed.
X = np.arange(MAX_SAMPLE_SIZE)
mean_rate = samples['conversion_rate'].mean()

# y = kx (intercept = 0), with k the unweighted mean of the per-sample conversion rates
ax.plot(X,X * mean_rate,ls='dashed',color='k',
       label=r'no intercept, $\beta$ : {:.4f}'.format(mean_rate))

# y = kx + m
ax.plot(X,X * slope + intercept,color='orange',ls='dashed',
       label=r'intercept, $\beta$ : {:.4f}, $\alpha$ : {:.4f}'.format(slope,intercept))
ax.set_ylabel('conversions')
ax.legend(loc='upper left')

In [None]:
# Unweighted and size-weighted mean conversion rate, population next to samples.
pop_rate_mean = pop_df['conversion_rate'].mean()
sample_rate_mean = samples['conversion_rate'].mean()
pop_rate_weighted = np.average(pop_df['conversion_rate'],weights=pop_df['pop'])
sample_rate_weighted = np.average(samples['conversion_rate'],weights=samples['sample_size'])

print ('avg. rate pop: ',pop_rate_mean)
print ('avg. rate samples : ',sample_rate_mean)
print ('weighted avg. rate pop : ',pop_rate_weighted)
print ('weighted avg. rate samples : ',sample_rate_weighted)


In [None]:
# Each cohort's true rate next to the rate estimated from its sample.
cmp_rates = pd.concat([pop_df['conversion_rate'],samples['conversion_rate']],axis=1)
cmp_rates.columns = ['pop_rate','sample_rate']
# Relative difference is ratio - 1, so 0 means the sample matched the true rate exactly.
# This is the same definition the A/B model uses for its 'rel_diff' quantity.
cmp_rates['rel_diff'] = cmp_rates['sample_rate'] / cmp_rates['pop_rate'] - 1
cmp_rates

In [None]:
### prior predictive check: draws of alpha under a Normal(0, 1.5) prior ###

with pm.Model() as ppc:
    alpha = pm.Normal('alpha',mu=0,sd=1.5)
    prior_predictive = pm.sample_prior_predictive(samples=5000)

    # histogram of the 5000 prior draws of alpha
    alpha_draws = prior_predictive['alpha']
    plt.hist(alpha_draws)

In [None]:
# Prior predictive draws for the hierarchical prior: alpha ~ Normal(alpha_bar, sigma).
with pm.Model() as ppc2:
    alpha_bar = pm.Normal('alpha_bar',mu=0,sd=1.5)
    sigma = pm.Exponential('sigma',1)
    alpha = pm.Normal('alpha',mu=alpha_bar,sd=sigma)

    prior_predictive = pm.sample_prior_predictive(samples=1000)

# One histogram per variable, stacked vertically on shared x and y axes.
fig,axes = plt.subplots(3,1,figsize=(18,12),sharex=True,sharey=True)
ax_alpha_bar,ax_sigma,ax_alpha = axes
ax_alpha_bar.hist(prior_predictive['alpha_bar'])
ax_sigma.hist(prior_predictive['sigma'])
ax_alpha.hist(prior_predictive['alpha'])

In [None]:
### aggregated model: all samples summed into a single (conversions, trials) pair ###

total_trials = samples['sample_size'].sum()
total_conversions = samples['conversions'].sum()

with pm.Model() as no_pool_agg_mdl:
    sigma = pm.Exponential('sigma',1)
    alpha = pm.Normal('alpha',mu=0,sd=sigma)

    # alpha lives on the log-odds scale; p_alpha records it as a probability
    p_alpha = pm.Deterministic('p_alpha',pm.math.invlogit(alpha))

    obs = pm.Binomial('obs',
                      n=total_trials,
                      p=pm.math.invlogit(alpha),
                      observed=total_conversions)

    trace_no_pool_agg_mdl = pm.sample(10000,
                                      tune=10000,
                                      target_accept=0.99,
                                      return_inferencedata=False)

In [None]:
# Posterior summary table of the aggregated model with an 89% HDI.
with no_pool_agg_mdl:
    summary_no_pool_agg = az.summary(trace_no_pool_agg_mdl,hdi_prob=0.89,round_to=5)
    print (summary_no_pool_agg)

In [None]:
# Mean of the per-sample conversion rates, weighted by sample size.
sample_rates = samples['conversion_rate']
sample_weights = samples['sample_size']
np.average(sample_rates,weights=sample_weights)

           mean       sd  hdi_5.5%  hdi_94.5%  mcse_mean  mcse_sd  \
alpha   -2.52340  0.05521  -2.61286   -2.43705    0.00058  0.00041   
p_alpha  0.07432  0.00380   0.06832    0.08039    0.00004  0.00003   

           ess_bulk     ess_tail    r_hat  
alpha    9209.91749  13797.16343  0.99994  
p_alpha  9209.91749  13797.16343  0.99994  

In [None]:
### non-aggregated model: one binomial row per sample, a single alpha shared by all rows ###

sample_trials = samples['sample_size']
sample_conversions = samples['conversions']

with pm.Model() as no_pool_mdl:

    sigma = pm.Exponential('sigma',1)
    alpha = pm.Normal('alpha',mu=0,sd=sigma)

    # alpha lives on the log-odds scale; p_alpha records it as a probability
    p_alpha = pm.Deterministic('p_alpha',logit_pure(alpha))

    obs = pm.Binomial('obs',
                      n=sample_trials,
                      p=logit_pure(alpha),
                      observed=sample_conversions)

    trace_no_pool_mdl = pm.sample(10000,
                                  tune=10000,
                                  return_inferencedata=False,
                                  target_accept=0.99)
    

In [None]:
# Posterior summary table of the non-aggregated model with an 89% HDI.
with no_pool_mdl:
    summary_no_pool = az.summary(trace_no_pool_mdl,hdi_prob=0.89,round_to=5)
    print (summary_no_pool)

In [None]:
# Size-weighted mean of the sample conversion rates.
np.average(a=samples['conversion_rate'],
           weights=samples['sample_size'])

In [None]:
### compare aggregated vs non-aggregated models ###
# Each trace is converted together with the model that produced it. Only models
# defined in earlier cells may be referenced here (pooling_age_mdl is created in a
# later cell), and nesting two model contexts would leave only the inner one active.
forest_data_no_pool = [az.from_pymc3(trace=trace_no_pool_mdl,model=no_pool_mdl),
                       az.from_pymc3(trace=trace_no_pool_agg_mdl,model=no_pool_agg_mdl)]

# model_names takes one string label per entry of data, not the Model objects.
az.plot_forest(data=forest_data_no_pool,
               model_names=['no_pool_mdl','no_pool_agg_mdl'],var_names=['p_alpha'],combined=True,
               hdi_prob=0.89);

In [None]:
# Same four reference rates as reported before the models, for comparison with the posteriors.
rate_report = [
    ('avg. rate pop : ',pop_df['conversion_rate'].mean()),
    ('avg. rate samples : ',samples['conversion_rate'].mean()),
    ('weighted avg. rate pop : ',np.average(pop_df['conversion_rate'],weights=pop_df['pop'])),
    ('weighted avg. rate samples : ',np.average(samples['conversion_rate'],weights=samples['sample_size'])),
]
for rate_label,rate_value in rate_report:
    print (rate_label,rate_value)

In [None]:
# Posterior densities of the non-aggregated model's variables, annotated with an 89% HDI.
with no_pool_mdl:
    az.plot_posterior(data=trace_no_pool_mdl,
                      hdi_prob=0.89)

In [None]:
# Put each cohort's population figures and sample figures on one row. Both frames have
# columns named 'conversions' and 'conversion_rate', so prefix them before joining.
pop_part = pop_df.rename(columns={'conversions' : 'pop_conversions',
                                  'conversion_rate' : 'pop_conversion_rate'})
sample_part = samples.rename(columns={'conversions' : 'sample_conversions',
                                      'conversion_rate' : 'sample_conversion_rate'})
combined_df = pd.concat([pop_part,sample_part],axis=1)

combined_df


In [None]:
### full pooling on age only: one alpha per age index, shared by every row with that index ###

age_of_row = combined_df['age_idx']

with pm.Model() as pooling_age_mdl:
    alpha = pm.Normal('alpha',mu=0,sd=1.5,shape=2)
    p_alpha = pm.Deterministic('p_alpha',logit_pure(alpha))

    # pick each row's alpha by its age index, then map it to a probability
    row_rate = logit_pure(alpha[age_of_row])
    obs = pm.Binomial('obs',
                      n=combined_df['sample_size'],
                      p=row_rate,
                      observed=combined_df['sample_conversions'])

    trace_pooling_age_mdl = pm.sample(10000,tune=10000,return_inferencedata=False)

In [None]:
# Posterior summary table of the age-pooled model with an 89% HDI.
with pooling_age_mdl:
    summary_pooling_age = az.summary(trace_pooling_age_mdl,hdi_prob=0.89)
    print (summary_pooling_age)

In [None]:
### partial pooling on age only: the two age alphas share a learned mean and spread ###

age_group = combined_df['age_idx']

with pm.Model() as partial_pooling_age_mdl:
    # hyper-priors: common mean and standard deviation of the per-age alphas
    alpha_bar = pm.Normal('alpha_bar',mu=0,sd=1.5)
    sigma = pm.Exponential('sigma',1)

    alpha = pm.Normal('alpha',mu=alpha_bar,sd=sigma,shape=2)
    p_alpha = pm.Deterministic('p_alpha',logit_pure(alpha))

    # pick each row's alpha by its age index, then map it to a probability
    group_rate = logit_pure(alpha[age_group])
    obs = pm.Binomial('obs',
                      n=combined_df['sample_size'],
                      p=group_rate,
                      observed=combined_df['sample_conversions'])

    trace_partial_pooling_age_mdl = pm.sample(10000,
                                              tune=10000,
                                              target_accept=0.99,
                                              return_inferencedata=False)

In [None]:
# Posterior summary table of the partially pooled age model with an 89% HDI.
with partial_pooling_age_mdl:
    summary_partial_pooling_age = az.summary(data=trace_partial_pooling_age_mdl,hdi_prob=0.89)
    print (summary_partial_pooling_age)

In [None]:
# Forest plot of p_alpha for the fully and the partially pooled age models.
# Each trace is converted together with its own model; nesting two model contexts
# would leave only the inner one active for both conversions.
forest_data_age = [az.from_pymc3(trace=trace_pooling_age_mdl,model=pooling_age_mdl),
                   az.from_pymc3(trace=trace_partial_pooling_age_mdl,model=partial_pooling_age_mdl)]

# model_names takes one string label per entry of data, not the Model objects.
az.plot_forest(data=forest_data_age,
               model_names=['pooling_age_mdl','partial_pooling_age_mdl'],var_names='p_alpha',
               hdi_prob=0.89,combined=True,figsize=(18,12));

In [None]:
# Show the joined population / sample table again for reference.
combined_df

In [None]:
### weighted avg of samples gets closest to Bayes mean ###

rate_cols = ['pop_conversion_rate','sample_conversion_rate']
is_age_0 = combined_df['age_idx'] == 0
is_age_1 = combined_df['age_idx'] == 1

# rate columns and sample-size weights, split by age index
conv_0 = combined_df.loc[is_age_0,rate_cols]
weights_0 = combined_df.loc[is_age_0,'sample_size']

conv_1 = combined_df.loc[is_age_1,rate_cols]
weights_1 = combined_df.loc[is_age_1,'sample_size']

# unweighted means per age index
print (conv_0.mean())
print (conv_1.mean())

# sample rates weighted by sample size
print ('weighted sample avg 0 : ',np.average(conv_0['sample_conversion_rate'],weights=weights_0))
print ('weighted sample avg 1 : ',np.average(conv_1['sample_conversion_rate'],weights=weights_1))
print ()

# population rates weighted by the same sample sizes
print ('weighted_pop_avg_0 : ',np.average(conv_0['pop_conversion_rate'],weights=weights_0))
print ('weighted_pop_avg_1 : ',np.average(conv_1['pop_conversion_rate'],weights=weights_1))
print()

# posterior means of p_alpha per age index from the two age models
full_pool_mean = trace_pooling_age_mdl['p_alpha'].mean(axis=0)
partial_pool_mean = trace_partial_pooling_age_mdl['p_alpha'].mean(axis=0)
print ('Bayes full pool mean : ',full_pool_mean)
print ('Bayes partial pool mean : ',partial_pool_mean)

In [None]:
# Show the joined population / sample table again for reference.
combined_df

In [None]:
# Scratch check of az.plot_forest on 10000 draws from a Normal(0, 1); uses none of the model traces.
az.plot_forest({'fake' : sps.norm.rvs(0,1,10000)},hdi_prob=0.89)

In [None]:
# One forest plot of every model's p_alpha draws; the age models contribute one entry per age index.
fig,ax = plt.subplots(figsize=(18,12))

full_pool_draws = trace_pooling_age_mdl['p_alpha']
partial_pool_draws = trace_partial_pooling_age_mdl['p_alpha']

posterior_rates = {
    'p_alpha_no_pool_agg' : trace_no_pool_agg_mdl['p_alpha'],
    'p_alpha_no_pool' : trace_no_pool_mdl['p_alpha'],
    'p_alpha_full_pool_age[0]' : full_pool_draws[:,0],
    'p_alpha_full_pool_age[1]' : full_pool_draws[:,1],
    'p_alpha_partial_pool_age[0]' : partial_pool_draws[:,0],
    'p_alpha_partial_pool_age[1]' : partial_pool_draws[:,1],
}

az.plot_forest(posterior_rates,hdi_prob=0.89,ax=ax,colors=['red'],markersize=10)


In [None]:
### A/B test on the two age groups ###

# label for each age index: 0 -> 'young', 1 -> 'old'
coords = {'age' : ['young','old']}
ab_age_idx = combined_df['age_idx']

with pm.Model(coords=coords) as a_b_full_pool_age:
    sigma = pm.Exponential('sigma',1)
    alpha = pm.Normal('alpha',mu=0,sd=sigma,dims='age')

    p_alpha = pm.Deterministic('p_alpha',pm.math.invlogit(alpha),dims='age')

    ### 1 == treatment, 0 == control ###
    p_control = p_alpha[0]
    p_treatment = p_alpha[1]
    rel_diff = pm.Deterministic('rel_diff',(p_treatment / p_control) - 1 )
    abs_diff = pm.Deterministic('abs_diff',p_treatment - p_control)

    obs = pm.Binomial('obs',
                      n=combined_df['sample_size'],
                      p=pm.math.invlogit(alpha[ab_age_idx]),
                      observed=combined_df['sample_conversions'])

    trace_a_b_full_pool_age = pm.sample(10000,
                                        tune=10000,
                                        return_inferencedata=False,
                                        target_accept=0.95)

In [None]:
# Posterior summary table of the A/B model with an 89% HDI.
with a_b_full_pool_age:
    summary_a_b = pm.summary(trace_a_b_full_pool_age,hdi_prob=0.89,round_to=5)
    print (summary_a_b)

In [None]:
# Posterior densities of the per-age rates and their absolute and relative difference.
a_b_plot_vars = ['p_alpha','abs_diff','rel_diff']
with a_b_full_pool_age:
    az.plot_posterior(data=trace_a_b_full_pool_age,
                      hdi_prob=0.89,
                      var_names=a_b_plot_vars,
                      figsize=(18,6))

In [None]:
# Forest plot of the two per-age rates, chains combined, with an 89% HDI.
with a_b_full_pool_age:
    az.plot_forest(data=trace_a_b_full_pool_age,
                   var_names=['p_alpha'],
                   hdi_prob=0.89,
                   combined=True)

In [None]:
# Convert the A/B trace, together with its model, to an ArviZ InferenceData object.
a_b_full_pool_idata = az.from_pymc3(
    trace=trace_a_b_full_pool_age,
    model=a_b_full_pool_age,
)
a_b_full_pool_idata

In [None]:
# Same posterior plot from the InferenceData object, with reference lines at 0 for
# p_alpha of the 'young' group and for abs_diff.
a_b_ref_vals = {'p_alpha' : [{'age': 'young' , 'ref_val' : 0}],
                'abs_diff' : [{'ref_val' : 0}]}

with a_b_full_pool_age:
    az.plot_posterior(data=a_b_full_pool_idata,
                      coords=coords,
                      var_names=['p_alpha','abs_diff','rel_diff'],
                      hdi_prob=0.89,
                      figsize=(18,6),
                      ref_val=a_b_ref_vals)

In [None]:
# Show the joined population / sample table again for reference.
combined_df