In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as sps
import pymc3 as pm
import arviz as az

# Seaborn default theme for every figure, and a fixed seed for numpy's global RNG
# so the simulated articles are the same on every run.
SEED = 4711

sns.set()
np.random.seed(SEED)

In [None]:
size = 1000

# Two independent Uniform(-1, 1) scores per simulated article.
# newsworthy is drawn first, then trustworthy.
newsworthy_scores = np.random.uniform(-1, 1, size)
trustworthy_scores = np.random.uniform(-1, 1, size)

df = pd.DataFrame({
    'newsworthy': newsworthy_scores,
    'trustworthy': trustworthy_scores,
})
df

In [None]:
# Scatter of every simulated article.
x_col, y_col = 'newsworthy', 'trustworthy'
df.plot(x=x_col, y=y_col, style='o')

In [None]:
# Selection rule 1: among articles that score positive on BOTH axes, keep the
# top 10% (of the total population size) ranked by newsworthy * trustworthy.
df['product'] = df['newsworthy'] * df['trustworthy']

top_10_pct = int(len(df) * 0.1)
print(top_10_pct)

in_upper_right = (df['newsworthy'] > 0) & (df['trustworthy'] > 0)
selection = (
    df.loc[in_upper_right]
      .sort_values('product', ascending=False)
      .iloc[:top_10_pct]
)
selection

In [None]:
# Whole population in the default colour, selected articles overlaid in red.
scatter_kw = dict(x='newsworthy', y='trustworthy', legend=False)

ax = df.plot(style='o', **scatter_kw)
selection.plot(style='ro', ax=ax, **scatter_kw)
ax.set_ylabel('trustworthy')
print('selected : ', len(selection))

In [None]:
# Least-squares fit of trustworthy ~ newsworthy on every article.
# Only slope and intercept (the first two fields of the result) are kept.
fit_pop = sps.linregress(df['newsworthy'], df['trustworthy'])
slope_pop, intercept_pop = fit_pop[0], fit_pop[1]
print(slope_pop, intercept_pop)

# Same fit restricted to the selected articles.
fit_selection = sps.linregress(selection['newsworthy'], selection['trustworthy'])
slope_selection, intercept_selection = fit_selection[0], fit_selection[1]
print(slope_selection, intercept_selection)

In [None]:
# Scatter of population (default colour) and selection (red) ...
scatter_kw = dict(x='newsworthy', y='trustworthy', legend=False)
ax = df.plot(style='o', **scatter_kw)
selection.plot(style='ro', ax=ax, **scatter_kw)
ax.set_ylabel('trustworthy')

# ... with both least-squares lines evaluated over the full newsworthy range [-1, 1].
X = np.linspace(-1, 1, size)
line_pop = X * slope_pop + intercept_pop
line_selection = X * slope_selection + intercept_selection

ax.plot(X, line_pop, ls='--', color='orange')
ax.plot(X, line_selection, ls='--', color='orange')

In [None]:
### Joint regression model: index 0 of alpha/beta is the line through the whole population, ###
### index 1 is the line through the selected articles; both likelihoods share one sigma.    ###
### Observation while experimenting: with the non-flat (Normal) priors the pymc mean beta   ###
### seems to differ from the sps.linregress slope when there are few data points; with flat ###
### priors the difference (almost) vanishes.                                                ###

x_pop = df['newsworthy'].values
y_pop = df['trustworthy'].values
x_selection = selection['newsworthy'].values
y_selection = selection['trustworthy'].values

with pm.Model() as model:
    alpha = pm.Normal('alpha', mu=0, sd=1, shape=2)
    beta = pm.Normal('beta', mu=0, sd=1, shape=2)

    ### flat-prior alternative: gives results very close to sps.linregress ###
    #alpha = pm.Uniform('alpha',-2,2,shape=2)
    #beta = pm.Uniform('beta',-2,2,shape=2)

    sigma = pm.Uniform('sigma', 0, 1)

    # expected trustworthiness for each article under the two lines
    mu_pop = pm.Deterministic('mu_pop', x_pop * beta[0] + alpha[0])
    mu_selection = pm.Deterministic('mu_selection', x_selection * beta[1] + alpha[1])

    # absolute and relative difference between the two slopes
    pm.Deterministic('diff', beta[0] - beta[1])
    pm.Deterministic('rel', (beta[0] / beta[1]) - 1)

    obs_pop = pm.Normal('obs_pop', mu=mu_pop, sd=sigma, observed=y_pop)
    obs_selection = pm.Normal('obs_selection', mu=mu_selection, sd=sigma,
                              observed=y_selection)

    trace = pm.sample()

In [None]:
# Trace plot and 89% HDI summary for the parameters and the two slope comparisons.
summary_vars = ['alpha', 'beta', 'sigma', 'diff', 'rel']

with model:
    az.plot_trace(trace, var_names=summary_vars)
    print(az.summary(trace, hdi_prob=0.89, var_names=summary_vars))

In [None]:
# Posterior means from pymc next to the least-squares estimates from scipy.
# Column 0 is the population line, column 1 the selection line.
beta_draws = trace['beta']
alpha_draws = trace['alpha']

print('pm beta_0 : ', beta_draws[:, 0].mean(), 'sps beta_0 :', slope_pop)
print('pm_alpha_0 : ', alpha_draws[:, 0].mean(), 'sps_alpha_0 : ', intercept_pop)

print('pm_beta_1 : ', beta_draws[:, 1].mean(), 'sps_beta_1 : ', slope_selection)
print('pm_alpha_1 : ', alpha_draws[:, 1].mean(), 'sps_alpha_1 : ', intercept_selection)

In [None]:
# 89% HDI of the population mean line, the pymc posterior-mean line (black)
# and the scipy least-squares line (crimson).
news_all = df['newsworthy']
pm_line_all = news_all * trace['beta'][:, 0].mean() + trace['alpha'][:, 0].mean()
sps_line_all = news_all * slope_pop + intercept_pop

ax = az.plot_hpd(news_all, trace['mu_pop'], hdi_prob=0.89)
ax.plot(news_all, pm_line_all, color='k')
ax.plot(news_all, sps_line_all, color='crimson')


In [None]:
# Same picture for the selected articles: 89% HDI of the mean line,
# pymc posterior-mean line (black) and scipy least-squares line (crimson).
news_sel = selection['newsworthy']
pm_line_sel = news_sel * trace['beta'][:, 1].mean() + trace['alpha'][:, 1].mean()
sps_line_sel = news_sel * slope_selection + intercept_selection

ax = az.plot_hpd(news_sel, trace['mu_selection'], hdi_prob=0.89)
ax.plot(news_sel, pm_line_sel, color='k')
ax.plot(news_sel, sps_line_sel, color='crimson')


In [None]:
# Population and selection fits on one set of axes.
news_all = df['newsworthy']
news_sel = selection['newsworthy']

pm_line_all = news_all * trace['beta'][:, 0].mean() + trace['alpha'][:, 0].mean()
pm_line_sel = news_sel * trace['beta'][:, 1].mean() + trace['alpha'][:, 1].mean()

# population: HDI band (blue), pymc mean line (black), scipy line (blue)
ax = az.plot_hpd(news_all, trace['mu_pop'], hdi_prob=0.89, color='blue')
ax.plot(news_all, pm_line_all, color='k')
ax.plot(news_all, news_all * slope_pop + intercept_pop, color='blue')

# selection: HDI band (default colour), pymc mean line (black), scipy line (crimson)
az.plot_hpd(news_sel, trace['mu_selection'], hdi_prob=0.89, ax=ax)
ax.plot(news_sel, pm_line_sel, color='k')
ax.plot(news_sel, news_sel * slope_selection + intercept_selection, color='crimson')


In [None]:
# Posterior densities with 0 marked as the reference value.
posterior_vars = ['alpha', 'beta', 'diff', 'rel']

with model:
    az.plot_posterior(trace, var_names=posterior_vars, ref_val=0)

In [None]:

### Posterior predictive draws for the selected articles.                                  ###
### The second argument of pm.sample_posterior_predictive is the number of parameter sets  ###
### taken from the trace; here it is set to the number of selected articles. For each      ###
### parameter set, one 'obs_selection' value is generated per x from the Normal likelihood. ###

n_ppc_draws = len(selection['newsworthy'])

with model:
    samples = pm.sample_posterior_predictive(trace, samples=n_ppc_draws, model=model,
                                             var_names=['obs_selection'])

In [None]:
# Histogram of the mean simulated trustworthiness, one mean per posterior-predictive draw.
draw_means = [draw.mean() for draw in samples['obs_selection']]
plt.hist(draw_means)

In [None]:
# Newsworthiness of the selected articles in ascending order.
selection['newsworthy'].sort_values(ascending=True)

In [None]:
### Posterior predictive view of the selected articles, drawn on top of the whole population. ###

fig, ax = plt.subplots(figsize=(18,12))

# samples['obs_selection'] holds one row per posterior-predictive draw and one column per
# selected article, with the columns in the row order of `selection` (the order in which the
# observed values were handed to the model). The articles are sorted by newsworthiness once
# and the columns are reordered with the SAME permutation, so every x stays paired with its
# own simulated values. Previously x was sorted on its own and the statistics were reduced
# over axis=1 (across articles), which only ran because the number of draws happened to
# equal the number of selected articles.
news_sel = selection['newsworthy'].values
order = np.argsort(news_sel)
news_sorted = news_sel[order]
ppc_sel = samples['obs_selection'][:, order]

# 89% interval per article: percentiles taken across the draws (axis 0)
sample_CI = np.percentile(ppc_sel, [5.5, 94.5], axis=0)
print(sample_CI.shape)

### plot posterior samples ###

# transpose so that each plotted series is one draw evaluated at every selected x
ax.plot(news_sorted, ppc_sel.T, 'o', color='b', alpha=0.1)

ax.plot(news_sorted, ppc_sel.mean(axis=0), 'o', color='k',
        label='selected posterior sample mean')

ax.plot(selection['newsworthy'], selection['trustworthy'], 'x', color='crimson',
        label='selected data points', ms=12)

ax.fill_between(news_sorted, y1=sample_CI[0, :], y2=sample_CI[1, :],
                color='orange', alpha=0.3, label='selected posterior sample 89% CI')

ax.set_xlabel('newsworthy')
ax.set_ylabel('trustworthy')

title = 'Selection of articles to publish - Selection bias confounding'
title = title + '\nselection by taking top 10% of product newsworthy x trustworthy (i.e. upper right corner points)'
ax.set_title(title)

### whole population: data points, 89% HDI of the mean, pymc mean line (black), scipy line (blue) ###
df.plot(x='newsworthy', y='trustworthy', ax=ax, style='k.', label='all data points')

az.plot_hpd(df['newsworthy'], trace['mu_pop'], hdi_prob=0.89, color='blue', ax=ax)
ax.plot(df['newsworthy'],
        df['newsworthy'] * trace['beta'][:, 0].mean() + trace['alpha'][:, 0].mean(),
        color='k')
ax.plot(df['newsworthy'], df['newsworthy'] * slope_pop + intercept_pop, color='blue')

### selection: pymc mean line (black), scipy line (crimson), 89% HDI of the mean (cyan) ###
ax.plot(selection['newsworthy'],
        selection['newsworthy'] * trace['beta'][:, 1].mean() + trace['alpha'][:, 1].mean(),
        color='k')
ax.plot(selection['newsworthy'],
        selection['newsworthy'] * slope_selection + intercept_selection,
        color='crimson')
az.plot_hpd(selection['newsworthy'], trace['mu_selection'], hdi_prob=0.89, ax=ax, color='cyan')

# legend is built once, after every labelled artist exists
ax.legend(loc='upper left')
ax.axvline(0, ls='dashed', color='magenta')
ax.axhline(0, ls='dashed', color='magenta')

In [None]:
# Bundle the trace and the posterior-predictive draws into one ArviZ object for plot_ppc.
with model:
    data_ppc = az.from_pymc3(posterior_predictive=samples, trace=trace)

data_ppc

In [None]:
# NOTE(review): displays data_ppc a second time (the cell that builds it already ends
# with the same expression) — looks redundant; confirm before removing.
data_ppc

In [None]:
# Observed vs. posterior-predictive distribution of trustworthiness for the selected articles.
az.plot_ppc(
    data_ppc,
    figsize=(18, 12),
    var_names=['obs_selection'],
)

In [None]:
# The selected articles on their own.
x_col, y_col = 'newsworthy', 'trustworthy'
selection.plot(x=x_col, y=y_col, style='o')

In [None]:
# Thin the posterior: keep every 10th sample of the alpha & beta posteriors.
# trace['beta'] / trace['alpha'] return the draws of ALL chains concatenated, whereas
# len(trace) is the length of a single chain. The index range is therefore built from the
# combined array so the thinned draws cover every chain instead of only the first one.
thin = 10
draws = np.arange(0, len(trace['beta']), thin)

fig, ax = plt.subplots()

# for each x-value, evaluate the regression line under every thinned (alpha, beta) draw #
# these are draws of the MEAN of y given x, shown together with the HDI of that mean #
# NOTE ! this is not y-samples from posterior - for that, use pm.sample_posterior_predictive as above #

x_sel = selection['newsworthy'].values
beta_sel = trace['beta'][:, 1]
alpha_sel = trace['alpha'][:, 1]

# x_sel[:, np.newaxis] broadcasts against the draws: one row per x, one column per kept draw
_ = ax.plot(x_sel,
            x_sel[:, np.newaxis] * beta_sel[draws] + alpha_sel[draws],
            'o', color='crimson', alpha=0.1)

# mean line by pymc
ax.plot(selection['newsworthy'],
        selection['newsworthy'] * beta_sel.mean() + alpha_sel.mean(),
        color='k')

# mean line by sps
ax.plot(selection['newsworthy'],
        selection['newsworthy'] * slope_selection + intercept_selection,
        color='c')

# hdi for mean
az.plot_hpd(selection['newsworthy'], trace['mu_selection'], hdi_prob=0.89, ax=ax, color='orange')


In [None]:
# Display the population frame again; by this point it also carries the 'product' column.
df

In [None]:
# Selection rule 2: rank every article by newsworthy + trustworthy and keep the top 10%.
# Note that `df` itself is re-bound to the sorted frame, so its row order changes from here on.
df['sum'] = df['newsworthy'] + df['trustworthy']
df = df.sort_values(by='sum', ascending=False)

selected_2 = df.iloc[:top_10_pct]
selected_2

In [None]:
# Population in blue, sum-selected articles overlaid in red.
scatter_kw = dict(x='newsworthy', y='trustworthy')

ax = df.plot(style='bo', **scatter_kw)
selected_2.plot(style='ro', ax=ax, **scatter_kw)


In [851]:
# Same two-line regression as the first model, but the "selection" data are now the
# articles chosen by the sum rule (selected_2). Index 0 = population, index 1 = selection.
x_pop2 = df['newsworthy'].values
y_pop2 = df['trustworthy'].values
x_selected2 = selected_2['newsworthy'].values
y_selected2 = selected_2['trustworthy'].values

with pm.Model() as model2:
    alpha = pm.Normal('alpha', mu=0, sd=1, shape=2)
    beta = pm.Normal('beta', mu=0, sd=1, shape=2)

    ### flat-prior alternative: gives results very close to sps.linregress ###
    #alpha = pm.Uniform('alpha',-2,2,shape=2)
    #beta = pm.Uniform('beta',-2,2,shape=2)

    sigma = pm.Uniform('sigma', 0, 1)

    # expected trustworthiness for each article under the two lines
    mu_pop = pm.Deterministic('mu_pop', x_pop2 * beta[0] + alpha[0])
    mu_selection = pm.Deterministic('mu_selection', x_selected2 * beta[1] + alpha[1])

    # absolute and relative difference between the two slopes
    pm.Deterministic('diff', beta[0] - beta[1])
    pm.Deterministic('rel', (beta[0] / beta[1]) - 1)

    obs_pop = pm.Normal('obs_pop', mu=mu_pop, sd=sigma, observed=y_pop2)
    obs_selection = pm.Normal('obs_selection', mu=mu_selection, sd=sigma,
                              observed=y_selected2)

    # return_inferencedata=True: trace2 is an ArviZ InferenceData, not a MultiTrace
    trace2 = pm.sample(return_inferencedata=True)
    

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [sigma, beta, alpha]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 6 seconds.


In [853]:
# Inspect the coordinates registered on model2; the recorded output is an empty dict.
model2.coords

{}

In [None]:
# 89% HDI summary of the regression coefficients, then trace plots including the slope comparisons.
coef_vars = ['alpha', 'beta']

with model2:
    print(pm.summary(trace2, hdi_prob=0.89, var_names=coef_vars))
    pm.plot_trace(trace2, var_names=coef_vars + ['diff', 'rel'])

In [None]:
# Posterior densities for the second model with 0 marked as the reference value.
posterior_vars2 = ['alpha', 'beta', 'diff', 'rel']

with model2:
    az.plot_posterior(trace2, var_names=posterior_vars2, ref_val=0)

In [None]:
# Posterior predictive draws of 'obs_selection' for the second model.
# The number of draws is taken from selected_2, the data model2 was actually fitted on.
# It was previously len(selection['newsworthy']), i.e. the selection frame of the first model.
n_ppc_draws2 = len(selected_2)

with model2:
    samples2 = pm.sample_posterior_predictive(trace2, samples=n_ppc_draws2, model=model2,
                                              var_names=['obs_selection'])

# (number of draws, number of selected articles)
print(samples2['obs_selection'].shape)



In [None]:
# trace2 was sampled with return_inferencedata=True, so it is an ArviZ InferenceData and
# cannot be indexed like a MultiTrace (trace2['beta'] fails). The draws live in
# trace2.posterior with leading (chain, draw) dimensions.
post2 = trace2.posterior
beta_sel2 = post2['beta'].values[..., 1].mean()     # index 1 = selection line
alpha_sel2 = post2['alpha'].values[..., 1].mean()

# flatten (chain, draw, article) -> (sample, article), the layout used for the first model
mu_sel2 = post2['mu_selection'].values
mu_sel2 = mu_sel2.reshape(-1, mu_sel2.shape[-1])

# samples2['obs_selection'] is (draw, article); transpose so that each plotted series is one
# draw evaluated at every selected x (rows must line up with the x values)
plt.plot(selected_2['newsworthy'], samples2['obs_selection'].T, 'o', color='b', alpha=0.01)

# posterior-mean regression line for the selected articles
plt.plot(selected_2['newsworthy'],
         selected_2['newsworthy'] * beta_sel2 + alpha_sel2,
         ls='dashed', color='crimson')

# the selected data points themselves
plt.plot(selected_2['newsworthy'], selected_2['trustworthy'], '.', color='r')

ax = plt.gca()

# 89% HDI of the mean line
az.plot_hpd(selected_2['newsworthy'], mu_sel2, hdi_prob=0.89, ax=ax)

In [None]:
# trace2 is an ArviZ InferenceData (sampled with return_inferencedata=True), so the draws
# are read from trace2.posterior rather than by MultiTrace-style indexing, which fails here.
post2 = trace2.posterior
beta2 = post2['beta'].values      # (chain, draw, 2): index 0 = population, 1 = selection
alpha2 = post2['alpha'].values

# flatten (chain, draw, article) -> (sample, article), the layout used for the first model
mu_sel2 = post2['mu_selection'].values
mu_sel2 = mu_sel2.reshape(-1, mu_sel2.shape[-1])
mu_pop2 = post2['mu_pop'].values
mu_pop2 = mu_pop2.reshape(-1, mu_pop2.shape[-1])

fig, ax = plt.subplots(figsize=(18,12))

# selected articles: 89% HDI of the mean, posterior-mean line, data points (red)
az.plot_hpd(selected_2['newsworthy'], mu_sel2, hdi_prob=0.89, ax=ax)

ax.plot(selected_2['newsworthy'],
        selected_2['newsworthy'] * beta2[..., 1].mean() + alpha2[..., 1].mean(),
        ls='dashed', color='crimson')

selected_2.plot(x='newsworthy', y='trustworthy', style='ro', ax=ax)

# whole population: data points (blue), 89% HDI of the mean, posterior-mean line
df.plot(x='newsworthy', y='trustworthy', style='b.', ax=ax)

az.plot_hpd(df['newsworthy'], mu_pop2, hdi_prob=0.89, ax=ax)

ax.plot(df['newsworthy'],
        df['newsworthy'] * beta2[..., 0].mean() + alpha2[..., 0].mean(),
        ls='dashed', color='crimson')
