# Chapter 6
Issues when adding variables to a regression - multicollinearity, post-treatment bias, collider bias.

To start, we demonstrate Berkson's paradox: selecting cases on a combination of two independent variables induces a negative correlation between those variables among the selected cases.


In [None]:
import arviz
import matplotlib.pyplot as plt
# import networkx as nx
import numpy as np
# import scipy.stats
import seaborn as sns
# import polars as pl (doesn't work with pymc yet)
import pandas as pd 
import pymc as pm 
# import pybayes

# Use seaborn's plain "white" style for every figure in this notebook.
sns.set_style(style="white")

In [None]:
# Berkson's paradox demo.
# Two variables, trustworthiness and newsworthiness, are drawn independently, so they are uncorrelated.
# We are a grant committee: we take 200 proposals and select the top 10% by combined score
# (the sum of the two variables, which gives the same ranking as their average).

BERKSON_SEED = 6  # fixed seed so that Restart & Run All reproduces the same draws and figure
berkson_rng = np.random.default_rng(BERKSON_SEED)

num_proposals = 200
selection_fraction = 0.1
trustworthiness = berkson_rng.normal(loc=0, scale=1, size=num_proposals)
newsworthiness = berkson_rng.normal(loc=0, scale=1, size=num_proposals)
score = trustworthiness + newsworthiness
# Index tuple (np.where output) of the proposals whose score is above the (1 - selection_fraction) quantile.
winners = np.where(score > np.quantile(score, (1-selection_fraction)))

# All proposals in the default colour, selected proposals overplotted in red.
fig, ax = plt.subplots()
ax.scatter(trustworthiness, newsworthiness, label='all proposals')
ax.scatter(trustworthiness[winners], newsworthiness[winners], c='red', label='selected')
ax.set_xlabel('trustworthiness')
ax.set_ylabel('newsworthiness')
ax.set_title("Berkson's paradox: selection on the combined score")
ax.legend()
plt.show()

In [None]:
# 2x2 correlation matrix computed on the selected proposals only;
# the off-diagonal entries are the trustworthiness/newsworthiness correlation among winners.
selected_trustworthiness = trustworthiness[winners]
selected_newsworthiness = newsworthiness[winners]
print(np.corrcoef(selected_trustworthiness, selected_newsworthiness))

## Multicollinearity

When there is a strong association between two or more predictor variables, the posterior distribution can make it seem as though none of them is reliably associated with the outcome, even when they all are.

In [None]:
# Demo: predicting height using the length of both legs.
# Each leg is the same proportion of height plus a small independent measurement error,
# so leg_left and leg_right are almost perfectly correlated.

LEG_SEED = 6  # fixed seed so that Restart & Run All reproduces the same simulated data
leg_rng = np.random.default_rng(LEG_SEED)

N = 100

height = leg_rng.normal(loc=10, scale=2, size=N)
leg_prop = leg_rng.uniform(low=0.4, high=0.5, size=N)  # leg length as proportion of total height
leg_left = leg_prop*height + leg_rng.normal(loc=0, scale=0.02, size=N)
leg_right = leg_prop*height + leg_rng.normal(loc=0, scale=0.02, size=N)


# One row per simulated individual.
df = pd.DataFrame({
    "height": height,
    "leg_left": leg_left,
    "leg_right": leg_right
})



In [None]:
# Pairwise scatter plots (and per-column distributions) of height and both leg lengths.
sns.pairplot(data=df)


In [None]:
# Linear regression of height on both leg lengths at once.
# Prior reasoning: with a single leg as predictor we would expect a slope of about
# average height / average leg length = 10/(0.45*10) ~ 2.2, so both slope priors are centred at 2.
SAMPLER_SEED = 6  # fixes the sampler's random stream so repeated runs give the same draws

with pm.Model() as model_leg_lengths:
    # Weakly informative priors on the intercept and on each leg's slope.
    alpha = pm.Normal('alpha', mu=10, sigma=100)
    beta_left = pm.Normal('beta_left', mu=2, sigma=10)
    beta_right = pm.Normal('beta_right', mu=2, sigma=10)

    sigma = pm.Exponential('sigma', 1)

    # Expected height given both (nearly identical) leg lengths.
    mu = alpha + beta_left * df['leg_left'] + beta_right * df['leg_right']

    height_obs = pm.Normal('height_obs', mu=mu, sigma=sigma, observed=df['height'])

    # 1000 tuning steps followed by 1000 retained draws per chain.
    # Optionally add nuts_sampler='numpyro' to use the NumPyro backend, if it is installed.
    trace = pm.sample(1000, tune=1000, random_seed=SAMPLER_SEED)


In [None]:

# Per-parameter trace and density plots.
arviz.plot_trace(trace)
plt.gcf().tight_layout()
plt.show()

# Tabular posterior summary, printed as plain text.
summary = arviz.summary(trace)
print(summary)

# Forest plot of the posterior intervals for every parameter.
arviz.plot_forest(trace)
plt.gcf().tight_layout()

In [None]:
# Joint posterior of the two slopes: one point per posterior draw (all chains pooled).
# Because the two leg lengths are nearly identical, the draws are expected to fall along a
# narrow, negatively sloped ridge.
plt.scatter(data=trace.posterior, x='beta_left', y='beta_right')
plt.xlabel('beta_left')
plt.ylabel('beta_right')
plt.title('Joint posterior of beta_left and beta_right')
plt.show()

In [None]:
# Marginal posterior of each slope on its own. This shows how wide the individual posteriors
# of beta_left and beta_right are (their joint distribution is the scatter plot in the previous cell).
arviz.plot_posterior(
    data=trace,
    var_names=['beta_left', 'beta_right'],
)

In [None]:
# NB - the sum of the two betas is much more sensibly behaved than either beta alone.
# Adding two differently named DataArrays yields an unnamed result, which ArviZ would label
# with the placeholder "x"; name it explicitly so the plot is self-describing.
posterior_beta_sum = (
    trace.posterior['beta_left'] + trace.posterior['beta_right']
).rename('beta_left + beta_right')
arviz.plot_posterior(posterior_beta_sum)