# Chapter 4

Bayesian interpretations of linear regression, via some Gaussians

In [None]:
import arviz
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import seaborn as sns
import pandas as pd
import pymc as pm 

import pybayes

sns.set_style("white") 


Firstly, we simulate some random walks to show empirically (if further evidence were needed) that lots of things end up being Gaussian.

E.g. random walks driven by binomial draws (flip a coin; step forward if heads, backward if tails).

In [None]:
# Simulate many coin-flip random walks and draw every path on one set of axes.
n_walks = 1000
n_steps = 1000

# One Bernoulli(0.5) draw per step; map 0 -> -1 and 1 -> +1 without mutating
# the raw draws in place.
flips = np.random.binomial(n=1, p=0.5, size=(n_walks, n_steps))
walks = np.where(flips == 0, -1, 1)

# Position after each step is the running sum of the +/-1 steps along a walk.
paths = np.cumsum(walks, axis=1)

fig, ax = plt.subplots()

# Plot through the Axes created above rather than the pyplot state machine.
# A 2-D array draws one line per column, so transpose to get one line per walk.
ax.plot(paths.T, alpha=0.05)

ax.set_title('Walks')
ax.set_xlabel('Step number')
ax.set_ylabel('Position')
plt.show()

In [None]:
# End point of every walk: the last column of `paths`.
final_position = paths[:, -1]
# Project helper; presumably draws a histogram of the values it is given.
pybayes.utils.hist(final_position)
plt.show()

## Grid-approximating our two-parameter model

We're going to do a regression on some height data. Our model will be:

\begin{equation}
\begin{aligned}
h_i &\sim \mathcal{N}(\mu, \sigma) \\
\mu &\sim \mathcal{N}(178, 20) \\
\sigma &\sim \text{Uniform}(0, 50)\end{aligned}
\end{equation}

Here line one is our likelihood, and lines two and three are sensibly chosen priors. We can check how sensible they are by plotting the priors, and then looking at what they imply with a prior predictive simulation.

In [None]:
# Hyper-parameters of the Normal prior on mu, and the grid it is evaluated on.
mu_mean, mu_sigma = 178, 20
p_grid_mu = np.linspace(start=100, stop=250, num=1000)
# Prior density of mu at every grid point.
mu_prior = scipy.stats.norm(loc=mu_mean, scale=mu_sigma).pdf(p_grid_mu)

In [None]:
# Plot the prior density of mu against its grid (project helper from pybayes).
pybayes.utils.plot_nicely(x_vals=p_grid_mu, y_vals=mu_prior)

In [None]:
# Bounds of the Uniform prior on sigma.
sigma_low = 0
sigma_high = 50
# The grid extends past both bounds, so it also covers points outside [sigma_low, sigma_high].
p_grid_sigma = np.linspace(-5, 55, 100)
# scipy parameterises the uniform as [loc, loc + scale]: `scale` is the WIDTH of the
# interval, not its upper bound. Passing sigma_high directly is only correct while
# sigma_low == 0, so compute the width explicitly.
sigma_prior = scipy.stats.uniform.pdf(
    p_grid_sigma, loc=sigma_low, scale=sigma_high - sigma_low
)
pybayes.utils.plot_nicely(x_vals=p_grid_sigma, y_vals=sigma_prior)

In [None]:
# Prior predictive simulation: draw (mu, sigma) pairs from their priors, then one
# height from Normal(mu, sigma) for each pair.
sample_mu = np.random.normal(loc=mu_mean, scale=mu_sigma, size=10_000)
# The upper bound must be sigma_high. Using sigma_low for both bounds made every
# sigma equal to sigma_low (0), collapsing the simulated heights onto sample_mu.
sample_sigma = np.random.uniform(low=sigma_low, high=sigma_high, size=10_000)
prior_h = np.random.normal(loc=sample_mu, scale=sample_sigma)

In [None]:
# Heights implied by the priors alone (the prior predictive draws).
pybayes.utils.hist(prior_h)

The above is not the empirical distribution of heights, and it's not even Gaussian. It's the distribution of relative plausibilities of different heights before we've seen the data. The next step is to grab the data and grid-approximate the posterior.

In [None]:
# Howell1 data set, read straight from the `rethinking` GitHub repository.
howell = "https://raw.githubusercontent.com/rmcelreath/rethinking/master/data/Howell1.csv"

# The file is semicolon-delimited, hence the explicit separator.
df = pd.read_csv(howell, sep=';')

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

In [None]:
# Summary statistics, transposed so each column of df becomes a row.
df.describe().T

In [None]:
# Restrict the analysis to adults (age 18 and over).
is_adult = df['age'] >= 18
d2 = df.loc[is_adult]

In [None]:
# Grid-approximate the posterior over (mu, sigma) for the adult heights in d2.
mu_list = np.linspace(150, 160, 100)
# NOTE(review): these bounds look chosen to bracket where the posterior mass lies
# for this data set - confirm if the data changes.
sigma_list = np.linspace(7, 9, 100)
# Every (mu, sigma) combination: mu cycles fastest, sigma is held for a full sweep of mu.
post = pd.DataFrame({
    'mu': np.tile(mu_list, len(sigma_list)),
    'sigma': np.repeat(sigma_list, len(mu_list))
})


def log_likelihood(row):
    """Return the summed log-density of d2's heights under Normal(mu, sigma).

    Parameters
    ----------
    row : mapping with 'mu' and 'sigma' entries
        Either a single grid row (scalar values) or the whole grid DataFrame
        (column values).

    Returns
    -------
    A scalar for a single row, or one value per grid point for a frame.
    """
    mu = np.asarray(row['mu'], dtype=float)
    sigma = np.asarray(row['sigma'], dtype=float)
    heights = d2['height'].to_numpy()
    # The trailing axis lets each (mu, sigma) pair broadcast against every height;
    # summing over that axis gives one log-likelihood per pair.
    return scipy.stats.norm.logpdf(heights, mu[..., None], sigma[..., None]).sum(axis=-1)


# One vectorised call instead of one Python-level call per grid row.
post['LL'] = log_likelihood(post)

# Unnormalised log-posterior: log-likelihood + log-prior(mu) + log-prior(sigma).
post['prod'] = (post['LL'] +
                scipy.stats.norm.logpdf(post['mu'], 178, 20) +
                np.where((post['sigma'] >= 0) & (post['sigma'] <= 50), np.log(1/50), -np.inf))

# Subtract the maximum before exponentiating so the largest value is exp(0) = 1
# and the others are relative to it.
max_prod = np.max(post['prod'])
post['prob'] = np.exp(post['prod'] - max_prod)

In [None]:
# 2-D density view of the grid: each (mu, sigma) point is weighted by its relative posterior probability.
sns.kdeplot(data=post, x='mu', y='sigma', weights='prob',  fill=True)

In [None]:
# Reshape the long grid into a matrix: rows indexed by mu, columns by sigma, cells hold `prob`.
two_d = post.pivot(index='mu', columns='sigma', values='prob')

In [None]:
# Heat map of the relative posterior probability over the (mu, sigma) grid.
sns.heatmap(two_d)

In [None]:
# Sample from the posterior: draw grid-row labels with probability proportional to
# the relative posterior, then pull the parameters stored on those rows.
rows = np.random.choice(post.index, 10_000, replace=True, p=post.prob/post.prob.sum())

# `rows` holds index LABELS, so look them up with .loc. Positional .iloc only
# agrees with labels while the index happens to be 0..n-1.
sample = post.loc[rows, ['mu', 'sigma']].reset_index(drop=True)

In [None]:
# Inspect the sampled (mu, sigma) pairs.
sample

In [None]:
# One point per posterior draw; the low alpha makes densely sampled regions appear darker.
sns.scatterplot(data=sample, x='mu', y='sigma', alpha=0.05)

In [None]:
# Marginal distribution of the sampled mu values.
sns.histplot(sample.mu, binwidth=0.1)

In [None]:
# Marginal distribution of the sampled sigma values.
sns.histplot(sample.sigma, binwidth=0.1)

In [None]:
# Recall the prior for mu (line, left axis) and overlay the posterior draws
# (histogram, right axis).
fig, ax = plt.subplots()
ax.plot(p_grid_mu, mu_prior, label='prior')
ax.set_xlabel('mu')
ax.set_ylabel('p')
# Separate y-axis so the prior curve and the histogram each get their own scale.
ax2 = ax.twinx()
sns.histplot(sample.mu, binwidth=0.1, ax=ax2, label='posterior')

# The labels above are never shown unless a legend is drawn; merge the handles
# from both axes into a single legend on the top-most axes.
prior_handles, prior_labels = ax.get_legend_handles_labels()
post_handles, post_labels = ax2.get_legend_handles_labels()
ax2.legend(prior_handles + post_handles, prior_labels + post_labels)

plt.show()

In [None]:
# Our posterior for mu has collapsed as a result of our observations.
# Highest-density intervals of the sampled parameters (arviz's default interval mass).
print('HPDI for mu:', arviz.hdi(sample.mu.values))
print('HPDI for sigma:', arviz.hdi(sample.sigma.values))

In [None]:
# Random subset of 20 adults. NOTE: no random_state is passed, so the subset
# (and everything computed from it) differs on every run.
d3 = d2.sample(20)

In [None]:
# Repeat the grid approximation and posterior sampling using only the 20 heights in d3.
# The grid spans a wider range of mu and sigma than the full-data grid.
mu_list = np.linspace(150, 170, 100)
sigma_list = np.linspace(4, 20, 100)
# Every (mu, sigma) combination: mu cycles fastest, sigma is held for a full sweep of mu.
post = pd.DataFrame({
    'mu': np.tile(mu_list, len(sigma_list)),
    'sigma': np.repeat(sigma_list, len(mu_list))
})


def log_likelihood(row):
    """Return the summed log-density of d3's heights under Normal(mu, sigma).

    Parameters
    ----------
    row : mapping with 'mu' and 'sigma' entries
        Either a single grid row (scalar values) or the whole grid DataFrame
        (column values).

    Returns
    -------
    A scalar for a single row, or one value per grid point for a frame.
    """
    mu = np.asarray(row['mu'], dtype=float)
    sigma = np.asarray(row['sigma'], dtype=float)
    heights = d3['height'].to_numpy()
    # The trailing axis lets each (mu, sigma) pair broadcast against every height;
    # summing over that axis gives one log-likelihood per pair.
    return scipy.stats.norm.logpdf(heights, mu[..., None], sigma[..., None]).sum(axis=-1)


# One vectorised call instead of one Python-level call per grid row.
post['LL'] = log_likelihood(post)

# Unnormalised log-posterior: log-likelihood + log-prior(mu) + log-prior(sigma).
post['prod'] = (post['LL'] +
                scipy.stats.norm.logpdf(post['mu'], 178, 20) +
                np.where((post['sigma'] >= 0) & (post['sigma'] <= 50), np.log(1/50), -np.inf))

# Subtract the maximum before exponentiating so the largest value is exp(0) = 1.
max_prod = np.max(post['prod'])
post['prob'] = np.exp(post['prod'] - max_prod)

# Draw grid-row labels in proportion to the relative posterior and look them up
# by label (.loc), since `rows` contains index labels rather than positions.
rows = np.random.choice(post.index, 10_000, replace=True, p=post.prob/post.prob.sum())
sample = post.loc[rows, ['mu', 'sigma']].reset_index(drop=True)

In [None]:
# Marginal distribution of mu in the current `sample` draws.
sns.histplot(sample.mu, binwidth=0.1)

In [None]:
# Marginal distribution of sigma in the current `sample` draws.
sns.histplot(sample.sigma, binwidth=0.1)

In [None]:
# Joint view of the current `sample` draws, one point per draw.
sns.scatterplot(data=sample, x='mu', y='sigma', alpha=0.05)

In [None]:
# With only 20 observations, the posterior for sigma (the standard deviation) is notably less Gaussian.

## Moving to quadratic approximation

In R this is all done with `quap`. We find the maximum a posteriori (MAP) estimate, fit a quadratic to the log-posterior at that peak, and use the resulting Gaussian as our approximation of the posterior.

The model again:

\begin{equation}
\begin{aligned}
h_i &\sim \mathcal{N}(\mu, \sigma) \\
\mu &\sim \mathcal{N}(178, 20) \\
\sigma &\sim \text{Uniform}(0, 50)\end{aligned}
\end{equation}

In [None]:
# Display the adults-only data that the models below are fitted to.
d2

In [None]:
# TODO: make and use py-quap, this is doing MCMC sampling

with pm.Model() as height_model:
    # Priors: Normal for mu, Uniform for sigma
    mu = pm.Normal('mu', mu=178, sigma=20)
    sigma = pm.Uniform('sigma', lower=0, upper=50)
    
    # Normal likelihood for the observed adult heights
    height = pm.Normal('height', mu=mu, sigma=sigma, observed=d2.height)
    

In [None]:
# Draw posterior samples for height_model: 1000 draws after 1000 tuning steps.
with height_model:
    trace = pm.sample(1000, tune=1000)

In [None]:
# Trace plot of the sampled parameters.
arviz.plot_trace(trace)
plt.tight_layout()
plt.show()

In [None]:
# Posterior summary; kind='stats' requests the statistics only.
print(arviz.summary(trace, kind='stats'))

In [None]:
# what happens if we use a much tighter and more informative prior on mu?
with pm.Model() as height_model_2:
    # Priors: a very narrow Normal for mu (sigma=0.1), Uniform for sigma as before
    mu = pm.Normal('mu', mu=178, sigma=0.1)
    sigma = pm.Uniform('sigma', lower=0, upper=50)
    
    # Normal likelihood for the observed adult heights
    height = pm.Normal('height', mu=mu, sigma=sigma, observed=d2.height)
    
    
    # Sample inside the model context: 1000 draws after 1000 tuning steps
    trace_2 = pm.sample(1000, tune=1000)
    
# Same diagnostics as for the first model: trace plot, then summary statistics
arviz.plot_trace(trace_2)
plt.tight_layout()
plt.show()
print(arviz.summary(trace_2, kind='stats'))

In [None]:
# our model is insistent that the mean is 178, and this disagrees with the data a lot, so the posterior for sigma changes

In [None]:
# Sampling from the quadratic approximation.
# Note that here we've already got samples (from MCMC), but pretend we used quap. The quadratic
# approximation is a multi-dimensional Gaussian, specified by the means and covariance of our distribution.
# Flatten the trace (chains and draws combined) into a DataFrame so those can be computed.
trace_df = arviz.extract(trace, combined=True).to_dataframe()

In [None]:
# Covariance matrix of the posterior draws of mu and sigma.
trace_df[['mu', 'sigma']].cov()

In [None]:
# Decompose the covariance matrix: its diagonal holds the variance of each parameter
# (the correlation matrix is computed separately).
np.diag(trace_df[['mu', 'sigma']].cov())

In [None]:
# Correlation matrix of the posterior draws of mu and sigma.
trace_df[['mu', 'sigma']].corr()

In [None]:
# this matrix shows that learning about mu tells us little about sigma, and vice versa - may not always be the case.

In [None]:
# Draw parameter vectors from the Gaussian defined by the posterior means and covariance.
param_draws = trace_df[['mu', 'sigma']]
mean_values = param_draws.mean()
quad_samples = scipy.stats.multivariate_normal.rvs(
    mean=mean_values,
    cov=param_draws.cov(),
    size=1000,
)

In [None]:
# Column 0 of the Gaussian draws corresponds to mu.
pybayes.utils.hist(quad_samples[:,0])

In [None]:
# Column 1 of the Gaussian draws corresponds to sigma.
pybayes.utils.hist(quad_samples[:,1])

In [None]:
# Highest-density interval of the mu draws (arviz's default interval mass).
arviz.hdi(quad_samples[:,0])

In [None]:
# Highest-density interval of the sigma draws (arviz's default interval mass).
arviz.hdi(quad_samples[:,1])

## Predicting things

We've fit a Gaussian to some heights. What we want to do is model how some predictor variables affect an outcome of interest.
Here we'll use weight to predict height.

\begin{equation}
\begin{aligned}
h_i &\sim \mathcal{N}(\mu_i, \sigma) \\
\mu_i &= \alpha + \beta(x_i - \bar{x}) \\
\alpha &\sim \text{Normal}(178, 20) \\
\beta  &\sim \text{Normal}(0,10) \\
\sigma &\sim \text{Uniform}(0, 50)
\end{aligned}
\end{equation}

Now the mean depends on each row $i$. And we no longer estimate $\mu$ as a parameter, instead we construct it, assuming the linear model given. Note the lack of $\sim$, the $\mu_i$ is deterministic given the inputs.

In [None]:
# Raw relationship between weight and height among the adults.
sns.scatterplot(data=d2, x='weight', y='height')

In [None]:
# What do our priors mean? Prior predictive simulation: draw (alpha, beta) pairs
# from the priors and plot the regression line that each pair implies.

N = 100
alpha = np.random.normal(loc=178, scale=20, size=N)
beta = np.random.normal(loc=0, scale=10, size=N)

fig, ax = plt.subplots()
x = np.linspace(30, 60, N)
x_bar = d2.weight.mean()
for a, b in zip(alpha, beta):
    # Array arithmetic evaluates the whole line at once; no per-point Python loop needed.
    ax.plot(x, a + b * (x - x_bar), alpha=0.1)

ax.set_xlabel('weight')
ax.set_ylabel('height')
plt.show()

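Note that this is very silly: no one on Earth is shorter than 0 cm or taller than 300 cm, and we expect height to increase with weight. So use a new prior on beta that only allows positive slopes: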

\begin{equation}
\beta  \sim \text{Log-Normal}(0,1) \\
\end{equation}

In [None]:
# Draw from Log-Normal(0, 1) to see what the new prior on beta looks like.
beta = np.random.lognormal(mean=0, sigma=1, size=10_000)
sns.histplot(beta)

In [None]:
# Repeat the prior predictive simulation, now with a log-normal prior on beta.

N = 100
alpha = np.random.normal(loc=178, scale=20, size=N)
beta = np.random.lognormal(mean=0, sigma=1, size=N)

fig, ax = plt.subplots()
x = np.linspace(30, 60, N)
x_bar = d2.weight.mean()
for a, b in zip(alpha, beta):
    # Array arithmetic evaluates the whole line at once; no per-point Python loop needed.
    ax.plot(x, a + b * (x - x_bar), alpha=0.1)

ax.set_xlabel('weight')
ax.set_ylabel('height')
plt.show()

In [None]:
# now generate the posterior, as before

# Mean adult weight; the predictor is centred on it so alpha is the expected height at the mean weight.
x_bar = d2.weight.mean()

# NOTE(review): this reuses the name height_model_2, rebinding the model defined
# under that name earlier in the notebook.
with pm.Model() as height_model_2:
    # Priors: Normal intercept, log-normal slope, Uniform sigma
    alpha = pm.Normal('alpha', mu=178, sigma=20)
    beta = pm.LogNormal('beta', mu=0, sigma=1)
    sigma = pm.Uniform('sigma', lower=0, upper=50)

    # Linear model for the mean: computed from alpha and beta, not given its own prior
    mu = alpha + beta*(d2.weight-x_bar)
    
    # Normal likelihood for the observed heights
    height=pm.Normal('height', mu=mu, sigma=sigma, observed=d2.height) 
    
    trace_regr = pm.sample(1000, tune=1000)

In [None]:
# Trace plot and summary statistics for the regression parameters.
arviz.plot_trace(trace_regr)
plt.tight_layout()
plt.show()
print(arviz.summary(trace_regr, kind='stats'))

In [None]:
# Visualising our posterior. To start with, look at the raw data and the line
# implied by the posterior-mean parameters.
trace_regr_df = trace_regr.posterior.to_dataframe()
fig, ax = plt.subplots()
ax.scatter(data=d2, x='weight', y='height', alpha=0.25)

x = np.linspace(d2.weight.min(), d2.weight.max(), 100)
alpha_mean = trace_regr_df.alpha.mean()
beta_mean = trace_regr_df.beta.mean()
# x_bar is the mean adult weight that the regression model centres weight on.
y = alpha_mean + beta_mean*(x - x_bar)

ax.plot(x, y)
ax.set_xlabel('weight')
ax.set_ylabel('height')

plt.show()


In [None]:
# Now add some sample lines to show the uncertainty in the parameter values.
trace_regr_df = trace_regr.posterior.to_dataframe()
fig, ax = plt.subplots()
ax.scatter(data=d2, x='weight', y='height', alpha=0.25)

x = np.linspace(d2.weight.min(), d2.weight.max(), 100)
alpha_mean = trace_regr_df.alpha.mean()
beta_mean = trace_regr_df.beta.mean()
y = alpha_mean + beta_mean*(x - x_bar)

# Posterior-mean line at full opacity.
ax.plot(x, y, c='green')

# Ten random posterior draws, each drawn as a faint line.
lines = trace_regr_df.sample(10)

# Read alpha and beta by column name: positional unpacking of each row breaks as
# soon as the frame has a different column order or number of columns.
for a, b in zip(lines['alpha'], lines['beta']):
    y = a + b*(x - x_bar)
    ax.plot(x, y, c='green', alpha=.2)

ax.set_xlabel('weight')
ax.set_ylabel('height')
plt.show()



In [None]:
# we can see, as the number of points we're inferring from increases, the uncertainty is reduced.

def get_posterior_from_sample(input_df) -> pd.DataFrame:
    """Fit the centred height-on-weight regression to `input_df`.

    `input_df` must provide `weight` and `height` columns. Weight is centred on
    the mean weight of this particular frame. Returns the posterior draws of the
    fitted model as a DataFrame.
    """
    weights = input_df.weight
    centred_weight = weights - weights.mean()
    with pm.Model():
        intercept = pm.Normal('alpha', mu=178, sigma=20)
        slope = pm.LogNormal('beta', mu=0, sigma=1)
        noise = pm.Uniform('sigma', lower=0, upper=50)
        # The mean is a deterministic linear function of the centred weight.
        expected_height = intercept + slope*centred_weight
        pm.Normal('height', mu=expected_height, sigma=noise, observed=input_df.height)
        idata = pm.sample(1000, tune=1000)
    return idata.posterior.to_dataframe()

# Fit the regression on progressively larger subsets of d2 and plot each fit.
# (The original list repeated 10, fitting the identical subset twice.)
for num_points in [10, 50, 100, len(d2)]:
    sub_df = d2[:num_points]
    # Generate the posterior for just this subset.
    posterior = get_posterior_from_sample(sub_df)
    # get_posterior_from_sample centres weight on the SUBSET mean, so the lines
    # must be drawn around that same centre rather than the full-data x_bar.
    # A separate name leaves the global x_bar untouched for later cells.
    sub_x_bar = sub_df.weight.mean()

    fig, ax = plt.subplots()
    ax.scatter(data=sub_df, x='weight', y='height', alpha=0.25)

    x = np.linspace(30, 65, 100)
    alpha_mean = posterior.alpha.mean()
    beta_mean = posterior.beta.mean()
    y = alpha_mean + beta_mean*(x - sub_x_bar)
    ax.plot(x, y, c='green')

    # Ten random posterior draws, read by column name rather than by position.
    lines = posterior.sample(10)
    for a, b in zip(lines['alpha'], lines['beta']):
        y = a + b*(x - sub_x_bar)
        ax.plot(x, y, c='green', alpha=.2)

    ax.set_title(f'N = {num_points}')
    ax.set_xlabel('weight')
    ax.set_ylabel('height')
    plt.show()


In [None]:
# we can do better by plotting the interval.

# to start with, what's the distribution of the posterior mu at a fixed point (e.g. weight=50)?

# One value of mu per posterior draw: the linear model evaluated at weight = 50.
mu_at_50 = trace_regr_df.alpha + trace_regr_df.beta * (50 - x_bar)

# Kernel density estimate of those values.
sns.kdeplot(mu_at_50)
plt.xlabel('mu | weight=50')

In [None]:
# mu has a distribution, because it is computed from the posterior draws of alpha and beta.
# NOTE(review): beta's prior is log-normal, so "all inputs are Gaussian" does not hold
# exactly; treat the Gaussian shape of mu as approximate. Either way we can compute the
# 89% HPDI directly from the draws.
arviz.hdi(mu_at_50.values, hdi_prob=0.89)

In [None]:
# Posterior draws of alpha as a plain array.
trace_regr_df.alpha.values

In [None]:
# Evaluate mu for every posterior draw (rows) at every weight on a grid (columns),
# so that an interval can be drawn at each weight.
x = np.linspace(d2.weight.min(), d2.weight.max(), 100)
alpha_draws = trace_regr_df.alpha.values[:, np.newaxis]
beta_draws = trace_regr_df.beta.values[:, np.newaxis]
y = alpha_draws + beta_draws * (x - x_bar)


In [None]:
# HDI band for mu across the weight grid, with the raw data and posterior-mean line on top.
fig, ax = plt.subplots()
# Pass the Axes explicitly so the band is drawn on this figure.
arviz.plot_hdi(x, y, ax=ax)
ax.scatter(data=d2, x='weight', y='height', alpha=0.25)
alpha_mean = trace_regr_df.alpha.mean()
beta_mean = trace_regr_df.beta.mean()
# Keep the mean line under its own name. Overwriting `y` (the draws-by-weight
# matrix passed to plot_hdi above) would make this cell fail to redraw the band
# when it is run a second time.
y_mean = alpha_mean + beta_mean*(x - x_bar)
ax.plot(x, y_mean)
ax.set_xlabel('weight')
ax.set_ylabel('height')

plt.show()


In [None]:
# what we want, though, are prediction intervals for h, not for mu.


# Same centred regression as before, re-fitted under its own model name.
x_bar = d2.weight.mean()
with pm.Model() as model_simulate_h:
    # Priors: Normal intercept, log-normal slope, Uniform sigma
    alpha = pm.Normal('alpha', mu=178, sigma=20)
    beta = pm.LogNormal('beta', mu=0, sigma=1)
    sigma = pm.Uniform('sigma', lower=0, upper=50)
    # Deterministic linear model for the mean
    mu = alpha + beta*(d2.weight-x_bar)
    height=pm.Normal('height', mu=mu, sigma=sigma, observed=d2.height) 
    samples = pm.sample(1000, tune=1000)

# Posterior draws as a DataFrame.
samples_df = samples.posterior.to_dataframe()


In [None]:
# Inspect the posterior draws from model_simulate_h.
samples_df