# Chapter 5

Multiple regression and some causal inference.

In [None]:
import arviz
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import seaborn as sns
import pandas as pd
import pymc as pm 

import pybayes

# Use seaborn's "white" style for every figure drawn below.
sns.set_style("white") 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
%load_ext watermark
%watermark -v -m -p arviz,matplotlib,numpy,scipy,seaborn,pandas,pymcs

In [None]:
# The running example for this chapter is the 'waffle houses and divorce'
# dataset; the CSV is read with ';' as the field separator.
waffle_url = (
    "https://raw.githubusercontent.com/rmcelreath/rethinking/"
    "master/data/WaffleDivorce.csv"
)

waffle_divorce_df = pd.read_csv(waffle_url, sep=";")

In [None]:
# Display the raw dataframe to eyeball the columns before transforming them.
waffle_divorce_df

In [None]:
# scale our variables

def standardise(column: pd.Series) -> pd.Series:
    """Return *column* rescaled to zero mean and unit standard deviation.

    The spread is measured with ``Series.std`` using its default arguments.
    """
    centred = column - column.mean()
    return centred / column.std()

In [None]:
# Add standardised copies of the outcome and the two predictors under the
# short names used in the model formulas: D (divorce rate), M (marriage rate)
# and A (median age at marriage).
for short_name, raw_column in (
    ("D", "Divorce"),
    ("M", "Marriage"),
    ("A", "MedianAgeMarriage"),
):
    waffle_divorce_df[short_name] = standardise(waffle_divorce_df[raw_column])

In [None]:
# Covariance matrix of the three standardised variables. Each column was
# divided by its own standard deviation, so the off-diagonal entries can be
# read as pairwise correlations.
waffle_divorce_df[['D', 'M', 'A']].cov()

Start with a basic linear regression model:
\begin{equation}
\begin{aligned}
D_i &\sim \mathcal{N}(\mu_i, \sigma) \\
\mu_i &= \alpha + \beta_A A_i  \\
\alpha &\sim \text{Normal}(0, 0.2) \\
\beta_A &\sim \text{Normal}(0, 0.5) \\
\sigma &\sim \text{Exponential}(1)
\end{aligned}
\end{equation}



In [None]:
# TODO: factor the model-building boilerplate below into a function.

# Single-predictor regression of standardised divorce rate (D) on
# standardised median age at marriage (A).
with pm.Model() as model_age_vs_divorce:
    # Priors for the intercept, the slope and the observation noise.
    alpha = pm.Normal("alpha", mu=0, sigma=0.2)
    beta = pm.Normal("beta", mu=0, sigma=0.5)
    sigma = pm.Exponential("sigma", scale=1)

    # Linear predictor, stored under the name "mu" so it can be read back
    # from the sampled posterior.
    mu = pm.Deterministic("mu", alpha + beta * waffle_divorce_df["A"])

    # Likelihood of the observed standardised divorce rates.
    divorce_rate = pm.Normal(
        "divorce_rate",
        mu=mu,
        sigma=sigma,
        observed=waffle_divorce_df["D"],
    )

    # Sample the posterior, then compute the MAP point estimate.
    model_age_vs_divorce_samples = pm.sample(draws=1000, tune=1000)
    map_vals = pm.find_MAP()

In [None]:
# Posterior draws of the linear predictor mu.
mu_pred = model_age_vs_divorce_samples.posterior["mu"]

# Posterior predictive draws of the outcome from the same fitted model.
divorce_pred = pm.sample_posterior_predictive(
    model_age_vs_divorce_samples,
    model=model_age_vs_divorce,
)

In [None]:
# Evenly spaced grid covering the observed range of standardised ages.
x = np.linspace(
    waffle_divorce_df["A"].min(),
    waffle_divorce_df["A"].max(),
    100,
)

# Regression line evaluated at the point estimates held in map_vals.
mu = map_vals["alpha"] + map_vals["beta"] * x

plt.plot(x, mu, color="#1b9e77")
plt.scatter(waffle_divorce_df["A"], waffle_divorce_df["D"], alpha=0.25)

# 89% HDI of the posterior draws of mu.
arviz.plot_hdi(
    waffle_divorce_df["A"],
    mu_pred,
    hdi_prob=0.89,
    fill_kwargs={"alpha": 0.25, "color": "#1b9e77"},
)
# 89% HDI of the posterior predictive draws of the divorce rate.
arviz.plot_hdi(
    waffle_divorce_df["A"],
    divorce_pred.posterior_predictive["divorce_rate"],
    hdi_prob=0.89,
    fill_kwargs={"alpha": 0.1, "color": "#1b9e77"},
)

plt.show()

In [None]:
# Prior check: draw intercept/slope pairs from the priors and plot the
# regression line each pair implies over the observed range of A.
num_lines = 100
alpha_prior = np.random.normal(loc=0, scale=0.2, size=num_lines)
beta_prior = np.random.normal(loc=0, scale=0.5, size=num_lines)

fig, ax = plt.subplots()
num_points = 100
x = np.linspace(
    waffle_divorce_df["A"].min(),
    waffle_divorce_df["A"].max(),
    num_points,
)

# One faint line per prior draw; the line is computed for the whole grid at once.
for intercept, slope in zip(alpha_prior, beta_prior):
    ax.plot(x, intercept + slope * x, alpha=0.1)

ax.set_xlabel("standardised median age")
ax.set_ylabel("standardised divorce rate")

plt.show()

In [None]:
# Histogram of the posterior for the slope: flatten the draws of "beta"
# into a single 1-D array before plotting.
beta_draws = model_age_vs_divorce_samples.posterior["beta"].values.flatten()
sns.histplot(beta_draws)

In [None]:
# what does the value of beta mean, in the context of multiple regression?

## Multiple regression


Set up our regression just like the single-variable regression:

\begin{equation}
\begin{aligned}
D_i &\sim \mathcal{N}(\mu_i, \sigma) \\
\mu_i &= \alpha + \beta_A A_i + \beta_M M_i \\
\alpha &\sim \text{Normal}(0, 0.2) \\
\beta_A &\sim \text{Normal}(0, 0.5) \\
\beta_M &\sim \text{Normal}(0, 0.5) \\
\sigma &\sim \text{Exponential}(1)
\end{aligned}
\end{equation}



In [None]:

# Multiple regression of standardised divorce rate (D) on both standardised
# median age at marriage (A) and standardised marriage rate (M).
with pm.Model() as model_age_and_rate_vs_divorce:
    # Priors for the intercept, one slope per predictor, and the noise scale.
    alpha = pm.Normal("alpha", mu=0, sigma=0.2)
    beta_a = pm.Normal("beta_a", mu=0, sigma=0.5)
    beta_m = pm.Normal("beta_m", mu=0, sigma=0.5)
    sigma = pm.Exponential("sigma", scale=1)

    # Linear predictor combining both predictors, stored under the name "mu".
    mu = pm.Deterministic(
        "mu",
        alpha + beta_a * waffle_divorce_df["A"] + beta_m * waffle_divorce_df["M"],
    )

    # Likelihood of the observed standardised divorce rates.
    divorce_rate = pm.Normal(
        "divorce_rate",
        mu=mu,
        sigma=sigma,
        observed=waffle_divorce_df["D"],
    )

    # Sample the posterior, then compute the MAP point estimate.
    model_age_and_rate_vs_divorce_samples = pm.sample(draws=1000, tune=1000)
    map_vals = pm.find_MAP()

In [None]:
# Posterior summary table. The "~" prefix in var_names excludes a variable,
# so the per-observation `mu` values are left out of the table.
arviz.summary(model_age_and_rate_vs_divorce_samples.posterior, var_names=['~mu'])

If you compare this to the single-variable models, beta_m changes a lot while beta_a stays about the same. One interpretation is that once we know the median age at marriage (A), there is little additional information in knowing the marriage rate (M): conditional on A, D and M are independent.

## Plotting

### predictor residual plots 

The predictor residual is the average prediction error when we use all the other variables to model a predictor. In our example, we model the marriage rate (M) using the median age at marriage (A):

\begin{equation}
\begin{aligned}
M_i &\sim \mathcal{N}(\mu_i, \sigma) \\
\mu_i &= \alpha + \beta A_i \\
\alpha &\sim \text{Normal}(0, 0.2) \\
\beta &\sim \text{Normal}(0, 0.5) \\
\sigma &\sim \text{Exponential}(1)
\end{aligned}
\end{equation}



In [None]:
# Predictor-residual model: regress the standardised marriage rate (M) on the
# standardised median age at marriage (A).
with pm.Model() as model_5_13:
    # priors
    alpha = pm.Normal('alpha', mu=0, sigma=0.2)
    beta = pm.Normal('beta', mu=0, sigma=0.5)
    sigma = pm.Exponential('sigma', scale=1)
    # model
    mu = pm.Deterministic('mu', alpha + beta * waffle_divorce_df.A)
    # likelihood
    M = pm.Normal('m', mu=mu, sigma=sigma, observed=waffle_divorce_df.M)
    model_5_13_samples = pm.sample(1000, tune=1000)
    map_vals = pm.find_MAP()

    # Posterior draws of mu must come from THIS model's trace, not from the
    # age-vs-divorce model fitted earlier in the notebook.
    mu_pred = model_5_13_samples.posterior['mu']
    # Posterior predictive draws of M; the model is taken from the enclosing
    # `with` context.
    m_pred = pm.sample_posterior_predictive(model_5_13_samples)

In [None]:
# Predictor residual: observed standardised marriage rate minus the `mu`
# entry of map_vals (presumably the MAP fit of model_5_13 -- this relies on
# that cell being the last one to assign map_vals).
residual = waffle_divorce_df["M"] - map_vals["mu"]

In [None]:
# Predictor residual plot: marriage-rate residuals (x) against the
# standardised divorce rate (y).
plt.scatter(residual, waffle_divorce_df.D)