In [None]:
import pandas as pd
import numpy as np
import altair as alt

import pymc as pm
import bambi as bmb
import pymc_bart as pmb
import arviz as az

from pymc import do, observe

# Load the discrimination dataset; the first CSV column becomes the row index.
data = pd.read_csv('./data/discrimination.csv', index_col=0)

# Preview the first rows of the loaded frame.
data.head()


In [None]:
# Baseline logistic regression: discrimination explained by `conservative` only.
coords = {"predictors": ["conservative"]}

with pm.Model(coords=coords) as model:
    # Normal(0, 10) priors for the intercept and the single slope
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta_conservative = pm.Normal("beta_conservative", mu=0, sigma=10)

    # Linear predictor on the logit scale, mapped to a probability
    logit_p = alpha + beta_conservative * data["conservative"]
    p = pm.math.sigmoid(logit_p)

    # Bernoulli likelihood for the observed binary outcome
    likelihood = pm.Bernoulli(
        "discrimination",
        p=p,
        observed=data["discrimination"],
    )

    # Draw posterior samples, then attach posterior predictive draws to the same trace
    trace = pm.sample()
    pm.sample_posterior_predictive(trace, extend_inferencedata=True)

pm.plot_posterior(trace)

In [None]:

# Dummy-code party preference; drop_first=True drops the first level so it
# acts as the reference category.
party_dummies = pd.get_dummies(data["party"], drop_first=True).astype(int)
df = pd.concat([data, party_dummies], axis=1)

# Predictor columns: `conservative` followed by every party dummy.
predictor_columns = ["conservative", *party_dummies.columns]
coords["predictors"] = predictor_columns

with pm.Model(coords=coords) as adjusted_model:
    # Intercept plus one coefficient per entry of the "predictors" coordinate
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta = pm.Normal("beta", mu=0, sigma=10, dims="predictors")

    # Logistic regression: sigmoid(intercept + predictors @ beta)
    predictors = df[predictor_columns]
    linear_term = alpha + pm.math.dot(predictors, beta)
    p = pm.math.sigmoid(linear_term)
    likelihood = pm.Bernoulli(
        "discrimination",
        p=p,
        observed=df["discrimination"],
    )

    party_adjusted_trace = pm.sample()

pm.plot_posterior(party_adjusted_trace)

One simple trick we can use to take a lot of manual work out of defining simple PyMC models is to fit them with design matrices based on Wilkinson notation (this should be very familiar if you have used ``brms`` or ``lme4`` in R or ``statsmodels`` in Python). There are several options for this; here we use ``patsy``.

In [None]:
import patsy

# Build the response vector and design matrix from a Wilkinson-style formula.
formula = "discrimination ~ 1 + conservative + C(party, Treatment)"
y, X = patsy.dmatrices(formula, data=data, return_type='dataframe')

# One coefficient per design-matrix column, labelled by the column name.
predictor_names = list(X.columns)
coords = {"predictors": predictor_names}

with pm.Model(coords=coords) as patsy_model:
    # A single coefficient vector; the formula's `1` term puts the intercept
    # into X, so no separate alpha is declared.
    beta = pm.Normal("beta", mu=0, sigma=10, dims="predictors")

    # Logit-scale linear predictor and its inverse-logit transform
    linear_pred = pm.math.dot(X, beta)
    p = pm.math.sigmoid(linear_pred)

    # Bernoulli likelihood on the flattened response column
    observed_outcome = y.values.ravel()
    likelihood = pm.Bernoulli("discrimination", p=p, observed=observed_outcome)

    # Posterior sampling (this rebinds `party_adjusted_trace`)
    party_adjusted_trace = pm.sample()

In [None]:
# Trace plot of the most recently assigned `party_adjusted_trace`.
pm.plot_trace(party_adjusted_trace)

In [None]:
# Posterior plots for the same `party_adjusted_trace`.
pm.plot_posterior(party_adjusted_trace)

In [None]:
# Bernoulli-family Bambi model for `party[EKRE]`, with main effects and an
# age-by-gender interaction, fitted to the dummy-augmented frame `df`.
party_formula = 'party[EKRE] ~ conservative + gender + age + ethnicity + age:gender'
party_model = bmb.Model(party_formula, df, family='bernoulli')

# Display the model object
party_model

In [None]:
# Fit the Bambi model; log_likelihood=True is requested via idata_kwargs
# (presumably so that az.loo can be computed from `result` — confirm).
result = party_model.fit(idata_kwargs={"log_likelihood": True})

# Trace plot with compact=False.
az.plot_trace(result, compact=False)

In [None]:
# Render the model graph for `party_model`.
party_model.graph()

In [None]:
# Summary table for the fitted `result`.
az.summary(result)

In [None]:
# Generate response-level predictions for the fitted data.
# NOTE(review): presumably stored on `result` in place (no return value is used) — confirm.
party_model.predict(result, kind="response")
# Separation plot for the "party" outcome.
ax = az.plot_separation(result, y="party", figsize=(9,0.5));


In [None]:
# Leave-one-out estimate with pointwise=True; the resulting `pareto_k`
# values are plotted below.
loo = az.loo(result, pointwise=True)
az.plot_khat(loo.pareto_k);


In [None]:
ax = az.plot_khat(loo.pareto_k)

pareto_k_values = loo.pareto_k.values
sorted_kappas = np.sort(pareto_k_values.ravel())

# Threshold is the smaller of the two largest Pareto k values, so the two
# largest values are always flagged.
threshold = sorted_kappas[-2:].min()
ax.axhline(threshold, ls="--", color="orange")

# Positional indices of the observations at or above the threshold
is_influential = pareto_k_values >= threshold
influential_observations = df.reset_index()[is_influential].index

# Label each flagged point with its positional index, just above the marker
for x, y in zip(influential_observations, pareto_k_values[influential_observations]):
    ax.text(x, y + 0.01, str(x), ha="center", va="baseline")

In [None]:
# Show the rows of `df` whose Pareto k value is at or above `threshold`.
df[loo.pareto_k.values >= threshold]

In [None]:
# Prediction grid: every age from 18 to 90 crossed with all
# conservative x ethnicity x gender combinations (2 * 2 * 2 = 8 cells).
age = np.arange(18, 91)
n_age = len(age)

# The factors are nested from slowest- to fastest-varying:
#   conservative (block of 4 * n_age) > ethnicity (2 * n_age) > gender (n_age) > age.
# Ethnicity and gender must use different block sizes; with identical block
# sizes they would vary together and half of the combinations would be missing.
new_data = pd.DataFrame({
    "age": np.tile(age, 8),
    "conservative": np.repeat([0, 1], n_age * 4),
    "ethnicity": np.tile(np.repeat(["estonian", "other"], n_age * 2), 2),
    "gender": np.tile(np.repeat(["male", "female"], n_age), 4),
})
new_data

In [None]:

# Predict on the grid and pull up to 2000 posterior draws of "p".
# NOTE(review): predict is called without capturing a return value, so it
# presumably writes the new-data predictions onto `result` — confirm.
party_model.predict(result, data=new_data)
# az.extract replaces the deprecated az.extract_dataset.
vote_posterior = az.extract(result, num_samples=2000)["p"]

# Posterior mean per grid row: average over the stacked "sample" dimension
# by name rather than by axis position, then attach it as column "p".
predicted_votes = new_data.assign(p=vote_posterior.mean("sample").to_numpy())

# Mean predicted probability by age, one line per gender, faceted by ethnicity.
alt.Chart(predicted_votes).mark_line().encode(
    x="age",
    y="mean(p)",
    color="gender"
).facet(
    "ethnicity:N"
)