In [11]:
import pandas as pd
import numpy as np
import altair as alt

import pymc as pm
import arviz as az

data = pd.read_csv('./data/discrimination.csv', index_col=0)

data.head()


Unnamed: 0,gender,age,ethnicity,party,conservative,discrimination
0,male,36,estonian,Keskerakond,1,0
1,female,50,estonian,SDE,0,0
2,male,67,estonian,Isamaa,1,0
3,male,58,estonian,Reform,0,0
4,male,62,estonian,Reform,1,0


In [2]:
data.groupby('discrimination').conservative.value_counts(normalize=True)

discrimination  conservative
0               1               0.518341
                0               0.481659
1               1               0.774799
                0               0.225201
Name: proportion, dtype: float64

In [3]:

alt.Chart(data.replace({0: 'No', 1: 'Yes'}), width=300).mark_bar().encode(
    x='conservative:N',
    xOffset='discrimination:N',
    y='count()',
    color='discrimination:N'
)

In [12]:
### Let's code the model

coords = {"predictors": ["conservative"]}

with pm.Model(coords=coords) as model:
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta_conservative = pm.Normal("beta_conservative", mu=0, sigma=10)
    
    p = pm.math.sigmoid(alpha + beta_conservative * data["conservative"])
    likelihood = pm.Bernoulli("discrimination", p=p, observed=data["discrimination"])
    
    trace = pm.sample()

print("NModel summary:")
pm.summary(trace)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta_conservative]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 1 seconds.


NModel summary:


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,-1.276,0.088,-1.444,-1.112,0.003,0.002,1161.0,1589.0,1.01
beta_conservative,1.158,0.105,0.968,1.365,0.003,0.002,1162.0,1512.0,1.01


In [14]:
### Same with plain regression model

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Define the logistic regression formula
formula = 'discrimination ~ conservative'

# Fit the logistic regression model
model = smf.logit(formula=formula, data=data)
results = model.fit()

# Print the summary of the model
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.626752
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:         discrimination   No. Observations:                 2000
Model:                          Logit   Df Residuals:                     1998
Method:                           MLE   Df Model:                            1
Date:                Thu, 13 Mar 2025   Pseudo R-squ.:                 0.05114
Time:                        21:43:48   Log-Likelihood:                -1253.5
converged:                       True   LL-Null:                       -1321.1
Covariance Type:            nonrobust   LLR p-value:                 3.102e-31
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -1.2796      0.087    -14.670      0.000      -1.451      -1.109
conservative     1.1622

In [26]:
### Model stratified by party preference

# Add party preference as dummy variables
party_dummies = pd.get_dummies(data["party"], drop_first=True).astype(int)
df = pd.concat([data, party_dummies], axis=1)

# Add dummies to predictors
coords["predictors"] = ["conservative"] + list(party_dummies.columns)

with pm.Model(coords=coords) as adjusted_model:
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta = pm.Normal("beta", mu=0, sigma=10, dims="predictors")
    
    predictors = df[["conservative"] + list(party_dummies.columns)]
    p = pm.math.sigmoid(alpha + pm.math.dot(predictors, beta))
    likelihood = pm.Bernoulli("discrimination", p=p, observed=df["discrimination"])
    
    party_adjusted_trace = pm.sample()
    
pm.summary(party_adjusted_trace)

Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 4 seconds.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,-1.649,0.299,-2.198,-1.104,0.011,0.007,732.0,1015.0,1.01
beta[conservative],0.082,0.167,-0.229,0.392,0.004,0.003,1702.0,1617.0,1.0
beta[EKRE],2.771,0.357,2.142,3.464,0.013,0.009,718.0,1091.0,1.01
beta[Isamaa],0.545,0.344,-0.079,1.198,0.013,0.008,732.0,1139.0,1.01
beta[Keskerakond],2.163,0.35,1.486,2.807,0.013,0.009,745.0,1118.0,1.01
beta[Parempoolsed],-0.109,0.4,-0.872,0.617,0.013,0.007,1011.0,1318.0,1.01
beta[Reform],0.023,0.34,-0.608,0.651,0.012,0.007,839.0,1201.0,1.01
beta[SDE],-0.111,0.352,-0.775,0.545,0.013,0.007,794.0,1216.0,1.01


In [27]:
### Kitchen sink model

# Add party preference as dummy variables
party_dummies = pd.get_dummies(data["party"], drop_first=True).astype(int)
df = pd.concat([data, party_dummies], axis=1)

# Convert ethnicity and gender values to 0 and 1
df.replace({'male': 1, 'female': 0, 'estonian': 1, 'other':0}, inplace=True)

# Scale the age column and add it to predictors together with gender and ethnicity
df["age_scaled"] = (df["age"] - df["age"].mean()) / df["age"].std()
coords["predictors"] = ["conservative", "age_scaled", "gender", "ethnicity"] + list(party_dummies.columns)

with pm.Model(coords=coords) as adjusted_model:
    alpha = pm.Normal("alpha", mu=0, sigma=10)
    beta = pm.Normal("beta", mu=0, sigma=10, dims="predictors")
    
    predictors = df[["conservative", "age_scaled", "gender", "ethnicity"] + list(party_dummies.columns)]
    p = pm.math.sigmoid(alpha + pm.math.dot(predictors, beta))
    likelihood = pm.Bernoulli("discrimination", p=p, observed=df["discrimination"])
    
    kitchen_sink_trace = pm.sample()
    
pm.summary(kitchen_sink_trace)

  df.replace({'male': 1, 'female': 0, 'estonian': 1, 'other':0}, inplace=True)
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [alpha, beta]


Output()

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 5 seconds.


Unnamed: 0,mean,sd,hdi_3%,hdi_97%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
alpha,-1.656,0.354,-2.321,-1.014,0.009,0.006,1419.0,1821.0,1.0
beta[conservative],0.079,0.161,-0.217,0.383,0.003,0.002,2715.0,2655.0,1.0
beta[age_scaled],-0.057,0.055,-0.162,0.045,0.001,0.001,3363.0,2770.0,1.0
beta[gender],0.052,0.109,-0.162,0.247,0.002,0.002,3890.0,2905.0,1.0
beta[ethnicity],-0.023,0.177,-0.372,0.284,0.003,0.003,3263.0,2533.0,1.0
beta[EKRE],2.777,0.354,2.069,3.395,0.01,0.006,1261.0,2111.0,1.0
beta[Isamaa],0.549,0.342,-0.096,1.17,0.01,0.005,1282.0,2203.0,1.0
beta[Keskerakond],2.157,0.361,1.506,2.846,0.01,0.006,1264.0,1769.0,1.0
beta[Parempoolsed],-0.106,0.403,-0.859,0.632,0.01,0.006,1647.0,2514.0,1.0
beta[Reform],0.02,0.336,-0.62,0.652,0.009,0.005,1373.0,2008.0,1.0


In [23]:

# Define the kitchen sink regression formula
formula = 'discrimination ~ conservative + ethnicity + gender + party'

# Fit the logistic regression model
model = smf.logit(formula=formula, data=data)
results = model.fit()

# Print the summary of the model
print(results.summary())

Optimization terminated successfully.
         Current function value: 0.527911
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:         discrimination   No. Observations:                 2000
Model:                          Logit   Df Residuals:                     1990
Method:                           MLE   Df Model:                            9
Date:                Thu, 13 Mar 2025   Pseudo R-squ.:                  0.2008
Time:                        21:46:03   Log-Likelihood:                -1055.8
converged:                       True   LL-Null:                       -1321.1
Covariance Type:            nonrobust   LLR p-value:                1.695e-108
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -1.6479      0.302     -5.465      0.000      -2.239      -1.