<a href="https://colab.research.google.com/github/taliafabs/sta496/blob/main/VoteChoice_RaceDep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building Logistic Regression Vote Choice Models Using Recent U.S. Presidential Election Data


## Introduction

## Data

The datasets used for this project are the 2020 and 2024 CES survey datasets, obtained from Harvard Dataverse. They can be found at the following links:
* 2024:
* 2020:

### Loading the data

In [1]:
# Workplace setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pymc as pm

In [2]:
# loading the data
from google.colab import drive
drive.mount('/content/drive')

# 2024
ces24 = pd.read_stata("/content/drive/MyDrive/STA496/Datasets/CES24_Common.dta")

# 2020
ces20 = pd.read_stata("/content/drive/MyDrive/STA496/Datasets/CES20_Common_OUTPUT_vv.dta")

Mounted at /content/drive


In [None]:
# data preparation
ces24_subset = ces24[['gender4', 'race', 'hispanic', 'educ', 'marstat', 'inputstate', 'region', 'birthyr',
                  'ownhome', 'urbancity', 'industry',
                  'religpew', 'pew_religimp', 'pew_churatd', 'pew_prayer',
                  'CC24_361b', 'CC24_363', 'CC24_364a', 'CC24_364b', 'presvote20post', 'pid3', 'pid7'
                  ]]

ces20_subset = ces20[['gender', 'race', 'hispanic', 'educ', 'marstat', 'inputstate', 'region', 'birthyr',
                      'ownhome', 'urbancity', 'industryclass',
                      'religpew', 'pew_religimp', 'pew_churatd', 'pew_prayer',
                      'votereg', 'votereg_f','CC20_364a', 'CC20_364b', 'presvote16post', 'pid3', 'pid7']]

# only include trump and biden/harris
ces24_subset = ces24_subset[
    (ces24_subset['CC24_364a'] == "Kamala Harris (Democrat)") |
    (ces24_subset['CC24_364a'] == "Donald Trump (Republican)")
]

ces20_subset = ces20_subset[
    (ces20_subset['CC20_364a'] == "Joe Biden (Democrat)") |
    (ces20_subset['CC20_364a'] == "Donald Trump (Republican)")
]

# create vote_trump binary variable
ces24_subset['vote_trump'] = np.where(ces24_subset['CC24_364a'] == 'Donald Trump (Republican)', 1, 0)
ces20_subset['vote_trump'] = np.where(ces20_subset['CC20_364a'] == 'Donald Trump (Republican)', 1, 0)

# columns
ces24_subset['age'] = 2024 - ces24_subset['birthyr']

ces24_subset['age_bracket'] = pd.cut(
    ces24_subset['age'],
    bins=[17, 24, 34, 44, 54, 64, 74, 100],
    labels=[
        '18–24', '25–34', '35–44', '45–54', '55–64', '65–74', '75+'
    ]
)

ces24_subset = ces24_subset[[
    'vote_trump',
    'age_bracket',
    'gender4',
    'race',
    # 'hispanic',
    'educ',
    # 'marstat',
    # 'inputstate',
    'region',
    'urbancity',
    'religpew',
    # 'pew_religimp',
    # 'pew_churatd',
    'pid3'
]]

# 2020
ces20_subset['age'] = 2020 - ces20_subset['birthyr']

ces20_subset['age_bracket'] = pd.cut(
    ces20_subset['age'],
    bins=[17, 24, 34, 44, 54, 64, 74, 100],
    labels=[
        '18–24', '25–34', '35–44', '45–54', '55–64', '65–74', '75+'
    ]
)

ces20_subset = ces20_subset[[
    'vote_trump',
    'age_bracket',
    'gender',
    'race',
    # 'hispanic',
    'educ',
    # 'marstat',
    # 'inputstate',
    'region',
    'urbancity',
    'religpew',
    # 'pew_religimp',
    # 'pew_churatd',
    'pid3'
]]

# categorical predictors
predictors24 = ces24_subset.columns.drop('vote_trump')
ces24_subset[predictors24] = ces24_subset[predictors24].astype('category')
predictors20 = ces20_subset.columns.drop('vote_trump')
ces20_subset[predictors20] = ces20_subset[predictors20].astype('category')

ces24_subset.dropna(inplace=True)
ces20_subset.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ces20_subset['vote_trump'] = np.where(ces20_subset['CC20_364a'] == 'Donald Trump (Republican)', 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ces20_subset['age'] = 2020 - ces20_subset['birthyr']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ces20_subset['age_bracket'] = pd.cut(
A value is t

In [None]:
# Dummy variables and interaction terms
# X_24 = ces24_subset.drop(columns=['vote_trump'])
# y_24 = ces24_subset['vote_trump']
# categorical_features = X_24.select_dtypes(include=['category']).columns

race_dummies_24 = pd.get_dummies(ces24_subset['race'], prefix='race', drop_first=True)
gender_dummies_24 = pd.get_dummies(ces24_subset['gender4'], prefix='gender', drop_first=True)
education_dummies_24 = pd.get_dummies(ces24_subset['educ'], prefix='educ', drop_first=True)

race_dummies_20 = pd.get_dummies(ces20_subset['race'], prefix='race', drop_first=True)
gender_dummies_20 = pd.get_dummies(ces20_subset['gender'], prefix='gender', drop_first=True)
education_dummies_20 = pd.get_dummies(ces20_subset['educ'], prefix='educ', drop_first=True)

ces24_subset_ = pd.get_dummies(ces24_subset, drop_first=True)
ces20_subset_ = pd.get_dummies(ces20_subset, drop_first=True)

interaction_terms_24 = pd.DataFrame()
interaction_terms_20 = pd.DataFrame()

# 2024 interaction dummies
for col1 in race_dummies_24.columns: # go thru every race dummy
    for col2 in gender_dummies_24.columns: # race * gender
        interaction_terms_24[f'{col1}_{col2}'] = race_dummies_24[col1] * gender_dummies_24[col2]
    for col3 in education_dummies_24.columns: # race * education
        interaction_terms_24[f'{col1}_{col3}'] = race_dummies_24[col1] * education_dummies_24[col3]

# 2020 interaction dummies
for col1 in race_dummies_20.columns: # go through every race dummy
    for col2 in gender_dummies_20.columns: # race * gender
        interaction_terms_20[f'{col1}_{col2}'] = race_dummies_20[col1] * gender_dummies_20[col2]
    for col3 in education_dummies_20.columns: # race * education
        interaction_terms_20[f'{col1}_{col3}'] = race_dummies_20[col1] * education_dummies_20[col3]
# combine
df_24_model = pd.concat([
    ces24_subset_.drop(columns=['race', 'gender', 'educ'], errors='ignore'),  # optional
    race_dummies_24,
    gender_dummies_24,
    education_dummies_24,
    interaction_terms_24
], axis=1)

df_20_model = pd.concat([
    ces20_subset_.drop(columns=['race', 'gender', 'educ'], errors='ignore'),  # optional
    race_dummies_20,
    gender_dummies_20,
    education_dummies_20,
    interaction_terms_20
], axis=1)

In [None]:
# list(df_24_model.columns)

## Machine Learning (ML) Approach

### Setup

### Models

I will evaluate the performance of the following logistic regression models on the CES 2020 and 2024 datasets:

In [None]:
# full model
X_24 = df_24_model.drop(columns=['vote_trump'])
y_24 = df_24_model['vote_trump']

# train test split
X_train_24, X_test_24, y_train_24, y_test_24 = train_test_split(X_24, y_24, test_size=0.25, random_state=42)

# train
model0_ml = LogisticRegression()
model0_ml.fit(X_train_24, y_train_24)

# predict
y_pred_24 = model0_ml.predict(X_test_24)

# evaluate
print(classification_report(y_test_24, y_pred_24))

              precision    recall  f1-score   support

           0       0.91      0.94      0.93      1609
           1       0.85      0.79      0.82       703

    accuracy                           0.89      2312
   macro avg       0.88      0.87      0.87      2312
weighted avg       0.89      0.89      0.89      2312



In [None]:
# reduced model (remove race main effect)
cols_to_drop = [
    "vote_trump"
    "race_Black",
    "race_Hispanic",
    "race_Asian",
    "race_Native American",
    "race_Two or more races",
    "race_Other",
    "race_Middle Eastern"
]

X_24_reduced1 = df_24_model.drop(columns=cols_to_drop, errors='ignore')
y_24_reduced1 = df_24_model['vote_trump']

# train
X_train_24_reduced1, X_test_24_reduced1, y_train_24_reduced1, y_test_24_reduced1 = train_test_split(X_24_reduced1, y_24_reduced1, test_size=0.25, random_state=42)
model1_ml = LogisticRegression()
model1_ml.fit(X_train_24_reduced1, y_train_24_reduced1)

# predict
y_pred_24_reduced1 = model1_ml.predict(X_test_24_reduced1)

# evaluate
print(classification_report(y_test_24_reduced1, y_pred_24_reduced1))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1609
           1       1.00      1.00      1.00       703

    accuracy                           1.00      2312
   macro avg       1.00      1.00      1.00      2312
weighted avg       1.00      1.00      1.00      2312



In [None]:
# remove race interactions, but keep race main effect

In [None]:
# 2020
X_20 = df_20_model.drop(columns=['vote_trump'])
y_20 = df_20_model['vote_trump']

# train test split
X_train_20, X_test_20, y_train_20, y_test_20 = train_test_split(X_20, y_20, test_size=0.25, random_state=42)
model0_ml = LogisticRegression()
model0_ml.fit(X_train_20, y_train_20)

# predict
y_pred_20 = model0_ml.predict(X_test_20)

# report accuracy
print(classification_report(y_test_20, y_pred_20))

### Results

## Frequentist Approach

### Setup
Likelihood ratio test will compare nested logistic GLMs to determine whether the more complex model provides a significantly better explanation of the data than the simpler model.

Complex model:

\begin{align*}
\frac{p(\text{vote_trump})}{1 - p(\text{(vote_trump})} = \beta_0 + \beta_1 \cdot \text{race} + \beta_2 \cdot \text{gender} + \beta_3 \cdot \text{educ} + \beta_4 \cdot \text{region} + \beta_5 \cdot \text{urbancity} + \beta_6 \cdot \text{religion} + \beta_7 \cdot \text{party_id} + \beta_8 \cdot (\text{race} \times \text{gender}) + \beta_9 \cdot (\text{race} \times \text{educ})
\end{align*}

Reduced models:

remove race interactions
\begin{align*}
\frac{p(\text{vote_trump})}{1 - p(\text{(vote_trump})} = \beta_0 + \beta_1 \cdot \text{race} + \beta_2 \cdot \text{gender} + \beta_3 \cdot \text{educ} + \beta_4 \cdot \text{region} + \beta_5 \cdot \text{urbancity} + \beta_6 \cdot \text{religion} + \beta_7 \cdot \text{party_id}
\end{align*}

remove race main effect and interactions
\begin{align*}
\frac{p(\text{vote_trump})}{1 - p(\text{(vote_trump})} = \beta_0 + \beta_1 \cdot \text{race} + \beta_2 \cdot \text{gender} + \beta_3 \cdot \text{educ} + \beta_4 \cdot \text{region} + \beta_5 \cdot \text{urbancity} + \beta_6 \cdot \text{religion} + \beta_7 \cdot \text{party_id}
\end{align*}

### Models

Likelihood ratio test and frequentist model evals
Is the race main effect significant?
How did the significance of race change between 2020 and 2024?

In [None]:
# 2024 include race as main effect with race
ces24_model1 = smf.logit("vote_trump ~ C(race) + C(gender4) + C(educ) + C(age_bracket) + C(region) + C(urbancity) + C(religpew) + C(pid3)",
                         data=ces24_subset).fit(maxiter=10000)

Optimization terminated successfully.
         Current function value: 0.226628
         Iterations 9


In [None]:
# remove race from the model
ces24_model2 = smf.logit("vote_trump ~  C(gender4) + C(educ) + C(age_bracket) + C(region) + C(urbancity) + C(religpew) + C(pid3)",
                         data=ces24_subset).fit(maxiter=10000)

Optimization terminated successfully.
         Current function value: 0.228656
         Iterations 9


In [None]:
# 2020 include race as main effect
ces20_model1 = smf.logit("vote_trump ~ C(race) + C(gender) + C(educ) + C(age_bracket) + C(region) + C(urbancity) + C(religpew) + C(pid3)",
                         data=ces20_subset).fit(maxiter=10000)

In [None]:
# 2020 drop race main effect
ces20_model2 = smf.logit("vote_trump ~  C(gender) + C(educ) + C(age_bracket) + C(region) + C(urbancity) + C(religpew) + C(pid3)",
                         data=ces20_subset).fit(maxiter=10000)

### Results

In [None]:
# likelihood ratio test (2024)

In [None]:
print(ces24_model1.summary())

                           Logit Regression Results                           
Dep. Variable:             vote_trump   No. Observations:                 9242
Model:                          Logit   Df Residuals:                     9198
Method:                           MLE   Df Model:                           43
Date:                Mon, 23 Jun 2025   Pseudo R-squ.:                  0.6336
Time:                        00:17:25   Log-Likelihood:                -2094.5
converged:                       True   LL-Null:                       -5716.1
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                               coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------
Intercept                                   -3.9217      0.418     -9.379      0.000      -4.741      -3.102
C(race)[T.Black]                         

In [None]:
print(ces24_model2.summary())

                           Logit Regression Results                           
Dep. Variable:             vote_trump   No. Observations:                 9242
Model:                          Logit   Df Residuals:                     9205
Method:                           MLE   Df Model:                           36
Date:                Mon, 23 Jun 2025   Pseudo R-squ.:                  0.6303
Time:                        00:17:35   Log-Likelihood:                -2113.2
converged:                       True   LL-Null:                       -5716.1
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                               coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------
Intercept                                   -4.1760      0.413    -10.119      0.000      -4.985      -3.367
C(gender4)[T.Woman]                      

In [None]:
print(ces20_model1.summary())

In [None]:
print(ces20_model2.summary())

In [None]:
# likelihood ratio test