In [None]:
#Importing all the necessary libraries
import pandas as pd
import numpy as np
import csv
pd.set_option('display.max_columns', None)
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Load data
df = pd.read_csv('EEE_Regression_Data.csv')

In [None]:
# Try to clean data of unnecessary characters
pct_cols = [ '% ≥ Some-college', '% Foreign-born-citizens']
for c in pct_cols:
    df[c] = df[c].replace('%', '').astype(float)/100

num_cols = ['Dem votes','Rep votes','3rd-party','Total turnout',
            'Population', 'VAP (log )']
df[num_cols] = df[num_cols].replace({',': '', ' ': ''}, regex=True).astype(float)

In [None]:
#Create new variables for the regression
df['dem_share'] = df['Dem votes'] / (df['Total turnout'])
df['rep_share'] = df['Rep votes'] / (df['Total turnout'])
df['third_party_share'] = df['3rd-party'] / (df['Total turnout'])
df['Arab_dummy'] = (df['% Arab of pop'] >= 0.10).astype(int) 
df['Post'] = (df['Year'] == 2024).astype(int)
df['DiD'] = df['Arab_dummy'] * df['Post']
df['log_VAP']    = np.log(df['VAP (log )'])

In [None]:
#Leooking at the data
df

Unnamed: 0,City,Year,Dem votes,Rep votes,3rd-party,Total turnout,Population,Arab pop,% Arab of pop,Some college+,% ≥ Some-college,% Foreign-born-citizens,VAP (log ),Unnamed: 13,Unnamed: 14,dem_share,rep_share,third_party_share,Arab_dummy,Post,DiD,log_VAP
0,Dearborn,2020.0,30719.0,13239.0,605.0,44563.0,93927.0,40725.0,0.4336,41183.0,0.00438,0.00392,0.748,,,0.689339,0.297085,0.013576,1,0,0,-0.290352
1,Dearborn Hts,2020.0,16623.0,9749.0,346.0,26718.0,55630.0,15069.0,0.2709,23159.0,0.00416,0.00368,0.756,,,0.622165,0.364885,0.01295,1,0,0,-0.279714
2,Hamtramck,2020.0,6628.0,1042.0,75.0,7745.0,21704.0,5366.0,0.2472,5449.0,0.00251,0.00461,0.659,,,0.855778,0.134538,0.009684,1,0,0,-0.417032
3,Dane County,2020.0,260185.0,78800.0,5801.0,344786.0,546695.0,1464.0,0.0027,336245.0,0.00615,0.00039,0.801,,,0.754628,0.228548,0.016825,0,0,0,-0.221894
4,Dearborn,2024.0,15208.0,17802.0,8359.0,41369.0,105818.0,48520.0,0.4585,48014.0,0.00454,0.00392,0.748,,,0.367618,0.430322,0.20206,1,1,1,-0.290352
5,Dearborn Hts,2024.0,9652.0,11083.0,4159.0,24894.0,62099.0,20795.0,0.3349,26760.0,0.00431,0.00368,0.756,,,0.387724,0.445208,0.167068,1,1,1,-0.279714
6,Hamtramck,2024.0,3271.0,3017.0,741.0,7029.0,27830.0,8479.0,0.3047,7004.0,0.00252,0.00461,0.659,,,0.465358,0.429222,0.10542,1,1,1,-0.417032
7,Dane County,2024.0,273995.0,85454.0,6477.0,365926.0,564777.0,2458.0,0.0044,354772.0,0.00628,0.00039,0.801,,,0.748772,0.233528,0.0177,0,1,0,-0.221894
8,,,,,,,,,,,,,,,,,,,0,0,0,
9,,,,,,,,,,,,,,,,,,,0,0,0,


In [None]:
#Drop ghost columns
df = df.drop(columns=['Unnamed: 13', 'Unnamed: 14'])

In [None]:
#Remove excel whitespace
df= df[0:8]

In [None]:
#make sure the columns are in the right format
df

Unnamed: 0,City,Year,Dem votes,Rep votes,3rd-party,Total turnout,Population,Arab pop,% Arab of pop,Some college+,% ≥ Some-college,% Foreign-born-citizens,VAP (log ),dem_share,rep_share,third_party_share,Arab_dummy,Post,DiD,log_VAP
0,Dearborn,2020.0,30719.0,13239.0,605.0,44563.0,93927.0,40725,0.4336,41183,0.00438,0.00392,0.748,0.689339,0.297085,0.013576,1,0,0,-0.290352
1,Dearborn Hts,2020.0,16623.0,9749.0,346.0,26718.0,55630.0,15069,0.2709,23159,0.00416,0.00368,0.756,0.622165,0.364885,0.01295,1,0,0,-0.279714
2,Hamtramck,2020.0,6628.0,1042.0,75.0,7745.0,21704.0,5366,0.2472,5449,0.00251,0.00461,0.659,0.855778,0.134538,0.009684,1,0,0,-0.417032
3,Dane County,2020.0,260185.0,78800.0,5801.0,344786.0,546695.0,1464,0.0027,336245,0.00615,0.00039,0.801,0.754628,0.228548,0.016825,0,0,0,-0.221894
4,Dearborn,2024.0,15208.0,17802.0,8359.0,41369.0,105818.0,48520,0.4585,48014,0.00454,0.00392,0.748,0.367618,0.430322,0.20206,1,1,1,-0.290352
5,Dearborn Hts,2024.0,9652.0,11083.0,4159.0,24894.0,62099.0,20795,0.3349,26760,0.00431,0.00368,0.756,0.387724,0.445208,0.167068,1,1,1,-0.279714
6,Hamtramck,2024.0,3271.0,3017.0,741.0,7029.0,27830.0,8479,0.3047,7004,0.00252,0.00461,0.659,0.465358,0.429222,0.10542,1,1,1,-0.417032
7,Dane County,2024.0,273995.0,85454.0,6477.0,365926.0,564777.0,2458,0.0044,354772,0.00628,0.00039,0.801,0.748772,0.233528,0.0177,0,1,0,-0.221894


In [None]:
# Create the regression model and fit it
feature_cols = ['Post','Arab_dummy','DiD','% ≥ Some-college', '% Foreign-born-citizens','log_VAP']

X = df[feature_cols].values
y = df['third_party_share'].values


In [None]:
#Wait, fit it down here instead
model = LinearRegression(fit_intercept=True)
model.fit(X, y)

In [None]:
#Yay we have a model, what are the results?
print("R²:", model.score(X, y))

for name, coef in zip(feature_cols, model.coef_):
    print(f"{name:20s}  {coef:+.4f}")

R²: 0.9951581498568293
Post                  -0.0682
Arab_dummy            +3.6103
DiD                   +0.1576
% ≥ Some-college      +531.4939
% Foreign-born-citizens  -1012.7173
log_VAP               -13.2797


In [None]:
#Now make this into a format that is actually readable
X_sm = sm.add_constant(X)
sm_model = sm.OLS(y, X_sm).fit(cov_type='HC2')
print(sm_model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.995
Model:                            OLS   Adj. R-squared:                  0.966
Method:                 Least Squares   F-statistic:                     60.32
Date:                Thu, 08 May 2025   Prob (F-statistic):             0.0974
Time:                        08:55:48   Log-Likelihood:                 30.807
No. Observations:                   8   AIC:                            -47.61
Df Residuals:                       1   BIC:                            -47.06
Df Model:                           6                                         
Covariance Type:                  HC2                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.8036      1.963     -2.957      0.0

  return hypotest_fun_in(*args, **kwds)
