In [35]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

In [36]:
# Load the merged candidate dataset and discard the marital-status column
# before any further processing.
import_df = (
    pd.read_csv('../datasets/all_merged.csv')
    .drop(columns='Marital.Status')
)
import_df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Position.on.Federal.K.12.Education.Policy,Position.on.Climate.Change,Position.on.Campaign.Finance.Reform,Position.on.Legalization.Decriminalization.of.Marijuana.Policy,Position.on.Defense.Spending,Position.on.Handling.Terrorism.Abroad,Position.on.Russia,Party.Category,Female,SinglePayer
0,Lizzetta Hill McConnell,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,0,,,-30.68,...,Candidate supports federal proposals for major...,Candidate supports regulations and measures to...,Candidate provides no information,Candidate supports legalization/decriminalizat...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Establishment Democrat,1.0,0
1,Robert Kennedy Jr.,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,1,,On the Ballot,-30.68,...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Establishment Democrat,0.0,0
2,Audri Scott Williams,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,0,,,-33.080002,...,Candidate supports federal proposals for major...,Candidate supports regulations and measures to...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Progressive Democrat,1.0,1
3,Tabitha Isner,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,1,,On the Ballot,-33.080002,...,Candidate supports federal proposals for major...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate opposes reductions in military spending,Candidate provides no information,Candidate provides no information,Progressive Democrat,1.0,1
4,Adia McClellan Winfrey,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,0,,,-33.66,...,Candidate supports federal proposals for major...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Other,1.0,0


In [37]:
# Categorical columns that get expanded into one-hot indicator columns.
cat_cols = [
    'Education',
    'Position.on.Affordable.Care.Act..ObamaCare.',
    'Position.on.Minimum.Wage',
    'Position.on.Federal.Taxes',
    'Position.on.Business.Regulations',
    'Position.on.National.Debt.Deficit',
    'Position.on.Social.Security',
    'Position.on.Gun.Control',
    'Position.on.Immigration',
    'Position.on.Abortion',
    'Position.on.Criminal.Justice.Reform',
    'Position.on.Federal.K.12.Education.Policy',
    'Position.on.Climate.Change',
    'Position.on.Campaign.Finance.Reform',
    'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
    'Position.on.Defense.Spending',
    'Position.on.Handling.Terrorism.Abroad',
    'Position.on.Russia',
    'Party.Category',
]

# Fit and encode in one pass; `enc` is left fitted so later cells can still
# inspect its categories.
enc = OneHotEncoder()
cat_df = pd.DataFrame(
    data=enc.fit_transform(import_df[cat_cols]).toarray(),
    columns=enc.get_feature_names_out(cat_cols),
    # Reuse import_df's row labels: the axis=1 concat below aligns on the
    # index, so a fresh RangeIndex here would silently mis-pair rows if
    # import_df were ever filtered or re-indexed before this cell.
    index=import_df.index,
)

# Remove every "no information" indicator plus one Education level and one
# Party.Category level (presumably so each variable keeps an omitted baseline
# level -- confirm with the analysis design).
cat_df = cat_df.drop(
    columns=[s for s in cat_df.columns if 'Candidate provides no information' in s]
    + ["Education_Associate's or less", "Party.Category_Other"]
)

# Replace the raw categorical columns with their indicator columns.
df = pd.concat([import_df.drop(columns=cat_cols), cat_df], axis=1)
df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Position.on.Defense.Spending_Candidate supports a reduction in military spending,Position.on.Handling.Terrorism.Abroad_Candidate provides complicated/complex/unclear position,Position.on.Handling.Terrorism.Abroad_Candidate supports calls for increased American intervention to combat terrorism,Position.on.Handling.Terrorism.Abroad_Candidate supports status quo efforts to combat terrorism,Position.on.Russia_Candidate identifies Russia as political ally of the United States,Position.on.Russia_Candidate notes Russia as political enemy of the United States,Position.on.Russia_Candidate provides complicated/complex/unclear position,Party.Category_Establishment Democrat,Party.Category_Moderate Democrat,Party.Category_Progressive Democrat
0,Lizzetta Hill McConnell,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,0,,,-30.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Robert Kennedy Jr.,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,1,,On the Ballot,-30.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Audri Scott Williams,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,0,,,-33.080002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Tabitha Isner,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,1,,On the Ballot,-33.080002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Adia McClellan Winfrey,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,0,,,-33.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Endorsement / support indicator columns. clean_df keeps only the one chosen
# as the treatment and drops the rest.
endorsement_cols = [
    'Party Support?',
    'Emily Endorsed?',
    'Gun Sense Candidate?',
    'Biden Endorsed?',
    'Warren Endorsed?',
    'Sanders Endorsed?',
    'Our Revolution Endorsed?',
    'Justice Dems Endorsed?',
    'PCCC Endorsed?',
    'Indivisible Endorsed?',
    'WFP Endorsed?',
    'VoteVets Endorsed?',
    'No Labels Support?',
]

## Functions

In [39]:
def clean_df(df, treatment):
    """
    Build the modelling frame for a single endorsement ("treatment") column.

    Inputs:
        df: pandas DataFrame containing the identifier/outcome columns listed
            below, every column in the module-level `endorsement_cols`,
            'Total Endorsements', and the `treatment` column
        treatment: string, name of the column to keep as the treatment
    Outputs:
        output: pandas DataFrame restricted to rows where `treatment` is not
            null and no remaining column is null, with the identifier columns
            and all other endorsement columns removed, and with
            'Total Endorsements' replaced by 'Total Other Endorsements'
            (total minus the treatment value)
    """
    # Keep the treatment column; every other endorsement column is dropped.
    other_endorsements = [col for col in endorsement_cols if col != treatment]
    non_feature_cols = [
        'Candidate', 'State', 'District', 'total_runners',
        'Office Type', 'Race Type', 'Race Primary Election Date',
        'Primary Status', 'Primary Runoff Status', 'General Status',
        'Primary %', 'Elected Official?', 'Self-Funder?', 'Obama Alum?',
        'Endorsed',
    ]

    output = (
        df[df[treatment].notnull()]
        # drop=True discards the old index directly, so this also works when
        # the incoming index carries a name (no reliance on an 'index' column).
        .reset_index(drop=True)
        .drop(columns=non_feature_cols + other_endorsements)
        # Columns are removed first so that nulls in dropped columns do not
        # cost rows; after this no remaining column contains a null.
        .dropna()
    )

    # Assumes the treatment column is numeric (e.g. 0/1) so it can be
    # subtracted from the total -- TODO confirm against the merged dataset.
    output['Total Other Endorsements'] = output['Total Endorsements'] - output[treatment]
    return output.drop(columns=['Total Endorsements'])

def fit_OLS_model(df, target_variable, explanatory_variables, intercept=False):
    """
    Fit an ordinary least squares regression on columns of a DataFrame.

    Inputs:
        df: pandas DataFrame holding both the response and the regressors
        target_variable: string, column name of the response
        explanatory_variables: list of strings, column names of the regressors
        intercept: bool, when True the regressors are passed through
            sm.add_constant before fitting; when False they are used as given
    Outputs:
        the results object returned by sm.OLS(...).fit()
    """
    response = df[target_variable]
    design = df[explanatory_variables]
    design = sm.add_constant(design) if intercept else design
    return sm.OLS(response, design).fit()

def mean_squared_error(true_vals, predicted_vals):
    """
    Return the mean squared error between true and predicted values.

    Inputs:
        true_vals: array of true labels (NumPy array, pandas Series, or a
            plain list/tuple of numbers)
        predicted_vals: array of labels predicted from the data, in the same
            order and of the same length as true_vals
    Output:
        float, mean squared error of the predicted values
    """
    # Plain Python sequences do not support element-wise subtraction, so two
    # lists would raise TypeError; coerce only those and leave arrays/Series
    # untouched so their existing behaviour is unchanged.
    if isinstance(true_vals, (list, tuple)):
        true_vals = np.asarray(true_vals, dtype=float)
    if isinstance(predicted_vals, (list, tuple)):
        predicted_vals = np.asarray(predicted_vals, dtype=float)
    return np.mean((true_vals - predicted_vals) ** 2)

In [41]:
# Treatment: 'Party Support?'. Regress 'Won Primary' on every other column of
# the cleaned frame.
party = clean_df(df, 'Party Support?')
party_model = fit_OLS_model(
    party,
    target_variable='Won Primary',
    explanatory_variables=party.columns.drop('Won Primary').to_list(),
)
print(party_model.summary())

                            OLS Regression Results                            
Dep. Variable:            Won Primary   R-squared:                       0.923
Model:                            OLS   Adj. R-squared:                  0.854
Method:                 Least Squares   F-statistic:                     13.41
Date:                Sun, 10 Dec 2023   Prob (F-statistic):           1.09e-20
Time:                        06:23:18   Log-Likelihood:                 86.719
No. Observations:                 124   AIC:                            -55.44
Df Residuals:                      65   BIC:                             111.0
Df Model:                          58                                         
Covariance Type:            nonrobust                                         
                                                                                                                                                   coef    std err          t      P>|t|      [0.025      0.975]
-

In [43]:
# Treatment: 'Our Revolution Endorsed?'. Regress 'Won Primary' on every other
# column of the cleaned frame.
our_rev = clean_df(df, 'Our Revolution Endorsed?')
our_rev_model = fit_OLS_model(
    our_rev,
    target_variable='Won Primary',
    explanatory_variables=our_rev.columns.drop('Won Primary').to_list(),
)
print(our_rev_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.650
Model:                            OLS   Adj. R-squared (uncentered):              0.529
Method:                 Least Squares   F-statistic:                              5.373
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    4.69e-20
Time:                        06:26:56   Log-Likelihood:                         -51.154
No. Observations:                 257   AIC:                                      234.3
Df Residuals:                     191   BIC:                                      468.5
Df Model:                          66                                                  
Covariance Type:            nonrobust                                                  
                                                                                                                        

In [44]:
# Treatment: 'Emily Endorsed?'. Regress 'Won Primary' on every other column of
# the cleaned frame.
emily = clean_df(df, 'Emily Endorsed?')
emily_model = fit_OLS_model(
    emily,
    target_variable='Won Primary',
    explanatory_variables=emily.columns.drop('Won Primary').to_list(),
)
print(emily_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.855
Model:                            OLS   Adj. R-squared (uncentered):              0.779
Method:                 Least Squares   F-statistic:                              11.16
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    2.73e-28
Time:                        06:28:08   Log-Likelihood:                          53.203
No. Observations:                 179   AIC:                                      17.59
Df Residuals:                     117   BIC:                                      215.2
Df Model:                          62                                                  
Covariance Type:            nonrobust                                                  
                                                                                                                        

In [45]:
# Treatment: 'Justice Dems Endorsed?'. Regress 'Won Primary' on every other
# column of the cleaned frame.
jd = clean_df(df, 'Justice Dems Endorsed?')
jd_model = fit_OLS_model(
    jd,
    target_variable='Won Primary',
    explanatory_variables=jd.columns.drop('Won Primary').to_list(),
)
print(jd_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.730
Model:                            OLS   Adj. R-squared (uncentered):              0.573
Method:                 Least Squares   F-statistic:                              4.643
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    1.42e-12
Time:                        06:29:50   Log-Likelihood:                         0.30123
No. Observations:                 171   AIC:                                      125.4
Df Residuals:                     108   BIC:                                      323.3
Df Model:                          63                                                  
Covariance Type:            nonrobust                                                  
                                                                                                                        

In [46]:
# Treatment: 'Gun Sense Candidate?'. Regress 'Won Primary' on every other
# column of the cleaned frame.
gs = clean_df(df, 'Gun Sense Candidate?')
gs_model = fit_OLS_model(
    gs,
    target_variable='Won Primary',
    explanatory_variables=gs.columns.drop('Won Primary').to_list(),
)
print(gs_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.672
Model:                            OLS   Adj. R-squared (uncentered):              0.578
Method:                 Least Squares   F-statistic:                              7.165
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    3.16e-29
Time:                        06:30:28   Log-Likelihood:                         -56.833
No. Observations:                 292   AIC:                                      243.7
Df Residuals:                     227   BIC:                                      482.7
Df Model:                          65                                                  
Covariance Type:            nonrobust                                                  
                                                                                                                        

In [49]:
# Treatment: 'Indivisible Endorsed?'. Regress 'Won Primary' on every other
# column of the cleaned frame.
ind = clean_df(df, 'Indivisible Endorsed?')
ind_model = fit_OLS_model(
    ind,
    target_variable='Won Primary',
    explanatory_variables=ind.columns.drop('Won Primary').to_list(),
)
print(ind_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.818
Model:                            OLS   Adj. R-squared (uncentered):              0.608
Method:                 Least Squares   F-statistic:                              3.896
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    7.91e-07
Time:                        06:31:18   Log-Likelihood:                          10.264
No. Observations:                 112   AIC:                                      99.47
Df Residuals:                      52   BIC:                                      262.6
Df Model:                          60                                                  
Covariance Type:            nonrobust                                                  
                                                                                                                        

In [50]:
# Treatment: 'VoteVets Endorsed?'. Regress 'Won Primary' on every other column
# of the cleaned frame.
vv = clean_df(df, 'VoteVets Endorsed?')
vv_model = fit_OLS_model(
    vv,
    target_variable='Won Primary',
    explanatory_variables=vv.columns.drop('Won Primary').to_list(),
)
print(vv_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.877
Model:                            OLS   Adj. R-squared (uncentered):              0.684
Method:                 Least Squares   F-statistic:                              4.551
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    2.03e-06
Time:                        06:32:16   Log-Likelihood:                          32.137
No. Observations:                  95   AIC:                                      51.73
Df Residuals:                      37   BIC:                                      199.8
Df Model:                          58                                                  
Covariance Type:            nonrobust                                                  
                                                                                                                        