In [70]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder

In [71]:
# Read the merged dataset from disk, then discard the 'Marital.Status' column.
import_df = pd.read_csv('../datasets/all_merged.csv')
import_df = import_df.drop(columns=['Marital.Status'])
import_df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Position.on.Federal.K.12.Education.Policy,Position.on.Climate.Change,Position.on.Campaign.Finance.Reform,Position.on.Legalization.Decriminalization.of.Marijuana.Policy,Position.on.Defense.Spending,Position.on.Handling.Terrorism.Abroad,Position.on.Russia,Party.Category,Female,SinglePayer
0,Lizzetta Hill McConnell,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,0,,,-30.68,...,Candidate supports federal proposals for major...,Candidate supports regulations and measures to...,Candidate provides no information,Candidate supports legalization/decriminalizat...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Establishment Democrat,1.0,0
1,Robert Kennedy Jr.,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,1,,On the Ballot,-30.68,...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Establishment Democrat,0.0,0
2,Audri Scott Williams,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,0,,,-33.080002,...,Candidate supports federal proposals for major...,Candidate supports regulations and measures to...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Progressive Democrat,1.0,1
3,Tabitha Isner,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,1,,On the Ballot,-33.080002,...,Candidate supports federal proposals for major...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate opposes reductions in military spending,Candidate provides no information,Candidate provides no information,Progressive Democrat,1.0,1
4,Adia McClellan Winfrey,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,0,,,-33.66,...,Candidate supports federal proposals for major...,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Candidate provides no information,Other,1.0,0


In [72]:
# Categorical columns that get expanded into one indicator column per category.
cat_cols = [
    'Education',
    'Position.on.Affordable.Care.Act..ObamaCare.',
    'Position.on.Minimum.Wage',
    'Position.on.Federal.Taxes',
    'Position.on.Business.Regulations',
    'Position.on.National.Debt.Deficit',
    'Position.on.Social.Security',
    'Position.on.Gun.Control',
    'Position.on.Immigration',
    'Position.on.Abortion',
    'Position.on.Criminal.Justice.Reform',
    'Position.on.Federal.K.12.Education.Policy',
    'Position.on.Climate.Change',
    'Position.on.Campaign.Finance.Reform',
    'Position.on.Legalization.Decriminalization.of.Marijuana.Policy',
    'Position.on.Defense.Spending',
    'Position.on.Handling.Terrorism.Abroad',
    'Position.on.Russia',
    'Party.Category',
]

# Fit the encoder and produce the dense indicator matrix in one pass.
enc = OneHotEncoder()
encoded = enc.fit_transform(import_df[cat_cols]).toarray()
cat_df = pd.DataFrame(data=encoded, columns=enc.get_feature_names_out(cat_cols))

# Drop every "Candidate provides no information" indicator plus one Education
# and one Party.Category indicator (presumably the reference levels -- confirm).
no_info_cols = [name for name in cat_df.columns if 'Candidate provides no information' in name]
cat_df = cat_df.drop(columns=no_info_cols + ["Education_Associate's or less", "Party.Category_Other"])

# Replace the raw categorical columns with their indicator columns.
df = pd.concat([import_df.drop(columns=cat_cols), cat_df], axis=1)
df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Partisan Lean,...,Position.on.Defense.Spending_Candidate supports a reduction in military spending,Position.on.Handling.Terrorism.Abroad_Candidate provides complicated/complex/unclear position,Position.on.Handling.Terrorism.Abroad_Candidate supports calls for increased American intervention to combat terrorism,Position.on.Handling.Terrorism.Abroad_Candidate supports status quo efforts to combat terrorism,Position.on.Russia_Candidate identifies Russia as political ally of the United States,Position.on.Russia_Candidate notes Russia as political enemy of the United States,Position.on.Russia_Candidate provides complicated/complex/unclear position,Party.Category_Establishment Democrat,Party.Category_Moderate Democrat,Party.Category_Progressive Democrat
0,Lizzetta Hill McConnell,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,0,,,-30.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,Robert Kennedy Jr.,AL,U.S. House Alabama District 1,Representative,Regular,6/5/18,1,,On the Ballot,-30.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,Audri Scott Williams,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,0,,,-33.080002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Tabitha Isner,AL,U.S. House Alabama District 2,Representative,Regular,6/5/18,1,,On the Ballot,-33.080002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Adia McClellan Winfrey,AL,U.S. House Alabama District 3,Representative,Regular,6/5/18,0,,,-33.66,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
# Endorsement/support columns. clean_df keeps only the one chosen as the
# treatment and drops the rest.
endorsement_cols = [
    'Party Support?',
    'Emily Endorsed?',
    'Gun Sense Candidate?',
    'Biden Endorsed?',
    'Warren Endorsed?',
    'Sanders Endorsed?',
    'Our Revolution Endorsed?',
    'Justice Dems Endorsed?',
    'PCCC Endorsed?',
    'Indivisible Endorsed?',
    'WFP Endorsed?',
    'VoteVets Endorsed?',
    'No Labels Support?',
]

## Checking for Linearity

In [100]:
# Display the 'Party Support?' column to inspect its values and missingness.
df.loc[:, 'Party Support?']

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
627   NaN
628   NaN
629   NaN
630   NaN
631   NaN
Name: Party Support?, Length: 632, dtype: float64

## Functions

In [86]:
def clean_df(df, treatment):
    """
    Build the modelling frame for a single treatment column.

    Keeps only the rows where `treatment` is non-null, drops the identifier and
    race-metadata columns, the one-hot position and education indicators, and
    every column in `endorsement_cols` other than `treatment`, then drops rows
    with any remaining missing value. 'Total Endorsements' is replaced by
    'Total Other Endorsements' (the total minus the treatment value).

    Inputs:
        df: pandas DataFrame containing every column named below
        treatment: string, name of the treatment column
    Outputs:
        output: pandas DataFrame restricted to complete rows
    """
    # Every endorsement column except the one under study.
    other_endorsements = [col for col in endorsement_cols if col != treatment]

    # 'index' is the column created by reset_index() below.
    metadata_cols = [
        'index', 'Candidate', 'State', 'District', 'total_runners',
        'Office Type', 'Race Type', 'Race Primary Election Date',
        'Primary Status', 'Primary Runoff Status', 'General Status',
        'Primary %', 'Elected Official?', 'Self-Funder?', 'Obama Alum?',
        'Endorsed', 'Position.on.Same.Sex.Marriage',
    ]

    position_cols = [
        'Position.on.Affordable.Care.Act..ObamaCare._Candidate explicitly supports ACA',
        'Position.on.Affordable.Care.Act..ObamaCare._Candidate provides complicated/complex/unclear position',
        'Position.on.Minimum.Wage_Candidate provides complicated/complex/unclear position',
        'Position.on.Minimum.Wage_Candidate supports raising the minimum wage',
        'Position.on.Federal.Taxes_Candidate opposes raises taxes (or candidate supports lowering taxes for all)',
        'Position.on.Federal.Taxes_Candidate provides complicated/complex/unclear position',
        'Position.on.Federal.Taxes_Candidate supports raising taxes on the wealthy/corporations',
        'Position.on.Business.Regulations_Candidate opposes business regulations and other "red tape"',
        'Position.on.Business.Regulations_Candidate provides complicated/complex/unclear position',
        'Position.on.Business.Regulations_Candidate supports placing smart regulations on business',
        'Position.on.National.Debt.Deficit_Candidate calls for lowering the national debt or calls for deficit reduction',
        'Position.on.National.Debt.Deficit_Candidate provides complicated/complex/unclear position',
        'Position.on.National.Debt.Deficit_Candidate supports increasing the national debt or the federal deficit',
        'Position.on.Social.Security_Candidate proposes reforms to reshape Social Security (e.g. increasing the retirement age; means-testing; etc.)',
        'Position.on.Social.Security_Candidate provides complicated/complex/unclear position',
        'Position.on.Social.Security_Candidate supports protecting the status quo Social Security system',
        'Position.on.Gun.Control_Candidate opposes gun control of all kinds (including background checks)',
        'Position.on.Gun.Control_Candidate provides complicated/complex/unclear position',
        'Position.on.Gun.Control_Candidate supports gun control measures',
        'Position.on.Immigration_Candidate provides complicated/complex/unclear position',
        'Position.on.Immigration_Candidate supports comprehensive immigration reform (including a path to citizenship for illegal immigrants)',
        'Position.on.Abortion_Candidate identifies with the pro-choice position (i.e. supports abortion rights)',
        'Position.on.Abortion_Candidate identifies with the pro-life position (i.e. anti-abortion)',
        'Position.on.Abortion_Candidate provides complicated/complex/unclear position',
        'Position.on.Criminal.Justice.Reform_Candidate provides complicated/complex/unclear position',
        'Position.on.Criminal.Justice.Reform_Candidate supports major criminal justice reform',
        'Position.on.Federal.K.12.Education.Policy_Candidate provides complicated/complex/unclear position',
        'Position.on.Federal.K.12.Education.Policy_Candidate supports federal proposals for major education reform (including common core)',
        'Position.on.Federal.K.12.Education.Policy_Candidate supports local solutions to reform education (e.g. opposes common core, etc.)',
        'Position.on.Climate.Change_Candidate opposes climate change regulations OR denies the effects of climate change',
        'Position.on.Climate.Change_Candidate provides complicated/complex/unclear position',
        'Position.on.Climate.Change_Candidate supports regulations and measures to combat climate change',
        'Position.on.Campaign.Finance.Reform_Candidate provides complicated/complex/unclear position',
        'Position.on.Campaign.Finance.Reform_Candidate supports reforming campaign finance ("overturning Citizens United," "no more SuperPACs," etc.)',
        'Position.on.Legalization.Decriminalization.of.Marijuana.Policy_Candidate opposes legalization/decriminalization of marijuana',
        'Position.on.Legalization.Decriminalization.of.Marijuana.Policy_Candidate provides complicated/complex/unclear position',
        'Position.on.Legalization.Decriminalization.of.Marijuana.Policy_Candidate supports legalization/decriminalization of marijuana',
        'Position.on.Defense.Spending_Candidate opposes reductions in military spending',
        'Position.on.Defense.Spending_Candidate provides complicated/complex/unclear position',
        'Position.on.Defense.Spending_Candidate supports a reduction in military spending',
        'Position.on.Handling.Terrorism.Abroad_Candidate provides complicated/complex/unclear position',
        'Position.on.Handling.Terrorism.Abroad_Candidate supports calls for increased American intervention to combat terrorism',
        'Position.on.Handling.Terrorism.Abroad_Candidate supports status quo efforts to combat terrorism',
        'Position.on.Russia_Candidate identifies Russia as political ally of the United States',
        'Position.on.Russia_Candidate notes Russia as political enemy of the United States',
        'Position.on.Russia_Candidate provides complicated/complex/unclear position',
    ]

    education_cols = [
        "Education_Bachelor's or some college",
        "Education_J.D.",
        'Education_Other',
        "Education_Other Graduate",
        "Education_Master's Degree (includes MBA)",
    ]

    # Rows with a recorded treatment value, stripped of the unused columns and
    # of any row that still has a missing value.
    treated_rows = df[df[treatment].notnull()]
    output = (
        treated_rows
        .reset_index()
        .drop(columns=metadata_cols + position_cols + education_cols + other_endorsements)
        .dropna()
    )

    # Count only the endorsements other than the treatment itself.
    output['Total Other Endorsements'] = output['Total Endorsements'] - output[treatment]
    trimmed = output.drop(columns=['Total Endorsements'])
    return trimmed[output['receipts'].notnull()]

def fit_OLS_model(df, target_variable, explanatory_variables, intercept = False):
    """
    Fit an ordinary least squares regression on columns of a DataFrame.

    Inputs:
        df: pandas DataFrame holding both the target and the explanatory columns
        target_variable: string, column name of the response
        explanatory_variables: list of strings, column names of the regressors
        intercept: bool, when True a constant column is added to the regressors
    Outputs:
        fitted_model: fitted statsmodels OLS results object
    """
    response = df[target_variable]
    design = df[explanatory_variables]

    # Without a constant column the regression is forced through the origin.
    if intercept:
        design = sm.add_constant(design)

    return sm.OLS(response, design).fit()

def mean_squared_error(true_vals, predicted_vals):
    """
    Return the mean squared error between true and predicted values.

    Both inputs are converted to NumPy arrays before subtracting, so plain
    Python lists are accepted and pandas objects are compared element by
    element in positional order instead of being re-aligned on index labels
    (which would yield NaN for labels present in only one input).

    Inputs:
        true_vals: array-like of true labels
        predicted_vals: array-like of labels predicted from the data
    Output:
        float, mean squared error of the predicted values
    """
    errors = np.asarray(true_vals) - np.asarray(predicted_vals)
    return np.mean(errors ** 2)

In [87]:
def drop_VIF_col(X, threshold=5):
    """
    Iteratively remove the column with the largest variance inflation factor.

    On each pass the VIF of every remaining column is recomputed. The column
    with the largest VIF is dropped when that VIF exceeds `threshold`; the loop
    stops as soon as the largest VIF does not exceed `threshold`, or when fewer
    than two columns remain.

    Inputs:
        X: pandas DataFrame of explanatory variables
        threshold: number, a column is dropped only if its VIF is above this
    Outputs:
        pandas DataFrame with the surviving columns; the frame passed in is
        not modified
    """
    # A VIF needs at least one other column to regress against, and an empty
    # frame would fail on the `.iloc[0]` lookup below, so stop at one column.
    while X.shape[1] >= 2:
        vif_info = pd.DataFrame()
        vif_info['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
        vif_info['Column'] = X.columns
        # Descending sort puts the worst offender first (NaN VIFs sort last).
        vif_info = vif_info.sort_values('VIF', ascending=False)
        max_VIF = vif_info['VIF'].iloc[0]

        if max_VIF > threshold:
            to_drop = vif_info['Column'].iloc[0]
            # X.drop returns a new frame, so the caller's object is untouched.
            X = X.drop(to_drop, axis=1)
        else:
            break

    return X

In [97]:
# Regress 'Won Primary' on every other column kept by clean_df; no intercept
# is added because fit_OLS_model defaults to intercept=False.
party = clean_df(df, 'Party Support?')
party_predictors = party.columns.drop('Won Primary').to_list()
party_model = fit_OLS_model(party, 'Won Primary', party_predictors)
print(party_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.901
Model:                            OLS   Adj. R-squared (uncentered):              0.888
Method:                 Least Squares   F-statistic:                              71.30
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    2.00e-48
Time:                        14:35:48   Log-Likelihood:                          53.222
No. Observations:                 124   AIC:                                     -78.44
Df Residuals:                     110   BIC:                                     -38.96
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [90]:
# Same regression for 'Our Revolution Endorsed?', after pruning regressors
# whose VIF exceeds drop_VIF_col's default threshold.
our_rev = clean_df(df, 'Our Revolution Endorsed?')
our_rev_vif = drop_VIF_col(our_rev.drop(columns=['Won Primary']))
# Record the surviving regressors before the target is attached again.
our_rev_predictors = our_rev_vif.columns.to_list()
our_rev_vif['Won Primary'] = our_rev['Won Primary']
our_rev_model = fit_OLS_model(our_rev_vif, 'Won Primary', our_rev_predictors)
print(our_rev_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.577
Model:                            OLS   Adj. R-squared (uncentered):              0.554
Method:                 Least Squares   F-statistic:                              25.56
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    1.74e-38
Time:                        14:32:28   Log-Likelihood:                         -75.587
No. Observations:                 257   AIC:                                      177.2
Df Residuals:                     244   BIC:                                      223.3
Df Model:                          13                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [91]:
# Regress 'Won Primary' on every other column kept for 'Emily Endorsed?'.
emily = clean_df(df, 'Emily Endorsed?')
emily_predictors = emily.columns.drop('Won Primary').to_list()
emily_model = fit_OLS_model(emily, 'Won Primary', emily_predictors)
print(emily_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.790
Model:                            OLS   Adj. R-squared (uncentered):              0.772
Method:                 Least Squares   F-statistic:                              44.21
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    2.03e-48
Time:                        14:32:36   Log-Likelihood:                          19.597
No. Observations:                 179   AIC:                                     -11.19
Df Residuals:                     165   BIC:                                      33.43
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [92]:
# Regress 'Won Primary' on every other column kept for 'Justice Dems Endorsed?'.
jd = clean_df(df, 'Justice Dems Endorsed?')
jd_predictors = jd.columns.drop('Won Primary').to_list()
jd_model = fit_OLS_model(jd, 'Won Primary', jd_predictors)
print(jd_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.639
Model:                            OLS   Adj. R-squared (uncentered):              0.606
Method:                 Least Squares   F-statistic:                              19.82
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    5.99e-28
Time:                        14:33:16   Log-Likelihood:                         -24.741
No. Observations:                 171   AIC:                                      77.48
Df Residuals:                     157   BIC:                                      121.5
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [93]:
# Regress 'Won Primary' on every other column kept for 'Gun Sense Candidate?'.
gs = clean_df(df, 'Gun Sense Candidate?')
gs_predictors = gs.columns.drop('Won Primary').to_list()
gs_model = fit_OLS_model(gs, 'Won Primary', gs_predictors)
print(gs_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.606
Model:                            OLS   Adj. R-squared (uncentered):              0.587
Method:                 Least Squares   F-statistic:                              30.59
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    3.11e-48
Time:                        14:33:26   Log-Likelihood:                         -83.601
No. Observations:                 292   AIC:                                      195.2
Df Residuals:                     278   BIC:                                      246.7
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [94]:
# Regress 'Won Primary' on every other column kept for 'Indivisible Endorsed?'.
ind = clean_df(df, 'Indivisible Endorsed?')
ind_predictors = ind.columns.drop('Won Primary').to_list()
ind_model = fit_OLS_model(ind, 'Won Primary', ind_predictors)
print(ind_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.715
Model:                            OLS   Adj. R-squared (uncentered):              0.674
Method:                 Least Squares   F-statistic:                              17.53
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    8.30e-21
Time:                        14:33:29   Log-Likelihood:                         -14.919
No. Observations:                 112   AIC:                                      57.84
Df Residuals:                      98   BIC:                                      95.90
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------

In [95]:
# Regress 'Won Primary' on every other column kept for 'VoteVets Endorsed?'.
vv = clean_df(df, 'VoteVets Endorsed?')
vv_predictors = vv.columns.drop('Won Primary').to_list()
vv_model = fit_OLS_model(vv, 'Won Primary', vv_predictors)
print(vv_model.summary())

                                 OLS Regression Results                                
Dep. Variable:            Won Primary   R-squared (uncentered):                   0.755
Model:                            OLS   Adj. R-squared (uncentered):              0.713
Method:                 Least Squares   F-statistic:                              17.82
Date:                Sun, 10 Dec 2023   Prob (F-statistic):                    3.61e-19
Time:                        14:33:31   Log-Likelihood:                        -0.63418
No. Observations:                  95   AIC:                                      29.27
Df Residuals:                      81   BIC:                                      65.02
Df Model:                          14                                                  
Covariance Type:            nonrobust                                                  
                                            coef    std err          t      P>|t|      [0.025      0.975]
--------------