In [20]:
import pandas as pd
import sklearn
import numpy as np
import plotly as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import plotly.graph_objects as go
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
import statsmodels.formula.api as smf
import statsmodels.api as sm
import plotly.io as pio
pio.renderers.default = 'iframe'
from sklearn.metrics import RocCurveDisplay
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import recall_score

In [21]:
### William Diment - Voter Registration Prediction
### In this project, we attempt to identify who in Boulder county could be registered as a democrat or a republican
### We do this using the principle of Logistic Regression, as that allows for an easy binary classification of Republican or Democrat
### We accomplish this by taking voter data from the Boulder County elections office (https://bouldercounty.gov/elections/maps-and-data/data-access/#Registered-Voter-Data)
### We then split that data into two separate CSVs in Excel

### We then import the voting data into a pandas dataframe and clean the data
### We specify that we want only the Voter ID, the Voter Status, the Birth Year, the Gender, Party Registration, and Precinct Code
### Really, for the set of information we have, we really only need the Birth Year, Gender, and Party registration
### We then replace the PARTY value with a binary value of 0 for Democratic, and 1 for Republican
### We do the same for voter status - 1 is active, 0 is inactive
### We then replace Male and Female with 1 and 0 respectively ('X' and 'Not Disclosed' are also coded as 0)

### All of this is done to fit the data into a binary classification form - this is the form a logistic regression learning model takes
### We then drop all parties other than democratic or republican - green, constitutionalist, etc

def importVotingData(partOnePath='VotingData/VoterDetailsListPart1.csv', partTwoPath='VotingData/VoterDetailsListPart2.csv'):
    """Read the two voter-registration CSV parts into pandas DataFrames.

    Parameters
    ----------
    partOnePath, partTwoPath : str or path-like
        Locations of the two CSV parts. The defaults are the paths this
        notebook originally hard-coded, so calling with no arguments behaves
        exactly as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The raw, uncleaned frames for part one and part two, in that order.
    """
    # low_memory=False parses each file in one pass rather than in chunks,
    # which avoids per-chunk dtype guessing on mixed-type columns.
    voterRegistrationSetOne = pd.read_csv(partOnePath, sep=',', low_memory=False)
    voterRegistrationSetTwo = pd.read_csv(partTwoPath, sep=',', low_memory=False)
    return voterRegistrationSetOne, voterRegistrationSetTwo

def cleanVoterRegistrationData(df = None):
    """Reduce a raw voter frame to the modelling columns and encode them as 0/1.

    Encoding applied (values not listed are left unchanged):
      * PARTY:        'DEM' -> 0, 'REP' -> 1
      * VOTER_STATUS: 'Active' -> 1, 'Inactive' -> 0
      * GENDER:       'Male' -> 1; 'Female', 'X', 'Not Disclosed' -> 0

    Rows whose PARTY is neither 0 nor 1 after encoding (every other party)
    are dropped. The caller's frame is not modified, and the surviving rows
    keep their original index labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw voter data containing at least the columns VOTER_ID,
        VOTER_STATUS, BIRTH_YEAR, GENDER, PRECINCT_CODE and PARTY.

    Returns
    -------
    pandas.DataFrame
        The six columns above, encoded, restricted to DEM/REP voters.

    Raises
    ------
    TypeError
        If ``df`` is None (the default exists only for backward compatibility).
    KeyError
        If any of the required columns is missing.
    """
    if df is None:
        raise TypeError("cleanVoterRegistrationData requires a DataFrame, got None")

    keptColumns = ['VOTER_ID', 'VOTER_STATUS', 'BIRTH_YEAR', 'GENDER', 'PRECINCT_CODE', 'PARTY']
    cleaned = df[keptColumns].copy()

    # Assign the replaced columns back explicitly. The previous
    # `cleaned.COL.replace(..., inplace=True)` form is chained inplace
    # assignment: under pandas Copy-on-Write it edits a temporary Series and
    # leaves the frame untouched, so the PARTY filter below would drop every row.
    cleaned['PARTY'] = cleaned['PARTY'].replace({'DEM': 0, 'REP': 1})
    cleaned['VOTER_STATUS'] = cleaned['VOTER_STATUS'].replace({'Active': 1, 'Inactive': 0})
    cleaned['GENDER'] = cleaned['GENDER'].replace({'Male': 1, 'Female': 0, 'X': 0, 'Not Disclosed': 0})

    # Keep only the two major parties; all other registrations are discarded.
    return cleaned[cleaned['PARTY'].isin([0, 1])]
    

In [24]:
### Here is where we actually execute the import and the cleaning
### importVotingData returns the two raw parts; each one is cleaned independently, in order
voterRegistrationSetOne, voterRegistrationSetTwo = map(cleanVoterRegistrationData, importVotingData())



In [25]:
### We then concatenate the two cleaned sets of voter data into a single DataFrame
### ignore_index=True renumbers the rows 0..n-1 instead of keeping each part's own row labels
cleanedVoterSets = [voterRegistrationSetOne, voterRegistrationSetTwo]
voterRegistrationSet = pd.concat(cleanedVoterSets, ignore_index=True)

In [26]:
### PARTY holds 0 for Democrat and 1 for Republican; coerce it to a real numeric dtype in case it is still stored as strings/objects
partyCodes = pd.to_numeric(voterRegistrationSet['PARTY'])
voterRegistrationSet['PARTY'] = partyCodes

In [27]:
### I reorganize the columns here just so I can keep them straight in my head
### I also made sure to reindex the results so that the row ID is consistent 
### I didn't need to do this in the end, but it was conceptually important for me to know exactly what I was looking at

# Keep the two feature columns first and the PARTY label last; later cells slice the array by position.
# The other cleaned columns (VOTER_ID, BIRTH_YEAR, PRECINCT_CODE) are dropped by this step.
reorganizedColumns = ['VOTER_STATUS', 'GENDER', 'PARTY']
voterRegistrationSet = voterRegistrationSet.reindex(reorganizedColumns, axis='columns')


In [28]:
# Sanity check: show the combined frame after column selection (pandas truncates the middle rows when printing).
print(voterRegistrationSet)

        VOTER_STATUS  GENDER  PARTY
0                  1       1      0
1                  0       0      0
2                  1       1      0
3                  1       1      0
4                  1       1      0
...              ...     ...    ...
127194             1       1      1
127195             0       1      0
127196             1       1      0
127197             1       0      0
127198             1       1      0

[127199 rows x 3 columns]


In [14]:
### We convert the pandas data set to numpy so I can feed it to the train/test split function
### Column order follows reorganizedColumns: 0 = VOTER_STATUS, 1 = GENDER, 2 = PARTY (the split cell slices by these positions)
voterRegistrationSetNumpy = voterRegistrationSet.to_numpy()

In [19]:
### Here we use the train_test_split function to split the voter registration into actionable sets of data
### I split the train/test size evenly, and make sure to have a high random state for shuffling the data
### The Binary variables I match the Logistic Regression model on are: Gender, and Voter Status (0 for inactive, 1 for active)
### Then, I create the Logistic Regression Model, and set its random state as well
### It seemed appropriate to choose liblinear as the solver, as overall I do not have that much data - roughly 127k rows after cleaning
### I then fit the model 

### We then predict the voter registration using the fitted model, and calculate the accuracy score
### The accuracy score is decent - (78.5%) but has a slight peculiarity that I will discuss later when displaying graphs
### I then extract the actual voter registration numbers and predicted registration numbers using Counter
### This gives us a dict of the extracted keys (0, 1) for democratic/republican, and the associated number of actual/predicted voters
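### I then fit the data using statsmodels OLS so that I can get the associated P values and R Squared values
### (no intercept column is passed to OLS, so the R Squared that statsmodels reports is the uncentered one)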

### The R-Squared values being so low does not surprise me - I am training on two sets of features (frequent voter status, gender)
### Clearly, not all the variance in data can be explained by those two feature sets
### However, we do know from political/social sciences that gender and frequent voting status correspond strongly to political party preference
### As such, getting a R-Squared value of 0.208 from this data is not "horrific"
### We just need more data to explain the variance
### I would have thought it would have been a little higher - maybe 0.3, but those are the breaks for this project I suppose
### The P values being 0 doesn't surprise. The features are extremely relevant to the model. 


# Columns 0-1 of the array are the features (VOTER_STATUS, GENDER); column 2 is the PARTY label.
voterFeatures = voterRegistrationSetNumpy[:, :2]
voterParty = voterRegistrationSetNumpy[:, 2]

# 50/50 split with a fixed seed so the shuffle is reproducible.
(
    voterRegistrationSet_train,
    voterRegistrationSet_test,
    voterParty_train,
    voterParty_test,
) = train_test_split(
    voterFeatures,
    voterParty,
    test_size=0.5,
    train_size=0.5,
    random_state=42,
)

# Fit the classifier on the training half, then score it on the held-out half.
model = LogisticRegression(solver='liblinear', random_state=10)
model.fit(voterRegistrationSet_train, voterParty_train)

voterPredict = model.predict(voterRegistrationSet_test)
accuracy = accuracy_score(voterParty_test, voterPredict)

# Per-party counts (keys 0 = Democrat, 1 = Republican) for the plotting cell.
# A party the model never predicts has no key in predictedRegistration.
actualRegistration = dict(Counter(voterParty_test))
predictedRegistration = dict(Counter(voterPredict))

# OLS on the full data set, used only for its summary table.
# NOTE(review): the feature matrix is passed as-is, without a constant column,
# so this is a no-intercept fit - confirm that is intended.
mod = sm.OLS(voterParty, voterFeatures)
fit = mod.fit()

print("Accuracy of predicted voter registration values: {}\n".format(accuracy))
print(fit.summary())


Accuracy of predicted voter registration values: 0.7852201257861635

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.208
Model:                            OLS   Adj. R-squared (uncentered):              0.208
Method:                 Least Squares   F-statistic:                          1.672e+04
Date:                Mon, 09 Dec 2024   Prob (F-statistic):                        0.00
Time:                        00:49:23   Log-Likelihood:                         -67919.
No. Observations:              127199   AIC:                                  1.358e+05
Df Residuals:                  127197   BIC:                                  1.359e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>

In [29]:
### Here we go. I plot the actual registration, and the predicted registration.
### You will notice immediately that the predicted registration does not actually predict any Republican voters!
### I tried very long to get it to show some republican voters, but I couldn't seem to get it to fit in the model
### Here's the thing though - Colorado is a more-democratic leaning state, and Boulder County even more democratic leaning than that
### As such, the majority of people registered in the county who affiliate with a party are Democratic party members

### From our held-out test subset, we get a number of 49.9k Democrats and 13.66k Republicans
### As such, roughly 78.5% of the registered voters here are Democrats, and 21.5% of the voters are Republicans
### Thus, the accuracy above of 78.5% is exactly what we get by predicting Democrat for every single voter
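### The model is overzealous toward Democrats most likely because two binary features give only four possible inputs, and if Democrats are the majority in each of them the model predicts Democrat every time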
### but how we could fix it is by introducing more binary features into the logistic regression
### This could take the form of 'yes, no' answers to policy questions and adding them to the feature list
### Or, we could take the entirety of the voter registration data, and fit an SVM model to it


### Conclusion is below figures


# Grouped bar chart comparing actual and predicted party counts on the test split.
partyLabels = ['Democrats', 'Republicans']

fig = go.Figure()

# Keys follow the cleaning step's encoding: 0 = Democrat, 1 = Republican.
# .get(..., 0) keeps the cell from raising KeyError if a party is absent from the split.
fig.add_trace(go.Bar(
    x=partyLabels,
    y=[actualRegistration.get(0, 0), actualRegistration.get(1, 0)],
    name='Actual Registration',
        )
    )

# Read the Republican count from the predictions instead of hard-coding 0.
# The bar is still 0 when the model predicts no Republicans, but it reports
# the true count as soon as the model does predict some.
fig.add_trace(go.Bar(
    x=partyLabels,
    y=[predictedRegistration.get(0, 0), predictedRegistration.get(1, 0)],
    name='Predicted Registration',
        )
    )
fig.update_layout(yaxis=dict(title='Number of Registrations'), xaxis=dict(title='Political Parties'), title=dict(text="Actual vs Predicted Registration"))

fig.show()


In [None]:
### Conclusions
### In short, identifying and predicting voter registration based on two variables is undesirable, but can achieve passable results 
### I am not upset with the results, and it is 'accurate', but with more policy questions that could be classified as a 'yes, no' answer 
### I could have put more data into the model and gotten a better actual result
### SVM I believe is the true method that should be used for this, simply because I would be able to make use of ALL the voter registration data
### A key piece of data I did not get to use is the Birth Year 
### Political Science again tells us that the age of a person can correspond with which political party they register with
### A very large reach goal could have been to find the median income of the voter's precinct
### Income again, is another variable that corresponds strongly to political voting patterns
### Either way - this model could be greatly expanded, but the strength of the logistic regression model is here
### If I am able to get this far with just two features, I could get much further with even one or two more