<a href="https://colab.research.google.com/github/wangjalen7/project_voting/blob/main/Project_Voting_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Jalen's Model

## Data Cleaning and Visualization

In [9]:
import pandas as pd

# Read the Virginia county-level election returns from the local data folder
voting_data_path = './data/voting_VA.csv'
voting_data = pd.read_csv(filepath_or_buffer=voting_data_path)

# Preview the top five rows to check the column layout before cleaning
voting_data.head(n=5)


Unnamed: 0.1,Unnamed: 0,year,state,state_po,county_name,county_fips,office,candidate,party,candidatevotes,totalvotes,version,mode
0,11161,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,AL GORE,DEMOCRAT,5092,11925,20220315,TOTAL
1,11162,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,GEORGE W. BUSH,REPUBLICAN,6352,11925,20220315,TOTAL
2,11163,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,RALPH NADER,GREEN,220,11925,20220315,TOTAL
3,11164,2000,VIRGINIA,VA,ACCOMACK,51001,US PRESIDENT,OTHER,OTHER,261,11925,20220315,TOTAL
4,11165,2000,VIRGINIA,VA,ALBEMARLE,51003,US PRESIDENT,AL GORE,DEMOCRAT,16255,36846,20220315,TOTAL


In [15]:
# Remove the saved index column and the columns this analysis does not use
voting_data_clean = voting_data.drop(
    ['Unnamed: 0', 'state', 'state_po', 'office', 'version', 'mode'],
    axis='columns',
)

# Per-column count of missing entries, shown alongside the preview below
missing_values = voting_data_clean.isna().sum()

# Keep only the two major parties; the net margin is defined between them
voting_data_filtered = voting_data_clean.loc[
    voting_data_clean['party'].isin(['REPUBLICAN', 'DEMOCRAT'])
]

# One row per (year, county): party vote totals side by side, plus the
# Republican-minus-Democrat margin as `net_vote_count`
pivot_data = (
    voting_data_filtered
    .pivot_table(
        values='candidatevotes',
        index=['year', 'county_name', 'county_fips', 'totalvotes'],
        columns='party',
        aggfunc='sum',
    )
    .reset_index()
    .assign(net_vote_count=lambda wide: wide['REPUBLICAN'] - wide['DEMOCRAT'])
)

# Preview the reshaped frame together with the missing-value counts
(pivot_data.head(), missing_values)


(party  year county_name  county_fips  totalvotes  DEMOCRAT  REPUBLICAN  \
 0      2000    ACCOMACK        51001       11925      5092        6352   
 1      2000   ALBEMARLE        51003       36846     16255       18291   
 2      2000  ALEXANDRIA        51510       55199     33633       19043   
 3      2000   ALLEGHANY        51005        5123      2214        2808   
 4      2000      AMELIA        51007        4788      1754        2947   
 
 party  net_vote_count  
 0                1260  
 1                2036  
 2              -14590  
 3                 594  
 4                1193  ,
 year              0
 county_name       0
 county_fips       0
 candidate         0
 party             0
 candidatevotes    0
 totalvotes        0
 dtype: int64)

## Model

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# The cleaning cell stores the Republican-minus-Democrat margin as
# `net_vote_count`; it never creates `net_votes` or `net_votes_ihs`, so
# selecting them directly raises KeyError on a fresh kernel. Work on a copy
# and derive the columns only when they are missing, so a frame that already
# carries them is used unchanged and `pivot_data` itself is not mutated.
model_data = pivot_data.copy()
if 'net_votes' not in model_data.columns:
    model_data['net_votes'] = model_data['net_vote_count']
if 'net_votes_ihs' not in model_data.columns:
    # NOTE(review): assumes "ihs" means the inverse hyperbolic sine of the
    # net margin (defined for negative margins, unlike a log) -- confirm.
    model_data['net_votes_ihs'] = np.arcsinh(model_data['net_votes'])

# NOTE(review): if the target is indeed a transform of the only feature, this
# fit measures how linear that transform is rather than predictive power --
# confirm this is the intended baseline.
X = model_data[['net_votes']]
y = model_data['net_votes_ihs']

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)


In [12]:
# numpy is needed for the square root below; import it here so the cell runs
# on a fresh kernel instead of relying on an import from another cell.
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Score the fitted regression on the held-out split
y_pred = model.predict(X_test)
# Root of the mean squared error, i.e. the error in the units of the target
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

rmse, r2

(6.295284246708634, 0.33795797437067276)

In [None]:
# Locations of the two primary-election result exports (one per party)
primary_dem_path = './data/Election Results_5163e1ce-994b-48ff-82c3-31a7c5f44f10.csv'
primary_rep_path = './data/Election Results_16805bde-45f1-4e80-b623-4e8730c5250a.csv'

# Read both files in the same order: Democratic first, then Republican
primary_dem_data, primary_rep_data = (
    pd.read_csv(csv_path) for csv_path in (primary_dem_path, primary_rep_path)
)

# Preview the top rows of each frame to inspect their structure
primary_dem_data.head(), primary_rep_data.head()


In [13]:
# Total primary votes per locality, computed the same way for each party
dem_votes, rep_votes = (
    party_frame.groupby('LocalityName')['TOTAL_VOTES'].sum().reset_index()
    for party_frame in (primary_dem_data, primary_rep_data)
)

# Join the two totals on locality (default inner join) and add the
# Republican-minus-Democrat margin as `net_votes_primary`
primary_results = (
    dem_votes
    .merge(rep_votes, on='LocalityName', suffixes=('_dem', '_rep'))
    .assign(net_votes_primary=lambda both: both['TOTAL_VOTES_rep'] - both['TOTAL_VOTES_dem'])
)
