In [28]:
import requests
from bs4 import BeautifulSoup
import os
import re
import pandas as pd

In [5]:
# State names in the form interpolated into the download URL; multi-word
# names use underscores in place of spaces.
states = (
    "Alabama Alaska Arizona Arkansas California Colorado "
    "Connecticut Delaware Florida Georgia Hawaii Idaho "
    "Illinois Indiana Iowa Kansas Kentucky Louisiana "
    "Maine Maryland Massachusetts Michigan Minnesota "
    "Mississippi Missouri Montana Nebraska Nevada "
    "New_Hampshire New_Jersey New_Mexico New_York "
    "North_Carolina North_Dakota Ohio Oklahoma Oregon "
    "Pennsylvania Rhode_Island South_Carolina South_Dakota "
    "Tennessee Texas Utah Vermont Virginia Washington "
    "West_Virginia Wisconsin Wyoming"
).split()

In [3]:
# Download each state's 2024 elections page and store a prettified copy under
# output_dir, one HTML file per state.
output_dir = "states_html"
# open() does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for state in states:
    state_filename = state.lower().replace("_", "")  # e.g., "New_York" -> "newyork"
    url = f"https://ballotpedia.org/{state}_elections,_2024"
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the whole loop indefinitely.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # A network failure for one state should not abort the remaining downloads.
        print(f"Failed to download {state} elections page. Error: {exc}")
        continue

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')
        html_pretty = soup.prettify()
        filename = os.path.join(output_dir, f"{state_filename}.html")

        with open(filename, "w", encoding="utf-8") as file:
            file.write(html_pretty)

        print(f"Successfully saved {state} elections page.")
    else:
        print(f"Failed to download {state} elections page. Status code: {response.status_code}")

Successfully saved Alabama elections page.
Successfully saved Alaska elections page.
Successfully saved Arizona elections page.
Successfully saved Arkansas elections page.
Successfully saved California elections page.
Successfully saved Colorado elections page.
Successfully saved Connecticut elections page.
Successfully saved Delaware elections page.
Successfully saved Florida elections page.
Successfully saved Georgia elections page.
Successfully saved Hawaii elections page.
Successfully saved Idaho elections page.
Successfully saved Illinois elections page.
Successfully saved Indiana elections page.
Successfully saved Iowa elections page.
Successfully saved Kansas elections page.
Successfully saved Kentucky elections page.
Successfully saved Louisiana elections page.
Successfully saved Maine elections page.
Successfully saved Maryland elections page.
Successfully saved Massachusetts elections page.
Successfully saved Michigan elections page.
Successfully saved Minnesota elections pag

In [37]:
def parse_candidates_to_df(html_file_path, state):
    """Extract candidate rows from a saved state elections page.

    Every ``<table class="widget-table">`` in the file is scanned. The table
    caption selects the candidate type, and each row holding at least four
    ``<td>`` cells contributes one record (name, office, party, status).

    Args:
        html_file_path: Path of the saved HTML file (read as UTF-8).
        state: Value stored in the ``State`` column of every record.

    Returns:
        pandas.DataFrame with the columns State, Type, Name, Office, Party
        and Status. It has zero rows (but still those columns) when no
        candidate rows are found.
    """
    columns = ['State', 'Type', 'Name', 'Office', 'Party', 'Status']

    with open(html_file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    candidate_tables = soup.find_all('table', class_='widget-table')

    candidates_list = []

    type_map = {
        'Federal Candidates': 'Federal',
        'State Executive Candidates': 'State',
        'Local Candidates': 'Local'
    }

    for table in candidate_tables:
        # A table without a <caption> is treated like any unrecognised caption.
        caption = table.find('caption')
        caption_text = caption.get_text().strip() if caption is not None else ''
        type_candidates = type_map.get(caption_text, 'Unknown')

        # html.parser does not insert a missing <tbody>, so fall back to the
        # table itself; header rows use <th> cells and are skipped by the
        # <td> count check below.
        body = table.find('tbody')
        if body is None:
            body = table

        for row in body.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) >= 4:  # Ensure there are enough columns
                name = cells[0].get_text(strip=True)
                office = cells[1].get_text(strip=True)
                party = cells[2].get_text(strip=True)
                status = cells[3].get_text(strip=True)
                candidates_list.append({
                    'State': state,
                    'Type': type_candidates,
                    'Name': name,
                    'Office': office,
                    'Party': party,
                    'Status': status
                })

    # Passing columns keeps the schema stable even when no rows were collected.
    return pd.DataFrame(candidates_list, columns=columns)

# Parse every saved state page and combine the results into one DataFrame.
base_dir = 'states_html'

# Labels written to the 'State' column (multi-word names without separators).
states_formatted = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
    "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
    "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
    "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "NewHampshire", "NewJersey", "NewMexico", "NewYork",
    "NorthCarolina", "NorthDakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "RhodeIsland", "SouthCarolina", "SouthDakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "WestVirginia", "Wisconsin", "Wyoming"
]

# Collect the per-state frames and concatenate once at the end instead of
# re-copying the accumulated frame on every iteration.
state_frames = []
for state in states_formatted:
    # The download step names files with the lower-cased state name
    # ("newyork.html"), so the lookup must be lower-cased too; otherwise no
    # file is found on a case-sensitive filesystem.
    html_file_path = os.path.join(base_dir, f"{state.lower()}.html")
    if os.path.exists(html_file_path):
        df_candidates = parse_candidates_to_df(html_file_path, state)
        state_frames.append(df_candidates)
    else:
        print(f"File not found: {html_file_path}")

# pd.concat rejects an empty list, so fall back to an empty frame.
all_candidates_df = (
    pd.concat(state_frames, ignore_index=True) if state_frames else pd.DataFrame()
)

In [38]:
# Tables with an unrecognised caption are labelled 'Unknown' by the parser;
# fold them into 'State'. The label must match that spelling exactly,
# because replace() is case-sensitive.
all_candidates_df['Type'] = all_candidates_df['Type'].replace('Unknown', 'State')

# Flag rows whose scraped name text ends with the 'Incumbent' marker.
all_candidates_df['Incumbent'] = (
    all_candidates_df['Name'].str.endswith('Incumbent').map({True: 'Y', False: 'N'})
)

# Remove the marker text from the name itself (matched case-insensitively).
all_candidates_df['Name'] = all_candidates_df['Name'].str.replace('Incumbent', '', case=False)

def split_status(status):
    """Split a fused status string at lower-to-upper case boundaries.

    The text is cut wherever a lowercase letter is immediately followed by
    an uppercase letter. Only the first two pieces are returned; any further
    pieces are discarded.

    Args:
        status: The status text to split.

    Returns:
        A ``(first, second)`` tuple, where ``second`` is None when the text
        contains no such boundary.
    """
    pieces = re.split('(?<=[a-z])(?=[A-Z])', status)
    head = pieces[0]
    if len(pieces) > 1:
        tail = pieces[1]
    else:
        tail = None
    return head, tail

# Split each fused status into its two parts. The columns are built from
# list comprehensions rather than zip(*...) unpacking, which raises
# ValueError when the frame has no rows.
status_pairs = [split_status(status) for status in all_candidates_df['Status']]
all_candidates_df['Election_Status'] = [first for first, _ in status_pairs]
all_candidates_df['Election'] = [second for _, second in status_pairs]

# Missing values (e.g. a status with no second part) are written as the
# literal string 'NA'.
all_candidates_df = all_candidates_df.fillna('NA')
all_candidates_df.to_csv("final.csv")