In [1]:
import requests
import pandas as pd
import os
import numpy as np

def get_state_data(state, timeout=10):
    """Download the FiveThirtyEight 2024 polling-average JSON for one state.

    Parameters
    ----------
    state : str
        State identifier using underscores (e.g. ``"north_carolina"``);
        underscores are turned into hyphens to build the URL.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        notebook forever. Defaults to 10 seconds.

    Returns
    -------
    object or None
        The decoded JSON payload, or ``None`` when the request fails, the
        server answers with an error status, or the body is not valid JSON.
        The failure reason is printed.
    """
    slug = state.replace("_", "-")
    page_url = f'https://projects.fivethirtyeight.com/polls/president-general/2024/{slug}/'
    url = f'{page_url}polling-average.json'

    # Browser-like headers copied from a real session; the referer is the
    # state's own polling page.
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'dnt': '1',
        'priority': 'u=1, i',
        'sec-ch-ua': '"Not;A=Brand";v="24", "Chromium";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
        'referer': page_url,
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error does not derive from RequestException.
        print(f"Error retrieving data for {state}: {e}")
        return None


In [2]:
# Latest-row summaries collected by process_and_export_data, one dict per label.
swing_states_data = []

def process_and_export_data(state_data, state):
    """Build a per-date Trump/Harris frame, export it to CSV and record the latest row.

    Parameters
    ----------
    state_data : iterable of dict
        Entries with ``'date'``, ``'candidate'`` and ``'pct_estimate'`` keys.
        Candidates other than ``'Trump'`` and ``'Harris'`` only contribute
        their date.
    state : str
        Label stored in the summary row and used for the CSV name
        (``f"{state}.csv"``, relative to the working directory).

    Returns
    -------
    pandas.DataFrame
        One row per date, sorted ascending, with columns ``date``,
        ``trump_polling_average``, ``harris_polling_average``, ``lead``
        (absolute gap rounded to 2 decimals) and ``candidate_lead``.
        ``lead`` and ``candidate_lead`` are missing on dates where either
        candidate has no estimate; an exact tie is attributed to ``'harris'``.

    Raises
    ------
    ValueError
        If ``state_data`` is ``None`` or empty.

    Side effects
    ------------
    Writes the CSV file and stores the latest row in the module-level
    ``swing_states_data`` list, replacing any earlier row for the same
    ``state`` so that re-running a cell does not create duplicates.
    """
    if not state_data:
        raise ValueError(f"No polling data to process for {state}")

    column_for = {
        'Trump': 'trump_polling_average',
        'Harris': 'harris_polling_average',
    }

    # Pivot the long candidate/date list into one record per date.
    data_dict = {}
    for entry in state_data:
        date = entry['date']
        record = data_dict.setdefault(
            date,
            {'date': date, 'trump_polling_average': None, 'harris_polling_average': None},
        )
        column = column_for.get(entry['candidate'])
        if column is not None:
            record[column] = entry['pct_estimate']

    df = pd.DataFrame(list(data_dict.values()))
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date').reset_index(drop=True)

    # Coerce to float so a column that is entirely None (object dtype) does not
    # break the arithmetic below.
    trump = pd.to_numeric(df['trump_polling_average'], errors='coerce')
    harris = pd.to_numeric(df['harris_polling_average'], errors='coerce')
    df['trump_polling_average'] = trump
    df['harris_polling_average'] = harris

    df['lead'] = (trump - harris).abs().round(2)

    # A leader only exists when both estimates are present; otherwise leave the
    # cell empty instead of defaulting to 'harris'.
    both_present = trump.notna() & harris.notna()
    leader = pd.Series(np.where(trump > harris, 'trump', 'harris'), index=df.index, dtype=object)
    df['candidate_lead'] = leader.where(both_present)

    latest = df.iloc[-1]
    latest_data = {
        'state': state,
        'date': latest['date'].strftime('%Y-%m-%d'),
        'trump_polling_average': latest['trump_polling_average'],
        'harris_polling_average': latest['harris_polling_average'],
        'lead': latest['lead'],
        'candidate_lead': latest['candidate_lead']
    }
    # Mutate the shared list in place (other cells hold a reference to it).
    swing_states_data[:] = [row for row in swing_states_data if row.get('state') != state]
    swing_states_data.append(latest_data)

    # Export to CSV
    filename = f"{state}.csv"
    df.to_csv(filename, index=False)

    return df


In [3]:
swing_states = ['michigan', 'wisconsin', 'pennsylvania', 'nevada', 'arizona', 'north_carolina', 'georgia']

# Start from an empty summary so re-running this cell does not append a second
# copy of every swing state.
swing_states_data.clear()

# Per-state frames keyed by state name; prefer this over the dynamic globals.
swing_state_frames = {}

for state in swing_states:
    print(f"Processing {state}...")

    # Retrieve data
    state_data = get_state_data(state)

    if state_data is not None:
        # Process and export data
        state_df = process_and_export_data(state_data, state)
        swing_state_frames[state] = state_df

        # Keep the historical module-level variable (e.g. `michigan`) without
        # running generated source through exec().
        globals()[state] = state_df
    else:
        print(f"Skipping {state} due to data retrieval error")

    print("------------------------\n")

swing_states_df = pd.DataFrame(swing_states_data)
print("Data requests and formatting complete for swing states line graphs.")


Processing michigan...
------------------------

Processing wisconsin...
------------------------

Processing pennsylvania...
------------------------

Processing nevada...
------------------------

Processing arizona...
------------------------

Processing north_carolina...
------------------------

Processing georgia...
------------------------

Data requests and formatting complete for swing states line graphs.


In [27]:
def format_map_data(state_data, state):
    """Summarise the most recent Trump/Harris polling averages for one state.

    The entries are searched for the latest date on which both ``'Trump'`` and
    ``'Harris'`` have a non-null ``pct_estimate``. The result therefore does
    not depend on the order of the list or on other candidates being present.

    Parameters
    ----------
    state_data : sequence of dict
        Entries with ``'candidate'``, ``'date'`` and ``'pct_estimate'`` keys.
    state : str
        State label copied into the result.

    Returns
    -------
    dict or None
        Keys ``state``, ``trump_polling_average``, ``harris_polling_average``,
        ``date`` (as found in the input), ``lead`` (absolute gap rounded to 2
        decimals) and ``candidate_lead`` (``'trump'`` only when Trump is
        strictly ahead, otherwise ``'harris'``). ``None`` when fewer than two
        entries are given or no date has both candidates.
    """
    if not state_data or len(state_data) < 2:
        return None

    # date -> {candidate: estimate}, restricted to the two tracked candidates.
    estimates_by_date = {}
    for entry in state_data:
        candidate = entry.get('candidate')
        estimate = entry.get('pct_estimate')
        if candidate not in ('Trump', 'Harris') or estimate is None:
            continue
        estimates_by_date.setdefault(entry['date'], {})[candidate] = estimate

    complete_dates = [
        date for date, estimates in estimates_by_date.items() if len(estimates) == 2
    ]
    if not complete_dates:
        return None

    # Compare parsed dates rather than raw strings so the date format does not matter.
    latest_date = max(complete_dates, key=pd.to_datetime)
    trump_avg = estimates_by_date[latest_date]['Trump']
    harris_avg = estimates_by_date[latest_date]['Harris']

    lead = round(abs(trump_avg - harris_avg), 2)
    candidate_lead = 'trump' if trump_avg > harris_avg else 'harris'

    return {
        'state': state,
        'trump_polling_average': trump_avg,
        'harris_polling_average': harris_avg,
        'date': latest_date,
        'lead': lead,
        'candidate_lead': candidate_lead
    }

In [28]:
# States outside the swing-state list for which a polling average is requested.
poll_states = [
    "california", "colorado", "connecticut", "delaware", "florida",
    "hawaii", "idaho", "illinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "massachusetts", "minnesota", "mississippi", "missouri", "montana",
    "nebraska", "new_jersey", "new_mexico", "new_york", "north_dakota",
    "oklahoma", "oregon", "rhode_island", "south_carolina", "south_dakota",
    "tennessee", "texas", "utah", "vermont", "virginia",
    "washington", "west_virginia", "wyoming", "washington_dc",
]

# One summary dict per state whose data could be fetched and formatted.
poll_states_data = []

for state in poll_states:
    print(f"Processing {state}...")

    # Retrieve data
    state_data = get_state_data(state)

    if state_data is None:
        print(f"Skipping {state} due to data retrieval error")
    else:
        formatted_data = format_map_data(state_data, state)
        # format_map_data returns None when it cannot build a summary.
        if formatted_data:
            poll_states_data.append(formatted_data)
    print("------------------------\n")

# Combine these states with the swing-state summaries built earlier.
poll_states_df = pd.DataFrame(poll_states_data)
swing_and_poll_states_df = pd.concat([poll_states_df, swing_states_df], ignore_index=True)
print("Other states data requests and formatting complete.")

Processing california...
------------------------

Processing colorado...
Error retrieving data for colorado: 404 Client Error: Not Found for url: https://projects.fivethirtyeight.com/polls/president-general/2024/colorado/polling-average.json
Skipping colorado due to data retrieval error
------------------------

Processing connecticut...
Error retrieving data for connecticut: 404 Client Error: Not Found for url: https://projects.fivethirtyeight.com/polls/president-general/2024/connecticut/polling-average.json
Skipping connecticut due to data retrieval error
------------------------

Processing delaware...
Error retrieving data for delaware: 404 Client Error: Not Found for url: https://projects.fivethirtyeight.com/polls/president-general/2024/delaware/polling-average.json
Skipping delaware due to data retrieval error
------------------------

Processing florida...
------------------------

Processing hawaii...
Error retrieving data for hawaii: 404 Client Error: Not Found for url: https

In [29]:
# Alphabetical copy of the combined state summaries with a fresh 0..n-1 index.
map_widget_df = (
    swing_and_poll_states_df
    .sort_values('state')
    .reset_index(drop=True)
)

# Electoral votes keyed by state name for the states listed here.
electoral_votes = {
    'arizona': 11,
    'california': 54,
    'georgia': 16,
    'florida': 30,
    'maryland': 10,
    'michigan': 15,
    'minnesota': 10,
    'missouri': 10,
    'montana': 4,
    'nevada': 6,
    'new_mexico': 5,
    'new_york': 28,
    'north_carolina': 16,
    'pennsylvania': 19,
    'texas': 40,
    'virginia': 13,
    'wisconsin': 10
}

# States absent from the table end up with a missing value in this column.
map_widget_df['electoral_votes'] = map_widget_df['state'].map(electoral_votes)

In [30]:
def merge_missing_states(map_widget_df: pd.DataFrame) -> pd.DataFrame:
    """
    Merge missing states data and update missing electoral votes in existing states.

    Parameters
    ----------
    map_widget_df : pd.DataFrame
        Frame with at least a ``state`` column. An ``electoral_votes`` column
        is created when absent. The input is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame, sorted by ``state`` with a fresh 0..n-1 index, where

        * missing ``electoral_votes`` are filled from the fallback table,
        * every fallback state not present in the input is appended with its
          ``electoral_votes`` and ``candidate_lead`` (other columns empty),
        * each state appears once; when the input contains duplicates, a row
          carrying polling values is preferred over one without.
    """
    missing_states = {
        'alabama': {'electoral_votes': 9, 'candidate_lead': 'trump'},
        'alaska': {'electoral_votes': 3, 'candidate_lead': 'trump'},
        'arkansas': {'electoral_votes': 6, 'candidate_lead': 'trump'},
        'colorado': {'electoral_votes': 10, 'candidate_lead': 'harris'},
        'connecticut': {'electoral_votes': 7, 'candidate_lead': 'harris'},
        'delaware': {'electoral_votes': 3, 'candidate_lead': 'harris'},
        'hawaii': {'electoral_votes': 4, 'candidate_lead': 'harris'},
        'idaho': {'electoral_votes': 4, 'candidate_lead': 'trump'},
        'illinois': {'electoral_votes': 19, 'candidate_lead': 'harris'},
        'indiana': {'electoral_votes': 11, 'candidate_lead': 'trump'},
        'iowa': {'electoral_votes': 6, 'candidate_lead': 'trump'},
        'kansas': {'electoral_votes': 6, 'candidate_lead': 'trump'},
        'kentucky': {'electoral_votes': 8, 'candidate_lead': 'trump'},
        'louisiana': {'electoral_votes': 8, 'candidate_lead': 'trump'},
        'maine': {'electoral_votes': 4, 'candidate_lead': 'harris'},
        'massachusetts': {'electoral_votes': 11, 'candidate_lead': 'harris'},
        'mississippi': {'electoral_votes': 6, 'candidate_lead': 'trump'},
        'nebraska': {'electoral_votes': 5, 'candidate_lead': 'trump'},
        'new_hampshire': {'electoral_votes': 4, 'candidate_lead': 'harris'},
        'new_jersey': {'electoral_votes': 14, 'candidate_lead': 'harris'},
        'north_dakota': {'electoral_votes': 3, 'candidate_lead': 'trump'},
        'ohio': {'electoral_votes': 17, 'candidate_lead': 'trump'},
        'oklahoma': {'electoral_votes': 7, 'candidate_lead': 'trump'},
        'oregon': {'electoral_votes': 8, 'candidate_lead': 'harris'},
        'rhode_island': {'electoral_votes': 4, 'candidate_lead': 'harris'},
        'south_carolina': {'electoral_votes': 9, 'candidate_lead': 'trump'},
        'south_dakota': {'electoral_votes': 3, 'candidate_lead': 'trump'},
        'tennessee': {'electoral_votes': 11, 'candidate_lead': 'trump'},
        'utah': {'electoral_votes': 6, 'candidate_lead': 'trump'},
        'vermont': {'electoral_votes': 3, 'candidate_lead': 'harris'},
        'washington': {'electoral_votes': 12, 'candidate_lead': 'harris'},
        'washington_dc': {'electoral_votes': 3, 'candidate_lead': 'harris'},
        'west_virginia': {'electoral_votes': 4, 'candidate_lead': 'trump'},
        'wyoming': {'electoral_votes': 3, 'candidate_lead': 'trump'}
    }

    result_df = map_widget_df.copy()
    if 'electoral_votes' not in result_df.columns:
        result_df['electoral_votes'] = np.nan

    # Fill only the gaps: existing electoral-vote values always win.
    electoral_votes_map = {
        state: data['electoral_votes']
        for state, data in missing_states.items()
    }
    fallback_votes = result_df['state'].map(electoral_votes_map)
    result_df['electoral_votes'] = result_df['electoral_votes'].fillna(fallback_votes)

    # Get states that don't exist in the polling data
    existing_states = set(result_df['state'].unique())
    states_to_add = {
        state: data
        for state, data in missing_states.items()
        if state not in existing_states
    }

    if states_to_add:
        # Fallback rows, aligned to the existing column set and order; columns
        # the fallback table does not provide are left empty.
        new_df = (
            pd.DataFrame.from_dict(states_to_add, orient='index')
            .rename_axis('state')
            .reset_index()
            .reindex(columns=result_df.columns)
        )
        result_df = pd.concat([result_df, new_df], ignore_index=True)

    # A row "has polling data" when any of the polling columns is filled in.
    poll_columns = [
        col for col in ('trump_polling_average', 'harris_polling_average', 'lead')
        if col in result_df.columns
    ]
    if poll_columns:
        has_polling = result_df[poll_columns].notna().any(axis=1)
    else:
        has_polling = pd.Series(False, index=result_df.index)

    # Sort so that, within a state, polled rows come first; keeping the first
    # occurrence then reliably keeps the polled row.
    result_df = (
        result_df
        .assign(_has_polling=has_polling)
        .sort_values(['state', '_has_polling'], ascending=[True, False])
        .drop_duplicates(subset=['state'], keep='first')
        .drop(columns='_has_polling')
        .reset_index(drop=True)
    )

    return result_df

# Usage: fill in electoral votes that are still missing and add rows for the
# fallback states that are not in the polling data; the returned frame replaces
# map_widget_df.
map_widget_df = merge_missing_states(map_widget_df)

In [36]:
def determine_position(row):
    """Classify one state row by who leads and by how much.

    Reads ``row['lead']`` and ``row['candidate_lead']``. Any candidate value
    other than ``'trump'`` is treated as the Harris side.

    Returns
    -------
    str
        * ``'republican'`` / ``'democrat'`` when the lead is missing or above 10,
        * ``'lean_trump'`` / ``'lean_harris'`` when it is above 2 and at most 10,
        * ``'battleground_trump'`` / ``'battleground_harris'`` otherwise.
    """
    margin = row['lead']
    trump_ahead = row['candidate_lead'] == 'trump'

    # No polling margin: fall back to the party of the listed leader.
    if pd.isnull(margin) or margin > 10:
        return 'republican' if trump_ahead else 'democrat'

    if margin > 2:
        return 'lean_trump' if trump_ahead else 'lean_harris'

    # Margin of 2 or less.
    return 'battleground_trump' if trump_ahead else 'battleground_harris'

# Classify every state row and store the label in a new 'position' column,
# then write the finished map table to disk.
map_widget_df['position'] = map_widget_df.apply(determine_position, axis='columns')
map_widget_df.to_csv(path_or_buf="map_widget.csv", index=False)
print("map widget data formatting complete")

map widget data formatting complete


In [37]:
# Output column order, from safest Trump category to safest Harris category.
positions = [
    'republican',
    'lean_trump',
    'battleground_trump',
    'battleground_harris',
    'lean_harris',
    'democrat',
]

# Total electoral votes of the states that fall in each position.
electoral_votes_sum = {
    position: map_widget_df.loc[map_widget_df['position'] == position, 'electoral_votes'].sum()
    for position in positions
}

# Single-row table (one column per position) for the simplified widget.
widget = pd.DataFrame([electoral_votes_sum])
widget.to_csv("widget_simplified.csv", index=False)

In [38]:
# Fetch and export the national polling average; like the state loops, skip
# processing when the request failed (get_state_data returns None).
national_data_raw = get_state_data("national")

if national_data_raw is not None:
    national_df = process_and_export_data(national_data_raw, "national")
    print("national polling data request and formating complete.")
else:
    # Avoid leaving a stale frame from an earlier run in the namespace.
    national_df = None
    print("Skipping national due to data retrieval error")

national polling data request and formating complete.
