## 📚 Importing Necessary Libraries

In [3]:
# Import necessary libraries for data manipulation and model training
import pandas as pd
import numpy as np
import pickle
import json
import os

# Import machine learning models and evaluation metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, brier_score_loss

## 📂 Loading and Preprocessing the Data
This step loads the prepared dataset and preprocesses it to define features and labels for model training.


In [4]:
# Load the dataset without setting the first column as the index
df = pd.read_csv(
    "dataset/polling_data_with_features.csv",
    low_memory=False  # Parse the file in one pass so each column gets a single inferred dtype
)

# Show the loaded columns so the reader can inspect the schema
print("Columns in the dataset after loading:", df.columns.tolist())

# Define feature columns and target variable for model training
features = ['pct_estimate', 'days_until_election', 'is_incumbent_pres',
            'is_incumbent_vp', 'is_incumbent_party', 'party_DEM', 'party_REP']
target = 'vote_share'

# Fail fast if any column the later cells rely on is absent, instead of
# surfacing as an opaque KeyError deep inside the evaluation loop.
required_columns = ['state', 'cycle', 'party', target, *features]
missing_columns = [column for column in required_columns if column not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Specify the election cycles and states for model evaluation.
# The cycles are listed chronologically: the evaluation loop trains on
# cycles[:i] and tests on cycles[i].
cycles = [2004, 2008, 2012, 2016, 2020]
states = [
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut",
    "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana",
    "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts",
    "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
    "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina",
    "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island",
    "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
    "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
]

Columns in the dataset after loading: ['state', 'cycle', 'modeldate', 'pct_estimate', 'party', 'election_date', 'days_until_election', 'category', 'candidate', 'vote_share', 'is_incumbent_pres', 'is_incumbent_vp', 'is_incumbent_party', 'party_DEM', 'party_REP']


## 🛠️ Defining Model Training and Evaluation Function
This step creates a reusable function to train and evaluate machine learning models.


In [5]:
# Define a function to train and evaluate models
def train_and_evaluate_model(model, train_data, test_data, features, target):
    """
    Fit `model` on `train_data`, score `test_data`, and compare party-level averages.

    Predictions and actual values are each averaged per `party`; the winner is
    the party with the highest average, and the error is computed between the
    per-party averages (not between individual rows).

    Parameters:
    - model: Estimator exposing `fit(X, y)` and `predict(X)`
      (e.g., LinearRegression, RandomForestRegressor). It is fitted in place.
    - train_data: DataFrame containing the `features` and `target` columns
    - test_data: DataFrame containing the `features`, `target` and 'party'
      columns. It is not modified; predictions are attached to a copy.
    - features: List of feature column names
    - target: Name of the target variable

    Returns:
    - mae: Mean Absolute Error between per-party average prediction and
      per-party average actual value
    - predicted_winner: Party with the highest average prediction
    - actual_winner: Party with the highest average actual value

    Raises:
    - ValueError: if `train_data` or `test_data` has no rows
    """
    if train_data.empty or test_data.empty:
        raise ValueError("train_data and test_data must both contain at least one row")

    # Train the model
    model.fit(train_data[features], train_data[target])

    # Attach predictions to a new frame so the caller's test_data is left
    # untouched (writing into a slice would also trigger SettingWithCopyWarning).
    prediction_col = f'predicted_{model.__class__.__name__}'
    scored = test_data.assign(**{prediction_col: model.predict(test_data[features])})

    # Per-party average of predictions and of actual values in a single pass
    comparison = (
        scored.groupby('party')
        .agg(
            aggregated_prediction=(prediction_col, 'mean'),
            actual_vote_share=(target, 'mean'),
        )
        .reset_index()
    )

    # Determine the predicted and actual winning parties
    predicted_winner = comparison.loc[comparison['aggregated_prediction'].idxmax(), 'party']
    actual_winner = comparison.loc[comparison['actual_vote_share'].idxmax(), 'party']

    # Calculate Mean Absolute Error between per-party predictions and actuals
    mae = mean_absolute_error(comparison['actual_vote_share'], comparison['aggregated_prediction'])

    return mae, predicted_winner, actual_winner

## 🤖 Train and Evaluate Multiple Models
In this step, we loop through the data to train and evaluate multiple models, comparing their performance.

We will train and evaluate models such as:
- LinearRegression
- RandomForestRegressor
- DecisionTreeRegressor
- XGBoost



In [18]:
from tqdm.notebook import tqdm  # Use this for Jupyter/Colab environments

# Fixed seed so the stochastic estimators give the same results on every run
SEED = 42

# Define model types to evaluate
models = [
    ("LinearRegression", LinearRegression()),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=SEED)),
    ("DecisionTree", DecisionTreeRegressor(max_depth=10, random_state=SEED)),
    ("XGBoost", XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=SEED)),
]

# Slice the dataset once per state instead of re-filtering it for every
# (model, cycle, state) combination.
data_by_state = {state: df[df['state'] == state] for state in states}

os.makedirs('results', exist_ok=True)  # Ensure the output directory exists

# Walk-forward evaluation: for each test cycle, train on all earlier cycles
for model_name, model in tqdm(models, desc="Evaluating Models", dynamic_ncols=True):
    tqdm.write(f"Evaluating {model_name}...")

    # Fresh result containers per model, so one model's saved files can never
    # contain entries left over from a previously evaluated model.
    state_to_pred, state_to_true, state_to_mae = {}, {}, {}

    for i in tqdm(range(1, len(cycles)), desc=f"Evaluating {model_name} Cycles", leave=False, dynamic_ncols=True):
        # These depend only on the cycle index, so compute them once per cycle
        test_cycle = cycles[i]
        train_cycles = cycles[:i]

        for state in tqdm(states, desc=f"Evaluating {model_name} States", leave=False, dynamic_ncols=True):
            data_state = data_by_state[state]
            train_data = data_state[data_state['cycle'].isin(train_cycles)]
            test_data = data_state[data_state['cycle'] == test_cycle]

            if train_data.empty or test_data.empty:
                continue  # Skip if there is no data for this state and cycle

            # Reuse the shared helper rather than duplicating its body here.
            # A copy is passed so nothing is ever written into a slice of `df`.
            mae, predicted_winner, actual_winner = train_and_evaluate_model(
                model, train_data, test_data.copy(), features, target
            )

            # Store the results, keyed by state and then by test cycle
            state_to_pred.setdefault(state, {})[test_cycle] = predicted_winner
            state_to_true.setdefault(state, {})[test_cycle] = actual_winner
            state_to_mae.setdefault(state, {})[test_cycle] = float(mae)

    # Save results for the model (integer cycle keys become strings in JSON)
    for suffix, results in (
        ("pred", state_to_pred),
        ("true", state_to_true),
        ("mae", state_to_mae),
    ):
        with open(f'results/{model_name}_{suffix}.json', 'w') as file:
            json.dump(results, file, indent=4)

Evaluating Models:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating LinearRegression...


Evaluating LinearRegression Cycles:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating LinearRegression States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating LinearRegression States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating LinearRegression States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating LinearRegression States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating RandomForest...


Evaluating RandomForest Cycles:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating RandomForest States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating RandomForest States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating RandomForest States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating RandomForest States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating DecisionTree...


Evaluating DecisionTree Cycles:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating DecisionTree States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating DecisionTree States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating DecisionTree States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating DecisionTree States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating XGBoost...


Evaluating XGBoost Cycles:   0%|          | 0/4 [00:00<?, ?it/s]

Evaluating XGBoost States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating XGBoost States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating XGBoost States:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating XGBoost States:   0%|          | 0/50 [00:00<?, ?it/s]

## 💾 Saving Trained Models

In [20]:
# Make sure the output directory exists; open(..., "wb") does not create it
os.makedirs("model", exist_ok=True)

# NOTE(review): each estimator in `models` is re-fitted in place inside the
# evaluation loop, so the object pickled here reflects only its most recent
# fit (the last state/cycle combination that had data), not a model trained
# on the full dataset. Confirm this is the artifact you intend to keep.

# Loop through each model and save it using Pickle
for model_name, model in models:
    # Save the model as a .pkl file in the 'model' directory
    with open(f"model/{model_name}.pkl", "wb") as f:
        pickle.dump(model, f)  # Serialize and write the model to disk

    # Confirm successful saving of the model
    print(f"‚úÖ {model_name} model saved to 'model/{model_name}.pkl'")

‚úÖ LinearRegression model saved to 'model/LinearRegression.pkl'
‚úÖ RandomForest model saved to 'model/RandomForest.pkl'
‚úÖ DecisionTree model saved to 'model/DecisionTree.pkl'
‚úÖ XGBoost model saved to 'model/XGBoost.pkl'
