In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
import warnings

# Methods:
import model_training_and_prediction_methods as mtapm

# NOTE(review): this silences every warning category for the whole kernel session,
# which also hides pandas chained-assignment and library deprecation warnings.
# Consider narrowing it to the specific category/module that produces the noise.
warnings.filterwarnings("ignore")

In [2]:
# Load the prepared modelling table and preview its first rows.
model_set_path = "../Data/Proccesed_data/model_set.parquet"
model_set_df = pd.read_parquet(model_set_path)
model_set_df.head()


Unnamed: 0,terc_code,county,year,round,turnout_percentage,gdp_per_capita_delta_5_years,average_gross_salary,demographic_dependency_ratio,demographic_dependency_ratio_delta_5_years,population_70_plus_delta_1_year
0,201,bolesławiecki,2000,1,58.5,,1860.0,66.0,,
1,201,bolesławiecki,2005,1,44.16,5084.0,2027.29,56.5,-9.5,252.0
2,201,bolesławiecki,2005,2,45.09,5084.0,2027.29,56.5,-9.5,252.0
3,201,bolesławiecki,2010,1,50.33,14142.0,2629.01,52.9,-3.6,55.0
4,201,bolesławiecki,2010,2,49.09,14142.0,2629.01,52.9,-3.6,55.0


### The previously prepared data was used to predict turnout at the county level for 2025.
### For this purpose, the data set covering 2000-2025 was used, but the 2025 turnout values were replaced with NaN so that the model would not see the real results.

### Three different models were used for prediction:
1. XGBoost
2. Random Forest
3. Hist Gradient Boost

### The predictions of each model were compared with the real 2025 turnout, and the model with the lowest error was selected to make the predictions for 2030 in a later step.

In [3]:
# Column the models are trained to predict.
target = "turnout_percentage"

# Explanatory columns handed to every model, in the order used for fitting.
features = [
    "round",
    "gdp_per_capita_delta_5_years",
    "average_gross_salary",
    "population_70_plus_delta_1_year",
    "demographic_dependency_ratio",
    "demographic_dependency_ratio_delta_5_years",
]

# Empty placeholders; the modelling cells rebind both names before using them.
train_df = pd.DataFrame()
test_df = pd.DataFrame()


### XGBoost

In [4]:
# Evaluate XGBoost on the 2025 elections.
# prepare_train_and_test_data is a project helper; it is assumed to return the rows
# to fit on and the 2025 rows with the target hidden — TODO confirm against mtapm.
train_df, test_df = mtapm.prepare_train_and_test_data(model_set_df, target, 2025)

# 3 * 3 * 3 * 2 = 54 candidate configurations.
parameters_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

# The estimator stays single-threaded; parallelism comes from GridSearchCV(n_jobs=-1).
xgb_base = XGBRegressor(random_state=42, n_jobs=1)

print("Looking for best parameters.")
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=parameters_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_df[features], train_df[target])
best_model = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")

# Build the predictions as a new frame instead of assigning into test_df: the helper
# may hand back a slice of model_set_df, and writing into it would be a chained
# assignment whose warning is suppressed in this notebook.
merge_keys = ["county", "terc_code", "round"]
predictions_df = test_df[merge_keys].assign(
    **{target: best_model.predict(test_df[features])}
)

# validate="one_to_one" makes the merge fail loudly if a county/round key is
# duplicated, instead of silently inflating the number of scored rows.
xgboost_comparison_df = pd.merge(
    model_set_df[model_set_df["year"] == 2025],
    predictions_df,
    on=merge_keys,
    suffixes=("_real", "_pred"),
    validate="one_to_one",
)

real_turnout = xgboost_comparison_df[f"{target}_real"]
predicted_turnout = xgboost_comparison_df[f"{target}_pred"]
mae = mean_absolute_error(real_turnout, predicted_turnout)
rmse = np.sqrt(mean_squared_error(real_turnout, predicted_turnout))

# Drop the per-model temporaries so they cannot leak into the next model's cell.
del best_model, xgb_base, merge_keys, predictions_df
del xgboost_comparison_df, real_turnout, predicted_turnout

print("-" * 60)
print("Scores for 2025 (XGBoost):")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print("-" * 60)

Looking for best parameters.
Fitting 3 folds for each of 54 candidates, totalling 162 fits


Best parameters: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
------------------------------------------------------------
Scores for 2025 (XGBoost):
MAE: 5.2666
RMSE: 6.2819
------------------------------------------------------------


### Random Forest


In [5]:
# Evaluate Random Forest on the 2025 elections.
# prepare_train_and_test_data is a project helper; it is assumed to return the rows
# to fit on and the 2025 rows with the target hidden — TODO confirm against mtapm.
train_df, test_df = mtapm.prepare_train_and_test_data(model_set_df, target, 2025)

# 3 * 4 * 3 * 3 = 108 candidate configurations.
parameters_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# The estimator stays single-threaded; parallelism comes from GridSearchCV(n_jobs=-1).
rf_base = RandomForestRegressor(random_state=42, n_jobs=1)

print("Looking for best parameters: ")
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=parameters_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_df[features], train_df[target])
best_model = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")

# Build the predictions as a new frame instead of assigning into test_df: the helper
# may hand back a slice of model_set_df, and writing into it would be a chained
# assignment whose warning is suppressed in this notebook.
merge_keys = ["county", "terc_code", "round"]
predictions_df = test_df[merge_keys].assign(
    **{target: best_model.predict(test_df[features])}
)

# validate="one_to_one" makes the merge fail loudly if a county/round key is
# duplicated, instead of silently inflating the number of scored rows.
random_forest_comparison_df = pd.merge(
    model_set_df[model_set_df["year"] == 2025],
    predictions_df,
    on=merge_keys,
    suffixes=("_real", "_pred"),
    validate="one_to_one",
)

real_turnout = random_forest_comparison_df[f"{target}_real"]
predicted_turnout = random_forest_comparison_df[f"{target}_pred"]
mae = mean_absolute_error(real_turnout, predicted_turnout)
rmse = np.sqrt(mean_squared_error(real_turnout, predicted_turnout))

# Drop the per-model temporaries so they cannot leak into the next model's cell.
del best_model, rf_base, merge_keys, predictions_df
del random_forest_comparison_df, real_turnout, predicted_turnout

print("-" * 60)
print("Scores for 2025 (Random Forest):")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print("-" * 60)

Looking for best parameters: 
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
------------------------------------------------------------
Scores for 2025 (Random Forest):
MAE: 6.0560
RMSE: 7.1460
------------------------------------------------------------


### Hist Gradient Boosting

In [6]:
# Evaluate HistGradientBoostingRegressor on the 2025 elections.
# prepare_train_and_test_data is a project helper; it is assumed to return the rows
# to fit on and the 2025 rows with the target hidden — TODO confirm against mtapm.
train_df, test_df = mtapm.prepare_train_and_test_data(model_set_df, target, 2025)

# 3 * 3 * 3 * 3 = 81 candidate configurations.
parameters_grid = {
    'max_iter': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'l2_regularization': [0, 0.1, 1.0]
}

hgb_base = HistGradientBoostingRegressor(random_state=42)

print("Looking for best parameters: ")
grid_search = GridSearchCV(
    estimator=hgb_base,
    param_grid=parameters_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_df[features], train_df[target])
best_model = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")

# Build the predictions as a new frame instead of assigning into test_df: the helper
# may hand back a slice of model_set_df, and writing into it would be a chained
# assignment whose warning is suppressed in this notebook.
merge_keys = ["county", "terc_code", "round"]
predictions_df = test_df[merge_keys].assign(
    **{target: best_model.predict(test_df[features])}
)

# validate="one_to_one" makes the merge fail loudly if a county/round key is
# duplicated, instead of silently inflating the number of scored rows.
hist_gradient_comparison_df = pd.merge(
    model_set_df[model_set_df["year"] == 2025],
    predictions_df,
    on=merge_keys,
    suffixes=("_real", "_pred"),
    validate="one_to_one",
)

real_turnout = hist_gradient_comparison_df[f"{target}_real"]
predicted_turnout = hist_gradient_comparison_df[f"{target}_pred"]
mae = mean_absolute_error(real_turnout, predicted_turnout)
rmse = np.sqrt(mean_squared_error(real_turnout, predicted_turnout))

# Drop the per-model temporaries so they cannot leak into later cells.
del best_model, hgb_base, merge_keys, predictions_df
del hist_gradient_comparison_df, real_turnout, predicted_turnout

print("-" * 60)
print("Scores for 2025 (Hist Gradient Boost)")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print("-" * 60)

# The scores are only needed for the printout above.
del mae, rmse

Looking for best parameters: 
Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'l2_regularization': 1.0, 'learning_rate': 0.05, 'max_depth': 5, 'max_iter': 200}
------------------------------------------------------------
Scores for 2025 (Hist Gradient Boost)
MAE: 5.7723
RMSE: 6.8004
------------------------------------------------------------


### XGBoost achieved the lowest error of the three evaluated models (MAE 5.27 and RMSE 6.28, versus 6.06 / 7.15 for Random Forest and 5.77 / 6.80 for Hist Gradient Boosting), so it was chosen to make the predictions for 2030.

### Taking into account that the model does not see polls, emotions or candidates' names, but only "dry" economic and demographic data (salaries, age structure, GDP), an MAE slightly above 5.2 percentage points is a very good result. The model correctly captures the region's mobilization potential.

### For this purpose, the previously prepared `model_set_2030` data set was used.

In [7]:
# Load the table that also contains the 2030 feature rows, then show only those rows
# (their turnout is still empty at this point).
model_set_2030_df = pd.read_parquet("../Data/Proccesed_data/model_set_2030.parquet")
# NOTE(review): the saved output shows demographic_dependency_ratio of about 0.000051
# (delta of about -72.2) for bolesławiecki, while other counties sit around 80.
# This looks like an upstream data artefact — verify before trusting 2030 predictions.
model_set_2030_df.loc[model_set_2030_df["year"].eq(2030)]

Unnamed: 0,county,terc_code,turnout_percentage,year,round,gdp_per_capita_delta_5_years,average_gross_salary,population_70_plus_delta_1_year,demographic_dependency_ratio,demographic_dependency_ratio_delta_5_years
11,bolesławiecki,0201,,2030,1,41730.0,13503.192500,288.244643,0.000051,-72.199949
12,bolesławiecki,0201,,2030,2,41730.0,13503.192500,288.244643,0.000051,-72.199949
24,dzierżoniowski,0202,,2030,1,41730.0,12875.966250,252.627957,82.800000,5.500000
25,dzierżoniowski,0202,,2030,2,41730.0,12875.966250,252.627957,82.800000,5.500000
37,głogowski,0203,,2030,1,41730.0,13999.598750,660.000000,80.000000,3.000000
...,...,...,...,...,...,...,...,...,...,...
4913,Koszalin,3261,,2030,2,32915.0,13789.637500,482.445864,84.207174,5.207174
4925,Szczecin,3262,,2030,1,32915.0,15304.753750,1271.880000,77.176031,4.076031
4926,Szczecin,3262,,2030,2,32915.0,15304.753750,1271.880000,77.176031,4.076031
4938,Świnoujście,3263,,2030,1,32915.0,11512.225238,195.000000,83.600000,5.500000


In [8]:
# Fit XGBoost for the 2030 forecast.
# prepare_train_and_test_data is a project helper; it is assumed to return the rows
# to fit on and the 2030 rows to predict — TODO confirm against mtapm.
train_df, test_df = mtapm.prepare_train_and_test_data(model_set_2030_df, target, 2030)

# Same 54-candidate grid as in the 2025 evaluation of XGBoost.
parameters_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0]
}

# n_jobs=1 on the estimator (it was -1): GridSearchCV already runs the fits on all
# cores, so a multi-threaded estimator inside each worker oversubscribes the CPU.
# This also matches the estimator settings used in the 2025 evaluation.
xgb_base = XGBRegressor(random_state=42, n_jobs=1)

print("Looking for best parameters...")
grid_search = GridSearchCV(
    estimator=xgb_base,
    param_grid=parameters_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_df[features], train_df[target])
best_model = grid_search.best_estimator_

print(f"Best parameters: {grid_search.best_params_}")

predicted_values = best_model.predict(test_df[features])

# Write the predictions back by index label. The former in-place copy into test_df
# was never read afterwards, so it is dropped.
# Assumes test_df keeps the index labels of model_set_2030_df — TODO confirm.
model_set_2030_df.loc[test_df.index, target] = predicted_values

# Preview the filled-in 2030 rows with a single .loc lookup.
model_set_2030_df.loc[
    model_set_2030_df["year"] == 2030, ["county", "round", "year", target]
].head(10)


Looking for best parameters...
Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}


Unnamed: 0,county,round,year,turnout_percentage
11,bolesławiecki,1,2030,64.907806
12,bolesławiecki,2,2030,67.136757
24,dzierżoniowski,1,2030,65.444374
25,dzierżoniowski,2,2030,70.56855
37,głogowski,1,2030,71.218552
38,głogowski,2,2030,75.392914
50,górowski,1,2030,64.284828
51,górowski,2,2030,70.364845
63,jaworski,1,2030,63.839413
64,jaworski,2,2030,66.676231


In [9]:
# Reduce the 2030 table to the columns needed for visualisation.
columns_to_keep = [
    "terc_code", "county", "round",
    "turnout_percentage", "year"
]

# Select rows and columns in one .loc call and take an explicit copy, so the rounding
# below writes into an independent frame rather than into a filtered slice
# (the original assignment was a chained assignment on a slice).
model_set_2030_df = model_set_2030_df.loc[
    model_set_2030_df["year"] == 2030, columns_to_keep
].copy()

# Round the predicted turnout to two decimals for presentation.
model_set_2030_df["turnout_percentage"] = model_set_2030_df["turnout_percentage"].round(2)

model_set_2030_df.head()

Unnamed: 0,terc_code,county,round,turnout_percentage,year
11,201,bolesławiecki,1,64.91,2030
12,201,bolesławiecki,2,67.14,2030
24,202,dzierżoniowski,1,65.44,2030
25,202,dzierżoniowski,2,70.57,2030
37,203,głogowski,1,71.22,2030


In [10]:
# Persist the rounded 2030 predictions for the visualisation step.
final_predictions_path = "../Data/Proccesed_data/final_predictions_to_visualize.parquet"
model_set_2030_df.to_parquet(final_predictions_path)