In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [19]:
# Load the processed dataset
data = pd.read_csv('../Data/training_data.csv')

# Selecting the features and target variable
# You might want to select more relevant features for your model
#X = data[['year', 'census_total_pop', 'census_percent_employed', 'tpc_state_beer_tax_rates_dollar_gal']]  # Example feature, include others as necessary
#X = data.drop('ba_craft_beer_produced_gallons', axis=1).values
X = data[['niaaa_total_beer_vol_consumed_gallons', 'niaaa_pop_21_plus', 'census_total_pop']]

y = data['ba_craft_beer_produced_gallons']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
param_grid = {
    'n_estimators': [125, 150, 175],
    'learning_rate': [0.15, 0.2, 0.25],
    'max_depth': [3, 4, 5]
}

gb_regressor = GradientBoostingRegressor()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Retrain the model on the full training set using the optimal number of iterations
gb_model = GradientBoostingRegressor(**best_params, random_state=42)
gb_model.fit(X_train, y_train)

# Early stopping
best_val_error = float('inf')
best_iter = 0
error_increases = 0
max_increases = 5

for i, val_pred in enumerate(gb_model.staged_predict(X_test)):
    val_error = mean_squared_error(y_test, val_pred)

    if val_error < best_val_error:
        best_val_error = val_error
        best_iter = i
        error_increases = 0
    else:
        error_increases += 1
        if error_increases >= max_increases:
            print(f"Early stopping at iteration {best_iter}")
            break

# Retrain the model on the full training set using the optimal number of iterations
gb_model = GradientBoostingRegressor(n_estimators=best_iter, learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)
gb_model.fit(X_train, y_train)

Best Hyperparameters: {'learning_rate': 0.25, 'max_depth': 4, 'n_estimators': 125}
Early stopping at iteration 19


In [21]:
# Predict on the test set
y_pred = gb_model.predict(X_test)

# Evaluate the model
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f'R^2: {r2}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')

R^2: 0.904694569213275
MAE: 3618270.457913971
MSE: 34358113936868.863


Implementing the early stopping/hyperparameter search, the r2 increased from 0.94 to 0.95. Next we will find which columns are the most desireable

Ran on all columns, r2 was .9792

After identifying highest weighted columns, including only the top 3 resulted in an r2 of 0.9047

In [15]:
# Predict for the year 2025
# Adjust the features accordingly if your model uses more than the year
total_population_2025 = 300000000  
percent_employed_2025 = 50
tpc_state_beer_tax_rates_dollar_gal = .40
prediction_2025 = gb_model.predict([[2025, total_population_2025, percent_employed_2025, tpc_state_beer_tax_rates_dollar_gal]])
print(f'Predicted craft beer production for 2025: {prediction_2025[0]} gallons')

ValueError: X has 4 features, but GradientBoostingRegressor is expecting 38 features as input.

In [16]:
import numpy as np

feature_importances = gb_model.feature_importances_

# Assuming X was your original DataFrame
n = 5
top_features_indices = feature_importances.argsort()[-n:][::-1]
selected_features = data.columns[top_features_indices]
selected_features

Index(['ba_craft_beer_produced_gallons',
       'niaaa_total_beer_vol_consumed_gallons', 'niaaa_pop_21_plus',
       'census_total_pop', 'bea_personal_income_dollars'],
      dtype='object')