In [33]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [34]:
df = pd.read_csv('Data/training_data.csv')
yColName = "niaaa_legal_adult_per_capita_beer_consumed_gallons"
X = df[['census_percent_pop_never_married', 'year','brfss_drinking_culture_surrogate_metric_percent_binge']]
y = df[yColName].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [35]:
param_grid = {
    'n_estimators': [125, 150, 175],
    'learning_rate': [0.15, 0.2, 0.25],
    'max_depth': [3, 4, 5]
}

gb_regressor = GradientBoostingRegressor()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Retrain the model on the full training set using the optimal number of iterations
best_gb_regressor = GradientBoostingRegressor(**best_params, random_state=42)
best_gb_regressor.fit(X_train, y_train)

# Early stopping
best_val_error = float('inf')
best_iter = 0
error_increases = 0
max_increases = 5

for i, val_pred in enumerate(best_gb_regressor.staged_predict(X_test)):
    val_error = mean_squared_error(y_test, val_pred)

    if val_error < best_val_error:
        best_val_error = val_error
        best_iter = i
        error_increases = 0
    else:
        error_increases += 1
        if error_increases >= max_increases:
            print(f"Early stopping at iteration {best_iter}")
            break

# Retrain the model on the full training set using the optimal number of iterations
best_gb_regressor = GradientBoostingRegressor(n_estimators=best_iter, learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)
best_gb_regressor.fit(X_train, y_train)

Best Hyperparameters: {'learning_rate': 0.15, 'max_depth': 3, 'n_estimators': 125}
Early stopping at iteration 10


In [36]:
# Make predictions on the test set
test_predictions = best_gb_regressor.predict(X_test)

# Evaluate the model
test_r2 = r2_score(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

print(f"Test R-squared: {test_r2}")
print(f"Test Mean Squared Error: {test_mse}")

Test R-squared: 0.3173088677348982
Test Mean Squared Error: 18.357695867981416


Tested with what appeared to be the 5 best predictors, but still got a really high r2 of 0.9923. This was also done after some tweaking of hyperparameters, so I am now testing with only 2 predictors just to see.

With 2 predictors, the r2 was 0.9911. Still very very high. I don't know anymore. 

For fun I decided to do only one parameter. The r2 was still way to high, at 0.98. It makes sense, but I was hoping for some more interesting data. 

Changed the prediction from total consumed to per capita consumed. R2 dropped to 0.8655

Per capita drinking analyzed with percent never married, fips code (state), and binge drinking frequency resulted in a r2 of 0.8969. Removing fips drops it to 0.4273. Replacing it with year brings it to 0.3173