In [50]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import csv

In [42]:
df = pd.read_csv('Data/training_data.csv')
yColName = "niaaa_legal_adult_per_capita_beer_consumed_gallons"
X = df[['fips_code', 'brfss_drinking_culture_surrogate_metric_percent_binge']]
y = df[yColName].values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [43]:
param_grid = {
    'n_estimators': [125, 150, 175],
    'learning_rate': [0.15, 0.2, 0.25],
    'max_depth': [3, 4, 5]
}

gb_regressor = GradientBoostingRegressor()

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Retrain the model on the full training set using the optimal number of iterations
best_gb_regressor = GradientBoostingRegressor(**best_params, random_state=42)
best_gb_regressor.fit(X_train, y_train)

# Early stopping
best_val_error = float('inf')
best_iter = 0
error_increases = 0
max_increases = 5

for i, val_pred in enumerate(best_gb_regressor.staged_predict(X_test)):
    val_error = mean_squared_error(y_test, val_pred)

    if val_error < best_val_error:
        best_val_error = val_error
        best_iter = i
        error_increases = 0
    else:
        error_increases += 1
        if error_increases >= max_increases:
            print(f"Early stopping at iteration {best_iter}")
            break

# Retrain the model on the full training set using the optimal number of iterations
best_gb_regressor = GradientBoostingRegressor(n_estimators=best_iter, learning_rate=best_params['learning_rate'], max_depth=best_params['max_depth'], random_state=42)
best_gb_regressor.fit(X_train, y_train)

Best Hyperparameters: {'learning_rate': 0.25, 'max_depth': 3, 'n_estimators': 175}
Early stopping at iteration 89


In [51]:
# Make predictions on the test set
test_predictions = best_gb_regressor.predict(X_test)

# Evaluate the model
test_r2 = r2_score(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)

print(f"Test R-squared: {test_r2}")
print(f"Test Mean Squared Error: {test_mse}")
model = best_gb_regressor

Test R-squared: 0.9072579064340865
Test Mean Squared Error: 3.3668057796149338


Tested with what appeared to be the 5 best predictors, but still got a really high r2 of 0.9923. This was also done after some tweaking of hyperparameters, so I am now testing with only 2 predictors just to see.

With 2 predictors, the r2 was 0.9911. Still very very high. I don't know anymore. 

For fun I decided to do only one parameter. The r2 was still way to high, at 0.98. It makes sense, but I was hoping for some more interesting data. 

Changed the prediction from total consumed to per capita consumed. R2 dropped to 0.8655

Per capita drinking analyzed with percent never married, fips code (state), and binge drinking frequency resulted in a r2 of 0.8969. Removing fips drops it to 0.4273. Replacing it with year brings it to 0.3173

Fips and never married is r2 of 0.9391

fibs and binge drinking is r2 of 0.9073

In [54]:
# 1. Map FIPS Codes to State Names
fips_to_state = {26: 'Michigan', 51: 'Virginia', 6: 'California'}

# 2. Save Separate CSV Files for Each State
for fips_code, state_name in fips_to_state.items():
    # Create an array of feature values for the current FIPS code and percentages
    percentages = np.arange(0.01, 1.01, 0.01)
    features = np.array([[fips_code, percentage] for percentage in percentages])

    # Make predictions using your trained model
    predictions = model.predict(features)

    # Combine the results with FIPS codes
    results = np.column_stack((percentages, predictions))

    # Specify the CSV file path for the current state
    csv_file_path = f"Predictions/{state_name}_predictions_results.csv"

    # Write the results for the current state to the CSV file
    with open(csv_file_path, mode='w', newline='') as file:
        writer = csv.writer(file)

        # Write the header
        writer.writerow(['Percentage', 'Predicted_Value'])

        # Write the data rows
        writer.writerows(results)

    print(f"Predictions for {state_name} saved to {csv_file_path}")

Predictions for Michigan saved to Predictions/Michigan_predictions_results.csv
Predictions for Virginia saved to Predictions/Virginia_predictions_results.csv
Predictions for California saved to Predictions/California_predictions_results.csv


