In [2]:
import sys
sys.path.append('../scripts')
from scripts.cross_validation import loocv, k_fold_cv
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV
from scripts.data_preprocessing import load_data, exclude_outliers


# Load the dataset through the project helper. X and y are the objects that
# later cells hand to GridSearchCV.fit(X, y) and to the CV helpers, i.e. X is
# used as the feature input and y as the regression target.
# NOTE(review): whatever "preprocessing" is applied happens inside load_data();
# it is not visible from this notebook.
X, y = load_data()

# Derive an outlier-filtered version of the data under new names. X and y are
# not rebound here, so later cells can evaluate on both the raw and the
# filtered data.
# NOTE(review): the "Number of outliers" line in this cell's output is
# presumably printed by exclude_outliers() — confirm in
# scripts/data_preprocessing, along with the outlier criterion it uses.
X_filtered, y_filtered = exclude_outliers(X, y)

Number of outliers:  14




In [5]:
# Hyperparameter search for a gradient-boosting regressor, followed by
# LOOCV and k-fold evaluation of the selected configuration on X, y.

# Single source of truth for the fold count: used for the grid-search CV,
# for the k-fold evaluation, and for the printed heading, so the label can
# no longer disagree with the number of folds actually run.
N_FOLDS = 6

# Fixed seed so that Restart & Run All reproduces the same fitted trees and
# therefore the same selected parameters and scores.
RANDOM_STATE = 42

# Candidate hyperparameters: 3 * 3 * 3 * 3 * 1 = 81 combinations, each fitted
# N_FOLDS times by GridSearchCV.
param_grid = {
    "n_estimators": [100, 200, 500],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 5, 10],
    "learning_rate": [0.01, 0.05, 0.1],
    "loss": ["squared_error"]
}

# Base estimator; GridSearchCV clones it for every candidate, so the seed set
# here is carried into each fit.
reg = ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE)

# Exhaustive search scored by negative MSE (higher is better, so the best
# candidate is the one with the lowest cross-validated MSE).
grid_search = GridSearchCV(reg, param_grid, cv=N_FOLDS, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best parameters from Grid Search
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# With the default refit=True, best_estimator_ is the winning configuration
# refitted on all of X, y.
best_reg = grid_search.best_estimator_

# NOTE(review): the hyperparameters were selected on the same X, y that are
# used for the evaluation below, so these scores are optimistic estimates;
# a nested CV or a held-out set would give an unbiased figure.
# NOTE(review): loocv / k_fold_cv presumably refit best_reg on each training
# split — confirm in scripts/cross_validation.
print('LOOCV')
loocv(X, y, best_reg)

# The heading is derived from N_FOLDS; the last argument of k_fold_cv is
# presumably the number of folds — confirm in scripts/cross_validation.
print(f'\n{N_FOLDS}-fold')
k_fold_cv(X, y, best_reg, N_FOLDS)

Best parameters found:  {'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 3, 'min_samples_split': 5, 'n_estimators': 100}
LOOCV
Average R^2 (LOOCV): nan
Average MSE (LOOCV): 102.51863630390584
Average RMSE (LOOCV): 10.125148705273707
Average MAE (LOOCV): 5.027608188021575

10-fold
Mean MSE: 109.51601208334044
Mean MAE: 5.69637206465038
Mean R2: -0.27912060015233303
RMSE 10.46498982719718


In [6]:
# Re-run the LOOCV and k-fold evaluation of the tuned estimator on the
# outlier-filtered data.
# NOTE(review): best_reg's hyperparameters were selected by a grid search
# fitted on the unfiltered X, y; they are reused here without re-tuning on
# the filtered data.

# Fold count for the k-fold run. The printed heading is derived from it so
# the label always matches the number of folds actually used.
N_FOLDS = 6

print('LOOCV')
loocv(X_filtered, y_filtered, best_reg)

# The last argument of k_fold_cv is presumably the number of folds — confirm
# in scripts/cross_validation.
print(f'\n{N_FOLDS}-fold')
k_fold_cv(X_filtered, y_filtered, best_reg, N_FOLDS)

LOOCV
Average R^2 (LOOCV): nan
Average MSE (LOOCV): 35.115479465788056
Average RMSE (LOOCV): 5.9258315421371925
Average MAE (LOOCV): 4.044039064343017

10-fold
Mean MSE: 39.36248499084931
Mean MAE: 4.2790288697795855
Mean R2: -0.38081029192488874
RMSE 6.2739528999546454
