In [2]:
import sklearn
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import polars.selectors as cs

## Load the Data

In [3]:
(parcel_data := pl.read_csv("./data/water_quality_and_parcel_summaries_2004_to_2015.csv")
 .drop('DNR_ID_Site_Number', 'Year', 'LAKE_NAME')
)

avg_secchi_depth,avg_total_phosphorus,ACRES_POLY_mean,FIN_SQ_FT_mean,EMV_TOTAL_mean,TOTAL_TAX_mean,GARAGE_mean,BASEMENT_mean,OTHER_COOLING,UNKNOWN_COOLING,FORCED AIR_COOLING,N_COOLING,Y_COOLING,1 AC UNIT_COOLING,A/CON_COOLING,CENTRAL W/AIR COND_COOLING,1_COOLING,0_COOLING,CONDOMINIUM,UNKNOWN_DWELL_TYPE,OTHER_DWELL,SINGLE-FAMILY / OWNER OCCUPIED,TOWNHOUSE,S.FAM.RES
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.359091,0.148182,0.901842,1431.351641,338015.338245,0.0,0.778969,0.852646,0.0,0.147354,0.0,0.00067,0.851976,0.0,0.0,0.0,0.0,0.0,0.144675,0.147354,0.0,0.303416,0.404555,0.0
0.964333,0.082083,0.864665,1388.569201,376544.834308,0.0,0.755686,0.82716,0.0,0.17284,0.0,0.00065,0.826511,0.0,0.0,0.0,0.0,0.0,0.140351,0.17284,0.0,0.294347,0.392463,0.0
1.3,0.092538,0.463816,1769.156433,250127.55848,0.0,0.944079,0.944444,0.0,0.055556,0.0,0.013889,0.930556,0.0,0.0,0.0,0.0,0.0,0.062865,0.055556,0.000731,0.788012,0.092836,0.0
1.571429,0.070571,0.254844,1688.303912,191261.27551,2441.579762,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026531,0.093878,0.0,0.361054,0.518537
2.127594,0.07425,0.541326,1048.443524,225295.813345,4032.902893,0.913941,0.036633,0.0,0.248728,0.751272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2.485714,0.021714,0.640889,2034.265359,388570.065359,4587.40915,0.895425,0.0,0.010458,0.129412,0.0,0.0,0.0,0.860131,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0.4,0.157,0.0,0.0,399197.073546,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0.596923,0.092462,3.7903,0.0,420461.350844,412444.840525,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
0.917857,0.314143,36.750676,1499.175676,391657.432432,1748.155405,0.641892,0.0,0.027027,0.513514,0.0,0.0,0.0,0.459459,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


# Set up Parameter Grids

In [4]:
param_grid_dt = {'max_depth': [None, 1,2,3,4,5],
              'min_samples_split' : [1, 5, 10],
               'min_samples_leaf' : [1, 5, 10],
              }

In [5]:
param_grid_rf = {'n_estimators': 10 ** np.arange(1, 3),
              'max_depth': [None, 1,2,3,4,5],
              'min_samples_split' : [1, 5, 10],
               'min_samples_leaf' : [1, 5, 10],
              }

# Secchi Depth Model

## Set up Dataframe

In [6]:
X = parcel_data.drop('avg_secchi_depth', 'avg_total_phosphorus').to_pandas()
y = parcel_data['avg_secchi_depth'].to_pandas()

In [7]:
X.head()

Unnamed: 0,ACRES_POLY_mean,FIN_SQ_FT_mean,EMV_TOTAL_mean,TOTAL_TAX_mean,GARAGE_mean,BASEMENT_mean,OTHER_COOLING,UNKNOWN_COOLING,FORCED AIR_COOLING,N_COOLING,...,A/CON_COOLING,CENTRAL W/AIR COND_COOLING,1_COOLING,0_COOLING,CONDOMINIUM,UNKNOWN_DWELL_TYPE,OTHER_DWELL,SINGLE-FAMILY / OWNER OCCUPIED,TOWNHOUSE,S.FAM.RES
0,0.901842,1431.351641,338015.338245,0.0,0.778969,0.852646,0.0,0.147354,0.0,0.00067,...,0.0,0.0,0.0,0.0,0.144675,0.147354,0.0,0.303416,0.404555,0.0
1,0.864665,1388.569201,376544.834308,0.0,0.755686,0.82716,0.0,0.17284,0.0,0.00065,...,0.0,0.0,0.0,0.0,0.140351,0.17284,0.0,0.294347,0.392463,0.0
2,0.463816,1769.156433,250127.55848,0.0,0.944079,0.944444,0.0,0.055556,0.0,0.013889,...,0.0,0.0,0.0,0.0,0.062865,0.055556,0.000731,0.788012,0.092836,0.0
3,0.254844,1688.303912,191261.27551,2441.579762,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.026531,0.093878,0.0,0.361054,0.518537
4,0.541326,1048.443524,225295.813345,4032.902893,0.913941,0.036633,0.0,0.248728,0.751272,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
y.head()

0    0.359091
1    0.964333
2    1.300000
3    1.571429
4    2.127594
Name: avg_secchi_depth, dtype: float64

## Create the Training/Test/Validation Sets with `sklearn`

In [9]:
from sklearn.model_selection import train_test_split

X_temp_secchi, X_val_secchi, y_temp_secchi, y_val_secchi = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
X_train_secchi, X_test_secchi, y_train_secchi, y_test_secchi = train_test_split(X_temp_secchi, y_temp_secchi,test_size=0.3, random_state=67)

In [15]:
X_temp_secchi.shape

(369, 22)

In [16]:
y_temp_secchi.shape

(369,)

In [17]:
X_train_secchi.shape

(258, 22)

In [18]:
X_test_secchi.shape

(111, 22)

In [19]:
y_train_secchi.shape

(258,)

In [20]:
y_test_secchi.shape

(111,)

In [22]:
X_val_secchi.shape

(159, 22)

In [23]:
y_val_secchi.shape

(159,)

## Cross-Validation Setup

In [24]:
from sklearn.model_selection import KFold, GridSearchCV

state = 458

cv_obj = KFold(n_splits=10, shuffle=True, random_state=state)
cv_obj

KFold(n_splits=10, random_state=458, shuffle=True)

##  Grid Search for CART Model

In [25]:
from sklearn.tree import DecisionTreeRegressor

grid_search_dt_secchi = GridSearchCV(DecisionTreeRegressor(), param_grid_dt, verbose=3, cv=cv_obj)

In [26]:
grid_search_dt_secchi.fit(X_train_secchi, y_train_secchi)

Fitting 10 folds for each of 54 candidates, totalling 540 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 9/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 10/10] END max_depth=Non

180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_paramet

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [1, 5, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,5
,min_samples_split,5
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## Grid Search for Random Forest Model

In [27]:
from sklearn.ensemble import RandomForestRegressor

grid_search_rf_secchi = GridSearchCV(RandomForestRegressor(), param_grid_rf, verbose=3, cv=cv_obj)

In [28]:
grid_search_rf_secchi.fit(X_train_secchi, y_train_secchi)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parame

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [1, 5, ...], 'n_estimators': array([ 10, 100])}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(100)
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Investigate Best Models From the Search

In [29]:
grid_search_dt_secchi.best_score_

np.float64(0.22760504135790077)

In [30]:
grid_search_dt_secchi.best_params_

{'max_depth': 5, 'min_samples_leaf': 5, 'min_samples_split': 5}

In [31]:
grid_search_rf_secchi.best_score_

np.float64(0.336098433133004)

In [32]:
grid_search_rf_secchi.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': np.int64(100)}

In [33]:
grid_search_rf_secchi.best_score_ > grid_search_dt_secchi.best_score_

np.True_

## Fit the Models with the Test Data

In [34]:
y_val_test_dt_secchi = grid_search_dt_secchi.predict(X_test_secchi)

In [35]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error

{'R^2': explained_variance_score(y_test_secchi, y_val_test_dt_secchi),
 'MSE': mean_squared_error(y_test_secchi, y_val_test_dt_secchi),
 'MAE': mean_absolute_error(y_test_secchi, y_val_test_dt_secchi),
}

{'R^2': 0.3076928509701963,
 'MSE': 0.7834963635593454,
 'MAE': 0.6755197520544077}

In [36]:
y_val_test_rf_secchi = grid_search_rf_secchi.predict(X_test_secchi)

In [37]:
{'R^2': explained_variance_score(y_test_secchi, y_val_test_rf_secchi),
 'MSE': mean_squared_error(y_test_secchi, y_val_test_rf_secchi),
 'MAE': mean_absolute_error(y_test_secchi, y_val_test_rf_secchi),
}

{'R^2': 0.5346380367142701,
 'MSE': 0.5278318604178912,
 'MAE': 0.5548685957729624}

## Re-Fit the Best Model (RF) on the 70% NOT Validation (Train + Test)

In [40]:
grid_search_rf_secchi.fit(X_temp_secchi, y_temp_secchi)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parame

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [1, 5, ...], 'n_estimators': array([ 10, 100])}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(100)
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [41]:
grid_search_rf_secchi.best_score_

np.float64(0.5017632169455882)

In [42]:
grid_search_rf_secchi.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': np.int64(100)}

## Test the Best Model (RF) on the Validation Data

In [43]:
y_val_pred_rf_secchi = grid_search_rf_secchi.predict(X_val_secchi)

In [44]:
{'R^2': explained_variance_score(y_val_secchi, y_val_pred_rf_secchi),
 'MSE': mean_squared_error(y_val_secchi, y_val_pred_rf_secchi),
 'MAE': mean_absolute_error(y_val_secchi, y_val_pred_rf_secchi),
}

{'R^2': 0.5489726167382278,
 'MSE': 0.5189670079923241,
 'MAE': 0.4982833993816203}

The Random Forest was the best-performing model for predicting Secchi depth. On the 30% validation set, it achieved:

R² = 0.55

MSE = 0.52

MAE = 0.50

This means the model explains about 55% of the variation in secchi depth, which is strong all things considered. Predictions were, on average, within about 0.5 meters of observed Secchi depth.

# Total Phosphorus Model

## Set up Dataframe

In [45]:
X_p = parcel_data.drop('avg_secchi_depth', 'avg_total_phosphorus').to_pandas()
y_p = parcel_data['avg_total_phosphorus'].to_pandas()

In [46]:
X_p.head()

Unnamed: 0,ACRES_POLY_mean,FIN_SQ_FT_mean,EMV_TOTAL_mean,TOTAL_TAX_mean,GARAGE_mean,BASEMENT_mean,OTHER_COOLING,UNKNOWN_COOLING,FORCED AIR_COOLING,N_COOLING,...,A/CON_COOLING,CENTRAL W/AIR COND_COOLING,1_COOLING,0_COOLING,CONDOMINIUM,UNKNOWN_DWELL_TYPE,OTHER_DWELL,SINGLE-FAMILY / OWNER OCCUPIED,TOWNHOUSE,S.FAM.RES
0,0.901842,1431.351641,338015.338245,0.0,0.778969,0.852646,0.0,0.147354,0.0,0.00067,...,0.0,0.0,0.0,0.0,0.144675,0.147354,0.0,0.303416,0.404555,0.0
1,0.864665,1388.569201,376544.834308,0.0,0.755686,0.82716,0.0,0.17284,0.0,0.00065,...,0.0,0.0,0.0,0.0,0.140351,0.17284,0.0,0.294347,0.392463,0.0
2,0.463816,1769.156433,250127.55848,0.0,0.944079,0.944444,0.0,0.055556,0.0,0.013889,...,0.0,0.0,0.0,0.0,0.062865,0.055556,0.000731,0.788012,0.092836,0.0
3,0.254844,1688.303912,191261.27551,2441.579762,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.026531,0.093878,0.0,0.361054,0.518537
4,0.541326,1048.443524,225295.813345,4032.902893,0.913941,0.036633,0.0,0.248728,0.751272,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [47]:
y_p.head()

0    0.148182
1    0.082083
2    0.092538
3    0.070571
4    0.074250
Name: avg_total_phosphorus, dtype: float64

## Create the Training/Test/Validation Sets with `sklearn`

In [48]:
X_temp_phos, X_val_phos, y_temp_phos, y_val_phos = train_test_split(X_p, y_p, test_size=0.3, random_state=99)

In [49]:
X_train_phos, X_test_phos, y_train_phos, y_test_phos = train_test_split(X_temp_phos, y_temp_phos,test_size=0.3, random_state=183)

In [50]:
X_temp_phos.shape

(369, 22)

In [51]:
y_temp_phos.shape

(369,)

In [76]:
X_train_phos.shape

(258, 22)

In [77]:
y_train_phos.shape

(258,)

In [78]:
X_test_phos.shape

(111, 22)

In [80]:
y_test_phos.shape

(111,)

In [52]:
X_val_phos.shape

(159, 22)

In [53]:
y_val_phos.shape

(159,)

## Cross-Validation Setup

In [55]:
state = 752

cv_obj_2 = KFold(n_splits=10, shuffle=True, random_state=state)
cv_obj_2

KFold(n_splits=10, random_state=752, shuffle=True)

## Grid Search for CART Model

In [56]:
grid_search_dt_phos = GridSearchCV(DecisionTreeRegressor(), param_grid_dt, verbose=3, cv=cv_obj_2)

In [57]:
grid_search_dt_phos.fit(X_train_phos, y_train_phos)

Fitting 10 folds for each of 54 candidates, totalling 540 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 9/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1;, score=nan total time=   0.0s
[CV 10/10] END max_depth=Non

180 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_paramet

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [1, 5, ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,5
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## Grid Search for Random Forest Model

In [58]:
grid_search_rf_phos = GridSearchCV(RandomForestRegressor(), param_grid_rf, verbose=3, cv=cv_obj_2)

In [59]:
grid_search_rf_phos.fit(X_train_phos, y_train_phos)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parame

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [1, 5, ...], 'n_estimators': array([ 10, 100])}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(100)
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## Investigate Best Models From the Search

In [60]:
grid_search_dt_phos.best_score_

np.float64(0.3218516991427186)

In [61]:
grid_search_dt_phos.best_params_

{'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}

In [62]:
grid_search_rf_phos.best_score_

np.float64(0.43202433407963203)

In [63]:
grid_search_rf_phos.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': np.int64(100)}

In [64]:
grid_search_rf_phos.best_score_ > grid_search_dt_phos.best_score_

np.True_

## Fit the Models with the Test Data

In [65]:
y_val_test_dt_phos = grid_search_dt_phos.predict(X_test_phos)

In [66]:
{'R^2': explained_variance_score(y_test_phos, y_val_test_dt_phos),
 'MSE': mean_squared_error(y_test_phos, y_val_test_dt_phos),
 'MAE': mean_absolute_error(y_test_phos, y_val_test_dt_phos),
}

{'R^2': 0.26031517576952834,
 'MSE': 0.002783656521760695,
 'MAE': 0.03929187912403145}

In [67]:
y_val_test_rf_phos = grid_search_rf_phos.predict(X_test_phos)

In [68]:
{'R^2': explained_variance_score(y_test_phos, y_val_test_rf_phos),
 'MSE': mean_squared_error(y_test_phos, y_val_test_rf_phos),
 'MAE': mean_absolute_error(y_test_phos, y_val_test_rf_phos),
}

{'R^2': 0.5630230620782897,
 'MSE': 0.0016432695398817506,
 'MAE': 0.029340024944708196}

## Re-Fit the Best Model (RF) on the 70% NOT Validation (Train + Test)

In [69]:
grid_search_rf_phos.fit(X_temp_phos, y_temp_phos)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
[CV 1/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 2/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 3/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 4/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 5/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 6/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 7/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   0.0s
[CV 8/10] END max_depth=None, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\ol3399hi\AppData\Local\anaconda3\envs\polars\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parame

0,1,2
,estimator,RandomForestRegressor()
,param_grid,"{'max_depth': [None, 1, ...], 'min_samples_leaf': [1, 5, ...], 'min_samples_split': [1, 5, ...], 'n_estimators': array([ 10, 100])}"
,scoring,
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,3
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,np.int64(100)
,criterion,'squared_error'
,max_depth,
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [70]:
grid_search_rf_phos.best_score_

np.float64(0.5273070741390138)

In [71]:
grid_search_rf_phos.best_params_

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'n_estimators': np.int64(100)}

## Test the Best Model (RF) on the Validation Data

In [72]:
y_val_pred_rf_phos = grid_search_rf_phos.predict(X_val_phos)

In [73]:
{'R^2': explained_variance_score(y_val_phos, y_val_pred_rf_phos),
 'MSE': mean_squared_error(y_val_phos, y_val_pred_rf_phos),
 'MAE': mean_absolute_error(y_val_phos, y_val_pred_rf_phos),
}

{'R^2': 0.6360661776457004,
 'MSE': 0.0015337396893206668,
 'MAE': 0.024907203777602117}

The Random Forest was the best model of total phosphorus as well. On the 30% validation set, it achieved:

R² = 0.64

MSE = 0.00153

MAE = 0.025

This indicates the model explains about 64% of the variation in total phosphorus, with predictions typically within 0.025 units of observed values.