In [1]:
import polars as pl
import polars.selectors as cs
import sklearn
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Example with my data

In [3]:
(water_table_summaries := pl.read_csv("./data/water_quality_and_parcel_summaries_2004_to_2015MINE.csv"))

Year,LAKE_NAME,average_secchi_depth,average_total_phosphorus,FIN_SQ_FT_mean,GARAGESQFT_mean,EMV_TOTAL_mean,SALE_VALUE_mean,TOTAL_TAX_mean,FIN_SQ_FT_std,GARAGESQFT_std,EMV_TOTAL_std,SALE_VALUE_std,TOTAL_TAX_std,HEATING_null,COOLING_null
i64,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str
2008,,2.78125,0.019,,,,,,,,,,,,
2006,,3.075,0.022667,,,,,,,,,,,,
2005,,1.230143,0.082429,,,,,,,,,,,,
2013,,3.103571,0.0165,,,,,,,,,,,,
2009,,2.273333,0.030533,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2007,,1.7,0.0202,,,,,,,,,,,,
2009,,1.954167,0.03225,,,,,,,,,,,,
2010,,1.728571,0.029786,,,,,,,,,,,,
2011,,1.615385,0.033154,,,,,,,,,,,,


In [4]:
n = water_table_summaries.height

indices = np.random.permutation(n)

val_cutoff = int(0.3 * n)
train_cutoff = val_cutoff + int(0.7 * (n - val_cutoff))

labels = np.empty(n, dtype=object)
labels[indices[:val_cutoff]] = "validation"
labels[indices[val_cutoff:train_cutoff]] = "training"
labels[indices[train_cutoff:]] = "test"

(water_table_summaries := (water_table_summaries.with_columns(
    pl.Series("train_valid", labels))
)
)

Year,LAKE_NAME,average_secchi_depth,average_total_phosphorus,FIN_SQ_FT_mean,GARAGESQFT_mean,EMV_TOTAL_mean,SALE_VALUE_mean,TOTAL_TAX_mean,FIN_SQ_FT_std,GARAGESQFT_std,EMV_TOTAL_std,SALE_VALUE_std,TOTAL_TAX_std,HEATING_null,COOLING_null,train_valid
i64,str,f64,f64,str,str,str,str,str,str,str,str,str,str,str,str,str
2008,,2.78125,0.019,,,,,,,,,,,,,"""training"""
2006,,3.075,0.022667,,,,,,,,,,,,,"""training"""
2005,,1.230143,0.082429,,,,,,,,,,,,,"""test"""
2013,,3.103571,0.0165,,,,,,,,,,,,,"""validation"""
2009,,2.273333,0.030533,,,,,,,,,,,,,"""test"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2007,,1.7,0.0202,,,,,,,,,,,,,"""validation"""
2009,,1.954167,0.03225,,,,,,,,,,,,,"""validation"""
2010,,1.728571,0.029786,,,,,,,,,,,,,"""validation"""
2011,,1.615385,0.033154,,,,,,,,,,,,,"""test"""


In [5]:
feature_cols = [
    col for col in water_table_summaries.columns
    if col not in ['LAKE_NAME', 'train_valid', 'average_secchi_depth', 'average_total_phosphorus']
]

X = water_table_summaries.select(feature_cols).to_numpy()
y = water_table_summaries['average_secchi_depth', 'average_total_phosphorus'].to_numpy()

X_val = water_table_summaries.select(feature_cols).to_numpy()
y_val = water_table_summaries['average_secchi_depth', 'average_total_phosphorus'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 5
)

cart = DecisionTreeRegressor(random_state = 5)

cart_param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_cart = GridSearchCV(cart, cart_param_grid, cv = 5, scoring = 'neg_mean_squared_error')
grid_cart.fit(X_train, y_train)

best_cart = grid_cart.best_estimator_
print("Best CART Model:", best_cart)

rf = RandomForestRegressor(random_state = 5)

rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(rf, rf_param_grid, cv = 5, scoring = 'neg_mean_squared_error')
grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_
print("Best Random Forest Model:", best_rf)

models = {
    "CART": best_cart,
    "Random Forest": best_rf
}

for name, model in models.items():
    pred = model.predict(X_val)
    mse = mean_squared_error(y_val, pred)
    r2 = r2_score(y_val, pred)
    print(f"{name} -> Validation MSE: {mse:.2f}, R^2: {r2:.3f}")

Best CART Model: DecisionTreeRegressor(max_depth=3, random_state=5)
Best Random Forest Model: RandomForestRegressor(max_depth=5, n_estimators=300, random_state=5)
CART -> Validation MSE: 0.50, R^2: 0.006
Random Forest -> Validation MSE: 0.50, R^2: 0.008


# Example with povided data

In [6]:
(water_table := pl.read_csv('./data/water_quality_and_parcel_summaries_2004_to_2015.csv'))

Unnamed: 0_level_0,Monit_MAP_CODE1,Year,LAKE_NAME,Mean_Secchi_Depth_Result,Mean_Phosporus_Result,Mean_EMV_Total,STD_EMV_Total,Mean_Sale_Value,STD_Sale_Value,Mean_Total_Tax,STD_Total_Tax,Mean_Garage_Size,STD_Garage_Size,Mean_Fin_SQ_FT,STD_Fin_SQ_FT,Percentage_Yes_Basement,Percentage_Yes_Garage,Percentage_Yes_Tax_Exempt,Percentage_Air_Cooling,Percentage_AC_Cooling,Percentage_Central_Cooling,Percentage_Other_Cooling,Percentage_No_Cooling,Percentage_Air_Heating,Percentage_Space_Heating,Percentage_Water_Heating,Percentage_Electric_Heating,Percentage_Other_Heating,Percentage_No_Heating
i64,str,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0,"""02000500-01""",2014,"""George Watch Lake""",0.716667,0.108778,215637.592745,459023.922281,122485.615829,183414.899771,3313.685903,17837.814516,,,1849.473207,8220.487298,70.486397,0.0,7.914262,0.0,0.0,0.0,2.308326,97.691674,0.0,0.0,0.0,0.0,85.325639,14.674361
1,"""02000500-01""",2013,"""George Watch Lake""",0.365,0.3105,196764.633141,454192.377508,120601.723001,190129.461399,3319.081616,18199.952917,,,1841.112119,8221.467234,69.991756,0.0,7.914262,0.824402,0.0,0.0,1.154163,0.0,79.060181,0.824402,1.154163,2.802968,1.154163,0.0
2,"""02000500-01""",2012,"""George Watch Lake""",0.359,0.2649,200414.333057,480355.968668,118315.128418,190091.029827,3460.064623,19346.923479,,,1811.576636,8223.512958,70.173985,0.0,8.119304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"""02000500-01""",2011,"""George Watch Lake""",0.973333,0.119417,216297.932175,536941.029067,111218.812242,190312.451914,3459.313482,19320.28833,,,1780.282051,8215.092878,76.840364,0.0,8.271299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"""02000500-01""",2010,"""George Watch Lake""",0.493333,0.173,222789.430223,559656.083477,110765.119736,190448.272924,3384.645747,18646.626107,,,1760.876135,8210.519477,75.887696,0.0,8.092486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
523,"""82036800-01""",2008,"""Klawitter Pond""",0.596923,0.092462,420461.350844,802637.556411,128391.380863,182645.51138,412444.840525,798052.142235,,,0.0,0.0,0.0,0.0,1.876173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
524,"""82036800-01""",2007,"""Klawitter Pond""",0.491667,0.098083,396113.320826,487241.389735,131310.281426,178831.780564,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
525,"""82036800-01""",2006,"""Klawitter Pond""",0.6,0.1035,371583.114447,261100.184471,120477.452158,168187.904304,2743.87242,1971.437642,,,1634.529081,1160.042283,0.0,0.0,1.125704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
526,"""82036800-01""",2005,"""Klawitter Pond""",0.734615,0.155154,356141.619586,263138.264288,40758.19209,88168.335428,2646.779661,1937.204051,,,0.0,0.0,0.0,0.0,1.129944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
n = water_table.height

indices = np.random.permutation(n)

val_cutoff = int(0.3 * n)
train_cutoff = val_cutoff + int(0.7 * (n - val_cutoff))

labels = np.empty(n, dtype=object)
labels[indices[:val_cutoff]] = "validation"
labels[indices[val_cutoff:train_cutoff]] = "training"
labels[indices[train_cutoff:]] = "test"

(water_table := (water_table.with_columns(
    pl.Series("train_valid", labels))
)
)

Unnamed: 0_level_0,Monit_MAP_CODE1,Year,LAKE_NAME,Mean_Secchi_Depth_Result,Mean_Phosporus_Result,Mean_EMV_Total,STD_EMV_Total,Mean_Sale_Value,STD_Sale_Value,Mean_Total_Tax,STD_Total_Tax,Mean_Garage_Size,STD_Garage_Size,Mean_Fin_SQ_FT,STD_Fin_SQ_FT,Percentage_Yes_Basement,Percentage_Yes_Garage,Percentage_Yes_Tax_Exempt,Percentage_Air_Cooling,Percentage_AC_Cooling,Percentage_Central_Cooling,Percentage_Other_Cooling,Percentage_No_Cooling,Percentage_Air_Heating,Percentage_Space_Heating,Percentage_Water_Heating,Percentage_Electric_Heating,Percentage_Other_Heating,Percentage_No_Heating,train_valid
i64,str,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str
0,"""02000500-01""",2014,"""George Watch Lake""",0.716667,0.108778,215637.592745,459023.922281,122485.615829,183414.899771,3313.685903,17837.814516,,,1849.473207,8220.487298,70.486397,0.0,7.914262,0.0,0.0,0.0,2.308326,97.691674,0.0,0.0,0.0,0.0,85.325639,14.674361,"""training"""
1,"""02000500-01""",2013,"""George Watch Lake""",0.365,0.3105,196764.633141,454192.377508,120601.723001,190129.461399,3319.081616,18199.952917,,,1841.112119,8221.467234,69.991756,0.0,7.914262,0.824402,0.0,0.0,1.154163,0.0,79.060181,0.824402,1.154163,2.802968,1.154163,0.0,"""test"""
2,"""02000500-01""",2012,"""George Watch Lake""",0.359,0.2649,200414.333057,480355.968668,118315.128418,190091.029827,3460.064623,19346.923479,,,1811.576636,8223.512958,70.173985,0.0,8.119304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""validation"""
3,"""02000500-01""",2011,"""George Watch Lake""",0.973333,0.119417,216297.932175,536941.029067,111218.812242,190312.451914,3459.313482,19320.28833,,,1780.282051,8215.092878,76.840364,0.0,8.271299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""validation"""
4,"""02000500-01""",2010,"""George Watch Lake""",0.493333,0.173,222789.430223,559656.083477,110765.119736,190448.272924,3384.645747,18646.626107,,,1760.876135,8210.519477,75.887696,0.0,8.092486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""validation"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
523,"""82036800-01""",2008,"""Klawitter Pond""",0.596923,0.092462,420461.350844,802637.556411,128391.380863,182645.51138,412444.840525,798052.142235,,,0.0,0.0,0.0,0.0,1.876173,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""training"""
524,"""82036800-01""",2007,"""Klawitter Pond""",0.491667,0.098083,396113.320826,487241.389735,131310.281426,178831.780564,0.0,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""validation"""
525,"""82036800-01""",2006,"""Klawitter Pond""",0.6,0.1035,371583.114447,261100.184471,120477.452158,168187.904304,2743.87242,1971.437642,,,1634.529081,1160.042283,0.0,0.0,1.125704,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""validation"""
526,"""82036800-01""",2005,"""Klawitter Pond""",0.734615,0.155154,356141.619586,263138.264288,40758.19209,88168.335428,2646.779661,1937.204051,,,0.0,0.0,0.0,0.0,1.129944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""training"""


In [8]:
feature_cols = [
    col for col in water_table.columns
    if col not in ['Monit_MAP_CODE1', 'LAKE_NAME', 'train_valid', 'Mean_Secchi_Depth_Result', 'Mean_Phosporus_Result']
]

X = water_table.select(feature_cols).to_numpy()
y = water_table['Mean_Secchi_Depth_Result', 'Mean_Phosporus_Result'].to_numpy()

X_val = water_table.select(feature_cols).to_numpy()
y_val = water_table['Mean_Secchi_Depth_Result', 'Mean_Phosporus_Result'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 5
)

cart = DecisionTreeRegressor(random_state = 5)

cart_param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_cart = GridSearchCV(cart, cart_param_grid, cv = 5, scoring = 'neg_mean_squared_error')
grid_cart.fit(X_train, y_train)

best_cart = grid_cart.best_estimator_
print("Best CART Model:", best_cart)

rf = RandomForestRegressor(random_state = 5)

rf_param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}

grid_rf = GridSearchCV(rf, rf_param_grid, cv = 5, scoring = 'neg_mean_squared_error')
grid_rf.fit(X_train, y_train)

best_rf = grid_rf.best_estimator_
print("Best Random Forest Model:", best_rf)

models = {
    "CART": best_cart,
    "Random Forest": best_rf
}

for name, model in models.items():
    pred = model.predict(X_val)
    mse = mean_squared_error(y_val, pred)
    r2 = r2_score(y_val, pred)
    print(f"{name} -> Validation MSE: {mse:.2f}, R^2: {r2:.3f}")

Best CART Model: DecisionTreeRegressor(min_samples_leaf=2, min_samples_split=10, random_state=5)
Best Random Forest Model: RandomForestRegressor(n_estimators=300, random_state=5)
CART -> Validation MSE: 0.09, R^2: 0.780
Random Forest -> Validation MSE: 0.04, R^2: 0.922
