In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('Final_Merged_Data.csv')

# Handle missing values
data = data.dropna()

# Convert 'timestamp_sentinel2' to datetime
data['timestamp_sentinel2'] = pd.to_datetime(data['timestamp_sentinel2'])

# Define the target variable (y)
target = 'Turbidity_Buoy_(NTU)'

# Select features
features = [
    *[f'point{point}_B{band}' for band in [2, 3, 4, 8, 8, 8, 8, 11, 12] for point in range(1, 23)],
    *[f'point{point}_cs' for point in range(1, 23)],
    *[f'point{point}_cs_cdf' for point in range(1, 23)],
    'B2_AVG', 'B3_AVG', 'B4_AVG', 'B8_AVG', 'B8A_AVG', 'B11_AVG', 'B12_AVG',
    'cs_AVG', 'cs_cdf_AVG',
    'Ground_Measurements_time_diff_(seconds)', 'Lake_Height_(m)', 
    'PercentFull_Active_Lake_Storage_(%)', 'Snow_Volume_Opuha_Catchment_(mm)',
    'WDir(Deg)', 'WSpd(m/s)', 'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 
    'Rain(mm)', 'Tdry(C)', 'TWet(C)', 'RH(%)', 'Tmax(C)', 'Tmin(C)', 
    'Pmsl(hPa)', 'Pstn(hPa)', 'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)'
]

X = data[features]
y = data[target]

# Normalise the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Perform GridSearchCV
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the best model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Best Random Forest Mean Squared Error: {mse_rf}")
print(f"Best Random Forest R-squared: {r2_rf}")

# Feature importance
importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df.head(20))  # Display top 20 important features


Fitting 3 folds for each of 648 candidates, totalling 1944 fits


648 fits failed out of a total of 1944.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
229 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Program Files\Anaconda3\envs\ee\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Program Files\Anaconda3\envs\ee\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Program Files\Anaconda3\envs\ee\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Program Files\Anaconda3\envs\ee\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_para

Best Random Forest Mean Squared Error: 134.1881908065569
Best Random Forest R-squared: 0.5314012086581434
                              Feature  Importance
254  Snow_Volume_Opuha_Catchment_(mm)    0.025231
268            Water_Temp_Buoy_(degC)    0.014855
269        Water_Temp_Platform_(degC)    0.013508
261                           Tdry(C)    0.012450
11                         point12_B2    0.011399
230                    point11_cs_cdf    0.010998
241                    point22_cs_cdf    0.010601
240                    point21_cs_cdf    0.010544
233                    point14_cs_cdf    0.010447
229                    point10_cs_cdf    0.010314
231                    point12_cs_cdf    0.009546
264                           Tmax(C)    0.009423
263                             RH(%)    0.008975
220                     point1_cs_cdf    0.008760
172                       point19_B11    0.008611
160                        point7_B11    0.008155
222                     point3_cs_cdf    0.0

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the data
data = pd.read_csv('Final_Merged_Data.csv')

# Handle missing values
data = data.dropna()

# Convert 'timestamp_sentinel2' to datetime
data['timestamp_sentinel2'] = pd.to_datetime(data['timestamp_sentinel2'])

# Define the target variable (y)
target = 'Turbidity_Buoy_(NTU)'

# Select features
features = [
    'B2_AVG', 'B3_AVG', 'B4_AVG', 'B8_AVG', 'B8A_AVG', 'B11_AVG', 'B12_AVG',
    'cs_AVG', 'cs_cdf_AVG',
    'Ground_Measurements_time_diff_(seconds)', 'Lake_Height_(m)', 
    'PercentFull_Active_Lake_Storage_(%)', 'Snow_Volume_Opuha_Catchment_(mm)',
    'WDir(Deg)', 'WSpd(m/s)', 'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 
    'Rain(mm)', 'Tdry(C)', 'TWet(C)', 'RH(%)', 'Tmax(C)', 'Tmin(C)', 
    'Pmsl(hPa)', 'Pstn(hPa)', 'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)'
]

X = data[features]
y = data[target]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2', 0.2, 0.5, 0.7],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Perform GridSearchCV
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred_rf = best_rf_model.predict(X_test)

# Evaluate the best model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Best Random Forest Mean Squared Error: {mse_rf}")
print(f"Best Random Forest R-squared: {r2_rf}")

# Feature importance
importances = best_rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': features, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

print(feature_importance_df.head(20))  # Display top 20 important features


Fitting 3 folds for each of 1080 candidates, totalling 3240 fits
