In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV

# Load the data
data = pd.read_csv('Data_Merged.csv')

# Handle missing values
data = data.dropna()

# Convert 'timestamp_sentinel2' to datetime
data['timestamp_sentinel2'] = pd.to_datetime(data['timestamp_sentinel2'])

# Define the target variable (y)
target = 'Turbidity_Buoy_(NTU)'

# Select features
features = [
    'B2_AVG', 'B3_AVG', 'B4_AVG', 'B8_AVG', 'B8A_AVG', 'B11_AVG', 'B12_AVG',
    'cs_AVG', 'cs_cdf_AVG',
    'Ground_Measurements_time_diff_(seconds)', 'Lake_Height_(m)', 
    'PercentFull_Active_Lake_Storage_(%)', 'Snow_Volume_Opuha_Catchment_(mm)',
    'WDir(Deg)', 'WSpd(m/s)', 'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 
    'Rain(mm)', 'Tdry(C)', 'TWet(C)', 'RH(%)', 'Tmax(C)', 'Tmin(C)', 
    'Pmsl(hPa)', 'Pstn(hPa)', 'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)'
]

X = data[features]
y = data[target]

# Normalise the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define hyperparameter grid for Lasso with increased iterations and more alpha values
lasso_params = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'max_iter': [10000, 50000, 100000]}

# Define hyperparameter grid for Ridge with increased iterations
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100], 'max_iter': [1000, 5000, 10000]}

# Perform GridSearchCV for Lasso
lasso = Lasso()
lasso_cv = GridSearchCV(lasso, lasso_params, cv=5)
lasso_cv.fit(X_train, y_train)

# Perform GridSearchCV for Ridge
ridge = Ridge()
ridge_cv = GridSearchCV(ridge, ridge_params, cv=5)
ridge_cv.fit(X_train, y_train)

# Get the best models
best_lasso = lasso_cv.best_estimator_
best_ridge = ridge_cv.best_estimator_

# Feature selection using RFE with cross-validation
selector = RFECV(best_lasso, step=1, cv=5)
selector = selector.fit(X_train, y_train)

# Select the most important features
selected_features = X_train.columns[selector.support_]

# Train the Lasso model with selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)
best_lasso.fit(X_train_selected, y_train)

# Make predictions with the best Lasso model
y_pred_lasso = best_lasso.predict(X_test_selected)

# Evaluate the best Lasso model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

# Train the Ridge model with selected features
best_ridge.fit(X_train_selected, y_train)

# Make predictions with the best Ridge model
y_pred_ridge = best_ridge.predict(X_test_selected)

# Evaluate the best Ridge model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Best Lasso Regression Mean Squared Error: {mse_lasso}")
print(f"Best Lasso Regression R-squared: {r2_lasso}")

print(f"Best Ridge Regression Mean Squared Error: {mse_ridge}")
print(f"Best Ridge Regression R-squared: {r2_ridge}")

print("Selected features:", selected_features)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Best Lasso Regression Mean Squared Error: 168.109168753274
Best Lasso Regression R-squared: 0.412945708428024
Best Ridge Regression Mean Squared Error: 180.7113875073894
Best Ridge Regression R-squared: 0.36893748057347875
Selected features: Index(['B2_AVG', 'B4_AVG', 'B8A_AVG', 'cs_cdf_AVG', 'Lake_Height_(m)',
       'PercentFull_Active_Lake_Storage_(%)', 'Water_Temp_Buoy_(degC)',
       'Water_Temp_Platform_(degC)'],
      dtype='object')


In [None]:

# Best Lasso Regression Mean Squared Error: 168.109168753274
# Best Lasso Regression R-squared: 0.412945708428024
# Best Ridge Regression Mean Squared Error: 180.7113875073894
# Best Ridge Regression R-squared: 0.36893748057347875


# Selected features: Index(['B2_AVG', 'B4_AVG', 'B8A_AVG', 'cs_cdf_AVG', 'Lake_Height_(m)',
#        'PercentFull_Active_Lake_Storage_(%)', 'Water_Temp_Buoy_(degC)',
#        'Water_Temp_Platform_(degC)'],
#       dtype='object')
