# This block only processes data for DAL LAKE

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load dataset
df = pd.read_csv('set.csv')

# This cell models a single lake, but 'set.csv' contains several lakes.
# Restrict to the target lake so that cross-validation, the yearly trend
# (diff across consecutive rows) and the "last observed" row are not mixed
# with rows that belong to another lake.
TARGET_LAKE = 'Dal lake'
lake_names = df['Name of Lake'].astype(str).str.strip().str.casefold()
df_lake = df[lake_names == TARGET_LAKE.casefold()].reset_index(drop=True)
if len(df_lake) < 2:
    # Leave-one-out CV and the diff-based trend both need at least two rows.
    raise ValueError(
        f"Need at least 2 rows for {TARGET_LAKE!r} in 'set.csv', found {len(df_lake)}"
    )

# --- Features ---
numeric_features = ['Min Temperature', 'Max Temperature', 'Min Dissolved Oxygen', 'Max Dissolved Oxygen',
                    'Min pH', 'Max pH', 'Min Conductivity', 'Max Conductivity', 'Min BOD', 'Max BOD']

# Derived column name -> (max column, min column)
RANGE_FEATURES = {
    'Temp_Range': ('Max Temperature', 'Min Temperature'),
    'DO_Range': ('Max Dissolved Oxygen', 'Min Dissolved Oxygen'),
    'pH_Range': ('Max pH', 'Min pH'),
    'Conductivity_Range': ('Max Conductivity', 'Min Conductivity'),
    'BOD_Range': ('Max BOD', 'Min BOD'),
}


def _add_range_features(frame):
    """Return a copy of *frame* with one ``Max - Min`` column per parameter."""
    out = frame.copy()
    for range_col, (max_col, min_col) in RANGE_FEATURES.items():
        out[range_col] = out[max_col] - out[min_col]
    return out


def _make_models():
    """Return fresh, unfitted (SVR, random forest, gradient boosting) regressors."""
    return (
        MultiOutputRegressor(SVR(kernel='linear')),
        MultiOutputRegressor(RandomForestRegressor(n_estimators=50, max_depth=3, random_state=42)),
        MultiOutputRegressor(GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42)),
    )


X = _add_range_features(df_lake[numeric_features])
y = df_lake[numeric_features].copy()

# NOTE(review): X contains every column of y (plus ranges derived from them),
# so the models are asked to reproduce their own inputs. The LOO scores below
# therefore measure that reconstruction, not forecasting skill; the forecast
# itself is driven by the linear trend extrapolation further down.

# Scaler for the final model (fitted on all rows of the target lake)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# --- Leave-One-Out CV ---
loo = LeaveOneOut()
y_pred = np.zeros_like(y.values, dtype=float)
X_raw = X.values

for train_idx, test_idx in loo.split(X_raw):
    # Fit the scaler on the training rows only, so the held-out row does not
    # influence the mean/std used to scale it (avoids preprocessing leakage).
    fold_scaler = StandardScaler().fit(X_raw[train_idx])
    X_train = fold_scaler.transform(X_raw[train_idx])
    X_test = fold_scaler.transform(X_raw[test_idx])
    y_train = y.values[train_idx]

    # Average the predictions of the three regressors
    fold_preds = [model.fit(X_train, y_train).predict(X_test) for model in _make_models()]
    y_pred[test_idx] = sum(fold_preds) / len(fold_preds)

# --- Metrics ---
print("\n--- Model Metrics (LOO CV) ---")
for i, col in enumerate(y.columns):
    r2 = r2_score(y[col], y_pred[:, i])
    mae = mean_absolute_error(y[col], y_pred[:, i])
    rmse = np.sqrt(mean_squared_error(y[col], y_pred[:, i]))
    print(f"{col} -> R²: {r2:.3f}, MAE: {mae:.3f}, RMSE: {rmse:.3f}")

# --- Forecast 2024-2026 ---
# Simple trend: average change between consecutive rows.
# Assumes the rows of the target lake are stored in chronological order.
forecast_years = [2024, 2025, 2026]
increments = df_lake[numeric_features].diff().mean()
last_year_values = df_lake[numeric_features].iloc[-1].copy()

forecast_values = []
for _ in forecast_years:
    next_vals = last_year_values + increments
    forecast_values.append(next_vals)
    last_year_values = next_vals

# Same derived columns, in the same order, as the training matrix
X_future = _add_range_features(pd.DataFrame(forecast_values))

# Scale future features
X_future_scaled = scaler.transform(X_future)

# Train a fresh ensemble on all rows of the target lake (rather than reusing
# the estimator objects left over from the last CV fold) and average them.
svr, rf, gb = _make_models()
svr.fit(X_scaled, y)
rf.fit(X_scaled, y)
gb.fit(X_scaled, y)

y_forecast = (svr.predict(X_future_scaled) + rf.predict(X_future_scaled) + gb.predict(X_future_scaled)) / 3

# Create final forecast DataFrame
forecast_df = pd.DataFrame(y_forecast, columns=numeric_features)
forecast_df.insert(0, 'Year', forecast_years)
forecast_df.insert(0, 'Name of Lake', df_lake['Name of Lake'].iloc[0])

# Save to CSV
forecast_df.to_csv('forecast_2024_2026.csv', index=False)
print("\nForecast saved to 'forecast_2024_2026.csv'")
print(forecast_df)



--- Model Metrics (LOO CV) ---
Min Temperature -> R²: 0.872, MAE: 1.254, RMSE: 1.827
Max Temperature -> R²: 0.770, MAE: 0.822, RMSE: 1.543
Min Dissolved Oxygen -> R²: 0.894, MAE: 0.228, RMSE: 0.344
Max Dissolved Oxygen -> R²: 0.907, MAE: 0.234, RMSE: 0.330
Min pH -> R²: 0.199, MAE: 0.112, RMSE: 0.214
Max pH -> R²: 0.748, MAE: 0.106, RMSE: 0.173
Min Conductivity -> R²: 0.318, MAE: 22.668, RMSE: 38.764
Max Conductivity -> R²: 0.627, MAE: 24.515, RMSE: 36.463
Min BOD -> R²: 0.676, MAE: 0.180, RMSE: 0.304
Max BOD -> R²: 0.771, MAE: 0.414, RMSE: 0.644

Forecast saved to 'forecast_2024_2026.csv'
  Name of Lake  Year  Min Temperature  Max Temperature  Min Dissolved Oxygen  \
0     Dal lake  2024         6.354462        24.582020              4.945551   
1     Dal lake  2025         6.520632        24.509006              4.962628   
2     Dal lake  2026         6.651166        24.502033              4.961647   

   Max Dissolved Oxygen    Min pH    Max pH  Min Conductivity  \
0             10

# Same pipeline, run separately for each lake in the dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load dataset
df = pd.read_csv('set.csv')

numeric_features = ['Min Temperature', 'Max Temperature', 'Min Dissolved Oxygen', 'Max Dissolved Oxygen',
                    'Min pH', 'Max pH', 'Min Conductivity', 'Max Conductivity', 'Min BOD', 'Max BOD']

forecast_years = [2024, 2025, 2026]
all_forecasts = []

# Derived column name -> (max column, min column)
RANGE_FEATURES = {
    'Temp_Range': ('Max Temperature', 'Min Temperature'),
    'DO_Range': ('Max Dissolved Oxygen', 'Min Dissolved Oxygen'),
    'pH_Range': ('Max pH', 'Min pH'),
    'Conductivity_Range': ('Max Conductivity', 'Min Conductivity'),
    'BOD_Range': ('Max BOD', 'Min BOD'),
}


def _add_range_features(frame):
    """Return a copy of *frame* with one ``Max - Min`` column per parameter."""
    out = frame.copy()
    for range_col, (max_col, min_col) in RANGE_FEATURES.items():
        out[range_col] = out[max_col] - out[min_col]
    return out


def _make_models():
    """Return fresh, unfitted (SVR, random forest, gradient boosting) regressors."""
    return (
        MultiOutputRegressor(SVR(kernel='linear')),
        MultiOutputRegressor(RandomForestRegressor(n_estimators=50, max_depth=3, random_state=42)),
        MultiOutputRegressor(GradientBoostingRegressor(n_estimators=50, max_depth=3, random_state=42)),
    )


# Loop through each lake
for lake in df['Name of Lake'].unique():
    print(f"\nProcessing {lake}...")

    df_lake = df[df['Name of Lake'] == lake].reset_index(drop=True)
    observed = df_lake[numeric_features]

    # NOTE(review): X contains every column of y (plus ranges derived from
    # them), so the LOO scores below measure how well the ensemble reproduces
    # its own inputs, not forecasting skill.
    X = _add_range_features(observed)
    y = observed.copy()

    # Scaler for the final model (fitted on all rows of this lake)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if len(df_lake) > 1:
        # --- LOO CV ---
        loo = LeaveOneOut()
        y_pred = np.zeros_like(y.values, dtype=float)
        X_raw = X.values
        for train_idx, test_idx in loo.split(X_raw):
            # Fit the scaler on the training rows only, so the held-out row
            # does not influence the statistics used to scale it.
            fold_scaler = StandardScaler().fit(X_raw[train_idx])
            X_train = fold_scaler.transform(X_raw[train_idx])
            X_test = fold_scaler.transform(X_raw[test_idx])
            y_train = y.values[train_idx]

            fold_preds = [model.fit(X_train, y_train).predict(X_test) for model in _make_models()]
            y_pred[test_idx] = sum(fold_preds) / len(fold_preds)

        print(f"\n--- Model Metrics for {lake} (LOO CV) ---")
        for i, col in enumerate(y.columns):
            r2 = r2_score(y[col], y_pred[:, i])
            mae = mean_absolute_error(y[col], y_pred[:, i])
            rmse = np.sqrt(mean_squared_error(y[col], y_pred[:, i]))
            print(f"{col} -> R²: {r2:.3f}, MAE: {mae:.3f}, RMSE: {rmse:.3f}")

        # --- Forecast inputs: extrapolate the average row-to-row change ---
        # Assumes the rows of each lake are stored in chronological order.
        increments = observed.diff().mean().fillna(0)
        last_vals = observed.iloc[-1]
        forecast_values = []
        for _ in forecast_years:
            next_vals = last_vals + increments
            forecast_values.append(next_vals)
            last_vals = next_vals
        forecast_values = pd.DataFrame(forecast_values, columns=numeric_features)
    else:
        print(f"\nSkipping LOO CV for {lake} (only 1 row of data)")

        # --- Forecast inputs: no trend available, repeat the single row ---
        last_vals = observed.iloc[0]
        forecast_values = pd.DataFrame([last_vals.values]*len(forecast_years), columns=numeric_features)

    # Same derived columns, in the same order, as the training matrix
    forecast_values = _add_range_features(forecast_values)

    # Scale future features
    X_future_scaled = scaler.transform(forecast_values)

    # Train ensemble on full lake data
    svr, rf, gb = _make_models()
    svr.fit(X_scaled, y)
    rf.fit(X_scaled, y)
    gb.fit(X_scaled, y)

    y_forecast = (svr.predict(X_future_scaled) + rf.predict(X_future_scaled) + gb.predict(X_future_scaled)) / 3

    forecast_df = pd.DataFrame(y_forecast, columns=numeric_features)
    forecast_df.insert(0, 'Year', forecast_years)
    forecast_df.insert(0, 'Name of Lake', lake)

    all_forecasts.append(forecast_df)

# Combine forecasts for all lakes
final_forecast_df = pd.concat(all_forecasts, ignore_index=True)
final_forecast_df.to_csv('forecast_all_lakes_2024_2026.csv', index=False)
print("\nForecast saved to 'forecast_all_lakes_2024_2026.csv'")
print(final_forecast_df)



Processing Dal lake...

--- Model Metrics for Dal lake (LOO CV) ---
Min Temperature -> R²: 0.524, MAE: 3.159, RMSE: 4.331
Max Temperature -> R²: 0.365, MAE: 1.506, RMSE: 1.879
Min Dissolved Oxygen -> R²: 0.027, MAE: 0.570, RMSE: 0.780
Max Dissolved Oxygen -> R²: 0.583, MAE: 0.590, RMSE: 0.745
Min pH -> R²: 0.004, MAE: 0.160, RMSE: 0.272
Max pH -> R²: 0.122, MAE: 0.100, RMSE: 0.152
Min Conductivity -> R²: -0.073, MAE: 34.463, RMSE: 55.775
Max Conductivity -> R²: 0.167, MAE: 25.245, RMSE: 30.614
Min BOD -> R²: -0.224, MAE: 0.458, RMSE: 0.667
Max BOD -> R²: 0.027, MAE: 1.181, RMSE: 1.677

Processing Wular lake...

--- Model Metrics for Wular lake (LOO CV) ---
Min Temperature -> R²: -0.564, MAE: 3.367, RMSE: 4.504
Max Temperature -> R²: -0.286, MAE: 2.303, RMSE: 3.855
Min Dissolved Oxygen -> R²: 0.147, MAE: 1.000, RMSE: 1.232
Max Dissolved Oxygen -> R²: 0.563, MAE: 0.553, RMSE: 0.709
Min pH -> R²: -0.012, MAE: 0.105, RMSE: 0.118
Max pH -> R²: 0.009, MAE: 0.246, RMSE: 0.387
Min Conductivit