In [None]:
#pip install scikit-learn

#Key changes:
    #The neural network model was replaced with a RandomForestRegressor.
    #The evaluation function now uses mean_squared_error from sklearn.metrics.
    #y_train is flattened with .ravel() so the random forest receives a 1-D target (a column-shaped target triggers a warning).
    #TODO: tune the hyperparameters of RandomForestRegressor (e.g., n_estimators, max_depth) to get better performance.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Set a random seed for reproducibility.
# This seeds NumPy's global random state only; the train/test split and the
# random forest further down are also given random_state=42 explicitly.
np.random.seed(42)

In [None]:
# function to evaluate loss
def evaluate_loss(X_test_scaled, y_test_scaled, model):
    """Print and return the model's mean squared error on a held-out set.

    Args:
        X_test_scaled: Feature matrix handed unchanged to ``model.predict``.
        y_test_scaled: Reference targets the predictions are compared with.
            The error is expressed in whatever units these targets use, so
            scaled targets give an MSE in scaled units.
        model: Any fitted object exposing ``predict(X)``.

    Returns:
        The value computed by ``mean_squared_error``. It is returned (in
        addition to being printed) so callers can store or compare it.
    """
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test_scaled, y_pred)
    print(f'Mean Squared Error on Test Set: {mse}')
    return mse

In [None]:
# Read the training workbook into a DataFrame.
# The location is kept in its own variable, like the output path used when saving.
train_excel_path = '/home/framework/coding_python/wqi_ann/src/data_input/train_dataset.xlsx'  # replace with the path to your excel file
train_dataset = pd.read_excel(train_excel_path)

In [None]:
# show the first 10 rows of the training dataset
train_dataset.head(10)

In [None]:
# Split the training frame into the 11 input variables (x) and the WQI target (y).
# Both selections use a list of labels, so x and y are DataFrames.
# NOTE(review): 'ph' and 'Total  Hardness' (two spaces) are spelled differently
# from the labels used when selecting the test features — confirm against the
# actual header row of the training workbook.
x = train_dataset.loc[:, [
    'ph',
    'Total  Hardness',
    'Total Alkalinity',
    'Cl',
    'NO3',
    'SO4',
    'F',
    'TDS',
    'Fe',
    'As (ppb)',
    'Pb (ppb)',
]]
y = train_dataset.loc[:, ['WQI']]

In [None]:
# Min-max normalize the features and the target, one scaler each, so the
# target scaler can later be used on its own to undo the scaling.
# Fitting and transforming are written as two explicit steps.
# NOTE(review): both scalers are fitted on the full dataset, before the
# train/test split in the next cell, so held-out rows contribute to the
# scaling ranges — consider fitting on the training split only.
scaler_X = MinMaxScaler().fit(x)
scaler_y = MinMaxScaler().fit(y)

X_scaled = scaler_X.transform(x)
y_scaled = scaler_y.transform(y)

In [None]:
# Hold out 20% of the scaled rows for testing; the fixed random_state keeps
# the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_scaled,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Create the (still unfitted) random forest regressor: 100 trees, fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

In [None]:
# Fit the forest on the training split.
# The target is flattened to 1-D first to avoid warnings due to the target's shape.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Read the workbook to be scored (11 variables, without WQI) into a DataFrame.
# The location is kept in its own variable, like the output path used when saving.
test_excel_path = '/home/framework/coding_python/wqi_ann/src/data_input/test_dataset.xlsx'
test_dataset = pd.read_excel(test_excel_path)

In [None]:
# Pick the 11 input variables out of the test sheet, in the same order as the
# training features; a list selector keeps the result a DataFrame.
X_test_data = test_dataset.loc[:, [
    'pH',
    'Total Hardness',
    'Total Alkalinity',
    'Cl',
    'NO3',
    'SO4',
    'F',
    'TDS',
    'Fe',
    'As (ppb)',
    'Pb (ppb)',
]]

In [None]:
# Normalize the test data using the same scaler fitted to the training data.
# scaler_X was fitted on a DataFrame whose columns are labelled 'ph' and
# 'Total  Hardness', while the test selection is labelled 'pH' and
# 'Total Hardness'. scikit-learn checks DataFrame column names against the ones
# seen at fit time (a warning in 1.0-1.1, a ValueError from 1.2), so transforming
# X_test_data directly fails on current versions. Both selections list the same
# 11 variables in the same order, so relabel the test columns positionally with
# the training names; rename returns a new frame and leaves X_test_data as is.
train_name_by_test_name = dict(zip(X_test_data.columns, x.columns))
X_test_data_scaled = scaler_X.transform(
    X_test_data.rename(columns=train_name_by_test_name)
)

In [None]:
# Predict WQI values using the trained model.
# The model was fitted on the min-max scaled target, so these predictions are
# in the scaled target space, not in original WQI units.
predicted_wqi_scaled = model.predict(X_test_data_scaled)

In [None]:
# Print the model's predictions while they are still in the scaled target space
print("Scaled prediction:", predicted_wqi_scaled)

In [None]:
# Map the scaled predictions back to original WQI values.
# The predictions are first reshaped into a single column (one row per sample)
# before being handed to the target scaler's inverse transform.
scaled_prediction_column = predicted_wqi_scaled.reshape(-1, 1)
predicted_wqi = scaler_y.inverse_transform(scaled_prediction_column)

In [None]:
# Print the predictions after inverse scaling (original WQI units), for debugging
print("Unscaled predictions:", predicted_wqi)

In [None]:
# Evaluate the loss of the model on the held-out split produced by
# train_test_split (X_test / y_test), not on the separately loaded test_dataset.
# y_test comes from the scaled target, so the reported MSE is in scaled units.
evaluate_loss(X_test, y_test, model)

In [None]:
# Add the predicted WQI values to the test dataset as a 'WQI' column.
# This modifies test_dataset in place; predicted_wqi is the inverse-scaled
# single-column array computed above (one row per test sample).
test_dataset['WQI'] = predicted_wqi

In [None]:
# Save the updated test dataset (inputs plus the predicted 'WQI' column) to a new Excel file.
# index=False keeps the DataFrame index out of the written sheet.
output_excel_path = '/home/framework/coding_python/wqi_ann/src/data_output/validation_dataset_output.xlsx'
test_dataset.to_excel(output_excel_path, index=False)

In [None]:
# Confirmation message for the user once the output file has been written
print("WQI values have been added and saved to the output Excel file.")