<a href="https://colab.research.google.com/github/ylfoo/HelloWorld/blob/main/COD_GitHub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Full Scale Waste Water Treatment Plant Data**

Energy consumption, climate, and wastewater characteristics of the Melbourne Eastern wastewater treatment plant over a period of six years (2014-2019). Source: https://www.kaggle.com/datasets/d4rklucif3r/full-scale-waste-water-treatment-plant-data

Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score, train_test_split as split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression as LNR, Ridge, Lasso, ElasticNet as ENR
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.ensemble import AdaBoostRegressor as ADA, BaggingRegressor as BAG, GradientBoostingRegressor as GBR, RandomForestRegressor as RFR
from sklearn.neural_network import MLPRegressor as MLPR
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor as KNR

from tensorflow import keras
from keras import layers
from keras.layers import LSTM, Dense
from keras.models import Sequential

Load the dataset

In [None]:
# Fetch the CSV straight from GitHub; its first column is used as the row index.
data = pd.read_csv(
    'https://raw.githubusercontent.com/ylfoo/datasets/main/Data-Melbourne_F_fixed.csv',
    index_col=0,
)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

In [None]:
# Preview the first rows (pandas shows five by default).
data.head()

In [None]:
# List the column labels available in the dataset.
data.columns

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per column.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory footprint.
data.info()

In [None]:
# Names of the weather/atmospheric measurement columns in `data`,
# assembled from small thematic groups.
_temperature_columns = ['Average Temperature', 'Maximum temperature', 'Minimum temperature']
_wind_columns = ['Average wind speed', 'Maximum wind speed']

atmospheric_values = [
    *_temperature_columns,
    'Atmospheric pressure',
    'Average humidity',
    'Total rainfall',
    'Average visibility',
    *_wind_columns,
]

In [None]:
# Preview only the atmospheric columns selected above.
data[atmospheric_values].head()

In [None]:
# Collapse the separate Year/Month/Day columns into a single datetime column,
# then remove the three source columns from the frame in place.
date_parts = ['Year', 'Month', 'Day']
data['Date'] = pd.to_datetime(data[date_parts])
data.drop(columns=date_parts, inplace=True)

In [None]:
# Re-check the dimensions now that Year/Month/Day were replaced by Date.
data.shape

In [None]:
# Confirm the column list now contains Date instead of Year/Month/Day.
data.columns

In [None]:
# Verify the dtype of the new Date column alongside the other columns.
data.info()

Predict Chemical Oxygen Demand (COD)

In [None]:
# Index the rows by date, then separate the COD target from the predictors.
target_column = "Chemical Oxygen Demand"
data = data.set_index("Date")

y = data[target_column]               # target variable (COD)
X = data.drop(columns=target_column)  # every remaining column is an input feature

In [None]:
# Quick visual check of the COD target series.
# NOTE(review): `y` is passed positionally, so how seaborn maps it onto the
# axes depends on the installed seaborn version — confirm the figure shows
# COD values against the Date index.
sns.scatterplot(y)
plt.show()

**Machine Learning**

In [None]:
# Candidate regressors, all created with scikit-learn's default hyperparameters.
# NOTE: the tree and ensemble models are created without a random_state, so
# their scores can vary between runs even though the folds below are fixed.
models = {
    'LNR': LNR(),
    'RDG': Ridge(),
    'LSR': Lasso(),
    'ENR': ENR(),
    'DTR': DTR(),
    'ABR': ADA(),
    'BAG': BAG(),
    'GBR': GBR(),
    'RFR': RFR(),
}

# Shuffled 5-fold cross-validation; the fixed seed gives every model the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for name, estimator in models.items():
    # 'r2' is also the default score for regressors; it is spelled out for clarity.
    scores = cross_val_score(estimator, X, y, scoring='r2', cv=kf, n_jobs=-1)
    # The plus-minus sign was previously emitted as mis-decoded bytes ("Â±").
    print(f"{name}: R2 score is {scores.mean():.3f} ± {scores.std():.3f}")

**Neural Network**

Split the data into training, validation, and test sets

In [None]:
# 70 % of the rows go to training; the held-out 30 % is then halved into
# validation and test sets (15 % each). Both splits use the same seed.
X_train, X_temp, y_train, y_temp = split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

Standardize the features to zero mean and unit variance (feature scaling)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to all three splits so no validation/test information
# leaks into the scaler.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

Build a simple neural network model

In [None]:
# Fully connected regression network: input -> 64 -> 32 -> 1.
# The input width is declared with an explicit Input layer instead of the
# legacy `input_dim` argument on the first Dense layer (newer Keras releases
# warn about that argument); this also matches how create_ffn_model in this
# notebook declares its input.
model_neural = Sequential()
model_neural.add(layers.Input(shape=(X_train.shape[1],)))
model_neural.add(Dense(64, activation='relu'))
model_neural.add(Dense(32, activation='relu'))
model_neural.add(Dense(1))  # Output layer (1 neuron, linear, for regression)

# Compile the model: Adam optimizer minimising mean squared error.
model_neural.compile(loss='mean_squared_error', optimizer='adam')

# Train the model, monitoring the loss on the validation split after each epoch.
neural_history = model_neural.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=400,
    batch_size=50,
    verbose=1,
)

In [None]:
# Make predictions on the test set
y_pred = model_neural.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"RMSE on test set: {rmse:.2f}")
print(f"R-squared (R2) Score: {r2:.2f}")

Define a Feedforward Neural Network (FFN) model

In [None]:
def create_ffn_model(n_features=None):
    """Build and compile a small fully connected regression network.

    Args:
        n_features: Number of input columns. When omitted, the width is read
            from the notebook-level ``X_train`` (``X_train.shape[1]``), which
            is what this function always did before the parameter existed.

    Returns:
        A compiled ``keras.Sequential`` model with two ReLU hidden layers
        (64 and 32 units) and one output unit, using the Adam optimizer and
        mean squared error loss.
    """
    if n_features is None:
        # Fall back to the global training matrix so existing calls keep working.
        n_features = X_train.shape[1]

    model = keras.Sequential([
        layers.Input(shape=(n_features,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(32, activation='relu'),
        layers.Dense(1)  # Output layer for regression
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

Define a Convolutional Neural Network (CNN) model

In [None]:
def create_cnn_model(n_features=None):
    """Build and compile a 1D convolutional regression network.

    Each sample is expected as a ``(n_features, 1)`` array, i.e. the feature
    vector with a trailing channel axis.

    Args:
        n_features: Length of the feature axis. When omitted, it is read from
            the notebook-level ``X_train`` (``X_train.shape[1]``), which is
            what this function always did before the parameter existed.

    Returns:
        A compiled ``keras.Sequential`` model (Conv1D with 32 filters and
        kernel size 3, max pooling of size 2, flatten, Dense 64 with ReLU,
        one output unit) using the Adam optimizer and mean squared error loss.
    """
    if n_features is None:
        # Fall back to the global training matrix so existing calls keep working.
        n_features = X_train.shape[1]

    # NOTE(review): with Keras' default padding the kernel of 3 followed by a
    # pool of 2 presumably needs at least 4 features — confirm for small inputs.
    model = keras.Sequential([
        layers.Input(shape=(n_features, 1)),  # Assuming 1D data
        layers.Conv1D(32, kernel_size=3, activation='relu'),
        layers.MaxPooling1D(pool_size=2),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(1)  # Output layer for regression
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

Define a Recurrent Neural Network (RNN) model with LSTM cells

In [None]:
def create_rnn_model(n_features=None):
    """Build and compile a single-layer LSTM regression network.

    Each sample is expected as a ``(n_features, 1)`` array, so the LSTM walks
    over the features as a sequence of one-value steps.

    Args:
        n_features: Length of the feature axis. When omitted, it is read from
            the notebook-level ``X_train`` (``X_train.shape[1]``), which is
            what this function always did before the parameter existed.

    Returns:
        A compiled ``keras.Sequential`` model (LSTM with 32 units and ReLU
        activation, one output unit) using the Adam optimizer and mean
        squared error loss.
    """
    if n_features is None:
        # Fall back to the global training matrix so existing calls keep working.
        n_features = X_train.shape[1]

    model = keras.Sequential([
        layers.Input(shape=(n_features, 1)),  # Assuming 1D data
        layers.LSTM(32, activation='relu'),
        layers.Dense(1)  # Output layer for regression
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

Train the models

In [None]:
# The CNN and LSTM models take (samples, features, 1) input, so build views of
# the training and validation matrices with a trailing channel axis once.
X_train_3d = X_train.reshape(*X_train.shape, 1)
X_val_3d = X_val.reshape(*X_val.shape, 1)

# Identical training schedule for all three networks.
fit_options = dict(epochs=400, batch_size=50)

ffn_model = create_ffn_model()
ffn_history = ffn_model.fit(
    X_train, y_train, validation_data=(X_val, y_val), **fit_options
)

cnn_model = create_cnn_model()
cnn_history = cnn_model.fit(
    X_train_3d, y_train, validation_data=(X_val_3d, y_val), **fit_options
)

rnn_model = create_rnn_model()
rnn_history = rnn_model.fit(
    X_train_3d, y_train, validation_data=(X_val_3d, y_val), **fit_options
)

Test the models

In [None]:
def _score_on_test(label, model, features):
    """Predict on `features`, print RMSE and R2 against `y_test`, return the results.

    Returns a tuple ``(predictions, mse, rmse, r2)``.
    """
    predictions = model.predict(features)
    mse_value = mean_squared_error(y_test, predictions)
    rmse_value = np.sqrt(mse_value)
    r2_value = r2_score(y_test, predictions)
    print(f'{label} RMSE on test set: {rmse_value:.2f}')
    print(f"{label} R-squared (R2) Score: {r2_value:.2f}")
    return predictions, mse_value, rmse_value, r2_value


# The dense network consumes the 2D test matrix directly.
y_ffn_pred, mse_ffn, rmse_ffn, r2_ffn = _score_on_test('FFN', ffn_model, X_test)

# The CNN and LSTM models take the same data with a trailing channel axis.
y_cnn_pred, mse_cnn, rmse_cnn, r2_cnn = _score_on_test(
    'CNN', cnn_model, X_test.reshape(*X_test.shape, 1)
)

y_rnn_pred, mse_rnn, rmse_rnn, r2_rnn = _score_on_test(
    'RNN', rnn_model, X_test.reshape(*X_test.shape, 1)
)

In [None]:
# Side-by-side test-set R2 of the four neural networks.
model_names = ['NN', 'FFN', 'CNN', 'RNN']
r2_values = [r2, r2_ffn, r2_cnn, r2_rnn]
for name, value in zip(model_names, r2_values):
    # Fixed three decimals (".3f"), matching the cross-validation report for
    # the classical models; the bare ".3" spec gave three significant digits
    # and could switch to exponent notation for small values.
    print(f"{name}: R2 score is {value:.3f}")