This machine learning project was done as a learning exercise. The dataset contains 500 rows of water quality data downloaded from [Kaggle](https://www.kaggle.com/datasets/shreyanshverma27/water-quality-testing).

This is a regression problem, so in this notebook I will choose an arbitrary set of features and a target, then try different algorithms to find a relationship between them.

In [None]:
import pandas as pd
import tensorflow as tf
from scipy import stats
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import regularizers


In [None]:
# Read the water-quality CSV into a DataFrame, then preview its first five rows.
# NOTE(review): the path is absolute and looks Colab-specific ("/content/...");
# adjust it when running the notebook elsewhere.
DATASET_CSV = "/content/Water Quality Testing.csv"
dataset = pd.read_csv(DATASET_CSV)

dataset.head()

Unnamed: 0,Sample ID,pH,Temperature (°C),Turbidity (NTU),Dissolved Oxygen (mg/L),Conductivity (µS/cm)
0,1,7.25,23.1,4.5,7.8,342
1,2,7.11,22.3,5.1,6.2,335
2,3,7.03,21.5,3.9,8.3,356
3,4,7.38,22.9,3.2,9.5,327
4,5,7.45,20.7,3.8,8.1,352


In [None]:
# Inspect the dataset: prints the index range, each column's non-null count
# and dtype, and the frame's memory usage.

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Sample ID                500 non-null    int64  
 1   pH                       500 non-null    float64
 2   Temperature (°C)         500 non-null    float64
 3   Turbidity (NTU)          500 non-null    float64
 4   Dissolved Oxygen (mg/L)  500 non-null    float64
 5   Conductivity (µS/cm)     500 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 23.6 KB


Each column contains 500 non-null values, so there is no missing data.

In [None]:
# Check for outliers

def detect_outliers(df, threshold=3):
    """Find rows whose value in any numeric column is a z-score outlier.

    For every numeric column of ``df`` the absolute z-score of each value is
    computed, and the rows where it exceeds ``threshold`` are collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to scan. Non-numeric columns are skipped.
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    dict
        Maps column name -> DataFrame holding the full rows of ``df`` that
        were flagged for that column. Columns without outliers are left out.
    """
    outlier_data = {}
    for col in df.select_dtypes(include="number").columns:
        values = df[col].to_numpy(dtype=float, na_value=np.nan)
        # nan_policy="omit": a single NaN would otherwise turn every z-score
        # in the column into NaN and silently hide all of its outliers.
        z_scores = np.abs(stats.zscore(values, nan_policy="omit"))
        # NaN z-scores (missing values, zero-variance columns) compare False,
        # so they are never reported as outliers.
        outliers = z_scores > threshold
        if outliers.any():
            outlier_data[col] = df[outliers]
    return outlier_data


# 'Sample ID' is dropped before the scan so that only the remaining columns
# are checked for outliers.
measurement_columns = dataset.drop(columns=['Sample ID'])
outlier_data = detect_outliers(measurement_columns)

# Report each affected column, followed by its flagged rows and a blank line
for column, data in outlier_data.items():
    heading = f'The outliers in the {column} data are:'
    print(heading, data, '', sep='\n')

The outliers in the pH data are:
      pH  Temperature (°C)  Turbidity (NTU)  Dissolved Oxygen (mg/L)  \
20  6.83              22.5              3.3                      6.1   

    Conductivity (µS/cm)  
20                   348  



# Regression Analysis

In [None]:
# Define features and targets

feature_columns = ["pH", "Temperature (°C)", "Turbidity (NTU)", "Conductivity (µS/cm)"]
X_raw = dataset[feature_columns].values
Y = dataset[["Dissolved Oxygen (mg/L)"]]


# Normalise the data
# Fit the scaler on the training rows only. Fitting it on every row would let
# the validation/test rows influence the mean and variance used for scaling
# (data leakage) and make the held-out scores look better than they are.
# The index split below must use the same test_size and random_state as the
# train / val+test split in the next cell (0.3 and 10) so that it selects
# exactly the rows that end up in Xtrain.
train_rows, holdout_rows = train_test_split(
    np.arange(len(X_raw)), test_size=0.3, random_state=10
)

scalar = StandardScaler()
scalar.fit(X_raw[train_rows])
X = scalar.transform(X_raw)

In [None]:
# Two-stage split into training, validation and test sets

# Stage 1: hold out 30% of the rows (test_size=0.3); the rest is the training
# set. The hold-out is stored temporarily in X_val / Y_val and still contains
# both the validation and the test rows.
Xtrain, X_val, Ytrain, Y_val = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=10,
)

# Stage 2: halve the hold-out into the final validation and test sets.
Xval, Xtest, Yval, Ytest = train_test_split(
    X_val,
    Y_val,
    test_size=0.5,
    random_state=10,
)

# Report the feature-matrix shape of every split
for split_name, split_features in (
    ("Training set size: ", Xtrain),
    ("Validation set size: ", Xval),
    ("Test set size: ", Xtest),
):
    print(split_name, split_features.shape)


Training set size:  (350, 4)
Validation set size:  (75, 4)
Test set size:  (75, 4)


## Regression Algorithm 1: Support Vector Regression

Support vector regression key points:


*   Finds the hyperplane that fits the data
*   SVM is the classification version
*   SVR tries to reduce the error between the hyperplane and the data
*   SVR allows you to determine how tolerant of errors you are
*   Errors only outside of a margin (epsilon) contribute to the loss









In [None]:
from sklearn.svm import SVR

# Three SVR models sharing C = 4 and differing only in kernel:
# SVR1 -> 'rbf', SVR2 -> 'poly', SVR3 -> 'linear'.
SVR1, SVR2, SVR3 = (
    SVR(kernel=kernel_name, C=4) for kernel_name in ('rbf', 'poly', 'linear')
)

# Show the repr of the rbf model
print(SVR1)


SVR(C=4)


In [None]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

def compute_metrics(model, model_name, Xval, Yval, Ytrain, Xtrain):
    """Fit ``model`` on the training data and report its validation metrics.

    Note that this function trains the model as a side effect: ``model`` is
    fitted in place on ``Xtrain`` / ``Ytrain`` before being scored.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    model_name : str
        Label printed above the metrics.
    Xval, Yval : array-like
        Validation features and targets used for scoring.
    Ytrain, Xtrain : array-like
        Training targets and features (in that order) used for fitting.
        ``Ytrain`` may be a DataFrame, Series or array; it is flattened to 1-D.

    Returns
    -------
    dict
        ``{"mse": ..., "r2": ..., "mae": ...}`` computed on the validation
        data, so that models can be compared programmatically as well as by
        reading the printed output.
    """
    # Fit on the training data. np.ravel flattens a single-column DataFrame,
    # a Series or an ndarray alike into the 1-D target passed to fit().
    model.fit(Xtrain, np.ravel(Ytrain))

    # Use the trained model to make predictions on the validation data
    Y_pred = model.predict(Xval)

    metrics = {
        "mse": mean_squared_error(Yval, Y_pred),
        "r2": r2_score(Yval, Y_pred),
        "mae": mean_absolute_error(Yval, Y_pred),
    }

    print(model_name)
    print("Mean squared error: ", metrics["mse"])
    print("R² score1: ", metrics["r2"])
    print("Mean absolute error:", metrics["mae"])

    return metrics



In [None]:
# Fit and score each kernel variant on the same train/validation split
for svr_model, svr_label in (
    (SVR1, "SVR1 - rbf"),
    (SVR2, "SVR2 - poly"),
    (SVR3, "SVR3 - linear"),
):
    compute_metrics(svr_model, svr_label, Xval, Yval, Ytrain, Xtrain)

SVR1 - rbf
Mean squared error:  0.0889682225824273
R² score1:  0.8632197104958638
Mean absolute error: 0.21065509594495355
SVR2 - poly
Mean squared error:  0.2371973957535288
R² score1:  0.6353312731325368
Mean absolute error: 0.3565587509039766
SVR3 - linear
Mean squared error:  0.15301026307177018
R² score1:  0.7647610857835148
Mean absolute error: 0.2591891062812409


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter space
# NOTE: 'gamma' is a kernel coefficient for the 'rbf' and 'poly' kernels only,
# so for 'linear' both gamma settings produce the same model.
params = {
    'C' : [0.05, 0.1, 0.5, 1, 4, 5, 10],
    'kernel' : ['poly', 'rbf', 'linear'],
    'gamma' : ['scale',  'auto']
}

# initialise model
svr = SVR()

# create search object (5-fold cross-validation, scored by negative MSE)
gridsearch = GridSearchCV(svr, params, cv = 5, scoring='neg_mean_squared_error')

# fit to training data
gridsearch.fit(Xtrain, Ytrain.values.ravel())

print("Best Parameters: ", gridsearch.best_params_)
print("Best Score: ", gridsearch.best_score_)

# best_score_ is the mean *negative* MSE over the cross-validation folds of the
# training set, so it cannot be compared directly with the validation-set MSE
# values printed for SVR1-3 earlier. Score the refitted best estimator on the
# same validation split to get a like-for-like number.
best_val_predictions = gridsearch.best_estimator_.predict(Xval)
print("Validation MSE of best estimator: ", mean_squared_error(Yval, best_val_predictions))

Best Parameters:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score:  -0.22865572605894918
