# Machine Exercise 3
## Miguel Luis Martinez, Juan Carlos Roldan, Jeryl Salas | AI 221 WZZQ  Shannon Batch | University of the Philippines Diliman

In [20]:
# download and import dependencies
!pip install ucimlrepo optuna

from ucimlrepo import fetch_ucirepo

import optuna
import pandas as pd
import seaborn as sns
import numpy as np
from time import perf_counter
from optuna.samplers import TPESampler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,mean_squared_error,r2_score
from sklearn.model_selection import KFold





# Loading the Dataset
The Wine Quality dataset downloaded from https://archive.ics.uci.edu/ml/datasets/wine+quality consists of 6497 entries split into 1599 samples of red Portuguese "Vinho Verde" wine and 4898 samples of white Portuguese "Vinho Verde" wine. For this machine exercise, only the red wine samples will be used.

In [16]:
# fetch dataset as pandas dataframe
wine_quality = fetch_ucirepo(id=186).data.original
# use a downloaded dataset
#wine_quality = pd.read_csv("winequality-red.csv",sep=";")
#show dataset
print("Original Dataset")
display(wine_quality)
#restrict data to red wine
if "color" in wine_quality.columns:
    wine_quality = wine_quality[wine_quality["color"]=="red"]
    wine_quality = wine_quality.loc[:,wine_quality.columns!="color"]
print("Filtered Dataset")
display(wine_quality)
# data (as pandas dataframes)
X = wine_quality.iloc[:,:-1]
y = wine_quality.iloc[:,-1:]


Original Dataset


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6,red
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,red
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,white
6493,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,white
6494,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,white
6495,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,white


Filtered Dataset


Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [17]:
data = X.copy()
data['target'] = y
#sns.pairplot(data, hue='target')
#sns.histplot(data, x='fixed_acidity')

## Problem a.
* Randomly split red wine dataset into 70% train and 30% test with stratify=y.



In [18]:
#Use a seed for random state
wine_seed = 42 #seed used for splitting the training and testing data
tune_seed = 420 #seed used for hyperparameter tuning
cv_seed = 420 #seed used for K-fold cross validation
pipe_seed = 6591 #seed used for classifier and regressor pipelines
#uncomment to generate a new seed
#wine_seed = np.random.randint((2**31)-1)
#cv_seed = np.random.randint((2**31)-1)
#tune_seed = np.random.randint((2**31)-1)
#pipe_seed = np.random.randint((2**31)-1)
print(f"Wine Dataset Splitter Seed: {wine_seed}")
print(f"Cross-Validation Seed: {cv_seed}")
print(f"Tuning Seed: {tune_seed}")
print(f"Classifier Seed: {pipe_seed}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=wine_seed, stratify=y)


Wine Dataset Splitter Seed: 42
Cross-Validation Seed: 420
Tuning Seed: 420
Classifier Seed: 6591


* Build a pipeline with Standard scaler then SVC.
* Design hyper-parameter tuning procedure to automatically find the model with the best cross-validation accuracy.

For hyperparameter tuning, Optuna was used to find the optimal set of hyperparameters within a certain range in 100 trials. The hyperparameters that were tuned were the regularization parameter C (from 0 to 2 in intervals of 0.001), the kernel (linear, rbf, or sigmoid), the kernel coefficient gamma (from 0 to 10 in intervals of 0.0001). Cross-validation was performed via the sklearn cross_val_score using the default 5-fold cross-validation
settings.

As shown below, the best classifier model found used the Radial Basis Function kernel and had a regularization parameter of 1.598 and a kernel coefficient of 0.949.

In [21]:
# Using Optuna to optimize hyperparameters
def objective(trial):
    C = trial.suggest_float("C",0,2,step=0.001)
    kernel = trial.suggest_categorical("kernel", ['rbf', 'sigmoid'])
    gamma = trial.suggest_float("gamma",0,10,step=0.0001)
    pipeline = Pipeline([
        ('standard_scaler', StandardScaler()),
        ('classifier', SVC(kernel=kernel,gamma=gamma,random_state=pipe_seed))
    ])
    cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=cv_seed)
    #Return the mean score across all folds
    cv_score = cross_val_score(pipeline,X_train,y_train.values.ravel(),scoring="accuracy",cv=cv)
    print(cv_score)
    return np.mean(cv_score)

study = optuna.create_study(direction="maximize",sampler=TPESampler(seed=tune_seed))
study.optimize(objective, n_trials=100)

trial = study.best_trial

print(f"Best Classifier Mean Accuracy: {study.best_trial.values}")
print(f"Best Classifier Parameters:")
for param in study.best_trial.params.items():
    print(f"{param[0]}: {param[1]}")

#Reconstruct the best classifier
estimators_c = [
    ('standard_scaler', StandardScaler()),
    ('classifier', SVC(**study.best_trial.params,random_state=pipe_seed))
]
pipeline_c = Pipeline(estimators_c)
pipeline_c.fit(X_train,y_train.values.ravel())


[I 2024-09-22 10:40:21,078] A new study created in memory with name: no-name-e7c7788d-dc37-4b2b-94f6-84818cf45dcc
[I 2024-09-22 10:40:23,511] Trial 0 finished with value: -0.4481595639745886 and parameters: {'C': 3.2, 'kernel': 'linear', 'gamma': 8.700000000000001, 'epsilon': 0.63}. Best is trial 0 with value: -0.4481595639745886.
[I 2024-09-22 10:40:24,762] Trial 1 finished with value: -0.46174337876582944 and parameters: {'C': 3.6, 'kernel': 'rbf', 'gamma': 0.9, 'epsilon': 0.056}. Best is trial 1 with value: -0.46174337876582944.
[I 2024-09-22 10:40:25,576] Trial 2 finished with value: -0.5795284477329286 and parameters: {'C': 2.4000000000000004, 'kernel': 'rbf', 'gamma': 3.5, 'epsilon': 0.46900000000000003}. Best is trial 2 with value: -0.5795284477329286.
[I 2024-09-22 10:40:29,802] Trial 3 finished with value: -0.4474304496840385 and parameters: {'C': 7.7, 'kernel': 'linear', 'gamma': 7.6000000000000005, 'epsilon': 0.441}. Best is trial 2 with value: -0.5795284477329286.
[I 2024-0

Best Regressor Mean Squared Error: -618574.1619118742
Best Regressor Parameters:
C: 10.0
kernel: sigmoid
gamma: 8.0
epsilon: 0.404


We then show the performance of the chosen classifier. As seen from the results, the classifier was able to perform really well on the training set with a 97.14% accuracy, 98.61% precision, and an F1 score of 0.9584. The performance of the classifier on the test set, on the other hand, were much lower than expected with an accuracy of 63.33%, precision of 40.49% and and F1 score of 0.3449.


In [22]:
y_train_predict_c = pipeline_c.predict(X_train)
print(f"Classifier Results on Train Set:")
print(f"Test Set Prediction Confusion Matrix:\n{confusion_matrix(y_train,y_train_predict_c)}")
print(f"Test Set Prediction Accuracy: {accuracy_score(y_train,y_train_predict_c):4f}")
print(f"Test Set Prediction Precision: {precision_score(y_train,y_train_predict_c,average='macro'):4f}")
print(f"Test Set Prediction Recall: {recall_score(y_train,y_train_predict_c,average='macro'):4f}")
print(f"Test Set Prediction F1 Score: {f1_score(y_train,y_train_predict_c,average='macro'):4f}")

y_test_predict_c = pipeline_c.predict(X_test)
print(f"Classifier Results on Test Set:")
print(f"Test Set Prediction Confusion Matrix:\n{confusion_matrix(y_test,y_test_predict_c)}")
print(f"Test Set Prediction Accuracy: {accuracy_score(y_test,y_test_predict_c):4f}")
print(f"Test Set Prediction Precision: {precision_score(y_test,y_test_predict_c,average='macro'):4f}")
print(f"Test Set Prediction Recall: {recall_score(y_test,y_test_predict_c,average='macro'):4f}")
print(f"Test Set Prediction F1 Score: {f1_score(y_test,y_test_predict_c,average='macro'):4f}")

Classifier Results on Train Set:
Test Set Prediction Confusion Matrix:
[[  7   0   0   0   0   0]
 [  0  31   4   2   0   0]
 [  0   0 471   6   0   0]
 [  0   0  13 432   1   0]
 [  0   0   1   3 135   0]
 [  0   0   0   0   2  11]]
Test Set Prediction Accuracy: 0.971403
Test Set Prediction Precision: 0.986103
Test Set Prediction Recall: 0.935208
Test Set Prediction F1 Score: 0.958366
Classifier Results on Test Set:
Test Set Prediction Confusion Matrix:
[[  0   1   2   0   0   0]
 [  0   0  11   5   0   0]
 [  0   0 156  46   2   0]
 [  0   0  58 127   7   0]
 [  0   0   6  33  20   1]
 [  0   0   2   1   1   1]]
Test Set Prediction Accuracy: 0.633333
Test Set Prediction Precision: 0.404926
Test Set Prediction Recall: 0.326583
Test Set Prediction F1 Score: 0.344930


  _warn_prf(average, modifier, msg_start, len(result))


Even with lower than expected results from the classifier, we used the appraoch of 'Modeling wine preferences by data mining from physicochemical properties,' Cortez et al. (2009) wherein the researchers evaluated distinct accuracies, allowing the degree of error, tolerance (T) is accepted. As seen from the confusion matrix, the misclassifications are mostly just one class away from the true class. And therefore, allowing to have a tolerance, T=1, would correct most of the mistakes done by the model and therefore, the testing accuracy with a tolerance of T = 1, has made the model highly accurate with 96.25% accuracy.

In [23]:
import numpy as np

def accuracy_with_tol(y_true, y_pred, tolerance):
    correct = 0
    for true, pred in zip(y_true, y_pred):
        if np.abs(true - pred) <= tolerance:
            correct += 1
    return correct / len(y_true)


y_true = (np.array(y_test)).tolist()
y_pred = np.array(y_test_predict_c)
tolerances = [0.1, 0.25, 0.5, 1.0]

for tolerance in tolerances:
    acc_with_tolerance = accuracy_with_tol(y_true, y_pred, tolerance)
    print(f"Accuracy with Tolerance, T = {tolerance}: {acc_with_tolerance}")

acc_without_tolerance = (y_true == y_pred).mean()
print(f"Traditional Accuracy (Classifier): {accuracy_score(y_test,y_test_predict_c):4f}")

Accuracy with Tolerance, T = 0.1: 0.6333333333333333
Accuracy with Tolerance, T = 0.25: 0.6333333333333333
Accuracy with Tolerance, T = 0.5: 0.6333333333333333
Accuracy with Tolerance, T = 1.0: 0.9625
Traditional Accuracy (Classifier): 0.633333


## Problem b.
* Build a pipeline with Standard scaler then SVR.
* Design hyper-parameter tuning procedure to automatically find the model with the best cross-validation accuracy.

For hyperparameter tuning, Optuna was used to find the optimal set of hyperparameters within a certain range in 100 trials. Similar to the SVC in the previous item, the hyperparameters that were tuned were the regularization parameter C (from 0.1 to 10 in intervals of 0.1), the kernel (linear, rbf, or sigmoid), the kernel coefficient gamma (from 0 to 10 in intervals of 0.1), and the epsilon value (from 0 to 10 in intervals of 0.001).

As shown below, the best regressor model found used the linear kernel and had a regularization parameter of 9.2, a gamma of 0.8, and an epsilon parameter of 0.091.

In [25]:
# Using Optuna to optimize hyperparameters
def objective(trial):
    C = trial.suggest_float("C", 0.1, 10, step=0.1)
    kernel = trial.suggest_categorical("kernel", ['linear', 'rbf', 'sigmoid'])
    gamma = trial.suggest_float("gamma", 0, 10, step=0.1)
    epsilon = trial.suggest_float("epsilon", 0, 1, step=0.001)

    pipeline = Pipeline([
        ('standard_scaler', StandardScaler()),
        ('regressor', SVR(C=C, kernel=kernel, gamma=gamma, epsilon=epsilon))
    ])

    # Use KFold instead of StratifiedKFold since you are working with regression
    cv = KFold(n_splits=5, shuffle=True, random_state=cv_seed)

    # Use a regression scoring metric such as neg_mean_squared_error or r2
    cv_score = cross_val_score(pipeline, X_train, y_train.values.ravel(), scoring="neg_mean_squared_error", cv=cv)

    # Return the mean score across all folds
    return np.mean(cv_score)


study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=tune_seed))  # Minimizing error
study.optimize(objective, n_trials=100)

trial = study.best_trial

print(f"Best Regressor Mean Squared Error: {study.best_trial.value}")
print(f"Best Regressor Parameters:")
for param in study.best_trial.params.items():
    print(f"{param[0]}: {param[1]}")

# Reconstruct the best regressor
estimators_r = [
    ('standard_scaler', StandardScaler()),
    ('regressor', SVR(**study.best_trial.params))  # Remove random_state, since SVR doesn't have it
]

pipeline_r = Pipeline(estimators_r)
pipeline_r.fit(X_train, y_train.values.ravel())


[I 2024-09-22 10:43:33,909] A new study created in memory with name: no-name-9df852da-6be0-412e-9df9-23420181b109
[I 2024-09-22 10:43:36,881] Trial 0 finished with value: -0.4481595639745886 and parameters: {'C': 3.2, 'kernel': 'linear', 'gamma': 8.700000000000001, 'epsilon': 0.63}. Best is trial 0 with value: -0.4481595639745886.
[I 2024-09-22 10:43:39,015] Trial 1 finished with value: -0.46174337876582944 and parameters: {'C': 3.6, 'kernel': 'rbf', 'gamma': 0.9, 'epsilon': 0.056}. Best is trial 1 with value: -0.46174337876582944.
[I 2024-09-22 10:43:40,630] Trial 2 finished with value: -0.5795284477329286 and parameters: {'C': 2.4000000000000004, 'kernel': 'rbf', 'gamma': 3.5, 'epsilon': 0.46900000000000003}. Best is trial 2 with value: -0.5795284477329286.
[I 2024-09-22 10:43:44,084] Trial 3 finished with value: -0.4474304496840385 and parameters: {'C': 7.7, 'kernel': 'linear', 'gamma': 7.6000000000000005, 'epsilon': 0.441}. Best is trial 2 with value: -0.5795284477329286.
[I 2024-0

Best Regressor Mean Squared Error: -618574.1619118742
Best Regressor Parameters:
C: 10.0
kernel: sigmoid
gamma: 8.0
epsilon: 0.404


* Report the MSE, R2, and MAD of the best SVR on test data.
* Compare with paper results

The Support Vector Regressor used in the paper by Cortez et al. (2009) used a Gaussian kernel with C = 3, epsilon = \sigma  square root of N, sigma hat = 1.5 / (N * variance), y hat = value predicted by 3NN

Grid Search

The range of hyperparameters that were used were H in [0,11] and gamma in [2^3, 2^1, ..., 2^-15]

2/3, 1/3 holdout split for validation, sensitivity analysis L=5

evaluate with 20 runs of 5-fold cross validation





Paper Results:
Accuracy:
T=0.25 -> 43.2 +/- 0.6
T=0.5  -> 62.4 +/- 0.4
T=1.0  -> 89.0 +/- 0.2
MAD = 0.46



In [26]:
def mad_score(df):
    return abs((df-df.mean(axis=0)).mean(axis=0))


y_train_predict_r = pipeline_r.predict(X_train)
print(f"Regressor Results on Train Set:")
print(f"Test Set Prediction MSE: {mean_squared_error(y_train,y_train_predict_r)}")
print(f"Test Set Prediction R2 Score: {r2_score(y_train,y_train_predict_r)}")
print(f"Test Set Prediction MAD: {mad_score(y_train)}")

y_test_predict_r = pipeline_r.predict(X_test)
print(f"Regressor Results on Test Set:")
print(f"Test Set Prediction MSE: {mean_squared_error(y_test,y_test_predict_r)}")
print(f"Test Set Prediction R2 Score: {r2_score(y_test,y_test_predict_r)}")
print(f"Test Set Prediction MAD: {mad_score(y_test)}")

Regressor Results on Train Set:
Test Set Prediction MSE: 1065377.494196344
Test Set Prediction R2 Score: -1630931.0699874666
Test Set Prediction MAD: quality    1.277897e-16
dtype: float64
Regressor Results on Test Set:
Test Set Prediction MSE: 1048974.2107929613
Test Set Prediction R2 Score: -1617964.9124130423
Test Set Prediction MAD: quality    2.960595e-16
dtype: float64


In [28]:
for tolerance in tolerances:
    acc_with_tolerance = accuracy_with_tol(y_true, y_pred, tolerance)
    print(f"Accuracy with Tolerance, T = {tolerance}: {acc_with_tolerance}")

Accuracy with Tolerance, T = 0.1: 0.6333333333333333
Accuracy with Tolerance, T = 0.25: 0.6333333333333333
Accuracy with Tolerance, T = 0.5: 0.6333333333333333
Accuracy with Tolerance, T = 1.0: 0.9625



## Discussion
* Based on your results, discuss the difference of treating this problem as classification or regression. How will this decision impact the users of your model?

From this exercise, we saw two different ways on treating this problem. Treating this as a classification, would allow the users to quickly identify wines according to different quality categories making things a bit more straightforward. Based on the results of the classification, the classification model achieved high accuracy, precision and F1 score on the training set but had lower than expected results for the test set. That means the model is able to distinguish the classes of wine qualities well on the training set but is limited on unseen data. Treating this as a regression problem, on the other hand, allows users to predict wine qualities in a more precise manner. Based on the performance of the regression model, the model was able to achieve moderate performance on the training and test set as seen from the MSE, R2, and MAD results. This means that, the model, while not able to predict every data point, can provide relatively close estimates from the true value.