###                                                             Wine Quality SVM model

In [394]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as clrs
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings

In [395]:
warnings.filterwarnings('ignore')

In [396]:
df = pd.read_csv('C:/Users/Tigranuhi/Downloads/winequalityN.csv')
df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


### See in what columns there are missing values

In [397]:
df.isna().sum()

type                     0
fixed acidity           10
volatile acidity         8
citric acid              3
residual sugar           2
chlorides                2
free sulfur dioxide      0
total sulfur dioxide     0
density                  0
pH                       9
sulphates                4
alcohol                  0
quality                  0
dtype: int64

Replace missing values with meaningful data

In [398]:
df.fillna(df.mean(), inplace=True)
df.isna().sum()

type                    0
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

In [399]:
df = pd.get_dummies(df)
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type_red,type_white
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8,6,0,1
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5,6,0,1
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1,6,0,1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6,0,1
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9,6,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5,5,1,0
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2,6,1,0
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0,6,1,0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2,5,1,0


In [400]:
X = df[df.columns.difference(df['quality'])]
Y = df['quality']
Y.value_counts()

6    2836
5    2138
7    1079
4     216
8     193
3      30
9       5
Name: quality, dtype: int64

In [401]:
fig = px.histogram(df, x='quality')
fig.show()

In [402]:
plt.scatter?

In [403]:
 x_train, x_test_final, y_train, y_test_final = train_test_split(X, Y, test_size=0.2, random_state=42) 

In [404]:
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [421]:
def get_SVC_trained_model(x_train, x_test, y_train, scaler=None, withHyperparameterTuning=False):
    if scaler == 'StandardScaler':
        x_train = pd.DataFrame(StandardScaler().fit_transform(x_train), index=x_train.index, columns=x_train.columns)
    elif scaler == 'MinMaxScaler':
        x_train = pd.DataFrame(MinMaxScaler().fit_transform(x_train), index=x_train.index, columns=x_train.columns)
    
    clf = svm.SVC()
    if withHyperparameterTuning:
        param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 
                      'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                      'kernel': ['rbf', 'linear']} 

        clf = GridSearchCV(clf, param_grid, cv=5)          
        clf.fit(x_train, y_train)
        
        print(clf.best_params_)
        print(clf.best_estimator_)
        
        return clf
    
    clf.fit(x_train, y_train)
    return clf

In [422]:
def predict(clf, x_test, y_test, scaler=None):
    
    if scaler == 'StandardScaler':
        x_test = pd.DataFrame(StandardScaler().fit_transform(x_test), index=x_test.index, columns=x_test.columns)
    elif scaler == 'MinMaxScaler':
        x_test = pd.DataFrame(MinMaxScaler().fit_transform(x_test), index=x_test.index, columns=x_test.columns)    
    
    y_pred = clf.predict(x_test)
    
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average='macro', zero_division=0))
    print("Recall:", recall_score(y_test, y_pred, average='macro', zero_division=0))
    
    print(classification_report(y_test, y_pred, zero_division=0))  

In [423]:
print('SVM metrics without scaling...')
clf = get_SVC_trained_model(x_train, x_test, y_train)
predict(clf, x_test, y_test)

SVM metrics without scaling...
Accuracy: 0.4519230769230769
Precision: 0.13952878895091333
Recall: 0.1541186299081036
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00        38
           5       0.53      0.14      0.23       342
           6       0.44      0.94      0.60       450
           7       0.00      0.00      0.00       173
           8       0.00      0.00      0.00        31
           9       0.00      0.00      0.00         1

    accuracy                           0.45      1040
   macro avg       0.14      0.15      0.12      1040
weighted avg       0.37      0.45      0.33      1040



In [424]:
print('SVM metrics using MinMaxScaler...')
clf = get_SVC_trained_model(x_train, x_test, y_train, scaler='MinMaxScaler')
predict(clf, x_test, y_test, scaler='MinMaxScaler')

SVM metrics using MinMaxScaler...
Accuracy: 0.9807692307692307
Precision: 0.8343330892724212
Recall: 0.7144799814191456
              precision    recall  f1-score   support

           3       1.00      0.20      0.33         5
           4       0.92      0.95      0.94        38
           5       0.99      0.99      0.99       342
           6       0.98      1.00      0.99       450
           7       0.98      0.96      0.97       173
           8       0.97      0.90      0.93        31
           9       0.00      0.00      0.00         1

    accuracy                           0.98      1040
   macro avg       0.83      0.71      0.74      1040
weighted avg       0.98      0.98      0.98      1040



In [425]:
print('SVM metrics using StandardScaler...')
clf = get_SVC_trained_model(x_train, x_test, y_train, scaler='StandardScaler')
predict(clf, x_test, y_test, scaler='StandardScaler')

SVM metrics using StandardScaler...
Accuracy: 0.9884615384615385
Precision: 0.6905515591166874
Recall: 0.7064649565987191
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         5
           4       0.88      1.00      0.94        38
           5       1.00      1.00      1.00       342
           6       0.99      1.00      0.99       450
           7       0.99      0.98      0.99       173
           8       0.97      0.97      0.97        31
           9       0.00      0.00      0.00         1

    accuracy                           0.99      1040
   macro avg       0.69      0.71      0.70      1040
weighted avg       0.98      0.99      0.99      1040



### Hyperparameter tuning

In [412]:
print('SVM metrics without scaling...')
clf = get_SVC_trained_model(x_train, x_test, y_train, withHyperparameterTuning=True)
predict(clf, x_test, y_test)

SVM metrics without scaling...


KeyboardInterrupt: 

In [None]:
print('SVM metrics using MinMaxScaler...')
clf = get_SVC_trained_model(x_train, x_test, y_train, scaler='MinMaxScaler', withHyperparameterTuning=True)
predict(clf, x_test, y_test, scaler='MinMaxScaler')

In [426]:
print('SVM metrics using StandardScaler...')
clf = get_SVC_trained_model(x_train, x_test, y_train, scaler='StandardScaler', withHyperparameterTuning=True)
predict(clf, x_test, y_test, scaler='StandardScaler')

SVM metrics using StandardScaler...
{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
              precision    recall  f1-score   support

           3       1.00      1.00      1.00         5
           4       1.00      1.00      1.00        38
           5       1.00      1.00      1.00       342
           6       1.00      1.00      1.00       450
           7       1.00      1.00      1.00       173
           8       1.00      1.00      1.00        31
           9       1.00      1.00      1.00         1

    accuracy                           1.00      1040
   macro avg       1.00      1.00      1.00      1040
weighted avg       1.00      1.00      1.00      1040



In [427]:
print('Final test - SVM metrics using StandardScaler...')
predict(clf, x_test_final, y_test_final, scaler='StandardScaler')

Final test - SVM metrics using StandardScaler...
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
              precision    recall  f1-score   support

           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00        46
           5       1.00      1.00      1.00       420
           6       1.00      1.00      1.00       579
           7       1.00      1.00      1.00       221
           8       1.00      1.00      1.00        32

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300

