### Wine Quality data preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score, cross_val_predict

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the wine-quality dataset from a path relative to the notebook's
# working directory and preview the first rows.
df = pd.read_csv('data/winequalityN.csv')
df.head(5)

In [None]:
# Preview the last five rows.
df.tail(5)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

### Checking for missing values

In [None]:
# Number of missing values per column.
df.isna().sum()

### Replace missing values with the mean of the corresponding column

In [None]:
# Impute missing numeric values with the mean of their column.
# `numeric_only=True` keeps the string-valued `type` column out of the mean
# computation; without it pandas >= 2.0 raises a TypeError instead of
# silently skipping non-numeric columns.
df.fillna(df.mean(numeric_only=True), inplace=True)
# Re-check: the numeric columns should now report zero missing values.
df.isna().sum()

### One-hot-encoding the type column

In [None]:
# One-hot encode the categorical data. No `columns=` argument is given, so
# pandas picks the columns to encode itself; later cells rely on the
# resulting `type_red` / `type_white` columns.
df = pd.get_dummies(df)
df

In [None]:
# Distribution of the target variable.
sns.histplot(df["quality"])

In [None]:
# Feature matrix: every column except the target. `Index.difference` expects
# column *labels*; the previous code passed the `quality` Series, so labels
# were compared against the quality values and the target column was never
# removed from X.
X = df[df.columns.difference(['quality'])]
Y = df['quality']
# Class balance of the target.
Y.value_counts()

In [None]:
# Pairwise scatter plots of the continuous features only: the target and the
# one-hot `type_*` indicator columns are left out.
df_non_cat = df.drop(labels = ["quality", "type_red", "type_white" ],axis = 1)
sns.pairplot(df_non_cat)

### As we can see in the plots, there are several outliers that can affect the results, so we will remove them from the data.

In [None]:
# Distribution of "fixed acidity" (rows with values > 20 are dropped further down).
sns.histplot(df["fixed acidity"])

In [None]:
# Distribution of "volatile acidity" (rows with values > 1 are dropped further down).
sns.histplot(df["volatile acidity"])

In [None]:
# Distribution of "total sulfur dioxide" (rows with values > 250 are dropped further down).
sns.histplot(df["total sulfur dioxide"])

In [None]:
# Distribution of "density" (rows with values > 1.01 are dropped further down).
sns.histplot(df["density"])

In [None]:
# Distribution of "pH" (rows with values > 3.8 are dropped further down).
sns.histplot(df["pH"])

In [None]:
# Distribution of "sulphates" (rows with values > 1.25 are dropped further down).
sns.histplot(df["sulphates"])

In [None]:
# Distribution of "alcohol" (no cutoff is applied to this column further down).
sns.histplot(df["alcohol"])

In [None]:
# Distribution of "chlorides" (rows with values > 0.2 are dropped further down).
sns.histplot(df["chlorides"])

In [None]:
# Distribution of "free sulfur dioxide" (rows with values > 100 are dropped further down).
sns.histplot(df["free sulfur dioxide"])

In [None]:
# Distribution of "citric acid" (rows with values > 0.75 are dropped further down).
sns.histplot(df["citric acid"])

In [None]:
# Distribution of "residual sugar" (rows with values > 20 are dropped further down).
sns.histplot(df["residual sugar"])

In [None]:
# Upper cutoff per feature; a row exceeding at least one cutoff is treated as
# an outlier and removed from `df` in place.
upper_limits = {
    "fixed acidity": 20,
    "volatile acidity": 1,
    "total sulfur dioxide": 250,
    "density": 1.01,
    "pH": 3.8,
    "sulphates": 1.25,
    "chlorides": 0.2,
    "free sulfur dioxide": 100,
    "citric acid": 0.75,
    "residual sugar": 20,
}

# Accumulate one boolean mask across all features, then drop once.
outlier_mask = pd.Series(False, index=df.index)
for column, limit in upper_limits.items():
    outlier_mask |= df[column] > limit

df.drop(df[outlier_mask].index, inplace=True)

In [None]:
# Same pairplot as before, recomputed on the filtered data to check the
# effect of the outlier removal.
df_non_cat = df.drop(labels = ["quality", "type_red", "type_white" ],axis = 1)
sns.pairplot(df_non_cat)

### Checking the correlations

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
plt.figure(figsize=(12,6))
sns.heatmap(df.corr(),annot=True)

### Train-Test splitting

In [None]:
# Target as a NumPy array; features as every remaining column.
Y_orig = df["quality"].values
X_orig = df.drop(columns=["quality"])

# First hold out 10% of the rows as the final test set ...
X, X_test, Y, Y_test = train_test_split(
    X_orig,
    Y_orig,
    test_size=0.1,
    random_state=13,
)
# ... then split 20% of the remainder off as the validation set.
X_train, X_vald, Y_train, Y_vald = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=13,
)


### Model creation

In [None]:
def getModel(type):
    """Return a new, default-configured classifier for a short model key.

    Recognised keys are "knn", "svc", "rf" and "lr"; any other key yields
    ``None``.
    """
    registry = {
        "knn": KNeighborsClassifier,
        "svc": svm.SVC,
        "rf": RandomForestClassifier,
        "lr": LogisticRegression,
    }

    model_class = registry.get(type, None)
    if model_class is None:
        return None
    return model_class()

In [None]:
def get_trained_model(clf, x_train, y_train, scaler=None, param_grid=None, withHyperparameterTuning=False):
    """Fit ``clf`` on the training data, optionally scaling and grid-searching.

    Parameters
    ----------
    clf : estimator exposing ``fit(x, y)``.
    x_train : pandas.DataFrame of training features (its ``index`` and
        ``columns`` are reused when a scaler is applied).
    y_train : training labels.
    scaler : optional transformer. When given, it is fitted on ``x_train``
        via ``fit_transform``; the same, now-fitted object can then be used
        by the caller to transform other data.
    param_grid : parameter grid handed to ``GridSearchCV``. Only used, and
        then required, when ``withHyperparameterTuning`` is True.
    withHyperparameterTuning : when True, ``clf`` is wrapped in
        ``GridSearchCV(clf, param_grid, cv=5)`` and the best parameters and
        best estimator are printed after fitting.

    Returns
    -------
    The fitted estimator (the ``GridSearchCV`` object when tuning).

    Raises
    ------
    ValueError
        If tuning is requested without a ``param_grid``.
    """
    # Fail fast with a clear message rather than an opaque error raised from
    # inside the grid search.
    if withHyperparameterTuning and param_grid is None:
        raise ValueError("param_grid is required when withHyperparameterTuning=True")

    if scaler is not None:
        # Re-wrap the transformed data so index and column labels survive.
        x_train = pd.DataFrame(scaler.fit_transform(x_train), index=x_train.index, columns=x_train.columns)

    if withHyperparameterTuning:
        clf = GridSearchCV(clf, param_grid, cv=5)

    clf.fit(x_train, y_train)

    if withHyperparameterTuning:
        print(clf.best_params_)
        print(clf.best_estimator_)

    return clf

In [None]:
def predict(clf, x_test, y_test, scaler=None):
    """Predict on ``x_test`` with ``clf`` and print evaluation metrics.

    When ``scaler`` is given, ``x_test`` is passed through
    ``scaler.transform`` first (the scaler is not refitted here). Prints
    accuracy, macro-averaged precision and recall, and the full
    classification report. Returns nothing.
    """
    features = x_test
    if scaler is not None:
        # Keep the original index/column labels on the transformed data.
        features = pd.DataFrame(
            scaler.transform(x_test),
            index=x_test.index,
            columns=x_test.columns,
        )

    y_pred = clf.predict(features)

    # Shared settings for the macro-averaged scores.
    macro = {"average": "macro", "zero_division": 0}
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, **macro))
    print("Recall:", recall_score(y_test, y_pred, **macro))
    print(classification_report(y_test, y_pred, zero_division=0))

### Logistic Regression


### Decision Tree


### Random Forest

In [None]:
# Key understood by getModel() for the random forest classifier.
rf = 'rf'

In [None]:
# Baseline: random forest on the raw (unscaled) features.
print('Random forest metrics without scaling...')
clf = get_trained_model(getModel(rf), X_train, Y_train)
# Validation-set metrics first, then the held-out test set.
predict(clf, X_vald, Y_vald)
print('Final test...')
predict(clf, X_test, Y_test)

# Mean accuracy over 5-fold cross-validation on the training split.
print('CV accuracy', cross_val_score(clf, X_train, Y_train, cv=5).mean())

In [None]:
from sklearn.pipeline import make_pipeline

# Random forest on Min-Max scaled features.
print('Random forest metrics using MinMaxScaler...')
scaler = MinMaxScaler()
clf = get_trained_model(getModel(rf), X_train, Y_train, scaler=scaler)
predict(clf, X_vald, Y_vald, scaler=scaler)
print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

# cross_val_score refits a clone of the estimator on the raw X_train folds,
# so passing `clf` alone would score an *unscaled* model. Putting the scaler
# into a pipeline makes the CV score reflect this configuration, with the
# scaler fitted on each training fold only.
cv_model = make_pipeline(MinMaxScaler(), getModel(rf))
print('CV accuracy', cross_val_score(cv_model, X_train, Y_train, cv=5).mean())

In [None]:
from sklearn.pipeline import make_pipeline

# Random forest on standardized features.
print('Random forest metrics using StandardScaler...')
scaler = StandardScaler()
clf = get_trained_model(getModel(rf), X_train, Y_train, scaler=scaler)
predict(clf, X_vald, Y_vald, scaler=scaler)
print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

# cross_val_score refits a clone of the estimator on the raw X_train folds,
# so passing `clf` alone would score an *unscaled* model. Putting the scaler
# into a pipeline makes the CV score reflect this configuration, with the
# scaler fitted on each training fold only.
cv_model = make_pipeline(StandardScaler(), getModel(rf))
print('CV accuracy', cross_val_score(cv_model, X_train, Y_train, cv=5).mean())

### KNN

In [None]:
# Key understood by getModel() for the k-nearest-neighbours classifier.
knn = "knn"

In [None]:
# Hyperparameters to tune
knn_params  = {'leaf_size': list(range(1,5)), 
               'n_neighbors': list(range(1,10)),
               'p': [1,2]}                          


In [None]:
print('KNN metrics without scaling...')
# get_trained_model only uses `param_grid` when withHyperparameterTuning is
# True; without the flag the grid above was silently ignored and a default
# KNN was trained.
clf = get_trained_model(getModel(knn), X_train, Y_train, param_grid=knn_params, withHyperparameterTuning=True)
predict(clf, X_vald, Y_vald)

print('Final test...')
predict(clf, X_test, Y_test)

In [None]:
# Default KNN (no grid search) on Min-Max scaled features. The scaler is
# fitted inside get_trained_model and reused by predict().
print('KNN metrics using MinMaxScaler...')
scaler = MinMaxScaler()
clf = get_trained_model(getModel(knn), X_train, Y_train, scaler=scaler)
predict(clf, X_vald, Y_vald, scaler=scaler)

print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

In [None]:
# print(classification_report(Y_test, knn_pred))

### SVM

In [None]:
# Key understood by getModel() for the support vector classifier. Not to be
# confused with the imported `svm` module.
svc = "svc"

In [None]:
# Baseline: default SVC on the raw (unscaled) features.
print('SVM metrics without scaling...')
clf = get_trained_model(getModel(svc), X_train, Y_train)
predict(clf, X_vald, Y_vald)

print('Final test...')
predict(clf, X_test, Y_test)

In [None]:
# Default SVC on Min-Max scaled features. The scaler is fitted inside
# get_trained_model and reused by predict().
print('SVM metrics using MinMaxScaler...')
scaler = MinMaxScaler()
clf = get_trained_model(getModel(svc), X_train, Y_train, scaler=scaler)
predict(clf, X_vald, Y_vald, scaler=scaler)

print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

In [None]:
# Default SVC on standardized features. The scaler is fitted inside
# get_trained_model and reused by predict().
print('SVM metrics using StandardScaler...')
scaler = StandardScaler()
clf = get_trained_model(getModel(svc), X_train, Y_train, scaler=scaler)
predict(clf, X_vald, Y_vald, scaler=scaler)

print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

## HyperParameter Tuning

In [None]:
# Grid for the SVC search: 5 values of C x 5 values of gamma x 2 kernels,
# i.e. 50 candidate combinations.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear']} 

In [None]:
# Grid-searched SVC on the raw (unscaled) features; get_trained_model prints
# the best parameters and estimator it finds.
print('SVM metrics without scaling...')
clf = get_trained_model(getModel(svc), X_train, Y_train, withHyperparameterTuning=True, param_grid=param_grid)
predict(clf, X_vald, Y_vald)

print('Final test...')
predict(clf, X_test, Y_test)

In [None]:
# Grid-searched SVC on Min-Max scaled features.
print('SVM metrics using MinMaxScaler...')
scaler = MinMaxScaler()
# Use the model key `svc`: passing the imported `svm` module made getModel
# return None, so the grid search had no estimator to fit.
clf = get_trained_model(getModel(svc), X_train, Y_train, scaler=scaler, withHyperparameterTuning=True, param_grid=param_grid)
predict(clf, X_vald, Y_vald, scaler=scaler)

print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

In [None]:
# Grid-searched SVC on standardized features.
print('SVM metrics using StandardScaler...')
scaler = StandardScaler()
# Use the model key `svc`: passing the imported `svm` module made getModel
# return None, so the grid search had no estimator to fit.
clf = get_trained_model(getModel(svc), X_train, Y_train, scaler=scaler, withHyperparameterTuning=True, param_grid=param_grid)
predict(clf, X_vald, Y_vald, scaler=scaler)

print('Final test...')
predict(clf, X_test, Y_test, scaler=scaler)

### Final Results

In [None]:
# Summary table of test-set accuracy per model.
# The previous version referenced names that are never defined in this
# notebook (y_test, pred_svc, pred_rfc, pred_knn) and paired five model
# labels with three accuracies, which cannot form a DataFrame. The table is
# rebuilt here from the three model families the notebook actually trains,
# each fitted on standardized features and scored on the held-out test set.
evaluated_models = {"Random Forest": "rf", "KNN": "knn", "SVM": "svc"}

final_accuracies = []
for model_key in evaluated_models.values():
    final_scaler = StandardScaler()
    final_clf = get_trained_model(getModel(model_key), X_train, Y_train, scaler=final_scaler)
    # Apply the scaler fitted on the training data to the test features.
    X_test_scaled = pd.DataFrame(final_scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    final_accuracies.append(accuracy_score(Y_test, final_clf.predict(X_test_scaled)))

results = pd.DataFrame({'models': list(evaluated_models),
                        'accuracies': final_accuracies})
results