In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scikeras.wrappers import KerasClassifier

In [2]:
data = pd.read_csv('winequality-red.csv')
data.columns = [col.strip() for col in data.columns]

In [3]:
old_data_length = len(data)
IF_model = IsolationForest(contamination=0.1) # Remove 10% data
outliers = IF_model.fit_predict(data) # train IF model
data['outliers'] = outliers

# Remove outliers
data_outliers = data[data['outliers'] == -1]
new_data = data[data['outliers'] != -1].drop(columns = ['outliers'])

new_data_len = len(new_data)

print(f"There are total {old_data_length - new_data_len} rows been removied.")

There are total 160 rows been removied.


In [4]:
def wine_classifier(quality, top_percent):
    if quality >= top_percent[0.9]:
        return 'Great'
    elif quality >= top_percent[0.7]:
        return 'Good'
    elif quality >= top_percent[0.3]:
        return 'Normal'
    else:
        return 'Bad'

In [5]:
top_percent = new_data['quality'].quantile([0.3, 0.7, 0.9])

quality = []
for index, row in new_data.iterrows():
    quality.append(wine_classifier(row['quality'], top_percent))

new_data['classified_quality'] = quality

In [6]:
x = new_data.drop(columns=['quality', 'classified_quality'], axis=1)
y = OneHotEncoder().fit_transform(new_data[['classified_quality']]).toarray()

In [7]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2, random_state=10)

In [8]:
from sklearn.preprocessing import MinMaxScaler
minmax_scaler = MinMaxScaler()
scale_xTrain = minmax_scaler.fit_transform(xTrain)
scale_xTest = minmax_scaler.transform(xTest)

In [9]:
#change y arrays to int arrays
int_yTrain = yTrain.astype(int)
int_yTest = yTest.astype(int)

#create NN with input layer that takes in 11 features. Has 6 hidden layers, 512, 256, 128, 64, 32, and 4 notes respectively. 
#All have 'relu' activation functions except for last layer, which uses 'softmax'
model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(11,)),
        tf.keras.layers.Dense(512, activation="relu"),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(4, activation="softmax") 
    ])

#compile model using the adam optimization algo, using categorical crossentropy since we have a multiclass model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

epochs = 50
batch = 32
modelfit = model.fit(scale_xTrain, int_yTrain, epochs=epochs, batch_size=batch)

test_loss, test_accuracy = model.evaluate(scale_xTest, yTest)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [10]:

# def create_model(optimizer='adam', kernel_initializer='glorot_uniform', dropout_rate=0.2):
#     model = tf.keras.Sequential([
#         tf.keras.layers.Input(shape=(11,)),
#         tf.keras.layers.Dense(512, activation="relu", kernel_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(256, activation="relu", kernel_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(128, activation="relu", kernel_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(64, activation="relu", kernel_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(32, activation="relu", kernel_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(4, activation="softmax", kernel_initializer='glorot_uniform') 
#     ])

#     model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
#     return model

# model = KerasClassifier(build_fn = create_model, verbose = 0, kernel_initializer = 'glorot_uniform')
# epochs = 50
# batch = 32
# modelfit = model.fit(scale_xTrain, int_yTrain, epochs=epochs, batch_size=batch, validation_split=0.2)


In [16]:
#create wrapper in order to format keras for scikit-learn, trying glorot uniform  this time
def create_model(optimizer='adam', kernel_initializer='glorot_uniform', dropout_rate=0.2):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(11,)),
        tf.keras.layers.Dense(512, activation="relu"),
        tf.keras.layers.Dense(256, activation="relu"),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(4, activation="softmax") 
    ])

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

model = KerasClassifier(build_fn = create_model, verbose = 0, kernel_initializer = 'glorot_uniform', dropout_rate=0.4)
epochs = 50
batch = 32
history = model.fit(scale_xTrain, int_yTrain, epochs=epochs, batch_size=batch)

#testing adam, rmsprop, or sgd for best optimizer, as well as best batch size and number of epochs.
hparam = {'epochs': [36, 50, 100], 'batch_size': [32,64], 'dropout_rate': [0.2,0.3,0.4], 'optimizer': ['adam', 'rmsprop', 'SGD'], 'kernel_initializer': ['glorot_uniform', 'normal']}

random_search = RandomizedSearchCV(
    model,
    param_distributions = hparam,
    n_iter = 10,
    cv = 3,
    scoring = 'accuracy',
    verbose = 2

)


random_search.fit(scale_xTrain, int_yTrain)
print("Best Hyperparameters:", random_search.best_params_)

  X, y = self._initialize(X, y)


Fitting 3 folds for each of 10 candidates, totalling 30 fits


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.2, epochs=36, kernel_initializer=glorot_uniform, optimizer=adam; total time=   1.8s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.2, epochs=36, kernel_initializer=glorot_uniform, optimizer=adam; total time=   1.8s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.2, epochs=36, kernel_initializer=glorot_uniform, optimizer=adam; total time=   1.7s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=50, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   3.2s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=50, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   3.1s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=50, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   3.0s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   6.7s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   6.3s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   6.3s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=glorot_uniform, optimizer=adam; total time=   6.1s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=glorot_uniform, optimizer=adam; total time=   6.1s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=glorot_uniform, optimizer=adam; total time=   5.9s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=normal, optimizer=adam; total time=   6.1s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=normal, optimizer=adam; total time=   5.7s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.4, epochs=100, kernel_initializer=normal, optimizer=adam; total time=   5.6s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.2, epochs=36, kernel_initializer=glorot_uniform, optimizer=rmsprop; total time=   2.5s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.2, epochs=36, kernel_initializer=glorot_uniform, optimizer=rmsprop; total time=   2.6s


  X, y = self._initialize(X, y)


[CV] END batch_size=32, dropout_rate=0.2, epochs=36, kernel_initializer=glorot_uniform, optimizer=rmsprop; total time=   3.2s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.2, epochs=100, kernel_initializer=normal, optimizer=adam; total time=   3.7s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.2, epochs=100, kernel_initializer=normal, optimizer=adam; total time=   3.6s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.2, epochs=100, kernel_initializer=normal, optimizer=adam; total time=   3.5s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=100, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   3.5s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=100, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   3.4s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=100, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   3.5s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=50, kernel_initializer=glorot_uniform, optimizer=rmsprop; total time=   2.2s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=50, kernel_initializer=glorot_uniform, optimizer=rmsprop; total time=   2.1s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=50, kernel_initializer=glorot_uniform, optimizer=rmsprop; total time=   2.0s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=36, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   1.7s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=36, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   1.7s


  X, y = self._initialize(X, y)


[CV] END batch_size=64, dropout_rate=0.3, epochs=36, kernel_initializer=glorot_uniform, optimizer=SGD; total time=   2.1s


  X, y = self._initialize(X, y)


Best Hyperparameters: {'optimizer': 'rmsprop', 'kernel_initializer': 'glorot_uniform', 'epochs': 50, 'dropout_rate': 0.3, 'batch_size': 64}


In [20]:
#implementing model with best hyperparameters
model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(11,)),
        tf.keras.layers.Dense(512, activation="relu",kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(256, activation="relu",kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(128, activation="relu",kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(64, activation="relu",kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(32, activation="relu",kernel_initializer='glorot_uniform'),
        tf.keras.layers.Dense(4, activation="softmax",kernel_initializer='glorot_uniform') 
    ])

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

epochs = 50
batch = 64
history = model.fit(scale_xTrain, int_yTrain, epochs=epochs, batch_size=batch)

test_loss, test_accuracy = model.evaluate(scale_xTest, int_yTest)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
