In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

!pip install selenium
!pip install bokeh
from bokeh.plotting import figure
from bokeh.io import show, output_notebook, output_file, export_png
from bokeh.layouts import gridplot, layout, row

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K    100% |████████████████████████████████| 911kB 23.5MB/s 
Installing collected packages: selenium
Successfully installed selenium-3.141.0


In [2]:
# Read the white and the red wine-quality tables (semicolon-separated CSV files)
# in that order, then preview the first rows of the white table.
white_df, red_df = (
    pd.read_csv(csv_url, sep=';')
    for csv_url in (
        'https://raw.githubusercontent.com/Zaabson/wine-quality/master/winequality-white.csv',
        'https://raw.githubusercontent.com/Zaabson/wine-quality/master/winequality-red.csv',
    )
)
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [20]:
# Configure Bokeh so that the show() call at the end of this cell renders inline in the notebook.
output_notebook()

def plot_but_only_text(text, width, height, text_color, font_size):
  """Build a Bokeh figure that shows a single text label and nothing else.

  The label is anchored at data coordinates (0, 0), centred horizontally and
  vertically on that point. Grid lines, both axes, the outline and the toolbar
  are all hidden.

  Args:
    text: the string to display.
    width: figure width passed to ``figure``.
    height: figure height passed to ``figure``.
    text_color: colour of the label.
    font_size: font size of the label (e.g. ``'30pt'``).

  Returns:
    The configured Bokeh figure.
  """
  canvas = figure(width = width, height = height, toolbar_location = None)
  canvas.text(0, 0, text = [text], text_color = text_color, text_font_size = font_size,
              text_align = 'center', text_baseline = 'middle')

  # Remove every decoration so that only the label remains visible.
  for grid in (canvas.xgrid, canvas.ygrid):
    grid.grid_line_color = None
  for axis in (canvas.xaxis, canvas.yaxis):
    axis.visible = False
  canvas.outline_line_color = None

  return canvas


def make_hist(serie_name, bins, x_range):
  """Overlay relative-frequency histograms of one column for white and red wine.

  Both histograms are computed over the same value range (the combined min/max
  of the column in ``white_df`` and ``red_df``), so their bin edges coincide.
  Bin counts are divided by the number of rows of the respective table.

  Args:
    serie_name: column name present in both ``white_df`` and ``red_df``; also
      used as the plot title.
    bins: number of histogram bins.
    x_range: ``(low, high)`` visible x-axis window of the figure.

  Returns:
    A 250x250 Bokeh figure with two semi-transparent quad layers
    (white wine in the default fill colour, red wine in crimson).
  """
  white_values = white_df[serie_name]
  red_values = red_df[serie_name]

  # Shared range -> identical bin edges for both colours.
  shared_range = (min(white_values.min(), red_values.min()), max(white_values.max(), red_values.max()))
  hist_white, edges_white = np.histogram(white_values, bins=bins, range=shared_range)
  hist_red, edges_red = np.histogram(red_values, bins=bins, range=shared_range)
  # Convert counts to relative frequencies so tables of different size are comparable.
  hist_white = hist_white / len(white_values)
  hist_red = hist_red / len(red_values)

  # `width`/`height` match the other figure() calls in this notebook
  # (`plot_width`/`plot_height` are no longer accepted by Bokeh 3).
  plot = figure(title=serie_name, x_range = x_range, width = 250, height = 250, toolbar_location = None)
  plot.quad(bottom = 0, top = hist_white, left = edges_white[:-1], right = edges_white[1:], fill_alpha = 0.3, line_alpha = 0)  # white wine
  plot.quad(bottom = 0, top = hist_red, left = edges_red[:-1], right = edges_red[1:], fill_alpha = 0.3, line_alpha = 0, fill_color = 'crimson')  # red wine

  return plot

# Number of bins per column, tuned individually so that each histogram looks good.
bins_num_for_serie = {
    'fixed acidity': 100,
    'volatile acidity': 50,
    'citric acid': 100,
    'residual sugar': 100,
    'chlorides': 100,
    'free sulfur dioxide': 70,
    'total sulfur dioxide': 100,
    'density': 100,
    'pH': 50,
    'sulphates': 60,
    'alcohol': 60,
}

# Visible x-axis window per column, also tuned individually.
x_range = {
    'fixed acidity': (4, 16),
    'volatile acidity': (0, 1.4),
    'citric acid': (0, 1),
    'residual sugar': (0, 23),
    'chlorides': (0, 0.2),
    'free sulfur dioxide': (0, 100),
    'total sulfur dioxide': (0, 300),
    'density': (0.985, 1.01),
    'pH': (2.7, 4),
    'sulphates': (0, 1.5),
    'alcohol': (8, 15),
}

# One histogram per column, skipping the last column of the table.
feature_columns = white_df.columns[:-1]
histograms = [make_hist(name, bins_num_for_serie[name], x_range[name]) for name in feature_columns]

title = plot_but_only_text('Distributions of wine characteristics (relative frequencies)', 1200, 100, 'mediumturquoise', '30pt')

# The "subtitle" is an empty text plot that only hosts the legend: the circle
# glyphs carry no data points and exist solely to create the legend entries.
subtitle = plot_but_only_text("", 250, 200, 'green', "10pt")
for wine_label, legend_colour, legend_alpha in (("red wine", 'crimson', 0.5), ("white wine", "lightskyblue", 0.65)):
  subtitle.circle([], [], fill_color = legend_colour, fill_alpha = legend_alpha, legend = wine_label, line_color = None)
subtitle.legend.label_text_font_size = '18pt'
subtitle.legend.border_line_color = None

# Row 1: title; row 2: legend + 3 histograms; rows 3 and 4: 4 histograms each.
grid_rows = [[title], [subtitle] + histograms[:3], histograms[3:7], histograms[7:11]]
hist_grid = layout(grid_rows, merge_tools = True)
show(hist_grid)

In [0]:
# Rendering the scatter matrix built in this cell takes a moment.

output_notebook()

# Random 1000-row subsets of each table; make_scatter draws from these.
white_sample, red_sample = white_df.sample(n=1000), red_df.sample(n=1000)

def make_scatter(x_col_name, y_col_name):
  """Scatter one column against another for the sampled white and red wines.

  Args:
    x_col_name: column plotted on the x axis (also the x-axis label).
    y_col_name: column plotted on the y axis (also the y-axis label).

  Returns:
    A 250x250 Bokeh figure with the points of ``white_sample`` in deepskyblue
    and the points of ``red_sample`` in darkred.
  """
  scatter = figure(height = 250, width = 250, toolbar_location = None)
  scatter.xaxis.axis_label = x_col_name
  scatter.yaxis.axis_label = y_col_name

  # White wine is drawn first, red wine on top of it.
  for sample, point_colour in ((white_sample, 'deepskyblue'), (red_sample, 'darkred')):
    scatter.circle(sample[x_col_name], sample[y_col_name], size=1, color=point_colour)

  return scatter


# Build the square scatter matrix: cell (i, j) plots column i on the x axis
# against column j on the y axis; diagonal cells show the column name instead.
# The grid size follows the number of columns (all but the last one) rather
# than a hard-coded 11.
feature_names = list(white_df.columns[:-1])

names_grid = [
    [
        plot_but_only_text(x_name, 250, 250, text_color = 'black', font_size = '15pt')
        if row_idx == col_idx
        else make_scatter(x_name, y_name)
        for col_idx, y_name in enumerate(feature_names)
    ]
    for row_idx, x_name in enumerate(feature_names)
]


main_title = plot_but_only_text("Corelations between wine characteristics", 1200, 100, 'mediumturquoise', '40pt')

# Empty text plot that only hosts the legend; the circle glyphs have no data.
subtitle = plot_but_only_text("", 300, 100, 'green', "10pt")
subtitle.circle([], [], color = "darkred", legend = "red wine")
subtitle.circle([], [], color = "deepskyblue", legend = "white wine")
subtitle.legend.label_text_font_size = '18pt'
subtitle.legend.border_line_color = None


corelations_grid = layout([[main_title], [subtitle], *names_grid])
show(corelations_grid)

In [0]:
# prepare data

# for quality_white: all columns but the last are features, the last column is
# the target (kept 2-D, one column). Features are standardised per column with
# the white-wine mean / standard deviation.
white_array = np.array(white_df)
characteristics_white, quality_white = white_array[:, :-1], white_array[:, -1:]
mean_white = characteristics_white.mean(axis=0)
std_white = characteristics_white.std(axis=0)
characteristics_white = (characteristics_white - mean_white) / std_white

# for quality red: same treatment with the red-wine statistics
red_array = np.array(red_df)
characteristics_red, quality_red = red_array[:, :-1], red_array[:, -1:]
mean_red = characteristics_red.mean(axis=0)
std_red = characteristics_red.std(axis=0)
characteristics_red = (characteristics_red - mean_red) / std_red


# for colour: append a label column (red = 1, white = 0) to each table
ones = pd.Series(np.ones(red_df.shape[0]))
red_df_1 = pd.concat([red_df, ones], axis=1)
zeros = pd.Series(np.zeros(white_df.shape[0]))
white_df_0 = pd.concat([white_df, zeros], axis=1)

# Positions 0-10 are the 11 features, position 12 is the colour label
# (position 11, the quality score, is deliberately left out).
colour_columns = list(range(11)) + [12]
# Balance the classes: keep as many white rows as there are red rows
# (this count was previously hard-coded as 1599).
n_red = red_df.shape[0]
# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.
train_inputs = pd.concat([red_df_1.iloc[:, colour_columns],
                          white_df_0.iloc[:n_red, colour_columns]])
train_inputs = np.array(train_inputs)
np.random.shuffle(train_inputs)  # shuffles rows in place (unseeded)
train_inputs, train_outputs = np.split(train_inputs, [11], axis=1)

# Standardise the colour-model inputs with the statistics of the combined
# red + white table. The prediction cell feeds the colour model features that
# are standardised exactly this way, so training on raw values would leave the
# model seeing a different scale at prediction time.
combined_features = np.array(pd.concat([red_df, white_df]).iloc[:, :11])
train_inputs = (train_inputs - combined_features.mean(axis=0)) / combined_features.std(axis=0)

In [22]:
# Colour model: 11 input features -> 64 -> 32 -> 1, sigmoid activations
# throughout, so the single output lies in (0, 1).

model_colour = keras.Sequential()
model_colour.add(keras.layers.Dense(64, input_shape=(11,), activation='sigmoid'))
model_colour.add(keras.layers.Dense(32, activation='sigmoid'))
model_colour.add(keras.layers.Dense(1, activation='sigmoid'))

model_colour.compile(optimizer='rmsprop', loss='mse', metrics=['mae', 'acc'])
model_colour.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 64)                768       
_________________________________________________________________
dense_13 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 33        
Total params: 2,881
Trainable params: 2,881
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Two quality regressors with identical architecture, one per wine colour.

def _build_quality_model():
  """Return a freshly compiled 11 -> 64 -> 32 -> 1 network (ReLU hidden layers, linear output)."""
  network = keras.Sequential()
  network.add(keras.layers.Dense(64, input_shape=(11,), activation='relu'))
  network.add(keras.layers.Dense(32, activation='relu'))
  network.add(keras.layers.Dense(1))
  network.compile(optimizer='adam', loss='mse', metrics=['mae'])
  return network

model_quality_white = _build_quality_model()
model_quality_red = _build_quality_model()

model_quality_white.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 64)                768       
_________________________________________________________________
dense_16 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 33        
Total params: 2,881
Trainable params: 2,881
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Fit all three models with the same schedule; each History object is kept
# so the loss curves can be plotted afterwards.

fit_options = dict(epochs=30, batch_size=64)

model_colour_history = model_colour.fit(train_inputs, train_outputs, **fit_options)
model_quality_white_history = model_quality_white.fit(characteristics_white, quality_white, **fit_options)
model_quality_red_history = model_quality_red.fit(characteristics_red, quality_red, **fit_options)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 2

In [25]:
output_notebook()

def _loss_curve(title, history):
  """Return a 300x300 line plot of the per-epoch training loss.

  Args:
    title: figure title.
    history: object returned by ``model.fit``; its ``history['loss']`` list is plotted
      against the epoch index (starting at 0).
  """
  losses = history.history['loss']
  # `width`/`height` match the figure() calls used for the other plots in this
  # notebook (`plot_width`/`plot_height` are no longer accepted by Bokeh 3).
  curve = figure(title=title, width=300, height=300)
  curve.line(x=np.arange(len(losses)), y=losses)
  return curve

g1 = _loss_curve('colour model loss', model_colour_history)
g2 = _loss_curve('quality model loss (white)', model_quality_white_history)
g3 = _loss_curve('quality model loss (red)', model_quality_red_history)

show(row(g1, g2, g3))

In [26]:
# Score the model trained on white wine against the red-wine features and targets.
# NOTE(review): the white model is evaluated on red data here, presumably as a
# deliberate cross-colour check; confirm this is not meant to be model_quality_red.
model_quality_white.evaluate(characteristics_red, quality_red)



[0.8495421718253875, 0.7119241]

In [27]:
# now use the models to first predict a colour
# and then predict a wine quality using two different models for separate colours

def predict_quality(characteristics_array):
    """Predict wine quality by blending the red and the white quality model.

    The colour model output is used as the weight of the red-wine prediction
    and its complement as the weight of the white-wine prediction
    (colour encoding: red = 1, white = 0).

    Args:
        characteristics_array: 2-D array of wine features, one row per wine,
            passed unchanged to all three models.

    Returns:
        The element-wise weighted average of ``model_quality_red`` and
        ``model_quality_white`` predictions.
    """
    colours = model_colour.predict(characteristics_array)
    quality_if_white = model_quality_white.predict(characteristics_array)
    quality_if_red = model_quality_red.predict(characteristics_array)
    # Weighted average with the colour output as weight. The original returned
    # an undefined name (`predicted_quality`) instead of this value, which only
    # worked when a stale global of that name happened to exist in the kernel.
    return colours * quality_if_red + (1 - colours) * quality_if_white

# Stack both tables once (red rows first, then white). pd.concat replaces
# DataFrame.append, which no longer exists in pandas 2.
all_wines = pd.concat([red_df, white_df])

# First 11 columns are the features; standardise them with the statistics of
# the combined table.
# NOTE(review): model_quality_white / model_quality_red were fitted on features
# standardised with per-colour statistics, so the scale used here differs from
# the one they were trained on; confirm this is acceptable.
characteristics_array = np.array(all_wines.iloc[:, :11])
mean = characteristics_array.mean(axis=0)
std = characteristics_array.std(axis=0)
characteristics_array = (characteristics_array - mean) / std
# Targets as a column vector so they line up with the model output.
quality = np.array(all_wines['quality']).reshape(characteristics_array.shape[0], 1)

prediction = predict_quality(characteristics_array)
mean_absolute_error = np.sum(np.abs(prediction - quality)) / prediction.shape[0]
print(f"mean absolute error between quality and predicted quality is: {mean_absolute_error}")

mean absolute error between quality and predicted quality is: 0.619186022290087


Now check whether the two-stage approach above is better than training a single quality model on both wine colours.

In [0]:
# Baseline: one quality regressor (11 -> 64 -> 32 -> 1, ReLU hidden layers)
# fitted on the combined, jointly standardised red + white data.
# NOTE(review): this model trains for 20 epochs with the default batch size,
# whereas the other fit calls use epochs=30, batch_size=64; confirm the
# comparison is intended to be made under different schedules.

model_quality_both = keras.Sequential()
model_quality_both.add(keras.layers.Dense(64, input_shape=(11,), activation='relu'))
model_quality_both.add(keras.layers.Dense(32, activation='relu'))
model_quality_both.add(keras.layers.Dense(1))

model_quality_both.compile(optimizer='adam', loss='mse', metrics=['mae'])

model_quality_both_history = model_quality_both.fit(characteristics_array, quality, epochs=20)

In [33]:
# Mean absolute error of the single combined model on the same data it was fitted on.
prediction1 = model_quality_both.predict(characteristics_array)
absolute_errors = np.abs(prediction1 - quality)
mae2 = np.sum(absolute_errors) / prediction1.shape[0]
f"mean absolute error here is {mae2}"

'mean absolute error here is 0.6166889455403954'