In [0]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

!pip install selenium
!pip install bokeh
from bokeh.plotting import figure
from bokeh.io import show, output_notebook, output_file, export_png
from bokeh.layouts import gridplot, layout, row

In [67]:
white_df = pd.read_csv('https://raw.githubusercontent.com/Zaabson/wine-quality/master/winequality-white.csv', sep=';')
red_df = pd.read_csv('https://raw.githubusercontent.com/Zaabson/wine-quality/master/winequality-red.csv', sep=';')
white_df = white_df.sample(frac=1).reset_index(drop=True)
red_df = red_df.sample(frac=1).reset_index(drop=True)
red_df.reindex()
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,6.4,0.28,0.19,5.4,0.042,67.0,181.0,0.99435,3.31,0.35,10.2,6
1,8.1,0.12,0.49,1.2,0.042,43.0,160.0,0.9934,3.13,0.48,9.7,6
2,5.8,0.28,0.66,9.1,0.039,26.0,159.0,0.9965,3.66,0.55,10.8,5
3,5.3,0.32,0.23,9.65,0.026,26.0,119.0,0.99168,3.18,0.53,12.2,6
4,6.7,0.18,0.3,6.4,0.048,40.0,251.0,0.9956,3.29,0.52,10.0,5


In [68]:
output_notebook()

def plot_but_only_text(text, width, height, text_color, font_size):
  
  plot = figure(width = width, height = height, toolbar_location = None)
  plot.text(0, 0, text_align = 'center', text_baseline = 'middle', text = [text], text_color = text_color, text_font_size = font_size)
  plot.xgrid.grid_line_color = None
  plot.ygrid.grid_line_color = None
  plot.outline_line_color = None
  plot.xaxis.visible = False
  plot.yaxis.visible = False
  
  return plot


def make_hist(serie_name, bins, x_range):
  
  range1 = (min(white_df[serie_name].min(), red_df[serie_name].min()), max(white_df[serie_name].max(), red_df[serie_name].max()))
  hist_white, edges_white = np.histogram(white_df[serie_name], bins=bins, range=range1)
  hist_red, edges_red = np.histogram(red_df[serie_name], bins=bins, range=range1)
  hist_white = hist_white / white_df.shape[0]
  hist_red = hist_red / red_df.shape[0]
  
  plot = figure(title=serie_name, x_range = x_range, plot_width = 250, plot_height = 250, toolbar_location = None)
  plot.quad(bottom = np.zeros(shape=(hist_white.shape[0])), top = hist_white, left = edges_white[:-1], right = edges_white[1:], fill_alpha = 0.3, line_alpha = 0)  # white wine
  plot.quad(bottom = np.zeros(shape=(hist_red.shape[0])), top = hist_red, left = edges_red[:-1], right = edges_red[1:], fill_alpha = 0.3, line_alpha = 0, fill_color = 'crimson')#red wine
  
  return plot

# individualy adjust numbers of bins for every serie to loog good
bins_num_for_serie = {'fixed acidity':100, 'volatile acidity':50, 'citric acid':100, 'residual sugar':100,
       'chlorides':100, 'free sulfur dioxide':70, 'total sulfur dioxide':100, 'density':100,
       'pH':50, 'sulphates':60, 'alcohol':60}

# also adjust range
x_range = {'fixed acidity':(4, 16), 'volatile acidity':(0, 1.4), 'citric acid':(0, 1), 'residual sugar':(0, 23),
       'chlorides':(0, 0.2), 'free sulfur dioxide':(0, 100), 'total sulfur dioxide':(0, 300), 'density':(0.985, 1.01),
       'pH':(2.7, 4), 'sulphates':(0, 1.5), 'alcohol':(8, 15)}

histograms = [make_hist(serie_name, bins_num_for_serie[serie_name], x_range[serie_name]) for serie_name in white_df.columns[:-1]]

title = plot_but_only_text('Distributions of wine characteristics (relative frequencies)', 1200, 100, 'mediumturquoise', '30pt')
subtitle = plot_but_only_text("", 250, 200, 'green', "10pt")
subtitle.circle([], [], fill_color = 'crimson', fill_alpha = 0.5, legend = "red wine", line_color = None)
subtitle.circle([], [], fill_color = "lightskyblue", fill_alpha = 0.65, legend = "white wine", line_color = None)
subtitle.legend.label_text_font_size = '18pt'
subtitle.legend.border_line_color = None

hist_grid = layout([[title], [subtitle] + histograms[:3], histograms[3:7], histograms[7:11]], merge_tools = True)
show(hist_grid)

In [69]:
# this takes a moment to load

output_notebook()

white_sample = white_df.sample(1000)
red_sample = red_df.sample(1000)

def make_scatter(x_col_name, y_col_name):
  
  plot = figure(height = 250, width = 250, toolbar_location = None)
  plot.xaxis.axis_label = x_col_name
  plot.yaxis.axis_label = y_col_name
  plot.circle(white_sample[x_col_name], white_sample[y_col_name], size=1, color='deepskyblue')
  plot.circle(red_sample[x_col_name], red_sample[y_col_name], size=1, color='darkred')
  
  return plot


names_grid = [[(x, y) for y in white_df.columns[:-1]] for x in white_df.columns[:-1]]

for x in range(11):
  for y in range(11):
    
    if x == y:
      names_grid[x][y] = plot_but_only_text(names_grid[x][y][0], 250, 250, text_color = 'black', font_size = '15pt')
    else:
      names_grid[x][y] = make_scatter(*names_grid[x][y])


main_title = plot_but_only_text("Corelations between wine characteristics", 1200, 100, 'mediumturquoise', '40pt')
subtitle = plot_but_only_text("", 300, 100, 'green', "10pt")
subtitle.circle([], [], color = "darkred", legend = "red wine")
subtitle.circle([], [], color = "deepskyblue", legend = "white wine")
subtitle.legend.label_text_font_size = '18pt'
subtitle.legend.border_line_color = None


corelations_grid = layout([[main_title], [subtitle], *names_grid])
show(corelations_grid)

Now create two models, one predicting wine quality and second one predicting wine colour. Prepare data first.

In [0]:
# for quality
all_wine_df = white_df.append(red_df)

all_wine_df_train = all_wine_df.sample(frac=0.8)
all_wine_df_test = all_wine_df.drop(all_wine_df_train.index)
x_quality_train, y_quality_train = all_wine_df_train.iloc[:,:11].values, all_wine_df_train.iloc[:,11].values
x_quality_test, y_quality_test = all_wine_df_test.iloc[:,:11].values, all_wine_df_test.iloc[:,11].values

# for colour
ones = pd.Series(np.ones(red_df.shape[0]))
red_df_1 = pd.concat([red_df, ones], axis=1)
zeros = pd.Series(np.zeros(white_df.shape[0]))
white_df_0 = pd.concat([white_df, zeros], axis=1)

data_colour = red_df_1.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]]
data_colour = data_colour.append(white_df_0.iloc[:1599, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12]])
data_colour = data_colour.reset_index(drop=True)
train_data = data_colour.sample(frac=0.8)
test_data = data_colour.drop(train_data.index)

x_colour_train, y_colour_train = train_data.iloc[:,:11].values, train_data.iloc[:,11].values
x_colour_test, y_colour_test = test_data.iloc[:,:11].values, test_data.iloc[:,11].values

In [73]:
# create a model to judge a colour

model_colour = keras.Sequential([keras.layers.Dense(64, input_shape=(11,), activation='sigmoid'),
                           keras.layers.Dense(32, activation='sigmoid'),
                           keras.layers.Dense(1, activation='sigmoid')])

model_colour.compile(optimizer='rmsprop', loss='mse', metrics=['mae', 'acc'])


# and a model to judge quality

model_quality = keras.Sequential(
                        [keras.layers.Dense(64, input_shape=(11,), activation='relu'),
                        keras.layers.Dense(32, activation='relu'),
                        keras.layers.Dense(1)])

model_quality.compile(optimizer='adam', loss='mse', metrics=['mae'])
model_quality.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 64)                768       
_________________________________________________________________
dense_40 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_41 (Dense)             (None, 1)                 33        
Total params: 2,881
Trainable params: 2,881
Non-trainable params: 0
_________________________________________________________________


In [0]:
# train models
model_colour_history = model_colour.fit(x_colour_train, y_colour_train, epochs=30)
model_quality_history = model_quality.fit(x_quality_train, y_quality_train, epochs=15)

In [75]:
output_notebook()

g1 = figure(title='colour model loss', plot_width=300, plot_height=300)
g1.line(x=np.arange(len(model_colour_history.history['loss'])), y=model_colour_history.history['loss'])

g2 = figure(title='quality model loss', plot_width=300, plot_height=300)
g2.line(x=np.arange(len(model_quality_history.history['loss'])), y=model_quality_history.history['loss'])

show(row(g1, g2))

In [76]:
print('quality estimation:')
model_quality.evaluate(x_quality_test, y_quality_test)
print('colour prediction:')
model_colour.evaluate(x_colour_test, y_colour_test)

quality estimation:
colour prediction:


[0.028750652098096907, 0.06309685, 0.9640625]

I've tried training two models: one for white wine quality and second for red. then use colour prediction with those models to predict quality, but the results were same as with this simpler aproach.