In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
!pip3 install -q -U tensorflow

In [4]:
import itertools
import os
import math
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from datetime import datetime
import numpy as np
import datetime
layers = keras.layers


In [6]:
!wget https://storage.googleapis.com/sara-cloud-ml/wine_data.csv

--2021-05-07 13:19:00--  https://storage.googleapis.com/sara-cloud-ml/wine_data.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.172.16, 142.250.78.16, 142.250.78.144, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.172.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49783940 (47M) [application/octet-stream]
Saving to: ‘wine_data.csv.4’


2021-05-07 13:19:22 (2.27 MB/s) - ‘wine_data.csv.4’ saved [49783940/49783940]



In [5]:
path = "wine_data.csv"

In [6]:
data = pd.read_csv(path)
data = data.sample(frac=1)
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
62380,62380,Italy,"A vintage-dated Prosecco with a festive, bubbl...",Millesimato Selezione Banda Rossa,89,19.0,Veneto,Prosecco di Valdobbiadene,,Prosecco,Bortolomiol
87819,87819,Italy,"Made with organically grown grapes, this silky...",,90,25.0,Piedmont,Langhe,,Nebbiolo,Cascina Corte di Barosi Alessandro
64753,64753,US,"A lovely bottle, loaded with textures of moist...",Estate,90,28.0,Washington,Columbia Valley (WA),Columbia Valley,Syrah,Gamache
104965,104965,France,"Following the fashion of bone-dry Champagnes, ...",Grande Cuvée Extra Brut,88,,Champagne,Champagne,,Champagne Blend,Veuve Doussot
3882,3882,Argentina,Strong blackberry aromas come with hints of wo...,La Mascota,89,15.0,Mendoza Province,Mendoza,,Malbec,Mascota


In [7]:
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1)
variety_threshold = 500
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove , np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [8]:
train_size= int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size: 95646
Test size: 23912


In [9]:
# Train inputs
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# Train labels
labels_train = data['price'][:train_size]

# Test inputs
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# Test labels
labels_test = data['price'][train_size:]

In [10]:
# Create a tokenizer
vocab_size = 5000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(description_train)

In [11]:
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [12]:
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

#To one-hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [13]:
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer= layers.concatenate([bow_inputs, variety_inputs])
merget_layer = layers.Dense(256, activation="relu")(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [14]:
wide_model.compile(loss="mse", optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 5000)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 5040)         0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1)            5041        concatenate[0][0]            

In [15]:
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed, maxlen=max_seq_length, padding = "post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed, maxlen=max_seq_length, padding = "post")

In [16]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 170)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 170, 8)            40000     
_________________________________________________________________
flatten (Flatten)            (None, 1360)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1361      
Total params: 41,361
Trainable params: 41,361
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
deep_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [18]:
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 170)]        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 5000)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 170, 8)       40000       input_3[0][0]                    
____________________________________________________________________________________________

In [19]:
combined_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [20]:
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

In [21]:
combined_model.fit([description_bow_train,
                    variety_train] + [ train_embed ],
                   labels_train,
                   epochs=50,
                   batch_size=128,
                   callbacks=[tensorboard_callback])
combined_model.evaluate([description_bow_test, variety_test] + [ test_embed ], labels_test, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[1035.8035888671875, 0.0]

In [22]:
predictions = combined_model.predict([ description_bow_test, variety_test] + [ test_embed])

In [23]:
num_predictions = 40
diff = 0
for i in range(num_predictions):
  val = predictions[i]
  print(description_test.iloc[i])
  print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
  diff += abs(val[0] - labels_test.iloc[i])

This vintage shows the heat, with big, ripe cherry intensity, power and beefy aromas of barbecue, red beets, resin, tar and toasted chestnut. The wine boasts deep concentration, almost viscous consistency and a sweet finish.
Predicted:  60.3638 Actual:  65.0 

The best Cara Mia yet, this is Red Mountain Cabernet Franc from the estate vineyard, with 15% Merlot and 8% Petit Verdot included. Stylish and compact, this seems sculpted with pure, vivid purple fruits, balanced against crisp, fine-grained tannins. The depth and concentration are noteworthy.
Predicted:  83.43269 Actual:  60.0 

With firm tannins and a solid texture, this rich wine has delicious juicy black fruits and dry tannins. It's delicious now, but it will also improve and evolve with time. Drink from 2016.
Predicted:  41.74598 Actual:  15.0 

Densely tannic and rich, this is full of ripe chocolate and blackberry fruit. It is dark, brooding and dense—a great success for the second wine of Château du Glana.
Predicted:  14.90

In [25]:
%load_ext tensorboard
%tensorboard --logdir logs