In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
!pip install -q -U tensorflow

In [3]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
layers = keras.layers


In [None]:
!wget https://storage.googleapis.com/sara-cloud-ml/wine_data.csv

In [4]:
path = "wine_data.csv"

In [5]:
data = pd.read_csv(path)
data = data.sample(frac=1)
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,variety,winery
13618,13618,Italy,"Bright and refined, this conveys aromas of dar...",,91,26.0,Tuscany,Vino Nobile di Montepulciano,,Red Blend,Bindella
93481,93481,US,"Oaky-sweet and simple, with jammy pineapple, t...",Appellation Series,84,15.0,California,Russian River Valley,Sonoma,Chardonnay,Healdsburg Ranches
143346,143346,South Africa,"Clean, fresh flavors, good body and a balance ...",,87,11.0,Western Cape,,,Sauvignon Blanc,Douglas Green
28167,28167,France,"This is a ripe, fresh and fruity wine that's f...",,89,21.0,Loire Valley,Sancerre,,Sauvignon Blanc,Domaine de Rome
52757,52757,US,This full-bodied Chardonnay begins with aromas...,Golden Glen,85,20.0,New York,Finger Lakes,Finger Lakes,Chardonnay,Glenora


In [6]:
data = data[pd.notnull(data['country'])]
data = data[pd.notnull(data['price'])]
data = data.drop(data.columns[0], axis=1)
variety_threshold = 500
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
data.replace(to_remove , np.nan, inplace=True)
data = data[pd.notnull(data['variety'])]

In [7]:
train_size= int(len(data) * .8)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

Train size: 95646
Test size: 23912


In [8]:
# Train inputs
description_train = data['description'][:train_size]
variety_train = data['variety'][:train_size]

# Train labels
labels_train = data['price'][:train_size]

# Test inputs
description_test = data['description'][train_size:]
variety_test = data['variety'][train_size:]

# Test labels
labels_test = data['price'][train_size:]

In [9]:
# Create a tokenizer
vocab_size = 5000
tokenize = keras.preprocessing.text.Tokenizer(num_words=vocab_size, char_level=False)
tokenize.fit_on_texts(description_train)

In [10]:
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [11]:
encoder = LabelEncoder()
encoder.fit(variety_train)
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
num_classes = np.max(variety_train) + 1

#To one-hot
variety_train = keras.utils.to_categorical(variety_train, num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes)

In [12]:
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
merged_layer= layers.concatenate([bow_inputs, variety_inputs])
merget_layer = layers.Dense(256, activation="relu")(merged_layer)
predictions = layers.Dense(1)(merged_layer)
wide_model = keras.Model(inputs=[bow_inputs, variety_inputs], outputs=predictions)

In [13]:
wide_model.compile(loss="mse", optimizer='adam', metrics=['accuracy'])
print(wide_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 5000)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 5040)         0           input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1)            5041        concatenate[0][0]            

In [14]:
train_embed = tokenize.texts_to_sequences(description_train)
test_embed = tokenize.texts_to_sequences(description_test)

max_seq_length = 170
train_embed = keras.preprocessing.sequence.pad_sequences(train_embed, maxlen=max_seq_length, padding = "post")
test_embed = keras.preprocessing.sequence.pad_sequences(test_embed, maxlen=max_seq_length, padding = "post")

In [15]:
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = keras.Model(inputs=deep_inputs, outputs=embed_out)
print(deep_model.summary())


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 170)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 170, 8)            40000     
_________________________________________________________________
flatten (Flatten)            (None, 1360)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1361      
Total params: 41,361
Trainable params: 41,361
Non-trainable params: 0
_________________________________________________________________
None


In [16]:
deep_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [17]:
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
combined_model = keras.Model(wide_model.input + [deep_model.input], merged_out)
print(combined_model.summary())

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 170)]        0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            [(None, 5000)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 40)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 170, 8)       40000       input_3[0][0]                    
____________________________________________________________________________________________

In [18]:
combined_model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

In [22]:
combined_model.fit([description_bow_train, variety_train] + [ train_embed ], labels_train, epochs=50, batch_size=128)
combined_model.evaluate([description_bow_test, variety_test] + [ test_embed ], labels_test, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[996.4148559570312, 0.0]

In [23]:
predictions = combined_model.predict([ description_bow_test, variety_test] + [ test_embed])

In [24]:
num_predictions = 40
diff = 0
for i in range(num_predictions):
  val = predictions[i]
  print(description_test.iloc[i])
  print('Predicted: ', val[0], 'Actual: ', labels_test.iloc[i], '\n')
  diff += abs(val[0] - labels_test.iloc[i])

Here's another exceptional Oregon Riesling to add to the growing ranks of top producers. Lemon-drop fruit meets peaches and cream in the mouth, as this off-dry (20g/L) wine displays a spot-on balance between acid, sugar, fruit and honey. This is delicious already, and built to age nicely over a decade or longer.
Predicted:  106.493416 Actual:  18.0 

Made to fit a standard mold, but made well. Aromas are of plum and cinnamon, with similar notes on the palate. Slender in body, with smooth tannins and a nut-laden finish. Imported by Southern Starz, Inc.
Predicted:  32.487404 Actual:  23.0 

Young and tart in cool-climate acidity, this Pinot needs time in the cellar. It's an exotic wine, spicy and peppery, almost briary, like a Zinfandel, except with flavors of wild forest raspberries, cherries, orange zest and a hint of pine cone. It's as cellarable a Pinot Noir as exists in California. Best after 2015.
Predicted:  26.840513 Actual:  60.0 

Compact aromas of red berries get a boost from 