In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelEncoder

from tensorflow import keras
from keras.models import Model
import keras.layers as layers

# This code was tested with TensorFlow v1.5.
# Print the installed version so it can be compared against the tested one.
print("You have TensorFlow version", tf.__version__)

In [None]:
# Load the dataset; the relative path means the CSV must sit in the
# notebook's current working directory.
data = pd.read_csv("winemag-data_first150k.csv")

In [None]:
# Shuffle the data.
# A fixed random_state makes the row order -- and therefore the train/test
# split taken from it further down -- identical on every Restart & Run All.
data = data.sample(frac=1, random_state=42)

data.head()

In [None]:
# Do some preprocessing to limit the number of wine varieties in the dataset.

# Keep only rows that have both a country and a price.
data = data.dropna(subset=['country', 'price'])
# Drop the first column.
# NOTE(review): this drop is positional, so re-running the cell on an
# already-processed frame removes one more column each time.
data = data.drop(data.columns[0], axis=1)

variety_threshold = 500  # Varieties occurring this many times or fewer are removed.
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
# Filter on the 'variety' column only. A frame-wide replace() would also blank
# out any cell in another column that happens to equal a rare variety name.
data = data[~data['variety'].isin(to_remove)]
# Rows with a missing variety are not counted by value_counts(); drop them too.
data = data[pd.notnull(data['variety'])]

In [None]:
# Size of the train split: the first 80% of the rows (truncated to an int);
# the remaining rows form the test split.
train_size = int(.8 * len(data))
print("Train size: %d" % train_size)
print("Test size: %d" % (len(data) - train_size))

In [None]:
# Positional split: rows [0, train_size) are for training, the rest for testing.

# --- Training split ---
# Features
description_train = data['description'].iloc[:train_size]
variety_train = data['variety'].iloc[:train_size]
points_train = data['points'].iloc[:train_size]
# Labels: the price is what the models learn to predict
labels_train = data['price'].iloc[:train_size]

# --- Test split ---
# Features
description_test = data['description'].iloc[train_size:]
variety_test = data['variety'].iloc[train_size:]
points_test = data['points'].iloc[train_size:]
# Labels
labels_test = data['price'].iloc[train_size:]

In [None]:
# Word-level tokenizer for the text descriptions; `num_words` limits it to
# `vocab_size` entries.
vocab_size = 100000
tokenize = keras.preprocessing.text.Tokenizer(
    char_level=False,
    num_words=vocab_size,
)
# The vocabulary is built from the training descriptions only.
tokenize.fit_on_texts(description_train)

In [None]:
# Wide feature 1: bag-of-words (bow) matrix of the descriptions, one row per
# description, built with the tokenizer fitted above.
description_bow_train = tokenize.texts_to_matrix(
    description_train
)
description_bow_test = tokenize.texts_to_matrix(
    description_test
)

In [None]:
# Wide feature 2: one-hot vector of variety categories

# Map each variety string to an integer id with sklearn's LabelEncoder.
# The mapping is learned from the training split and then reused for the test split.
encoder = LabelEncoder()
variety_train = encoder.fit_transform(variety_train)
variety_test = encoder.transform(variety_test)
# Ids start at 0, so the class count is the largest training id plus one.
num_classes = variety_train.max() + 1

# Expand the integer ids into one-hot rows of width `num_classes`.
variety_train = keras.utils.to_categorical(variety_train, num_classes=num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes=num_classes)

In [None]:
# Wide feature 3 - points rating split into buckets
num_points_buckets = 4


def bucketize_points(points, num_buckets=num_points_buckets, min_points=80, bucket_width=5):
    """One-hot encode point ratings into fixed-width buckets.

    With the defaults the buckets are 80-85, 86-90, 91-95 and 96-100.

    The bucket index is ``ceil((points - min_points) / bucket_width) - 1``,
    clamped to ``[0, num_buckets - 1]``. Without the clamp a rating of exactly
    ``min_points`` yields index -1, which wraps around to the *last* bucket,
    and a rating above the top bucket indexes out of range.

    Args:
        points: sequence (list, Series, array) of numeric ratings.
        num_buckets: number of buckets, i.e. width of each one-hot row.
        min_points: lowest rating covered by the first bucket.
        bucket_width: number of rating points spanned by each bucket.

    Returns:
        A list with one float array of length ``num_buckets`` per rating,
        containing a single 1 at the rating's bucket.
    """
    scores = np.asarray(points, dtype=float)
    bucket_index = np.ceil((scores - min_points) / bucket_width).astype(int) - 1
    bucket_index = np.clip(bucket_index, 0, num_buckets - 1)
    one_hot = np.zeros((len(scores), num_buckets))
    one_hot[np.arange(len(scores)), bucket_index] = 1
    # Return a list of rows so the result has the same form as a
    # row-by-row append loop would produce.
    return list(one_hot)


# Create the bucket columns for both splits with the same helper.
points_train_buckets = bucketize_points(points_train)
points_test_buckets = bucketize_points(points_test)

In [None]:
# Wide model, built with the functional API: one input per wide feature,
# concatenated, passed through a single ReLU hidden layer, then reduced to
# one scalar output.
bow_inputs = layers.Input(shape=(vocab_size,))
variety_inputs = layers.Input(shape=(num_classes,))
points_inputs = layers.Input(shape=(num_points_buckets,))

merged_layer = layers.concatenate(
    [bow_inputs, variety_inputs, points_inputs]
)
merged_layer = layers.Dense(units=256, activation='relu')(merged_layer)
predictions = layers.Dense(units=1)(merged_layer)

wide_model = Model(
    inputs=[bow_inputs, variety_inputs, points_inputs],
    outputs=predictions,
)

In [None]:
# The model outputs a single continuous value trained with MSE, so track mean
# absolute error; classification accuracy is not meaningful for regression.
wide_model.compile(loss='mse', optimizer='adam', metrics=['mae'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
wide_model.summary()

In [None]:
# Deep model feature 1: word embeddings of wine descriptions.
# Each description becomes a sequence of word indices, padded at the end
# ("post") to a fixed length of `max_seq_length`.
max_seq_length = 170

train_embed = keras.preprocessing.sequence.pad_sequences(
    tokenize.texts_to_sequences(description_train),
    maxlen=max_seq_length,
    padding="post",
)
test_embed = keras.preprocessing.sequence.pad_sequences(
    tokenize.texts_to_sequences(description_test),
    maxlen=max_seq_length,
    padding="post",
)

In [None]:
# Define our deep model with the Functional API:
# padded word-index sequence -> 8-dimensional embedding per position
# -> flatten -> single scalar output.
deep_inputs = layers.Input(shape=(max_seq_length,))
embedding = layers.Embedding(vocab_size, 8, input_length=max_seq_length)(deep_inputs)
embedding = layers.Flatten()(embedding)
embed_out = layers.Dense(1)(embedding)
deep_model = Model(inputs=deep_inputs, outputs=embed_out)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
deep_model.summary()

In [None]:
# The model outputs a single continuous value trained with MSE, so track mean
# absolute error; classification accuracy is not meaningful for regression.
deep_model.compile(loss='mse',
                   optimizer='adam',
                   metrics=['mae'])

In [None]:
# Combine wide and deep into one model: concatenate the scalar output of each
# sub-model and let a final single-unit Dense layer weigh them.
merged_out = layers.concatenate([wide_model.output, deep_model.output])
merged_out = layers.Dense(1)(merged_out)
# The combined model takes the wide model's inputs followed by the deep model's input.
combined_model = Model(wide_model.input + [deep_model.input], merged_out)
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
combined_model.summary()

# The target (price) is continuous and the loss is MSE, so report mean
# absolute error; classification accuracy is not meaningful for regression.
combined_model.compile(loss='mse',
                       optimizer='adam',
                       metrics=['mae'])

In [None]:
# Train the combined model. Inputs are passed in the order the model was
# built with: the three wide features, then the padded sequences.
# (Author's note: this needs at least 5 epochs to give good results and is
# probably best run on the cloud.)
combined_model.fit(
    [description_bow_train, variety_train, np.asarray(points_train_buckets), train_embed],
    labels_train,
    batch_size=256,
    epochs=2,
)

In [None]:
# Evaluate the combined model on the test split, with inputs in the same
# order as for training.
combined_model.evaluate(
    [description_bow_test, variety_test, np.asarray(points_test_buckets), test_embed],
    labels_test,
    batch_size=256,
)

In [None]:
# Run the combined model over the whole test split to get its predictions.
predictions = combined_model.predict(
    [description_bow_test, variety_test, np.asarray(points_test_buckets), test_embed]
)

In [None]:
# Compare predictions with actual values for the first few items in our test dataset.
# Row i of `predictions` corresponds to the i-th *position* in the test split,
# so both Series are read with .iloc. Plain `description_test[i]` would look
# up the index *label* i, which after shuffling is either missing (KeyError)
# or a different row than the one being compared.
# min() keeps the loop in range when the test split has fewer than 35 rows.
for i in range(min(35, len(predictions))):
    val = predictions[i]
    print(description_test.iloc[i])
    print(val[0], 'Actual: ', labels_test.iloc[i], '\n')