In [1]:
import itertools
import os
import math
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
# Short alias so later code can refer to keras.layers simply as `layers`
layers = keras.layers

## Get data

In [3]:
# Load the wine reviews dataset (path is relative to the notebook's working directory)
data = pd.read_csv('datasets/wine_data_v1.csv')
# Shuffle all rows. The seed is fixed so that the row subset taken later with
# head() and the positional train/test slices are the same on every
# Restart & Run All; without it every run trains and evaluates on different rows.
data = data.sample(frac=1, random_state=42)
data.shape

(129971, 14)

Some preprocessing to limit the varieties in the dataset

In [4]:
# Keep only the rows that have both a country and a price
data = data.dropna(subset=['country', 'price'])
# Drop the first column by position
# (presumably a leftover row index from the CSV export - TODO confirm)
data = data.drop(columns=data.columns[0])
data.shape

(120916, 13)

In [5]:
# Varieties that occur at most this many times are removed
variety_threshold = 500
value_counts = data['variety'].value_counts()
to_remove = value_counts[value_counts <= variety_threshold].index
# Filter on the 'variety' column only. A frame-wide replace() of the rare
# variety names would also blank out identical strings in unrelated columns
# and mutate the frame in place; the boolean mask selects the same rows
# (variety present and not rare) without touching any other column.
data = data[data['variety'].notna() & ~data['variety'].isin(to_remove)]
# Work on a small subset of the (already shuffled) rows to keep the notebook fast
data = data.head(5000)
data.shape

(5000, 13)

Split data into training & testing

In [6]:
# The first 80% of the rows form the training set, the remainder the test set
train_size = int(.8 * len(data))
test_size = len(data) - train_size
print('Train size: %d' % train_size)
print('Test size: %d' % test_size)

Train size: 4000
Test size: 1000


In [7]:
# Train features: description text and variety of the first `train_size` rows
description_train = data['description'].iloc[:train_size]
variety_train = data['variety'].iloc[:train_size]

In [8]:
# Train labels: the price of each training row is the prediction target
labels_train = data['price'].iloc[:train_size]

In [9]:
# Test features: description text and variety of the rows after the training slice
description_test = data['description'].iloc[train_size:]
variety_test = data['variety'].iloc[train_size:]

In [10]:
# Test labels: the price of each held-out row
labels_test = data['price'].iloc[train_size:]

In [11]:
# Create a tokenizer to process the text descriptions
vocab_size = 12000  # hyperparameter; experiment with different values
tokenize = keras.preprocessing.text.Tokenizer(
    num_words=vocab_size,
    char_level=False,  # word-level tokens rather than single characters
)
# Fit on the training descriptions only, so the test set stays unseen
tokenize.fit_on_texts(description_train)

In [12]:
# Wide feature 1: sparse bag of words (bow) vocab_size vector
description_bow_train = tokenize.texts_to_matrix(description_train)
description_bow_test = tokenize.texts_to_matrix(description_test)

In [15]:
# Wide feature 2: one-hot vector of variety categories

# Use the sklearn utility to convert variety strings to integer class indices.
encoder = LabelEncoder()
# Fit on the varieties of both splits. The split is a random slice, so a
# variety can end up only in the test rows; an encoder fitted on the training
# rows alone would then raise ValueError in transform(variety_test).
# When every variety occurs in the training rows the encoding is unchanged.
encoder.fit(pd.concat([variety_train, variety_test]))
variety_train = encoder.transform(variety_train)
variety_test = encoder.transform(variety_test)
# Take the class count from the fitted encoder rather than from the largest
# training index, so it also covers classes that occur only in the test split.
num_classes = len(encoder.classes_)

In [16]:
# Expand the integer variety indices into one-hot rows with `num_classes` columns
variety_train = keras.utils.to_categorical(variety_train, num_classes=num_classes)
variety_test = keras.utils.to_categorical(variety_test, num_classes=num_classes)