In [38]:
import csv
import re
import os
import numpy as np

from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

from keras.layers.core import Activation, Dropout, Dense
from keras.layers.embeddings import Embedding
from keras.layers import Flatten, Conv1D, GlobalMaxPooling1D, LSTM, Bidirectional

from keras.preprocessing.text import Tokenizer

In [5]:
#open and prepare CSV file for data analysis

attributes = []
entries = []

with open("./data/winemag-data_first150k.csv") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        #get attribute names 
        if line_count == 0:            
            for r in row:
                attributes.append(r)
            
            attributes[0] = 'entry'
            
            line_count += 1

        else:
            #each entry as a dict of table attributes
            entry = {}
            n = 0
            for r in row:
                entry[attributes[n]] = r
                n+=1
                
            entries.append(entry)                  
            
            line_count += 1
    print(f'Processed {line_count} lines.')

Processed 150931 lines.


In [6]:
entries[0]

{'entry': '0',
 'country': 'US',
 'description': 'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.',
 'designation': "Martha's Vineyard",
 'points': '96',
 'price': '235.0',
 'province': 'California',
 'region_1': 'Napa Valley',
 'region_2': 'Napa',
 'variety': 'Cabernet Sauvignon',
 'winery': 'Heitz'}

In [7]:
#isolate each attribute
entry = []
country = []
description = []
designation = []
points = []
province = []
region_1 = []
region_2 = []
variety = []
winery = []


for e in entries:
    entry.append(e['entry'])
    country.append(e['country'])
    description.append(e['description'])
    designation.append(e['designation'])
    points.append(e['points'])
    province.append(e['province'])
    region_1.append(e['region_1'])
    region_2.append(e['region_2'])
    variety.append(e['variety'])
    winery.append(e['winery'])

In [8]:
#pre process text in description attribute
def preprocess_text(sentence):
    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [9]:
description[0]

'This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.'

In [10]:
preprocess_text(description[0])

'This tremendous varietal wine hails from Oakville and was aged over three years in oak Juicy red cherry fruit and compelling hint of caramel greet the palate framed by elegant fine tannins and subtle minty tone in the background Balanced and rewarding from start to finish it has years ahead of it to develop further nuance Enjoy '

In [11]:
#preprocess all descriptions
n=0
for text in description:
    description[n] = preprocess_text(description[n])
    n+=1

In [58]:
#preprocess wine varieties into integer values
varieties = {}

n = 0

for v in variety:
    if v not in varieties:
        varieties[v] = n
        n+=1

varieties

{'Cabernet Sauvignon': 0,
 'Tinta de Toro': 1,
 'Sauvignon Blanc': 2,
 'Pinot Noir': 3,
 'Provence red blend': 4,
 'Friulano': 5,
 'Tannat': 6,
 'Chardonnay': 7,
 'Tempranillo': 8,
 'Malbec': 9,
 'Rosé': 10,
 'Tempranillo Blend': 11,
 'Syrah': 12,
 'Mavrud': 13,
 'Sangiovese': 14,
 'Sparkling Blend': 15,
 'Rhône-style White Blend': 16,
 'Red Blend': 17,
 'Mencía': 18,
 'Palomino': 19,
 'Petite Sirah': 20,
 'Riesling': 21,
 'Cabernet Sauvignon-Syrah': 22,
 'Portuguese Red': 23,
 'Nebbiolo': 24,
 'Pinot Gris': 25,
 'Meritage': 26,
 'Baga': 27,
 'Glera': 28,
 'Malbec-Merlot': 29,
 'Merlot-Malbec': 30,
 'Ugni Blanc-Colombard': 31,
 'Viognier': 32,
 'Cabernet Sauvignon-Cabernet Franc': 33,
 'Moscato': 34,
 'Pinot Grigio': 35,
 'Cabernet Franc': 36,
 'White Blend': 37,
 'Monastrell': 38,
 'Gamay': 39,
 'Zinfandel': 40,
 'Greco': 41,
 'Barbera': 42,
 'Grenache': 43,
 'Rhône-style Red Blend': 44,
 'Albariño': 45,
 'Malvasia Bianca': 46,
 'Assyrtiko': 47,
 'Malagouzia': 48,
 'Carmenère': 49,
 '

## Part 1: predicting wine attributes with text descriptions

In [18]:
#use pretrained global vector model for word embeddings
if not os.path.exists("./glove.6B.100d.txt"):
    with open("glove.6B.100d.txt", "w+") as output:
        for i in range(0,8):
            print("Writing embeddings_" + str(i))
            with open(".embeddings/embeddings_"+str(i)) as part:
                output.write("".join(part.readlines()))
    shutil.rmtree(".embeddings/")
    print("Embedding file created!")
else:
    print("Embedding file already exists!")
    
print("Done!")

Embedding file already exists!
Done!


In [19]:
X_train, X_test, y_train, y_test = train_test_split(
    description, 
    variety, 
    test_size=0.20, 
    random_state=42)

In [22]:
#create word-to-index dictionary
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

# convert text to sequences
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# tokenizer dictionary
dictionary = tokenizer.word_index

In [25]:
#length of current dictionary vocab
vocab_size = len(tokenizer.word_index) + 1

maxlen = 65

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [43]:
embeddings_dictionary = dict()

#file is too large to store on git, must download and place in folder manually
glove_file = open('./glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()

In [44]:
#create embedding matrix
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [54]:
vocab_size == len(embedding_matrix)

True

In [55]:
y_test

['Malbec',
 'Pinot Noir',
 'Syrah',
 'Shiraz',
 'Pinot Noir',
 'Sauvignon Blanc',
 'Pinot Noir',
 'Riesling',
 'Syrah',
 'Red Blend',
 'Rhône-style Red Blend',
 'Chardonnay',
 'Red Blend',
 'Pinot Noir',
 'Red Blend',
 'Nebbiolo',
 'Rosé',
 'Pinot Noir',
 'Touriga Nacional',
 'Chardonnay',
 'Malbec',
 'Sauvignon Blanc',
 'Verdejo',
 'Merlot',
 'Moscato',
 'Chardonnay',
 'Tempranillo',
 'Chardonnay',
 'Champagne Blend',
 'Gewürztraminer',
 'Tempranillo',
 'Shiraz',
 'Chardonnay',
 'Pinot Noir',
 'Gewürztraminer-Riesling',
 'Pinot Gris',
 'Malbec',
 'Pinot Noir',
 'Chardonnay',
 'Chardonnay',
 'Dolcetto',
 'Port',
 'Corvina, Rondinella, Molinara',
 'Verdejo',
 'Sauvignon Blanc',
 'Bordeaux-style Red Blend',
 'Red Blend',
 'White Blend',
 'Nebbiolo',
 'Bordeaux-style Red Blend',
 'Syrah',
 'Corvina, Rondinella, Molinara',
 'Cabernet Franc',
 'Syrah',
 'Red Blend',
 'Verdejo',
 'Chardonnay',
 'Sauvignon Blanc',
 'Bordeaux-style Red Blend',
 'Zinfandel',
 'Bordeaux-style Red Blend',
 'Borde

### build models

In [39]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [40]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 65, 100)           2842400   
_________________________________________________________________
flatten_2 (Flatten)          (None, 6500)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6501      
Total params: 2,848,901
Trainable params: 6,501
Non-trainable params: 2,842,400
_________________________________________________________________
None
