In [2]:
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten
from tensorflow.keras.layers import Embedding
from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from tokenizer import filter_to_top_x

In [4]:
df = pd.read_csv('data/wine.csv', index_col=0)
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
counter = Counter(df['variety'].tolist())
print(counter)

Counter({'Pinot Noir': 13272, 'Chardonnay': 11753, 'Cabernet Sauvignon': 9472, 'Red Blend': 8946, 'Bordeaux-style Red Blend': 6915, 'Riesling': 5189, 'Sauvignon Blanc': 4967, 'Syrah': 4142, 'Rosé': 3564, 'Merlot': 3102, 'Nebbiolo': 2804, 'Zinfandel': 2714, 'Sangiovese': 2707, 'Malbec': 2652, 'Portuguese Red': 2466, 'White Blend': 2360, 'Sparkling Blend': 2153, 'Tempranillo': 1810, 'Rhône-style Red Blend': 1471, 'Pinot Gris': 1455, 'Champagne Blend': 1396, 'Cabernet Franc': 1353, 'Grüner Veltliner': 1345, 'Portuguese White': 1159, 'Bordeaux-style White Blend': 1066, 'Pinot Grigio': 1052, 'Gamay': 1025, 'Gewürztraminer': 1012, 'Viognier': 996, 'Shiraz': 836, 'Petite Sirah': 770, 'Sangiovese Grosso': 751, 'Barbera': 721, 'Glera': 709, 'Port': 668, 'Grenache': 651, 'Corvina, Rondinella, Molinara': 619, 'Chenin Blanc': 591, 'Tempranillo Blend': 588, 'Carmenère': 575, 'Albariño': 477, 'Pinot Blanc': 442, 'Rhône-style White Blend': 425, "Nero d'Avola": 365, 'Aglianico': 359, 'Moscato': 358, '

In [6]:
top_10_varieties = {i[0]: idx for idx, i in enumerate(counter.most_common(10))}
print(top_10_varieties)

{'Pinot Noir': 0, 'Chardonnay': 1, 'Cabernet Sauvignon': 2, 'Red Blend': 3, 'Bordeaux-style Red Blend': 4, 'Riesling': 5, 'Sauvignon Blanc': 6, 'Syrah': 7, 'Rosé': 8, 'Merlot': 9}


In [7]:
df = df[df['variety'].map(lambda x: x in top_10_varieties)]
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
10,US,"Soft, supple plum envelopes an oaky structure ...",Mountain Cuvée,87,19.0,California,Napa Valley,Napa,Virginie Boone,@vboone,Kirkland Signature 2011 Mountain Cuvée Caberne...,Cabernet Sauvignon,Kirkland Signature
12,US,"Slightly reduced, this wine offers a chalky, t...",,87,34.0,California,Alexander Valley,Sonoma,Virginie Boone,@vboone,Louis M. Martini 2012 Cabernet Sauvignon (Alex...,Cabernet Sauvignon,Louis M. Martini
14,US,Building on 150 years and six generations of w...,,87,12.0,California,Central Coast,Central Coast,Matt Kettmann,@mattkettmann,Mirassou 2012 Chardonnay (Central Coast),Chardonnay,Mirassou


In [11]:
description_list = df['description'].tolist()
print(description_list[1])

Much like the regular bottling from 2012, this comes across as rather rough and tannic, with rustic, earthy, herbal characteristics. Nonetheless, if you think of it as a pleasantly unfussy country wine, it's a good companion to a hearty winter stew.


In [13]:
mapped_list, word_list = filter_to_top_x(description_list, 2500, 10)
print(mapped_list[1])

[114, 1293, 147, 14, 952, 207, 495, 31, 357, 964, 100, 530, 155, 105, 1102, 456, 166, 2383, 5, 31, 1191, 5, 3, 62, 2005, 2, 859]


In [15]:
print(word_list)

{'in': 0, 'flavors': 1, 'to': 2, "'s": 3, 'The': 4, 'it': 5, 'fruit': 6, 'It': 7, 'on': 8, 'This': 9, 'that': 10, 'acidity': 11, 'finish': 12, 'palate': 13, 'from': 14, 'tannins': 15, 'but': 16, 'aromas': 17, 'cherry': 18, 'are': 19, 'black': 20, 'has': 21, 'ripe': 22, 'for': 23, '%': 24, 'A': 25, 'by': 26, 'oak': 27, 'Drink': 28, 'red': 29, 'notes': 30, 'as': 31, 'its': 32, 'Cabernet': 33, 'now': 34, 'rich': 35, 'spice': 36, 'nose': 37, 'an': 38, 'dry': 39, 'blend': 40, 'fruits': 41, 'fresh': 42, 'plum': 43, 'texture': 44, 'blackberry': 45, 'soft': 46, 'through': 47, 'sweet': 48, 'berry': 49, 'shows': 50, 'crisp': 51, 'apple': 52, 'Sauvignon': 53, 'vanilla': 54, 'well': 55, 'at': 56, 'offers': 57, 'more': 58, 'light': 59, 'dark': 60, 'Merlot': 61, 'good': 62, 'raspberry': 63, 'citrus': 64, 'drink': 65, 'some': 66, 'pepper': 67, 'fruity': 68, 'will': 69, 'chocolate': 70, 'years': 71, 'full': 72, 'green': 73, 'very': 74, 'juicy': 75, 'touch': 76, 'lemon': 77, 'while': 78, 'firm': 79, 's