In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import multiprocessing
import gensim.models.word2vec as w2v



In [2]:
data = pd.read_csv('winemag-data_first150k.csv')

In [3]:
labels = data['variety']
descriptions = data['description']

In [4]:
print('{}   :   {}'.format(labels.tolist()[0], descriptions.tolist()[0]))
print('{}   :   {}'.format(labels.tolist()[56], descriptions.tolist()[56]))
print('{}   :   {}'.format(labels.tolist()[93], descriptions.tolist()[93]))

Cabernet Sauvignon   :   This tremendous 100% varietal wine hails from Oakville and was aged over three years in oak. Juicy red-cherry fruit and a compelling hint of caramel greet the palate, framed by elegant, fine tannins and a subtle minty tone in the background. Balanced and rewarding from start to finish, it has years ahead of it to develop further nuance. Enjoy 2022–2030.
Sauvignon Blanc   :   Delicious while also young and textured, this wine comes from biodynamically grown grapes. It has a strong sense of minerality as well as intense citrus and green fruits. It's tight at the moment and needs to round out, so drink from 2018.
Chardonnay   :   A smoky scent and earthy, crisp-apple flavors make this medium-bodied wine a change of pace from the average butterball Chardonnay. It has welcome acidity and a nicely smooth texture.


In [5]:
varietal_counts = labels.value_counts()
print(varietal_counts[:5])

Chardonnay                  14482
Pinot Noir                  14291
Cabernet Sauvignon          12800
Red Blend                   10062
Bordeaux-style Red Blend     7347
Name: variety, dtype: int64


In [6]:
corpus_raw = ""
for description in descriptions[:10000]:
    corpus_raw += description

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yerin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [10]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [11]:
raw_sentences = tokenizer.tokenize(corpus_raw)

In [12]:
def sentence_to_wordlist(raw):
    clean = re.sub("[^a-zA-Z]"," ", raw) #[^a-zA-Z] means any character that IS NOT a-z OR A-Z
    words = clean.split()
    return words

In [13]:
sentences = []
for raw_sentence in raw_sentences:
    if len(raw_sentence) > 0:
        sentences.append(sentence_to_wordlist(raw_sentence))

In [14]:
print(raw_sentences[234])
print(sentence_to_wordlist(raw_sentences[234]))

Tart cherry lingers on the finish.A deeper salmon color with elegantly lacy bubbles and a slight cloudy appearance, this sparkler by Norm Yost offers dessicated watermelon, dried orange blossoms, yeast, citrus rinds and fresher strawberry notes on the nose.
['Tart', 'cherry', 'lingers', 'on', 'the', 'finish', 'A', 'deeper', 'salmon', 'color', 'with', 'elegantly', 'lacy', 'bubbles', 'and', 'a', 'slight', 'cloudy', 'appearance', 'this', 'sparkler', 'by', 'Norm', 'Yost', 'offers', 'dessicated', 'watermelon', 'dried', 'orange', 'blossoms', 'yeast', 'citrus', 'rinds', 'and', 'fresher', 'strawberry', 'notes', 'on', 'the', 'nose']


In [15]:
token_count = sum([len(sentence) for sentence in sentences])
print('The wine corpus contains {0:,} tokens'.format(token_count))

The wine corpus contains 408,741 tokens


In [16]:
num_features = 300
min_word_count = 10
num_workers = multiprocessing.cpu_count()
context_size = 10
downsampling = 1e-3
seed=1993

In [17]:
wine2vec = w2v.Word2Vec(
    sg=1,
    seed=seed,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context_size,
    sample=downsampling
)

In [18]:
wine2vec.build_vocab(sentences)

In [19]:
print('Word2Vec vocabulary length:', len(wine2vec.wv.vocab))

Word2Vec vocabulary length: 2612


In [20]:
print(wine2vec.corpus_count)

17323


In [21]:
wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.iter)

  """Entry point for launching an IPython kernel.


(1354033, 2043705)

In [22]:
wine2vec.most_similar('melon')

  """Entry point for launching an IPython kernel.


[('papaya', 0.8899217844009399),
 ('honeydew', 0.8760225772857666),
 ('pineapple', 0.8726072311401367),
 ('banana', 0.8650798201560974),
 ('passion', 0.8118263483047485),
 ('mango', 0.8114340305328369),
 ('guava', 0.8062987923622131),
 ('peach', 0.8025190234184265),
 ('cantaloupe', 0.8024674654006958),
 ('pit', 0.8010448217391968)]

In [23]:
wine2vec.most_similar('acidic')

  """Entry point for launching an IPython kernel.


[('flat', 0.8649677038192749),
 ('watery', 0.8445253372192383),
 ('tartness', 0.843451201915741),
 ('sticky', 0.8260294198989868),
 ('modest', 0.8254812359809875),
 ('pulpy', 0.8177571296691895),
 ('foamy', 0.8152469396591187),
 ('dilute', 0.8147495985031128),
 ('angular', 0.8116484880447388),
 ('choppy', 0.8089189529418945)]

In [33]:
wine2vec.most_similar('Chardonnay')

  """Entry point for launching an IPython kernel.


[('Gris', 0.8256964683532715),
 ('Chenin', 0.7940977811813354),
 ('Blanc', 0.7609167098999023),
 ('Grigio', 0.7533526420593262),
 ('Marsanne', 0.7502498626708984),
 ('traditional', 0.7416612505912781),
 ('Muscat', 0.7396905422210693),
 ('Albari', 0.7380856275558472),
 ('Roussanne', 0.7364259958267212),
 ('Verdejo', 0.7351171374320984)]