In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
wine_df = pd.read_csv("winemag-data-130k-v2.csv")

In [3]:
wine_df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [4]:
varieties = wine_df['variety'].value_counts()
top_wines_df = wine_df.loc[wine_df['variety'].isin(varieties.axes[0][:20])]
top_wines_df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [5]:
wine_stop_words = []
for variety in top_wines_df['variety'].unique():
    for word in variety.split(' '):
        wine_stop_words.append(word.lower())
wine_stop_words = pd.Series(data=wine_stop_words).unique()

In [6]:
wine_stop_words = np.append(wine_stop_words,"wine")

In [7]:
wine_stop_words

array(['white', 'blend', 'portuguese', 'red', 'pinot', 'gris', 'riesling',
       'noir', 'cabernet', 'sauvignon', 'chardonnay', 'malbec', 'merlot',
       'blanc', 'sangiovese', 'bordeaux-style', 'rosé', 'zinfandel',
       'syrah', 'nebbiolo', 'rhône-style', 'sparkling', 'tempranillo',
       'wine'], dtype=object)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # term frequency–inverse document frequency, is a numerical statistic that is intended to reflect how important a word is to a document in a collections
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction import text

In [12]:
stop_word_list = text.ENGLISH_STOP_WORDS.union(wine_stop_words)
vect = TfidfVectorizer(stop_words=list(stop_word_list))

In [13]:
X = vect.fit_transform(top_wines_df['description'])

In [14]:
lda = LDA(learning_method='batch',n_components=20)
topics = lda.fit_transform(X)

In [15]:
#This function was created by Mueller and Guido here: https://github.com/amueller/mglearn/blob/master/mglearn/tools.py
def print_topics(topics, feature_names, sorting, topics_per_chunk=5,
                 n_words=10):
    for i in range(0, len(topics), topics_per_chunk):
        # for each chunk:
        these_topics = topics[i: i + topics_per_chunk]
        # maybe we have less than topics_per_chunk left
        len_this_chunk = len(these_topics)
        # print topic headers
        print(("topic {!s:<8}" * len_this_chunk).format(*these_topics))
        print(("-------- {0:<5}" * len_this_chunk).format(""))
        # print top n_words frequent words
        for i in range(n_words):
            try:
                print(("{!s:<14}" * (len_this_chunk)).format(
                    *feature_names[sorting[these_topics, i]]))
            except:
                pass
        print("\n")

In [17]:
sorting = np.argsort(lda.components_, axis=1)[:, ::-1]
feature_names = np.array(vect.get_feature_names())
        
print_topics(topics=range(20), feature_names=feature_names, sorting=sorting)

topic 0       topic 1       topic 2       topic 3       topic 4       
--------      --------      --------      --------      --------      
cherry        aromas        apple         café          finish        
black         fruit         lemon         au            flavors       
fruit         black         citrus        lait          aromas        
bodied        flavors       pear          baguette      palate        
oak           nose          palate        verbena       green         
finish        palate        flavors       thins         feels         
spice         cherry        finish        slices        herbal        
tannins       dried         fruit         recognize     berry         
blackberry    dark          fresh         miss          plum          
soft          spice         light         blanca        fruit         


topic 5       topic 6       topic 7       topic 8       topic 9       
--------      --------      --------      --------      --------      
du  