In [38]:
import pandas as pd
# Load the three Canadian datasets (breweries, beers, reviews) from ../data.
brew = pd.read_csv("../data/CanadianBreweries.csv")
beer = pd.read_csv("../data/CanadianBeers.csv")
reviews = pd.read_csv("../data/CanadianReviews.csv")

#### Separating the comment data

In [9]:
# Pull the free-text review column out as its own Series and preview it.
comments = reviews['comment']
comments.head()

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
Name: comment, dtype: object

#### Getting rid of NA

In [10]:
# Drop reviews without comment text. The original row labels are kept
# (the preview starts at label 9), so positional access needs .iloc.
comments = comments.dropna()
comments.head()

9     On tap at Birch &amp; Barley.    Hazy yellow i...
10    Received a mason jar of this from a friend. Wa...
11    Had this twice, in two different retaurants, o...
12    On tap at the brewery tap room    A - A lovely...
13    Slightly hazy copper color with a low amount o...
Name: comment, dtype: object

#### Importing tools

In [6]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
# Fetch the WordNet corpus used by WordNetLemmatizer in the preprocessing step.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wangz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

#### Word Processing

We need to process the data for topic modelling: splitting sentences into words, removing punctuation, lowercasing the words, removing stopwords and short words (three characters or fewer), lemmatizing the words, and stemming them.

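Lemmatizing means reducing a word to its dictionary form (lemma); here every token is lemmatized as a verb, so inflected verb forms (e.g. past tense or third person) are mapped to the base form.
Stemming means reducing words to their root form (e.g. "hazy" becomes "hazi").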

In [7]:
# Build the stemmer and the lemmatizer once; both are reused for every token
# instead of constructing a new WordNetLemmatizer on each call.
stemmer = SnowballStemmer('english')
_lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (word).

    Returns
    -------
    str
        The English Snowball stem of the token's verb lemma.
    """
    return stemmer.stem(_lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Tokenize ``text`` and return its normalized tokens as a list.

    Tokens come from gensim's ``simple_preprocess``. Gensim stopwords and
    tokens of three characters or fewer are dropped; every surviving token
    is passed through ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in gensim.parsing.preprocessing.STOPWORDS
    ]

Checking the function

In [17]:
# Take the first non-null comment (by position, not by label) as a test input.
sample = comments.iloc[0]
sample

'On tap at Birch &amp; Barley.    Hazy yellow in color with a one finger white head.    The aroma is wheaty with citrus and big spiced and floral notes. Good if bordering on a bit much.    The flavor is heavily spiced: lavender, sage, coriander, and others. Wheat, lemon, and honey besides that. Decently tasty.    Medium-light bodied and fairly tingly from the spices with average carbonation.    A successful experiment I think, if a bit overdone in some ways.        '

In [18]:
# Sanity check: run the preprocessing on the single sample comment.
preprocess(sample)

['birch',
 'barley',
 'hazi',
 'yellow',
 'color',
 'finger',
 'white',
 'head',
 'aroma',
 'wheati',
 'citrus',
 'spice',
 'floral',
 'note',
 'good',
 'border',
 'flavor',
 'heavili',
 'spice',
 'lavend',
 'sage',
 'coriand',
 'wheat',
 'lemon',
 'honey',
 'decent',
 'tasti',
 'medium',
 'light',
 'bodi',
 'fair',
 'ting',
 'spice',
 'averag',
 'carbon',
 'success',
 'experi',
 'think',
 'overdo',
 'way']

#### Now is the time to process all comments. Warning: this step takes a while

In [19]:
# Apply preprocess to every comment; the result is a Series of token lists.
processed_doc = comments.apply(preprocess)

In [20]:
# Preview the first few tokenized comments.
processed_doc.head()

9     [birch, barley, hazi, yellow, color, finger, w...
10    [receiv, mason, friend, tell, pour, drink, clo...
11    [twice, differ, retaur, recent, trip, ïles, ma...
12    [breweri, room, love, cloudi, light, golden, y...
13    [slight, hazi, copper, color, activ, beig, fin...
Name: comment, dtype: object

#### Build a dictionary from the corpus, then remove common words that appear in more than 35% of the comments and rare words that appear in fewer than 10 comments

In [21]:
# Map every distinct token in the processed comments to an integer id.
dictionary = gensim.corpora.Dictionary(processed_doc)

In [24]:
# Prune the vocabulary in place:
#   no_below=10   -> keep tokens that occur in at least 10 comments
#   no_above=0.35 -> drop tokens that occur in more than 35% of comments
#   keep_n=100000 -> then keep at most the 100000 most frequent tokens
# This mutates `dictionary`; re-run the cell that builds it before changing
# the thresholds.
dictionary.filter_extremes(no_below=10, no_above=0.35, keep_n=100000)

#### Saving the corpus into a bag of words (BoW) format

In [25]:
# Convert each tokenized comment to bag-of-words form using the filtered
# dictionary.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_doc]

#### Running LDA on the BoW corpus with 7 worker processes

In [30]:
# Fit a 10-topic LDA model on the bag-of-words corpus.
# random_state pins the random initialisation so that re-running the cell
# gives comparable topics; multi-process training may still introduce small
# run-to-run differences.
# workers=7 requests 7 worker processes; lower it on machines with fewer cores.
lda_model_10 = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=10,
    id2word=dictionary,
    passes=2,
    workers=7,
    random_state=42,
)

In [31]:
# Show the top weighted words for every topic of the 10-topic model.
for topic_id, topic_words in lda_model_10.print_topics(-1):
    print(f'Topic: {topic_id} \nWords: {topic_words}')

Topic: 0 
Words: 0.012*"sour" + 0.011*"flavour" + 0.010*"aroma" + 0.009*"slight" + 0.008*"tart" + 0.008*"mouthfeel" + 0.008*"littl" + 0.007*"drink" + 0.007*"appl" + 0.007*"colour"
Topic: 1 
Words: 0.012*"creami" + 0.010*"spici" + 0.010*"smooth" + 0.009*"pepper" + 0.009*"stout" + 0.008*"drink" + 0.008*"glass" + 0.007*"spice" + 0.007*"note" + 0.007*"alcohol"
Topic: 2 
Words: 0.020*"aroma" + 0.017*"flavor" + 0.015*"dark" + 0.011*"hint" + 0.011*"mouthfeel" + 0.011*"glass" + 0.011*"slight" + 0.010*"color" + 0.010*"drinkabl" + 0.009*"note"
Topic: 3 
Words: 0.025*"hop" + 0.021*"caramel" + 0.018*"aroma" + 0.014*"amber" + 0.011*"slight" + 0.011*"malti" + 0.010*"brew" + 0.009*"earthi" + 0.009*"balanc" + 0.008*"style"
Topic: 4 
Words: 0.014*"littl" + 0.014*"flavor" + 0.012*"look" + 0.009*"drink" + 0.009*"glass" + 0.008*"think" + 0.008*"aroma" + 0.007*"alcohol" + 0.007*"feel" + 0.007*"dark"
Topic: 5 
Words: 0.023*"chocol" + 0.023*"dark" + 0.022*"coffe" + 0.016*"roast" + 0.014*"black" + 0.013*"brow

#### How many styles of beers are recorded in Canada

In [15]:
# Count how many beers fall under each style, most common first.
beer['style'].value_counts()

American IPA                                   1913
American Pale Ale (APA)                        1132
Belgian Saison                                  964
American Wild Ale                               606
Fruit and Field Beer                            553
American Imperial IPA                           551
American Amber / Red Ale                        445
American Blonde Ale                             445
Belgian Witbier                                 376
American Stout                                  348
American Lager                                  313
American Porter                                 300
American Imperial Stout                         287
German Hefeweizen                               267
American Brown Ale                              245
Herb and Spice Beer                             227
American Pale Wheat Ale                         205
Berliner Weisse                                 185
American Black Ale                              183
Bohemian Pil

#### Using HDP to determine the "optimal number of topics"

In [68]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel
# Fit an HDP model on the current bag-of-words corpus; random_state makes the
# fit repeatable across runs.
# NOTE(review): the execution counts suggest this cell (In [68]) ran after
# bow_corpus/dictionary were rebuilt from the filtered comments (In [63]), so
# a top-to-bottom run would fit on the full corpus instead. Confirm which
# corpus is intended.
hdp = HdpModel(bow_corpus, dictionary, random_state=42)

In [None]:
# Persist the fitted HDP model under the relative path 'CanadianHDP'.
hdp.save('CanadianHDP')

In [70]:
# List up to 50 HDP topics with their 10 highest-weighted words each.
hdp.print_topics(num_topics=50, num_words=10)

[(0,
  '0.009*appl + 0.009*fruit + 0.009*citrus + 0.008*bottl + 0.008*orang + 0.007*littl + 0.007*glass + 0.007*flavor + 0.007*aroma + 0.007*note'),
 (1,
  '0.010*citrus + 0.010*fruit + 0.010*orang + 0.009*aroma + 0.009*flavor + 0.008*grapefruit + 0.008*littl + 0.007*note + 0.007*bottl + 0.007*slight'),
 (2,
  '0.009*citrus + 0.008*orang + 0.007*fruit + 0.006*glass + 0.006*caramel + 0.006*bottl + 0.006*aroma + 0.006*grapefruit + 0.006*slight + 0.006*note'),
 (3,
  '0.010*citrus + 0.008*glass + 0.007*caramel + 0.007*orang + 0.006*leav + 0.006*floral + 0.006*finger + 0.006*grapefruit + 0.006*fruiti + 0.006*overal'),
 (4,
  '0.007*fruit + 0.006*citrus + 0.006*flavor + 0.006*orang + 0.006*bottl + 0.006*aroma + 0.005*littl + 0.005*slight + 0.005*appl + 0.005*glass'),
 (5,
  '0.006*citrus + 0.006*fruit + 0.006*orang + 0.005*glass + 0.005*aroma + 0.005*bottl + 0.005*grapefruit + 0.004*slight + 0.004*flavor + 0.004*littl'),
 (6,
  '0.005*citrus + 0.004*orang + 0.004*fruit + 0.004*slight + 0.00

In [43]:
# Start the style subset with every 'Belgian Saison' beer.
filtered_beer = beer.loc[beer['style'].eq('Belgian Saison')]

In [45]:
# Add the 'American IPA' beers to the subset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and likewise keeps the original row labels.
# Re-running this cell adds the same rows again.
filtered_beer = pd.concat([filtered_beer, beer[beer['style'] == 'American IPA']])

In [46]:
# Add the 'Fruit and Field Beer' beers to the subset.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and likewise keeps the original row labels.
# Re-running this cell adds the same rows again.
filtered_beer = pd.concat([filtered_beer, beer[beer['style'] == 'Fruit and Field Beer']])

In [51]:
# Keep only the reviews whose beer_number belongs to one of the selected beers.
filtered_reviews = reviews.loc[
    reviews['beer_number'].isin(filtered_beer['beer_number'])
]

In [52]:
# Comment text of the selected reviews, with missing comments removed.
filtered_comments = filtered_reviews['comment'].dropna()

In [54]:
# Tokenize the filtered comments. This rebinds processed_doc, replacing the
# result computed earlier from all comments.
processed_doc = filtered_comments.apply(preprocess)

In [56]:
# Rebuild the token dictionary from the current processed_doc; this rebinds
# `dictionary`, replacing the one built earlier in the notebook.
dictionary = gensim.corpora.Dictionary(processed_doc)

In [62]:
# Prune the vocabulary in place: keep tokens found in at least 10 comments,
# drop tokens found in more than 40% of comments, then keep at most the
# 100000 most frequent tokens. The earlier filter used no_above=0.35.
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)

In [63]:
# Bag-of-words form of the current processed_doc; this rebinds bow_corpus,
# replacing the corpus built earlier in the notebook.
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_doc]

In [66]:
# Fit a 5-topic LDA model on the current bag-of-words corpus.
# random_state pins the random initialisation so that re-running the cell
# gives comparable topics; multi-process training may still introduce small
# run-to-run differences.
# workers=7 requests 7 worker processes; lower it on machines with fewer cores.
lda_model_5 = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=5,
    id2word=dictionary,
    passes=2,
    workers=7,
    random_state=42,
)

In [67]:
# Show the top weighted words for every topic of the 5-topic model.
for topic_id, topic_words in lda_model_5.print_topics(-1):
    print(f'Topic: {topic_id} \nWords: {topic_words}')

Topic: 0 
Words: 0.016*"appl" + 0.014*"flavor" + 0.011*"aroma" + 0.009*"bottl" + 0.008*"color" + 0.008*"glass" + 0.008*"fruit" + 0.008*"grapefruit" + 0.008*"drink" + 0.008*"leav"
Topic: 1 
Words: 0.017*"citrus" + 0.015*"orang" + 0.010*"note" + 0.010*"grapefruit" + 0.010*"aroma" + 0.009*"bottl" + 0.009*"fruit" + 0.009*"flavor" + 0.008*"yeast" + 0.008*"caramel"
Topic: 2 
Words: 0.011*"orang" + 0.010*"slight" + 0.009*"grapefruit" + 0.009*"nose" + 0.009*"fruit" + 0.008*"pine" + 0.007*"caramel" + 0.007*"bottl" + 0.007*"littl" + 0.006*"flavour"
Topic: 3 
Words: 0.012*"appl" + 0.010*"citrus" + 0.008*"nose" + 0.008*"mouthfeel" + 0.008*"flavor" + 0.008*"overal" + 0.008*"finger" + 0.007*"aroma" + 0.007*"glass" + 0.006*"orang"
Topic: 4 
Words: 0.014*"fruit" + 0.013*"citrus" + 0.012*"littl" + 0.011*"glass" + 0.009*"bottl" + 0.009*"finger" + 0.008*"slight" + 0.008*"fruiti" + 0.008*"aroma" + 0.008*"orang"
