In [1]:
import pandas as pd
# Load the brewery, beer and review tables from CSV files in the working directory.
brew, beer, reviews = (
    pd.read_csv(csv_path)
    for csv_path in ("CanadianBreweries.csv", "CanadianBeers.csv", "CanadianReviews.csv")
)

#### Separating the comment data

In [2]:
# Pull the free-text review column out of the reviews table and preview it.
comments = reviews.loc[:, 'comment']
comments.head()

0    Appearance - Pours a hazy blush with two finge...
1    A collaboration brew with Small Pony Barrel Wo...
2                                                  NaN
3                                                  NaN
4                                                  NaN
Name: comment, dtype: object

#### Getting rid of NA

In [3]:
# Keep only the reviews that actually contain text; the original row labels are preserved.
comments = comments[comments.notna()]
comments.head()

0    Appearance - Pours a hazy blush with two finge...
1    A collaboration brew with Small Pony Barrel Wo...
5    Is pours a Light bronze/gold with a 3 finger w...
7    1L howler from Sherbrooke Liquor store - made ...
8    1L howler from Sherbrooke Liquor store, who re...
Name: comment, dtype: object

#### Importing tools

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
# Fetch the WordNet corpus through NLTK's downloader (a no-op when it is already up to date);
# the WordNetLemmatizer used in the preprocessing step relies on it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/cedric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Word Processing

We need to process the data for topic modelling, which includes splitting sentences into words, removing punctuation, lowercasing the words, removing stopwords and very short words, lemmatizing the words, and stemming the words.

Lemmatizing means changing a word to its base form: third-person forms become first person, and past or future tenses become present tense (e.g. "goes" becomes "go").
Stemming means reducing words to their root form.

In [5]:
# Build the stemmer and the lemmatizer once so they are shared by every call,
# instead of constructing a new WordNetLemmatizer for each token.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Reduce a single token to its stemmed verb lemma.

    The token is first lemmatized as a verb (``pos='v'``) with the WordNet
    lemmatizer, and the resulting lemma is then stemmed with the English
    Snowball stemmer.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The stemmed lemma of ``text``.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    """Turn one raw comment into a list of normalised tokens.

    The text is tokenized with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or that have three characters or fewer are discarded, and
    every remaining token is passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    list of str
        Normalised tokens, in their original order (duplicates are kept).
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    ]

Checking the function

In [6]:
# Look up the review whose index label is 1 (label-based, not positional) and display it.
sample = comments.loc[1]
sample

'A collaboration brew with Small Pony Barrel Works.    Appearance - Pours a hazy gold with a finger of bubbly white head.    Smell - spicy earthy yeast, vinous aromas, bready and biscuity malts, spicy earthy, leafy, and floral hops, lime and citrus aromas.    Taste - Spicy earthy yeast then quickly goes into the bold tart and pungent vinous characteristics. The bready and biscuity malts, spicy earthy, leafy, and floral hops come through next. The lime and citrus notes finish the brew off.    Mouthfeel - Medium bodied with moderate to high carbonation. Finishes tart and sticky alongside a bold tartness.     Overall - A bold and pungent brew that definitely delivers on the sourness. A single can is plenty for me but nonetheless an enjoyable brew!        '

In [7]:
# Sanity check: run the preprocessing function on the single sample comment and inspect the tokens.
preprocess(sample)

['collabor',
 'brew',
 'small',
 'poni',
 'barrel',
 'work',
 'appear',
 'pour',
 'hazi',
 'gold',
 'finger',
 'bubbl',
 'white',
 'head',
 'smell',
 'spici',
 'earthi',
 'yeast',
 'vinous',
 'aroma',
 'breadi',
 'biscuiti',
 'malt',
 'spici',
 'earthi',
 'leafi',
 'floral',
 'hop',
 'lime',
 'citrus',
 'aroma',
 'tast',
 'spici',
 'earthi',
 'yeast',
 'quick',
 'go',
 'bold',
 'tart',
 'pungent',
 'vinous',
 'characterist',
 'breadi',
 'biscuiti',
 'malt',
 'spici',
 'earthi',
 'leafi',
 'floral',
 'hop',
 'come',
 'lime',
 'citrus',
 'note',
 'finish',
 'brew',
 'mouthfeel',
 'medium',
 'bodi',
 'moder',
 'high',
 'carbon',
 'finish',
 'tart',
 'sticki',
 'alongsid',
 'bold',
 'tart',
 'overal',
 'bold',
 'pungent',
 'brew',
 'definit',
 'deliv',
 'sour',
 'singl',
 'plenti',
 'nonetheless',
 'enjoy',
 'brew']

#### Now is the time to process all comments. Warning: this step takes a while

In [8]:
# Tokenize, filter, lemmatize and stem every comment; the result is a Series of token lists
# that keeps the row labels of `comments`.
processed_doc = comments.map(preprocess)

In [9]:
# Preview the first few token lists to confirm the preprocessing ran over the whole Series.
processed_doc.head()

0    [appear, pour, hazi, blush, finger, bubbl, whi...
1    [collabor, brew, small, poni, barrel, work, ap...
5    [pour, light, bronz, gold, finger, white, foam...
7    [howler, sherbrook, liquor, store, earl, grey,...
8    [howler, sherbrook, liquor, store, need, asses...
Name: comment, dtype: object

#### Build a dictionary from the corpus, remove common words that appear in over 40% of the comments, and remove rare words that appear in fewer than 15 comments

In [10]:
# Build the token <-> integer id mapping from all of the processed comments.
dictionary = gensim.corpora.Dictionary(processed_doc)

In [11]:
# Prune the vocabulary in place (per the gensim API, thresholds count documents, not occurrences):
#   no_below=15  -> drop tokens that occur in fewer than 15 comments
#   no_above=0.4 -> drop tokens that occur in more than 40% of the comments
#   keep_n       -> after that, keep at most the 100000 most frequent tokens
dictionary.filter_extremes(no_below=15, no_above=0.4, keep_n=100000)

#### Saving the corpus into a bag of words (BoW) format

In [12]:
# Convert every token list into its bag-of-words form using the pruned dictionary;
# tokens that were filtered out of the dictionary are ignored by doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_doc))

#### Running LDA on the BoW corpus with 7 worker processes (plus the main process)

In [13]:
# Fit a 30-topic LDA model on the bag-of-words corpus.
#   passes=2        -> two full sweeps over the corpus
#   workers=7       -> seven worker processes in addition to the main process
#   random_state=42 -> fixed seed so a Restart & Run All yields comparable topics
#                      (multi-process scheduling may still cause small run-to-run differences)
lda_model_30 = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=30,
    id2word=dictionary,
    passes=2,
    workers=7,
    random_state=42,
)

In [14]:
# Print every topic (-1 = all of them) with its highest-weighted words.
topic_template = 'Topic: {} \nWords: {}'
for topic_id, topic_terms in lda_model_30.print_topics(-1):
    print(topic_template.format(topic_id, topic_terms))

Topic: 0 
Words: 0.020*"aroma" + 0.015*"citrus" + 0.014*"drink" + 0.013*"flavor" + 0.012*"color" + 0.011*"mouthfeel" + 0.011*"white" + 0.009*"bottl" + 0.008*"overal" + 0.008*"great"
Topic: 1 
Words: 0.012*"dark" + 0.011*"bottl" + 0.008*"brew" + 0.008*"leav" + 0.007*"littl" + 0.007*"chocol" + 0.007*"brown" + 0.007*"caramel" + 0.007*"finger" + 0.006*"glass"
Topic: 2 
Words: 0.020*"pumpkin" + 0.016*"glass" + 0.016*"white" + 0.014*"bottl" + 0.013*"medium" + 0.012*"nose" + 0.011*"spice" + 0.010*"pint" + 0.010*"leav" + 0.009*"orang"
Topic: 3 
Words: 0.024*"citrus" + 0.016*"orang" + 0.015*"hop" + 0.015*"grapefruit" + 0.013*"pine" + 0.012*"medium" + 0.012*"balanc" + 0.011*"white" + 0.009*"aroma" + 0.009*"mouthfeel"
Topic: 4 
Words: 0.015*"medium" + 0.014*"hop" + 0.011*"caramel" + 0.011*"graini" + 0.011*"white" + 0.010*"earthi" + 0.010*"breadi" + 0.010*"leafi" + 0.009*"leav" + 0.009*"smooth"
Topic: 5 
Words: 0.012*"glass" + 0.010*"earthi" + 0.009*"graini" + 0.009*"floral" + 0.009*"leav" + 0.009

#### How many styles of beers are recorded in Canada

In [15]:
# Count how many beers are recorded for each style, most common first.
# Bracket access is required here: `beer.style` would return the DataFrame's Styler accessor, not the column.
beer['style'].value_counts()

American IPA                                   1913
American Pale Ale (APA)                        1132
Belgian Saison                                  964
American Wild Ale                               606
Fruit and Field Beer                            553
American Imperial IPA                           551
American Amber / Red Ale                        445
American Blonde Ale                             445
Belgian Witbier                                 376
American Stout                                  348
American Lager                                  313
American Porter                                 300
American Imperial Stout                         287
German Hefeweizen                               267
American Brown Ale                              245
Herb and Spice Beer                             227
American Pale Wheat Ale                         205
Berliner Weisse                                 185
American Black Ale                              183
Bohemian Pil

#### Using HDP to determine the "optimal number of topics"

In [None]:
# NOTE(review): common_corpus / common_dictionary are gensim test fixtures and are not used
# in the visible cells; kept in case a later cell relies on them — consider removing.
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel

# Fit a Hierarchical Dirichlet Process model on the same bag-of-words corpus and dictionary
# used for LDA. A fixed seed makes the inferred topics repeatable across kernel restarts.
hdp = HdpModel(bow_corpus, id2word=dictionary, random_state=42)

In [None]:
# Persist the fitted HDP model to 'CanadianHDP' (relative to the working directory)
# so it can be reloaded later without refitting.
hdp.save('CanadianHDP')

In [None]:
# Show up to 30 of the HDP topics, each described by 10 words.
hdp.print_topics(num_topics=30, num_words=10)