In [1]:
import pandas as pd
# Read the brewery, beer and review tables from the local data directory.
brew, beer, reviews = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/CanadianBreweries.csv",
        "./data/CanadianBeers.csv",
        "./data/CanadianReviews.csv",
    )
)

#### Separating the comment data

In [2]:
# Keep only the free-text review column and preview the first rows.
comments = reviews.loc[:, 'comment']
comments.head()

0    Appearance - Pours a hazy blush with two finge...
1    A collaboration brew with Small Pony Barrel Wo...
2                                                  NaN
3                                                  NaN
4                                                  NaN
Name: comment, dtype: object

#### Getting rid of NA

In [3]:
# Discard reviews without any comment text, then preview what is left.
has_text = comments.notna()
comments = comments[has_text]
comments.head()

0    Appearance - Pours a hazy blush with two finge...
1    A collaboration brew with Small Pony Barrel Wo...
5    Is pours a Light bronze/gold with a 3 finger w...
7    1L howler from Sherbrooke Liquor store - made ...
8    1L howler from Sherbrooke Liquor store, who re...
Name: comment, dtype: object

#### Importing tools

In [4]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
import nltk
# Make sure the WordNet corpus is available locally before lemmatizing
# (presumably required by WordNetLemmatizer; the call reports "already up-to-date" when present).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/cedric/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Word Processing

We need to process the data for topic modelling: splitting sentences into words, removing punctuation, lowercasing the words, removing stopwords and words of three characters or fewer, lemmatizing the words, and stemming them.

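Lemmatizing means reducing a word to its dictionary form; here every word is treated as a verb, so third-person forms become first person and past or future tenses become present tense.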
Stemming means reducing words to their root form.

In [5]:
# Build the stemmer and the lemmatizer once so every token reuses the same
# objects instead of constructing a new WordNetLemmatizer on each call.
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()


def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the result.

    Parameters
    ----------
    text : str
        One token (word) to normalise.

    Returns
    -------
    str
        The English Snowball stem of the token's verb lemma.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text, min_length=4):
    """Tokenize a comment and normalise every token that is worth keeping.

    The text is split with gensim's ``simple_preprocess``; tokens that are
    gensim stopwords or shorter than ``min_length`` characters are dropped,
    and the remaining ones are passed through ``lemmatize_stemming``.

    Parameters
    ----------
    text : str
        Raw comment text.
    min_length : int, optional
        Minimum number of characters a token must have to be kept.
        The default of 4 matches the previous hard-coded ``len(token) > 3``.

    Returns
    -------
    list of str
        Normalised tokens, in their order of appearance.
    """
    return [
        lemmatize_stemming(token)
        for token in simple_preprocess(text)
        if token not in STOPWORDS and len(token) >= min_length
    ]

Checking the function

In [6]:
# Take the first remaining comment as a worked example.
sample = comments.iat[0]
sample

'Appearance - Pours a hazy blush with two fingers of bubbly white head.    Smell - spicy earthy, leafy, and floral hops, peach, apricot, blackberry, wheaty and biscuity malts, and spicy earthy yeast.    Taste - spicy earthy, leafy, and floral hops quickly followed by the natural flavours of peach, apricot, and blackberry. The wheaty and biscuity malts follow suit and the spicy earthy yeast helps to finish off the brew.     Mouthfeel - Medium bodied with moderate to high carbonation. Finishes smooth with the fruits lingering.     Overall - A flavourful brew that displays how three summer fruits can come together to create a nice balance between sweet and tart. This brew is by no means tart or sweet; it has a nice balance to it. Highly sessionable for sure. This would be ideal for patio weather!        '

In [7]:
# Run the preprocessing pipeline on the sample comment to inspect the resulting tokens.
preprocess(sample)

['appear',
 'pour',
 'hazi',
 'blush',
 'finger',
 'bubbl',
 'white',
 'head',
 'smell',
 'spici',
 'earthi',
 'leafi',
 'floral',
 'hop',
 'peach',
 'apricot',
 'blackberri',
 'wheati',
 'biscuiti',
 'malt',
 'spici',
 'earthi',
 'yeast',
 'tast',
 'spici',
 'earthi',
 'leafi',
 'floral',
 'hop',
 'quick',
 'follow',
 'natur',
 'flavour',
 'peach',
 'apricot',
 'blackberri',
 'wheati',
 'biscuiti',
 'malt',
 'follow',
 'suit',
 'spici',
 'earthi',
 'yeast',
 'help',
 'finish',
 'brew',
 'mouthfeel',
 'medium',
 'bodi',
 'moder',
 'high',
 'carbon',
 'finish',
 'smooth',
 'fruit',
 'linger',
 'overal',
 'flavour',
 'brew',
 'display',
 'summer',
 'fruit',
 'come',
 'creat',
 'nice',
 'balanc',
 'sweet',
 'tart',
 'brew',
 'mean',
 'tart',
 'sweet',
 'nice',
 'balanc',
 'high',
 'session',
 'sure',
 'ideal',
 'patio',
 'weather']

#### Now is the time to process all comments. Warning: this step takes a while

In [8]:
# Tokenize and normalise every comment; each element becomes a list of tokens.
processed_doc = comments.map(preprocess)

In [9]:
# Preview the token lists produced for the first few comments.
processed_doc.head()

0    [appear, pour, hazi, blush, finger, bubbl, whi...
1    [collabor, brew, small, poni, barrel, work, ap...
5    [pour, light, bronz, gold, finger, white, foam...
7    [howler, sherbrook, liquor, store, earl, grey,...
8    [howler, sherbrook, liquor, store, need, asses...
Name: comment, dtype: object

#### Build a dictionary from the corpus, then remove common words that appear in more than 35% of the comments and rare words that appear in fewer than 5 comments

In [10]:
# Map every distinct token in the processed comments to an integer id.
dictionary = gensim.corpora.Dictionary(documents=processed_doc)

In [11]:
# Prune the vocabulary in place: drop tokens found in fewer than 5 comments
# (no_below) or in more than 35% of all comments (no_above), then keep at most
# the 100000 most frequent remaining tokens (keep_n).
dictionary.filter_extremes(no_below=5, no_above=0.35, keep_n=100000)

#### Saving the corpus into a bag of words (BoW) format

In [12]:
# Convert each token list into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_doc))

#### Running LDA on the BoW corpus with 7 worker processes

In [None]:
# Seed the training so the discovered topics can be reproduced after a kernel restart.
# NOTE(review): multi-process training may still show small run-to-run differences — confirm.
LDA_SEED = 42

# Train two LDA models on the same corpus to compare a 20-topic and a 30-topic fit.
lda_model_20 = gensim.models.LdaMulticore(
    bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=7,
    random_state=LDA_SEED,
)
lda_model_30 = gensim.models.LdaMulticore(
    bow_corpus, num_topics=30, id2word=dictionary, passes=2, workers=7,
    random_state=LDA_SEED,
)

In [None]:
# List every topic of the 20-topic model together with its weighted top words.
for topic_id, topic_terms in lda_model_20.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

In [None]:
# List every topic of the 30-topic model together with its weighted top words.
for topic_id, topic_terms in lda_model_30.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))

#### How many styles of beers are recorded in Canada

In [None]:
# Count how many beers fall under each style, most frequent style first.
beer['style'].value_counts()

#### Using HDP to determine the "optimal number of topics"

In [None]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel
# Fit a Hierarchical Dirichlet Process model on the bag-of-words corpus.
# The seed is fixed so the inferred topics can be reproduced on a re-run.
hdp = HdpModel(bow_corpus, dictionary, random_state=42)

## Trying to separate beer reviews by style

In [None]:
# Persist the trained HDP model under the relative path 'CanadianHDP'.
hdp.save('CanadianHDP')

In [None]:
# Show up to 50 HDP topics, each described by its 10 top words.
hdp.print_topics(num_topics=50, num_words=10)

In [None]:
# Start the style subset with the Belgian Saison beers.
is_saison = beer['style'].eq('Belgian Saison')
filtered_beer = beer.loc[is_saison]

In [None]:
# Add the American IPA beers to the subset.
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row order and index.
filtered_beer = pd.concat([filtered_beer, beer[beer['style'] == 'American IPA']])

In [None]:
# Add the Fruit and Field Beer entries to the subset.
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row order and index.
filtered_beer = pd.concat([filtered_beer, beer[beer['style'] == 'Fruit and Field Beer']])

In [None]:
# Keep only the reviews whose beer_number belongs to one of the selected beers.
selected_beer_numbers = filtered_beer['beer_number']
in_selected_styles = reviews['beer_number'].isin(selected_beer_numbers)
filtered_reviews = reviews[in_selected_styles]

In [None]:
# Extract the comment text of the selected reviews and drop the missing ones.
filtered_comments = filtered_reviews['comment']
filtered_comments = filtered_comments.dropna()

In [None]:
# Tokenize and normalise the comments of the selected styles (replaces the earlier processed_doc).
processed_doc = filtered_comments.map(preprocess)

In [None]:
# Rebuild the token-to-id dictionary from the style-filtered comments only.
dictionary = gensim.corpora.Dictionary(documents=processed_doc)

In [None]:
# Prune the vocabulary in place: drop tokens found in fewer than 10 comments
# (no_below) or in more than 40% of all comments (no_above), then keep at most
# the 100000 most frequent remaining tokens (keep_n).
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)

In [None]:
# Convert each style-filtered token list into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_doc))

In [None]:
# Train a 5-topic LDA model on the style-filtered corpus.
# The seed is fixed so the topics can be reproduced after a kernel restart.
# NOTE(review): multi-process training may still show small run-to-run differences — confirm.
LDA_SEED = 42
lda_model_5 = gensim.models.LdaMulticore(
    bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=7,
    random_state=LDA_SEED,
)

In [None]:
# List every topic of the 5-topic model together with its weighted top words.
for topic_id, topic_terms in lda_model_5.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(topic_id, topic_terms))