In [71]:
import re
import numpy as np
import pandas as pd

from tqdm import tqdm
from pprint import pprint

# gensim
import gensim
import gensim.corpora as corpora
from gensim.models.callbacks import PerplexityMetric, CoherenceMetric
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy
import spacy

# plotting
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# enable logging for gensim: records are appended to 'model_callbacks.log'
# (path relative to the working directory) formatted as "time:level:message".
# basicConfig configures the root logger, so records from other libraries
# end up in the same file as well.
import logging
logging.basicConfig(filename='model_callbacks.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    # NOTSET on the root logger lets records of every level through
                    level=logging.NOTSET)

# filter warnings: only DeprecationWarning is silenced; other categories still show
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# import stop words
# NLTK's English stop-word list; remove_stopwords() further down filters
# tokens against this module-level name.
from nltk.corpus import stopwords

stop_words = stopwords.words("english")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# import the dataset
# merged review table read from the interim data folder (the path is relative
# to the notebook's working directory)
data = pd.read_csv("../data/interim/reviews_merged.csv")
# NOTE(review): the column listing includes 'Unnamed: 0' — presumably an index
# written by an earlier to_csv call; confirm whether it is still needed
print(data.columns)

data.head()

Index(['Unnamed: 0', 'title', 'asin', 'brand', 'rank', 'verified', 'overall',
       'reviewerID', 'reviewText'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,title,asin,brand,rank,verified,overall,reviewerID,reviewText
0,0,Reversi Sensory Challenger,42000742,Fidelity Electronics,"['>#2,623,937 in Toys &amp; Games (See Top 100...",True,5.0,A285UYK35TF093,Simply extraordinary! A jump into my past and ...
1,1,Medal of Honor: Warfighter - Includes Battlefi...,78764343,by\n \n EA Games,"['>#67,231 in Video Games (See Top 100 in Vide...",True,5.0,A24SSUT5CSW8BH,I want to start off by saying I have never pla...
2,2,Medal of Honor: Warfighter - Includes Battlefi...,78764343,by\n \n EA Games,"['>#67,231 in Video Games (See Top 100 in Vide...",True,5.0,AB9S9279OZ3QO,I haven't gotten around to playing the campaig...
3,3,Medal of Honor: Warfighter - Includes Battlefi...,78764343,by\n \n EA Games,"['>#67,231 in Video Games (See Top 100 in Vide...",False,4.0,AK3V0HEBJMQ7J,this will be my second medal of honor I love h...
4,4,Xbox 360 MAS STICK,324411812,by\n \n MAS SYSTEMS,"['>#105,263 in Video Games (See Top 100 in Vid...",True,1.0,A24KMZPZDZ5T3,"Ordered an arcade stick, received a business l..."


In [5]:
# peek at the raw text of the first five reviews
for row_pos in range(5):
    review_row = data.iloc[row_pos]
    # the review followed by three newlines, exactly what print(x); print('\n') emits
    print(review_row['reviewText'], end='\n\n\n')

Simply extraordinary! A jump into my past and sounds from the late seventies. Perfect like new! Simply amazing. Thank you!


I want to start off by saying I have never played the Call of Duty games. This is only the second first person shooter game that I have own. I think it is a lot of fun. Has good graphics and nice story line. It does take some skill to get through the levels. I think all players can enjoy this game. There are three levels to choose from based on your skill level. If your looking for first person shooter game that has current military type play than this is a good buy.


I haven't gotten around to playing the campaign but the multiplayer is solid and pretty fun. Includes Zero Dark Thirty pack, an Online Pass, and the all powerful Battlefield 4 Beta access.


this will be my second medal of honor I love how the incorporate real life military stories in the game great


Ordered an arcade stick, received a business law text book. Not sure what happened




In [7]:
# Pull the review texts out of the frame as a plain Python list.
reviews = data['reviewText'].values.tolist()
# Preview only the first few entries: echoing the whole list would embed
# every review in the notebook output.
reviews[:5]

['Simply extraordinary! A jump into my past and sounds from the late seventies. Perfect like new! Simply amazing. Thank you!',
 'I want to start off by saying I have never played the Call of Duty games. This is only the second first person shooter game that I have own. I think it is a lot of fun. Has good graphics and nice story line. It does take some skill to get through the levels. I think all players can enjoy this game. There are three levels to choose from based on your skill level. If your looking for first person shooter game that has current military type play than this is a good buy.',
 "I haven't gotten around to playing the campaign but the multiplayer is solid and pretty fun. Includes Zero Dark Thirty pack, an Online Pass, and the all powerful Battlefield 4 Beta access.",
 'this will be my second medal of honor I love how the incorporate real life military stories in the game great',
 'Ordered an arcade stick, received a business law text book. Not sure what happened',
 'G

In [10]:
# collapse every run of whitespace (newlines, tabs, repeated spaces) into a
# single space. The pattern is a raw string: '\s' inside a normal literal is
# an invalid escape sequence and triggers a warning on recent Python versions.
reviews = [re.sub(r'\s+', ' ', sent) for sent in reviews]
# preview a few entries instead of echoing the whole list
reviews[:5]

['Simply extraordinary! A jump into my past and sounds from the late seventies. Perfect like new! Simply amazing. Thank you!',
 'I want to start off by saying I have never played the Call of Duty games. This is only the second first person shooter game that I have own. I think it is a lot of fun. Has good graphics and nice story line. It does take some skill to get through the levels. I think all players can enjoy this game. There are three levels to choose from based on your skill level. If your looking for first person shooter game that has current military type play than this is a good buy.',
 "I haven't gotten around to playing the campaign but the multiplayer is solid and pretty fun. Includes Zero Dark Thirty pack, an Online Pass, and the all powerful Battlefield 4 Beta access.",
 'this will be my second medal of honor I love how the incorporate real life military stories in the game great',
 'Ordered an arcade stick, received a business law text book. Not sure what happened',
 'G

In [11]:
# removing html tags and character entities (named, decimal and hex forms);
# each match is replaced by a single space.
# The pattern is compiled once and reused for every review.
html_markup = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
reviews = [html_markup.sub(' ', text) for text in reviews]
# preview a few entries instead of echoing the whole list
reviews[:5]

['Simply extraordinary! A jump into my past and sounds from the late seventies. Perfect like new! Simply amazing. Thank you!',
 'I want to start off by saying I have never played the Call of Duty games. This is only the second first person shooter game that I have own. I think it is a lot of fun. Has good graphics and nice story line. It does take some skill to get through the levels. I think all players can enjoy this game. There are three levels to choose from based on your skill level. If your looking for first person shooter game that has current military type play than this is a good buy.',
 "I haven't gotten around to playing the campaign but the multiplayer is solid and pretty fun. Includes Zero Dark Thirty pack, an Online Pass, and the all powerful Battlefield 4 Beta access.",
 'this will be my second medal of honor I love how the incorporate real life military stories in the game great',
 'Ordered an arcade stick, received a business law text book. Not sure what happened',
 'G

In [12]:
%%time
# tokenize words and clean-up text
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is converted with ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``; one token list is yielded
    per item, in input order.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens
        
# materialise the generator so the token lists can be indexed and reused
reviews_tokens = list(sent_to_words(reviews))

# spot-check: tokens produced for the second review
print(reviews_tokens[1])

I want to start off by saying I have never played the Call of Duty games. This is only the second first person shooter game that I have own. I think it is a lot of fun. Has good graphics and nice story line. It does take some skill to get through the levels. I think all players can enjoy this game. There are three levels to choose from based on your skill level. If your looking for first person shooter game that has current military type play than this is a good buy.


In [14]:
# inspect the token lists of the first 100 reviews
reviews_tokens[:100]

[['simply',
  'extraordinary',
  'jump',
  'into',
  'my',
  'past',
  'and',
  'sounds',
  'from',
  'the',
  'late',
  'seventies',
  'perfect',
  'like',
  'new',
  'simply',
  'amazing',
  'thank',
  'you'],
 ['want',
  'to',
  'start',
  'off',
  'by',
  'saying',
  'have',
  'never',
  'played',
  'the',
  'call',
  'of',
  'duty',
  'games',
  'this',
  'is',
  'only',
  'the',
  'second',
  'first',
  'person',
  'shooter',
  'game',
  'that',
  'have',
  'own',
  'think',
  'it',
  'is',
  'lot',
  'of',
  'fun',
  'has',
  'good',
  'graphics',
  'and',
  'nice',
  'story',
  'line',
  'it',
  'does',
  'take',
  'some',
  'skill',
  'to',
  'get',
  'through',
  'the',
  'levels',
  'think',
  'all',
  'players',
  'can',
  'enjoy',
  'this',
  'game',
  'there',
  'are',
  'three',
  'levels',
  'to',
  'choose',
  'from',
  'based',
  'on',
  'your',
  'skill',
  'level',
  'if',
  'your',
  'looking',
  'for',
  'first',
  'person',
  'shooter',
  'game',
  'that',
  'has

In [16]:
%%time
# building the bigram phrase model (only a bigram model is trained in this cell)
bigram = gensim.models.Phrases(reviews_tokens, min_count=5, threshold=100) # higher threshold fewer phrases

# Phraser wraps the trained Phrases model; bigram_mod is what make_bigrams()
# indexes with each document
bigram_mod = gensim.models.phrases.Phraser(bigram)

In [29]:
# show the phrase model applied to the first 100 tokenized reviews
for doc_pos in range(100):
    phrased_tokens = bigram_mod[reviews_tokens[doc_pos]]
    # the tokens followed by three newlines, exactly what print(x); print('\n') emits
    print(phrased_tokens, end='\n\n\n')

['simply', 'extraordinary', 'jump', 'into', 'my', 'past', 'and', 'sounds', 'from', 'the', 'late', 'seventies', 'perfect', 'like', 'new', 'simply', 'amazing', 'thank', 'you']


['want', 'to', 'start', 'off', 'by', 'saying', 'have', 'never', 'played', 'the', 'call', 'of', 'duty', 'games', 'this', 'is', 'only', 'the', 'second', 'first', 'person', 'shooter', 'game', 'that', 'have', 'own', 'think', 'it', 'is', 'lot', 'of', 'fun', 'has', 'good', 'graphics', 'and', 'nice', 'story', 'line', 'it', 'does', 'take', 'some', 'skill', 'to', 'get', 'through', 'the', 'levels', 'think', 'all', 'players', 'can', 'enjoy', 'this', 'game', 'there', 'are', 'three', 'levels', 'to', 'choose', 'from', 'based', 'on', 'your', 'skill', 'level', 'if', 'your', 'looking', 'for', 'first', 'person', 'shooter', 'game', 'that', 'has', 'current', 'military', 'type', 'play', 'than', 'this', 'is', 'good', 'buy']


['haven', 'gotten', 'around', 'to', 'playing', 'the', 'campaign', 'but', 'the', 'multiplayer', 'is', 'solid', 

In [48]:
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is passed through ``str`` and ``simple_preprocess`` before
    filtering (a token list is therefore stringified and re-tokenized).
    Returns one list of kept tokens per document, in input order; progress
    is reported through ``tqdm``.
    """
    # Build the lookup once: membership tests on a set are O(1), whereas the
    # module-level ``stop_words`` list would be scanned for every token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in tqdm(texts)]

def make_bigrams(texts):
    """Apply the trained phrase model ``bigram_mod`` to every document in ``texts``.

    Returns a list holding one transformed document per input document, in
    input order, with a ``tqdm`` progress bar over the input.
    """
    phrased_docs = []
    for tokens in tqdm(texts):
        phrased_docs.append(bigram_mod[tokens])
    return phrased_docs

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of token sequences
        Each document is joined with single spaces and processed by the
        module-level spaCy pipeline ``nlp``.
    allowed_postags : container of str
        Values of ``token.pos_`` to keep. The default is a tuple so that no
        mutable object is shared between calls.

    Returns
    -------
    list of list of str
        The lemmas of the retained tokens, one list per document, in input order.
    """
    # tqdm wraps the input side, so the bar advances as documents are fed to
    # the pipeline (it may run slightly ahead of the batch being processed).
    joined_docs = (' '.join(sent) for sent in tqdm(texts))
    # nlp.pipe processes the stream in batches and yields Docs in input order,
    # avoiding the per-call overhead of invoking nlp() once per document.
    return [[token.lemma_ for token in doc if token.pos_ in allowed_postags]
            for doc in nlp.pipe(joined_docs)]

In [40]:
%%time
# removing stop words from every tokenized review
reviews_no_stops = remove_stopwords(reviews_tokens)

# form bigrams: apply the trained phrase model to the stop-word-free tokens
reviews_bigrams = make_bigrams(reviews_no_stops)

100%|██████████| 1932240/1932240 [15:07<00:00, 2130.20it/s] 
100%|██████████| 1932240/1932240 [09:37<00:00, 3347.50it/s] 


CPU times: user 13min 29s, sys: 6min 35s, total: 20min 5s
Wall time: 25min 30s


In [41]:
%store
reviews_bigrams

Stored variables and their in-db values:


[['simply',
  'extraordinary',
  'jump',
  'past',
  'sounds',
  'late',
  'seventies',
  'perfect',
  'like',
  'new',
  'simply',
  'amazing',
  'thank'],
 ['want',
  'start',
  'saying',
  'never',
  'played',
  'call',
  'duty',
  'games',
  'second',
  'first',
  'person',
  'shooter',
  'game',
  'think',
  'lot',
  'fun',
  'good',
  'graphics',
  'nice',
  'story',
  'line',
  'take',
  'skill',
  'get',
  'levels',
  'think',
  'players',
  'enjoy',
  'game',
  'three',
  'levels',
  'choose',
  'based',
  'skill',
  'level',
  'looking',
  'first',
  'person',
  'shooter',
  'game',
  'current',
  'military',
  'type',
  'play',
  'good',
  'buy'],
 ['gotten',
  'around',
  'playing',
  'campaign',
  'multiplayer',
  'solid',
  'pretty',
  'fun',
  'includes',
  'zero',
  'dark',
  'thirty',
  'pack',
  'online',
  'pass',
  'powerful',
  'battlefield',
  'beta',
  'access'],
 ['second',
  'medal',
  'honor',
  'love',
  'incorporate',
  'real',
  'life',
  'military',
  'sto

In [49]:
# initialize 'en' model
# parser and NER are disabled: lemmatization() only reads token.lemma_ and token.pos_
# NOTE(review): 'en' is a shortcut name — newer spaCy releases expect the full
# package name (e.g. 'en_core_web_sm'); confirm against the installed version
nlp = spacy.load('en', disable=['parser', 'ner'])

# lemmatization only keeping noun, adj, vb, adv
reviews_lemmatized = lemmatization(reviews_bigrams)

100%|██████████| 1932240/1932240 [1:41:57<00:00, 315.83it/s]   


In [50]:
# see lemmatization example: lemmas kept for the first 100 reviews
for doc_pos in range(100):
    kept_lemmas = reviews_lemmatized[doc_pos]
    # the lemmas followed by three newlines, exactly what print(x); print('\n') emits
    print(kept_lemmas, end='\n\n\n')

['simply', 'extraordinary', 'jump', 'sound', 'late', 'seventy', 'perfect', 'simply', 'amazing', 'thank']


['want', 'start', 'say', 'never', 'play', 'call', 'duty', 'game', 'second', 'first', 'person', 'shooter', 'game', 'think', 'lot', 'fun', 'good', 'graphic', 'story', 'line', 'take', 'skill', 'get', 'level', 'think', 'player', 'enjoy', 'game', 'level', 'choose', 'base', 'skill', 'level', 'look', 'first', 'person', 'shooter', 'game', 'current', 'military', 'type', 'play', 'good', 'buy']


['get', 'play', 'campaign', 'multiplayer', 'solid', 'pretty', 'fun', 'include', 'dark', 'pack', 'online', 'pass', 'powerful', 'battlefield', 'beta', 'access']


['medal', 'honor', 'love', 'incorporate', 'real', 'life', 'military', 'story', 'game', 'great']


['order', 'receive', 'business', 'law', 'text', 'book', 'sure', 'happen']


['great', 'information', 'old', 'use', 'new', 'window', 'system']


['really', 'think', 'would', 'issue', 'turn', 'game', 'play', 'star', 'cause', 'know', 'good', 'game'

In [52]:
%store

reviews_lemmatized

Stored variables and their in-db values:


[['simply',
  'extraordinary',
  'jump',
  'sound',
  'late',
  'seventy',
  'perfect',
  'simply',
  'amazing',
  'thank'],
 ['want',
  'start',
  'say',
  'never',
  'play',
  'call',
  'duty',
  'game',
  'second',
  'first',
  'person',
  'shooter',
  'game',
  'think',
  'lot',
  'fun',
  'good',
  'graphic',
  'story',
  'line',
  'take',
  'skill',
  'get',
  'level',
  'think',
  'player',
  'enjoy',
  'game',
  'level',
  'choose',
  'base',
  'skill',
  'level',
  'look',
  'first',
  'person',
  'shooter',
  'game',
  'current',
  'military',
  'type',
  'play',
  'good',
  'buy'],
 ['get',
  'play',
  'campaign',
  'multiplayer',
  'solid',
  'pretty',
  'fun',
  'include',
  'dark',
  'pack',
  'online',
  'pass',
  'powerful',
  'battlefield',
  'beta',
  'access'],
 ['medal',
  'honor',
  'love',
  'incorporate',
  'real',
  'life',
  'military',
  'story',
  'game',
  'great'],
 ['order', 'receive', 'business', 'law', 'text', 'book', 'sure', 'happen'],
 ['great', 'infor

In [55]:
# attach the processed token lists to the frame, one list per review row
# (this adds the columns to `data` in place)
data['reviewBigrams'] = reviews_bigrams
data['reviewLemmatized'] = reviews_lemmatized

# save to interim
# NOTE(review): to_csv writes the index by default and serialises each token
# list as its string repr — confirm downstream readers expect that
data.to_csv("../data/interim/reviews_merged_lemmatized.csv")

In [57]:
# preview a few lemmatized documents; echoing the whole list would embed
# every review in the notebook output
reviews_lemmatized[:5]

[['simply',
  'extraordinary',
  'jump',
  'sound',
  'late',
  'seventy',
  'perfect',
  'simply',
  'amazing',
  'thank'],
 ['want',
  'start',
  'say',
  'never',
  'play',
  'call',
  'duty',
  'game',
  'second',
  'first',
  'person',
  'shooter',
  'game',
  'think',
  'lot',
  'fun',
  'good',
  'graphic',
  'story',
  'line',
  'take',
  'skill',
  'get',
  'level',
  'think',
  'player',
  'enjoy',
  'game',
  'level',
  'choose',
  'base',
  'skill',
  'level',
  'look',
  'first',
  'person',
  'shooter',
  'game',
  'current',
  'military',
  'type',
  'play',
  'good',
  'buy'],
 ['get',
  'play',
  'campaign',
  'multiplayer',
  'solid',
  'pretty',
  'fun',
  'include',
  'dark',
  'pack',
  'online',
  'pass',
  'powerful',
  'battlefield',
  'beta',
  'access'],
 ['medal',
  'honor',
  'love',
  'incorporate',
  'real',
  'life',
  'military',
  'story',
  'game',
  'great'],
 ['order', 'receive', 'business', 'law', 'text', 'book', 'sure', 'happen'],
 ['great', 'infor

In [59]:
%%time
# creating the dictionary and corpus needed for topic modeling
# vocabulary built from the lemmatized reviews
id2word = corpora.Dictionary(reviews_lemmatized)

# bag-of-words corpus: one list of (token id, count) pairs per review
corpus = list(map(id2word.doc2bow, tqdm(reviews_lemmatized)))

100%|██████████| 1932240/1932240 [10:28<00:00, 3073.56it/s] 

CPU times: user 3min 4s, sys: 4min 14s, total: 7min 18s
Wall time: 12min 22s





In [65]:
# view the bag-of-words vector of the first review as (token id, count) pairs
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1)]


In [64]:
# readable view of the first document: token ids swapped for their words

[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

[[('amazing', 1),
  ('extraordinary', 1),
  ('jump', 1),
  ('late', 1),
  ('perfect', 1),
  ('seventy', 1),
  ('simply', 2),
  ('sound', 1),
  ('thank', 1)]]

In [74]:
%store

id2word, corpus

Stored variables and their in-db values:


(<gensim.corpora.dictionary.Dictionary at 0x7fae95f3e668>,
 [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1)],
  [(9, 1),
   (10, 1),
   (11, 1),
   (12, 1),
   (13, 1),
   (14, 1),
   (15, 1),
   (16, 2),
   (17, 1),
   (18, 4),
   (19, 1),
   (20, 2),
   (21, 1),
   (22, 3),
   (23, 1),
   (24, 1),
   (25, 1),
   (26, 1),
   (27, 1),
   (28, 2),
   (29, 2),
   (30, 1),
   (31, 1),
   (32, 1),
   (33, 2),
   (34, 2),
   (35, 1),
   (36, 1),
   (37, 1),
   (38, 2),
   (39, 1),
   (40, 1)],
  [(17, 1),
   (19, 1),
   (29, 1),
   (41, 1),
   (42, 1),
   (43, 1),
   (44, 1),
   (45, 1),
   (46, 1),
   (47, 1),
   (48, 1),
   (49, 1),
   (50, 1),
   (51, 1),
   (52, 1),
   (53, 1)],
  [(18, 1),
   (26, 1),
   (36, 1),
   (54, 1),
   (55, 1),
   (56, 1),
   (57, 1),
   (58, 1),
   (59, 1),
   (60, 1)],
  [(61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1)],
  [(54, 1), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1)],
  [(18, 2),
   (20, 1),
