Researcher - TRISHUL CHOWDHURY

In [None]:
import nltk; nltk.download('stopwords')

In [16]:
import stop_words

In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [19]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [5]:
import pandas as pd
df = pd.read_csv('amazon_200.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
df.columns

Index(['id', 'name', 'asins', 'brand', 'categories', 'keys', 'manufacturer',
       'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen',
       'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id',
       'reviews.numHelpful', 'reviews.rating', 'reviews.sourceURLs',
       'reviews.text', 'reviews.title', 'reviews.userCity',
       'reviews.userProvince', 'reviews.username'],
      dtype='object')

In [8]:
df["reviews.text"]=df["reviews.text"].astype(str)

In [9]:
# Convert to list
data = df["reviews.text"].values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['This product so far has not disappointed. My children love to use it and I '
 'like the ability to monitor control what content they see with ease.']


In [14]:
## Tokenize words and Clean-up text

In [10]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:1])

[['this', 'product', 'so', 'far', 'has', 'not', 'disappointed', 'my', 'children', 'love', 'to', 'use', 'it', 'and', 'like', 'the', 'ability', 'to', 'monitor', 'control', 'what', 'content', 'they', 'see', 'with', 'ease']]


In [16]:
# Creating Bigram and Trigram Models
# Bigrams are two words frequently occurring together in the document. Trigrams are 3 words frequently occurring.

In [11]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['this', 'product', 'so', 'far', 'has', 'not', 'disappointed', 'my', 'children', 'love', 'to', 'use', 'it', 'and', 'like', 'the', 'ability', 'to', 'monitor', 'control', 'what', 'content', 'they', 'see', 'with', 'ease']


In [18]:
#Remove Stopwords, Make Bigrams and Lemmatize

In [12]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [13]:
import en_core_web_sm
nlp = en_core_web_sm.load(disable=['parser', 'ner'])

In [20]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
#nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['product', 'far', 'disappointed', 'child', 'love', 'ability', 'monitor', 'control', 'content', 'see', 'ease']]


In [None]:
#Create the Dictionary and Corpus needed for Topic Modeling

In [21]:
id2word = corpora.Dictionary(data_lemmatized)

In [32]:
dict(list(id2word.items())[0:10])

{0: 'ability',
 1: 'child',
 2: 'content',
 3: 'control',
 4: 'disappointed',
 5: 'ease',
 6: 'far',
 7: 'love',
 8: 'monitor',
 9: 'product'}

In [24]:
id2word[0]

'ability'

In [25]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('ability', 1),
  ('child', 1),
  ('content', 1),
  ('control', 1),
  ('disappointed', 1),
  ('ease', 1),
  ('far', 1),
  ('love', 1),
  ('monitor', 1),
  ('product', 1),
  ('see', 1)]]

In [26]:
# Building the Topic Model

In [27]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [28]:
# View the topics in LDA model

In [29]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.257*"need" + 0.220*"make" + 0.068*"memory" + 0.060*"enough" + '
  '0.053*"sure" + 0.051*"help" + 0.046*"may" + 0.045*"fact" + 0.042*"cheap" + '
  '0.026*"thank"'),
 (1,
  '0.260*"set" + 0.207*"content" + 0.110*"access" + 0.107*"allow" + '
  '0.048*"kid" + 0.045*"spend" + 0.035*"instal" + 0.026*"travel" + '
  '0.023*"plan" + 0.015*"limit"'),
 (2,
  '0.537*"good" + 0.104*"money" + 0.096*"worth" + 0.089*"internet" + '
  '0.089*"nice" + 0.048*"small" + 0.000*"surfing" + 0.000*"cable" + '
  '0.000*"box" + 0.000*"stick"'),
 (3,
  '0.264*"great" + 0.179*"work" + 0.151*"well" + 0.092*"would" + '
  '0.074*"recommend" + 0.045*"price" + 0.034*"look" + 0.033*"little" + '
  '0.032*"say" + 0.020*"definitely"'),
 (4,
  '0.113*"way" + 0.112*"find" + 0.102*"far" + 0.086*"take" + 0.074*"control" + '
  '0.062*"old" + 0.054*"never" + 0.053*"year" + 0.051*"perfect" + '
  '0.047*"seem"'),
 (5,
  '0.236*"lot" + 0.117*"speed" + 0.093*"sound" + 0.086*"always" + 0.049*"hard" '
  '+ 0.043*"speaker" + 0

In [30]:
#Compute Model Perplexity and Coherence Score
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -13.383275113512832

Coherence Score:  0.45583310717988335


In [31]:
#Visualize the topics-keywords
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

### Building LDA Mallet Model

In [65]:
import os
os.path

<module 'ntpath' from 'C:\\Users\\Trishul Chowdhury\\anaconda3\\envs\\tensorflow\\lib\\ntpath.py'>

In [84]:
import os
from gensim.models.wrappers import LdaMallet
os.environ.update({'MALLET_HOME':r'C:\mallet'})
mallet_path = r'c:\mallet\bin\mallet.bat' # update this path

In [85]:
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [86]:
# Show Topics
pprint(ldamallet.show_topics(formatted=False))

[(18,
  [('long', 0.05335348785130445),
   ('charge', 0.0447878177135819),
   ('time', 0.04215653342290897),
   ('battery', 0.03510245213301982),
   ('case', 0.03224722875377897),
   ('kindle', 0.02737655357742694),
   ('day', 0.022337924084648974),
   ('week', 0.022113984996081065),
   ('back', 0.021050274325383495),
   ('cover', 0.020210502743253835)]),
 (15,
  [('screen', 0.07484636798151163),
   ('kindle', 0.05998214191921845),
   ('turn', 0.03513840012605704),
   ('light', 0.03214454540679657),
   ('hand', 0.02542150323021167),
   ('page', 0.024738694259152268),
   ('model', 0.021534744471873524),
   ('paperwhite', 0.020799411733809548),
   ('feel', 0.020379221597772994),
   ('touch', 0.018173223383581072)]),
 (0,
  [('purchase', 0.24176727134820808),
   ('enjoy', 0.09904557234056001),
   ('happy', 0.06769483597749826),
   ('family', 0.05764490234498451),
   ('item', 0.05549585993300044),
   ('learn', 0.04898552556728399),
   ('friend', 0.0424119840718033),
   ('day', 0.0349535427

In [87]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)


Coherence Score:  0.5766526924705814
