In [1]:
# importing libraries
import pandas as pd
import numpy as np
from pandas import DataFrame, Series 
import re
import string
import nltk
import spacy
import os
import gensim
from gensim import corpora

# libraries for visualization
import pyLDAvis
import pyLDAvis.gensim_models

from gensim.models.coherencemodel import CoherenceModel


In [2]:
# load data
review_data= pd.read_csv("Desktop/Reviews.csv")
print(review_data.head(3))
print(len(review_data))
print('Unique Products')
print(len(review_data.groupby('ProductId')))
print('Unique Users')
print(len(review_data.groupby('UserId')))

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1      Not as Advertised  Product arrived labeled as Jumbo Salted Peanut...  
2  "Delight" says it all  This is a confection that has been around a fe...  
568454
Unique Products
74258
Unique Users
256059


In [3]:
# cleaning data
def clean_text(text ): 
    delete_dict = {sp_character: '' for sp_character in string.punctuation} 
    delete_dict[' '] = ' ' 
    table = str.maketrans(delete_dict)
    text1 = text.translate(table)
   # print('cleaned:'+text1.head(2))
    textArr= text1.split()
    text2 = ' '.join([w for w in textArr if ( not w.isdigit() and  ( not w.isdigit() and len(w)>3))]) 
    
    return text2.lower()

In [4]:
review_data.dropna(axis = 0, how ='any',inplace=True) 



review_data['Text'] = review_data['Text'].apply(clean_text)
review_data['Num_words_text'] = review_data['Text'].apply(lambda x:len(str(x).split())) 

print('-------Dataset --------')
print(review_data['Score'].value_counts())
print(len(review_data))
print('-------------------------')
max_review_data_sentence_length  = review_data['Num_words_text'].max()
# words in review 100<=90
mask = (review_data['Num_words_text'] < 100) & (review_data['Num_words_text'] >=90)
df_short_reviews = review_data[mask]
# for every score 1-5 we are taking 20 samples. Total samples = 100
df_sampled = df_short_reviews.groupby('Score').apply(lambda x: x.sample(n=20)).reset_index(drop = True)

print('No of Short reviews')
print(len(df_short_reviews))

-------Dataset --------
5    363111
4     80655
1     52264
3     42638
2     29743
Name: Score, dtype: int64
568411
-------------------------
No of Short reviews
11815


In [5]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# function to remove stopwords
def remove_stopwords(text):
    textArr = text.split(' ')
    rem_text = " ".join([i for i in textArr if i not in stop_words])
    return rem_text


df_sampled['Text']=df_sampled['Text'].apply(remove_stopwords)

In [6]:
 # lemmas for words that represent noun and adjective as they tolk about the topics more
 # compare to other parts of speech 
nlp = spacy.load("en_core_web_md", disable=['parser', 'ner'])

def lemmatization(texts,allowed_postags=['NOUN', 'ADJ']): 
       output = []
       for sent in texts:
             doc = nlp(sent) 
             output.append([token.lemma_ for token in doc if token.pos_ in allowed_postags ])
       return output

In [7]:
# making a list of words
text_list=df_sampled['Text'].tolist()
print(text_list[1])
tokenized_reviews = lemmatization(text_list)
print(tokenized_reviews[1])

first didnt taste like anything time added enough taste something disgusted flavor like truly found ways describe flavorbr sucking brown saccharinebr chewing stale aspirinbr someone dropped cigarette butt waterbr sweaty socks wrung glassbr someone left used motor bottom glassbr drink great idea terrible rendering teenager wanted tempted science experiment call foot wondering would stop growth said feet dont want stop growing taller wonder sprinkle directly feet would work back youbr keep trying figure people like finally figured message water spelled vodka
['time', 'enough', 'taste', 'disgusted', 'flavor', 'way', 'flavorbr', 'brown', 'saccharinebr', 'stale', 'aspirinbr', 'cigarette', 'butt', 'sweaty', 'sock', 'motor', 'bottom', 'great', 'idea', 'terrible', 'rendering', 'teenager', 'science', 'experiment', 'call', 'foot', 'growth', 'foot', 'tall', 'wonder', 'foot', 'youbr', 'figure', 'people', 'message', 'water', 'vodka']


In [8]:
# creating a document term matrix
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]
print(doc_term_matrix[1])

[(32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 3), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1)]


In [9]:
# Creating the object for LDA model using gensim library
LDA = gensim.models.ldamodel.LdaModel

# Building LDA model
lda_model = LDA(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=5,
    random_state=100,
    chunksize=100,
    passes=20,
    iterations=100
)

In [10]:

# will print 5 topics from 100 reviews containing 90 to 99 words for each review
lda_model.print_topics()

[(0,
  '0.017*"cookie" + 0.014*"good" + 0.013*"flavor" + 0.012*"product" + 0.011*"sugar" + 0.011*"taste" + 0.009*"time" + 0.007*"food" + 0.007*"ingredient" + 0.007*"baby"'),
 (1,
  '0.023*"food" + 0.007*"stomach" + 0.006*"market" + 0.006*"hair" + 0.005*"many" + 0.005*"star" + 0.005*"curry" + 0.005*"mushroom" + 0.005*"diet" + 0.004*"product"'),
 (2,
  '0.020*"product" + 0.018*"coffee" + 0.013*"food" + 0.011*"good" + 0.010*"flavor" + 0.008*"dog" + 0.007*"pod" + 0.007*"water" + 0.007*"many" + 0.006*"great"'),
 (3,
  '0.015*"product" + 0.012*"taste" + 0.012*"drink" + 0.010*"sweet" + 0.009*"shipping" + 0.008*"bar" + 0.008*"good" + 0.007*"energy" + 0.007*"cherry" + 0.007*"whole"'),
 (4,
  '0.043*"coffee" + 0.021*"flavor" + 0.015*"good" + 0.009*"chip" + 0.008*"product" + 0.008*"brand" + 0.008*"taste" + 0.008*"chicken" + 0.008*"well" + 0.007*"food"')]

In [11]:
# Visualizing the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, doc_term_matrix, dictionary)
vis

  default_term_info = default_term_info.sort_values(


  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
