In [1]:
import pandas as pd
import numpy as np
import spacy
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
data = pd.read_csv("./Data/Reviews.csv")

In [3]:
data.shape

(568454, 10)

In [4]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
Id                        568454 non-null int64
ProductId                 568454 non-null object
UserId                    568454 non-null object
ProfileName               568438 non-null object
HelpfulnessNumerator      568454 non-null int64
HelpfulnessDenominator    568454 non-null int64
Score                     568454 non-null int64
Time                      568454 non-null int64
Summary                   568428 non-null object
Text                      568454 non-null object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
data.describe()

Unnamed: 0,Id,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time
count,568454.0,568454.0,568454.0,568454.0,568454.0
mean,284227.5,1.743817,2.22881,4.183199,1296257000.0
std,164098.679298,7.636513,8.28974,1.310436,48043310.0
min,1.0,0.0,0.0,1.0,939340800.0
25%,142114.25,0.0,0.0,4.0,1271290000.0
50%,284227.5,0.0,1.0,5.0,1311120000.0
75%,426340.75,2.0,2.0,5.0,1332720000.0
max,568454.0,866.0,923.0,5.0,1351210000.0


In [7]:
data.Text.head(10)

0    I have bought several of the Vitality canned d...
1    Product arrived labeled as Jumbo Salted Peanut...
2    This is a confection that has been around a fe...
3    If you are looking for the secret ingredient i...
4    Great taffy at a great price.  There was a wid...
5    I got a wild hair for taffy and ordered this f...
6    This saltwater taffy had great flavors and was...
7    This taffy is so good.  It is very soft and ch...
8    Right now I'm mostly just sprouting this so my...
9    This is a very healthy dog food. Good for thei...
Name: Text, dtype: object

In [22]:
X = pd.DataFrame(data.groupby(['ProductId']).count())

In [23]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74258 entries, 0006641040 to B009WVB40S
Data columns (total 9 columns):
Id                        74258 non-null int64
UserId                    74258 non-null int64
ProfileName               74258 non-null int64
HelpfulnessNumerator      74258 non-null int64
HelpfulnessDenominator    74258 non-null int64
Score                     74258 non-null int64
Time                      74258 non-null int64
Summary                   74258 non-null int64
Text                      74258 non-null int64
dtypes: int64(9)
memory usage: 5.7+ MB


In [24]:
Y = X.iloc[1:10,2:3]

In [8]:

vectorizer = CountVectorizer(decode_error='ignore')
X = vectorizer.fit_transform(data.Text.head(20))

In [9]:
start_time = time.time()
lda = LatentDirichletAllocation(n_components = 10)
topics = lda.fit_transform(X)
print ("Complete in {} seconds!".format(time.time() - start_time))



Complete in 0.342641115189 seconds!


In [10]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #{}:".format(topic_idx))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

print("\nTopics in LDA model:")

print_top_words(lda, vectorizer.get_feature_names(), 20)


Topics in LDA model:
Topic #0:
my taffy was this and only pieces hair enjoyable melon complaint weeks with red it etc there delightful five particular
Topic #1:
sprouting too grass right rye mostly rotate around so wheatgrass it eat can with love just this they and cats
Topic #2:
the and this is it of very taffy was you in are if recommend great with would chewy to witch
Topic #3:
these and with very were have good am be my definitely others flavor love purchase shared twizzler fresh came all
Topic #4:
to and was them so at they eating for movies love like are fresh price good not fast can my
Topic #5:
the and of it food my br to in this sauce product we you now for have were if on
Topic #6:
the and is company in it twists this 45 made licorice firms by oldest world error to don satisfied taste
Topic #7:
for good dog puppies food digestion small their feeding every very at amount healthy this required her eats also my
Topic #8:
my yummy six while son around pleasure guilty pounds twiz

In [12]:
vectorizer2 = CountVectorizer(decode_error='ignore',stop_words='english')
Y = vectorizer2.fit_transform(data.Text.head(20))

In [13]:
start_time = time.time()
lda2 = LatentDirichletAllocation(n_components = 10)
topics = lda2.fit_transform(Y)
print ("Complete in {} seconds!".format(time.time() - start_time))

Complete in 0.0669820308685 seconds!




In [14]:
print("\nTopics in LDA model:")

print_top_words(lda, vectorizer.get_feature_names(), 20)


Topics in LDA model:
Topic #0:
my taffy was this and only pieces hair enjoyable melon complaint weeks with red it etc there delightful five particular
Topic #1:
sprouting too grass right rye mostly rotate around so wheatgrass it eat can with love just this they and cats
Topic #2:
the and this is it of very taffy was you in are if recommend great with would chewy to witch
Topic #3:
these and with very were have good am be my definitely others flavor love purchase shared twizzler fresh came all
Topic #4:
to and was them so at they eating for movies love like are fresh price good not fast can my
Topic #5:
the and of it food my br to in this sauce product we you now for have were if on
Topic #6:
the and is company in it twists this 45 made licorice firms by oldest world error to don satisfied taste
Topic #7:
for good dog puppies food digestion small their feeding every very at amount healthy this required her eats also my
Topic #8:
my yummy six while son around pleasure guilty pounds twiz