In [2]:
!ls

Topic_Modeling_Sentiment_Predict_Amazon_reviews.ipynb
neg_tweets.txt
pos_tweets.txt
reviews_Electronics_5.json


In [3]:
reset -sf

In [4]:
%pylab inline
import pandas as pd
import numpy as np
import json

Populating the interactive namespace from numpy and matplotlib


### Download Data

In [5]:
data = ''
with open('reviews_Electronics_5.json', 'r') as f:
    for line in f.readlines():
        data += line
        
df = pd.read_json(data, lines=True)
len(df)

1689188

In [6]:
df.head(3)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,528881469,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",AO94DHGC771SJ,amazdnu,Gotta have GPS!,1370131200
1,528881469,"[12, 15]",1,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",AMO214LNFCEI4,Amazon Customer,Very Disappointed,1290643200
2,528881469,"[43, 45]",3,"Well, what can I say. I've had this unit in m...","09 9, 2010",A3N7T0DY83Y4IG,C. A. Freeman,1st impression,1283990400


In [7]:
from collections import Counter
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import silhouette_score
from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition.online_lda import LatentDirichletAllocation

### Preprocess for Data

In [8]:
def re_tokenize_and_clean(df_review):
    '''
    1. making lowercase, lower()
    2. tokenize each review into words(tokens) 
       w/ regexp(regular expression) in order to leave only words, clean [, . '' "" () digits]
    3. stemming, clean word format such as the past or countinous forms
    
    Param: df_review (pandas.series)
    
    '''
    RT = RegexpTokenizer(r'\b[^\d\W]+\b')
    PS = PorterStemmer()
    input_review = df_review.apply(lambda x: str(x).lower())  \
                              .apply(lambda x: RT.tokenize(x))  \
                              .apply(lambda x: [PS.stem(i) for i in x]) \
                              .apply(lambda x: str(x).replace("', '", " ") \
                                                     .replace("['", " ")   \
                                                     .replace("']", " "))
    return input_review

In [9]:
#df = df[:200]

In [10]:
df['input_review'] = re_tokenize_and_clean(df['reviewText'])

In [11]:
df['input_review'][:2]

0     we got thi gp for my husband who is an otr ov...
1     i m a profession otr truck driver and i bough...
Name: input_review, dtype: object

### Vectorizer & Fitting

In [12]:
'''
Two vecterizer used as vertorizer, Count(Tf) and Tf-idf, in LDA classification model.
param: max_features - number of top max_features for vocabulary 
       stop_word    - cleaning words
'''
# Count Verterizer
tf_vec = CountVectorizer(max_features=1000, stop_words='english')
tf_fit = tf_vec.fit_transform(df['input_review'].values)
print(tf_fit.shape)

# Tf-idf Vectorizer
tfidf_vec = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_fit = tfidf_vec.fit_transform(df['input_review'].values)
print(tfidf_fit.shape)

(1689188, 1000)
(1689188, 1000)


> LDA with Count Vectorizer performed usually better as compare with Tfidf Vectorizer LDA. <br>
I think because it is a probabilistic graphical model.<br>
It means that probabilistic graphical model (LDA) is not require idf value, give weight for words based on frequency in other documents.<br>
Therefore, Count (Tf) vectorizer is a good for LDA. 

### LDA w/ Tf

In [13]:
'''
param: n_components    - number of groups for categorize
       max_iter        - maxium number of iterations
       learning_method - update method for components; batch(0.20 folds) & online(faster)
       learning_offset - parameter that downweights early iterations in learning
       random_state    - the random number generator, like serier number for generator
'''
lda = LatentDirichletAllocation(n_components=5,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=77).fit(tf_fit)

In [14]:
def top_words(model, tf_words, n_top_words=13):
    for ix, component in enumerate(model.components_):
        print('============================='*4)
        print("Components {0}:".format(ix+1))
        print(" | ".join([tf_words[i] for i in component.argsort()[:-n_top_words - 1:-1]]))
        print('============================='*4, '\n')

print("\nComponents in LDA_Tf model: \n")
tf_words = tf_vec.get_feature_names()
top_words(lda, tf_words)


Components in LDA_Tf model: 

Components 1:
thi | wa | work | cabl | tv | connect | use | devic | great | just | set | router | product

Components 2:
camera | thi | use | veri | wa | pictur | len | light | good | qualiti | like | video | great

Components 3:
sound | thi | speaker | use | good | headphon | qualiti | veri | music | like | wa | great | bluetooth

Components 4:
thi | use | drive | usb | wa | card | work | comput | window | time | ha | instal | just

Components 5:
thi | case | use | ipad | tablet | batteri | like | charg | veri | keyboard | wa | fit | screen



In [15]:
# check the balance of performance
lda_vec = lda.fit_transform(tf_fit)
Counter([np.argmax(i) for i in lda_vec])

Counter({0: 453432, 1: 238197, 2: 246294, 3: 334256, 4: 417009})

> It seems like documents (reviews) distribution is blanced, but I will try another LDA and NMF with Tfidf to compare with this.

### Visualization - LDA w/ Count (Tf) Vectorize

In [16]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [17]:
# Visualize result of classification
pyLDAvis.sklearn.prepare(lda, tf_fit, tf_vec)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


> The result looks like a good fitted LDA model for Amazon classification.<br>
However, I will try Tf-idf vectorizer for LDA model for just see the result as experiment. <br>
I didn't expected get a better result than LDA with Count(Tf) vertorizer.

### LDA w/ Tf-idf

In [18]:
lda_01 = LatentDirichletAllocation(n_components=5,
                                   max_iter=5,
                                   learning_method='online',
                                   learning_offset=50.,
                                   random_state=77).fit(tfidf_fit)

In [19]:
print("\nComponents in LDA_Tfidf model: \n")
tf_words_01 = tfidf_vec.get_feature_names()
top_words(lda_01, tf_words_01)


Components in LDA_Tfidf model: 

Components 1:
tv | watch | hdmi | roku | remot | movi | player | netflix | video | stream | samsung | hd | dvd

Components 2:
case | thi | ipad | cover | protect | screen | fit | kindl | tablet | protector | veri | like | use

Components 3:
thi | drive | use | wa | keyboard | work | card | usb | devic | comput | instal | veri | window

Components 4:
sound | thi | camera | batteri | use | speaker | wa | good | great | veri | headphon | qualiti | like

Components 5:
cabl | work | charg | thi | great | charger | product | price | good | wa | plug | phone | usb



In [20]:
# check the balance of performance
lda_vec_01 = lda_01.fit_transform(tfidf_fit)
Counter([np.argmax(i) for i in lda_vec_01])

Counter({0: 58252, 1: 329648, 2: 560401, 3: 530166, 4: 210721})

In [21]:
# Visualize result of classification
pyLDAvis.sklearn.prepare(lda_01, tfidf_fit, tfidf_vec)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


### Try NMF for Comparing Model as Test

In [22]:
nmf = NMF(init="nndsvd",
          n_components=5,
          max_iter=200,
          random_state=77).fit(tfidf_fit)

In [23]:
def top_words(model, tf_words, n_top_words=13):
    for ix, component in enumerate(model.components_):
        print('============================='*4)
        print("Components {0}:".format(ix+1))
        print(" | ".join([tf_words[i] for i in component.argsort()[:-n_top_words - 1:-1]]))
        print('============================='*4, '\n')

print("\nComponents in NMF model: \n")
tf_words = tfidf_vec.get_feature_names()
top_words(nmf, tf_words)


Components in NMF model: 

Components 1:
thi | work | wa | use | drive | product | just | time | great | devic | usb | comput | instal

Components 2:
camera | len | batteri | thi | pictur | canon | use | shoot | video | photo | shot | bag | light

Components 3:
sound | speaker | headphon | good | qualiti | ear | veri | bass | great | music | price | listen | volum

Components 4:
case | ipad | fit | protect | cover | thi | kindl | veri | nice | like | screen | look | tablet

Components 5:
cabl | hdmi | tv | work | qualiti | connect | thi | price | great | need | connector | good | buy



In [24]:
# check the balance of performance
nmf_vec = nmf.fit_transform(tfidf_fit)
Counter([np.argmax(i) for i in nmf_vec])

Counter({0: 786802, 1: 221655, 2: 276781, 3: 257591, 4: 146359})

>It seems like a good balanced NMF model for documents(reviews) distribution.

### Conclusion for Amazon Review Classification

>I applied two vectorizer, Counter(Tf) and Tf-idf, and two different model, LDA and NMF.<br>
* LDA - Count(Tf) vectorizer : Since LDA is probabilistic graphical model, it is a standard Topic modeling process.
  - The result is a good fitting model because documents(reviews) distribution is balenced.<br><br>
* LDA - Tfidf vectorizer: Just try to see the result (LDA is not require for idf (weighted) values for words.
  - The result looks not really balenced. <br><br>
* NMF - Tfidf vectorizer: NMF is require Tfidf values because this algorithm is based on 2 matrix multiplication, frequency and weight of words.

### Sentiment Analysis

In [25]:
df.columns

Index(['asin', 'helpful', 'overall', 'reviewText', 'reviewTime', 'reviewerID',
       'reviewerName', 'summary', 'unixReviewTime', 'input_review'],
      dtype='object')

In [26]:
'''
In order to train sentiment model, use text convertor function with Regexp Tokenizer.
param: s - a tweet (str)
'''
def convertor(s):
    RT = RegexpTokenizer(r'\b[^\d\W]+\b')
    return({w: True for w in (RT.tokenize(s.lower()))})

In [27]:
'''
For sentiment model training, use tweets.txt of positive and negative sets.
Make positive and negative lists with convertor function. 
'''
pos = []
with open("./pos_tweets.txt") as fp:
    for tweet in fp: 
        pos.append([convertor(tweet), 'pos'])
        
neg = []
with open("./neg_tweets.txt") as fn:
    for tweet in fn: 
        neg.append([convertor(tweet), 'neg'])

In [28]:
'''
80% for training dateset, contain pos & neg tweets.
20% for test dataset for validate model quality.
'''
train = pos[int(len(pos)*.2):] + neg[int(len(pos)*.2):]
test = pos[:int(len(pos)*.2)] + neg[:int(len(pos)*.2)]

In [29]:
from nltk.classify import NaiveBayesClassifier
'''
Use Navie Bayes classifier.
show_most_informatice_feartures() shows list of the most effective words for pos/neg from train data. 
'''
classifier = NaiveBayesClassifier.train(train)
classifier.show_most_informative_features()

Most Informative Features
                 awesome = True              pos : neg    =     29.8 : 1.0
                headache = True              neg : pos    =     21.5 : 1.0
                   thank = True              pos : neg    =     14.5 : 1.0
                      no = True              neg : pos    =     14.0 : 1.0
                       n = True              pos : neg    =     12.8 : 1.0
                    love = True              pos : neg    =     12.3 : 1.0
                      hi = True              pos : neg    =     11.8 : 1.0
               beautiful = True              pos : neg    =     11.1 : 1.0
                   loved = True              pos : neg    =     11.1 : 1.0
                   funny = True              pos : neg    =      9.4 : 1.0


In [30]:
# Test with df['reviewText'][0]
print(classifier.classify(convertor(df['reviewText'][0])))

pos


In [31]:
'''
Validate sentiment model.
Shows accuracy score for test set.
This accuracy actually based on the trained tweet, so that might not really accurate for amazon review.
In order to improve the model accuracy for amazon review, it needs suitable kinds of date set.
'''
from nltk.classify.util import accuracy
print(accuracy(classifier, test))

0.8414634146341463


In [32]:
df.head(1)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,input_review
0,528881469,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",AO94DHGC771SJ,amazdnu,Gotta have GPS!,1370131200,we got thi gp for my husband who is an otr ov...


In [33]:
len(df), len(df['asin'].unique())

(1689188, 63001)

In [34]:
df_sen = df[['asin', 'reviewText', 'reviewTime']]
df_sen.head(3)

Unnamed: 0,asin,reviewText,reviewTime
0,528881469,We got this GPS for my husband who is an (OTR)...,"06 2, 2013"
1,528881469,"I'm a professional OTR truck driver, and I bou...","11 25, 2010"
2,528881469,"Well, what can I say. I've had this unit in m...","09 9, 2010"


In [35]:
len(df_sen['asin'][0])

10

In [36]:
# Check the length of asin(product ID).
for i in range(len(df_sen['asin'])):
    assert len(df_sen['asin'][i]) == 10

In [37]:
from datetime import date
'''
To break year and week from timestamp.
'''
df_sen['yr'] = pd.to_datetime(df_sen['reviewTime']).apply(lambda x: x.isocalendar()[0])
df_sen['wk'] = pd.to_datetime(df_sen['reviewTime']).apply(lambda x: x.isocalendar()[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [38]:
df_sen.head(3)

Unnamed: 0,asin,reviewText,reviewTime,yr,wk
0,528881469,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",2013,22
1,528881469,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",2010,47
2,528881469,"Well, what can I say. I've had this unit in m...","09 9, 2010",2010,36


In [39]:
df_sen = (df_sen.sort_values(['asin', 'yr', 'wk'], ascending=[True, True, True])).reset_index()

In [40]:
def sentiment_nextWeek_review(asin, df):
    '''
    This is function for predict sentiment of next a week reviews.
    The function use the sentiment model, trained by positive and negative tweets, 84,15% accrate for test tweet.
    However, the df, which consist of Amazon reviews, might different style as tweets, 
    so the accracy is not really confident.
    
    param: asin - productID (str)
           df   - dataframe which is already sorted (pd.DataFrame) 
    '''
    pos, neg, cnt, review_rate = 0, 0, 0, 0                  # number of positive, negative and total reviews
    if asin in df['asin'].unique():
        for i, a in enumerate(df['asin']):
            if asin == a:
                cnt += 1
                '''
                Using tweet sentiment model
                * sen values will be pos or neg
                '''
                sen = classifier.classify(convertor(df['reviewText'][i]))
                if sen == 'pos':
                    pos += 1
                if sen == 'neg':
                    neg += 1
            else:
                if cnt > 0:
                    '''
                    yr_gap : last reviewed year - first reviewed year
                    total_gap_wk : (year = 52 weeks) -> convert into weeks
                    ave_num_review_per_week : number of reviews per a week
                    review_rate : probability for posible review (a week)
                                  ex) if the ave_num_review_per_week value is 3.5, 
                                      the number of expected reviews will be 3.5 next week for particular asin(product)
                    '''
                    yr_gap = df['yr'][i-1] - df['yr'][i-cnt]
                    total_gap_wk = (yr_gap-1)*52 + (52-df['wk'][i-cnt]) + df['wk'][i-1]
                    #print(total_gap_wk)
                    ave_num_review_per_week = cnt/total_gap_wk    # expected values
                    review_rate = ave_num_review_per_week/cnt     # In order to calculate expected number of pos & neg reviews
                    #print(cnt)
                    break
    else:
        print('The product, (asin #: {0}) cannot find in the list of review.'.format(asin))

    return asin, pos, neg, cnt, review_rate

In [41]:
asin, pos, neg, cnt, review_rate = sentiment_nextWeek_review('0972683275', df_sen)
print(' Predict for Review Qualities for {0} : Next a Week'.format(asin))
print('==============='*4)
print(' Positive Review, E(pos): {0:.4f}\n Negative Review, E(neg): {1:.4f}\n Total Review,  E(Total): {2:.4f}' \
     .format(pos*review_rate, neg*review_rate, cnt*review_rate))
print('==============='*4)

 Predict for Review Qualities for 0972683275 : Next a Week
 Positive Review, E(pos): 0.1958
 Negative Review, E(neg): 0.4639
 Total Review,  E(Total): 0.6596


* The product, (asin #: 0972683275) can get 0.6596 reviews, 0.4639 positive and 0.1958 negative reviews, next 1 week.

In [42]:
# Test for first 30 products
uniq_prod = df_sen['asin'].unique()
uniq_prod[:30]

array(['0528881469', '0594451647', '0594481813', '0972683275',
       '1400501466', '1400501520', '1400501776', '1400532620',
       '1400532655', '140053271X', '1400532736', '1400599997',
       '1400698987', '1400699169', '1615527613', '3744295508',
       '3930992868', '3936710058', '6301977173', '7214047977',
       '7507825604', '7799813393', '8862935293', '8862936826',
       '8918010656', '9043413585', '9573212900', '9573212919',
       '9575871979', '9625993428'], dtype=object)

In [43]:
for i in uniq_prod[:20]:
    asin, pos, neg, cnt, review_rate = sentiment_nextWeek_review(i, df_sen)
    print(' \nPredict for Review Qualities for {0} : Next a Week'.format(asin))
    print('==============='*4)
    print(' Positive Review, E(pos): {0:.4f}\n Negative Review, E(neg): {1:.4f}\n Total Review,  E(Total): {2:.4f}' \
         .format(pos*review_rate, neg*review_rate, cnt*review_rate))
    print('==============='*4)
    print('* The product, (asin #: {0}) can get {1:.4f} reviews, \n  {2:.4f} positive and {3:.4f} negative reviews, next 1 week.'\
         .format(i, cnt*review_rate, pos*review_rate, neg*review_rate), '\n')

 
Predict for Review Qualities for 0528881469 : Next a Week
 Positive Review, E(pos): 0.0070
 Negative Review, E(neg): 0.0282
 Total Review,  E(Total): 0.0352
* The product, (asin #: 0528881469) can get 0.0352 reviews, 
  0.0070 positive and 0.0282 negative reviews, next 1 week. 

 
Predict for Review Qualities for 0594451647 : Next a Week
 Positive Review, E(pos): 0.0370
 Negative Review, E(neg): 0.1481
 Total Review,  E(Total): 0.1852
* The product, (asin #: 0594451647) can get 0.1852 reviews, 
  0.0370 positive and 0.1481 negative reviews, next 1 week. 

 
Predict for Review Qualities for 0594481813 : Next a Week
 Positive Review, E(pos): 0.0200
 Negative Review, E(neg): 0.1400
 Total Review,  E(Total): 0.1600
* The product, (asin #: 0594481813) can get 0.1600 reviews, 
  0.0200 positive and 0.1400 negative reviews, next 1 week. 

 
Predict for Review Qualities for 0972683275 : Next a Week
 Positive Review, E(pos): 0.1958
 Negative Review, E(neg): 0.4639
 Total Review,  E(Total): 0.