#Predicting sentiment from product reviews

#Fire up GraphLab Create

In [38]:
import graphlab
import os

# The GraphLab product key is a credential and must not be committed with the
# notebook. Read it from the GRAPHLAB_PRODUCT_KEY environment variable and fail
# with a clear message when it is missing.
_product_key = os.environ.get('GRAPHLAB_PRODUCT_KEY')
if not _product_key:
    raise RuntimeError('Set the GRAPHLAB_PRODUCT_KEY environment variable before running this notebook.')
graphlab.product_key.set_product_key(_product_key)

#Read some product review data

Loading reviews for a set of baby products. 

In [39]:
# Load the review data stored under 'amazon_baby.gl/' (path relative to the
# notebook's working directory) into an SFrame.
products = graphlab.SFrame('amazon_baby.gl/')

#Let's explore this data together

Data includes the product name, the review text and the rating of the review. 

#Build the word count vector for each review

In [40]:
# 3-star reviews are treated as neutral and left out of the analysis.
keep_mask = products['rating'] != 3
products = products[keep_mask]
# Label the remaining reviews: 1 for ratings of 4 or more, 0 otherwise.
products['sentiment'] = products['rating'] >= 4

In [41]:
# Add a 'word_count' column: for each review, a dict mapping each token to the
# number of times it occurs in that review's text.
products['word_count'] = graphlab.text_analytics.count_words(products['review'])

In [42]:
# Hand-picked sentiment words; each one becomes its own count column and a
# feature of the selected-words model. The order is kept as originally written.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful',
    'wow',
    'hate',
]

In [43]:
# Add one column per selected word holding how many times that word occurs in
# each review (0 when the word is absent from the review's word_count dict).
for word in selected_words:
    # `word=word` freezes the current word inside the lambda, so the column stays
    # correct even if the apply is evaluated after the loop variable has changed.
    # dict.get replaces the `in x.keys()` test, which builds a list per row in
    # Python 2.
    products[word] = products['word_count'].apply(
        lambda counts, word=word: counts.get(word, 0))

In [44]:
# Preview the first rows, now including the per-word count columns.
products.head()

name,review,rating,sentiment,word_count,awesome
Planetwise Wipe Pouch,it came early and was not disappointed. i love ...,5.0,1,"{'and': 3, 'love': 1, 'it': 2, 'highly': 1, ...",0
Annas Dream Full Quilt with 2 Shams ...,Very soft and comfortable and warmer than it ...,5.0,1,"{'and': 2, 'quilt': 1, 'it': 1, 'comfortable': ...",0
Stop Pacifier Sucking without tears with ...,This is a product well worth the purchase. I ...,5.0,1,"{'ingenious': 1, 'and': 3, 'love': 2, ...",0
Stop Pacifier Sucking without tears with ...,All of my kids have cried non-stop when I tried to ...,5.0,1,"{'and': 2, 'parents!!': 1, 'all': 2, 'puppet.': ...",0
Stop Pacifier Sucking without tears with ...,"When the Binky Fairy came to our house, we didn't ...",5.0,1,"{'and': 2, 'this': 2, 'her': 1, 'help': 2, ...",0
A Tale of Baby's Days with Peter Rabbit ...,"Lovely book, it's bound tightly so you may no ...",4.0,1,"{'shop': 1, 'noble': 1, 'is': 1, 'it': 1, 'as': ...",0
"Baby Tracker&reg; - Daily Childcare Journal, ...",Perfect for new parents. We were able to keep ...,5.0,1,"{'and': 2, 'all': 1, 'right': 1, 'when': 1, ...",0
"Baby Tracker&reg; - Daily Childcare Journal, ...",A friend of mine pinned this product on Pinte ...,5.0,1,"{'and': 1, 'help': 1, 'give': 1, 'is': 1, ' ...",0
"Baby Tracker&reg; - Daily Childcare Journal, ...",This has been an easy way for my nanny to record ...,4.0,1,"{'journal.': 1, 'nanny': 1, 'standarad': 1, ...",0
"Baby Tracker&reg; - Daily Childcare Journal, ...",I love this journal and our nanny uses it ...,4.0,1,"{'all': 1, 'forget': 1, 'just': 1, 'food': 1, ...",0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate
0,0,0,1,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,2,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0
0,0,0,2,0,0,0,0,0,0


In [45]:
# Print each selected word followed by its total count over all reviews.
for word in selected_words:
    print('%s %s' % (word, products[word].sum()))

awesome 2002
great 42420
fantastic 873
amazing 1305
love 40277
horrible 659
bad 3197
terrible 673
awful 345
wow 131
hate 1057


In [46]:
# Total occurrences of every selected word over the whole review set.
word_totals = dict((word, products[word].sum()) for word in selected_words)
# max()/min() return the first word reaching the extreme total, which matches a
# left-to-right scan using strict comparisons.
max_word = max(selected_words, key=word_totals.get)
min_word = min(selected_words, key=word_totals.get)
max_count, min_count = word_totals[max_word], word_totals[min_word]
print('most used: ', max_word, max_count)
print('least used: ', min_word, min_count)

('most used: ', 'great', 42420)
('least used: ', 'wow', 131)


##Let's train the sentiment classifier

In [47]:
# Randomly split the reviews into a training part (fraction .8) and a test part
# (the remainder); seed=0 keeps the split the same between runs.
train_data,test_data = products.random_split(.8, seed=0)

In [48]:
# Logistic-regression classifier predicting 'sentiment' from the full
# word-count dictionary of each review.
# NOTE(review): the test set is also passed as the validation set — confirm this
# is only used for progress reporting and not for any model selection.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)

#Evaluate the sentiment model

In [49]:
# Evaluate the full word-count model on the held-out test set; the result
# includes accuracy, AUC, the confusion matrix, F1, log loss, precision, recall
# and the ROC curve.
sentiment_model.evaluate(test_data)

{'accuracy': 0.916256305548883,
 'auc': 0.9446492867438502,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        1        |  1328 |
 |      0       |        0        |  4000 |
 |      1       |        1        | 26515 |
 |      1       |        0        |  1461 |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.9500349343413533,
 'log_loss': 0.26106698432422204,
 'precision': 0.9523039902309378,
 'recall': 0.9477766657134686,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+----------------+----------------+-------+------+
 | threshold |      fpr       |      tpr       |   p   |  n   |
 +-----------+----------------+----------------+-------+------+
 |    0.0    |      1.0       | 

In [50]:
# Open the model's 'Evaluation' view in GraphLab Canvas (served in a web browser).
sentiment_model.show(view='Evaluation')

Canvas is accessible via web browser at the URL: http://localhost:59282/index.html
Opening Canvas in default web browser.


In [51]:
# Second classifier: same target and data split, but restricted to the
# per-word count columns listed in selected_words.
selected_words_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=selected_words,
    validation_set=test_data,
)

In [52]:
# Show the learned coefficients (one row per feature, plus the intercept).
selected_words_model['coefficients']

name,index,class,value,stderr
(intercept),,1,1.36728315229,0.00861805467825
awesome,,1,1.05800888878,0.110865296265
great,,1,0.883937894898,0.0217379527921
fantastic,,1,0.891303090304,0.154532343591
amazing,,1,0.892802422508,0.127989503231
love,,1,1.39989834302,0.0287147460124
horrible,,1,-1.99651800559,0.0973584169028
bad,,1,-0.985827369929,0.0433603009142
terrible,,1,-2.09049998487,0.0967241912229
awful,,1,-1.76469955631,0.134679803365


In [54]:
# Same coefficient table, ordered from the largest to the smallest value.
selected_words_model['coefficients'].sort('value', ascending=False)

name,index,class,value,stderr
love,,1,1.39989834302,0.0287147460124
(intercept),,1,1.36728315229,0.00861805467825
awesome,,1,1.05800888878,0.110865296265
amazing,,1,0.892802422508,0.127989503231
fantastic,,1,0.891303090304,0.154532343591
great,,1,0.883937894898,0.0217379527921
wow,,1,-0.0541450123332,0.275616449416
bad,,1,-0.985827369929,0.0433603009142
hate,,1,-1.40916406276,0.0771983993506
awful,,1,-1.76469955631,0.134679803365


In [55]:
# Find the selected words with the largest and the smallest learned coefficient.
# The running extremes start at -inf / +inf so the scan is correct for any
# coefficient range; fixed starting values of -1 and 1 would report an empty
# word whenever every coefficient lies below -1 or above 1.
most_positive = float('-inf')
most_positive_word = ''
most_negative = float('inf')
most_negative_word = ''
for row in selected_words_model['coefficients']:
    # The intercept row is not a word feature, so it is excluded.
    if row['name'] == '(intercept)':
        continue
    if row['value'] > most_positive:
        most_positive = row['value']
        most_positive_word = row['name']
    if row['value'] < most_negative:
        most_negative = row['value']
        most_negative_word = row['name']
print('most positive: ', most_positive_word, most_positive)
print('most negative: ', most_negative_word, most_negative)

('most positive: ', 'love', 1.3998983430174745)
('most negative: ', 'terrible', -2.090499984872604)


In [56]:
# Evaluate the selected-words model on the same held-out test set, for
# comparison with the full word-count model.
selected_words_model.evaluate(test_data)

{'accuracy': 0.8431419649291376,
 'auc': 0.6648096413721418,
 'confusion_matrix': Columns:
 	target_label	int
 	predicted_label	int
 	count	int
 
 Rows: 4
 
 Data:
 +--------------+-----------------+-------+
 | target_label | predicted_label | count |
 +--------------+-----------------+-------+
 |      0       |        0        |  234  |
 |      0       |        1        |  5094 |
 |      1       |        1        | 27846 |
 |      1       |        0        |  130  |
 +--------------+-----------------+-------+
 [4 rows x 3 columns],
 'f1_score': 0.914242563530107,
 'log_loss': 0.40547471103673566,
 'precision': 0.8453551912568306,
 'recall': 0.9953531598513011,
 'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 100001
 
 Data:
 +-----------+-----+-----+-------+------+
 | threshold | fpr | tpr |   p   |  n   |
 +-----------+-----+-----+-------+------+
 |    0.0    | 1.0 | 1.0 | 27976 | 5328 |
 |   1e-05   | 1.0 | 1.0 | 27976 | 5328 |
 |   2e-05   |

In [57]:
# Open the selected-words model's 'Evaluation' view in GraphLab Canvas.
selected_words_model.show(view='Evaluation')

Canvas is accessible via web browser at the URL: http://localhost:59282/index.html
Opening Canvas in default web browser.


In [58]:
import numpy as np
from collections import Counter

# Label distribution of the training set. The majority-class baseline always
# predicts the most frequent training label, so derive that label from the
# counts instead of hard-coding it.
train_label_counts = Counter(train_data['sentiment'])
majority_class = train_label_counts.most_common(1)[0][0]
print(train_label_counts)
# Note: this line reports the majority class label itself; the baseline's
# accuracy is printed by the next statement.
print("The accuracy majority class classifier on this task is: " + str(majority_class))
# Baseline accuracy = share of test reviews whose label equals the majority class.
baseline_accuracy = float(len(test_data[test_data['sentiment'] == majority_class])) / float(len(test_data))
print("Accuracy Score of the baseline model: %8.5f" % baseline_accuracy)

Counter({1: 112283, 0: 21165})
The accuracy majority class classifier on this task is: 1
Accuracy Score of the baseline model:  0.84002


#Applying the learned model to understand sentiment for the Baby Trend Diaper Champ

In [59]:
# Keep only the reviews whose product name is exactly "Baby Trend Diaper Champ".
diaper_champ_reviews = products[products['name'] == "Baby Trend Diaper Champ"]

In [60]:
# Score every Diaper Champ review with the full word-count model and store the
# result as a probability (presumably of the positive class, label 1).
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(diaper_champ_reviews, output_type='probability')

In [61]:
# Preview the first rows together with the new predicted_sentiment column.
diaper_champ_reviews.head()

name,review,rating,sentiment,word_count,awesome
Baby Trend Diaper Champ,Ok - newsflash. Diapers are just smelly. We've ...,4.0,1,"{'son': 1, 'just': 2, 'less': 1, '-': 3, ...",0
Baby Trend Diaper Champ,"My husband and I selected the Diaper ""Champ"" ma ...",1.0,0,"{'material)': 1, 'bags,': 1, 'less': 1, 'when': 3, ...",0
Baby Trend Diaper Champ,Excellent diaper disposal unit. I used it in ...,5.0,1,"{'control': 1, 'am': 1, 'it': 1, 'used': 1, ' ...",0
Baby Trend Diaper Champ,We love our diaper champ. It is very easy to use ...,5.0,1,"{'and': 3, 'over.': 1, 'all': 1, 'bags.': 1, ...",0
Baby Trend Diaper Champ,Two girlfriends and two family members put me ...,5.0,1,"{'just': 1, '-': 3, 'both': 1, 'results': 1, ...",0
Baby Trend Diaper Champ,I waited to review this until I saw how it ...,4.0,1,"{'lysol': 1, 'all': 1, 'mom.': 1, 'busy': 1, ...",0
Baby Trend Diaper Champ,I have had a diaper genie for almost 4 years since ...,1.0,0,"{'all': 1, 'bags.': 1, 'just': 1, ""don't"": 2, ...",0
Baby Trend Diaper Champ,I originally put this item on my baby registry ...,5.0,1,"{'lysol': 1, 'all': 2, 'bags.': 1, 'feedback': ...",0
Baby Trend Diaper Champ,I am so glad I got the Diaper Champ instead of ...,5.0,1,"{'and': 2, 'all': 1, 'just': 1, 'is': 2, ' ...",0
Baby Trend Diaper Champ,We had 2 diaper Genie's both given to us as a ...,4.0,1,"{'hand.': 1, 'both': 1, '(required': 1, 'befo ...",0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,predicted_sentiment
0,0,0,0,0,0,0,0,0,0,0.958443580893
0,0,0,0,0,0,0,0,0,0,2.47155884995e-12
0,0,0,0,0,0,0,0,0,0,0.999994864775
0,0,0,1,0,0,0,0,0,0,0.998779072633
0,0,0,0,1,0,0,0,0,0,0.999999604504
0,0,0,0,0,1,0,0,0,0,0.999952233179
0,0,0,0,0,0,0,0,0,0,0.972560724165
0,0,0,0,0,0,0,0,0,0,0.999999642488
0,0,0,0,0,0,0,0,0,0,0.97415225478
0,0,0,2,0,0,0,0,0,0,0.99267406035


##Sort the reviews based on the predicted sentiment and explore

In [62]:
# Order the reviews from the highest to the lowest predicted_sentiment.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)

In [63]:
# Show the reviews the full word-count model scores highest.
diaper_champ_reviews.head()

name,review,rating,sentiment,word_count,awesome
Baby Trend Diaper Champ,Baby Luke can turn a clean diaper to a dirty ...,5.0,1,"{'all': 1, 'less': 1, ""friend's"": 1, '(which': ...",0
Baby Trend Diaper Champ,I LOOOVE this diaper pail! Its the easies ...,5.0,1,"{'just': 1, 'over': 1, 'rweek': 1, 'sooo': 1, ...",0
Baby Trend Diaper Champ,We researched all of the different types of di ...,4.0,1,"{'all': 2, 'just': 4, ""don't"": 2, 'one,': 1, ...",0
Baby Trend Diaper Champ,My baby is now 8 months and the can has been ...,5.0,1,"{""don't"": 1, 'able': 2, 'over': 1, 'soon': 1, ...",0
Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper ...",5.0,1,"{'just': 3, 'money': 1, 'still': 3, 'fine': 1, ...",0
Baby Trend Diaper Champ,Diaper Champ or Diaper Genie? That was my ...,5.0,1,"{'son': 2, 'all': 1, 'bags.': 1, 'son,': 1, ...",0
Baby Trend Diaper Champ,Wow! This is fabulous. It was a toss-up between ...,5.0,1,"{'and': 4, 'this': 3, 'stink': 1, 'garbage' ...",0
Baby Trend Diaper Champ,I originally put this item on my baby registry ...,5.0,1,"{'lysol': 1, 'all': 2, 'bags.': 1, 'feedback': ...",0
Baby Trend Diaper Champ,Two girlfriends and two family members put me ...,5.0,1,"{'just': 1, '-': 3, 'both': 1, 'results': 1, ...",0
Baby Trend Diaper Champ,I am one of those super- critical shoppers who ...,5.0,1,"{'all': 1, 'humid': 1, 'just': 1, 'less': 1, ...",0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,predicted_sentiment
0,0,0,0,0,0,0,0,0,0,0.999999937267
0,0,0,1,0,0,0,0,0,0,0.999999917406
0,0,0,0,0,1,0,0,0,0,0.999999899509
2,0,0,0,0,1,0,0,0,0,0.999999836182
0,0,0,2,0,0,0,0,0,0,0.999999824745
0,0,0,0,0,0,0,0,0,0,0.999999759315
0,0,0,0,0,0,0,0,0,0,0.999999692111
0,0,0,0,0,0,0,0,0,0,0.999999642488
0,0,0,0,1,0,0,0,0,0,0.999999604504
0,0,0,1,0,0,0,0,0,0,0.999999486804


##Most positive reviews for the Baby Trend Diaper Champ (selected-words model)

In [69]:
# Repeat the Diaper Champ analysis, this time scoring with selected_words_model
# (the classifier trained only on the selected-word count columns).
diaper_champ_reviews = products[products['name'] == "Baby Trend Diaper Champ"]
diaper_champ_reviews['predicted_sentiment'] = selected_words_model.predict(diaper_champ_reviews, output_type='probability')
# Highest predicted value first.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)
diaper_champ_reviews.head()

name,review,rating,sentiment,word_count,awesome
Baby Trend Diaper Champ,I LOVE LOVE LOVE this product! It is SO much ...,4.0,1,"{'rating': 1, 'contacted': 1, 'over': ...",0
Baby Trend Diaper Champ,I received my Diaper Champ at my baby shower ...,5.0,1,"{'bags.': 1, ""don't"": 1, 'son.': 1, 'of,': 1, ...",0
Baby Trend Diaper Champ,"Love it, love it, love it! This lives up to ...",5.0,1,"{'instead': 1, 'all': 1, 'already': 1, 'love': 3, ...",0
Baby Trend Diaper Champ,Works great - no smells. LOVE that it uses reg ...,5.0,1,"{'and': 2, 'bags.': 1, 'garbage': 1, 'wastef ...",0
Baby Trend Diaper Champ,I love this diaper pale and wouldn't dream of ...,5.0,1,"{'and': 3, 'love': 1, 'use.': 1, 'is': 2, ' ...",0
Baby Trend Diaper Champ,I've worked with kids more than half my life. ...,5.0,1,"{'and': 4, 'genies': 1, 'all': 1, 'because': 1, ...",0
Baby Trend Diaper Champ,I love this diaper pail. It keeps the diapers ...,4.0,1,"{'and': 1, 'old': 1, 'extra': 1, 'is': 1, ...",0
Baby Trend Diaper Champ,"This is absolutely, by far, the best diaper ...",5.0,1,"{'just': 3, 'money': 1, 'still': 3, 'fine': 1, ...",0
Baby Trend Diaper Champ,Love the Diaper Champ. I had planned to get the ...,4.0,1,"{'reviews,': 1, 'infant': 1, 'bags.': 1, 'just' ...",0
Baby Trend Diaper Champ,We had 2 diaper Genie's both given to us as a ...,4.0,1,"{'hand.': 1, 'both': 1, '(required': 1, 'befo ...",0

great,fantastic,amazing,love,horrible,bad,terrible,awful,wow,hate,predicted_sentiment
1,0,0,3,0,0,0,0,0,0,0.998423414594
0,0,0,3,0,0,0,0,0,0,0.996192539732
0,0,0,3,0,0,0,0,0,0,0.996192539732
2,0,0,1,0,0,0,0,0,0,0.989387539605
2,0,0,1,0,0,0,0,0,0,0.989387539605
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527
0,0,0,2,0,0,0,0,0,0,0.984739056527


In [70]:
# Start again from the unscored Diaper Champ reviews and order them by label and
# then rating, highest first. One multi-column sort is used on purpose: calling
# sort('rating') and then sort('sentiment') only keeps the rating order inside
# each sentiment group if the sort is stable, which should not be relied on.
diaper_champ_reviews = products[products['name'] == "Baby Trend Diaper Champ"]
diaper_champ_reviews = diaper_champ_reviews.sort(['sentiment', 'rating'], ascending=False)

In [71]:
# Selected-words model probabilities for the first ten rows of the sorted frame.
selected_words_model.predict(diaper_champ_reviews[0:10], output_type='probability')

dtype: float
Rows: 10
[0.7969408512906704, 0.9047558080934621, 0.7969408512906704, 0.9408763934283928, 0.3476840527363245, 0.7969408512906704, 0.7969408512906704, 0.7969408512906704, 0.7969408512906704, 0.7969408512906704]

In [72]:
# Full word-count model probabilities for the same ten rows, for comparison.
sentiment_model.predict(diaper_champ_reviews[0:10], output_type='probability')

dtype: float
Rows: 10
[0.9999606821195993, 0.9996043579576536, 0.9999948647750444, 0.9987790726331038, 0.9999996045038924, 0.9986935667060233, 0.9981852686334992, 0.9999996424880316, 0.9741522547795112, 0.9996161231956484]

In [73]:
# Inspect the word-count dictionary of the first row to see which words the
# review contains.
diaper_champ_reviews[0]['word_count']

{'"funnel"': 1,
 '(see': 1,
 '(where': 1,
 '-': 2,
 '180-degree': 1,
 'a': 6,
 'ability': 1,
 'all': 1,
 'and': 3,
 'another': 1,
 'any': 1,
 'arc,': 1,
 'are': 1,
 'as': 2,
 'bag': 3,
 'bags': 1,
 'be': 3,
 'between': 1,
 'bit': 1,
 'blue': 1,
 'bottom': 1,
 'braun': 1,
 'but': 1,
 'can': 1,
 'cartridges,': 1,
 'cleaning': 1,
 'clever': 1,
 'complaints': 1,
 'contents': 1,
 'dark': 1,
 'dealing': 1,
 'deposit': 1,
 'design': 1,
 'diaper': 1,
 'dogs': 1,
 'easy': 1,
 'etc.)': 1,
 'expendables.': 1,
 'fair': 1,
 'fairly': 1,
 'feature': 1,
 'fell': 1,
 'fingers': 1,
 'for': 2,
 'fresh': 1,
 'from': 1,
 'gonna': 1,
 'goodies': 1,
 'gravity': 1,
 'handle': 1,
 'hazardous': 1,
 'hermetic': 1,
 'holder)': 1,
 'i': 2,
 'if': 1,
 'immediately': 1,
 'in': 2,
 'inside.': 1,
 'installation': 1,
 'into': 2,
 'is': 5,
 'it': 2,
 'keeping': 1,
 'knock': 1,
 'latch': 1,
 'lead': 1,
 'lid': 1,
 'locked': 1,
 'non-existent': 1,
 'normal': 1,
 'not': 2,
 'odor': 1,
 'of': 8,
 'off': 1,
 'on': 2,
 'one'