In [225]:
import graphlab
products = graphlab.SFrame('amazon_baby.gl/')
selected_words = ['awesome', 'great', 'fantastic', 'amazing', 'love', 'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate']
graphlab.canvas.set_target('ipynb')

In [226]:
products['word_count'] = graphlab.text_analytics.count_words(products['review'])


In [227]:
for word in selected_words:    
    products[word] = products['word_count'].apply(lambda x: x[word] if word in x else 0)

In [228]:
for w in selected_words:
    print "word %s is used %d times" %(w,products[w].sum())


word awesome is used 2090 times
word great is used 45206 times
word fantastic is used 932 times
word amazing is used 1363 times
word love is used 42065 times
word horrible is used 734 times
word bad is used 3724 times
word terrible is used 748 times
word awful is used 383 times
word wow is used 144 times
word hate is used 1220 times


In [230]:
train_data,test_data = products.random_split(.8, seed=0)
graphlab.canvas.set_target('ipynb')
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'] >=4


In [240]:

sentiment_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=['word_count'],
                                                     validation_set=test_data)
selected_words_model = graphlab.logistic_classifier.create(train_data,
                                                     target='sentiment',
                                                     features=selected_words,
                                                     validation_set=test_data)

PROGRESS: Logistic regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 133448
PROGRESS: Number of classes           : 2
PROGRESS: Number of feature columns   : 1
PROGRESS: Number of unpacked features : 219217
PROGRESS: Number of coefficients    : 219218
PROGRESS: Starting L-BFGS
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+-----------+--------------+-------------------+---------------------+
PROGRESS: | Iteration | Passes   | Step size | Elapsed Time | Training-accuracy | Validation-accuracy |
PROGRESS: +-----------+----------+-----------+--------------+-------------------+---------------------+
PROGRESS: | 1         | 5        | 0.000002  | 1.414977     | 0.841481          | 0.839989            |
PROGRESS: | 2         | 9        | 3.000000  | 2.856995     | 0.947425          | 0.894877            |
PROGRESS: | 3         | 10       | 3.000000  | 3.406383     | 0.92

In [242]:
selected_words_model['coefficients'].sort('value',ascending = True)

name,index,class,value
terrible,,1,-2.09049998487
horrible,,1,-1.99651800559
awful,,1,-1.76469955631
hate,,1,-1.40916406276
bad,,1,-0.985827369929
wow,,1,-0.0541450123333
great,,1,0.883937894898
fantastic,,1,0.891303090304
amazing,,1,0.892802422508
awesome,,1,1.05800888878


In [233]:
selected_words_model['coefficients'].sort('value',ascending = True)

name,index,class,value
terrible,,1,-2.09049998487
horrible,,1,-1.99651800559
awful,,1,-1.76469955631
hate,,1,-1.40916406276
bad,,1,-0.985827369929
wow,,1,-0.0541450123333
great,,1,0.883937894898
fantastic,,1,0.891303090304
amazing,,1,0.892802422508
awesome,,1,1.05800888878


In [234]:
selected_words_model.evaluate(test_data, metric='roc_curve')

{'roc_curve': Columns:
 	threshold	float
 	fpr	float
 	tpr	float
 	p	int
 	n	int
 
 Rows: 1001
 
 Data:
 +------------------+-------------------+----------------+-------+------+
 |    threshold     |        fpr        |      tpr       |   p   |  n   |
 +------------------+-------------------+----------------+-------+------+
 |       0.0        | 0.000188928773852 |      0.0       | 28020 | 5293 |
 | 0.0010000000475  |   0.999811071226  |      1.0       | 28020 | 5293 |
 | 0.00200000009499 |   0.999622142452  |      1.0       | 28020 | 5293 |
 | 0.00300000002608 |   0.999622142452  |      1.0       | 28020 | 5293 |
 | 0.00400000018999 |   0.999433213678  | 0.999964311206 | 28020 | 5293 |
 | 0.00499999988824 |   0.999433213678  | 0.999964311206 | 28020 | 5293 |
 | 0.00600000005215 |   0.999244284905  | 0.999964311206 | 28020 | 5293 |
 | 0.00700000021607 |   0.999244284905  | 0.999964311206 | 28020 | 5293 |
 | 0.00800000037998 |   0.999244284905  | 0.999964311206 | 28020 | 5293 |
 | 0.008

In [235]:
selected_words_model.show(view='Evaluation')

In [236]:
baby_tren_reviews=products[products['name'] == 'Baby Trend Diaper Champ']

In [237]:
baby_tren_reviews['predicted_sentiment_word_count'] = sentiment_model.predict(baby_tren_reviews, output_type='probability')
baby_tren_reviews['predicted_sentiment_selected_word'] = selected_words_model.predict(baby_tren_reviews, output_type='probability')

In [238]:

baby_tren_reviews.sort('review',ascending = False)

name,review,rating,word_count,awesome,great,fantastic
Baby Trend Diaper Champ,you can use any ol' bag with this. go to the ...,5.0,"{'and': 2L, 'bags.': 1L, 'they': 1L, 'is': 1L, ...",0,0.0,0.0
Baby Trend Diaper Champ,this works really well. I found it easier than ...,5.0,"{'and': 3L, 'dispose': 1L, 'hand.': 1L, 'drop': ...",0,0.0,0.0
Baby Trend Diaper Champ,my son is now 3 months old and when he was born ...,5.0,"{'and': 3L, 'this': 2L, 'because': 2L, 'love': ...",0,0.0,0.0
Baby Trend Diaper Champ,love it because it uses the bags from the gro ...,4.0,"{'because': 1L, 'love': 1L, 'it': 2L, 'have': ...",0,0.0,0.0
Baby Trend Diaper Champ,i love this diaper champ..not only does it ...,5.0,"{'and': 3L, 'this': 1L, 'liked': 1L, 'love': 1L, ...",0,0.0,0.0
Baby Trend Diaper Champ,You really can not appreciate this device ...,5.0,"{'ignored': 1L, 'well.who': 1L, ...",0,0.0,0.0
Baby Trend Diaper Champ,You cannot imagine how much money you'll save ...,5.0,"{'and': 2L, 'garbage': 1L, 'money': 1L, 'it': ...",0,0.0,0.0
Baby Trend Diaper Champ,Wow! This is fabulous. It was a toss-up between ...,5.0,"{'and': 4L, '""genie"".': 1L, 'since': 1L, ...",0,0.0,0.0
Baby Trend Diaper Champ,Worth every single penny and then some! I ...,5.0,"{'and': 2L, 'upstairs': 1L, 'doing': 1L, ...",0,0.0,0.0
Baby Trend Diaper Champ,Worst diaper pale ever!! I've had mine for 2 y ...,1.0,"{'and': 4L, '""chump""': 1L, 'stink': 1L, 'drop': ...",0,0.0,0.0

amazing,love,horrible,bad,terrible,awful,wow,hate,sentiment,predicted_sentiment_word_ count ...
0.0,0.0,0,0.0,0,0,0,0,1,0.997557996362
0.0,0.0,0,0.0,0,0,0,0,1,0.998693566706
0.0,1.0,0,0.0,0,0,0,0,1,0.999988228677
0.0,1.0,0,0.0,0,0,0,0,1,0.989429526331
0.0,1.0,0,0.0,0,0,0,0,1,0.999498668331
0.0,0.0,0,0.0,0,0,0,0,1,0.0568630344034
0.0,0.0,0,0.0,0,0,0,0,1,0.984047021751
0.0,0.0,0,0.0,0,0,0,0,1,0.999999692111
0.0,0.0,0,0.0,0,0,0,0,1,0.999882427276
0.0,0.0,0,0.0,0,0,0,0,0,0.000329802188674

predicted_sentiment_selec ted_word ...
0.796940851291
0.796940851291
0.940876393428
0.940876393428
0.940876393428
0.796940851291
0.796940851291
0.796940851291
0.796940851291
0.796940851291


In [239]:
baby_tren_reviews['review'][0]

"Ok - newsflash.  Diapers are just smelly.  We've had this pail for 2.5 years now.  It was our first and primary one.  There were no major smell problems until after one year, when our son started eating solids.  Also, we change the bag twice weekly as 3 days is about the max for smell-containment.  Around 20-22 months we started shopping for a container that would be less smelly and didn't find one as good.  (We have a cheaper one upstairs which broke immediately and always stunk!)  We finally just put the Diaper Champ in the attic a few months ago and use the cheap one with the flip-up lid - mainly since the cheapo fits inside the cabinet and we didn't notice a big difference in smell-control.  (The most helpful action is to tie the dirty diapers inside a small plastic bag before putting them in the pail.)A couple of our friends have this pail and were pleased until the children started eating solid food and things got stinkier - but that's pretty much the consensus according to many