In [1]:
from __future__ import division
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import sys
import helpers


def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)

# get the data
products = pd.read_csv("amazon_baby.csv")
print list(products)
print type(products)


['name', 'review', 'rating']
<class 'pandas.core.frame.DataFrame'>


In [2]:
# clean the data
products = products.fillna({'review':''})
products['review_clean'] = products['review'].apply(remove_punctuation)
print list(products)
# ignore all rating 3s since they tend to be neutral
products = products[products['rating'] != 3]
print type(products)


['name', 'review', 'rating', 'review_clean']
<class 'pandas.core.frame.DataFrame'>


In [3]:
# extract sentiment positive: rating >=4, negative: rating <= 2
products['sentiment'] = products['rating'].apply(lambda rating: + 1 if rating > 3 else -1)
print list(products)

['name', 'review', 'rating', 'review_clean', 'sentiment']


In [4]:
# split into test data and training data randomly
# to get the same results at the test use json indexes
with open('module-2-assignment-test-idx.json') as data_file:
    test_idx = json.load(data_file)
test_data = products.iloc[test_idx].copy()
print len(test_data)

33336


In [5]:
with open('module-2-assignment-train-idx.json') as data_file:
    train_idx = json.load(data_file)
train_data = products.iloc[train_idx].copy()
print len(train_data)


133416


In [6]:
# compute word count in each review: bag of words
# use sparse matrix to store the collection of word count vectors
# because some words occur only in some reviews

# 1. Learn a vocabulary of all words in all reviews in the training data. Each word is a column
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b') #single word pattern
print vectorizer
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
print vectorizer.get_feature_names()


CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\b\\w+\\b', tokenizer=None,
        vocabulary=None)


In [7]:
# 2. Convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])


In [8]:
print train_matrix
print test_matrix


  (0, 87458)	1
  (0, 52346)	1
  (0, 60973)	1
  (0, 35380)	1
  (0, 67820)	1
  (0, 119315)	1
  (0, 75845)	1
  (0, 59309)	1
  (0, 52830)	1
  (0, 119288)	1
  (0, 69878)	2
  (0, 72811)	1
  (0, 14624)	1
  (0, 119389)	1
  (0, 80500)	1
  (0, 63567)	1
  (0, 54276)	1
  (0, 34453)	1
  (0, 72510)	2
  (0, 116798)	1
  (0, 10505)	3
  (0, 37328)	1
  (0, 21721)	1
  (0, 57486)	3
  (1, 85937)	1
  :	:
  (133415, 45698)	1
  (133415, 119439)	3
  (133415, 117337)	1
  (133415, 37640)	2
  (133415, 10440)	1
  (133415, 54987)	1
  (133415, 62056)	2
  (133415, 45067)	2
  (133415, 51051)	3
  (133415, 83729)	2
  (133415, 7280)	7
  (133415, 57196)	4
  (133415, 107483)	2
  (133415, 44646)	4
  (133415, 63165)	1
  (133415, 108946)	5
  (133415, 96190)	1
  (133415, 106249)	10
  (133415, 115480)	3
  (133415, 87458)	1
  (133415, 69878)	3
  (133415, 54276)	9
  (133415, 72510)	1
  (133415, 10505)	5
  (133415, 57486)	1
  (0, 5842)	1
  (0, 7280)	1
  (0, 9530)	1
  (0, 10440)	1
  (0, 10803)	1
  (0, 12630)	1
  (0, 13597)	2
  (0, 1

In [9]:
# Train a sentiment classifier with logistic regression
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression() #call to get an instance of the linearRegression class
print "training the model..."
sentiment_model = logistic_regression.fit(train_matrix, train_data["sentiment"])
print "finished training the model!"


training the model...
finished training the model!


In [10]:
# There should be over 100,000 coefficients in this sentiment_model.
# Recall from the lecture that positive weights w_j correspond to weights that cause positive sentiment,
# while negative weights correspond to negative sentiment.
# Calculate the number of positive (>= 0, which is actually nonnegative) coefficients.
# Quiz question: How many weights are >= 0?

sentiment_model_nonnegative_weights = logistic_regression.coef_[logistic_regression.coef_ >= 0]
sentiment_model_negative_weights = logistic_regression.coef_[logistic_regression.coef_ < 0]

print "weights > = 0 ---> ", len(sentiment_model_nonnegative_weights)
print "weights < 0   ---> ", len(sentiment_model_negative_weights)


weights > = 0 --->  85938
weights < 0   --->  35774


In [11]:
# Making predictions on the test data
sample_test_data = test_data[10:13]
print sample_test_data
print len(sample_test_data)


                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         review_clean  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  
3


In [12]:
# digging deeper into test data

type(sample_test_data)
list(sample_test_data)

print "maybe positive\n", sample_test_data['review'].iloc[0]
print "maybe negative\n", sample_test_data['review'].iloc[1]
print "also maybe negative\n", sample_test_data['review'].iloc[2]


maybe positive
Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.
maybe negative
Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.
also maybe negative
Was so excited to get this product for my baby girls bedroom!  When I got it the back is NOT STICKY at all!  Every time I walked into the bedroom I was picking up pieces off of the floor!  Very very frustrating!  Ended up having to super glue it to the wall...very disappointing.  I wouldn't waste the time or money on it.


In [13]:
print "predicting sentiment..."
# predictions for the sample test data
sample_test_matrix = vectorizer.transform(sample_test_data["review_clean"])
sample_scores = sentiment_model.decision_function(sample_test_matrix)
print sample_scores

sample_sentiments = sentiment_model.predict(sample_test_matrix)
print sample_sentiments


predicting sentiment...
[  5.59843439  -3.15360099 -10.42482888]
[ 1 -1 -1]


In [14]:
# Using the scores calculated previously, write code to calculate the probability 
# that a sentiment is positive using the above formula. 
# For each row, the probabilities should be a number in the range [0, 1].

def get_probability(score):
    return 1/(1+np.exp(-score))

print get_probability(sample_scores[0])
print get_probability(sample_scores[1])
print "%.6f" % get_probability(sample_scores[2])

0.996310008801
0.0409496234075
0.000030


In [15]:
# Now examine the whole test data set
# Using the sentiment_model, find the 20 reviews in the entire test_data with the highest probability 
# of being classified as a positive review. 
# We refer to these as the "most positive reviews."

scores = sentiment_model.decision_function(test_matrix)
probabilities = np.vectorize(get_probability)

# add a new column in the test_data with the calculated probabilities
test_data.loc[:,'probabilities'] = probabilities(scores)
print test_data['probabilities']

9         0.779299
10        0.999999
16        0.933782
20        0.999978
28        0.979965
36        0.999954
37        0.998730
41        0.801605
43        0.998449
56        0.997332
59        0.996310
71        0.040950
91        0.000030
112       0.995612
115       0.992937
116       0.976605
120       0.992583
123       0.939207
129       0.039157
135       0.998936
140       0.998639
146       0.999055
149       1.000000
157       0.999969
158       0.999976
160       0.979026
164       0.975281
171       0.952372
177       1.000000
180       0.999620
            ...   
183365    0.999047
183370    1.000000
183377    0.877573
183388    0.997951
183393    0.811225
183397    0.996710
183404    0.897002
183405    1.000000
183406    0.999929
183414    0.479149
183415    0.981972
183418    0.868000
183434    0.999901
183436    0.999974
183447    0.999999
183456    0.998470
183459    0.929161
183460    1.000000
183461    0.281355
183465    1.000000
183468    0.999984
183473    0.

In [16]:
# sort test data according to probability
# Quiz Question: Which of the following products are represented in the 20 most positive reviews?

test_data_sorted = test_data.sort_values('probabilities', ascending=False)
test_data_sorted[-20:]

Unnamed: 0,name,review,rating,review_clean,sentiment,probabilities
83234,"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs",My Experience: Babykicks Inserts failure vs RA...,5,My Experience Babykicks Inserts failure vs RAV...,1,1.656314e-09
31741,"Regalo My Cot Portable Bed, Royal Blue",If I could give this product zero stars I woul...,1,If I could give this product zero stars I woul...,-1,1.637796e-09
1116,Safety 1st Deluxe 4-in-1 Bath Station,This item is junk. I originally chose it beca...,1,This item is junk I originally chose it becau...,-1,1.083568e-09
154878,VTech Communications Safe &amp; Sound Digital ...,"First, the distance on these are no more than ...",1,First the distance on these are no more than 7...,-1,9.092509e-10
149987,NUK Cook-n-Blend Baby Food Maker,It thought this would be great. I did a lot of...,1,It thought this would be great I did a lot of ...,-1,7.424565e-10
75994,"Peg-Perego Tatamia High Chair, White Latte",I can see why there are so many good reviews o...,2,I can see why there are so many good reviews o...,-1,7.216031e-10
40079,Chicco Cortina KeyFit 30 Travel System in Adve...,My wife and I have used this system in two car...,1,My wife and I have used this system in two car...,-1,6.570205e-10
172090,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...,I read so many reviews saying the Belkin WiFi ...,2,I read so many reviews saying the Belkin WiFi ...,-1,6.523675e-10
59546,Ellaroo Mei Tai Baby Carrier - Hershey,This is basically an overpriced piece of fabri...,1,This is basically an overpriced piece of fabri...,-1,4.64636e-10
9915,Cosco Alpha Omega Elite Convertible Car Seat,I bought this car seat after both seeing the ...,1,I bought this car seat after both seeing the ...,-1,4.132717e-10


In [17]:
# Quiz Question: Which of the following products are represented in the 20 most negative reviews?
test_data_sorted[-20:]

Unnamed: 0,name,review,rating,review_clean,sentiment,probabilities
83234,"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs",My Experience: Babykicks Inserts failure vs RA...,5,My Experience Babykicks Inserts failure vs RAV...,1,1.656314e-09
31741,"Regalo My Cot Portable Bed, Royal Blue",If I could give this product zero stars I woul...,1,If I could give this product zero stars I woul...,-1,1.637796e-09
1116,Safety 1st Deluxe 4-in-1 Bath Station,This item is junk. I originally chose it beca...,1,This item is junk I originally chose it becau...,-1,1.083568e-09
154878,VTech Communications Safe &amp; Sound Digital ...,"First, the distance on these are no more than ...",1,First the distance on these are no more than 7...,-1,9.092509e-10
149987,NUK Cook-n-Blend Baby Food Maker,It thought this would be great. I did a lot of...,1,It thought this would be great I did a lot of ...,-1,7.424565e-10
75994,"Peg-Perego Tatamia High Chair, White Latte",I can see why there are so many good reviews o...,2,I can see why there are so many good reviews o...,-1,7.216031e-10
40079,Chicco Cortina KeyFit 30 Travel System in Adve...,My wife and I have used this system in two car...,1,My wife and I have used this system in two car...,-1,6.570205e-10
172090,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...,I read so many reviews saying the Belkin WiFi ...,2,I read so many reviews saying the Belkin WiFi ...,-1,6.523675e-10
59546,Ellaroo Mei Tai Baby Carrier - Hershey,This is basically an overpriced piece of fabri...,1,This is basically an overpriced piece of fabri...,-1,4.64636e-10
9915,Cosco Alpha Omega Elite Convertible Car Seat,I bought this car seat after both seeing the ...,1,I bought this car seat after both seeing the ...,-1,4.132717e-10


In [18]:
#We will now evaluate the accuracy of the trained classifier. Recall that the accuracy is given by

#accuracy=# correctly classified examples/# total examples

#This can be computed as follows:
#•Step 1: Use the sentiment_model to compute class predictions.
#•Step 2: Count the number of data points when the predicted class labels match the ground truth labels.
#•Step 3: Divide the total number of correct predictions by the total number of data points in the dataset.

#Quiz Question: What is the accuracy of the sentiment_model on the test_data? Round your answer to 2 decimal places (e.g. 0.76).
true_positives = len(test_data[(test_data['sentiment'] == 1) & (test_data['probabilities'] > 0.5)])
print true_positives
true_negatives = len(test_data[(test_data['sentiment'] == -1) & (test_data['probabilities'] <= 0.5)])
print true_negatives
print len(test_data)

accuracy = (true_positives + true_negatives) / len(test_data)
print "accuracy: %.2f" % accuracy
#Quiz Question: Does a higher accuracy value on the training_data always imply that the classifier is better?



27289
3789
33336
accuracy: 0.93


In [19]:
#Learn another classifier with fewer words
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

print "training the model..."
logistic_regression_simple = LogisticRegression() #call to get an instance of the linearRegression class
simple_model = logistic_regression_simple.fit(train_matrix_word_subset, train_data["sentiment"])
print "finished training the model!"

training the model...
finished training the model!


In [20]:
#Let us inspect the weights (coefficients) of the simple_model. First, build a table to store (word, coefficient) pairs.
#If you are using SFrame with scikit-learn, you can combine words with coefficients by running

simple_model_coef_table = pd.DataFrame({'coef': simple_model.coef_.flatten(), 'words':np.array(significant_words)})
print simple_model_coef_table
print len(simple_model_coef_table)
print len(significant_words)

#Sort the data frame by the coefficient value in descending order.

simple_model_coef_table_sorted = simple_model_coef_table.sort_values('coef', ascending=False)
print simple_model_coef_table_sorted

        coef         words
0   1.363690          love
1   0.944000         great
2   1.192538          easy
3   0.085513           old
4   0.520186        little
5   1.509812       perfect
6   1.673074         loves
7   0.503760          well
8   0.190909          able
9   0.058855           car
10 -1.651576         broke
11 -0.209563          less
12 -0.511380          even
13 -2.033699         waste
14 -2.348298  disappointed
15 -0.621169          work
16 -0.320556       product
17 -0.898031         money
18 -0.362167         would
19 -2.109331        return
20
20
        coef         words
6   1.673074         loves
5   1.509812       perfect
0   1.363690          love
2   1.192538          easy
1   0.944000         great
4   0.520186        little
7   0.503760          well
8   0.190909          able
3   0.085513           old
9   0.058855           car
11 -0.209563          less
16 -0.320556       product
18 -0.362167         would
12 -0.511380          even
15 -0.621169          

In [21]:
#Quiz Question: Consider the coefficients of simple_model. 
#How many of the 20 coefficients (corresponding to the 20 significant_words) are positive for the simple_model?
print len(simple_model_coef_table_sorted[simple_model_coef_table_sorted['coef']>0])

10


In [31]:
#Quiz Question: Are the positive words in the simple_model also positive words in the sentiment_model?

sentiment_model_coef_table = pd.DataFrame({'coef':sentiment_model.coef_.flatten(), 'words':vectorizer.get_feature_names()})
for word in significant_words:
    print word, sentiment_model_coef_table[sentiment_model_coef_table['words'] == word]['coef']


love 63567    1.576803
Name: coef, dtype: float64
great 48789    1.233252
Name: coef, dtype: float64
easy 37640    1.357549
Name: coef, dtype: float64
old 74106    0.055724
Name: coef, dtype: float64
little 62602    0.640014
Name: coef, dtype: float64
perfect 78982    1.860938
Name: coef, dtype: float64
loves 63646    1.518743
Name: coef, dtype: float64
well 117906    0.539629
Name: coef, dtype: float64
able 7386    0.393505
Name: coef, dtype: float64
car 22122    0.123291
Name: coef, dtype: float64
broke 20190   -1.391323
Name: coef, dtype: float64
less 61494   -0.27588
Name: coef, dtype: float64
even 39961   -0.464911
Name: coef, dtype: float64
waste 117082   -1.994456
Name: coef, dtype: float64
disappointed 34453   -2.194071
Name: coef, dtype: float64
work 119932   -0.462305
Name: coef, dtype: float64
product 83729   -0.190043
Name: coef, dtype: float64
money 68076   -0.78457
Name: coef, dtype: float64
would 120336   -0.286596
Name: coef, dtype: float64
return 89499   -1.658082
Name

Unnamed: 0,coef,words
0,-1.238194e+00,0
1,1.740091e-04,00
2,2.631484e-02,000
3,5.577676e-03,0001
4,3.726424e-05,001
5,9.294384e-07,001cm
6,2.503400e-03,002
7,2.698817e-01,01
8,2.552901e-01,010
9,-1.777706e-03,010204
