# W1_Predicting Sentiment from Product Reviews

In [1]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv('amazon_baby.csv')
data = data.fillna(value={'review':''}) # replace all NaN elements in column ‘review’ with ''
data.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [3]:
# perform text cleaning
def remove_punctuation(text):
    table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(table)

data['cleaned_review'] = data['review'].apply(remove_punctuation)
data.head()

Unnamed: 0,name,review,rating,cleaned_review
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [4]:
# extract sentiments
data = data[data['rating'] != 3]
data['sentiment'] = data['rating'].apply(lambda rating : +1 if rating > 3 else -1) # 1 means positive; -1 means negative
data.head()

Unnamed: 0,name,review,rating,cleaned_review,sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...,1


In [5]:
# import train and test data
train_index = pd.read_json('module-2-assignment-train-idx.json')[0].values
test_index = pd.read_json('module-2-assignment-test-idx.json')[0].values
train = data.iloc[train_index]
test = data.iloc[test_index]

In [6]:
# build word count vector for each review
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train['cleaned_review'])
test_matrix = vectorizer.transform(test['cleaned_review'])

In [7]:
# train a sentiment classifier with logistic regression
sentiment_model = LogisticRegression().fit(X=train_matrix, y=train['sentiment'])

In [8]:
print('total:', len(sentiment_model.coef_[0]))
print('positive:', (sentiment_model.coef_>=0).sum())

total: 121712
positive: 86781


In [9]:
# make predictions with logistic regression
sample_test = test[10:13]
sample_test.iloc[0]['review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [10]:
sample_test.iloc[1]['review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [11]:
# predict score
sample_test_matrix = vectorizer.transform(sample_test['cleaned_review'])
scores = sentiment_model.decision_function(sample_test_matrix)
scores

array([  5.60167742,  -3.16956136, -10.42345957])

In [12]:
# predict label
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1], dtype=int64)

In [13]:
# predict probability
# lst method
1 / (1 + np.exp(-scores)) 

array([  9.96321912e-01,   4.03273878e-02,   2.97259754e-05])

In [14]:
# 2nd method
sentiment_model.predict_proba(sample_test_matrix)

array([[  3.67808778e-03,   9.96321912e-01],
       [  9.59672612e-01,   4.03273878e-02],
       [  9.99970274e-01,   2.97259754e-05]])

In [15]:
# find the most positive (and negative) review
test['positive_proba'] = sentiment_model.predict_proba(test_matrix)[:,1]
best20 = test.sort_values(by='positive_proba', ascending=False).head(20)
best20['name'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


P'Kolino Silly Soft Seating in Tias, Green                                              1
Graco Pack 'n Play Element Playard - Flint                                              1
Buttons Cloth Diaper Cover - One Size - 8 Color Options                                 1
Mamas &amp; Papas 2014 Urbo2 Stroller - Black                                           1
Fisher-Price Cradle 'N Swing,  My Little Snugabunny                                     1
Roan Rocco Classic Pram Stroller 2-in-1 with Bassinet and Seat Unit - Coffee            1
Infantino Wrap and Tie Baby Carrier, Black Blueberries                                  1
Britax Decathlon Convertible Car Seat, Tiffany                                          1
Baby Jogger City Mini GT Single Stroller, Shadow/Orange                                 1
Simple Wishes Hands-Free Breastpump Bra, Pink, XS-L                                     1
Evenflo 6 Pack Classic Glass Bottle, 4-Ounce                                            1
Baby Einst

In [16]:
worst20 = test.sort_values(by='positive_proba').head(20)
worst20['name'].value_counts()

VTech Communications Safe &amp; Sound Digital Audio Monitor with two Parent Units                         1
Motorola Digital Video Baby Monitor with Room Temperature Thermometer                                     1
Safety 1st Deluxe 4-in-1 Bath Station                                                                     1
Peg-Perego Tatamia High Chair, White Latte                                                                1
Regalo My Cot Portable Bed, Royal Blue                                                                    1
Adiri BPA Free Natural Nurser Ultimate Bottle Stage 1 White, Slow Flow (0-3 months)                       1
Safety 1st High-Def Digital Monitor                                                                       1
Philips AVENT Newborn Starter Set                                                                         1
Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs                                                             1
Belkin WeMo Wi-Fi Baby Monit

In [17]:
# compute accuracy of the classifier
predicted_label = sentiment_model.predict(test_matrix)
test['predicted_label'] = predicted_label
test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,name,review,rating,cleaned_review,sentiment,positive_proba,predicted_label
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.784441,1
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999,1
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,0.933196,1
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999979,1
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,0.980246,1


In [18]:
accurancy = sum(test['sentiment']==test['predicted_label']) / test.shape[0]
accurancy

0.93229541636669067

In [19]:
# learn another classifier with fewer words
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 'work', 'product', 'money', 'would', 'return']

vectorizer_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_subset = vectorizer_subset.fit_transform(train['cleaned_review'])
test_matrix_subset = vectorizer_subset.transform(test['cleaned_review'])

simple_sentiment_model = LogisticRegression().fit(X=train_matrix_subset, y=train['sentiment'])

coef_df = pd.DataFrame({'word':significant_words, 'coef':simple_sentiment_model.coef_[0]})
coef_df

Unnamed: 0,coef,word
0,1.36369,love
1,0.944,great
2,1.192538,easy
3,0.085513,old
4,0.520186,little
5,1.509812,perfect
6,1.673074,loves
7,0.50376,well
8,0.190909,able
9,0.058855,car


In [20]:
(simple_sentiment_model.coef_>0).sum()

10

In [21]:
coef_df_positive = coef_df[coef_df['coef']>0]
coef_df_positive

Unnamed: 0,coef,word
0,1.36369,love
1,0.944,great
2,1.192538,easy
3,0.085513,old
4,0.520186,little
5,1.509812,perfect
6,1.673074,loves
7,0.50376,well
8,0.190909,able
9,0.058855,car


In [22]:
word_matrix = vectorizer.transform(coef_df_positive['word'])
sentiment_model.predict(word_matrix)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [23]:
# compare models
# accurancy on training data
prediction_1 = sentiment_model.predict(train_matrix)
prediction_2 = simple_sentiment_model.predict(train_matrix_subset)
accurancy_1 = sum(train['sentiment']==prediction_1) / train.shape[0]
accurancy_2 = sum(train['sentiment']==prediction_2) / train.shape[0]
print(accurancy_1, accurancy_2)

0.968489536487 0.866822570007


In [24]:
# accurancy on test data
prediction_1 = sentiment_model.predict(test_matrix)
prediction_2 = simple_sentiment_model.predict(test_matrix_subset)
accurancy_1 = sum(test['sentiment']==prediction_1) / test.shape[0]
accurancy_2 = sum(test['sentiment']==prediction_2) / test.shape[0]
print(accurancy_1, accurancy_2)

0.932295416367 0.869360451164


In [25]:
# majority class prediction
positive_num = sum(test['sentiment']==1)
majority_accurancy = positive_num / test.shape[0]
majority_accurancy

0.84278257739380846