In [17]:
import pandas as pd
import numpy as np
from string import punctuation
import re

In [138]:
# Load the raw product reviews from the CSV file in the working directory.
df = pd.read_csv('amazon_baby.csv')

In [139]:
# Preview the first rows to sanity-check the load and see the available columns.
df.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


In [140]:
# data cleaning: a missing review becomes an empty string so text functions can run on it
df = df.assign(review=df['review'].fillna(''))
# Pre-compiled character class matching any single ASCII punctuation mark.
# The notebook imports the constant directly (`from string import punctuation`),
# so it must be referenced by that name; the `string` module itself is not in scope.
regex = re.compile('[%s]' % re.escape(punctuation))


def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character deleted.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The same text without punctuation; letters, digits and whitespace
        are left exactly as they were.
    """
    return regex.sub('', text)
# Keep the raw text and store the punctuation-free version in its own column.
df['review_clean'] = df['review'].map(remove_punctuation)
# Drop the 3-star reviews and renumber the remaining rows from 0.
df = df.loc[df['rating'] != 3].reset_index(drop=True)

In [141]:
# label: +1 when the rating is above 3 stars, -1 otherwise
df['sentiment'] = (df['rating'] > 3).map({True: 1, False: -1})

In [142]:
# Predefined train/test split: each JSON file is read into a frame whose
# column 0 holds the row numbers belonging to that split.
train_index = pd.read_json('module-2-assignment-train-idx.json')
test_index = pd.read_json('module-2-assignment-test-idx.json')
# `DataFrame.ix` was removed in pandas 1.0. `df` was re-indexed to 0..n-1 after
# filtering, so label and position coincide and positional `.iloc` selects the
# same rows the old `.ix` lookup did.
train_data = df.iloc[train_index[0].values].reset_index(drop=True)
test_data = df.iloc[test_index[0].values].reset_index(drop=True)

In [143]:
# Build the word count vector
from sklearn.feature_extraction.text import CountVectorizer
# This token pattern also accepts single-character words; scikit-learn's default
# pattern only keeps tokens of two or more word characters.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Learn the vocabulary from the training reviews only and count words in them.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Reuse the fitted vocabulary for the test reviews (no re-fitting).
test_matrix = vectorizer.transform(test_data['review_clean'])

In [144]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings, trained on the full
# word-count matrix against the +1 / -1 sentiment labels.
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [217]:
# Count the non-negative weights: boolean-mask indexing yields a 1-D array of the
# selected values, so its shape is (number_of_weights_>=_0,).
sentiment_model.coef_[sentiment_model.coef_ >= 0].shape

(85839,)

In [146]:
# Three consecutive test reviews (row positions 10, 11, 12) used as a worked example;
# they are renumbered 0..2 for convenient lookup.
sample_test_data = test_data.iloc[10:13].reset_index(drop=True)
print(sample_test_data)

                                                name  \
0                          Our Baby Girl Memory Book   
1  Wall Decor Removable Decal Sticker - Colorful ...   
2  New Style Trailing Cherry Blossom Tree Decal R...   

                                              review  rating  \
0  Absolutely love it and all of the Scripture in...       5   
1  Would not purchase again or recommend. The dec...       2   
2  Was so excited to get this product for my baby...       1   

                                        review_clean  sentiment  
0  Absolutely love it and all of the Scripture in...          1  
1  Would not purchase again or recommend The deca...         -1  
2  Was so excited to get this product for my baby...         -1  


In [147]:
# Full text of the first sample review. `.ix` was removed in pandas 1.0; the sample
# frame was re-indexed from 0, so label 0 addresses its first row.
sample_test_data.loc[0, 'review']

'Absolutely love it and all of the Scripture in it.  I purchased the Baby Boy version for my grandson when he was born and my daughter-in-law was thrilled to receive the same book again.'

In [148]:
# Full text of the second sample review. `.ix` was removed in pandas 1.0; the sample
# frame was re-indexed from 0, so label 1 addresses its second row.
sample_test_data.loc[1, 'review']

'Would not purchase again or recommend. The decals were thick almost plastic like and were coming off the wall as I was applying them! The would NOT stick! Literally stayed stuck for about 5 minutes then started peeling off.'

In [149]:
# Turn the three sample reviews into word counts using the already-fitted vocabulary.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
# Raw linear scores from the model; a positive score leans towards sentiment +1,
# a negative score towards sentiment -1.
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[  5.59253515  -3.13104823 -10.4160051 ]


In [150]:
def calculate_prob(score):
    """Map linear score(s) to probabilities with the logistic (sigmoid) function.

    Computes ``1 / (1 + exp(-score))`` element-wise.

    Parameters
    ----------
    score : scalar or array-like
        One score or a collection of scores (NumPy array, pandas Series,
        list or tuple).

    Returns
    -------
    Same kind of container NumPy produces for the input (a NumPy float for a
    scalar, an array for array/list input, a Series for a Series), with every
    value in ``[0, 1]``.
    """
    # np.multiply negates lists/tuples element-wise; plain `-1 * some_list`
    # would instead produce an empty list and silently return no results.
    negated = np.multiply(score, -1)
    # For very negative scores exp() overflows to inf, which still yields the
    # correct limit 1 / (1 + inf) == 0.0, so the overflow warning is just noise.
    with np.errstate(over='ignore'):
        return 1 / (1 + np.exp(negated))

In [151]:
# Hand-computed probability of sentiment +1 for the three sample reviews.
calculate_prob(scores)

array([  9.96288257e-01,   4.18445594e-02,   2.99483882e-05])

In [152]:
# Library-computed class probabilities for the same samples, one row per review;
# the second column should agree with the hand-computed sigmoid values.
sentiment_model.predict_proba(sample_test_matrix)

array([[  3.71174265e-03,   9.96288257e-01],
       [  9.58155441e-01,   4.18445594e-02],
       [  9.99970052e-01,   2.99483882e-05]])

In [153]:
# NOTE(review): test_matrix was already built by the identical call right after the
# vectorizer was fitted; recomputing it here is redundant but harmless.
test_matrix = vectorizer.transform(test_data['review_clean'])
# Raw linear score for every review in the test set.
test_scores = sentiment_model.decision_function(test_matrix)
print(test_scores)

[  1.27279987  14.11120818   2.65253384 ...,  12.10525162  12.84538988
   3.93851331]


In [220]:
# Probability of sentiment +1 for every test review via the hand-written sigmoid.
positive_probs = calculate_prob(test_scores)

In [221]:
# Probability of the positive class (second column) for every test review.
positive_probs = sentiment_model.predict_proba(test_matrix)[:, 1]
# argpartition moves the 20 largest probabilities into the last 20 slots
# (in no particular order) and returns their positions in the array.
highest_20 = np.argpartition(positive_probs, -20)[-20:]
# These are row positions, so select positionally. `.ix` was removed in pandas 1.0.
test_data.iloc[highest_20]

Unnamed: 0,name,review,rating,review_clean,sentiment
33060,Summer Infant Wide View Digital Color Video Mo...,I love this baby monitor. I can compare this ...,5,I love this baby monitor I can compare this o...,1
26838,"Baby Jogger City Mini GT Double Stroller, Shad...","We are well pleased with this stroller, and I ...",4,We are well pleased with this stroller and I w...,1
30076,Ikea 36 Pcs Kalas Kids Plastic BPA Free Flatwa...,For the price this set is unbelievable- and tr...,5,For the price this set is unbelievable and tru...,1
4140,"Britax Decathlon Convertible Car Seat, Tiffany",I researched a few different seats to put in o...,4,I researched a few different seats to put in o...,1
11923,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1
9555,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1
17558,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1
26830,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,Amazing Love Love Love it All 5 STARS all the...,1
32782,Mamas &amp; Papas 2014 Urbo2 Stroller - Black,After much research I purchased an Urbo2. It's...,4,After much research I purchased an Urbo2 Its e...,1
30634,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1


In [222]:
# Probability of the negative class (first column) for every test review.
negative_probs = sentiment_model.predict_proba(test_matrix)[:, 0]
# Positions of the 20 reviews with the highest negative-class probability,
# i.e. the 20 "most negative" reviews (returned in no particular order).
lowest_20 = np.argpartition(negative_probs, -20)[-20:]
# These are row positions, so select positionally. `.ix` was removed in pandas 1.0.
test_data.iloc[lowest_20]

Unnamed: 0,name,review,rating,review_clean,sentiment
15062,"Thirsties Hemp Inserts 2 Pack, Small 6-18 Lbs",My Experience: Babykicks Inserts failure vs RA...,5,My Experience Babykicks Inserts failure vs RAV...,1
5831,"Regalo My Cot Portable Bed, Royal Blue",If I could give this product zero stars I woul...,1,If I could give this product zero stars I woul...,-1
28120,VTech Communications Safe &amp; Sound Digital ...,"First, the distance on these are no more than ...",1,First the distance on these are no more than 7...,-1
205,Safety 1st Deluxe 4-in-1 Bath Station,This item is junk. I originally chose it beca...,1,This item is junk I originally chose it becau...,-1
27231,NUK Cook-n-Blend Baby Food Maker,It thought this would be great. I did a lot of...,1,It thought this would be great I did a lot of ...,-1
7310,Chicco Cortina KeyFit 30 Travel System in Adve...,My wife and I have used this system in two car...,1,My wife and I have used this system in two car...,-1
13751,"Peg-Perego Tatamia High Chair, White Latte",I can see why there are so many good reviews o...,2,I can see why there are so many good reviews o...,-1
31226,Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...,I read so many reviews saying the Belkin WiFi ...,2,I read so many reviews saying the Belkin WiFi ...,-1
10814,Ellaroo Mei Tai Baby Carrier - Hershey,This is basically an overpriced piece of fabri...,1,This is basically an overpriced piece of fabri...,-1
1942,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1


In [174]:
# Accuracy: share of test reviews whose predicted label equals the true label
predictions = sentiment_model.predict(test_matrix)
(test_data['sentiment'] == predictions).sum() / len(predictions)

0.9324154067674586

In [175]:
# Fewer features: a hand-picked vocabulary of 20 words for the reduced model
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [176]:
# Restrict counting to the fixed list above, so both matrices have 20 columns.
# NOTE(review): this vectorizer uses the default token pattern, unlike the full
# vectorizer; every word in the list has 2+ characters, so it makes no difference here.
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

In [177]:
# Same default logistic regression, trained only on the 20 word-subset counts.
simple_model = LogisticRegression()
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [178]:
# Pair each word of the reduced vocabulary with the weight the simple model learned
# for it; the weight array is flattened to one dimension to line up with the word list.
simple_model_coef_table = pd.DataFrame(dict(
    word=significant_words,
    coefficient=simple_model.coef_.flatten(),
))

In [182]:
# Rank the words from the largest weight down to the smallest.
simple_model_coef_table.sort_values(by='coefficient', ascending=False)

Unnamed: 0,coefficient,word
6,1.673074,loves
5,1.509812,perfect
0,1.36369,love
2,1.192538,easy
1,0.944,great
4,0.520186,little
7,0.50376,well
8,0.190909,able
3,0.085513,old
9,0.058855,car


In [183]:
# Shape of the strictly-positive subset; its first entry is the number of words
# with a positive weight.
simple_model_coef_table[simple_model_coef_table.coefficient > 0].shape

(10, 2)

In [184]:
# List the words whose weight in the simple model is strictly positive.
simple_model_coef_table[simple_model_coef_table.coefficient > 0]

Unnamed: 0,coefficient,word
0,1.36369,love
1,0.944,great
2,1.192538,easy
3,0.085513,old
4,0.520186,little
5,1.509812,perfect
6,1.673074,loves
7,0.50376,well
8,0.190909,able
9,0.058855,car


In [187]:
# Inspect the raw weight array of the full model (2-D with a single row).
sentiment_model.coef_

array([[ -1.23619965e+00,   2.11462508e-04,   2.58815576e-02, ...,
          1.13651116e-02,   3.21881618e-03,  -7.22786600e-05]])

In [207]:
# Compare models
# Training-set accuracy of the full bag-of-words model.
sentiment_model.score(train_matrix, train_data['sentiment'])

0.96763506625891949

In [208]:
# Manual cross-check of the training accuracy: matching predictions / number of rows.
(sentiment_model.predict(train_matrix) == train_data['sentiment']).sum() / len(train_data)

0.96763506625891949

In [209]:
# Training-set accuracy of the 20-word model.
simple_model.score(train_matrix_word_subset, train_data['sentiment'])

0.8668225700065959

In [212]:
# Test-set accuracy of the full bag-of-words model.
sentiment_model.score(test_matrix, test_data['sentiment'])

0.9324154067674586

In [211]:
# Test-set accuracy of the 20-word model.
simple_model.score(test_matrix_word_subset, test_data['sentiment'])

0.86936045116390692

In [223]:
# Majority class classifier: accuracy obtained by always predicting sentiment +1,
# i.e. the fraction of test reviews that are positive.
len(test_data[test_data['sentiment'] == 1]) / len(test_data)

0.8427825773938085