In [228]:
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
#http://scikit-learn.org/stable/modules/feature_extraction.html#the-bag-of-words-representation
from sklearn import datasets, linear_model
from sklearn.linear_model import LogisticRegression

#### Load Amazon dataset and Perform text cleaning

##### We start by removing punctuation, so that words "cake." and "cake!" are counted as the same word.

In [7]:
products = pd.read_csv('amazon_baby.csv')

In [39]:
products.head()
# fill in N/A's in the review column
products = products.fillna({'review':''})  

In [34]:
def remove_punctuation(text):
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    table = str.maketrans({key: None for key in string.punctuation})
    return text.translate(table)

In [35]:
mystr = products['review'][0]

In [37]:
remove_punctuation(mystr)

'These flannel wipes are OK but in my opinion not worth keeping  I also ordered someImse Vimse Cloth WipesOcean Blue12 countwhich are larger had a nicer softer texture and just seemed higher quality  I use cloth wipes for hands and faces and have been usingThirsties 6 Pack Fab Wipes Boyfor about 8 months now and need to replace them because they are starting to get rough and have had stink issues for a while that stripping no longer handles'

In [38]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [40]:
products.head()

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


#### Extract Sentiments

##### We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment. Now, we will assign reviews with a rating of 4 or higher to be positive reviews, while the ones with rating of 2 or lower are negative. For the sentiment column, we use +1 for the positive class label and -1 for the negative class label. 

In [41]:
products = products[products['rating'] != 3]

In [43]:
products.head()

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,When the Binky Fairy came to our house we didn...


In [44]:
products['sentiment'] = products['rating'].apply(lambda x: +1 if x > 3 else -1)

#### Split into training and test sets

In [46]:
train_idx = pd.read_json('module-2-assignment-train-idx.json')
test_idx = pd.read_json('module-2-assignment-test-idx.json')

In [72]:
train_data = products.iloc[train_idx[0]]
test_data = products.iloc[test_idx[0]]

#### Build the word count vector for each review

In [128]:
# http://scikit-learn.org/stable/modules/feature_extraction.html#the-bag-of-words-representation
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['review_clean'])

In [88]:
s = ['I like you mom', 'I love you']

In [89]:
vectorizer.fit_transform(s).toarray()

array([[1, 1, 0, 1, 1],
       [1, 0, 1, 0, 1]], dtype=int64)

#### Train a sentiment classifier with logistic regression
##### We will now use logistic regression to create a sentiment classifier on the training data.

In [95]:
sentiment_model = LogisticRegression()

In [96]:
sentiment_model.fit(train_matrix, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

##### There should be over 100,000 coefficients in this sentiment_model. Recall from the lecture that positive weights w_j correspond to weights that cause positive sentiment, while negative weights correspond to negative sentiment. Calculate the number of positive (>= 0, which is actually nonnegative) coefficients.

In [105]:
len(sentiment_model.coef_[sentiment_model.coef_ >= 0])

85877

#### Making predictions with logistic regression

##### Now that a model is trained, we can make predictions on the test data. In this section, we will explore this in the context of 3 data points in the test data. Take the 11th, 12th, and 13th data points in the test data and save them to sample_test_data. The following cell extracts the three data points from the SFrame test_data and print their content:

In [110]:
sample_test_data = test_data[10: 13]

In [121]:
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


##### We will now make a class prediction for the sample_test_data. The sentiment_model should predict +1 if the sentiment is positive and -1 if the sentiment is negative. Recall from the lecture that the score (sometimes called margin) for the logistic regression model is defined as: scorei=w⊺h(xi)

In [129]:
sample_test_matrix = vectorizer.transform(sample_test_data['review'])

In [130]:
scores = sentiment_model.decision_function(sample_test_matrix)

In [131]:
scores

array([  4.50689343,  -3.12665506, -10.57712362])

#### Prediciting Sentiment
##### Checkpoint: Make sure your class predictions match with the ones obtained from sentiment_model. The logistic regression classifier in scikit-learn comes with the predict function for this purpose.

In [132]:
sentiment_model.predict(sample_test_matrix)

array([ 1, -1, -1], dtype=int64)

#### Probability Predictions
##### Quiz question: Of the three data points in sample_test_data, which one (first, second, or third) has the lowest probability of being classified as a positive review?

In [134]:
probability = 1 / ( 1 + np.exp(-scores))

In [135]:
probability

array([9.89087711e-01, 4.20210525e-02, 2.54919175e-05])

In [144]:
sentiment_model.predict_proba(sample_test_matrix)#[2].sum()

array([[1.09122890e-02, 9.89087711e-01],
       [9.57978948e-01, 4.20210525e-02],
       [9.99974508e-01, 2.54919175e-05]])

#### Find the most positive (and negative) review

In [145]:
test_scores = sentiment_model.decision_function(test_matrix)

In [146]:
test_probability = 1 / ( 1 + np.exp(-test_scores))

In [148]:
test_data['probability'] = test_probability

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [149]:
test_data.head()

Unnamed: 0,name,review,rating,review_clean,sentiment,probability
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4,This has been an easy way for my nanny to reco...,1,0.783055
10,"Baby Tracker&reg; - Daily Childcare Journal, S...",I love this journal and our nanny uses it ever...,4,I love this journal and our nanny uses it ever...,1,0.999999
16,Nature's Lullabies First Year Sticker Calendar,"I love this little calender, you can keep trac...",5,I love this little calender you can keep track...,1,0.934042
20,Nature's Lullabies Second Year Sticker Calendar,I had a hard time finding a second year calend...,5,I had a hard time finding a second year calend...,1,0.999979
28,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,One of babys first and favorite books and it i...,1,0.980318


In [155]:
most_positive_reviews_index = test_data.sort_values("probability", ascending=False).head(20).index

In [159]:
most_negative_reviews_index = test_data.sort_values("probability", ascending=True).head(20).index

In [226]:
test_data.sort_values("probability", ascending=True).head(20)['name']

16042           Fisher-Price Ocean Wonders Aquarium Bouncer
120209    Levana Safe N'See Digital Video Baby Monitor w...
77072        Safety 1st Exchangeable Tip 3 in 1 Thermometer
48694     Adiri BPA Free Natural Nurser Ultimate Bottle ...
155287    VTech Communications Safe &amp; Sounds Full Co...
94560     The First Years True Choice P400 Premium Digit...
53207                   Safety 1st High-Def Digital Monitor
81332                 Cloth Diaper Sprayer--styles may vary
113995    Motorola Digital Video Baby Monitor with Room ...
10677                     Philips AVENT Newborn Starter Set
59546                Ellaroo Mei Tai Baby Carrier - Hershey
9915           Cosco Alpha Omega Elite Convertible Car Seat
172090    Belkin WeMo Wi-Fi Baby Monitor for Apple iPhon...
75994            Peg-Perego Tatamia High Chair, White Latte
40079     Chicco Cortina KeyFit 30 Travel System in Adve...
149987                     NUK Cook-n-Blend Baby Food Maker
154878    VTech Communications Safe &amp

#### Compute accuracy of the classifier

In [161]:
test_data['prediction'] = sentiment_model.predict(test_matrix)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [164]:
len(test_data[test_data['sentiment'] == test_data['prediction']]) / len(test_data)

0.9321154307655387

#### Learn another classifier with fewer words
##### Compute a new set of word count vectors using only these words. The CountVectorizer class has a parameter that lets you limit the choice of words when building word count vectors:

In [166]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [167]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

 #### Train a logistic regression model on a subset of data

In [168]:
simple_model = LogisticRegression()

In [169]:
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [177]:
simple_model.coef_.flatten()

array([ 1.36368976,  0.94399959,  1.19253827,  0.08551278,  0.52018576,
        1.50981248,  1.67307389,  0.50376046,  0.19090857,  0.05885467,
       -1.65157634, -0.20956286, -0.51137963, -2.03369861, -2.34829822,
       -0.62116877, -0.32055624, -0.89803074, -0.36216674, -2.10933109])

In [227]:
simple_model.intercept_

array([1.29937369])

In [178]:
simple_model_coef_table = pd.DataFrame({'word': significant_words,
                                       'coefficient': simple_model.coef_.flatten()})

In [183]:
simple_model_coef_table.sort_values('coefficient', ascending=False)

Unnamed: 0,coefficient,word
6,1.673074,loves
5,1.509812,perfect
0,1.36369,love
2,1.192538,easy
1,0.944,great
4,0.520186,little
7,0.50376,well
8,0.190909,able
3,0.085513,old
9,0.058855,car


In [182]:
np.sum(simple_model_coef_table.sort_values('coefficient', ascending=False)['coefficient'] > 0)

10

In [184]:
vocab = vectorizer.vocabulary_.keys()

In [188]:
#coeffs = {vocab[i]: c for i, c in enumerate(sentiment_model.coef_[0])}

#### Comparing models

In [190]:
train_data['prediction_sentiment_model'] = sentiment_model.predict(train_matrix)
train_data['prediction_simple_model'] = simple_model.predict(train_matrix_word_subset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [203]:
acc_sentiment_model_train = len(train_data[train_data['sentiment'] == train_data['prediction_sentiment_model']]) / len(train_data)

In [209]:
acc_simpl_model_train = len(train_data[train_data['sentiment'] == train_data['prediction_simple_model']]) / len(train_data)

In [210]:
acc_sentiment_model_train > acc_simpl_model_train

True

In [211]:
acc_sentiment_model_train

0.967934880374168

In [212]:
acc_simpl_model_train

0.8668225700065959

In [202]:
test_data['prediction_simple_model'] = simple_model.predict(test_matrix_word_subset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [213]:
acc_sentiment_model_test = len(test_data[test_data['sentiment'] == test_data['prediction']]) / len(test_data)
acc_simple_model_test = len(test_data[test_data['sentiment'] == test_data['prediction_simple_model']]) / len(test_data)

In [214]:
acc_sentiment_model_test

0.9321154307655387

In [215]:
acc_simpl_model_test

0.8668225700065959

#### Baseline: Majority class prediction
##### It is quite common to use the majority class classifier as the a baseline (or reference) model for comparison with your classifier model. The majority classifier model predicts the majority class for all data points. At the very least, you should healthily beat the majority class classifier, otherwise, the model is (usually) pointless.

In [216]:
len(train_data[train_data['sentiment'] == 1])

112164

In [217]:
len(train_data[train_data['sentiment'] == -1])

21252

In [218]:
len(train_data[train_data['sentiment'] == 1]) / len(train_data)

0.8407087605684476