In [1]:
import pandas as pd
import string
import numpy as np
products=pd.read_csv('amazon_baby.csv')

In [2]:
products=products.fillna({'review':''})
full=len(products['review'])
print(full)

183531


In [3]:
translator=str.maketrans({key: None for key in string.punctuation})
def translate(text):
    return text.translate(translator)
products['clean_review']=products['review'].iloc[:].apply(translate)

With one argument:

str.maketrans({'a': 'b', 'c': None})
You give the function a mapping that follows the rules for translation tables and it returns an equivalent table for that mapping. Things that map to None are removed

With two arguments:

str.maketrans('abc', 'xyz')
You give it two strings. Each character in the first string is replaced by the character at that index in the second string. So 'a' maps to 'x', 'b' to 'y', and 'c' to 'z'.

The one you're using, with three arguments, works the same as two arguments, but has a third string.

str.maketrans('abc', 'xyz', 'hij')
This is the same as the two argument version, except that the characters from the third string are removed, as if they were mapped to None. So your table is saying "Don't replace anything, but remove the characters that show up in this string".

In [4]:
products=products[products['rating']!=3].copy()

## products['index'] actually is a copy, assigning values to copy will not change the source file.
## products.loc['index'] will lead to the selected column, assigning values in this way change the source file.
## products.iloc[integer] will lead to rows from 0 to len-1

In [5]:
products['sentiment']=products['rating'].apply(lambda rating: +1 if rating>3 else -1)

In [6]:
train=pd.read_json('module-2-assignment-train-idx.json')
train_data=products.iloc[train[0]]
train_data.shape

(133416, 5)

In [7]:
test=pd.read_json('module-2-assignment-test-idx.json')
test_data=products.iloc[test[0]]

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
     # Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(train_data['clean_review'])
# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(test_data['clean_review'])

In [9]:
words_vec=vectorizer.vocabulary_.keys()

In [10]:
from sklearn import linear_model
sentiment=linear_model.LogisticRegression()
train_matrix.shape

(133416, 121712)

In [11]:
sentiment.fit(train_matrix,train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
sentiment.coef_.shape

(1, 121712)

In [13]:
np.sum(sentiment.coef_>0)

86781

In [14]:
sample_test_data = test_data[10:13]
print (sample_test_data)

                                                 name  \
59                          Our Baby Girl Memory Book   
71  Wall Decor Removable Decal Sticker - Colorful ...   
91  New Style Trailing Cherry Blossom Tree Decal R...   

                                               review  rating  \
59  Absolutely love it and all of the Scripture in...       5   
71  Would not purchase again or recommend. The dec...       2   
91  Was so excited to get this product for my baby...       1   

                                         clean_review  sentiment  
59  Absolutely love it and all of the Scripture in...          1  
71  Would not purchase again or recommend The deca...         -1  
91  Was so excited to get this product for my baby...         -1  


In [15]:
sample_test_matrix=vectorizer.transform(sample_test_data['clean_review'])

In [16]:
sample_score=sentiment.decision_function(sample_test_matrix)

In [17]:
sentiment.predict(sample_test_matrix)

array([ 1, -1, -1], dtype=int64)

In [18]:
def label(score):
    label_list=[]
    for i in range(len(score)):
        if score[i]>0:
            label_list.append(1)
        else:
            label_list.append(-1)
    return label_list

In [19]:
label(sample_score)

[1, -1, -1]

In [20]:
from math import exp
def probability(score):
    prob_list=[]
    for i in range(len(score)):
        prob_list.append(1.0/(1.0+exp(-score[i])))
    return prob_list

In [21]:
probability(sample_score)

[0.9963219122192273, 0.04032738776318565, 2.972597544224787e-05]

In [22]:
test_matrix=vectorizer.transform(test_data['clean_review'])

In [23]:
test_score=sentiment.decision_function(test_matrix)

In [24]:
problist=probability(test_score)

In [25]:
len(problist)

33336

In [26]:
test_data.loc[:,'probability']=problist

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [27]:
test_data.sort_values('probability',ascending=False)[0:20]

Unnamed: 0,name,review,rating,clean_review,sentiment,probability
140816,"Diono RadianRXT Convertible Car Seat, Plum",I bought this seat for my tall (38in) and thin...,5,I bought this seat for my tall 38in and thin 2...,1,1.0
168697,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1,1.0
97325,Freemie Hands-Free Concealable Breast Pump Col...,I absolutely love this product. I work as a C...,5,I absolutely love this product I work as a Cu...,1,1.0
147949,"Baby Jogger City Mini GT Single Stroller, Shad...","Amazing, Love, Love, Love it !!! All 5 STARS a...",5,Amazing Love Love Love it All 5 STARS all the...,1,1.0
137034,Graco Pack 'n Play Element Playard - Flint,My husband and I assembled this Pack n' Play l...,4,My husband and I assembled this Pack n Play la...,1,1.0
52631,Evenflo X Sport Plus Convenience Stroller - Ch...,After seeing this in Parent's Magazine and rea...,5,After seeing this in Parents Magazine and read...,1,1.0
50315,"P'Kolino Silly Soft Seating in Tias, Green",I've purchased both the P'Kolino Little Reader...,4,Ive purchased both the PKolino Little Reader C...,1,1.0
168081,Buttons Cloth Diaper Cover - One Size - 8 Colo...,"We are big Best Bottoms fans here, but I wante...",4,We are big Best Bottoms fans here but I wanted...,1,1.0
119182,Roan Rocco Classic Pram Stroller 2-in-1 with B...,Great Pram Rocco!!!!!!I bought this pram from ...,5,Great Pram RoccoI bought this pram from Europe...,1,1.0
66059,"Evenflo 6 Pack Classic Glass Bottle, 4-Ounce",It's always fun to write a review on those pro...,5,Its always fun to write a review on those prod...,1,1.0


In [28]:
test_data.sort_values('probability',ascending=True)[0:20]

Unnamed: 0,name,review,rating,clean_review,sentiment,probability
16042,Fisher-Price Ocean Wonders Aquarium Bouncer,We have not had ANY luck with Fisher-Price pro...,2,We have not had ANY luck with FisherPrice prod...,-1,8.446278e-16
120209,Levana Safe N'See Digital Video Baby Monitor w...,This is the first review I have ever written o...,1,This is the first review I have ever written o...,-1,1.596406e-15
77072,Safety 1st Exchangeable Tip 3 in 1 Thermometer,I thought it sounded great to have different t...,1,I thought it sounded great to have different t...,-1,8.114017e-14
48694,Adiri BPA Free Natural Nurser Ultimate Bottle ...,I will try to write an objective review of the...,2,I will try to write an objective review of the...,-1,9.88539e-14
155287,VTech Communications Safe &amp; Sounds Full Co...,"This is my second video monitoring system, the...",1,This is my second video monitoring system the ...,-1,1.93093e-13
94560,The First Years True Choice P400 Premium Digit...,Note: we never installed batteries in these un...,1,Note we never installed batteries in these uni...,-1,3.360673e-13
53207,Safety 1st High-Def Digital Monitor,We bought this baby monitor to replace a diffe...,1,We bought this baby monitor to replace a diffe...,-1,3.275387e-11
81332,Cloth Diaper Sprayer--styles may vary,I bought this sprayer out of desperation durin...,1,I bought this sprayer out of desperation durin...,-1,3.325852e-11
10677,Philips AVENT Newborn Starter Set,"It's 3am in the morning and needless to say, t...",1,Its 3am in the morning and needless to say thi...,-1,9.496152e-11
113995,Motorola Digital Video Baby Monitor with Room ...,DO NOT BUY THIS BABY MONITOR!I purchased this ...,1,DO NOT BUY THIS BABY MONITORI purchased this m...,-1,9.617282e-11


In [29]:
test_data.loc[:,'label']=label(test_score)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [30]:
test_data.loc[:,'accurate']=(test_data['label']==test_data['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [31]:
np.sum(test_data.loc[:,'accurate'])/len(test_data)

0.9322954163666907

In [32]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']

In [33]:
vectorizer_word_subset = CountVectorizer(vocabulary=significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['clean_review'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['clean_review'])

In [34]:
simple_sentiment=linear_model.LogisticRegression()
simple_sentiment.fit(train_matrix_word_subset,train_data['sentiment'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [35]:
simple_weights=pd.Series(simple_sentiment.coef_.tolist()[0],index=significant_words)
simple_pos=simple_weights[significant_words][simple_weights[significant_words]>0].index.tolist()

In [36]:
sentiment_weights=pd.Series(sentiment.coef_.tolist()[0],index=words_vec)

In [37]:
pos=sentiment_weights[significant_words][sentiment_weights[significant_words]>0].index.tolist()
print(simple_pos)
print(pos)
sentiment_weights[significant_words]

['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 'well', 'able', 'car']
['love', 'great', 'old', 'loves', 'well', 'able', 'car', 'less', 'even', 'waste', 'disappointed', 'work', 'product', 'money', 'would', 'return']


love            0.266807
great           0.065254
easy           -0.005453
old             0.008179
little         -0.314613
perfect        -0.685414
loves           0.010400
well            0.000001
able            0.212782
car             0.055097
broke          -0.718930
less            0.041758
even            0.079252
waste           0.005772
disappointed    0.002837
work            0.000030
product         0.023301
money           0.000578
would           0.219827
return          0.198969
dtype: float64

In [38]:
test_data.loc[:,'simple']=simple_sentiment.predict(test_matrix_word_subset)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [39]:
test_data.loc[:,'simple_accurate']=(test_data['sentiment']==test_data['simple'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [40]:
np.sum(test_data.loc[:,'simple_accurate'])/len(test_data)

0.8693604511639069

In [41]:
train_data.loc[:,'pred']=sentiment.predict(train_matrix)
np.sum(train_data['pred']==train_data['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


129212

In [42]:
np.sum(train_data['sentiment']==simple_sentiment.predict(train_matrix_word_subset))

115648

In [43]:
np.sum(test_data['sentiment']==1)/len(test_data)

0.8427825773938085