# Library

In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Loading CSV file from Google Drive

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
path = '/content/drive/My Drive/Machine Learning Data/amazon_baby.csv'
products = pd.read_csv(path)

In [6]:
products.head()

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5


# **DATA CLEANING**
# Step 1: Remove punctuation, stopwords and numbers

In [2]:
import re
import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords # Import the stop word list

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Function to clean punctuation, stopword, lowercase letter.

In [7]:
def review_to_words(raw_review):
    # Remove punctuation by substitute the punctuation with ' '
    letters_only = re.sub("[^a-zA-Z]", " ", str(raw_review)) 
    # Lowercase all the word and split the word       
    words = letters_only.lower().split() 
    # Set stopword to english language                
    stops = set(stopwords.words("english"))
    #Selecting word that is not in stop words               
    meaningful_words = [w for w in words if not w in stops]
    return( " ".join( meaningful_words ))

# Step 2 : Clean the uselss comment
### By excluding simple comment that length is lower than 10 words

In [8]:
#Finding length of word to exclude the simple comment
def review_to_words_ote(raw_review):
    # Remove punctuation by substitute the punctuation with ' '
    letters_only = re.sub("[^a-zA-Z]", " ", str(raw_review))
    # Lowercase all the word and split the word 
    words = letters_only.lower().split()  
    # Set stopword to english language                           
    stops = set(stopwords.words("english"))   
    #Selecting word that is not in stop words                
    meaningful_words = [w for w in words if not w in stops] 
    return meaningful_words

In [9]:
# Create new column with name 'length' that contain word length
products['length'] = products['review'].apply(lambda x: len(review_to_words_ote(str(x))))

In [11]:
# Simplify the data with only length mroe than 10 words
products = products[products['length']>=10]

# Step 3: Create Response variable (Flag for Supervised Learning)





In [13]:
# Excluding rating that is 3 (neutral)
products=products[products['rating']!=3]

In [14]:
# Flag the postive and negative sentiment as label
products['sentiment'] = products['rating'].apply(lambda x: 1 if x>=4 else 0)

In [15]:
products['clean_review'] = products['review'].apply(lambda x: review_to_words(x))

# Step 4: Fitting model

In [16]:
X_train , X_test , y_train, y_test = train_test_split(products,products.sentiment,test_size=0.2)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression

In [18]:
v = CountVectorizer()
train_features = v.fit_transform(X_train['clean_review'])

In [19]:
train_features

<126324x53344 sparse matrix of type '<class 'numpy.int64'>'
	with 4267416 stored elements in Compressed Sparse Row format>

In [20]:
t = TfidfTransformer()

In [21]:
train_features_f = t.fit_transform(train_features)

In [22]:
logReg= LogisticRegression()

In [23]:
sentiment_model = logReg.fit(train_features_f,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [25]:
predicted_data = t.transform(v.transform(X_test['clean_review']))

In [26]:
predicted = sentiment_model.predict(predicted_data)

In [27]:
print(np.mean(predicted == y_test))

0.9274586789943638


# Additional : Analyze the best and worst review

In [28]:
print(sentiment_model.predict_proba(t.transform(v.transform(["This product is awful"])))) # Testing 

[[0.97798323 0.02201677]]


## Vulli Sophie the Giraffe Teether - Highest Sales Product

In [29]:
giraffe_review = products[products['name'] == 'Vulli Sophie the Giraffe Teether']

In [30]:
giraffe_review.head()

Unnamed: 0,name,review,rating,length,sentiment,clean_review
34313,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5,11,1,likes chewing parts especially head ears helpe...
34314,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,5,17,1,son loves toy fits great diaper bag also easy ...
34315,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,1,34,0,really large warning box sheesh many anaphylac...
34316,Vulli Sophie the Giraffe Teether,All the moms in my moms\' group got Sophie for...,5,27,1,moms moms group got sophie babies wondered som...
34317,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,5,42,1,little skeptical whether sophie going worth mo...


In [31]:
def predictSentiment(review):
  proba = sentiment_model.predict_proba(t.transform(v.transform([review])))
  return proba[0][1]

In [32]:
print(predictSentiment("This is awful product"))

0.022016774472533893


In [33]:
giraffe_review['predicted_sentiment'] = giraffe_review['clean_review'].apply(lambda x : predictSentiment(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [34]:
giraffe_review.head()

Unnamed: 0,name,review,rating,length,sentiment,clean_review,predicted_sentiment
34313,Vulli Sophie the Giraffe Teether,He likes chewing on all the parts especially t...,5,11,1,likes chewing parts especially head ears helpe...,0.996781
34314,Vulli Sophie the Giraffe Teether,My son loves this toy and fits great in the di...,5,17,1,son loves toy fits great diaper bag also easy ...,0.999826
34315,Vulli Sophie the Giraffe Teether,There really should be a large warning on the ...,1,34,0,really large warning box sheesh many anaphylac...,0.533788
34316,Vulli Sophie the Giraffe Teether,All the moms in my moms\' group got Sophie for...,5,27,1,moms moms group got sophie babies wondered som...,0.968879
34317,Vulli Sophie the Giraffe Teether,I was a little skeptical on whether Sophie was...,5,42,1,little skeptical whether sophie going worth mo...,0.594474


### Sorting the review by probability

In [35]:
sorted_review = giraffe_review.sort_values(by=['predicted_sentiment'],ascending=False)

#### View the best review

In [36]:
sorted_review.head() # Good review

Unnamed: 0,name,review,rating,length,sentiment,clean_review,predicted_sentiment
34497,Vulli Sophie the Giraffe Teether,Absolutely wonderful product! The baby loves i...,5,11,1,absolutely wonderful product baby loves love s...,0.99999
34814,Vulli Sophie the Giraffe Teether,We love Sophie at our house... she is a great ...,5,20,1,love sophie house great teether great toy ente...,0.999986
34848,Vulli Sophie the Giraffe Teether,Love it! Love it! Love it! The best teether I...,5,12,1,love love love best teether purchased baby bou...,0.999985
34529,Vulli Sophie the Giraffe Teether,I hesitated to buy this because it was such a ...,5,32,1,hesitated buy high price simple toy hearing ma...,0.999979
34659,Vulli Sophie the Giraffe Teether,Don\'t really think this needs a review becaus...,5,23,1,really think needs review everyone heard sophi...,0.999969


In [37]:
sorted_review.loc[34497,"review"]

'Absolutely wonderful product! The baby loves it so we love it! Sophie is the best investment we have made so far!'

In [38]:
sorted_review.loc[34814,"review"]

'We love Sophie at our house... she is a great teether and a great toy for entertainment.  Our 18 month old still loves her.  She is the perfect size for gripping and holding and biting... she is natural which is great too... and she is SO cute!'

In [39]:
sorted_review.loc[34848,"review"]

'Love it! Love it! Love it!  The best teether I purchased for my baby (I bought several others before finding this one). :)'

In [40]:
sorted_review.loc[34529,"review"]

'I hesitated to buy this because it was such a high price for a simple toy, but after hearing so many great things, I decided to order for my little guy.  He LOVES it!  It is one of his favorite toys, and was worth every penny!  I will definitely keep it in mind for friends family members who have babies.  It would make a great gift.'

In [41]:
sorted_review.tail() # Worst review

Unnamed: 0,name,review,rating,length,sentiment,clean_review,predicted_sentiment
34705,Vulli Sophie the Giraffe Teether,I just received Sophie. I am pretty sure that ...,4,28,1,received sophie pretty sure used returned prod...,0.018041
34956,Vulli Sophie the Giraffe Teether,I was very excited getting this product but my...,2,13,0,excited getting product son interested least d...,0.010782
34706,Vulli Sophie the Giraffe Teether,Totally overpriced for what it is. Go to Pets...,1,19,0,totally overpriced go petsmart get squeaky toy...,0.008926
34630,Vulli Sophie the Giraffe Teether,It\'s really terrible. When my 4 months daught...,1,14,0,really terrible months daughter started chew f...,0.00672
34732,Vulli Sophie the Giraffe Teether,Received the product and smells like cheap rub...,1,24,0,received product smells like cheap rubber bigg...,0.000663


#### View the worst review

In [42]:
sorted_review.loc[34732,"review"]

'Received the product and smells like cheap rubber. This is biggest waste of money. I even soaked it in hot water with soap for couple of hours (over a couple of days) but still smells. I ended up tossing this as this was waste of money.'

In [43]:
sorted_review.loc[34630,"review"]

"It\\'s really terrible. When my 4 months daughter started to chew it,I just found that the rubber was peeling off.....So scary. I returned it to AMAZON right away."

In [44]:
sorted_review.loc[34706,"review"]

"Totally overpriced for what it is.  Go to Petsmart and get a squeaky toy for $.99.  My 7-month old doesn\\'t even like it.  She wants her $2 rattles instead.  This thing is totally useless.  Don\\'t waste your money!"

In [45]:
sorted_review.loc[34956,"review"]

'I was very excited getting this product but my son was not interested in it in the least.  Very disappointed.  I will not recommend this product to anyone not to mention the price for it is ridiculous.'

The worst reviews are about the smelly rubber, peeling off rubber, overpriced. Therefore this can be improved by manufacturer.