## Sentiment Analysis - Amazon baby data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#Read the main dataset
products = pd.read_csv(f"D:/amazon_baby.csv")

In [3]:
products.columns

Index(['name', 'review', 'rating'], dtype='object')

## Remove the NAN values in review column

In [4]:
products = products.fillna({'review':''})  # fill in N/A's in the review column

In [5]:
products['review'].isnull().values.any()

False

## Remove punctuation marks

In [6]:
import string
products['review_clean'] = products['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [7]:
products.head(5)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


In [8]:
products = products[products['rating'] != 3]

In [9]:
products.head(3)

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...


In [10]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [12]:
#Example of negative review
products['review'][27401]

'I love this product.Simple but does the job great.Very easy to attach.I really have nothing bad to say about it.My baby is now protected from the sun.'

## Split the data into test/train

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(products['review_clean'], 
products['sentiment'], test_size=0.20, random_state = 0)

In [14]:
X_train.head(3)

170560    This is going to be a stocking stuffer for a 1...
82460     I bought this for My 2 yr old I had it persona...
95808     Appears to be a good quality item and does a n...
Name: review_clean, dtype: object

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# Use this token pattern to keep single-letter words
# First, learn vocabulary from the training data and assign columns to words
# Then convert the training data into a sparse matrix
train_matrix = vectorizer.fit_transform(X_train)

# Second, convert the test data into a sparse matrix, using the same word-column mapping
test_matrix = vectorizer.transform(X_test)

In [20]:
print(train_matrix[1])

  (0, 107026)	2
  (0, 108491)	2
  (0, 15623)	1
  (0, 44431)	1
  (0, 73717)	1
  (0, 10463)	1
  (0, 54052)	2
  (0, 18991)	1
  (0, 69554)	1
  (0, 1549)	1
  (0, 120972)	1
  (0, 49578)	1
  (0, 57251)	3
  (0, 78903)	1
  (0, 47994)	1
  (0, 39940)	1
  (0, 116306)	2
  (0, 103397)	2
  (0, 26451)	1
  (0, 118955)	2
  (0, 40197)	1
  (0, 51660)	1
  (0, 69742)	1
  (0, 105662)	1
  (0, 73964)	1
  (0, 54350)	1
  (0, 114986)	1
  (0, 113524)	1


In [22]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(random_state=0).fit(train_matrix, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
sentiment_model

LogisticRegression(random_state=0)

In [36]:
sample_test_data = X_test[10:13]
print(sample_test_data)

27972     I love these bibs  We have about 8 of them in ...
123316    My 4 year old gets up earlier than me this mea...
60874     love the bag especially since its over the sho...
Name: review_clean, dtype: object
