## Sentiment Analysis - Amazon baby data

In [1]:
import pandas as pd
#products = pd.read_csv(f"D:/Docs/amazon_baby.csv")
products = pd.read_csv(f"D:/SYED/data/amazon_baby.csv")
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    183213 non-null  object
 1   review  182702 non-null  object
 2   rating  183531 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


## Remove the NAN values in review column

In [2]:
products = products.fillna({'review':''})  # fill in N/A's in the review column
products['review'].isnull().values.any()

False

## Remove punctuation marks

In [3]:
import string
products['review_clean'] = products['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
products.head(5)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


### Remove rating of 3

In [4]:
products = products[products['rating'] != 3]
products.head(3)

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...


### More than 3 is +1 else -1

In [5]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)
#Example of negative review
products['review'][27401]

'I love this product.Simple but does the job great.Very easy to attach.I really have nothing bad to say about it.My baby is now protected from the sun.'

## Split the data into test/train

In [6]:
train_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/train-idx.json")
train_set = pd.DataFrame(products, index = train_data_indices[0])
test_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/test-idx.json")
test_set = pd.DataFrame(products, index = test_data_indices[0])
print("Train set: " + str(train_set.shape))
print("Test set: " + str(test_set.shape))

Train set: (133416, 5)
Test set: (33336, 5)


### Remove all the NaN before carrying out training

In [7]:
train_set.dropna(subset = ["name"], inplace=True)
train_set.isnull().values.sum()
test_set.dropna(subset = ["name"], inplace=True)
test_set.isnull().values.sum()
print("Train set: " + str(train_set.shape))
print("Test set: " + str(test_set.shape))

Train set: (121011, 5)
Test set: (30209, 5)


## Dictionary of word counts

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_set['review_clean'])
test_matrix = vectorizer.transform(test_set['review_clean'])
print("Train matrix: " + str(train_matrix.shape))
print("Test matrix: " + str(test_matrix.shape))

Train matrix: (121011, 113128)
Test matrix: (30209, 113128)


In [9]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(random_state=0).fit(train_matrix, train_set['sentiment'])
sentiment_model.coef_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([[-2.80346423e-01,  5.03461496e-03,  7.83617580e-03, ...,
        -4.72665536e-04,  4.50663457e-05,  9.66007996e-04]])

In [10]:
zero_elem = (sentiment_model.coef_ > 0).sum()
print(zero_elem)

83399


In [11]:
#predictions = sentiment_model.predict(test_matrix)
final_scores_data = vectorizer.transform(test_set['review_clean'])
final_score = sentiment_model.decision_function(final_scores_data)
print(final_score)

[ 4.00964325  3.83962325  4.1955828  ...  0.06225689 -1.70672874
  3.60665302]


In [12]:
df = pd.DataFrame(final_score, columns = ['Score'])
df.head(3)

Unnamed: 0,Score
0,4.009643
1,3.839623
2,4.195583


In [13]:
test_set.head(5)

Unnamed: 0_level_0,name,review,rating,review_clean,sentiment
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8,"Baby Tracker&reg; - Daily Childcare Journal, S...",A friend of mine pinned this product on Pinter...,5.0,A friend of mine pinned this product on Pinter...,1.0
9,"Baby Tracker&reg; - Daily Childcare Journal, S...",This has been an easy way for my nanny to reco...,4.0,This has been an easy way for my nanny to reco...,1.0
14,Nature's Lullabies First Year Sticker Calendar,"Space for monthly photos, info and a lot of us...",5.0,Space for monthly photos info and a lot of use...,1.0
18,Nature's Lullabies Second Year Sticker Calendar,I completed a calendar for my son's first year...,4.0,I completed a calendar for my sons first year ...,1.0
24,Nature's Lullabies Second Year Sticker Calendar,Wife loves this calender. Comes with a lot of ...,5.0,Wife loves this calender Comes with a lot of s...,1.0


In [14]:
print(df.shape)
print(df.columns)
print(test_set.shape)
print(test_set.columns)

(30209, 1)
Index(['Score'], dtype='object')
(30209, 5)
Index(['name', 'review', 'rating', 'review_clean', 'sentiment'], dtype='object')


In [50]:
df_test = test_set

In [52]:
df_test.reset_index(drop = True, inplace = True)

In [53]:
df_test.head(5)

Unnamed: 0,rating,review_clean,sentiment
0,5.0,A friend of mine pinned this product on Pinter...,1.0
1,4.0,This has been an easy way for my nanny to reco...,1.0
2,5.0,Space for monthly photos info and a lot of use...,1.0
3,4.0,I completed a calendar for my sons first year ...,1.0
4,5.0,Wife loves this calender Comes with a lot of s...,1.0


In [54]:
#df2 = pd.concat([df, test_set])
df2 = pd.concat([df, test_set], axis = 1)
df2.shape

(30209, 4)

In [55]:
df2.head(5)

Unnamed: 0,Score,rating,review_clean,sentiment
0,4.009643,5.0,A friend of mine pinned this product on Pinter...,1.0
1,3.839623,4.0,This has been an easy way for my nanny to reco...,1.0
2,4.195583,5.0,Space for monthly photos info and a lot of use...,1.0
3,6.349742,4.0,I completed a calendar for my sons first year ...,1.0
4,5.604406,5.0,Wife loves this calender Comes with a lot of s...,1.0


In [56]:
df2.sort_values(by='Score', ascending=False)

Unnamed: 0,Score,rating,review_clean,sentiment
20952,71.343315,5.0,Ive posted an UPDATE at the endFirst let me st...,1.0
22392,57.554052,5.0,I did a TON of research before I purchased thi...,1.0
28615,53.675168,5.0,updated 32213 After extensive research trial a...,1.0
18044,44.201488,5.0,I bought this carrier when my daughter was abo...,1.0
9166,41.352544,5.0,The joovy zoom 360 was the perfect solution fo...,1.0
...,...,...,...,...
25931,-24.219109,1.0,This is a dreadful car seat I would never reco...,-1.0
23825,-25.349258,2.0,I purchased this in the black color For some ...,-1.0
21867,-29.352489,1.0,The previous reviewers laud the piece of mind ...,-1.0
1897,-29.670152,1.0,This product should be in the hall of fame sol...,-1.0
