## Sentiment Analysis - Amazon baby data

In [1]:
import pandas as pd

In [2]:
#products = pd.read_csv(f"D:/amazon_baby.csv")
products = pd.read_csv(f"D:/SYED/data/amazon_baby.csv")

In [3]:
products.columns

Index(['name', 'review', 'rating'], dtype='object')

## Remove the NAN values in review column

In [4]:
products = products.fillna({'review':''})  # fill in N/A's in the review column
products['review'].isnull().values.any()

False

## Remove punctuation marks

In [5]:
import string
products['review_clean'] = products['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [6]:
products.head(5)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


### Remove rating of 3

In [7]:
products = products[products['rating'] != 3]

In [8]:
products.head(3)

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...


### More than 3 is +1 else -1

In [9]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [10]:
#Example of negative review
products['review'][27401]

'I love this product.Simple but does the job great.Very easy to attach.I really have nothing bad to say about it.My baby is now protected from the sun.'

## Split the data into test/train

In [11]:
 train_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/train-idx.json")
 train_set = pd.DataFrame(products, index = train_data_indices[0])
 test_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/test-idx.json")
 test_set = pd.DataFrame(products, index = test_data_indices[0])

In [13]:
print("Train set: " + str(train_set.shape))
print("Test set: " + str(test_set.shape))

Train set: (133416, 5)
Test set: (33336, 5)


### Remove all the NaN before carrying out training

In [14]:
train_set.dropna(subset = ["name"], inplace=True)
train_set.isnull().values.sum()
test_set.dropna(subset = ["name"], inplace=True)
test_set.isnull().values.sum()

0

In [16]:
train_set.tail(5)

Unnamed: 0_level_0,name,review,rating,review_clean,sentiment
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
166746,"Mud Pie Ornament with Personalization, Snowman",This looked like something one of my five year...,1.0,This looked like something one of my five year...,-1.0
166747,"Stephan Baby Cordy Owl Corduroy Rattle, Pink",I bought this for my niece and it is adorable ...,5.0,I bought this for my niece and it is adorable ...,1.0
166748,Basket Affair - Baby Cakes Washcloth Cupcake G...,This is unbelievable for this price! This gif...,5.0,This is unbelievable for this price This gift...,1.0
166749,Waterproof Bamboo Nursing Pads - Pale Orange (...,These are very nice the only better ones that ...,4.0,These are very nice the only better ones that ...,1.0
166750,Waterproof Bamboo Nursing Pads - BLACK (12 Pie...,Once I got around to opening the package that ...,5.0,Once I got around to opening the package that ...,1.0


In [17]:
print("Train set: " + str(train_set.shape))
print("Test set: " + str(test_set.shape))

Train set: (121011, 5)
Test set: (30209, 5)


## Dictionary of word counts

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_set['review_clean'])
test_matrix = vectorizer.transform(test_set['review_clean'])

In [19]:
print("Train matrix: " + str(train_matrix.shape))
print("Test matrix: " + str(test_matrix.shape))

Train matrix: (121011, 113128)
Test matrix: (30209, 113128)


In [20]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(random_state=0).fit(train_matrix, train_set['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
sentiment_model.coef_

array([[-2.80346021e-01,  5.03460742e-03,  7.83616290e-03, ...,
        -4.72659218e-04,  4.50663123e-05,  9.66004786e-04]])

In [24]:
zero_elem = (sentiment_model.coef_ > 0).sum()
print(zero_elem)

83399


In [25]:
sample_test_data = test_set[10:13]
print(sample_test_data)

                                                  name  \
0                                                        
64                           Our Baby Girl Memory Book   
82   Cloth Diaper Pins Stainless Steel Traditional ...   
102  Newborn Baby Tracker&reg; - Round the Clock Ch...   

                                                review  rating  \
0                                                                
64   Really happy with this purchase. I was looking...     5.0   
82   It has been many years since we needed diaper ...     5.0   
102  Love it love it love it!!  Got my first baby t...     5.0   

                                          review_clean  sentiment  
0                                                                  
64   Really happy with this purchase I was looking ...        1.0  
82   It has been many years since we needed diaper ...        1.0  
102  Love it love it love it  Got my first baby tra...        1.0  


In [34]:
sample_test_data['review_clean']

0
64     Really happy with this purchase I was looking ...
82     It has been many years since we needed diaper ...
102    Love it love it love it  Got my first baby tra...
Name: review_clean, dtype: object

In [35]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[ 6.93152863  6.49328258 10.42594085]


In [49]:
#predictions = sentiment_model.predict(test_matrix)
final_scores_data = vectorizer.transform(test_set['review_clean'])
final_score = sentiment_model.decision_function(final_scores_data)
print(final_score)

[ 4.00960651  3.83962472  4.19556214 ...  0.06223356 -1.70677584
  3.60664066]


In [51]:
df = pd.DataFrame(final_score, columns = ['Score'])
df.head(5)

Unnamed: 0,Score
0,4.009607
1,3.839625
2,4.195562
3,6.349738
4,5.604373


In [52]:
df.sort_values(by='Score', ascending=False)

Unnamed: 0,Score
20952,71.342360
22392,57.554025
28615,53.675176
18044,44.201512
9166,41.352585
...,...
25931,-24.218979
23825,-25.349218
21867,-29.352658
1897,-29.670107


In [59]:
df.head(5)

Unnamed: 0_level_0,Score
index,Unnamed: 1_level_1
0,4.009607
1,3.839625
2,4.195562
3,6.349738
4,5.604373


In [62]:
test_set['review_clean'].head(5)
#df['reivew'] = df(df[test_set])

0
8     A friend of mine pinned this product on Pinter...
9     This has been an easy way for my nanny to reco...
14    Space for monthly photos info and a lot of use...
18    I completed a calendar for my sons first year ...
24    Wife loves this calender Comes with a lot of s...
Name: review_clean, dtype: object

In [None]:
df['reivew'] = df()