## Sentiment Analysis - Amazon baby data

In [1]:
import pandas as pd

In [3]:
products = pd.read_csv(f"D:/Docs/amazon_baby.csv")
#products = pd.read_csv(f"D:/SYED/data/amazon_baby.csv")

In [4]:
products.columns

Index(['name', 'review', 'rating'], dtype='object')

## Remove the NAN values in review column

In [5]:
products = products.fillna({'review':''})  # fill in N/A's in the review column
products['review'].isnull().values.any()

False

## Remove punctuation marks

In [6]:
import string
products['review_clean'] = products['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [7]:
products.head(5)

Unnamed: 0,name,review,rating,review_clean
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3,These flannel wipes are OK but in my opinion n...
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,All of my kids have cried nonstop when I tried...


### Remove rating of 3

In [8]:
products = products[products['rating'] != 3]

In [9]:
products.head(3)

Unnamed: 0,name,review,rating,review_clean
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,it came early and was not disappointed i love ...
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,Very soft and comfortable and warmer than it l...
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,This is a product well worth the purchase I h...


### More than 3 is +1 else -1

In [10]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)

In [11]:
#Example of negative review
products['review'][27401]

'I love this product.Simple but does the job great.Very easy to attach.I really have nothing bad to say about it.My baby is now protected from the sun.'

## Split the data into test/train

In [12]:
 train_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/train-idx.json")
 train_set = pd.DataFrame(products, index = train_data_indices[0])
 test_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/test-idx.json")
 test_set = pd.DataFrame(products, index = test_data_indices[0])

In [13]:
print("Train set: " + str(train_set.shape))
print("Test set: " + str(test_set.shape))

Train set: (133416, 5)
Test set: (33336, 5)


### Remove all the NaN before carrying out training

In [14]:
train_set.dropna(subset = ["name"], inplace=True)
train_set.isnull().values.sum()
test_set.dropna(subset = ["name"], inplace=True)
test_set.isnull().values.sum()

0

In [15]:
train_set.tail(5)

Unnamed: 0_level_0,name,review,rating,review_clean,sentiment
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
166746,"Mud Pie Ornament with Personalization, Snowman",This looked like something one of my five year...,1.0,This looked like something one of my five year...,-1.0
166747,"Stephan Baby Cordy Owl Corduroy Rattle, Pink",I bought this for my niece and it is adorable ...,5.0,I bought this for my niece and it is adorable ...,1.0
166748,Basket Affair - Baby Cakes Washcloth Cupcake G...,This is unbelievable for this price! This gif...,5.0,This is unbelievable for this price This gift...,1.0
166749,Waterproof Bamboo Nursing Pads - Pale Orange (...,These are very nice the only better ones that ...,4.0,These are very nice the only better ones that ...,1.0
166750,Waterproof Bamboo Nursing Pads - BLACK (12 Pie...,Once I got around to opening the package that ...,5.0,Once I got around to opening the package that ...,1.0


In [16]:
print("Train set: " + str(train_set.shape))
print("Test set: " + str(test_set.shape))

Train set: (121011, 5)
Test set: (30209, 5)


## Dictionary of word counts

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_set['review_clean'])
test_matrix = vectorizer.transform(test_set['review_clean'])

In [18]:
print("Train matrix: " + str(train_matrix.shape))
print("Test matrix: " + str(test_matrix.shape))

Train matrix: (121011, 113128)
Test matrix: (30209, 113128)


In [19]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(random_state=0).fit(train_matrix, train_set['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
sentiment_model.coef_

array([[-2.80345258e-01,  5.03459305e-03,  7.83613823e-03, ...,
        -4.72647048e-04,  4.50662488e-05,  9.65998619e-04]])

In [21]:
zero_elem = (sentiment_model.coef_ > 0).sum()
print(zero_elem)

83399


In [23]:
sample_test_data = test_set[10:13]
sample_test_data['review_clean']


0
64     Really happy with this purchase I was looking ...
82     It has been many years since we needed diaper ...
102    Love it love it love it  Got my first baby tra...
Name: review_clean, dtype: object

In [24]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores = sentiment_model.decision_function(sample_test_matrix)
print(scores)

[ 6.93153519  6.4932545  10.42578732]


In [25]:
#predictions = sentiment_model.predict(test_matrix)
final_scores_data = vectorizer.transform(test_set['review_clean'])
final_score = sentiment_model.decision_function(final_scores_data)
print(final_score)

[ 4.00953488  3.83962707  4.19552179 ...  0.06218771 -1.70686782
  3.6066164 ]


In [48]:
df = pd.DataFrame(final_score, columns = ['Score'])
df1 = pd.DataFrame(test_set['review_clean'], columns = ['Review_clean'])
df['Score_second'] = df['Score']
df.head(3)

Unnamed: 0,Score,Score_second
0,4.009535,4.009535
1,3.839627,3.839627
2,4.195522,4.195522


In [61]:
df1.shape

(0, 1)

In [55]:
#df_full = pd.concat([df['Score'], test_set['review_clean']],axis = 1)
df_full = df.merge(df1, on=None, how='left', left_index=True, right_index=True)

In [56]:
df_full.shape

(30209, 3)

In [57]:
sorted_df = df_full.sort_values(by=['Score'], ascending=False)

In [59]:
sorted_df.tail(20)

Unnamed: 0,Score,Score_second,Review_clean
1958,-18.945839,-18.945839,
14202,-18.948163,-18.948163,
10390,-19.628384,-19.628384,
28222,-20.193125,-20.193125,
206,-20.370037,-20.370037,
22371,-20.561759,-20.561759,
24374,-20.755757,-20.755757,
2511,-21.666462,-21.666462,
3732,-21.761034,-21.761034,
19776,-21.843305,-21.843305,
