# Sentiment Analysis - Amazon baby data

### Load the data and remove NAN

In [None]:
import pandas as pd
products = pd.read_csv(f"D:/Docs/amazon_baby.csv")
#products = pd.read_csv(f"D:/SYED/data/baby/amazon_baby.csv")
products = products.fillna({'review':''})  # fill in N/A's in the review column
products['review'].isnull().values.any()
products.shape

## Remove punctuation marks

In [None]:
import string
products['review_clean'] = products['review'].apply(lambda x: x.translate(str.maketrans('', '', 
                           string.punctuation)))
products.head(3)

### Remove rating of 3

In [None]:
products = products[products['rating'] != 3]
products.head(3)

### More than 3 is +1 else -1

In [None]:
products['sentiment'] = products['rating'].apply(lambda rating : +1 if rating > 3 else -1)
products['review'][27401]

## Load the data as test and train
* `train-idx.json` for Training data
* `test-idx.json` for Test data

train_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/train-idx.json")
train_data = pd.DataFrame(products, index = train_data_indices[0])
test_data_indices = pd.read_json(f"D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/test-idx.json")
test_data = pd.DataFrame(products, index = test_data_indices[0])
print("Train set: " + str(train_data.shape))
print("Test set: " + str(test_data.shape))

In [None]:
# To generate the test index list
test_index = pd.read_json('test-idx.json', orient='values')
test_index_list = [x[0] for x in test_index.values]
train_data = products.copy()
train_data = train_data.drop(train_data.index[test_index_list])

# Train index
train_index = pd.read_json('train-idx.json', orient='values')
train_index_list = [x[0] for x in train_index.values]
test_data = products.copy()
test_data = test_data.drop(test_data.index[train_index_list])

### Remove all the NaN before carrying out training

In [None]:
train_data.dropna(subset = ["name"], inplace=True)
train_data.isnull().values.sum()
test_data.dropna(subset = ["name"], inplace=True)
test_data.isnull().values.sum()
print("Train set: " + str(train_data.shape))
print("Test set: " + str(test_data.shape))

## Dictionary of word counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(token_pattern = r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
print("Train matrix: " + str(train_matrix.shape))
print("Test matrix: " + str(test_matrix.shape))

## Logistic Regression Model

In [9]:
from sklearn.linear_model import LogisticRegression
sentiment_model = LogisticRegression(random_state=0).fit(train_matrix, train_data['sentiment'])
zero_elem = (sentiment_model.coef_ >= 0).sum()
less_elem = (sentiment_model.coef_ < 0).sum()
total_val = zero_elem + less_elem
print("Co-efficients with value more than 0: " + str(zero_elem) + " out of " + str(total_val))

Co-efficients with value more than 0: 89423 out of 121539


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
sample_test_data = test_data[10:13]
sample_test_data

Unnamed: 0,name,review,rating,review_clean,sentiment
59,Our Baby Girl Memory Book,Absolutely love it and all of the Scripture in...,5,Absolutely love it and all of the Scripture in...,1
71,Wall Decor Removable Decal Sticker - Colorful ...,Would not purchase again or recommend. The dec...,2,Would not purchase again or recommend The deca...,-1
91,New Style Trailing Cherry Blossom Tree Decal R...,Was so excited to get this product for my baby...,1,Was so excited to get this product for my baby...,-1


In [11]:
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
scores_sample = sentiment_model.decision_function(sample_test_matrix)
print(scores_sample)

[  5.16442713  -3.09477187 -10.31064777]


### Score and Probability

In [14]:
def check_score(input_array):
    result = []
    for x in input_array:
        if x >= 0:
            result.append(1)
        else:
            result.append(-1)
    return result
print(check_score(scores_sample))
prob_val = sentiment_model.predict_proba(sample_test_matrix)
print(prob_val[:,1])

[1, -1, -1]
[9.94316154e-01 4.33234261e-02 3.32757702e-05]


## Full data sentiment analysis

In [15]:
#predictions = sentiment_model.predict(test_matrix)
final_scores_data = vectorizer.transform(test_data['review_clean'])
final_score = sentiment_model.decision_function(final_scores_data)
df = pd.DataFrame(final_score, columns = ['Score'])
df_test = test_data
df_test.reset_index(drop = True, inplace = True)
df2 = pd.concat([df, df_test], axis = 1)
df_export = df2.sort_values(by='Score', ascending=False)
df_export.head(3)

Unnamed: 0,Score,name,review,rating,review_clean,sentiment
18081,50.921076,"Infantino Wrap and Tie Baby Carrier, Black Blu...",I bought this carrier when my daughter was abo...,5,I bought this carrier when my daughter was abo...,1
15702,45.972176,Baby Einstein Around The World Discovery Center,I am so HAPPY I brought this item for my 7 mon...,5,I am so HAPPY I brought this item for my 7 mon...,1
30582,44.630419,Graco FastAction Fold Jogger Click Connect Str...,Graco's FastAction Jogging Stroller definitely...,5,Gracos FastAction Jogging Stroller definitely ...,1


## Export data (Top & Bot 20)

In [16]:
df_top = df_export.head(20)
df_top.to_csv('top20.csv')
df_bottom = df_export.tail(20)
df_bottom.to_csv('bottom20.csv')

## Special Classifier

In [17]:
significant_words = ['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves', 
      'well', 'able', 'car', 'broke', 'less', 'even', 'waste', 'disappointed', 
      'work', 'product', 'money', 'would', 'return']
vectorizer_word_subset = CountVectorizer(vocabulary = significant_words) # limit to 20 words
train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])
print("Train matrix special: " + str(train_matrix_word_subset.shape))
print("Test matrix special: " + str(test_matrix_word_subset.shape))

Train matrix special: (133174, 20)
Test matrix special: (33282, 20)


## Simple Log-Reg model

In [18]:
simple_model = LogisticRegression(random_state=0).fit(train_matrix_word_subset, train_data['sentiment'])
zero_elem_simple = (simple_model.coef_ > 0).sum()
print(zero_elem_simple)

10


### Co-efficient dictionary for significant words

In [19]:
arr1 = simple_model.coef_
arr2 = sentiment_model.coef_
arr1 = arr1.reshape(-1, 1)
arr2 = arr2.reshape(-1, 1)
list1 = arr1.tolist()
list2 = arr2.tolist()
vectorizer.get_feature_names_out()

# Check the coefficients for 20 words used in model
output_dict = dict(zip(significant_words, list1))
setiment_dict = dict(zip(vectorizer.get_feature_names_out(), list2))
#print(setiment_dict)

## Compare two dictionaries

In [20]:
def compare_ndic(src, dest):
    result = []
    for skey, sval in src.items():
        for nkey, nval in dest.items():
            if skey == nkey:
                result.append([skey, sval, nval])
    return result

new_list = compare_ndic(output_dict, setiment_dict)
df_dict = pd.DataFrame(new_list, columns=['Word', 'Small', 'Sentiment'])
df_dict.head(20)

Unnamed: 0,Word,Small,Sentiment
0,love,[1.3646982275578259],[1.5673927755779458]
1,great,[0.9448579595373243],[1.2492114551129934]
2,easy,[1.1885937648567766],[1.3691891460112817]
3,old,[0.08474509132707345],[0.023880962694272163]
4,little,[0.5188840456189335],[0.5181713454106159]
5,perfect,[1.5090928051705035],[1.9868198542683997]
6,loves,[1.6736521477287714],[1.6974621858567691]
7,well,[0.5038990477325531],[0.4816978993253936]
8,able,[0.1906930998941367],[0.3236821086715096]
9,car,[0.05888698138807688],[0.10270968120582619]


# Compare `train` & `test` accuracy

In [21]:
accuracy_sentiment_model = sentiment_model.score(final_scores_data, test_data['sentiment'].values)
print("Sentiment model accuracy TRAINING data:\n" + str(accuracy_sentiment_model))

accuracy_simple_model = simple_model.score(test_matrix_word_subset, test_data['sentiment'].values)
print("Simple model accuracy TRAINING data:\n" + str(accuracy_simple_model))

Sentiment model accuracy TRAINING data:
0.9324559822126074
Simple model accuracy TRAINING data:
0.8692987200288445


In [22]:
test_data['sentiment'].value_counts()

 1    28048
-1     5234
Name: sentiment, dtype: int64

In [23]:
print("Majority classifier accuracy should be:\n" + str(28048 /(28048 + 5234)))

Majority classifier accuracy should be:
0.8427378162370049
