# Sentiment Analysis - Amazon baby data

## Load the data

In [3]:
import sidetable

In [4]:
import pandas as pd
# Location of the Amazon baby reviews CSV. Edit this single constant to run the notebook
# on another machine (an earlier revision read "D:/SYED/data/amazon_baby.csv" instead).
AMAZON_BABY_CSV = "D:/Docs/amazon_baby.csv"

# Load the raw reviews and show column names, non-null counts and dtypes.
products = pd.read_csv(AMAZON_BABY_CSV)
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183531 entries, 0 to 183530
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    183213 non-null  object
 1   review  182702 non-null  object
 2   rating  183531 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 4.2+ MB


## Frequency and missing-value summaries from sidetable

In [5]:
# sidetable's `stb` accessor: frequency table of product names
# (count, percent, cumulative count and cumulative percent per name).
products.stb.freq(["name"])

Unnamed: 0,name,count,percent,cumulative_count,cumulative_percent
0,Vulli Sophie the Giraffe Teether,785,0.428463,785,0.428463
1,"Simple Wishes Hands-Free Breastpump Bra, Pink,...",562,0.306747,1347,0.735210
2,Infant Optics DXR-5 2.4 GHz Digital Video Baby...,561,0.306201,1908,1.041411
3,Baby Einstein Take Along Tunes,547,0.298560,2455,1.339970
4,"Cloud b Twilight Constellation Night Light, Tu...",520,0.283823,2975,1.623793
...,...,...,...,...,...
32412,&quot;B&quot; Is for Babies (And Booties!) 201...,1,0.000546,183209,99.997817
32413,&quot;A&quot; Soccer Ball Alphabet Letter Name...,1,0.000546,183210,99.998363
32414,#88 Turquoise polka dot baby leg warmers for b...,1,0.000546,183211,99.998908
32415,#55 Red &amp; blue stars patriotic leg warmers...,1,0.000546,183212,99.999454


In [7]:
# sidetable summary of missing values per column (missing, total, percent).
products.stb.missing()

Unnamed: 0,missing,total,percent
name,318,183531,0.173268
review,0,183531,0.0
rating,0,183531,0.0


## Fill the NaN values in the review column with empty strings

In [6]:
# Replace missing review text with an empty string so that string operations
# can be applied to every row.
products = products.fillna(value={'review': ''})
# Sanity check: evaluates to False once no missing reviews remain.
products['review'].isna().to_numpy().any()

False

## Remove punctuation marks

In [None]:
import string
# Build the punctuation-deleting translation table once (the original rebuilt it for
# every review inside the lambda) and apply it through the vectorised string accessor.
punctuation_table = str.maketrans('', '', string.punctuation)
products['review_clean'] = products['review'].str.translate(punctuation_table)
products.head(3)

### Remove rating of 3

In [None]:
# Drop the reviews rated exactly 3. `.copy()` makes the filtered result an independent
# frame, so adding columns to `products` afterwards does not raise pandas'
# SettingWithCopyWarning or write into a temporary slice.
products = products[products['rating'] != 3].copy()
products.head(3)

### Ratings above 3 are labelled +1, all others -1

In [None]:
# Binary label derived from the star rating: +1 when the rating is above 3, -1 otherwise.
products['sentiment'] = products['rating'].map(lambda stars: 1 if stars > 3 else -1)
# Show the raw review text stored under index label 27401.
products.loc[27401, 'review']

## Load the data as test and train
* `train-idx.json` for Training data
* `test-idx.json` for Test data

In [None]:
# Read the two index files; column 0 of each frame supplies the row labels for one split.
train_data_indices = pd.read_json("D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/train-idx.json")
test_data_indices = pd.read_json("D:/repos/CourseraPlus/1_ML_Combined_Courses/Course/Week-2/test-idx.json")

# NOTE(review): pd.DataFrame(frame, index=...) re-indexes `products` by LABEL. Any label
# that is absent from `products` (e.g. a row removed by an earlier filter) yields an
# all-NaN row here. If the JSON files hold row POSITIONS rather than labels, the rows
# selected are not the intended ones — confirm against the source of the index files.
train_data = pd.DataFrame(products, index = train_data_indices[0])
test_data = pd.DataFrame(products, index = test_data_indices[0])

print(f"Train set: {train_data.shape}")
print(f"Test set: {test_data.shape}")

### Remove rows with a missing product name before training

In [None]:
# Keep only the rows that have a product name. Re-assigning instead of using
# inplace=True keeps the step explicit and free of hidden in-place mutation.
train_data = train_data.dropna(subset = ["name"])
test_data = test_data.dropna(subset = ["name"])

# The original evaluated these null counts mid-cell, where a notebook discards the
# value; print them so the check is actually visible.
print("Train nulls remaining: " + str(train_data.isnull().values.sum()))
print("Test nulls remaining: " + str(test_data.isnull().values.sum()))
print("Train set: " + str(train_data.shape))
print("Test set: " + str(test_data.shape))

## Dictionary of word counts

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counter. The token pattern accepts any run of word characters,
# so single-character tokens are kept as well.
vectorizer = CountVectorizer(token_pattern = r'\b\w+\b')

# Learn the vocabulary from the training reviews only, then reuse it for the test reviews.
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])

print(f"Train matrix: {train_matrix.shape}")
print(f"Test matrix: {test_matrix.shape}")

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the full bag-of-words training matrix.
sentiment_model = LogisticRegression(random_state=0)
sentiment_model.fit(train_matrix, train_data['sentiment'])

# Split the learned coefficients by sign. Despite its name, `zero_elem` counts the
# strictly positive coefficients; `less_elem` counts those that are zero or negative.
coefficients = sentiment_model.coef_
zero_elem = (coefficients > 0).sum()
less_elem = (coefficients <= 0).sum()
total_val = zero_elem + less_elem
print(f"Co-efficients with value more than 0: {zero_elem} out of {total_val}")

In [None]:
# Take the rows at positions 45..49 of the test set as a small sample.
sample_test_data = test_data.iloc[45:50]
# Show the cleaned review stored under index label 235 within that sample.
sample_test_data.loc[235, 'review_clean']

In [None]:
# Vectorise the sample reviews with the vocabulary already held by `vectorizer`.
sample_test_matrix = vectorizer.transform(sample_test_data['review_clean'])
# decision_function gives the model's signed score for each sample review.
scores_sample = sentiment_model.decision_function(sample_test_matrix)
print(scores_sample)

### Mapping function based on scores

In [None]:
def check_score(input_array):
    """Map each score to a class label.

    Args:
        input_array: iterable of numeric scores.

    Returns:
        A list with 1 for every score that is >= 0 and -1 for every other score,
        in the same order as the input.
    """
    return [1 if score >= 0 else -1 for score in input_array]
# Convert the sample decision scores into +1 / -1 labels.
check_score(scores_sample)

### Check probabilities

In [None]:
# Per-class probabilities for the sample reviews (one column per class).
prob_val = sentiment_model.predict_proba(sample_test_matrix)
# Column 1 is the second class — presumably the +1 label, since the labels are -1/+1;
# verify against sentiment_model.classes_.
print(prob_val[:,1])

## Full data sentiment analysis

In [None]:
# Score every test review with the trained model. (Hard class labels could be obtained
# instead with: predictions = sentiment_model.predict(test_matrix).)
final_scores_data = vectorizer.transform(test_data['review_clean'])
final_score = sentiment_model.decision_function(final_scores_data)

# One-column frame of decision scores, row-aligned with the test reviews by position.
df = pd.DataFrame({'Score': final_score})
print(df.shape)
print(df.columns)

In [None]:
# Positional copy of the test set. `reset_index` returns a NEW frame here: the original
# code aliased `test_data` and reset its index in place, which silently rewrote
# `test_data` itself and broke label-based lookups on it when cells were re-run.
df_test = test_data.reset_index(drop = True)

# Put each score next to its review (both frames now share a 0..n-1 index),
# then order from the most positive score to the most negative.
df2 = pd.concat([df, df_test], axis = 1)
df_export = df2.sort_values(by='Score', ascending=False)
df_export.head(5)

## Export data (Top & Bottom 20)

In [None]:
# `df_export` is sorted by descending score, so the first 20 rows are the highest-scoring
# reviews and the last 20 rows the lowest-scoring ones.
df_top = df_export.head(20)
df_bottom = df_export.tail(20)

# Write each selection to its own CSV file in the working directory.
df_top.to_csv('top20.csv')
df_bottom.to_csv('bottom20.csv')

## Calculate Accuracy - Sentiment Model

In [None]:
# Confirm that the score frame and the test set have matching row counts.
print(f"Scores dataframe shape: {df.shape}")
print(f"Test data sentiment as reference shape: {test_data.shape}")

# Mean accuracy of the full model on the vectorised test reviews.
test_labels = test_data['sentiment'].values
accuracy_sentiment_model = sentiment_model.score(final_scores_data, test_labels)
print(f"Sentiment model accuracy:\n{accuracy_sentiment_model}")

## Special Classifier

In [None]:
# Hand-picked vocabulary (20 words) used as the only features of the simple classifier.
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [None]:
# Restrict the bag-of-words features to the hand-picked words in `significant_words`
# (one column per word, in list order).
vectorizer_word_subset = CountVectorizer(vocabulary = significant_words)

train_matrix_word_subset = vectorizer_word_subset.fit_transform(train_data['review_clean'])
test_matrix_word_subset = vectorizer_word_subset.transform(test_data['review_clean'])

print(f"Train matrix special: {train_matrix_word_subset.shape}")
print(f"Test matrix special: {test_matrix_word_subset.shape}")

## Simple Log-Reg model

In [None]:
# Fit a second logistic-regression model that only sees the significant-word counts.
simple_model = LogisticRegression(random_state=0)
simple_model.fit(train_matrix_word_subset, train_data['sentiment'])

# Number of strictly positive coefficients in the simple model.
zero_elem_simple = (simple_model.coef_ > 0).sum()
print(zero_elem_simple)

In [None]:
# Column-vector views of each model's coefficients (one row per feature).
arr1 = simple_model.coef_.reshape(-1, 1)
arr2 = sentiment_model.coef_.reshape(-1, 1)

# Flatten before converting so each entry is a plain float. Calling `.tolist()` on the
# (n, 1) arrays produced one-element lists such as [0.42], which then leaked into the
# coefficient dictionaries and the comparison table.
list1 = arr1.ravel().tolist()
list2 = arr2.ravel().tolist()
len(list2)

In [None]:
# Vocabulary terms held by `vectorizer`, one per feature column.
vectorizer.get_feature_names_out()

### Coefficient dictionary for significant words

In [None]:
# Word -> coefficient lookup for the 20-word simple model.
output_dict = {word: coef for word, coef in zip(significant_words, list1)}
# Word -> coefficient lookup for the full-vocabulary sentiment model.
setiment_dict = {
    word: coef
    for word, coef in zip(vectorizer.get_feature_names_out(), list2)
}
# (Printing setiment_dict is left out: it has one entry per vocabulary word.)

## Compare two dictionaries

In [None]:
def compare_ndic(src, dest):
    """Collect the entries whose key is present in both dictionaries.

    Args:
        src: dictionary whose keys drive the comparison and the output order.
        dest: dictionary searched for the same keys.

    Returns:
        A list of ``[key, src_value, dest_value]`` triples, one for every key of
        ``src`` that also exists in ``dest``, in ``src`` iteration order.
    """
    # Direct key lookup replaces the original scan of every (src, dest) pair, which
    # cost len(src) * len(dest) comparisons for the same result.
    return [[key, value, dest[key]] for key, value in src.items() if key in dest]

# Pair each significant word's coefficient in the simple model with the coefficient
# the full sentiment model holds for the same word, and show them side by side.
new_list = compare_ndic(output_dict, setiment_dict)
df_dict = pd.DataFrame(new_list, columns=['Word', 'Small', 'Sentiment'])
df_dict.head(20)

# Compare `train` & `test` accuracy

In [None]:
# Accuracy on the TRAINING split (the data each model was fitted on).
train_accuracy_sentiment_model = sentiment_model.score(train_matrix, train_data['sentiment'].values)
print("Sentiment model accuracy TRAINING data:\n" + str(train_accuracy_sentiment_model))

train_accuracy_simple_model = simple_model.score(train_matrix_word_subset, train_data['sentiment'].values)
print("Simple model accuracy TRAINING data:\n" + str(train_accuracy_simple_model))

# Accuracy on the TEST split. The original printed these two numbers under a
# "TRAINING data" label even though they are computed from the test matrices.
accuracy_sentiment_model = sentiment_model.score(final_scores_data, test_data['sentiment'].values)
print("Sentiment model accuracy TEST data:\n" + str(accuracy_sentiment_model))

accuracy_simple_model = simple_model.score(test_matrix_word_subset, test_data['sentiment'].values)
print("Simple model accuracy TEST data:\n" + str(accuracy_simple_model))

In [None]:
# Class balance: number of reviews carrying each sentiment label.
products['sentiment'].value_counts()

In [None]:
# Majority-class baseline: the accuracy obtained by always predicting the most frequent
# label. It is derived from `products` rather than from hand-copied counts
# (previously 140259 and 26493), so it stays correct if the data changes.
sentiment_counts = products['sentiment'].value_counts()
majority_accuracy = float(sentiment_counts.max() / sentiment_counts.sum())
print("Majority classifier accuracy should be:\n" + str(majority_accuracy))