# Amazon Product Review Analysis


## Data

The dataset was downloaded from Kaggle: https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products

<h6 align="right"><a href = "https://xuelin-amy.github.io/playground/">Back to Home</a></h6>

In [19]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
%matplotlib inline

pd.__version__

ImportError: dlopen(/Users/xuelin/anaconda3/lib/python3.6/site-packages/scipy/spatial/qhull.cpython-36m-darwin.so, 2): Library not loaded: @rpath/libopenblas.dylib
  Referenced from: /Users/xuelin/anaconda3/lib/python3.6/site-packages/scipy/spatial/qhull.cpython-36m-darwin.so
  Reason: image not found

In [None]:
# Read the full review export into a DataFrame. `low_memory=False` asks pandas
# to parse the file in one pass rather than in chunks, which avoids
# mixed-dtype inference on wide CSVs.
data = pd.read_csv(
    'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv',
    low_memory=False,
)
# Show the first rows as a quick sanity check of the load.
data.head()

## Exploratory data analysis

### Statistical summary of reviews

In [None]:
# Derive the per-review features used by the exploratory cells that follow.
# NOTE(review): the time features are built from `dateAdded`; confirm that
# this is the timestamp the analysis is meant to use.
data['dateAdded'] = pd.to_datetime(data['dateAdded'])

# `.str.len()` yields NaN for a missing review text, whereas `map(len)` raises
# TypeError as soon as it meets a NaN (a float has no len()).
data['reviews.len'] = data['reviews.text'].str.len()

# String-formatted time buckets: hour of day, year-month, abbreviated weekday.
data['hour'] = data['dateAdded'].dt.strftime('%H')
data['ym'] = data['dateAdded'].dt.strftime('%Y-%m')
data['dow'] = data['dateAdded'].dt.strftime('%a')

In [None]:
# Month-by-month overview: distinct product names, distinct review texts, and
# the mean review length / mean rating, each rounded to two decimals.
(
    data
    .groupby('ym')
    .agg(
        num_product=('name', pd.Series.nunique),
        num_review=('reviews.text', pd.Series.nunique),
        avg_review_len=('reviews.len', lambda col: np.round(np.mean(col), 2)),
        avg_rating=('reviews.rating', lambda col: np.round(np.mean(col), 2)),
    )
    .reset_index()
)

In [None]:
# Same summary as the monthly table, grouped by primary category instead.
# The result is kept in `res` because the chart cell below reads it.
res = (
    data
    .groupby('primaryCategories')
    .agg(
        num_product=('name', pd.Series.nunique),
        num_review=('reviews.text', pd.Series.nunique),
        avg_review_len=('reviews.len', lambda col: np.round(np.mean(col), 2)),
        avg_rating=('reviews.rating', lambda col: np.round(np.mean(col), 2)),
    )
    .reset_index()
)
res

In [None]:
import altair as alt

# Reshape the per-category summary to long form (one row per category/metric)
# and draw one bar panel per metric. Each panel gets its own x scale because
# the metrics live on very different ranges.
(
    alt.Chart(
        pd.melt(res, id_vars='primaryCategories', var_name='variable', value_name='value')
    )
    .mark_bar()
    .encode(
        x='value:Q',
        y='primaryCategories:N',
        color='variable:N',
        column='variable:N',
        tooltip=['primaryCategories', 'variable', 'value'],
    )
    .resolve_scale(x='independent')
    .properties(width=200)
)

In [None]:
# How does review length vary with the star rating? One box per rating.
data.boxplot(column='reviews.len', by='reviews.rating')

In [None]:
# Review length distribution for each hour-of-day bucket.
data.boxplot(column='reviews.len', by='hour')

In [None]:
# Review length distribution for each day of the week.
data.boxplot(column='reviews.len', by='dow')

In [None]:
# Eyeball a few randomly drawn reviews (rating, author, title, full text).
# `random_reviews` is reused by the sentiment-analysis section further down.
random_reviews = data.sample(n=5)

for i, (stars, user, title, body) in enumerate(zip(
        random_reviews['reviews.rating'],
        random_reviews['reviews.username'],
        random_reviews['reviews.title'],
        random_reviews['reviews.text'])):
    print('Review #{} ({} stars) by {}'.format(i, stars, user))
    print(title)
    print(body)
    print('\n')

### Word cloud by category

In [None]:
from wordcloud import WordCloud

# One word cloud per primary category. The grid is sized from the data rather
# than fixed at 2x2, so the cell still works when the number of categories is
# not exactly four. Missing categories are skipped.
categories = data['primaryCategories'].dropna().unique()
n_cols = 2
n_rows = max(1, -(-len(categories) // n_cols))  # ceiling division

# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig, axs = plt.subplots(n_rows, n_cols, squeeze=False)
fig.set_size_inches(12, 3 * n_rows)
panels = axs.ravel()  # row-major order: left to right, then top to bottom

for ax, cate in zip(panels, categories):
    # Drop missing review texts so the join only ever sees strings.
    reviews = data.loc[data['primaryCategories'] == cate, 'reviews.text'].dropna()
    text = '\n'.join(reviews.astype(str))
    wordcloud = WordCloud(background_color='white').generate(text)
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.set_title(cate)
    ax.axis('off')

# Blank out any panels left over when the grid has more cells than categories.
for ax in panels[len(categories):]:
    ax.axis('off')

fig.tight_layout()
plt.show()

## Sentiment Analysis

In [None]:
# Create the VADER analyzer once; the cells below reuse `analyzer`.
analyzer = SentimentIntensityAnalyzer()

# Demonstrate the scorer on the first of the previously sampled reviews:
# print the raw text, then display the polarity scores the analyzer returns.
text = random_reviews['reviews.text'].iat[0]
print(text)
analyzer.polarity_scores(text)

In [None]:
# Draw three 5-star and three 1-star reviews and attach the VADER compound
# score to each, to compare the score against clearly positive/negative ratings.
rating = data['reviews.rating']
pos_reviews = data[rating == 5].sample(n=3)
neg_reviews = data[rating == 1].sample(n=3)
random_reviews = pd.concat([pos_reviews, neg_reviews])

scores = random_reviews['reviews.text'].apply(
    lambda review: analyzer.polarity_scores(review)['compound']
)
random_reviews['score'] = scores

In [None]:
# Print each sampled review together with its rating and sentiment score.
for i, (stars, user, title, body, score) in enumerate(zip(
        random_reviews['reviews.rating'],
        random_reviews['reviews.username'],
        random_reviews['reviews.title'],
        random_reviews['reviews.text'],
        random_reviews['score'])):
    print('Review #{} ({} stars) by {}'.format(i, stars, user))
    print(title)
    print('{} (sentiment score: {:0.2f})'.format(body, score))
    print('\n')

In [None]:
# Compound VADER score for every review in the dataset.
# `na_action='ignore'` leaves a missing review text as NaN in the result
# instead of handing NaN to the analyzer, which is only ever called with
# strings elsewhere in this notebook.
scores = data['reviews.text'].map(
    lambda review: analyzer.polarity_scores(review)['compound'],
    na_action='ignore',
)
data['score'] = scores

In [None]:
# sentiment score distribution, one histogram per primary category

# Size the row of panels from the data instead of assuming exactly four
# categories; missing categories are skipped.
categories = data['primaryCategories'].dropna().unique()
n_panels = max(1, len(categories))

# squeeze=False keeps `axs` two-dimensional even with a single panel.
fig, axs = plt.subplots(ncols=n_panels, sharey=True, sharex=True, squeeze=False)
fig.set_size_inches(3 * n_panels, 3)
for ax, cate in zip(axs[0], categories):
    # Keep missing scores out of the histogram input.
    ax.hist(data.loc[data['primaryCategories'] == cate, 'score'].dropna().values)
    ax.set_title(cate)
fig.tight_layout()
plt.show()

In [None]:
# One point per review: sentiment score (y) against star rating (x).
sns.stripplot(data=data, x='reviews.rating', y='score')

In [None]:
# Sentiment score versus star rating as a plain scatter plot.

data.plot.scatter(x='reviews.rating', y='score')

In [None]:
# Category x brand table. Each cell shows "<review count> | <mean sentiment>"
# for that (category, brand) pair.
(
    data
    .groupby(['primaryCategories', 'brand'])
    .agg(score_avg=pd.NamedAgg('score', 'mean'),
         n=pd.NamedAgg('reviews.text', 'count'))
    .assign(
        # `assign` evaluates keywords in order, so `value` sees the rounded mean.
        score_avg=lambda df: df['score_avg'].round(2),
        value=lambda df: [
            '{:0.0f} | {:0.2f}'.format(n, avg)
            for n, avg in zip(df['n'], df['score_avg'])
        ],
    )
    .reset_index()
    # Keyword arguments: positional arguments to `pivot` are deprecated in
    # pandas 1.5 and rejected by pandas 2.0.
    .pivot(index='primaryCategories', columns='brand', values='value')
)

## Topic Modelling

In [None]:
from gensim.test.utils import common_corpus, common_dictionary