In [1]:
import re
import csv
import spacy
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime as dt
pd.options.display.max_columns = None
from reduce_memory_df import optimize_mem_usage

nlp = spacy.load('en_core_web_sm')

In [2]:
# Load the raw Amazon grocery reviews from the tab-separated dump. QUOTE_NONE makes the
# parser treat quote characters as ordinary text rather than as field quoting.
df = pd.read_csv('data/amazon_reviews_us_Grocery_v1_00.tsv', sep='\t', quoting=csv.QUOTE_NONE)

In [4]:
# Column dtypes with explicit non-null counts, to spot columns that contain missing values.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2402458 entries, 0 to 2402457
Data columns (total 15 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   marketplace        2402458 non-null  object
 1   customer_id        2402458 non-null  int64 
 2   review_id          2402458 non-null  object
 3   product_id         2402458 non-null  object
 4   product_parent     2402458 non-null  int64 
 5   product_title      2402458 non-null  object
 6   product_category   2402458 non-null  object
 7   star_rating        2402458 non-null  int64 
 8   helpful_votes      2402458 non-null  int64 
 9   total_votes        2402458 non-null  int64 
 10  vine               2402458 non-null  object
 11  verified_purchase  2402458 non-null  object
 12  review_headline    2402447 non-null  object
 13  review_body        2402393 non-null  object
 14  review_date        2402458 non-null  object
dtypes: int64(5), object(10)
memory usage: 274.9+ MB


In [5]:
# Number of distinct values in each column.
df.nunique()

marketplace                1
customer_id          1363986
review_id            2402458
product_id            305512
product_parent        268150
product_title         273649
product_category           1
star_rating                5
helpful_votes            564
total_votes              603
vine                       2
verified_purchase          2
review_headline      1273804
review_body          2226078
review_date             4404
dtype: int64

### Inspect the "not a number" or NaN values (aka null values)

In [10]:
# Select the rows with a missing review body once and reuse the selection,
# instead of building the mask and filtering the full frame twice.
missing_body = df[df.review_body.isna()]
print(missing_body.shape)
missing_body.head()

(65, 15)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
4451,US,14526893,R1H6DCLMHV382W,B005DN3DC6,292356201,Iaso Tea,Grocery,5,0,3,N,N,You Can Do It!,,2015-08-30
8099,US,14534983,R110WTSH3MYZGW,B0080IIKM4,467236352,"Quaker Plain Unsalted Rice Cake, 4.47 oz, 3 pk",Grocery,1,2,3,N,Y,One Star,,2015-08-29
10946,US,18074592,R2QAQ5GE1TM8LV,B00XSAJXVQ,22560023,Matcha Green Tea Powder,Grocery,4,0,0,N,Y,Four Stars,,2015-08-28
27412,US,2226545,R2J0MZ19BSXZK0,B00991O4NA,992865454,Single Source Party Supply - Sock Monkey Edibl...,Grocery,5,2,2,N,Y,Five Stars,,2015-08-22
37969,US,47506113,R1EK1A4EAP06W6,B00991QGOA,272125398,Single Source Party Supply - Willy Wonka Edibl...,Grocery,5,0,0,N,Y,Five Stars,,2015-08-18


In [11]:
# Select the rows with a missing review headline once and reuse the selection,
# instead of building the mask and filtering the full frame twice.
missing_headline = df[df.review_headline.isna()]
print(missing_headline.shape)
missing_headline.head()

(11, 15)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
181503,US,1003646,R227ZKLKQEM2X,B00I08J6S6,760957009,Tully's Coffee French Roast K-cup for Keurig B...,Grocery,5,0,0,N,Y,,Although the product once delivered is always ...,2015-06-29
345827,US,50649712,R37IE7ME43YZA5,B002UQSSHO,154376421,N'JOY Coffee Creamer - 8/16oz Canisters,Grocery,4,0,0,N,Y,,product itself is fine. however the packing le...,2015-05-03
489543,US,51045315,RTUNWNWU730DE,B00DWHNTJM,880397874,Tassimo Petit Dej 16 Pods,Grocery,5,0,0,N,Y,,What can I say - it's coffee!,2015-03-16
511846,US,13305115,R3NWQMCPTWCH9J,B000F4DKB2,280207419,Twinings,Grocery,3,0,0,N,Y,,"I didn't realize this was decaf, but, the flav...",2015-03-10
567751,US,35536171,R362OV7ATOGIO7,B008EM0JMU,489382572,Hidden Springs Maple Vermont Maple Syrup,Grocery,4,0,0,N,Y,,good syrup,2015-02-22


### Fill NaN with "none" token

In [12]:
# Replace every missing value in the frame with the literal placeholder token 'none'.
df = df.fillna(value='none')

In [3]:
# Persist the frame as parquet; the same path is read back in the next cell.
df.to_parquet('data/amazon_reviews_grocery.parquet')

In [2]:
# Reload the frame from the parquet file written by the previous cell.
df = pd.read_parquet('data/amazon_reviews_grocery.parquet')

In [3]:
# Remove the two columns that df.nunique() reported as holding a single distinct value.
constant_columns = ['marketplace', 'product_category']
df = df.drop(columns=constant_columns)

In [4]:
# Project-local helper imported from reduce_memory_df. It is called without reassignment,
# and the df.info() output in the next cell shows downcast integer and category dtypes.
# NOTE(review): the helper's exact rules are not visible in this notebook — confirm in its source.
optimize_mem_usage(df)

Memory usage of dataframe is 2,053,696,394 bytes
Memory usage after optimization is: 1,327,983,592 bytes
Decreased by -35.34%


In [5]:
# Check the column dtypes and memory footprint after the optimisation step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2402458 entries, 0 to 2402457
Data columns (total 13 columns):
 #   Column             Dtype   
---  ------             -----   
 0   customer_id        int32   
 1   review_id          category
 2   product_id         category
 3   product_parent     int32   
 4   product_title      category
 5   star_rating        int8    
 6   helpful_votes      int16   
 7   total_votes        int16   
 8   vine               category
 9   verified_purchase  category
 10  review_headline    category
 11  review_body        category
 12  review_date        category
dtypes: category(8), int16(2), int32(2), int8(1)
memory usage: 311.8 MB


In [6]:
# Distinct-value counts again, now on the reduced set of columns.
df.nunique()

customer_id          1363986
review_id            2402458
product_id            305512
product_parent        268150
product_title         273649
star_rating                5
helpful_votes            564
total_votes              603
vine                       2
verified_purchase          2
review_headline      1273804
review_body          2226078
review_date             4404
dtype: int64

In [6]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,42521656,R26MV8D0KG6QI6,B000SAQCWC,159713740,"The Cravings Place Chocolate Chunk Cookie Mix,...",5,0,0,N,Y,Using these for years - love them.,"As a family allergic to wheat, dairy, eggs, nu...",2015-08-31
1,12049833,R1OF8GP57AQ1A0,B00509LVIQ,138680402,"Mauna Loa Macadamias, 11 Ounce Packages",5,0,0,N,Y,Wonderful,"My favorite nut. Creamy, crunchy, salty, and ...",2015-08-31
2,107642,R3VDC1QB6MC4ZZ,B00KHXESLC,252021703,Organic Matcha Green Tea Powder - 100% Pure Ma...,5,0,0,N,N,Five Stars,This green tea tastes so good! My girlfriend l...,2015-08-31
3,6042304,R12FA3DCF8F9ER,B000F8JIIC,752728342,15oz Raspberry Lyons Designer Dessert Syrup Sauce,5,0,0,N,Y,Five Stars,I love Melissa's brand but this is a great sec...,2015-08-31
4,18123821,RTWHVNV6X4CNJ,B004ZWR9RQ,552138758,"Stride Spark Kinetic Fruit Sugar Free Gum, 14-...",5,0,0,N,Y,Five Stars,good,2015-08-31


In [24]:
# Bar chart: number of reviews for each star rating.
rating_counts = df['star_rating'].value_counts().to_frame()

# Grid styling shared by both axes.
axis_grid = dict(gridcolor='lightgray', showgrid=True, gridwidth=1)

fig = px.bar(rating_counts, title='Star Ratings Distribution')

# Green bars with a thin black outline.
fig.update_traces(
    marker=dict(
        color='#3f9c35',
        line=dict(width=1, color='black'),
    ),
)

fig.update_xaxes(title='Star Rating (1 to 5)', **axis_grid)
fig.update_yaxes(title='Number of Ratings', **axis_grid)

# Transparent backgrounds, Rockwell font throughout, centred title.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    hoverlabel=dict(bgcolor='ivory', font_size=16, font_family='Rockwell'),
    font=dict(family='Rockwell', color='navy', size=16),
    title_x=0.5,
)
fig.show()

In [66]:
# Donut chart: share of reviews for each star rating.
#
# rename_axis + reset_index(name=...) yields the columns 'star_rating' and 'count' no
# matter how the installed pandas names the value_counts() result. Renaming
# 'index' -> 'star_rating' and 'star_rating' -> 'count' only works on pandas < 2.0; on
# newer versions it produces two columns both called 'count'.
_df = (
    df.star_rating.value_counts()
    .rename_axis('star_rating')
    .reset_index(name='count')
)

# Turn absolute counts into fractions so the hover template can format them as percentages.
_df['count'] = _df['count'] / _df['count'].sum()

# The former labels='star_rating' argument is dropped: px.pie documents `labels` as a
# dict of column-name -> display-name, and the hover text is fully defined below anyway.
fig = px.pie(
    _df,
    values='count',
    names='star_rating',
    title='Star Ratings Distribution' + '<br><sup>hover over to see labels</sup>',
    hole=0.5,
    color_discrete_sequence=px.colors.qualitative.Dark24_r,
    width=500,
    height=500,
)

# Black slice outlines, label + percent text on each slice, custom two-line hover text.
fig.update_traces(
    marker_line_color='rgb(0,0,0)',
    textinfo='label+percent',
    hovertemplate='<br>'.join([
        'Star Rating:  %{label}',
        'Count:  %{value:,.2%}',
    ]),
    textfont_size=16,
    marker_line_width=2,
)

fig.update_layout(
    title_font_size=20,
    showlegend=False,
    hoverlabel=dict(
        font_size=16,
        font_family='Rockwell'),
    font=dict(
        family='Rockwell',
        color='navy',
        size=12),
    title_x=0.5,
)
fig.show()

In [16]:
# Keep only the reviews whose product title contains the Amazon-exclusive tag.
is_exclusive = df['product_title'].apply(lambda title_text: '[Amazon.com Exclusive]' in title_text)
amazon = df[is_exclusive]

In [17]:
# Print the product title of every Amazon-exclusive review, one per line.
for product_title in amazon['product_title']:
    print(product_title)

LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]
LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]
LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]
LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]
LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]
LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]
LU Erin Fetherston Designed, Creme Roulee Dark Chocolate European Style Rolled Wafers, 14.1-Ounce Canisters (Pack of 3) [Amazon.com Exclusive]

In [6]:
# Parse the review dates in one vectorised call and derive month / day / year through
# the .dt accessor, instead of running a Python-level function once per row.
# Going through object dtype keeps the cell re-runnable: to_datetime accepts both the
# original date strings and values that have already been parsed.
df['review_date'] = pd.to_datetime(df['review_date'].astype('O'))
df['review_month'] = df['review_date'].dt.month
df['review_day'] = df['review_date'].dt.day
df['review_year'] = df['review_date'].dt.year

In [8]:
# Total memory used by the frame in bytes (deep=True), formatted with thousands separators.
f'{df.memory_usage(deep=True).sum():,.0f}'

'1,399,630,128'

In [111]:
# Bar chart: number of reviews written in each calendar month (all years pooled).
#
# rename_axis + reset_index(name=...) yields the columns 'review_month' and 'count' no
# matter how the installed pandas names the value_counts() result. Renaming
# 'index' -> 'review_month' and 'review_month' -> 'count' only works on pandas < 2.0; on
# newer versions it produces two columns both called 'count'.
_df = (
    df.review_month.value_counts()
    .rename_axis('review_month')
    .reset_index(name='count')
)

fig = px.bar(
    _df,
    x='review_month',
    y='count',
    title='Quantity of Reviews By Month')

fig.update_xaxes(
    title='Review Month',
    gridcolor='lightgray',
    showgrid=True,
    gridwidth=1)

fig.update_yaxes(
    title='Number of Reviews',
    gridcolor='lightgray',
    showgrid=True,
    gridwidth=1)

# Transparent backgrounds, Rockwell font throughout, centred title, no legend.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title_font_size=20,
    showlegend=False,
    hoverlabel=dict(
        bgcolor='ivory',
        font_size=16,
        font_family='Rockwell'),
    font=dict(family='Rockwell',
              color='navy',
              size=14),
    title_x=0.5)

fig.show()

In [112]:
# Bar chart: number of reviews written on each day of the month (all months pooled).
#
# rename_axis + reset_index(name=...) yields the columns 'review_day' and 'count' no
# matter how the installed pandas names the value_counts() result. Renaming
# 'index' -> 'review_day' and 'review_day' -> 'count' only works on pandas < 2.0; on
# newer versions it produces two columns both called 'count'.
_df = (
    df.review_day.value_counts()
    .rename_axis('review_day')
    .reset_index(name='count')
)

fig = px.bar(
    _df,
    x='review_day',
    y='count',
    title='Quantity of Reviews By Day')

fig.update_xaxes(
    title='Review Day',
    gridcolor='lightgray',
    showgrid=True,
    gridwidth=1)

fig.update_yaxes(
    title='Number of Reviews',
    gridcolor='lightgray',
    showgrid=True,
    gridwidth=1)

# Transparent backgrounds, Rockwell font throughout, centred title, no legend.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    title_font_size=20,
    showlegend=False,
    hoverlabel=dict(
        bgcolor='ivory',
        font_size=16,
        font_family='Rockwell'),
    font=dict(family='Rockwell',
              color='navy',
              size=14),
    title_x=0.5)

fig.show()

In [129]:
# Line chart: number of reviews per calendar year.
#
# value_counts() orders its result by frequency, so the counts are re-sorted by year;
# otherwise the line connects the years in count order rather than chronologically.
# The existing review_year column is reused instead of re-deriving the year row by row,
# and the earlier bare value_counts() expression (computed, then discarded) is removed.
reviews_per_year = df.review_year.value_counts().sort_index()

fig = px.line(
    reviews_per_year,
    title='Number of Reviews Over Time')

fig.update_traces(mode='markers+lines')

fig.update_xaxes(title='Year',
                 gridcolor='lightgray',
                 showgrid=True,
                 gridwidth=1)

fig.update_yaxes(title='Number of Reviews',
                 gridcolor='lightgray',
                 showgrid=True,
                 gridwidth=1)

# Transparent backgrounds, Rockwell font throughout, centred title, no legend.
fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)',
                  title_font_size=20,
                  showlegend=False,
                  hoverlabel=dict(
                      bgcolor='ivory',
                      font_size=16,
                      font_family='Rockwell'),
                  font=dict(family='Rockwell',
                            color='navy',
                            size=14),
                  title_x=0.5)

fig.show()

In [26]:
# Rank the reviews by total votes received, highest first.
df.sort_values(by='total_votes', ascending=False)

Unnamed: 0,customer_id,review_id,product_id,product_parent,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
2344167,47364506,RXXPVOUH9NLL3,B00032G1S0,753469671,"Tuscan Dairy Whole Vitamin D Milk, Gallon, 128 oz",5,23755,24170,N,N,Make this your only stock and store,"Once upon a mid-day sunny, while I savored Nut...",2008-07-08
923600,29857793,R2DQNJRV27D3P0,B000EVOSE4,990977895,"Haribo Gummi Candy Gold-Bears,",5,11045,11901,N,N,I have seen the face of God.,I didn't feel the need to plan my weekend arou...,2014-10-22
2393030,49234072,RKT07YYORZMZE,B00032G1S0,753469671,"Tuscan Dairy Whole Vitamin D Milk, Gallon, 128 oz",3,9986,10304,N,N,"One Friday, Without the Milk",He always brought home milk on Friday. <br /> ...,2006-10-30
1756395,9286343,R3SC3RFGX29KPJ,B00012182G,554396858,Fresh Whole Rabbit,1,7886,8076,N,N,Not like Game of Thrones,Brad and I were very excited to order a few of...,2013-05-10
249057,809523,R2JGNJ5ZPJT4YC,B000EVOSE4,990977895,"Haribo Gummi Candy Gold-Bears,",1,6201,6900,N,N,"See you in hell, Haribo Sugar-Free Gummi Bears","It was my last class of the semester, and the ...",2015-06-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
920063,41928415,R48P5I6RBY7RK,B007Y59HVM,267956568,San Francisco Bay One Cup,5,0,0,N,Y,hal-price per k-cup!!!,"great product, great price, but use your own c...",2014-10-24
920062,10993703,R1JJ3QLVVHML01,B000FFLTD2,893081147,Dr. McDougall's Right Foods Vegan Pad Thai Noo...,5,0,0,N,N,Five Stars,The Pad Thai soup is delicious and so quick to...,2014-10-24
920059,17522687,R3Q6NR2WWXPKLZ,B00M2OGS08,715217277,Surge Citrus Flavored Soda 16fl oz. 12 cans,5,0,0,N,Y,Five Stars,It was excellent just like I remembered,2014-10-24
920057,43724923,RJFN6IAQ1MNEF,B008MIGHFE,854138558,Health Warrior Chia Bars,4,0,0,N,Y,Good,Tasty bars. Good price,2014-10-24


In [24]:
# Full body text of the review ranked fifth by helpful votes (zero-based position 4).
df.sort_values(by='helpful_votes', ascending=False).review_body.iloc[4]

"It was my last class of the semester, and the final exam was worth 30% of our grade.<br />After a late night study session I felt confident, but I had to decide between sleeping in or cooking breakfast. My eyelids chose sleep.<br />My stomach later regretted this decision, and after several uncomfortable stomach growls, I finally decided to make a quick stop by the campus bookstore and grab a snack before my test. Since the semester was ending and everyone was going home for the summer, a lot of items were on sale, including the snacks and candy that they kept up front. Being in the hungry state that I was in, it felt only logical to pick the largest, yet least expensive candy in order to get more bang for my buck.<br />And there they sat: two bags of Haribo Sugar-Free Gummi Bears, buy one get one free.<br />&#34;What a deal!&#34; I thought naïvely. I would eat one bag before my test, and one bag afterwards.<br />As I walked to class, I gleefully chewed on those abominable little bast

In [127]:
# Helpful-vote count of that same fifth-ranked review.
df.sort_values(by='helpful_votes', ascending=False).helpful_votes.iloc[4]

6201

In [128]:
# Every column of the review ranked fifth by helpful votes.
df.sort_values(by='helpful_votes', ascending=False).iloc[4]

customer_id                                                     809523
review_id                                               R2JGNJ5ZPJT4YC
product_id                                                  B000EVOSE4
product_parent                                               990977895
product_title                           Haribo Gummi Candy Gold-Bears,
star_rating                                                          1
helpful_votes                                                     6201
total_votes                                                       6900
vine                                                                 N
verified_purchase                                                    N
review_headline         See you in hell, Haribo Sugar-Free Gummi Bears
review_body          It was my last class of the semester, and the ...
review_date                                        2015-06-05 00:00:00
review_month                                                         6
review

In [155]:
# One line per year: number of reviews on each day of the month.
groups = df.groupby('review_year')

fig = px.scatter()

for year, year_df in groups:
    # value_counts() orders by frequency. Sort by day and pass the day explicitly as x,
    # otherwise the counts are plotted against their rank position instead of the day.
    reviews_per_day = year_df['review_day'].value_counts().sort_index()
    fig.add_trace(
        go.Scatter(
            x=reviews_per_day.index,
            y=reviews_per_day.values,
            name=str(year),
            showlegend=True,
        )
    )

fig.update_traces(
    mode='markers+lines',
    hovertemplate='Number of Reviews: %{y}')

fig.update_xaxes(title='Day',
                 gridcolor='lightgray',
                 showgrid=True,
                 gridwidth=1)

fig.update_yaxes(title='Number of Reviews',
                 gridcolor='lightgray',
                 showgrid=True,
                 gridwidth=1)

# Transparent backgrounds, Rockwell font throughout, centred title, no legend.
fig.update_layout(title='Reviews Over Time',
                  paper_bgcolor='rgba(0,0,0,0)',
                  plot_bgcolor='rgba(0,0,0,0)',
                  title_font_size=20,
                  showlegend=False,
                  hoverlabel=dict(
                      bgcolor='ivory',
                      font_size=16,
                      font_family='Rockwell'),
                  font=dict(family='Rockwell',
                            color='navy',
                            size=14),
                  title_x=0.5)

fig.show()

In [18]:
# One text document per review: headline and body joined by a single space.
# astype('O') and the concatenation each produce a new Series, so the extra .copy()
# calls on these 2.4M-row columns were redundant and are dropped.
headlines = df['review_headline'].astype('O')
bodies = df['review_body'].astype('O')
docs = headlines + ' ' + bodies

In [19]:
# Preview the combined headline + body documents.
docs

0          Using these for years - love them. As a family...
1          Wonderful My favorite nut.  Creamy, crunchy, s...
2          Five Stars This green tea tastes so good! My g...
3          Five Stars I love Melissa's brand but this is ...
4                                            Five Stars good
                                 ...                        
2402453    Different The Amor Belhom Duo are likely the o...
2402454    This  Horse  Is  Gorgourges! Being  a  Breyer ...
2402455    brilliant, realistic model! This model is beau...
2402456    don't have it yet..but  will soon As a person ...
2402457    Great This Breyer horse is wonderful. She is b...
Length: 2402458, dtype: object

In [17]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [51]:
def tokenize(doc):
    """Split ``doc`` into lemmatized, stop-word-free tokens using NLTK.

    The text is lower-cased and every character outside ``a-z`` is replaced by a
    space before tokenizing. Tokens are lemmatized in two passes, first as
    adjectives and then as verbs; spaCy's default English stop words are removed
    before each pass.

    Args:
        doc: The raw text to tokenize.

    Returns:
        The list of lemmatized tokens, in their original order.
    """
    letters_only = re.sub('[^a-z]', ' ', doc.lower())
    words = word_tokenize(letters_only)
    lemmatize = WordNetLemmatizer().lemmatize
    stop_words = nlp.Defaults.stop_words

    # Pass 1: lemmatize as adjectives.
    adjective_lemmas = []
    for word in words:
        if word not in stop_words:
            adjective_lemmas.append(lemmatize(word, 'a').strip())

    # Pass 2: lemmatize the surviving tokens as verbs, filtering stop words again.
    verb_lemmas = []
    for word in adjective_lemmas:
        if word not in stop_words:
            verb_lemmas.append(lemmatize(word, 'v').strip())

    return verb_lemmas

In [24]:
def tokenize(doc):
    """Split ``doc`` into lemmatized, stop-word-free tokens using spaCy.

    The text is lower-cased and every character outside ``a-z`` is replaced by a
    space before it is run through the spaCy pipeline.

    NOTE(review): this notebook defines ``tokenize`` in two cells; whichever cell
    was executed last is the definition in effect.

    Args:
        doc: The raw text to tokenize.

    Returns:
        The lemmas of all tokens that are neither stop words nor whitespace,
        in their original order.
    """
    doc = re.sub('[^a-z]', ' ', doc.lower())
    stop_words = nlp.Defaults.stop_words
    return [
        token.lemma_
        for token in nlp(doc)
        # Compare the token's text, not the Token object: the stop-word set holds strings,
        # so testing the object itself never matched and no stop word was removed.
        # Whitespace tokens come from the runs of spaces the substitution above leaves behind.
        if token.text not in stop_words and not token.is_space
    ]

In [52]:
# Try the tokenizer on a sentence that mixes punctuation, stop words and inflected forms.
tokenize('This is a, and !!! nothing is making me $))() smellier sentences about driving and drivers eating')

['make', 'smelly', 'sentence', 'drive', 'drivers', 'eat']