In [84]:
import json
import gzip
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import sys
import os

# circumvent src import errors
sys.path.append(os.path.abspath('..'))

from src.data import data
from src.features import preprocessing
from src.utilities import utilities
from pprint import pprint

##### `Utilities` functions

In [13]:
def summary_statistics(df, cols=['reviewerID', 'reviewText', 'asin']):
    print(f"The dataframe consists of {df.shape[0]} rows and {df.shape[1]} columns")
    
    for col in cols:
        print(f"The number of unique {col}: {df[col].nunique()}")

##### Converting JSON to csv

In [6]:
%%time
# load the reviews data
movie_reviews = data.read_json('../data/raw/Movies_and_TV_5.json')

# summary statistics
summary_statistics(movie_reviews)
movie_reviews.head()

The dataframe consists of 3410019 rows and 12 columns
The number of unique users: 297529
The number of unique reviews: 2645767
The number of unique products: 60175
CPU times: user 1min 30s, sys: 1min 17s, total: 2min 47s
Wall time: 3min 54s


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"11 9, 2012",A2M1CU2IRZG0K9,0005089549,{'Format:': ' VHS Tape'},Terri,So sorry I didn't purchase this years ago when...,Amazing!,1352419200,,
1,5.0,True,"12 30, 2011",AFTUJYISOFHY6,0005089549,{'Format:': ' VHS Tape'},Melissa D. Abercrombie,Believe me when I tell you that you will recei...,Great Gospel VHS of the Cathedrals!,1325203200,,
2,5.0,True,"04 21, 2005",A3JVF9Y53BEOGC,000503860X,{'Format:': ' DVD'},Anthony Thompson,"I have seen X live many times, both in the ear...",A great document of a great band,1114041600,11.0,
3,5.0,True,"04 6, 2005",A12VPEOEZS1KTC,000503860X,{'Format:': ' DVD'},JadeRain,"I was so excited for this! Finally, a live co...",YES!! X LIVE!!,1112745600,5.0,
4,5.0,True,"12 3, 2010",ATLZNVLYKP9AZ,000503860X,{'Format:': ' DVD'},T. Fisher,X is one of the best punk bands ever. I don't ...,X have still got it,1291334400,5.0,


In [14]:
%%time
# load the product metadata
movie_metadata = data.read_json("../data/raw/meta_Movies_and_TV.json")

# summary statistics
summary_statistics(movie_metadata, cols=['asin'])
movie_metadata.head()

The dataframe consists of 203766 rows and 19 columns
The number of unique asin: 181839
CPU times: user 10.5 s, sys: 1.58 s, total: 12 s
Wall time: 14.8 s


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Movies & TV, Movies]",,[],,Understanding Seizures and Epilepsy,[],,,[],"886,503 in Movies & TV (",[],Movies & TV,,,,695009,[],[],
1,"[Movies & TV, Movies]",,[],,Spirit Led&mdash;Moving By Grace In The Holy S...,[],,,[],"342,688 in Movies & TV (",[],Movies & TV,,,,791156,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Movies & TV, Movies]",,[Disc 1: Flour Power (Scones; Shortcakes; Sout...,,My Fair Pastry (Good Eats Vol. 9),[],,Alton Brown,[],"370,026 in Movies & TV (",[],Movies & TV,,,,143529,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
3,"[Movies & TV, Movies]",,[Barefoot Contessa Volume 2: On these three di...,,"Barefoot Contessa (with Ina Garten), Entertain...","[B002I5GNW4, B005WXPVMM, B009UY3W8O, B00N27ID1...",,Ina Garten,[],"342,914 in Movies & TV (","[B002I5GNW4, 0804187045, B009UY3W8O, 060960219...",Movies & TV,,,$74.95,143588,[],[],
4,"[Movies & TV, Movies]",,[Rise and Swine (Good Eats Vol. 7) includes bo...,,Rise and Swine (Good Eats Vol. 7),"[B000P1CKES, B000NR4CRM]",,Alton Brown,[],"351,684 in Movies & TV (",[B0015SVNXY],Movies & TV,,,,143502,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,


In [8]:
movie_reviews.to_csv("../data/raw/Movies_and_TV_5.csv", index=False)

In [15]:
movie_metadata.to_csv("../data/raw/meta_Movies_and_TV.csv", index=False)

##### Criteria Filtering

The dataset should ensure that there are no missing reviews, while also ensuring that reviews are of adequate length in order to build a meaningful profile. Futhermore, we will also need to identify user who may potentially be "*bot*" users who are paid to generate good/bad reviews for a variety reason – such as artifically inflating product ranking, demeaning competitors products etc. Hence, these users are not an actual personality that we will like to generate a profile for recommendations.

Hence, the following criteria must be enforced:
1. No null values should be present in `reviewText`, else row will be removed
2. Minimum length of review should be at least of 100 characters (e.g. "*Great product*", will be removed)
3. Remove potential "bot" users by identify users through – similar/exact same reviews across different products. As we factor in [`reviewerID`, `asin`, `reviewText`], this ensures that we are not removing user who may just be facing network difficulty resulting in multiple duplication of reviews, but instead we are removing users who are intentionally reusing a same review for multiple products. Regardless of whether such users are legimate, having similar reviews over and over does not built a diverse understanding of such a user and hence, recommendations may be not be adequately provided.

In [16]:
def retrieve_nuniques(df, target_col):
    """Retrieve unique summary count of target column in a dataframe.
    
        Args:
            df [pd.DataFrame]:
            target_col [str]:
        Returns:
            unique_counts [pd.DataFrame]: 
    """
    
    # check unique values
    print(f"The number of unique records in {target_col}: {df[target_col].nunique()}\n")
    unique_counts = df[target_col].value_counts().sort_values(ascending=False).head(10)
    print(unique_counts)
    print("\n")
    
    return unique_counts


def filter_min_length(df, target_col, threshold=100):
    """Filter target column based on minimum threshold in length.
    
        Args:
            df [pd.DataFrame]:
            target_col [str]:
        Returns:
            filtered_df [pd.DataFrame]:
    """
    
    df.dropna(axis=0, subset=[target_col], inplace=True)
    filtered_df = df[df[target_col].map(len) > 100]
    
    return filtered_df
    

In [18]:
# Aggregating reviews count -> spotting exact same repeated reviews
uniq_reviews_agg = retrieve_nuniques(movie_reviews, 'reviewText')
filtered_movie_reviews = filter_min_length(movie_reviews, 'reviewText', threshold=100)

uniq_filt_reviews_agg = retrieve_nuniques(filtered_movie_reviews, 'reviewText')

# summary statistics
print(f"\nThe number of unique users: {filtered_movie_reviews['reviewerID'].nunique()}")
print(f"\nThe number of unique reviews: {filtered_movie_reviews['reviewText'].nunique()}")

The number of unique records in reviewText: 2645767

great          12852
good           11348
Great          11163
Great movie     9501
Good            7987
Good movie      7683
ok              7189
great movie     6474
good movie      6401
Excellent       5772
Name: reviewText, dtype: int64


The number of unique records in reviewText: 1837090

its came in good condition with no problem it is an excellent anime and it was amazing to watch and see                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    


The number of unique users: 237370

The number of unique reviews: 1837090


##### Looking at rows of duplicated reviews

In [19]:
def inspect_duplicates(df, target_col, value, sort_by='asin'):
    """Check all duplicates values of a given target column.
    
        Args:
        Returns:
    """
    
    dup_values = df[df[target_col] == value]
    return dup_values.sort_values(by=[sort_by]).head(10)

In [23]:
# checking rows that have the exact same reviews
review = "We love watching this movie at home. It never gets old and I can't wait to watch it over and over again."
dup_review_samp = inspect_duplicates(filtered_movie_reviews, 'reviewText', review)

dup_review_samp.head(10)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
37030,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,0788814664,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,
33579,5.0,True,"12 20, 2012",AEZ8WH06SEW6K,0788821075,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355961600,,
51022,5.0,True,"12 20, 2012",AEZ8WH06SEW6K,079072961X,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355961600,,
53331,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,0790736527,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,
132128,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,6300274195,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,
133557,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,6300274268,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,
237523,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,6302787068,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,
248457,5.0,True,"12 20, 2012",AEZ8WH06SEW6K,6303012140,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355961600,,
265607,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,6303122647,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,
269895,5.0,True,"12 19, 2012",AEZ8WH06SEW6K,6303194508,{'Format:': ' DVD'},Amazon Customer,We love watching this movie at home. It never ...,Love it,1355875200,,


We identified that there are users like `AEZ8WH06SEW6K` who duplicates the same review over and over again across different products. This present a strong case of a possible paid reviewer who is paid to make reviews across different products. Such users are not an actual personality that we are trying to understand and build a profile for, in order to recommend products. 

In short, such users are not "legitimate" users that products value to Amazon.

In [55]:
def retrieve_potential_bot_users(df, group, agg_col, agg_by='count', threshold=3):
    """Retrieve Reviewer ID based on threshold of repeated same reviews across different products.
    
        Args:
            df [pd.DataFrame]:
            threshold [int]:
        Returns:
            unique_users [list]: 
    """
    df_dup_dropped = df.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText'])
    
    user_reviews_by_prod = (df_dup_dropped.groupby(group)
                            .agg({agg_col: agg_by})
                            .reset_index())
    
    potential_bot_users_by_count = (user_reviews_by_prod[user_reviews_by_prod
                                                         .sort_values('asin', ascending=False)
                                                         ['asin'] >= threshold])
    
    return potential_bot_users_by_count['reviewerID'].unique().tolist()

In [56]:
potential_bot_users = retrieve_potential_bot_users(filtered_movie_reviews, ['reviewerID', 'reviewText'], 'asin')

# summary
print(f"The number of potential bot users is {len(potential_bot_users)}.")



The number of potential bot users is 9698.


In [57]:
# using the potential bot users list, we can filter out all reviews by them
filtered_movie_reviews = filtered_movie_reviews[~filtered_movie_reviews['reviewerID'].isin(potential_bot_users)]

print(f"The dataframe consists of {filtered_movie_reviews.shape[0]} rows and {filtered_movie_reviews.shape[1]} columns")

The dataframe consists of 1736799 rows and 12 columns


In [67]:
# we will still need to remove the duplicated reviews on a single product itself.
filtered_movie_reviews = filtered_movie_reviews.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText'])

# check on the duplicated reviews
print(filtered_movie_reviews['reviewText'].value_counts().sort_values(ascending=False).head(5))    

Best Zombie movie every! Should have won Best Picture in 2013! The Israel Wall scene alone is worth the price of the DVD! Time to show Brad Pitt some respect.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

In [69]:
filtered_reviews_count = filtered_movie_reviews['reviewText'].value_counts().sort_values().to_frame()

# number of reviews > 2 
duplicated_reviews = len(filtered_reviews_count[filtered_reviews_count['reviewText'] >= 2])

# summary statistics
# we want to see how many % of the total reviews is still duplicated 
# this allows us to better determined if we need to effectively increase the threshold
print(f"The total number of reviews: {len(filtered_movie_reviews)}, % of duplicated reviews {((duplicated_reviews / len(movie_reviews)) * 100):.2f}%")

The total number of reviews: 1674463, % of duplicated reviews 1.94%


In [83]:
inspect_duplicates(filtered_movie_reviews, 'reviewText', "I have been a Star Trek fan most of my life. I can honestly say this movie was probably one of the best (If not the best) Treks I've ever seen! I can't wait for the DVD to come out. What's more I can't wait for the sequel! and it looks very likely that their will be a sequel. And with a movie this good, you can't expect anything other than the huge success it's enjoying!\n\nThe things that made it great where: The story was fantastic, people who weren't already fans of Trek could really get into it, it wasn't hard to understand, but it was still very intelligent. The music was superb! it gave it a very grand and exciting feeling! The effects where top of the line, and just plain brilliant! The humor gave it a more personal quality, so it didn't feel as stiff as it could have been. And the acting was top-notch. They really hit the nail on the head by picking the unknown Chris Pine to play James Kirk. Not only does he look like William Shatner did when he first started playing Kirk, but his acting was exceptional. And there was Zachary Quinto, he did the part of Spock so much justice, it's hard to see anyone else play him. And there was the surprise appearance of Leonard Nimoy (the original Spock) playing Older Spock. He really tied the story together. And not to forget, there was also the wonderful performances of Zoe Saldana, Anton Yelchin, John Cho, Karl Urban, and Eric Bana. Also the children that played Kid Kirk (Jimmy Bennett) and Kid Spock (Jacob Kogan) gave masterful performances for the short background story parts they where tasked with.\n\nSo that is why I say this movie was a triumph in every way! Not just as a Star Trek film but as any film! I give this movie a 10 out if 10! If they'd let me I'd give it a 20 out of 10!!!")

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
1488876,5.0,False,"01 16, 2011",A1U0QRPXO0R14K,B001AVCFJM,{'Format:': ' Blu-ray'},stud_manly47,I have been a Star Trek fan most of my life. I...,Amazing,1295136000,,
1488953,5.0,False,"03 21, 2010",A2M85JGCPOR97Y,B001AVCFJM,{'Format:': ' Blu-ray'},Misterbanks,I have been a Star Trek fan most of my life. I...,A Triumph in every possible way!!,1269129600,,
3078483,5.0,False,"01 16, 2011",A1U0QRPXO0R14K,B01DKLQMM0,{'Format:': ' Blu-ray'},stud_manly47,I have been a Star Trek fan most of my life. I...,Amazing,1295136000,,
3078559,5.0,False,"03 21, 2010",A2M85JGCPOR97Y,B01DKLQMM0,{'Format:': ' Blu-ray'},Misterbanks,I have been a Star Trek fan most of my life. I...,A Triumph in every possible way!!,1269129600,,


In [72]:
summary_statistics(filtered_movie_reviews)

The dataframe consists of 1674463 rows and 12 columns
The number of unique reviewerID: 227672
The number of unique reviewText: 1608181
The number of unique asin: 59813


In [73]:
# saving interim reviews to seperate csv
filtered_movie_reviews.to_csv("../data/interim/Movies_and_TV_5.csv", index=False)

In [74]:
# loading the metadata
movie_metadata = pd.read_csv("../data/raw/meta_Movies_and_TV.csv")

# summary statistics
summary_statistics(movie_metadata, cols=['asin'])
movie_metadata.head()

  interactivity=interactivity, compiler=compiler, result=result)


The dataframe consists of 203766 rows and 19 columns
The number of unique asin: 181839


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"['Movies & TV', 'Movies']",,[],,Understanding Seizures and Epilepsy,[],,,[],"886,503 in Movies & TV (",[],Movies & TV,,,,695009,[],[],
1,"['Movies & TV', 'Movies']",,[],,Spirit Led&mdash;Moving By Grace In The Holy S...,[],,,[],"342,688 in Movies & TV (",[],Movies & TV,,,,791156,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,
2,"['Movies & TV', 'Movies']",,['Disc 1: Flour Power (Scones; Shortcakes; Sou...,,My Fair Pastry (Good Eats Vol. 9),[],,Alton Brown,[],"370,026 in Movies & TV (",[],Movies & TV,,,,143529,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,
3,"['Movies & TV', 'Movies']",,['Barefoot Contessa Volume 2: On these three d...,,"Barefoot Contessa (with Ina Garten), Entertain...","['B002I5GNW4', 'B005WXPVMM', 'B009UY3W8O', 'B0...",,Ina Garten,[],"342,914 in Movies & TV (","['B002I5GNW4', '0804187045', 'B009UY3W8O', '06...",Movies & TV,,,$74.95,143588,[],[],
4,"['Movies & TV', 'Movies']",,['Rise and Swine (Good Eats Vol. 7) includes b...,,Rise and Swine (Good Eats Vol. 7),"['B000P1CKES', 'B000NR4CRM']",,Alton Brown,[],"351,684 in Movies & TV (",['B0015SVNXY'],Movies & TV,,,,143502,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,


In [76]:
# remove duplicates
movie_metadata = movie_metadata.drop_duplicates(subset=['asin'])

# visual check through product id count
summary_statistics(movie_metadata, cols=['asin'])
movie_metadata['asin'].value_counts(ascending=False)

The dataframe consists of 181839 rows and 19 columns
The number of unique asin: 181839


B001808CWY    1
B0015YQNTW    1
6302277094    1
B009SJAUBI    1
B01GRU5CQS    1
             ..
B0029BL8GI    1
B004D8P23U    1
B000QTD6S8    1
B0013FZUQK    1
B00X3PSKEC    1
Name: asin, Length: 181839, dtype: int64

In [77]:
# saving interim metadata to seperate csv
movie_metadata.to_csv("../data/interim/meta_Movies_and_TV.csv")

##### Combining `filtered_movie_reviews` and `movie_metadata`

In [79]:
# merging metadata and reviews to get more information
movies_merged = pd.merge(movie_metadata, filtered_movie_reviews, how='inner', on='asin')

# summary statistics
summary_statistics(movies_merged)
movies_merged.head()

The dataframe consists of 1672887 rows and 30 columns
The number of unique reviewerID: 227647
The number of unique reviewText: 1606801
The number of unique asin: 59750


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,...,verified,reviewTime,reviewerID,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,"['Movies & TV', 'Movies']",,['Barefoot Contessa Volume 2: On these three d...,,"Barefoot Contessa (with Ina Garten), Entertain...","['B002I5GNW4', 'B005WXPVMM', 'B009UY3W8O', 'B0...",,Ina Garten,[],"342,914 in Movies & TV (",...,True,"01 18, 2009",AVIY68KEPQ5ZD,{'Format:': ' DVD'},Rebecca Millington,I am very pleased with the dvd only wish i cou...,Barefoot Contesst Vol 2,1232236800,,
1,"['Movies & TV', 'Movies']",,['Barefoot Contessa Volume 2: On these three d...,,"Barefoot Contessa (with Ina Garten), Entertain...","['B002I5GNW4', 'B005WXPVMM', 'B009UY3W8O', 'B0...",,Ina Garten,[],"342,914 in Movies & TV (",...,True,"10 22, 2015",A3MP1M1DWO836V,{'Format:': ' DVD'},Lucky7,I love Ina Garten and I was so happy that they...,I love Ina Garten and I was so happy that they...,1445472000,,
2,"['Movies & TV', 'Movies']",,['Barefoot Contessa Volume 2: On these three d...,,"Barefoot Contessa (with Ina Garten), Entertain...","['B002I5GNW4', 'B005WXPVMM', 'B009UY3W8O', 'B0...",,Ina Garten,[],"342,914 in Movies & TV (",...,True,"10 13, 2014",A2MSEN5APH6E6L,{'Format:': ' DVD'},Angela Mcclelland,I cannot say enough good things about this DVD...,"Great Entertaining, and the food is WOW, Wow, ...",1413158400,3.0,
3,"['Movies & TV', 'Movies']",,['Barefoot Contessa Volume 2: On these three d...,,"Barefoot Contessa (with Ina Garten), Entertain...","['B002I5GNW4', 'B005WXPVMM', 'B009UY3W8O', 'B0...",,Ina Garten,[],"342,914 in Movies & TV (",...,True,"09 27, 2014",A3MP1M1DWO836V,{'Format:': ' DVD'},Lucky7,Great dvd of one of my favorite cooking shows....,Great dvd of one of my favorite cooking shows,1411776000,,
4,"['Movies & TV', 'Movies']",,['Barefoot Contessa Volume 2: On these three d...,,"Barefoot Contessa (with Ina Garten), Entertain...","['B002I5GNW4', 'B005WXPVMM', 'B009UY3W8O', 'B0...",,Ina Garten,[],"342,914 in Movies & TV (",...,True,"09 27, 2014",A3MP1M1DWO836V,{'Format:': ' DVD'},Lucky7,I wish she would release more DVDs. These are...,I wish she would release more DVDs. These are ...,1411776000,,


In [80]:
movies_merged.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'main_cat', 'similar_item',
       'date', 'price', 'asin', 'imageURL', 'imageURLHighRes', 'details',
       'overall', 'verified', 'reviewTime', 'reviewerID', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

In [81]:
# save merged dataframe to interim
movies_merged.to_csv("../data/interim/Movies_and_TV_merged.csv", index=False)