In [40]:
import json
import gzip
import numpy as np
import matplotlib.pyplot as plt
import logging
import pandas as pd
import re
import sys
import os

# circumvent src import errors
sys.path.append(os.path.abspath('..'))

from src.data import data
from src.features import build_features
from src.utilities import utilities
from tqdm import tqdm
from pprint import pprint

tqdm.pandas()

##### `Utilities` functions

In [30]:
def summary_statistics(df, cols=['reviewerID', 'reviewText', 'asin']):
    print(f"The dataframe consists of {df.shape[0]} rows and {df.shape[1]} columns")
    
    for col in cols:
        print(f"The number of unique {col}: {df[col].nunique()}")

##### Converting JSON to csv

In [4]:
%%time
# load the reviews data
prod_reviews = data.read_json('../data/raw/Automotive_5.json')

# summary statistics
summary_statistics(prod_reviews)
prod_reviews.head()

The dataframe consists of 1711519 rows and 12 columns
The number of unique reviewerID: 193651
The number of unique reviewText: 1348480
The number of unique asin: 79437
CPU times: user 22.4 s, sys: 2.3 s, total: 24.7 s
Wall time: 25.4 s


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"05 1, 2015",A8WEXFRWX1ZHH,209688726,{'Color:': ' AC'},Goldengate,"After I wrote the below review, the manufactur...",Works well if you place phone in horizontally ...,1430438400,,
1,1.0,True,"04 19, 2018",ABCA1A8E4DGV1,209688726,{'Color:': ' Blue'},noe,It sucks barely picks up anything definitely n...,sucks,1524096000,,
2,1.0,True,"04 16, 2018",A1NX8HM89FRQ32,209688726,{'Color:': ' Black'},Eduard,"Well to write a short one, it blew 2 fuses of ...",Defective,1523836800,,
3,3.0,True,"04 13, 2018",A1X77G023NY0KY,209688726,{'Color:': ' CA'},Lauren,I have absolutely no memory of buying this but...,Looks cool! Probably works,1523577600,,
4,5.0,True,"04 8, 2018",A3GK37JO2MGW6Q,209688726,{'Color:': ' Black'},danny,it ok it does it job,Five Stars,1523145600,,


In [41]:
import contractions
import re

from gensim.parsing.preprocessing import remove_stopwords
from gensim.utils import simple_preprocess
from textblob import TextBlob


def lemmatize_with_postags(sentence):
    """Lemmatize a given sentence based on given POS tags.
        Ref: https://www.machinelearningplus.com/nlp/lemmatization-examples-python/#comparingnltktextblobspacypatternandstanfordcorenlp

    Args:
        sentence ([type]): [description]

    Returns:
        [type]: [description]
    """
    sent = TextBlob(sentence)
    tag_dict = {"J": "a", "N": "n", "V": "v", "R": "r"}
    words_and_tags = [(w, tag_dict.get(pos[0], 'n')) for w, pos in sent.tags]
    lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]

    return " ".join(lemmatized_list)


def spelling_correction(sentence):
    """[summary]

    Args:
        sentence ([type]): [description]

    Returns:
        [type]: [description]
    """
    sent = TextBlob(sentence)
    return sent.correct()


# TODO: removal of rare words, last 10 based on value counts
def text_preprocess(review):
    """[summary]

    Args:
        review ([type]): [description]

    Returns:
        [type]: [description]
    """
    logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt='%H:%M:%S', level=logging.INFO)
    review = ' '.join(str(review).splitlines())  # remove whitespace characters
    review = re.sub(r'http\S+', "", str(review))  # remove links
    review = contractions.fix(review)  # expand contractions
    review = re.sub(r'[^\w\s]', " ", str(review))  # remove punctuations
    review = re.sub(r"'", "", str(review))  # remove single quotes
    review = remove_stopwords(review)
    review = lemmatize_with_postags(review)  # lemmatize sentence
    review = simple_preprocess(review, deacc=True)

    return review

In [42]:
test_df = prod_reviews.iloc[:100000,:]

test_df['cleanReviewText'] = test_df['reviewText'].progress_apply(lambda x: text_preprocess(x))

test_df.head(50)

100%|██████████| 100000/100000 [02:34<00:00, 645.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['cleanReviewText'] = test_df['reviewText'].progress_apply(lambda x: text_preprocess(x))


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,cleanReviewText
0,4.0,False,"05 1, 2015",A8WEXFRWX1ZHH,209688726,{'Color:': ' AC'},Goldengate,"After I wrote the below review, the manufactur...",Works well if you place phone in horizontally ...,1430438400,,,"[after, write, review, manufacturer, contact, ..."
1,1.0,True,"04 19, 2018",ABCA1A8E4DGV1,209688726,{'Color:': ' Blue'},noe,It sucks barely picks up anything definitely n...,sucks,1524096000,,,"[it, suck, barely, pick, definitely, car, pret..."
2,1.0,True,"04 16, 2018",A1NX8HM89FRQ32,209688726,{'Color:': ' Black'},Eduard,"Well to write a short one, it blew 2 fuses of ...",Defective,1523836800,,,"[well, write, short, blew, fuse, car, apparent..."
3,3.0,True,"04 13, 2018",A1X77G023NY0KY,209688726,{'Color:': ' CA'},Lauren,I have absolutely no memory of buying this but...,Looks cool! Probably works,1523577600,,,"[absolutely, memory, buying, go, review, go, a..."
4,5.0,True,"04 8, 2018",A3GK37JO2MGW6Q,209688726,{'Color:': ' Black'},danny,it ok it does it job,Five Stars,1523145600,,,"[ok, job]"
5,5.0,True,"03 24, 2018",AIY18YON1TWJJ,209688726,{'Color:': ' Black'},Karen H.,Have 3 big dogs. this have been great for my F...,this have been great for my Ford transit connect,1521849600,,,"[have, big, dog, great, ford, transit, connect]"
6,3.0,True,"03 4, 2018",A2MPTQ85HBBNG2,209688726,{'Color:': ' Black'},Giv,"Pros: Good attachments, nice long cord, can re...",Decent car vaccuum.,1520121600,,,"[pros, good, attachment, nice, long, cord, rea..."
7,2.0,True,"03 1, 2018",A1SPIM9Y6HUUSH,209688726,{'Color:': ' Black'},Frank W.Brodeur,I have a 2017 outback and everytime I try to u...,Two Stars,1519862400,,,"[outback, everytime, try, use, blow, fuse]"
8,4.0,True,"02 22, 2018",A1Q6FHU6DA643L,209688726,{'Color:': ' Black'},nutter1,very good suction will see how it lasts,Four Stars,1519257600,,,"[good, suction, last]"
9,5.0,True,"01 29, 2018",A3MA15RJJ59OKG,209688726,{'Color:': ' Black'},Daryl S.,"love it,works great ! wow !!",Five Stars,1517184000,,,"[love, work, great, wow]"


In [None]:
for i in range(100,200):
    actual = test_df.iloc[:i,:]['reviewText']
    processed = test_df.iloc[:i,:]['cleanReviewText']
    print(f"Actual: {actual}\nProcessed: {' '.join(processed)}\n")

In [7]:
%%time
# load the product metadata
prod_metadata = data.read_json("../data/raw/meta_Kindle_Store.json")

# summary statistics
summary_statistics(prod_metadata, cols=['asin'])
prod_metadata.head()

The dataframe consists of 491670 rows and 19 columns
The number of unique asin: 491670
CPU times: user 29.9 s, sys: 19.2 s, total: 49.1 s
Wall time: 1min 9s


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,"[Kindle Store, Kindle eBooks, Science Fiction ...",,[],,,"[B007NLCJBC, B01FARODH8]",,Arthur K. Barnes,[],"1,716,849 Paid in Kindle Store (","[B000FBF81K, B00PBDMER8]","{'File Size:': '295 KB', 'Print Length:': '113...",Buy a Kindle,,,,B000FA5KKA,[],[]
1,"[Kindle Store, Kindle eBooks, Engineering & Tr...",,[],,,"[B00AYWTHZS, B071CTK28D]",,Visit Amazon's Paul A. Craig Page,[],"1,683,973 Paid in Kindle Store (",[B00AYWTHZS],"{'File Size:': '1648 KB', 'Print Length:': '26...",Buy a Kindle,,,,B000FA5M3K,[],[]
2,"[Kindle Store, Kindle eBooks, Biographies & Me...",,[],,,[],,Jean Marie Stine,[],"3,394,136 Paid in Kindle Store (",[],"{'File Size:': '262 KB', 'Print Length:': '103...",Buy a Kindle,,,,B000FA5KJQ,[],[]
3,"[Kindle Store, Kindle eBooks, Science Fiction ...",,[],,,[],,Arthur K. Barnes,[],"1,884,541 Paid in Kindle Store (",[],"{'File Size:': '251 KB', 'Print Length:': '116...",Buy a Kindle,,,,B000FA5NSO,[],[]
4,"[Kindle Store, Kindle eBooks, Business & Money]",,[],,,"[B000SEGKF2, B004774LR0, B018LE1KUK, B0015DRO7...",,Visit Amazon's Ethan M. Rasiel Page,[],"72,075 Paid in Kindle Store (","[B018LE1KUK, B000SEGKF2, B007XWFZSA, B0015DRO7...","{'File Size:': '953 KB', 'Print Length:': '187...",Buy a Kindle,,,,B000FA5KX2,[],[]


In [5]:
prod_reviews.to_csv("../data/raw/Kindle_Store_5.csv", index=False)

In [8]:
prod_metadata.to_csv("../data/raw/meta_Kindle_Store_5.csv", index=False)

##### Criteria Filtering

The dataset should ensure that there are no missing reviews, while also ensuring that reviews are of adequate length in order to build a meaningful profile. Futhermore, we will also need to identify user who may potentially be "*bot*" users who are paid to generate good/bad reviews for a variety reason – such as artifically inflating product ranking, demeaning competitors products etc. Hence, these users are not an actual personality that we will like to generate a profile for recommendations.

Hence, the following criteria must be enforced:
1. No null values should be present in `reviewText`, else row will be removed
2. Minimum length of review should be at least of 100 characters (e.g. "*Great product*", will be removed)
3. Remove potential "bot" users by identify users through – similar/exact same reviews across different products. As we factor in [`reviewerID`, `asin`, `reviewText`], this ensures that we are not removing user who may just be facing network difficulty resulting in multiple duplication of reviews, but instead we are removing users who are intentionally reusing a same review for multiple products. Regardless of whether such users are legimate, having similar reviews over and over does not built a diverse understanding of such a user and hence, recommendations may be not be adequately provided.

In [9]:
def retrieve_nuniques(df, target_col):
    """Retrieve unique summary count of target column in a dataframe.
    
        Args:
            df [pd.DataFrame]:
            target_col [str]:
        Returns:
            unique_counts [pd.DataFrame]: 
    """
    
    # check unique values
    print(f"The number of unique records in {target_col}: {df[target_col].nunique()}\n")
    unique_counts = df[target_col].value_counts().sort_values(ascending=False).head(10)
    print(unique_counts)
    print("\n")
    
    return unique_counts


def filter_min_length(df, target_col, threshold=100):
    """Filter target column based on minimum threshold in length.
    
        Args:
            df [pd.DataFrame]:
            target_col [str]:
        Returns:
            filtered_df [pd.DataFrame]:
    """
    
    df.dropna(axis=0, subset=[target_col], inplace=True)
    filtered_df = df[df[target_col].map(len) > 100]
    
    return filtered_df
    

In [12]:
# Aggregating reviews count -> spotting exact same repeated reviews
uniq_reviews_agg = retrieve_nuniques(prod_reviews, 'reviewText')
filtered_prod_reviews = filter_min_length(prod_reviews, 'reviewText', threshold=100)
uniq_filt_reviews_agg = retrieve_nuniques(filtered_prod_reviews, 'reviewText')

# summary statistics
print(f"\nThe number of unique users: {filtered_prod_reviews['reviewerID'].nunique()}")

The number of unique records in reviewText: 2114479

Good          2339
Good read     1924
Great book    1783
Good book     1728
Loved it      1622
Great         1538
good          1514
Great read    1211
ok            1183
good read     1078
Name: reviewText, dtype: int64


The number of unique records in reviewText: 1907832

I absolutely adored this book. Loved the characters and the build up to their relationship especially, and the delicious dynamics at work between them all.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

##### Looking at rows of duplicated reviews

In [19]:
def inspect_duplicates(df, target_col, value, sort_by='asin'):
    """Check all duplicates values of a given target column.
    
        Args:
        Returns:
    """
    
    dup_values = df[df[target_col] == value]
    return dup_values.sort_values(by=[sort_by])

In [22]:
# checking rows that have the exact same reviews
review = "I absolutely adored this book. Loved the characters and the build up to their relationship especially, and the delicious dynamics at work between them all."
dup_review_samp = inspect_duplicates(filtered_prod_reviews, 'reviewText', review)

dup_review_samp.head(10)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
1627591,5.0,True,"10 3, 2013",A12OXDSRRPOSPF,B0015YEQ8W,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,Must Read,1380758400,,
37947,5.0,True,"09 21, 2014",A12OXDSRRPOSPF,B004WP3EA2,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,must read,1411257600,,
48514,5.0,True,"05 28, 2014",A12OXDSRRPOSPF,B00540C1W6,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,excellent read,1401235200,,
1656097,5.0,True,"09 21, 2013",A12OXDSRRPOSPF,B005TJUINQ,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,must read,1379721600,,
84111,5.0,True,"03 6, 2015",A12OXDSRRPOSPF,B005UGMC78,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,must read,1425600000,,
1672960,5.0,False,"09 19, 2013",A12OXDSRRPOSPF,B0076OC97U,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,Must Read,1379548800,,
134300,5.0,True,"09 7, 2013",A12OXDSRRPOSPF,B007D788ZW,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,Must Read,1378512000,,
1699402,5.0,True,"11 1, 2013",A12OXDSRRPOSPF,B009XFWQCI,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,must read,1383264000,,
1699707,5.0,True,"10 9, 2013",A12OXDSRRPOSPF,B009Y8DBF0,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,must read,1381276800,,
1704144,5.0,True,"11 25, 2014",A12OXDSRRPOSPF,B00ABPAIBA,{'Format:': ' Kindle Edition'},Teresa,I absolutely adored this book. Loved the chara...,Must read,1416873600,,


We identified that there are users like `A12OXDSRRPOSPF` who duplicates the same review over and over again across different products. This present a strong case of a possible paid reviewer who is paid to make reviews across different products. Such users are not an actual personality that we are trying to understand and build a profile for, in order to recommend products. 

In short, such users are not "legitimate" users that products value to Amazon.

In [23]:
def retrieve_potential_bot_users(df, group, agg_col, agg_by='count', threshold=3):
    """Retrieve Reviewer ID based on threshold of repeated same reviews across different products.
    
        Args:
            df [pd.DataFrame]:
            threshold [int]:
        Returns:
            unique_users [list]: 
    """
    df_dup_dropped = df.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText'])
    
    user_reviews_by_prod = (df_dup_dropped.groupby(group)
                            .agg({agg_col: agg_by})
                            .reset_index())
    
    potential_bot_users_by_count = (user_reviews_by_prod[user_reviews_by_prod
                                                         .sort_values('asin', ascending=False)
                                                         ['asin'] >= threshold])
    
    return potential_bot_users_by_count['reviewerID'].unique().tolist()

In [24]:
potential_bot_users = retrieve_potential_bot_users(filtered_prod_reviews, ['reviewerID', 'reviewText'], 'asin')

# summary
print(f"The number of potential bot users is {len(potential_bot_users)}.")



The number of potential bot users is 2461.


In [25]:
# using the potential bot users list, we can filter out all reviews by them
filtered_prod_reviews = filtered_prod_reviews[~filtered_prod_reviews['reviewerID'].isin(potential_bot_users)]

print(f"The dataframe consists of {filtered_prod_reviews.shape[0]} rows and {filtered_prod_reviews.shape[1]} columns")

The dataframe consists of 1859549 rows and 12 columns


In [26]:
# we will still need to remove the duplicated reviews on a single product itself.
filtered_prod_reviews = filtered_prod_reviews.drop_duplicates(subset=['reviewerID', 'asin', 'reviewText'])

# check on the duplicated reviews
print(filtered_prod_reviews['reviewText'].value_counts().sort_values(ascending=False).head(5))    

The Empathy series by Ker Dukey is definitely not for the faint of heart, yet it will bring out all the dark feels, and have you thinking to yourself, What the hell is wrong with me and why do I like it? Shes a master of mind-blowing stories, and always has me dumbfounded at the end.\n\nThis is a continuation of Ryan and Cereuss story, its 3 yrs down the road, and the chaos and blood hasn't slowed down. Ryan is teaching Cereus to embrace her inner darkness, but there have been some hiccups along the way. Cereus has been blacking out; not knowing whats real or just in her crazy mind, and its causing both her and Ryan to get into some rather sticky situationsliterally!\n\nThe journey they take together is absolute darkness, madness of the mind, yet on the same level, it frees them to be who they truly are. No one will ever come close to having the connection they have, and its crazy yet inspiring at the same time. I felt I was left hanging at the end, but thats the way Ker likes it, alwa

In [27]:
filtered_reviews_count = filtered_prod_reviews['reviewText'].value_counts().sort_values().to_frame()

# number of reviews > 2 
duplicated_reviews = len(filtered_reviews_count[filtered_reviews_count['reviewText'] >= 2])

# summary statistics
# we want to see how many % of the total reviews is still duplicated 
# this allows us to better determined if we need to effectively increase the threshold
print(f"The total number of reviews: {len(filtered_prod_reviews)}, % of duplicated reviews {((duplicated_reviews / len(prod_reviews)) * 100):.2f}%")

The total number of reviews: 1857418, % of duplicated reviews 0.13%


In [29]:
review = "The Empathy series by Ker Dukey is definitely not for the faint of heart, yet it will bring out all the dark feels, and have you thinking to yourself, What the hell is wrong with me and why do I like it? Shes a master of mind-blowing stories, and always has me dumbfounded at the end.\n\nThis is a continuation of Ryan and Cereuss story, its 3 yrs down the road, and the chaos and blood hasn't slowed down. Ryan is teaching Cereus to embrace her inner darkness, but there have been some hiccups along the way. Cereus has been blacking out; not knowing whats real or just in her crazy mind, and its causing both her and Ryan to get into some rather sticky situationsliterally!\n\nThe journey they take together is absolute darkness, madness of the mind, yet on the same level, it frees them to be who they truly are. No one will ever come close to having the connection they have, and its crazy yet inspiring at the same time. I felt I was left hanging at the end, but thats the way Ker likes it, always dragging you back for more and its working like a charm!"
inspect_duplicates(filtered_prod_reviews, 'reviewText', review)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
1229541,4.0,False,"08 26, 2015",ABICFW2Y08H2H,B013Z36GN8,{'Format:': ' Kindle Edition'},Booklover,The Empathy series by Ker Dukey is definitely ...,4 Cranky Stars!,1440547200,,
1229549,4.0,False,"08 24, 2015",A1ZZ81BQIXHE7J,B013Z36GN8,{'Format:': ' Kindle Edition'},Kelly Graham,The Empathy series by Ker Dukey is definitely ...,Crazy thrill ride!,1440374400,,
2079772,4.0,False,"11 30, 2015",A1FG2D6NRZD9NF,B013Z36GN8,{'Format:': ' Kindle Edition'},Crankster,The Empathy series by Ker Dukey is definitely ...,A journey into darkness,1448841600,,


In [31]:
summary_statistics(filtered_prod_reviews)

The dataframe consists of 1857418 rows and 12 columns
The number of unique reviewerID: 130074
The number of unique reviewText: 1854435
The number of unique asin: 98763


In [35]:
utilities.reviews_count(filtered_prod_reviews)

Global average ratings: 4.390031753757097
For product reviews:
Minimum reviews for product: 1, Maximum reviews for products: 1752
Average reviews per products: 18.80682036795156
The interquartile range:
0.25     6.0
0.50    10.0
0.75    20.0
Name: reviewText, dtype: float64

For user reviews:
Minimum reviews for users: 1, Maximum reviews for users: 1368
Average reviews per users: 14.279702323292895
The interquartile range:
0.25     5.0
0.50     7.0
0.75    13.0
Name: reviewText, dtype: float64



In [33]:
# saving interim reviews to seperate csv
filtered_prod_reviews.to_csv("../data/interim/Kindle_Store_5.csv", index=False)

In [46]:
# loading the metadata
prod_metadata = pd.read_csv("../data/raw/meta_Kindle_Store_5.csv")

# summary statistics
summary_statistics(prod_metadata, cols=['asin'])
prod_metadata.head()

  interactivity=interactivity, compiler=compiler, result=result)


The dataframe consists of 491670 rows and 19 columns
The number of unique asin: 491670


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,"['Kindle Store', 'Kindle eBooks', 'Science Fic...",,[],,,"['B007NLCJBC', 'B01FARODH8']",,Arthur K. Barnes,[],"1,716,849 Paid in Kindle Store (","['B000FBF81K', 'B00PBDMER8']","{'File Size:': '295 KB', 'Print Length:': '113...",Buy a Kindle,,,,B000FA5KKA,[],[]
1,"['Kindle Store', 'Kindle eBooks', 'Engineering...",,[],,,"['B00AYWTHZS', 'B071CTK28D']",,Visit Amazon's Paul A. Craig Page,[],"1,683,973 Paid in Kindle Store (",['B00AYWTHZS'],"{'File Size:': '1648 KB', 'Print Length:': '26...",Buy a Kindle,,,,B000FA5M3K,[],[]
2,"['Kindle Store', 'Kindle eBooks', 'Biographies...",,[],,,[],,Jean Marie Stine,[],"3,394,136 Paid in Kindle Store (",[],"{'File Size:': '262 KB', 'Print Length:': '103...",Buy a Kindle,,,,B000FA5KJQ,[],[]
3,"['Kindle Store', 'Kindle eBooks', 'Science Fic...",,[],,,[],,Arthur K. Barnes,[],"1,884,541 Paid in Kindle Store (",[],"{'File Size:': '251 KB', 'Print Length:': '116...",Buy a Kindle,,,,B000FA5NSO,[],[]
4,"['Kindle Store', 'Kindle eBooks', 'Business & ...",,[],,,"['B000SEGKF2', 'B004774LR0', 'B018LE1KUK', 'B0...",,Visit Amazon's Ethan M. Rasiel Page,[],"72,075 Paid in Kindle Store (","['B018LE1KUK', 'B000SEGKF2', 'B007XWFZSA', 'B0...","{'File Size:': '953 KB', 'Print Length:': '187...",Buy a Kindle,,,,B000FA5KX2,[],[]


In [47]:
# remove duplicates
prod_metadata = prod_metadata.drop_duplicates(subset=['asin'])

# visual check through product id count
summary_statistics(prod_metadata, cols=['asin'])
prod_metadata['asin'].value_counts(ascending=False)

The dataframe consists of 491670 rows and 19 columns
The number of unique asin: 491670


B00GES27C6    1
B005V32PYA    1
B00OPJ6CI6    1
B00BFF0FX6    1
B013VFQCGQ    1
             ..
B00WLP4S4Q    1
B00SIEXF8O    1
B00KGGLFSO    1
B00KH8GXBA    1
B00GZPXS1W    1
Name: asin, Length: 491670, dtype: int64

In [48]:
# saving interim metadata to seperate csv
prod_metadata.to_csv("../data/interim/meta_Kindle_Store_5.csv")

##### Combining `filtered_movie_reviews` and `movie_metadata`

In [49]:
# merging metadata and reviews to get more information
prod_merged = pd.merge(prod_metadata, filtered_prod_reviews, how='inner', on='asin')

# summary statistics
summary_statistics(prod_merged)
prod_merged.head()

The dataframe consists of 1850030 rows and 30 columns
The number of unique reviewerID: 130042
The number of unique reviewText: 1847064
The number of unique asin: 98322


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,...,verified,reviewTime,reviewerID,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,"['Kindle Store', 'Kindle eBooks', 'Literature ...",,[],,,"['B000FA64QO', 'B00513F934', 'B00513H3I8', 'B0...",,Troy Denning,[],"350,999 Paid in Kindle Store (",...,False,"03 30, 2011",AQZH7YTWQPOBE,{'Format:': ' Kindle Edition'},Arnold,This is a pretty decent short story. Some of t...,"Good story, but just buy Star by Star",1301443200,8.0,
1,"['Kindle Store', 'Kindle eBooks', 'Literature ...",,[],,,"['B000FA64QO', 'B00513F934', 'B00513H3I8', 'B0...",,Troy Denning,[],"350,999 Paid in Kindle Store (",...,True,"04 16, 2009",A38Z3Q6DTDIH9J,{'Format:': ' Kindle Edition'},Jimmy J. Shaw,"Another well written eBook by Troy Denning, bu...",Star Wars: The New Jedi Order: Recovery,1239840000,5.0,
2,"['Kindle Store', 'Kindle eBooks', 'Literature ...",,[],,,"['B000FA64QO', 'B00513F934', 'B00513H3I8', 'B0...",,Troy Denning,[],"350,999 Paid in Kindle Store (",...,True,"07 20, 2012",A22CW0ZHY3NJH8,{'Format:': ' Kindle Edition'},(),"I have a version of ""Star by Star"" that does n...","Not a necessary read, but I liked it",1342742400,,
3,"['Kindle Store', 'Kindle eBooks', 'Literature ...",,[],,,"['B000FA64QO', 'B00513F934', 'B00513H3I8', 'B0...",,Troy Denning,[],"350,999 Paid in Kindle Store (",...,False,"03 15, 2012",A3SZMGJMV0G16C,{'Format:': ' Kindle Edition'},Andrew Pruette,Troy Denning's novella Recovery was originally...,Han and Leia reunited and Barabel Jedi introduced,1331769600,,
4,"['Kindle Store', 'Kindle eBooks', 'Literature ...",,[],,,"['B000FA64QO', 'B00513F934', 'B00513H3I8', 'B0...",,Troy Denning,[],"350,999 Paid in Kindle Store (",...,True,"01 27, 2014",A1ZT7WV0ZUA0OJ,{'Format:': ' Kindle Edition'},Kindle Customer,This one promises to be another good book. I h...,my collection,1390780800,,


In [50]:
prod_merged.columns

Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes',
       'overall', 'verified', 'reviewTime', 'reviewerID', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='object')

In [56]:
# we cant have empty title as we need to infer the title that we are recommending as well
prod_merged.dropna(subset=['title'], axis=0, inplace=True)

In [57]:
summary_statistics(prod_merged)

The dataframe consists of 1846590 rows and 30 columns
The number of unique reviewerID: 130008
The number of unique reviewText: 1843681
The number of unique asin: 98059


In [58]:
# save merged dataframe to interim
prod_merged.to_csv("../data/interim/Kindle_Store_5_merged.csv", index=False)