# Amazon Data

In [1]:
'''
This notebook explores the retrieval and preprocessing of Amazon data
for modelling purposes
'''

import os
import json
import gzip
import numpy as np
import pandas as pd
import datetime
import nltk
import string
import re

from nltk.corpus import stopwords
from urllib.request import urlopen

In [None]:
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/sample_meta_Home_and_Kitchen.json
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/sample/sample_Home_and_Kitchen_5.json

In [2]:
# constants
base_path = os.path.expanduser('~') + '/'  # user's home directory with a trailing slash
current_year = datetime.datetime.now().year  # .year is already an int

# lower-cased english stopwords from nltk
# NOTE(review): presumably requires the nltk 'stopwords' corpus to be downloaded beforehand - confirm
sw = stopwords.words("english")
stop_words = list(map(str.lower, sw))

In [3]:
def load_data(d_path = 'meta_Computers.json.gz'):
    '''
    Load a gzip-compressed, line-delimited JSON file into a dataframe.

    params:
        d_path (str) : path to a .json.gz file holding one JSON object per line

    returns:
        pd.DataFrame with one row per JSON object found in the file
    '''

    records = []
    with gzip.open(d_path) as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            # skip blank lines (e.g. a trailing newline); json.loads would raise on them
            if not raw_line:
                continue
            records.append(json.loads(raw_line))

    return pd.DataFrame(records)

## Import Data

In [17]:
%time vg_df = load_data(base_path + 'Downloads/Video_Games.json.gz')

CPU times: user 46.7 s, sys: 2.18 s, total: 48.9 s
Wall time: 48.1 s


In [18]:
%time vg_meta_df = load_data(base_path + 'Downloads/meta_Video_Games.json.gz')

CPU times: user 4.6 s, sys: 75 ms, total: 4.68 s
Wall time: 4.69 s


## Data Overview
**VG Reviews Data**
- reviewerID - ID of the reviewer, e.g. A2SUAM1J3GNN3B
- asin - ID of the product, e.g. 0000013714
- reviewerName - name of the reviewer
- vote - helpful votes of the review
- style - a dictionary of the product metadata, e.g., "Format" is "Hardcover"
- reviewText - text of the review
- overall - rating of the product
- summary - summary of the review
- unixReviewTime - time of the review (unix time)
- reviewTime - time of the review (raw)
- image - images that users post after they have received the product

**VG Meta**
- asin - ID of the product, e.g. 0000031852
- title - name of the product
- feature - bullet-point format features of the product
- description - description of the product
- price - price in US dollars (at time of crawl)
- image - url of the product image
- also_view - other similar products viewed
- also_buy - other similar products bought
- salesRank - sales rank information
- brand - brand name
- similar_item - list of similar items
- categories - list of categories the product belongs to
- tech1 - the first technical detail table of the product
- tech2 - the second technical detail table of the product
- similar - similar product table

In [21]:
# inner join of product metadata (left) with reviews (right) on the product id;
# columns present in both frames get the '_meta' suffix on the metadata side
vg = vg_meta_df.merge(
    vg_df,
    how = 'inner',
    on = 'asin',
    suffixes = ('_meta', '')
)

In [22]:
# (rows, columns) of the merged metadata + reviews dataframe
vg.shape

(2754, 31)

In [23]:
# remove unnecessary columns
drop_cols = ['tech1', 'tech2', 'fit', 'feature']
vg = vg.drop(drop_cols, axis = 1)

## Preprocess Data

In [24]:
# work on a deep copy so the merged dataframe `vg` is left untouched by preprocessing
d = vg.copy(deep = True)

In [25]:
# convert verified column from True/False --> 1 / 0 (multiplying a bool by 1 yields its integer value)
d['verified'] = d['verified'].mul(1)

In [26]:
def convert_vote(x):
    '''
    Cast a vote value to an integer. The value is stringified first and any
    commas (thousands separators) are dropped before the conversion.
    '''

    digits = str(x).replace(',', '')
    return int(digits)

In [27]:
# if no votes are available then replace them with 0
# (assign the result back: an inplace fillna on the column selection d['vote'] acts on a
#  temporary object and does not update `d` under pandas Copy-on-Write)
d['vote'] = d['vote'].fillna(0)

In [28]:
%%time 

# converting votes to integers
d['vote'] = d['vote'].apply(lambda x : convert_vote(x))

CPU times: user 6.09 ms, sys: 258 µs, total: 6.35 ms
Wall time: 6.29 ms


In [29]:
def valid_image(image_list):
    '''
    Build a list of flags, one per entry of the input and in the same order:
    1 when the entry is a list (presumably a list of image links), 0 otherwise
    '''

    return [1 if type(entry) == list else 0 for entry in image_list]

In [30]:
%time d['valid_image'] = valid_image(d.image.values)

CPU times: user 1.94 ms, sys: 76 µs, total: 2.01 ms
Wall time: 1.98 ms


In [31]:
def parse_review_year(row):
    '''
    Extract the review year from a reviewTime string: the text after the
    last space is converted to an int and returned
    '''
    _, _, year = row.rpartition(' ')
    return int(year)

In [32]:
%time d['review_year'] = d['reviewTime'].apply(lambda x : parse_review_year(x))

CPU times: user 3.36 ms, sys: 61 µs, total: 3.42 ms
Wall time: 3.38 ms


In [33]:
# identify how many years old is the review (rsub computes current_year - review_year)
d['time_since_review'] = d['review_year'].rsub(current_year)

In [34]:
def word_count(row, stopwords = stop_words):
    '''
    Count the unique words of a text that are not stopwords.

    The text is lower-cased and stripped of punctuation before it is split
    into words on whitespace.

    params:
        row : the text to count words in; non-string values are stringified
        stopwords : iterable of words to exclude from the count
                    (defaults to the module-level stop_words list)

    returns:
        int, number of unique non-stopword words; 0 for missing text (None / NaN)
    '''
    # missing text has no words; str() would otherwise turn it into the token 'none' / 'nan'
    if row is None or (isinstance(row, float) and row != row):
        return 0

    text = str(row).lower().translate(str.maketrans('', '', string.punctuation)) # removes punctuation
    # split() without an argument splits on any run of whitespace (spaces, tabs, newlines)
    # and never produces empty-string tokens, which split(' ') counted as a word
    words = text.split()
    return len(set(words) - set(stopwords))

In [35]:
%time d['review_wc'] = d['reviewText'].apply(lambda x : word_count(x))

CPU times: user 70.9 ms, sys: 1.94 ms, total: 72.8 ms
Wall time: 71.8 ms


In [36]:
%time d['summary_wc'] = d['summary'].apply(lambda x : word_count(x))

CPU times: user 25.1 ms, sys: 2.31 ms, total: 27.4 ms
Wall time: 26.9 ms


In [37]:
%time d['title_wc'] = d['title'].apply(lambda x : word_count(x))

CPU times: user 28.3 ms, sys: 1.33 ms, total: 29.7 ms
Wall time: 28.7 ms


In [38]:
%time d['description_count'] = d['description'].apply(lambda x : [word_count(x[0]) if len(x) > 0 else np.nan][0])

CPU times: user 19.3 ms, sys: 585 µs, total: 19.9 ms
Wall time: 19.4 ms


In [39]:
def similar_item_present(x):
    '''
    Flag whether the value passed (taken from the similar_item column) holds something.
    Returns 0 when the value is None, NaN or an empty string, 1 for anything else
    '''
    # x != x is only true for NaN
    is_missing = (x is None) or (x != x) or (x == '')
    return 0 if is_missing else 1

In [40]:
%time d['similar_item_present'] = d['similar_item'].apply(lambda x : similar_item_present(x))

CPU times: user 2.4 ms, sys: 106 µs, total: 2.51 ms
Wall time: 2.44 ms


In [41]:
%time d['recommended_item_counts'] = d['also_view'].apply(lambda x : len(x))

CPU times: user 2.23 ms, sys: 291 µs, total: 2.53 ms
Wall time: 2.32 ms


In [42]:
def parse_main_cat(x):
    '''
    Normalise a main category value: drop HTML-escaped ampersands ('&amp;'),
    remove punctuation and lower-case the text.

    params:
        x : raw main category value

    returns:
        np.nan when the value is None, NaN or an empty string;
        'amazon fashion' / 'digital music' when the cleaned text contains that phrase;
        otherwise the cleaned text
    '''
    # test for missing values BEFORE stringifying: str(np.nan) is 'nan' and
    # str(None) is 'None', which would slip through as real categories
    if x is None or (isinstance(x, float) and x != x):
        return np.nan

    x = str(x)
    if x == '':
        return np.nan

    # only strip the escaped ampersand entity; removing every 'amp' substring
    # would corrupt words such as 'Camping' or 'Amplifiers'
    x = x.replace('&amp;', '')
    x = x.translate(str.maketrans('', '', string.punctuation)) # removes punctuation
    x = x.lower()

    if 'amazon fashion' in x:
        return 'amazon fashion'
    if 'digital music' in x:
        return 'digital music'
    return x

In [43]:
%time d['main_cat'] = d['main_cat'].apply(lambda x :parse_main_cat(x))

CPU times: user 9.88 ms, sys: 470 µs, total: 10.4 ms
Wall time: 9.95 ms


In [44]:
%time d['count_also_bought'] = d['also_buy'].apply(lambda x : len(x))

CPU times: user 1.95 ms, sys: 79 µs, total: 2.03 ms
Wall time: 1.97 ms


In [45]:
def parse_price(x):
    '''
    Parse the first price-like number out of the input.

    params:
        x : raw price value, e.g. '$19.99' or '$1,299.99'; non-strings are stringified

    returns:
        float, the first number found (thousands separators removed);
        np.nan when the value is missing or contains no number
    '''
    # missing values carry no price
    if x is None or (isinstance(x, float) and x != x):
        return np.nan

    # first alternative: comma-grouped numbers such as 1,299.99 (without it the
    # match would stop at the comma and yield 1.0); second alternative: plain numbers
    match = re.search(r'\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?', str(x))
    if match is None:
        return np.nan
    return float(match.group(0).replace(',', ''))


In [46]:
%time d['price'] = d['price'].apply(lambda x : parse_price(x))

CPU times: user 19.1 ms, sys: 842 µs, total: 20 ms
Wall time: 19.4 ms


In [47]:
def get_year(x):
    '''
    If there is a date present, return the year associated to that date.

    The year is taken to be the text after the last comma of the value
    (the whole value when it contains no comma).

    params:
        x : raw date value, e.g. 'July 4, 2015'

    returns:
        int year, or np.nan when the value is missing, empty or its last
        comma-separated part is not an integer
    '''

    if x is None or (isinstance(x, float) and x != x) or x == '':
        return np.nan

    # str() so that non-string values do not raise on .split
    year = str(x).split(',')[-1]
    try:
        return int(year)
    except ValueError:
        # only a failed integer conversion is expected here; a bare except would
        # also hide unrelated errors such as KeyboardInterrupt
        return np.nan

In [48]:
%time d['year'] = d['date'].apply(lambda x : get_year(x))

CPU times: user 2.77 ms, sys: 631 µs, total: 3.4 ms
Wall time: 2.95 ms


In [49]:
%time d['image_present'] = d['image_meta'].apply(lambda x : [1 if len(x) > 0 else 0][0])

CPU times: user 2.3 ms, sys: 99 µs, total: 2.4 ms
Wall time: 2.33 ms


In [50]:
def parse_rank(x):
    '''
    Given the rank strings associated to a product across multiple categories,
    parse the best (numerically smallest) rank.

    params:
        x : list of rank strings such as '>#2,623 in Toys'; a single string
            is treated as a one-element list

    returns:
        int, the smallest rank found; np.nan when the input is missing, empty
        or none of its entries contains a number
    '''

    # missing value: nothing to parse
    if x is None or (isinstance(x, float) and x != x):
        return np.nan

    # a bare string would otherwise be iterated character by character
    if isinstance(x, str):
        x = [x]

    ranks = []
    for r_str in x:
        # drop thousands separators, then take the first run of digits as the rank
        found = re.search(r'\d+', str(r_str).replace(',', ''))
        if found:
            # compare as integers: as strings, '9' would sort above '10'
            ranks.append(int(found.group(0)))
        # entries without digits are skipped instead of discarding the other ranks

    return min(ranks) if ranks else np.nan

In [51]:
%time d['best_rank'] = d['rank'].apply(lambda x : parse_rank(x))

CPU times: user 6.46 ms, sys: 128 µs, total: 6.59 ms
Wall time: 6.52 ms


## Save Results

In [38]:
# write the preprocessed dataframe to the Downloads folder, without the index column
output_path = base_path + 'Downloads/video_game_data_preprocessed.csv'
d.to_csv(output_path, index = False)