# Amazon electronic products descriptive analysis

- Source: Amazon Reviews data (http://jmcauley.ucsd.edu/data/amazon/)

The repository has several datasets. For this case study, we are using the Electronics
dataset.


In [16]:
import os
import json
import gzip
from tqdm.notebook import tqdm as tqdm

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# NOTE(review): presumably a random seed; it is not referenced in the visible cells — confirm before removing.
RANDOM = 2021

# Output directory for the PNG files written by the savefig calls in the plotting cells.
figure_path = '/home/weiss/git/thesis/doc/figures/'

In [17]:
# Directory holding the raw Amazon dumps. It can be overridden through the
# AMAZON_DATA_DIR environment variable so the notebook also runs on machines
# other than the author's; the original location stays the default.
# Joining with '' guarantees a trailing separator, so code that builds paths
# with `DIR + name` keeps working for any override value.
DIR = os.path.join(
    os.environ.get('AMAZON_DATA_DIR',
                   '/home/weiss/rs_data/amazon-electronic-product-recommendation/'),
    '',
)
PATH_IN = os.path.join(DIR, 'Electronics.json.gz')  # review records
PATH_META = os.path.join(DIR, 'meta_Electronics.json.gz')  # metadata dump

In [18]:
### load a gzip-compressed JSON-lines file (used for both ratings and meta data)

def load_data(file, limit_data=False, max_records=100000):
    """Read a gzip-compressed file with one JSON document per line.

    Parameters
    ----------
    file : str or path-like
        Path to the ``.json.gz`` file.
    limit_data : bool, default False
        When true, stop after ``max_records`` records instead of reading
        the whole file.
    max_records : int, default 100000
        Upper bound on the number of records kept when ``limit_data`` is
        true. Ignored otherwise.

    Returns
    -------
    pandas.DataFrame
        One row per parsed JSON document; columns are the union of the
        keys found in the documents.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with gzip.open(file) as f:
        for raw_line in tqdm(f):
            # Check the cap before appending so that exactly `max_records`
            # rows are kept (checking after the append kept one row too many).
            if limit_data and len(records) >= max_records:
                break
            raw_line = raw_line.strip()
            if not raw_line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            records.append(json.loads(raw_line))
    return pd.DataFrame(records)

In [None]:
# Load the complete review file (no record cap).
ratings = load_data(PATH_IN)
# display() gives the rich table rendering, matching how processed_ratings
# is shown further down; print() only produced a plain-text dump.
display(ratings.head())
print(ratings.shape)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

In [None]:
# Load the metadata file; limit_data=True caps the number of records read,
# so this is only a sample of the file.
# NOTE(review): `meta` is not used by any of the visible cells — confirm it is still needed.
meta = load_data(PATH_META, limit_data=True)
# display() gives the rich table rendering, matching how processed_ratings
# is shown further down; print() only produced a plain-text dump.
display(meta.head())
print(meta.shape)

In [None]:
# Keep only the three columns needed for the rating analysis and give them
# generic names. Renaming through an explicit mapping ties every new name to
# its source column instead of relying on column order, and rename() returns
# a fresh frame rather than mutating a slice of `ratings`.
processed_ratings = (
    ratings[['reviewerID', 'asin', 'overall']]
    .rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'})
)
display(processed_ratings.head())

In [None]:
# Size of the interaction data: distinct users, distinct items, total ratings.
distinct_users = len(processed_ratings['user_id'].unique())
distinct_items = len(processed_ratings['item_id'].unique())
total_ratings = len(processed_ratings['rating'])

print('# users: ', distinct_users)
print('# items: ', distinct_items)
print('# ratings: ', total_ratings)

In [None]:
# power law?
# Number of ratings per user, ordered from the most to the least active user.
rating_count_by_user = processed_ratings.groupby('user_id').size().sort_values(ascending=False)
print(rating_count_by_user)
size = rating_count_by_user.size
print(size)

# Replace the user ids by their rank (1..size) so the x axis is the user's
# position in the descending ordering.
df = pd.DataFrame(dict(sorted_user_id=range(1, size + 1), rating_counts=rating_count_by_user.values))

sns.set(style='whitegrid', font_scale=1.3, rc={'xtick.labelsize': 12, 'ytick.labelsize': 12})
image = sns.relplot(data=df,
                    x="sorted_user_id",
                    y="rating_counts",
                    kind="line",
                    )

image.set(xlabel="users sorted desc. by # ratings",
          ylabel="# ratings"
          )

# os.path.join builds the target path without the doubled separator that
# `figure_path + '/...'` produced.
image.savefig(os.path.join(figure_path, 'amazon-long-tail-distribution.png'), dpi=300, bbox_inches='tight')

In [None]:
# Keep only the ratings of users who rated at least n and at most m items
# (both bounds are inclusive).

n = 50
m = 1000

counts = processed_ratings['user_id'].value_counts()
mask = (counts >= n) & (counts <= m)
print(mask.value_counts())
# .copy() makes the filtered frame independent of processed_ratings, so later
# column assignments on it do not raise SettingWithCopyWarning.
processed_ratings_cut = processed_ratings[processed_ratings['user_id'].isin(counts.index[mask])].copy()

In [None]:
# power law?
# Same long-tail view as for the full data, now on the user-filtered ratings:
# number of ratings per user, ordered from the most to the least active user.
rating_count_by_user = processed_ratings_cut.groupby('user_id').size().sort_values(ascending=False)
print(rating_count_by_user)
size = rating_count_by_user.size
print(size)

# Replace the user ids by their rank (1..size) so the x axis is the user's
# position in the descending ordering.
df = pd.DataFrame(dict(sorted_user_id=range(1, size + 1), rating_counts=rating_count_by_user.values))

sns.set(style='whitegrid', font_scale=1.3, rc={'xtick.labelsize': 12, 'ytick.labelsize': 12})
image = sns.relplot(data=df,
                    x="sorted_user_id",
                    y="rating_counts",
                    kind="line",
                    )

image.set(xlabel="users sorted desc. by # ratings",
          ylabel="# ratings"
          )

# os.path.join builds the target path without the doubled separator that
# `figure_path + '/...'` produced.
image.savefig(os.path.join(figure_path, 'amazon-long-tail-distribution-cut.png'), dpi=300, bbox_inches='tight')

In [None]:
# Cast the ratings to integers. assign() builds a new frame instead of writing
# into a filtered slice, which avoids SettingWithCopyWarning.
processed_ratings_cut = processed_ratings_cut.assign(
    rating=processed_ratings_cut['rating'].astype(int)
)

print(processed_ratings_cut['rating'])
print(processed_ratings_cut['rating'].size)

In [None]:
# rating distribution after cleanup
sns.set(style='whitegrid', font_scale=1.3)
# factorplot was renamed to catplot in seaborn 0.9 and removed in later
# releases; the plotted variable must also be passed by keyword (x=...).
g = sns.catplot(x="rating", data=processed_ratings_cut, kind='count')
g.set_ylabels("# ratings")
# os.path.join builds the target path without the doubled separator that
# `figure_path + '/...'` produced.
g.savefig(os.path.join(figure_path, 'amazon-rating-distribution.png'), dpi=300, bbox_inches='tight')

In [None]:
# Mean rating over the user-filtered ratings (shown as the cell output).
processed_ratings_cut['rating'].mean()

In [None]:
# Median rating over the user-filtered ratings (shown as the cell output).
processed_ratings_cut['rating'].median()

In [None]:
# save processed ratings as pickle
# os.path.join (as used for PATH_IN / PATH_META) keeps the target path valid
# whether or not DIR ends with a separator.
processed_ratings_cut.to_pickle(os.path.join(DIR, 'amazon-electronic-ratings.pkl'))