In [1]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import sys

# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from path_utils import *

# Get the current directory of the notebook
current_dir = os.path.dirname(os.path.abspath("../../data/amazon-beauty/parse_and_clean_meta_data.ipynb"))
print(f"current directory: {current_dir}")

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty


In [2]:
# Construct the path to data file
data_path = os.path.join(current_dir, 'meta_All_Beauty.json.gz')
print(f'data path: {data_path}')
metadata_df = parse_json_gz(data_path)
# print column names and number of columns
print(f"number of columns: {len(metadata_df.columns)}")
print(f"column names: {metadata_df.columns}")
# number of rows
print(f"number of rows: {len(metadata_df)}")
metadata_df.head(3)


data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/meta_All_Beauty.json.gz
number of columns: 19
column names: Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'],
      dtype='object')
number of rows: 32892


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]


In [3]:
## remove rows with unformatted title (i.e. some 'title' may still contain html style content)
df3 = metadata_df.fillna('')
unformatted_df = df3[df3.title.str.contains('getTime')] # unformatted rows
metadata_df = df3[~df3.title.str.contains('getTime')] # filter those unformatted rows
print(len(unformatted_df))
print(len(metadata_df))

0
32892


In [5]:
# Construct the path to data file
data_path = os.path.join(current_dir, 'All_Beauty.json.gz')
print(f'data path: {data_path}')
df = parse_json_gz(data_path)
# print column names and number of columns
print(f"number of columns: {len(df.columns)}")
print(f"column names: {df.columns}")
# number of rows
print(f"number of rows: {len(df)}")
df.head(3)


data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/All_Beauty.json.gz
number of columns: 12
column names: Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image'],
      dtype='object')
number of rows: 371345


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,


In [6]:
# inner join df and metadata_df
merged_df = pd.merge(df, metadata_df, on='asin', how='inner')

# remove all html tags for all columns
merged_df = merged_df.replace(to_replace='<[^>]+>', value='', regex=True)
# print column names and number of columns
print(f"number of columns: {len(merged_df.columns)}")
print(f"column names: {merged_df.columns}")
# number of rows
print(f"number of rows: {len(merged_df)}")
# number of unique ratings and their values
print(f"number of unique ratings: {len(merged_df['overall'].unique())} and their values: {merged_df['overall'].unique()}")
# print first 3 rows
merged_df.head(3)

number of columns: 30
column names: Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image', 'category', 'tech1', 'description', 'fit', 'title',
       'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details',
       'main_cat', 'similar_item', 'date', 'price', 'imageURL',
       'imageURLHighRes'],
      dtype='object')
number of rows: 382532
number of unique ratings: 5 and their values: [5. 4. 1. 3. 2.]


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,feature,rank,also_view,details,main_cat,similar_item,date,price,imageURL,imageURLHighRes
0,5.0,True,"03 10, 2016",A5TA1NJOC0PJ5,7414204790,Amazon Customer,Excellent very good,Five Stars,1457568000,,...,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
1,5.0,True,"09 1, 2015",A1TJICB7VLGQKL,7414204790,LH,Great product,Five Stars,1441065600,,...,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
2,5.0,True,"10 3, 2014",A80M2286B7STE,7414204790,dolly,Magical,Five Stars,1412294400,,...,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [7]:
# Group by 'reviewerID' and filter users with more or equal than 5 ratings
user_counts = merged_df.groupby('reviewerID').size()
users_with_more_than_5_ratings = user_counts[user_counts >= 5].index

# Instead of randomly sampling 5 users, we'll use all users that have rated products more than 5 times
selected_users = users_with_more_than_5_ratings.tolist()

# Filter the original dataframe to include only records of these users
merged_df = merged_df[merged_df['reviewerID'].isin(selected_users)]

# Rename the 'overall' column to 'rating': rating of the product
merged_df = merged_df.rename(columns={'overall': 'rating'})

# Get rid of columns that are not needed
merged_df = merged_df.drop(columns=['verified', 'reviewTime', 
       'reviewerName', 'unixReviewTime', 'vote', 
       'style', 'image', 'tech1', 'description', 'fit', 
       'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details',
       'main_cat', 'similar_item', 'date', 'price', 'imageURL',
       'imageURLHighRes'])

# Save merged_df to a CSV in the appropriate directory
merged_df.to_csv('../../data/amazon-beauty/large_merged_data.csv', index=False)
print('Saved merged_df to csv to appropriate directory!')

# Display the number of rows, columns, and column names
print(f"column names: {merged_df.columns}")
print(f"number of columns: {len(merged_df.columns)}")
print(f"number of rows: {len(merged_df)}")

# Print the first 3 rows
merged_df.head(3)


Saved merged_df to csv to appropriate directory!
column names: Index(['rating', 'reviewerID', 'asin', 'reviewText', 'summary', 'category',
       'title'],
      dtype='object')
number of columns: 7
number of rows: 9767


Unnamed: 0,rating,reviewerID,asin,reviewText,summary,category,title
64,1.0,A2RYSCZOPEXOCQ,9790787006,"I use a lot of perfume, I go through a new bot...",This is not going to be my favorite scent.,[],Jenna Jameson Heartbreaker Perfume for women 3...
131,5.0,A141OPVE376YFI,B000050B65,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",[],Norelco 6885XL Deluxe Quadra Action Cord/Cordl...
132,5.0,A141OPVE376YFI,B000050B65,"First, a little background. I've switched bet...","Finally, a razor that lives up to the ads",[],Norelco 6885XL Deluxe Quadra Action Cord/Cordl...


In [8]:
# explore data
# number of unique users
print(f"number of unique users: {len(merged_df['reviewerID'].unique())}")
# number of unique products
print(f"number of unique products: {len(merged_df['asin'].unique())}")
# number of unique ratings and their values
print(f"number of unique ratings: {len(merged_df['rating'].unique())} and their values: {merged_df['rating'].unique()}")

number of unique users: 1608
number of unique products: 1879
number of unique ratings: 5 and their values: [1. 5. 4. 2. 3.]


# References

Justifying recommendations using distantly-labeled reviews and fined-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019
pdf: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/