In [1]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import json
import sys

# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import *
from path_utils import *

# Get the current directory of the notebook
current_dir = os.path.dirname(os.path.abspath("../../data/amazon-beauty/parse_and_clean_meta_data.ipynb"))
print(f"current directory: {current_dir}")

current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty


In [2]:
# Construct the path to data file
data_path = os.path.join(current_dir, 'meta_All_Beauty.json.gz')
print(f'data path: {data_path}')
metadata_df = parse_json_gz(data_path)
# print column names and number of columns
print(f"number of columns: {len(metadata_df.columns)}")
print(f"column names: {metadata_df.columns}")
# number of rows
print(f"number of rows: {len(metadata_df)}")
metadata_df.head(NUM_EXAMPLES)


data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/meta_All_Beauty.json.gz
number of columns: 19
column names: Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'],
      dtype='object')
number of rows: 32892


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]
3,[],,[],,Wella Koleston Perfect Hair Colour 44/44 Mediu...,[B0041PBXX8],,,[],"1,870,258 in Beauty & Personal Care (",[],"{'  Item Weight: ': '1.76 ounces', 'Sh...",All Beauty,,,,7367905066,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4,[],,[Lacto Calamine Skin Balance Daily Nourishing ...,,Lacto Calamine Skin Balance Oil control 120 ml...,[],,Pirmal Healthcare,[],"67,701 in Beauty & Personal Care (","[3254895630, B007VL1D9S, B00EH9A0RI, B0773MBG4...","{'Shipping Weight:': '12 ounces (', 'ASIN: ': ...",All Beauty,,,$12.15,7414204790,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


In [3]:
## remove rows with unformatted title (i.e. some 'title' may still contain html style content)
df3 = metadata_df.fillna('')
unformatted_df = df3[df3.title.str.contains('getTime')] # unformatted rows
metadata_df = df3[~df3.title.str.contains('getTime')] # filter those unformatted rows
print(len(unformatted_df))
print(len(metadata_df))

0
32892


In [4]:
# Construct the path to data file
data_path = os.path.join(current_dir, 'All_Beauty.json.gz')
print(f'data path: {data_path}')
df = parse_json_gz(data_path)
# print column names and number of columns
print(f"number of columns: {len(df.columns)}")
print(f"column names: {df.columns}")
# number of rows
print(f"number of rows: {len(df)}")
df.head(3)


data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/rec-sys/data/amazon-beauty/All_Beauty.json.gz
number of columns: 12
column names: Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image'],
      dtype='object')
number of rows: 371345


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,


In [5]:
# Clean data
# inner join df and metadata_df
merged_df = pd.merge(df, metadata_df, on='asin', how='inner')

# Group by 'reviewerID' and filter users with more than 5 ratings
user_counts = merged_df.groupby('reviewerID').size()
users_with_5_or_more_ratings = user_counts[user_counts > 5].index

# Convert the index to a series and then randomly sample 5 users from this filtered group
selected_users = pd.Series(users_with_5_or_more_ratings).sample(n=5, random_state=2002).tolist()

# Filter the original dataframe to include only records of these 5 users
merged_df = merged_df[merged_df['reviewerID'].isin(selected_users)]

# number of rows and columns and column names
print(f"column names: {merged_df.columns}")
print(f"number of columns: {len(merged_df.columns)}")
print(f"number of rows: {len(merged_df)}")
# print first 3 rows
merged_df.head(3)

column names: Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image', 'category', 'tech1', 'description', 'fit', 'title',
       'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details',
       'main_cat', 'similar_item', 'date', 'price', 'imageURL',
       'imageURLHighRes'],
      dtype='object')
number of columns: 30
number of rows: 34


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,feature,rank,also_view,details,main_cat,similar_item,date,price,imageURL,imageURLHighRes
4287,5.0,True,"09 17, 2015",ANV9L0JU6BNL,B000052YAN,Dennis,best floss i've used. does not break as easily...,best floss i've used,1442448000,,...,[],"120,123 in Beauty & Personal Care (","[B01I9TJRN4, B003XDVERE, B0722XHMGZ, B012O5008...",{'  Product Dimensions: ': '1 x 1 x 2 ...,All Beauty,,,$5.17,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
4288,5.0,True,"09 17, 2015",ANV9L0JU6BNL,B000052YAN,Dennis,best floss i've used. does not break as easily...,best floss i've used,1442448000,,...,[],"120,123 in Beauty & Personal Care (","[B01I9TJRN4, B003XDVERE, B0722XHMGZ, B012O5008...",{'  Product Dimensions: ': '1 x 1 x 2 ...,All Beauty,,,$5.17,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
10697,2.0,True,"03 27, 2018",A2TU781PWGS09X,B00006L9LC,Amazon Customer,Doesnt smell,Two Stars,1522108800,,...,[],"1,693,702 in Beauty & Personal Care (",[],"{'ASIN: ': 'B00006L9LC', 'UPC:': '795827187965...",All Beauty,,,$23.00,[],[]


In [6]:
# rename overall column to rating: rating of the product
merged_df = merged_df.rename(columns={'overall': 'rating'})
# Handle Missing Values
# 'rating' is an important column
merged_df = merged_df.dropna(subset=['rating'])
merged_df['rating'] = merged_df['rating'].astype(float)

# remove all html tags for all columns
merged_df = merged_df.replace(to_replace='<[^>]+>', value='', regex=True)

# Rank Cleanup
merged_df['rank'] = merged_df['rank'].str.extract(r'(\d+)').astype(float)

# Price Formatting
merged_df['price'] = pd.to_numeric(merged_df['price'].str.replace('$', '', regex=False), errors='coerce')

# Date Formatting
merged_df['reviewTime'] = pd.to_datetime(merged_df['reviewTime'])
merged_df['unixReviewTime'] = pd.to_datetime(merged_df['unixReviewTime'], unit='s')

# Convert Boolean
merged_df['verified'] = merged_df['verified'].astype(bool)

# Parse JSON or Nested Fields
def parse_json_or_list(column):
    try:
        return json.loads(column.replace("'", "\""))
    except:
        return column

json_columns = ['style', 'feature', 'also_buy', 'also_view']
for col in json_columns:
    merged_df[col] = merged_df[col].apply(parse_json_or_list)

# Handle URLs and Image Links from items, and image from users
merged_df = merged_df.drop(columns=['imageURL', 'imageURLHighRes', 'image'])

# # Remove Duplicate Rows
# merged_df = merged_df.drop_duplicates()

# number of rows and columns and column names
print(f"column names: {merged_df.columns}")
print(f"number of columns: {len(merged_df.columns)}")
print(f"number of rows: {len(merged_df)}")
# print first 3 rows
merged_df.head(3)

column names: Index(['rating', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'category', 'tech1', 'description', 'fit', 'title', 'also_buy',
       'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price'],
      dtype='object')
number of columns: 27
number of rows: 34


Unnamed: 0,rating,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price
4287,5.0,True,2015-09-17,ANV9L0JU6BNL,B000052YAN,Dennis,best floss i've used. does not break as easily...,best floss i've used,2015-09-17,,...,,Reach,[],120.0,"[B01I9TJRN4, B003XDVERE, B0722XHMGZ, B012O5008...",{'  Product Dimensions: ': '1 x 1 x 2 ...,All Beauty,,,5.17
4288,5.0,True,2015-09-17,ANV9L0JU6BNL,B000052YAN,Dennis,best floss i've used. does not break as easily...,best floss i've used,2015-09-17,,...,,Reach,[],120.0,"[B01I9TJRN4, B003XDVERE, B0722XHMGZ, B012O5008...",{'  Product Dimensions: ': '1 x 1 x 2 ...,All Beauty,,,5.17
10697,2.0,True,2018-03-27,A2TU781PWGS09X,B00006L9LC,Amazon Customer,Doesnt smell,Two Stars,2018-03-27,,...,,Citre Shine,[],1.0,[],"{'ASIN: ': 'B00006L9LC', 'UPC:': '795827187965...",All Beauty,,,23.0


In [7]:
# save merged_df to csv to appropriate directory
merged_df.to_csv('../../data/amazon-beauty/merged_data.csv', index=False)
print('Saved merged_df to csv to appropriate directory!')


Saved merged_df to csv to appropriate directory!


In [8]:
# explore data
# number of unique users
print(f"number of unique users: {len(merged_df['reviewerID'].unique())}")
# number of unique products
print(f"number of unique products: {len(merged_df['asin'].unique())}")
# number of unique ratings and their values
print(f"number of unique ratings: {len(merged_df['rating'].unique())} and their values: {merged_df['rating'].unique()}")

number of unique users: 5
number of unique products: 23
number of unique ratings: 4 and their values: [5. 2. 3. 4.]


# References

Justifying recommendations using distantly-labeled reviews and fined-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019
pdf: https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/