In [6]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import json
import sys
# Add the path to the constants file to the system path
sys.path.append('../../')
from constants import RANDOM_STATE, NUM_SAMPLES

In [7]:
def parse_json_gz(file_path):
    """
    Parse a gzip-compressed, line-delimited .json.gz file into a pandas DataFrame.

    Every non-blank line of the file is decoded as one JSON document and
    contributes one record to the result. Blank or whitespace-only lines
    (for example a trailing newline at the end of the file) are skipped
    instead of being handed to the JSON decoder.

    Parameters:
    - file_path (str): The path to the .json.gz file.

    Returns:
    - DataFrame: A pandas DataFrame containing the parsed data. It is empty
      when the file holds no records.

    Raises:
    - OSError: If the file cannot be opened or read as gzip data.
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []

    # Binary mode is kept on purpose: json.loads accepts bytes and works out
    # the text encoding itself.
    with gzip.open(file_path, 'rb') as f:
        for raw_line in f:
            payload = raw_line.strip()
            if not payload:
                # Nothing to decode on this line; json.loads would raise here.
                continue
            data.append(json.loads(payload))

    return pd.DataFrame(data)


In [8]:
# Absolute directory part of the notebook path below; the relative path is
# resolved against the current working directory.
current_dir = os.path.split(
    os.path.abspath("../../data/amazon-beauty/parse_and_clean_meta_data.ipynb")
)[0]
print(f"current directory: {current_dir}")

# Parse the metadata archive located in that directory.
data_path = os.path.join(current_dir, 'meta_All_Beauty.json.gz')
print(f'data path: {data_path}')
metadata_df = parse_json_gz(data_path)

# Report the frame's dimensions and column labels.
n_rows, n_cols = metadata_df.shape
print(f"number of columns: {n_cols}")
print(f"column names: {metadata_df.columns}")
print(f"number of rows: {n_rows}")

# Preview the first three records as the cell's displayed value.
metadata_df.head(3)


current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty
data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/meta_All_Beauty.json.gz
number of columns: 19
column names: Index(['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'tech2',
       'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat',
       'similar_item', 'date', 'price', 'asin', 'imageURL', 'imageURLHighRes'],
      dtype='object')
number of rows: 32892


Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,details,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes
0,[],,[Loud 'N Clear Personal Sound Amplifier allows...,,Loud 'N Clear&trade; Personal Sound Amplifier,[],,idea village,[],"2,938,573 in Beauty & Personal Care (",[],{'ASIN: ': '6546546450'},All Beauty,,,,6546546450,[],[]
1,[],,[No7 Lift & Luminate Triple Action Serum 50ml ...,,No7 Lift &amp; Luminate Triple Action Serum 50...,"[B01E7LCSL6, B008X5RVME]",,,[],"872,854 in Beauty & Personal Care (",[],"{'Shipping Weight:': '0.3 ounces (', 'ASIN: ':...",All Beauty,"class=""a-bordered a-horizontal-stripes a-spa...",,$44.99,7178680776,[],[]
2,[],,[No7 Stay Perfect Foundation now stays perfect...,,No7 Stay Perfect Foundation Cool Vanilla by No7,[],,No7,[],"956,696 in Beauty & Personal Care (","[B01B8BR0O8, B01B8BR0NO, B014MHXXM8]","{'Shipping Weight:': '3.5 ounces (', 'ASIN: ':...",All Beauty,,,$28.76,7250468162,[],[]


In [9]:
## remove rows with unformatted title (i.e. some 'title' may still contain html style content)
# fillna('') replaces missing values in every column (not only 'title') with
# an empty string, so the string match below yields a strict boolean mask.
df3 = metadata_df.fillna('')
# Literal substring test, evaluated once and reused for both halves of the split.
unformatted_mask = df3['title'].str.contains('getTime', regex=False)
unformatted_df = df3[unformatted_mask]  # unformatted rows
metadata_df = df3[~unformatted_mask]  # filter those unformatted rows
# Row counts: flagged rows first, then the rows that remain.
print(len(unformatted_df))
print(len(metadata_df))

0
32892


In [10]:
# Absolute directory part of the notebook path below; the relative path is
# resolved against the current working directory.
current_dir = os.path.split(
    os.path.abspath("../../data/amazon-beauty/parse_and_clean_meta_data.ipynb")
)[0]
print(f"current directory: {current_dir}")

# Parse the reviews archive located in that directory.
data_path = os.path.join(current_dir, 'All_Beauty.json.gz')
print(f'data path: {data_path}')
df = parse_json_gz(data_path)

# Report the frame's dimensions and column labels.
n_rows, n_cols = df.shape
print(f"number of columns: {n_cols}")
print(f"column names: {df.columns}")
print(f"number of rows: {n_rows}")

# Preview the first three records as the cell's displayed value.
df.head(3)


current directory: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty
data path: /Users/tnathu-ai/VSCode/recommender-system/recommender-system-openAI/code/data/amazon-beauty/All_Beauty.json.gz
number of columns: 12
column names: Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image'],
      dtype='object')
number of rows: 371345


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"02 19, 2015",A1V6B6TNIC10QE,143026860,theodore j bigham,great,One Star,1424304000,,,
1,4.0,True,"12 18, 2014",A2F5GHSXFQ0W6J,143026860,Mary K. Byke,My husband wanted to reading about the Negro ...,... to reading about the Negro Baseball and th...,1418860800,,,
2,4.0,True,"08 10, 2014",A1572GUYS7DGSR,143026860,David G,"This book was very informative, covering all a...",Worth the Read,1407628800,,,


In [11]:
# Inner join df and metadata_df on 'asin': only rows whose asin appears in
# both frames are kept.
# NOTE(review): if metadata_df holds repeated asin values, matching rows of df
# are repeated in the join result — confirm asin uniqueness if that matters.
merged_df = pd.merge(df, metadata_df, on='asin', how='inner')
# Draw a reproducible sample of at most NUM_SAMPLES rows. The size is clamped
# to the number of joined rows because DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
sample_size = min(NUM_SAMPLES, len(merged_df))
merged_df = merged_df.sample(sample_size, random_state=RANDOM_STATE)
# save merged_df to csv to appropriate directory (index column is not written)
merged_df.to_csv('../../data/amazon-beauty/merged_data.csv', index=False)
# print column names and number of columns
print(f"number of columns: {len(merged_df.columns)}")
print(f"column names: {merged_df.columns}")
# number of rows
print(f"number of rows: {len(merged_df)}")
# print first 3 rows
merged_df.head(3)

number of columns: 30
column names: Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image', 'category', 'tech1', 'description', 'fit', 'title',
       'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details',
       'main_cat', 'similar_item', 'date', 'price', 'imageURL',
       'imageURLHighRes'],
      dtype='object')
number of rows: 100


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,...,feature,rank,also_view,details,main_cat,similar_item,date,price,imageURL,imageURLHighRes
259508,5.0,True,"11 28, 2016",A36TB0OQPPEWNR,B0171BTD8U,Amazon Customer,It works great!!! I love how our tooth brushes...,Five Stars,1480291200,,...,[],"350,740 in Beauty & Personal Care (","[B003M8GMUO, B076BDKX2H, B0006VGZFC, B00LZRJGC...","{'Shipping Weight:': '15.2 ounces', 'ASIN: ': ...",All Beauty,,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
224587,5.0,True,"04 22, 2016",AP7TD79VLW2J7,B00U1WC2JY,Kevin Gilbert,Without a doubt the best loofa/scrubber I have...,Great scrubber!!!!,1461283200,,...,[],"258,475 in Beauty & Personal Care (","[B07KHPLVDN, B016DQUKE0, B078XQM588, B07DQ8PSW...","{'Shipping Weight:': '3.2 ounces', 'ASIN: ': '...",All Beauty,,,,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...
225487,5.0,True,"06 19, 2017",AIIG83YPKDCU1,B00UL4GGGC,Amazon Customer,I originally ordered this product to exercise ...,IT'S CORRECTING MY OVERBITE!!!,1497830400,3.0,...,[],"368,117 in Beauty & Personal Care (","[B06X6GRJHW, B000K7J1PI, B0743H5LF1, B07CZYXJN...","{'  Item Weight: ': '3.2 ounces', 'Shi...",All Beauty,,,$6.80,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...


# References

Justifying recommendations using distantly-labeled reviews and fine-grained aspects
Jianmo Ni, Jiacheng Li, Julian McAuley
Empirical Methods in Natural Language Processing (EMNLP), 2019
pdf