In [1]:
## Import required libraries:
import json
import os
import random
from datetime import timedelta, datetime

import numpy as np
import pandas as pd
from bson import ObjectId
from faker import Faker

In [2]:
# Locations of the three JSON Lines inputs, relative to the working directory
file_paths = dict(
    review="generated data/review_data.json",
    meta="generated data/meta_data.json",
    user="generated data/user_data.json",
)


def _read_json_lines(path):
    """Read a JSON Lines file (one JSON object per line) into a DataFrame."""
    return pd.read_json(path, lines=True, encoding='utf-8')


# pandas parses each file directly; no intermediate json.load() step is involved
review_df = _read_json_lines(file_paths["review"])
meta_df = _read_json_lines(file_paths["meta"])
user_df = _read_json_lines(file_paths["user"])

In [3]:
# Sanity check on the load: print every frame's (rows, columns) first,
# then the record at position 945 of each frame, in review/meta/user order.
loaded_frames = (review_df, meta_df, user_df)

for loaded_frame in loaded_frames:
    print(loaded_frame.shape)

for loaded_frame in loaded_frames:
    print(loaded_frame.iloc[945])

(6460965, 11)
(1210967, 18)
(658400, 8)
overall                                                           4
vote                                                              1
verified                                                      False
reviewTime                                              08 14, 2000
unixReviewTime                                            966211200
reviewerID                                           A19646YDU8IH1I
reviewerName                                    Robert Ian Farquhar
asin                                                     B00000DMA8
style                                     {'Edition:': ' Standard'}
reviewText        Okay I admit it, the two main reasons I bought...
summary                                                   Good Fun!
Name: 945, dtype: object
asin                                                      B00001XDVT
title                                  Armorines: Project S.W.A.R.M.
feature            [Great Condition, cleaned and 

In [4]:
# Build a synthetic transaction table: one purchase record for every review
# whose product has a parseable dollar price in the metadata.
payment_methods = ['MasterCard', 'Visa', 'PayPal', 'Discover', 'Amex', 'Bitcoin']

# Seeded generator so that Restart & Run All reproduces the same synthetic values
TRANSACTION_SEED = 42
rng = np.random.default_rng(TRANSACTION_SEED)

# Parse prices on the (smaller) metadata table before merging.
# Only values that start with '$' are considered; thousands separators are removed,
# and anything that still is not a single number (e.g. a price range) is dropped
# instead of raising during the float conversion.
price_lookup = meta_df[['asin', 'price']]
price_lookup = price_lookup[price_lookup['price'].str.startswith('$', na=False)]
price_lookup = price_lookup.assign(
    price=pd.to_numeric(
        price_lookup['price'].str[1:].str.replace(',', '', regex=False),
        errors='coerce',
    )
).dropna(subset=['price'])

# Keep a single price per asin so a review can never fan out into several
# transactions if meta_df lists the same asin more than once.
price_lookup = price_lookup.drop_duplicates(subset='asin')

# Inner merge keeps exactly the reviews with a valid price, in review_df order,
# and yields a fresh frame (no SettingWithCopyWarning on the assignments below).
merged_df = review_df.merge(price_lookup, on='asin', how='inner')
n_rows = len(merged_df)

# Random copy count in [1, 10] and the resulting total, rounded to whole cents
merged_df['copy'] = rng.integers(1, 10, size=n_rows, endpoint=True)
merged_df['totalPrice'] = (merged_df['price'] * merged_df['copy']).round(2)

# transactionTime is a random instant between 0 and 7 days (inclusive) before the review
seconds_in_a_day = 86400
random_seconds = rng.integers(0, 7 * seconds_in_a_day, size=n_rows, endpoint=True)
review_times = pd.to_datetime(merged_df['unixReviewTime'], unit='s')
transaction_times = review_times - pd.to_timedelta(random_seconds, unit='s')
merged_df['transactionTime'] = transaction_times

# Random payment method and a freshly generated ObjectId string per transaction
merged_df['paymentMethod'] = rng.choice(payment_methods, size=n_rows)
merged_df['transactionID'] = [str(ObjectId()) for _ in range(n_rows)]

# Select relevant columns
transaction_df = merged_df[['transactionID', 'transactionTime', 'asin','reviewerID', 'copy', 'totalPrice', 'paymentMethod']]

In [5]:
# Persist the full synthetic transaction table as JSON Lines (one record per line).
# NOTE(review): this is an absolute, machine-specific path, whereas the inputs are
# read from the relative "generated data/" folder — confirm whether the output
# location should be relative/configurable as well.
output_dir = "G:\\DSA5104 Project\\generated data"

# exist_ok=True creates the folder when missing and is a no-op when it already exists
os.makedirs(output_dir, exist_ok=True)

transaction_df.to_json(os.path.join(output_dir, "transaction_data_2.json"), orient='records', lines=True)

Create a reduced sample dataset (select 1000 unique ASINs)

In [6]:
# Draw 1000 distinct asins from the product metadata (fixed random_state)
unique_asins = meta_df['asin'].unique()
sampled_asins = pd.Series(unique_asins).sample(n=1000, random_state=1).tolist()

# Reviews written about the sampled products
review_data_sample = review_df[review_df['asin'].isin(sampled_asins)]

# Users who authored at least one of those reviews
unique_reviewerIDs = review_data_sample['reviewerID'].unique()
user_data_sample = user_df[user_df['reviewerID'].isin(unique_reviewerIDs)]

# Transactions and metadata rows that belong to the sampled products
transaction_data_sample = transaction_df[transaction_df['asin'].isin(sampled_asins)]
meta_data_sample = meta_df[meta_df['asin'].isin(sampled_asins)]

# Write every reduced dataset as JSON Lines into the output folder
output_dir = "G:\\DSA5104 Project\\generated data"

sample_outputs = [
    ('meta_data_sample.json', meta_data_sample),
    ('review_data_sample.json', review_data_sample),
    ('user_data_sample.json', user_data_sample),
    ('transaction_data_sample.json', transaction_data_sample),
]
for sample_file_name, sample_frame in sample_outputs:
    sample_frame.to_json(os.path.join(output_dir, sample_file_name), orient='records', lines=True)