This code is for cleaning data and generating User and Transaction data based on the existed datasets.

In [56]:
## Import required libraries:
import json
import pandas as pd
import random
import numpy as np
from faker import Faker
from datetime import timedelta, datetime
from bson import ObjectId

In [9]:
### Read the datasets
## Change the file_path accordingly 
file_path = "cleaning_data/review_data.json"
review_data = []
with open(file_path, 'r', encoding="utf-8") as file:
    review_data = json.load(file)
review_df = pd.DataFrame(review_data)

In [10]:
file_path = "cleaning_data/meta_data.json"
meta_data = []
with open(file_path, 'r', encoding="utf-8") as file:
    meta_data = json.load(file)
meta_df = pd.DataFrame(meta_data)

In [13]:
# To check if the dataset is correctly loaded
print(review_df.shape)
print(meta_df.shape)
print(review_df.iloc[945])
print(meta_df.iloc[945])

(6460965, 11)
(1210967, 18)
overall                                                         4.0
vote                                                              1
verified                                                      False
reviewTime                                              08 14, 2000
unixReviewTime                                            966211200
reviewerID                                           A19646YDU8IH1I
reviewerName                                    Robert Ian Farquhar
asin                                                     B00000DMA8
style                                     {'Edition:': ' Standard'}
reviewText        Okay I admit it, the two main reasons I bought...
summary                                                   Good Fun!
Name: 945, dtype: object
asin                                                      B00001XDVT
title                                  Armorines: Project S.W.A.R.M.
feature            [Great Condition, cleaned and tested, *Car

In [None]:
review_df.shape
meta_df

In [12]:
### Check the NULL values inside datasets
# Checking for missing values
print("\nMissing values in review_df:")
print(review_df.isnull().sum()) # have missing values in "style"

print("\nMissing values in meta_df:")
print(meta_df.isnull().sum())


Missing values in review_df:
overall                 0
vote                    0
verified                0
reviewTime              0
unixReviewTime          0
reviewerID              0
reviewerName            0
asin                    0
style             1521260
reviewText              0
summary                 0
dtype: int64

Missing values in meta_df:
asin               0
title              0
feature            0
description        0
price              0
imageURL           0
imageURLHighRes    0
also_buy           0
also_view          0
rank               0
brand              0
category           0
main_cat           0
tech1              0
tech2              0
similar_item       0
date               0
fit                0
dtype: int64


When checking the dataset, I found some elements in "price" column is bad:

In [16]:
unique_prices_with_nan = meta_df['price'].unique()
print(unique_prices_with_nan) # many price is ''

# Count of elements with an empty string in "price"
empty_string_count = meta_df[meta_df['price'] == ""].shape[0]

pattern = r'^\$\d+(\.\d{1,2})?$'
non_matching_count = meta_df[~meta_df['price'].str.match(pattern, na=False)].shape[0]

print(f"Number of elements with an empty string in 'price': {empty_string_count}")
print(f"Number of elements not in the form '$[some number]' in 'price': {non_matching_count}")

['' '\n\t\t\t\t\t\t\t\t\t\t\t\t<span class="verticalAlign a-size-large"'
 '$0.72' ... '$217.06' '$108.49' '$293.99']
Number of elements with an empty string in 'price': 658678
Number of elements not in the form '$[some number]' in 'price': 770753


Here are the codes for cleaning the "price" element (generate some fake prices for these "bad" elements)

In [18]:
# Define a function to generate a random price in the format $x.xx
def generate_random_price():
    # Generate a random float between 0.01 and 299.99
    random_price = round(random.uniform(0.01, 299.99), 2)
    # Convert to string format with 2 decimal places and prepend with a dollar sign
    return f"${random_price:.2f}"

# Identify the indices of rows that have an empty string or not in the desired format in the 'price' column
pattern = r'^\$\d+(\.\d{1,2})?$'
invalid_price_indices = meta_df[~meta_df['price'].str.match(pattern, na=False)].index

# Replace these values with randomly generated prices
meta_df.loc[invalid_price_indices, 'price'] = meta_df.loc[invalid_price_indices, 'price'].apply(lambda x: generate_random_price())

In [19]:
# Check again
empty_string_count = meta_df[meta_df['price'] == ""].shape[0]

pattern = r'^\$\d+(\.\d{1,2})?$'
non_matching_count = meta_df[~meta_df['price'].str.match(pattern, na=False)].shape[0]

print(f"Number of elements with an empty string in 'price': {empty_string_count}")
print(f"Number of elements not in the form '$[some number]' in 'price': {non_matching_count}")

Number of elements with an empty string in 'price': 0
Number of elements not in the form '$[some number]' in 'price': 0


Here, let's generate some User data:

First, we observe that there are two columns "reviewerName" and "reviewerID". We do some checking on these columns:

In [22]:
unique_reviewerNames = review_df['reviewerName'].unique()
unique_reviewerIDs = review_df['reviewerID'].unique()

# Print the number of unique values for each
print(f"Number of unique reviewerNames: {len(unique_reviewerNames)}")
print(f"Number of unique reviewerIDs: {len(unique_reviewerIDs)}")

Number of unique reviewerNames: 460580
Number of unique reviewerIDs: 641085


In [23]:
most_frequent_reviewerName = review_df['reviewerName'].mode().iloc[0]
most_frequent_reviewerID = review_df['reviewerID'].mode().iloc[0]

print(f"Most frequent reviewerName: {most_frequent_reviewerName}")
print(f"Most frequent reviewerID: {most_frequent_reviewerID}")

Most frequent reviewerName: Amazon Customer
Most frequent reviewerID: AV6QDP8Q0ONK4


In [20]:
# Rows with both "reviewerName" and "reviewerID" present:
both_present_df = review_df[(review_df['reviewerName'] != "") & (review_df['reviewerID'] != "")]

# Rows with only "reviewerName" present:
only_name_df = review_df[(review_df['reviewerName'] != "") & (review_df['reviewerID'] == "")]

# Rows with only "reviewerID" present:
only_id_df = review_df[(review_df['reviewerName'] == "") & (review_df['reviewerID'] != "")]

# Rows where both "reviewerName" and "reviewerID" are missing:
both_missing_df = review_df[(review_df['reviewerName'] == "") & (review_df['reviewerID'] == "")]

In [21]:
# To get the count of rows for each condition
print("Rows with both reviewerName and reviewerID present:", both_present_df.shape[0])
print("Rows with only reviewerName present:", only_name_df.shape[0])
print("Rows with only reviewerID present:", only_id_df.shape[0])
print("Rows where both reviewerName and reviewerID are missing:", both_missing_df.shape[0])

Rows with both reviewerName and reviewerID present: 6460965
Rows with only reviewerName present: 0
Rows with only reviewerID present: 0
Rows where both reviewerName and reviewerID are missing: 0


In [38]:
long_names = len(review_df[review_df['reviewerName'].str.len() > 40]['reviewerName'])

print(f'There are {long_names} reviewer names longer than 40 characters.')

There are 5825 reviewer names longer than 40 characters.


In [40]:
# Define a function to replace long reviewer names with fake names
fake = Faker()

def replace_long_name(name):
    if len(name) > 40:
        return fake.first_name()
    else:
        return name

# Apply the function to the reviewerName column of review_df
review_df['reviewerName'] = review_df['reviewerName'].apply(replace_long_name)

In [42]:
long_ids = len(review_df[review_df['reviewerID'].str.len() > 20]['reviewerName'])

print(f'There are {long_ids} reviewer IDs longer than 20 characters.')

There are 0 reviewer IDs longer than 20 characters.


In [43]:
# Check again
long_names = len(review_df[review_df['reviewerName'].str.len() > 40]['reviewerName'])

print(f'There are {long_names} reviewer names longer than 40 characters.')

There are 0 reviewer names longer than 40 characters.


Then, we use these information to generate User data (Here I use Yelp User data as an example)

**Some explanations for the User Data**:

**reviewerName and reviewerID**: Referencing the review column (for each unique reviewerName and reviewerID pair, generate one user record)

**registerDate**: Timestamp of user's registeration date and time

**reviewCount**: how many review does the user posted (checking the number of records in "review_df")

**totalVotes**: Total votes the user get for his reviews (also checking "vote" in "review_df")

**fans**: Fake fans number of the user (For user with higher total votes, his fans number should be higher)

**phoneNumber**: Fake phone number of the user

**homeAddress**: Fake home address of the user

In [53]:
fake = Faker()

review_df['vote'] = pd.to_numeric(review_df['vote'], errors='coerce')

# Fill NaN values with 0
review_df['vote'].fillna(0, inplace=True)

# Group by reviewerName and reviewerID, and aggregate needed information
user_df = review_df.groupby(['reviewerName', 'reviewerID']).agg(
    registerDate=pd.NamedAgg(column='unixReviewTime', aggfunc='min'),
    reviewCount=pd.NamedAgg(column='asin', aggfunc='size'),
    totalVotes=pd.NamedAgg(column='vote', aggfunc='sum')
).reset_index()

# Convert registerDate from UNIX timestamp to datetime format
user_df['registerDate'] = pd.to_datetime(user_df['registerDate'], unit='s')

# Subtract a random number of days (between 1 and 365) from the earliest review date
user_df['registerDate'] = user_df['registerDate'] - pd.to_timedelta(np.random.randint(1, 365, size=len(user_df)), unit='D')

# Generate fake fans number based on totalVotes
user_df['fans'] = (np.log(user_df['totalVotes'] + 1) * 30).astype(int) + 5

# Generate fake phone number and home address for each user
user_df['phoneNumber'] = user_df['reviewerName'].apply(lambda x: fake.phone_number())
user_df['homeAddress'] = user_df['reviewerName'].apply(lambda x: fake.address().replace('\n', ', '))

user_df = user_df[['reviewerName', 'reviewerID', 'registerDate', 'reviewCount', 'totalVotes', 'fans', 'phoneNumber', 'homeAddress']]

In [54]:
# Check the generated user data
print(user_df.shape)
print(user_df.iloc[65])

(658400, 8)
reviewerName                              SHAMROCK FPV Chucks P. 
reviewerID                                         A1ZXRG90P7WHB2
registerDate                                  2013-08-31 00:00:00
reviewCount                                                     6
totalVotes                                                    6.0
fans                                                           63
phoneNumber                                001-721-658-3048x31780
homeAddress     434 Wilkinson Courts Suite 488, Port Raymond, ...
Name: 65, dtype: object


Then, let's generate the transaction records:

Some explanations for the generated transaction records:

**transactionID**: an unique id for each transaction 

**transactionTime**: Simulated transaction time, a little bit earlier than the review time (0-3 days)

**buyerID**: The ID of the buyer (also the reviewer, referencing the reviewerID)

**copy**: how many copies the buyer bought. randomly chosen from 1-10

**totalPrice**: calculated total price (price * copy)

**paymentMethod**: fake payment method (master card, paypal etc.)

In [57]:
# This version is time-consuming
# Initialize the list of possible payment methods
payment_methods = ['MasterCard', 'Visa', 'PayPal', 'Discover', 'Amex', 'Bitcoin']

# For each review, generate a transaction record
transactions = []

for index, row in review_df.iterrows():
    asin = row['asin']
    reviewerID = row['reviewerID']
    
    # Extract price for the corresponding product from meta_df
    item_price = meta_df[meta_df['asin'] == asin]['price'].iloc[0]
    
    # If the price is missing or not in correct format, skip this row (or you can assign a default price)
    if not item_price.startswith('$'):
        continue
    item_price = float(item_price[1:])  # Convert price from string to float, excluding the dollar sign
    
    # Determine copy and calculate total price
    copy_count = random.randint(1, 10)
    total_price = item_price * copy_count
    
    # Determine the transaction time
    review_time = pd.to_datetime(row['unixReviewTime'], unit='s')
    seconds_in_a_day = 86400
    random_seconds = random.randint(0, 7 * seconds_in_a_day)
    transaction_time = review_time - timedelta(seconds=random_seconds)
    
    # Append the transaction record to the list
    transactions.append({
        'transactionID': str(ObjectId()),
        'transactionTime': transaction_time,
        'buyerID': reviewerID,
        'copy': copy_count,
        'totalPrice': total_price,
        'paymentMethod': random.choice(payment_methods)
    })

# Convert the list of transactions into a DataFrame
transaction_df = pd.DataFrame(transactions)

KeyboardInterrupt: 

In [58]:
# Use this version
# Merge review_df and meta_df on 'asin' to obtain price for each review
payment_methods = ['MasterCard', 'Visa', 'PayPal', 'Discover', 'Amex', 'Bitcoin']

merged_df = review_df.merge(meta_df[['asin', 'price']], on='asin', how='left')

# Filter rows where price starts with '$' and convert to float
valid_prices = merged_df['price'].str.startswith('$', na=False)
merged_df = merged_df[valid_prices]
merged_df['price'] = merged_df['price'].str[1:].astype(float)

# Generate random copy count and calculate total price
merged_df['copy'] = [random.randint(1, 10) for _ in range(len(merged_df))]
merged_df['totalPrice'] = merged_df['price'] * merged_df['copy']

# Compute transactionTime as a random time up to 7 days before reviewTime
seconds_in_a_day = 86400
random_seconds = [random.randint(0, 7 * seconds_in_a_day) for _ in range(len(merged_df))]
review_times = pd.to_datetime(merged_df['unixReviewTime'], unit='s')
transaction_times = review_times - pd.to_timedelta(random_seconds, unit='s')
merged_df['transactionTime'] = transaction_times

# Assign random payment method and generate unique transaction ID
merged_df['paymentMethod'] = [random.choice(payment_methods) for _ in range(len(merged_df))]
merged_df['transactionID'] = [str(ObjectId()) for _ in range(len(merged_df))]

# Select relevant columns
transaction_df = merged_df[['transactionID', 'transactionTime', 'reviewerID', 'copy', 'totalPrice', 'paymentMethod']]

In [59]:
# Check the generated user data
print(transaction_df.shape)
print(transaction_df.iloc[625])

(6453130, 6)
transactionID      65394be6e6e35c3b983f529a
transactionTime         2010-09-08 10:16:57
reviewerID                   A1FDNRZT2G0LGD
copy                                      1
totalPrice                            31.16
paymentMethod                    MasterCard
Name: 625, dtype: object


Finally, we save the new data into four JSON files to folder called "generated data"

In [60]:
import os

# Create a new folder called "generated data"
output_dir = "G:\\DSA5104 Project\\generated data"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Save the dataframes into JSON files
meta_df.to_json(os.path.join(output_dir, "meta_data.json"), orient='records', lines=True)
review_df.to_json(os.path.join(output_dir, "review_data.json"), orient='records', lines=True)
user_df.to_json(os.path.join(output_dir, "user_data.json"), orient='records', lines=True)
transaction_df.to_json(os.path.join(output_dir, "transaction_data.json"), orient='records', lines=True)
