In [None]:
import json
import re
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Path to the product metadata file; each non-empty line is parsed as one JSON record.
file = 'YOUR_METADATA_FILE.jsonl'

products = []

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which can fail on non-ASCII characters (e.g. emojis) in the metadata.
with open(file, 'r', encoding='utf-8') as fp:
    for line in fp:
        record = line.strip()
        # Skip blank lines (such as a trailing empty line at the end of the file)
        # rather than letting json.loads raise on an empty string.
        if record:
            products.append(json.loads(record))

In [None]:
# Keep only the fields this analysis uses; every other key in each record is dropped.
relevant_fields = ['title', 'rating_number', 'features', 'description']
products = [
    dict((field, product[field]) for field in relevant_fields)
    for product in products
]

# Join description and features into a single string per product.
# The pattern strips every character that is not an ASCII letter, a digit, or
# whitespace, so emojis are removed, but so are punctuation and accented letters.
non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
for product in products:
    merged_text = ' '.join(product['description'] + product['features'])
    product['combined_description'] = non_alphanumeric.sub('', merged_text).strip()

In [None]:
# Collect each product's rating count into a NumPy array for plotting.
num_ratings = np.array([product['rating_number'] for product in products])

In [None]:
# Histogram of rating counts, used to pick the ratings cutoff below.
fig, ax = plt.subplots()
ax.hist(num_ratings, bins=20)
ax.set(
    title='Distribution of Product Number of Ratings',
    xlabel='Number of Ratings',
    ylabel='Frequency',
)
plt.show()

**Please choose your own threshold (minimum number of ratings) based on the histogram above.** You may need to change the number of bins, or exclude values above a certain number, to get a sense of a good cutoff value. Note that the next cell keeps only products with strictly more ratings than the threshold.

In [None]:
# Popularity cutoff: a product is kept only if it has strictly more ratings than this.
ratings_threshold = 1000 # PLEASE CHANGE THIS NUMBER

filtered_products = []
for product in products:
    if product['rating_number'] > ratings_threshold:
        filtered_products.append(product)

print(f"There are now {len(filtered_products)} remaining products.")

In [None]:
# Character count of the combined description/features text for each remaining product.
lengths = np.array([len(product['combined_description']) for product in filtered_products])

# Number of remaining products with a non-empty features / description field.
has_features = sum(1 for product in filtered_products if len(product['features']) > 0)
has_description = sum(1 for product in filtered_products if len(product['description']) > 0)

In [None]:
# Report how many of the remaining products have non-empty features / descriptions.
n_filtered = len(filtered_products)

if n_filtered == 0:
    # Guard against dividing by zero when the ratings threshold removed every product.
    print("No products remain after the ratings filter; lower ratings_threshold and re-run.")
else:
    # The '%' format type multiplies by 100 and appends a percent sign, so the
    # printed value is unambiguous (e.g. 87.5% rather than a bare 87.5).
    print(f"Proportion of products with features: {has_features / n_filtered:.1%}")
    print(f"Proportion of products with descriptions: {has_description / n_filtered:.1%}")

In [None]:
# Histogram of combined text lengths, used to pick the character-count cutoff below.
fig, ax = plt.subplots()
ax.hist(lengths, bins=20)
ax.set(
    title='Distribution of Product Description and Features Length',
    xlabel='Character Count',
    ylabel='Frequency',
)
plt.show()

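**Please choose your own threshold (minimum number of characters) based on the histogram above.** You may need to change the number of bins, or exclude values above a certain number, to get a sense of a good cutoff value. Note that the next cell keeps only products whose combined description and features text is strictly longer than the threshold.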

In [None]:
# Text-length cutoff: a product is kept only if its combined description/features
# text is strictly longer than this many characters.
description_threshold = 1000 # PLEASE CHANGE THIS NUMBER

final_products = []
for product in filtered_products:
    if len(product['combined_description']) > description_threshold:
        final_products.append(product)

print(f"There are now {len(final_products)} remaining products.")