In [1]:
import numpy as np
import pandas as pd

import os
import pickle
import re

import gzip
import json
import shutil

import random

In [2]:
data_dir = "./data"
reviews_fn = "Electronics.csv"
metadata_fn = "meta_Electronics.jsonl"
metadata_split = "meta/"
reviews_split = "reviews/"

# Preprocess metadata file

In [3]:
def split_jsonl(input_file, output_folder, num_files):
	if os.path.isdir(output_folder):
		shutil.rmtree(output_folder)

	if not os.path.exists(output_folder):
		os.makedirs(output_folder)

	with open(input_file, 'r', encoding='utf-8') as f:
		file_count = 1
		lines_per_file = sum(1 for line in f) // num_files
		f.seek(0)  # Reset file pointer to the beginning
		lines = []
		for idx, line in enumerate(f):
			lines.append(line)
			# If reached the desired number of lines per file or end of input file
			if len(lines) == lines_per_file or idx == (os.stat(input_file).st_size - 1):
				output_file = os.path.join(
					output_folder, f'{file_count}.jsonl')
				with open(output_file, 'w', encoding='utf-8') as fw:
					fw.writelines(lines)
				lines = []
				file_count += 1

In [4]:
def load_jsonl_to_df(file_path):
	data = []
	with open(file_path, 'r', encoding='utf-8') as file:
		for line in file:
			data.append(json.loads(line.strip()))

	df = pd.DataFrame(data)
	return df

In [5]:
# Load metadata
metadata_path = os.path.join(data_dir, metadata_fn)
metadata_split_path = os.path.join(data_dir, metadata_split)

In [6]:
split_files_num = 500
split_jsonl(metadata_path, metadata_split_path, split_files_num)

In [7]:
all_files = [os.path.join(metadata_split_path, fn) for fn in os.listdir(
	metadata_split_path) if os.path.isfile(os.path.join(metadata_split_path, fn))]

In [8]:
random_files = random.sample(all_files, 100)

In [9]:
metadata_df = pd.DataFrame()
columns_to_drop = ['main_category', 'features',
				   'description', 'images', 'videos', 'details',
				   'bought_together', 'subtitle', 'author']

In [10]:
count = 0
for fp in all_files:
	count += 1
	if os.path.isfile(fp):
		print(f"{count}. {fp}")
		df = load_jsonl_to_df(fp)
		df = df.sample(n=640, random_state=42)
		# df = df.sample(n=500, random_state=42)
		# Check if columns exist before dropping them
		columns_to_drop_existing = [
			col for col in columns_to_drop if col in df.columns]
		if columns_to_drop_existing:
			df.drop(columns=columns_to_drop_existing, inplace=True)

		# Replace non-numeric values in 'price' with NaN
		# df['price'] = pd.to_numeric(df['price'], errors='coerce')
		# # impute prices
		# median_price = df['price'].astype(float).median()
		# df['price'] = df['price'].astype(float).fillna(median_price)
		# df['price'] = (100 * df['price']).astype(int)


		# Process categories
		df['niche'] = df['categories'].apply(
			lambda x: str(x).strip('[]').split(', ')[-1])
		df['niche'] = df['niche'].apply(
			lambda x: str(x).replace('"', '').replace("'", ''))
		df.drop(columns='categories', inplace=True)

		df = df.dropna()
		# Concat splited df
		metadata_df = pd.concat([metadata_df, df], ignore_index=True)

1. ./data/meta/192.jsonl
2. ./data/meta/276.jsonl
3. ./data/meta/75.jsonl
4. ./data/meta/266.jsonl
5. ./data/meta/296.jsonl
6. ./data/meta/311.jsonl
7. ./data/meta/235.jsonl
8. ./data/meta/449.jsonl
9. ./data/meta/97.jsonl
10. ./data/meta/491.jsonl
11. ./data/meta/300.jsonl
12. ./data/meta/419.jsonl
13. ./data/meta/455.jsonl
14. ./data/meta/202.jsonl
15. ./data/meta/383.jsonl
16. ./data/meta/423.jsonl
17. ./data/meta/63.jsonl
18. ./data/meta/190.jsonl
19. ./data/meta/155.jsonl
20. ./data/meta/416.jsonl
21. ./data/meta/17.jsonl
22. ./data/meta/222.jsonl
23. ./data/meta/78.jsonl
24. ./data/meta/489.jsonl
25. ./data/meta/481.jsonl
26. ./data/meta/210.jsonl
27. ./data/meta/65.jsonl
28. ./data/meta/287.jsonl
29. ./data/meta/53.jsonl
30. ./data/meta/188.jsonl
31. ./data/meta/267.jsonl
32. ./data/meta/209.jsonl
33. ./data/meta/32.jsonl
34. ./data/meta/281.jsonl
35. ./data/meta/103.jsonl
36. ./data/meta/178.jsonl
37. ./data/meta/418.jsonl
38. ./data/meta/117.jsonl
39. ./data/meta/366.jsonl
40.

In [15]:
metadata_df['price'] = pd.to_numeric(metadata_df['price'], errors='coerce')
# check if still exist non numberic row
non_numeric_rows = metadata_df[pd.to_numeric(metadata_df['price'], errors='coerce').isnull()]


In [18]:
niches = list(metadata_df['niche'].unique())

In [19]:
niches.remove("")

In [20]:
imute_prices_by_niche = {}
for niche in niches:
	df = metadata_df[metadata_df['niche'] == niche]
	# # impute prices
	mean_price = df['price'].astype(float).mean()

	imute_prices_by_niche[niche] = mean_price
	

In [24]:
for niche in niches:
    niche_median_price = imute_prices_by_niche[niche]
    # Select rows where 'niche' matches the current niche
    niche_rows = metadata_df['niche'] == niche
    # Replace NaN values in 'price' column for this niche with median price
    metadata_df.loc[niche_rows, 'price'].fillna(niche_median_price, inplace=True)

In [25]:
out_metadata_path = "./data/preprocessed_metadata.csv"
# Write the DataFrame to a CSV file
metadata_df.to_csv(out_metadata_path, index=False)

In [26]:
if os.path.isdir(metadata_split_path):
	shutil.rmtree(metadata_split_path)

# Preprocess reviews file

In [1]:
import pandas as pd

In [2]:
reviews_df = pd.read_csv("data/Electronics.csv")

In [29]:
# reviews_df = reviews_df.sample(n=int(len(reviews_df) / 8), random_state=42)

In [None]:
len(reviews_df)

In [30]:
out_reviews_path = "./data/preprocessed_reviews.csv"
# Write the DataFrame to a CSV file
reviews_df.to_csv(out_reviews_path, index=False)