In [1]:
import pandas as pd
import os
import shutil
import json

In [2]:
# Folder that every input path in this notebook is resolved against.
data_dir = "./data"

# Names of the raw input files inside data_dir.
reviews_fn, metadata_fn = "Electronics.csv", "meta_Electronics.jsonl"

# Sub-folders (relative to data_dir) that receive the split chunk files.
metadata_split, reviews_split = "meta/", "reviews/"

In [3]:
# Resolve the file through data_dir, like every other input path in this
# notebook, so that relocating the data folder needs a single edit.
train_df = pd.read_csv(os.path.join(data_dir, "preprocessed_metadata.csv"))

In [4]:
def _write_chunk(output_folder, file_number, lines):
    """Write ``lines`` to ``<output_folder>/<file_number>.jsonl``."""
    output_file = os.path.join(output_folder, f'{file_number}.jsonl')
    with open(output_file, 'w', encoding='utf-8') as fw:
        fw.writelines(lines)


def split_jsonl(input_file, output_folder, num_files):
    """Split a JSONL file into at most ``num_files`` numbered chunk files.

    The chunks are written to ``output_folder`` as ``1.jsonl``, ``2.jsonl``,
    ... in input order. Any existing ``output_folder`` directory is deleted
    first, so chunks from an earlier run never survive.

    Every input line ends up in exactly one chunk. The chunk size is the line
    count divided by ``num_files`` rounded *up* (and never below 1), so the
    final chunk may be shorter than the others and fewer than ``num_files``
    files are produced when the input has fewer lines than that.

    Args:
        input_file: Path of the UTF-8 encoded JSONL file to split.
        output_folder: Directory that receives the chunk files.
        num_files: Maximum number of chunk files to produce; must be >= 1.

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
    """
    # Validate before touching the file system: the next step is destructive.
    if num_files < 1:
        raise ValueError("num_files must be a positive integer")

    if os.path.isdir(output_folder):
        shutil.rmtree(output_folder)

    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    with open(input_file, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)
        # Ceiling division keeps the number of chunks <= num_files; the lower
        # bound of 1 covers inputs with fewer lines than num_files.
        lines_per_file = max(1, -(-total_lines // num_files))
        f.seek(0)  # Rewind after the counting pass.

        file_count = 0
        lines = []
        for line in f:
            lines.append(line)
            if len(lines) == lines_per_file:
                file_count += 1
                _write_chunk(output_folder, file_count, lines)
                lines = []

        # Flush the final, shorter chunk so no trailing lines are lost.
        if lines:
            file_count += 1
            _write_chunk(output_folder, file_count, lines)

In [5]:
def load_jsonl_to_df(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line of the UTF-8 encoded file is parsed with
    ``json.loads`` and becomes one row. Blank or whitespace-only lines (for
    example a trailing empty line) are skipped instead of aborting the load.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        A DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # json.loads("") raises, so blank lines must be skipped.
                continue
            records.append(json.loads(stripped))

    return pd.DataFrame(records)

In [6]:
# Build the paths of the raw metadata file and of the folder that will hold
# its split chunks; both live under data_dir. Nothing is read here.
metadata_path, metadata_split_path = (
    os.path.join(data_dir, leaf) for leaf in (metadata_fn, metadata_split)
)

In [7]:
# Number of chunk files requested from split_jsonl.
split_files_num = 512

# Re-create the chunk folder from the raw metadata file.
split_jsonl(
    input_file=metadata_path,
    output_folder=metadata_split_path,
    num_files=split_files_num,
)

In [8]:
jsonl_paths = []

# Full paths of the regular files directly inside the chunk folder
# (sub-directories are ignored), in the order os.listdir reports them.
all_files = []
for entry_name in os.listdir(metadata_split_path):
    candidate = os.path.join(metadata_split_path, entry_name)
    if os.path.isfile(candidate):
        all_files.append(candidate)

In [9]:
# Empty frame and a list of metadata column names.
# NOTE(review): neither name is referenced by the other cells shown in this
# notebook - confirm whether they are still needed.
metadata_df = pd.DataFrame()
columns_to_drop = [
    'main_category',
    'features',
    'videos',
    'details',
    'bought_together',
    'subtitle',
    'author',
]

In [10]:
# Number of distinct parent_asin values in train_df.
distinct_asins = train_df['parent_asin'].unique()
len(distinct_asins)

104490

In [11]:
# One entry per train_df row, in row order.
asin_list = [asin for asin in train_df['parent_asin']]

In [12]:
# Number of entries in asin_list (one per train_df row, duplicates included).
len(asin_list)

104490

In [13]:
# Accumulators for the three fields kept from every matching metadata record.
data = {'parent_asin': [], 'images': [], 'description': []}

for chunk_path in all_files:
    # Parse one chunk file into a DataFrame.
    chunk_df = load_jsonl_to_df(chunk_path)

    # Keep only the records whose parent_asin also occurs in train_df.
    wanted_rows = chunk_df.loc[chunk_df['parent_asin'].isin(asin_list)]

    # Append the kept values column by column; the three lists stay aligned
    # because each receives the same rows in the same order.
    for column_name, collected in data.items():
        collected.extend(wanted_rows[column_name].to_list())

In [14]:
# Create the two target columns, initially None in every row; the rows that
# have a matching metadata record are filled in afterwards.
for new_column in ('images', 'description'):
    train_df[new_column] = None

In [15]:
# Map every parent_asin to the label of its first row once, instead of
# re-scanning the whole parent_asin column for each collected record.
first_row_by_asin = {}
for row_label, asin in zip(train_df.index, train_df['parent_asin']):
    first_row_by_asin.setdefault(asin, row_label)

# Copy the collected images/description onto the matching train_df row.
# When several records share a parent_asin, the last one processed wins.
for asin, images, description in zip(
        data['parent_asin'], data['images'], data['description']):
    row_label = first_row_by_asin[asin]
    train_df.at[row_label, 'images'] = images
    train_df.at[row_label, 'description'] = description

In [16]:
# Remove every row that has a missing value in any column. This discards the
# rows whose images/description are still None, and also rows with missing
# values in any other column.
train_df = train_df.dropna(axis=0, how='any')

In [17]:
def _pick_large(image_entries):
    """Return the 'large' value of every entry in ``image_entries``."""
    # Assumes each entry supports entry['large'] - confirm against the
    # metadata schema.
    return [entry['large'] for entry in image_entries]


# Replace each row's list of image entries with the list of their 'large' values.
train_df['images'] = train_df['images'].apply(_pick_large)

In [18]:
# Persist the enriched frame. The row index is written as the first column
# (index=True is the pandas default, spelled out here).
train_df.to_csv("./database_meta.csv", index=True)

In [19]:
# Delete the temporary chunk folder, if it is still present.
split_dir_present = os.path.isdir(metadata_split_path)
if split_dir_present:
    shutil.rmtree(metadata_split_path)