In [2]:
import json
import pickle
from collections import defaultdict

# Input paths (relative to the directory the notebook is run from)
input_file = './data/All_beauty_more_than_3_with_product.jsonl'  # JSONL file of reviews, one JSON object per line
pkl_file_path = 'data/dataset.pkl'  # Pickled dataset; other comments in this notebook call it "retrieved.pkl"

# Output paths: one JSON file for train+val reviews, one for test reviews
output_file_train_val = 'preprocessed_reviews_train_val.json'
output_file_test = 'preprocessed_reviews_test.json'

# Step 1: Load retrieved.pkl data
def load_retrieved_data(pkl_file_path):
    """Unpickle the file at ``pkl_file_path`` and return the resulting object.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so this should only be pointed at files from a trusted source.
    """
    with open(pkl_file_path, mode='rb') as pkl_handle:
        return pickle.load(pkl_handle)

retrieved_data = load_retrieved_data(pkl_file_path)

# Pull the three split mappings and the two ID maps out of the loaded dict
train, val, test, umap, smap = (
    retrieved_data[key] for key in ('train', 'val', 'test', 'umap', 'smap')
)

# Step 2: Create sets of ASINs for train+val and test sequences
def get_sequence_item_ids(sequences):
    """Return the set of every item ID that occurs in any sequence.

    Args:
        sequences: mapping whose values are iterables of item IDs; the keys
            are ignored.

    Returns:
        A set with the union of all item IDs across the mapping's values.
    """
    return {item_id for sequence in sequences.values() for item_id in sequence}

# Item IDs that occur in the train or the val sequences
sequence_item_ids_train_val = get_sequence_item_ids(train) | get_sequence_item_ids(val)

# Item IDs that occur in the test sequences
sequence_item_ids_test = get_sequence_item_ids(test)

# Invert smap (treated here as ASIN -> item ID) so item IDs look up their ASIN
id_to_asin = {item_id: asin for asin, item_id in smap.items()}

def map_item_ids_to_asins(item_ids, id_to_asin):
    """Translate item IDs into the values they map to in ``id_to_asin``.

    Args:
        item_ids: iterable of item IDs to translate.
        id_to_asin: mapping from item ID to ASIN.

    Returns:
        A set of the mapped values. IDs absent from ``id_to_asin`` are
        reported on stdout and left out of the result.
    """
    asins = set()
    for item_id in item_ids:
        asin = id_to_asin.get(item_id)
        # Compare against None rather than relying on truthiness, so a mapped
        # value that happens to be falsy (0, '') is not reported as missing.
        if asin is not None:
            asins.add(asin)
        else:
            print(f"Item ID {item_id} not found in smap.")
    return asins

# Translate both item-ID sets to ASIN sets (train+val first, then test)
sequence_asins_train_val, sequence_asins_test = (
    map_item_ids_to_asins(split_item_ids, id_to_asin)
    for split_item_ids in (sequence_item_ids_train_val, sequence_item_ids_test)
)

# Step 3: Create sets of user IDs for train+val and test
def get_sequence_user_ids(sequences, reverse_umap):
    """Collect the user IDs behind the user indices that key ``sequences``.

    Args:
        sequences: mapping keyed by user index; the values are not inspected.
        reverse_umap: mapping from user index to user ID.

    Returns:
        A set of user IDs. Indices absent from ``reverse_umap`` are reported
        on stdout and left out of the result.
    """
    user_ids = set()
    for user_index in sequences.keys():
        user_id = reverse_umap.get(user_index)
        # Compare against None rather than relying on truthiness, so a valid
        # but falsy user ID (0, '') is not reported as missing.
        if user_id is not None:
            user_ids.add(user_id)
        else:
            print(f"User index {user_index} not found in reverse umap.")
    return user_ids

# Invert umap so a sequence's user index looks up the original user ID
reverse_umap = {user_index: user_id for user_id, user_index in umap.items()}

# User IDs that key the train or the val sequences
user_ids_train_val = (
    get_sequence_user_ids(train, reverse_umap)
    | get_sequence_user_ids(val, reverse_umap)
)

# User IDs that key the test sequences
user_ids_test = get_sequence_user_ids(test, reverse_umap)

# Step 4: Process the JSONL file with filtering
def _asins_by_user(sequences, reverse_umap, id_to_asin, into=None):
    """Build (or extend) a ``user ID -> set of ASINs`` mapping from one split.

    User indices and item IDs that have no entry in the lookup maps are
    skipped silently here.
    """
    per_user = defaultdict(set) if into is None else into
    for user_index, item_ids in sequences.items():
        user_id = reverse_umap.get(user_index)
        if user_id is None:
            continue
        for item_id in item_ids:
            asin = id_to_asin.get(item_id)
            if asin is not None:
                per_user[user_id].add(asin)
    return per_user


def _extract_review(json_data):
    """Keep only the review fields that are written to the output files."""
    return {
        'product_name': json_data.get('product_name', 'Unknown'),
        'parent_asin': json_data.get('parent_asin'),
        'rating': json_data.get('rating'),
        'title': json_data.get('title'),
        'text': json_data.get('text'),
        'timestamp': json_data.get('timestamp'),
    }


# A review belongs to a split only if THAT user's sequence in the split
# contains the reviewed ASIN. Checking against split-wide user/ASIN sets is not
# enough: when the same users appear in several splits, nearly every review
# matches train+val first and the test output ends up almost empty.
asins_by_user_train_val = _asins_by_user(train, reverse_umap, id_to_asin)
_asins_by_user(val, reverse_umap, id_to_asin, into=asins_by_user_train_val)
asins_by_user_test = _asins_by_user(test, reverse_umap, id_to_asin)

user_reviews_train_val = defaultdict(list)
user_reviews_test = defaultdict(list)

# Open and read the JSONL file
with open(input_file, 'r', encoding='utf-8') as jsonlfile:
    for line in jsonlfile:
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        if not line.strip():
            continue

        # Parse each line as a JSON object
        json_data = json.loads(line)

        user_id = json_data['user_id']
        parent_asin = json_data.get('parent_asin')

        # .get(..., ()) keeps the defaultdicts from growing on unknown users
        in_train_val = parent_asin in asins_by_user_train_val.get(user_id, ())
        in_test = parent_asin in asins_by_user_test.get(user_id, ())
        if not (in_train_val or in_test):
            # The review matches neither split for this user: ignore it
            continue

        review = _extract_review(json_data)
        # The two checks are independent: an ASIN present in both of a user's
        # splits is written to both outputs.
        if in_train_val:
            user_reviews_train_val[user_id].append(review)
        if in_test:
            user_reviews_test[user_id].append(dict(review))

# Step 5: Put every user's reviews in ascending timestamp order
for grouped_reviews in (user_reviews_train_val, user_reviews_test):
    for review_list in grouped_reviews.values():
        review_list.sort(key=lambda review: review['timestamp'])

# Step 6: Turn each user -> reviews mapping into a list of records
output_data_train_val = [
    {'user_id': uid, 'reviews': user_reviews}
    for uid, user_reviews in user_reviews_train_val.items()
]
output_data_test = [
    {'user_id': uid, 'reviews': user_reviews}
    for uid, user_reviews in user_reviews_test.items()
]

# Dump each record list to its own JSON file
for destination, payload in (
    (output_file_train_val, output_data_train_val),
    (output_file_test, output_data_test),
):
    with open(destination, 'w', encoding='utf-8') as outfile:
        json.dump(payload, outfile, indent=4)

print(f"Data has been successfully written to '{output_file_train_val}' and '{output_file_test}'")


Data has been successfully written to 'preprocessed_reviews_train_val.json' and 'preprocessed_reviews_test.json'


In [None]:
import pickle
import json

# Load retrieved.pkl data
def load_retrieved_data(pkl_file_path):
    """Unpickle the file at ``pkl_file_path`` and return the resulting object.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so this should only be pointed at files from a trusted source.
    """
    with open(pkl_file_path, mode='rb') as pkl_handle:
        loaded = pickle.load(pkl_handle)
    return loaded

pkl_file_path = 'data/dataset.pkl'  # Update with your actual path
retrieved_data = load_retrieved_data(pkl_file_path)

# Split sequences, ID maps and metadata stored in the pickle
train = retrieved_data['train']
val = retrieved_data['val']
test = retrieved_data['test']
umap = retrieved_data['umap']
smap = retrieved_data['smap']
meta = retrieved_data['meta']

# Load processed JSON files
processed_train_val_file = 'preprocessed_reviews_train_val.json'
processed_test_file = 'preprocessed_reviews_test.json'

with open(processed_train_val_file, 'r', encoding='utf-8') as f:
    processed_train_val_data = json.load(f)

with open(processed_test_file, 'r', encoding='utf-8') as f:
    processed_test_data = json.load(f)

# Prepare mappings
# umap and smap are treated as "original ID -> index" maps; invert them for
# index -> original ID lookups.
index_to_user_id = {v: k for k, v in umap.items()}
item_id_to_asin = {v: k for k, v in smap.items()}
# Forward direction (ASIN -> item ID) is a plain copy of smap. The earlier
# comprehension swapped its loop variables and produced item ID -> ASIN,
# i.e. a second copy of item_id_to_asin under the wrong name.
asin_to_item_id = dict(smap)

# Define functions
def get_user_sequences(sequences, index_to_user_id):
    """Re-key ``sequences`` from user index to user ID.

    Args:
        sequences: mapping from user index to that user's item IDs.
        index_to_user_id: mapping from user index to user ID.

    Returns:
        A dict from user ID to the same item-ID object found in ``sequences``.
        Indices absent from ``index_to_user_id`` are reported on stdout and
        left out.
    """
    user_sequences = {}
    for user_index, item_ids in sequences.items():
        user_id = index_to_user_id.get(user_index)
        # Compare against None rather than relying on truthiness, so a valid
        # but falsy user ID (0, '') does not lose its sequence.
        if user_id is not None:
            user_sequences[user_id] = item_ids
        else:
            print(f"User index {user_index} not found in umap.")
    return user_sequences

def get_reviews_by_user(processed_data):
    """Index processed records by user.

    Args:
        processed_data: iterable of dicts, each with a ``'user_id'`` and a
            ``'reviews'`` key.

    Returns:
        A dict mapping each ``'user_id'`` to its ``'reviews'`` value; if a
        user ID occurs more than once, the last record wins.
    """
    return {entry['user_id']: entry['reviews'] for entry in processed_data}

def compare_user_data(user_sequences, reviews_by_user, item_id_to_asin):
    """Compare, per user, the ASINs in their sequence with those in their reviews.

    Args:
        user_sequences: mapping from user ID to an iterable of item IDs.
        reviews_by_user: mapping from user ID to a list of review dicts, each
            carrying a ``'parent_asin'`` key. Users missing here are treated
            as having no reviews.
        item_id_to_asin: mapping from item ID to ASIN.

    Returns:
        A dict keyed by the user IDs whose two ASIN sets differ. Each value
        has ``'missing_in_reviews'`` (in the sequence but not reviewed) and
        ``'extra_in_reviews'`` (reviewed but not in the sequence), both sets.
        Only users present in ``user_sequences`` are examined.
    """
    discrepancies = {}
    for user_id, item_ids in user_sequences.items():
        # Get ASINs from item IDs
        asins_from_sequences = set()
        for item_id in item_ids:
            asin = item_id_to_asin.get(item_id)
            # Compare against None rather than relying on truthiness: dropping
            # a mapped-but-falsy value here would surface as a spurious
            # 'extra_in_reviews' entry below.
            if asin is not None:
                asins_from_sequences.add(asin)
            else:
                print(f"Item ID {item_id} not found in smap.")

        # Get ASINs from user's reviews
        user_reviews = reviews_by_user.get(user_id, [])
        asins_from_reviews = {review['parent_asin'] for review in user_reviews}

        # Compare the ASIN sets
        missing_in_reviews = asins_from_sequences - asins_from_reviews
        extra_in_reviews = asins_from_reviews - asins_from_sequences

        if missing_in_reviews or extra_in_reviews:
            discrepancies[user_id] = {
                'missing_in_reviews': missing_in_reviews,
                'extra_in_reviews': extra_in_reviews
            }

    return discrepancies

def report_discrepancies(discrepancies, dataset_name):
    """Print a per-user summary of the given discrepancies.

    Args:
        discrepancies: mapping from user ID to a dict with
            ``'missing_in_reviews'`` and ``'extra_in_reviews'`` entries.
        dataset_name: label used in the printed header lines.

    Prints a single "no discrepancies" line when the mapping is empty;
    otherwise prints a header, then one block per user listing whichever of
    the two entries is non-empty.
    """
    if not discrepancies:
        print(f"\nNo discrepancies found in {dataset_name}. All items match.")
        return

    print(f"\nDiscrepancies found in {dataset_name}:")
    for user_id, details in discrepancies.items():
        print(f"User ID: {user_id}")
        missing = details['missing_in_reviews']
        if missing:
            print(f"  Items in sequences but missing in reviews (ASINs): {missing}")
        extra = details['extra_in_reviews']
        if extra:
            print(f"  Items in reviews but not in sequences (ASINs): {extra}")

# Retrieve user sequences
# Merge train and val per user by concatenating their item IDs. A plain
# {**train, **val} would let val's entry REPLACE train's whenever the same
# user index occurs in both splits, so that user's train items would vanish
# from the comparison and show up as "extra" reviews. New lists are built so
# the sequences held in `train` and `val` are left untouched.
train_val_sequences = {}
for split_sequences in (train, val):
    for user_index, item_ids in split_sequences.items():
        train_val_sequences.setdefault(user_index, []).extend(item_ids)

user_sequences_train_val = get_user_sequences(train_val_sequences, index_to_user_id)
user_sequences_test = get_user_sequences(test, index_to_user_id)

# Retrieve reviews by user
reviews_by_user_train_val = get_reviews_by_user(processed_train_val_data)
reviews_by_user_test = get_reviews_by_user(processed_test_data)

# Compare data
discrepancies_train_val = compare_user_data(user_sequences_train_val, reviews_by_user_train_val, item_id_to_asin)
discrepancies_test = compare_user_data(user_sequences_test, reviews_by_user_test, item_id_to_asin)

# Report discrepancies
report_discrepancies(discrepancies_train_val, 'Train + Validation Data')
report_discrepancies(discrepancies_test, 'Test Data')

# Verify counts
def get_counts(user_sequences):
    """Summarise a ``user -> item sequence`` mapping.

    Args:
        user_sequences: mapping whose values are sized iterables of items.

    Returns:
        A ``(num_users, num_interactions, num_items)`` tuple: the number of
        keys, the summed length of all sequences, and the number of distinct
        items across them.
    """
    all_sequences = list(user_sequences.values())
    total_interactions = sum(map(len, all_sequences))
    distinct_items = {item for sequence in all_sequences for item in sequence}
    return len(user_sequences), total_interactions, len(distinct_items)

# Sequence-side counts for each split, unpacked into named totals
counts_train_val = get_counts(user_sequences_train_val)
counts_test = get_counts(user_sequences_test)
num_users_train_val, num_interactions_train_val, num_items_train_val = counts_train_val
num_users_test, num_interactions_test, num_items_test = counts_test

def get_processed_counts(reviews_by_user):
    """Summarise a ``user -> list of reviews`` mapping.

    Args:
        reviews_by_user: mapping whose values are lists of review dicts, each
            with a ``'parent_asin'`` key.

    Returns:
        A ``(num_users, num_interactions, num_items)`` tuple: the number of
        keys, the total number of reviews, and the number of distinct
        ``'parent_asin'`` values.
    """
    review_lists = list(reviews_by_user.values())
    total_reviews = sum(len(review_list) for review_list in review_lists)
    distinct_asins = {
        review['parent_asin']
        for review_list in review_lists
        for review in review_list
    }
    return len(reviews_by_user), total_reviews, len(distinct_asins)

# Review-side counts for each processed JSON file
num_users_processed_train_val, num_interactions_processed_train_val, num_items_processed_train_val = get_processed_counts(reviews_by_user_train_val)
num_users_processed_test, num_interactions_processed_test, num_items_processed_test = get_processed_counts(reviews_by_user_test)


def _print_counts(source_label, num_users, num_interactions, num_items):
    """Print one three-line count summary under a "Counts in <source>" header."""
    print(f"\nCounts in {source_label}:")
    print(f"Number of users: {num_users}")
    print(f"Number of interactions: {num_interactions}")
    print(f"Number of unique items: {num_items}")


# Print counts
# Headers are built from the path variables so they name the files that were
# actually read; the hard-coded labels used before ("retrieved.pkl",
# "processed_reviews_*.json") did not match those paths.
_print_counts(f"{pkl_file_path} (Train + Val)",
              num_users_train_val, num_interactions_train_val, num_items_train_val)
_print_counts(processed_train_val_file,
              num_users_processed_train_val, num_interactions_processed_train_val, num_items_processed_train_val)
_print_counts(f"{pkl_file_path} (Test)",
              num_users_test, num_interactions_test, num_items_test)
_print_counts(processed_test_file,
              num_users_processed_test, num_interactions_processed_test, num_items_processed_test)
