In [2]:
import json
import os
import random

import pandas as pd
from tqdm import tqdm

# Seed the global `random` generator; early_sampling() draws from it for both
# the per-line review sampling and the final down-sample.
random.seed(42)

# File paths
# The raw JSONL inputs are read from a single directory. Set the
# AMAZON_REVIEWS_DIR environment variable to point at a different location
# instead of editing this cell; the default keeps the original location.
raw_data_dir = os.environ.get("AMAZON_REVIEWS_DIR", "/Users/ujjwalbhatta/Downloads")
review_file = os.path.join(raw_data_dir, "Electronics.jsonl")
meta_file = os.path.join(raw_data_dir, "meta_Electronics.jsonl")
# Output is written relative to the current working directory.
output_file = "data/electronics_merged_30k.csv"

def _sample_reviews(path, sample_target):
    """Randomly keep about ``sample_target`` reviews that have a ``parent_asin``.

    The file is read twice: once to count lines, once to sample. Each line is
    kept with probability ``sample_target / total_lines`` and parsed with
    ``json.loads`` only if it is kept. Reading stops early once
    ``sample_target`` reviews have been collected.

    Args:
        path: Path to a file with one JSON object per line.
        sample_target: Upper bound on the number of reviews returned.

    Returns:
        A list of parsed review dicts, each with a truthy ``parent_asin``.
        Empty if the file has no lines.
    """
    # First pass: count lines so a per-line keep-probability can be derived.
    with open(path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    if total_lines == 0:
        # Nothing to sample; returning here also avoids dividing by zero below.
        return []

    sample_prob = min(1.0, sample_target / total_lines)

    sampled_reviews = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Sampling reviews", total=total_lines):
            # Exactly one random draw per line, so the selection for a given
            # seed does not depend on the line's content.
            if random.random() < sample_prob:
                stripped = line.strip()
                if not stripped:
                    # Blank line (e.g. trailing newline): not a JSON record.
                    continue
                review = json.loads(stripped)
                if review.get('parent_asin'):  # Only keep reviews with parent_asin
                    sampled_reviews.append(review)
                if len(sampled_reviews) >= sample_target:
                    break
    return sampled_reviews


def _load_metadata(path, wanted_asins):
    """Return ``{parent_asin: item}`` for metadata rows whose ASIN is wanted.

    Args:
        path: Path to a file with one JSON object per line.
        wanted_asins: Set of ``parent_asin`` values to keep.

    Returns:
        A dict keyed by ``parent_asin``. If an ASIN occurs on several lines,
        the last occurrence wins.
    """
    meta_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading metadata"):
            stripped = line.strip()
            if not stripped:
                # Blank line: not a JSON record.
                continue
            item = json.loads(stripped)
            parent_asin = item.get('parent_asin')
            if parent_asin and parent_asin in wanted_asins:
                meta_dict[parent_asin] = item
    return meta_dict


def _merge_records(sampled_reviews, meta_dict):
    """Join each review with its product metadata; drop reviews without a match.

    Args:
        sampled_reviews: List of review dicts.
        meta_dict: Mapping from ``parent_asin`` to a metadata dict.

    Returns:
        A list of flat dicts combining product fields and review fields.
        Missing fields are filled with ``None`` (via ``dict.get``).
    """
    merged_data = []
    for review in tqdm(sampled_reviews, desc="Merging data"):
        parent_asin = review.get('parent_asin')
        if parent_asin in meta_dict:
            meta = meta_dict[parent_asin]
            record = {
                'main_category': meta.get('main_category'),
                'product_title': meta.get('title'),
                'average_rating': meta.get('average_rating'),
                'rating_number': meta.get('rating_number'),
                'price': meta.get('price'),
                'description': meta.get('description'),
                'parent_asin': parent_asin,
                'details': meta.get('details'),
                'rating': review.get('rating'),
                'review_title': review.get('title'),
                'text': review.get('text'),
                'helpful_vote': review.get('helpful_vote')
            }
            merged_data.append(record)
    return merged_data


def early_sampling():
    """Sample reviews, attach product metadata, and save up to 30k rows as CSV.

    Reads the module-level ``review_file`` and ``meta_file`` and writes
    ``output_file``. Reviews are sampled before the metadata file is read, so
    only metadata for products that appear in the sample is kept in memory.

    Randomness comes from the global ``random`` module, so the result depends
    on its state (seed it before calling for a reproducible sample).

    An empty review file produces an empty output file rather than an error.

    Returns:
        None. Progress is printed and the result is written to ``output_file``.
    """
    # Step 1: Sample reviews
    print("Sampling 50k reviews first...")
    sample_target = 50000
    sampled_reviews = _sample_reviews(review_file, sample_target)
    print(f"Sampled {len(sampled_reviews)} reviews")

    # Step 2: Get unique parent_asins from sampled reviews
    unique_asins = {review['parent_asin'] for review in sampled_reviews}
    print(f"Need metadata for {len(unique_asins)} unique products")

    # Step 3: Load only needed metadata
    print("Loading relevant metadata...")
    meta_dict = _load_metadata(meta_file, unique_asins)
    print(f"Loaded metadata for {len(meta_dict)} products")

    # Step 4: Merge and create final sample
    merged_data = _merge_records(sampled_reviews, meta_dict)

    # Final sample of 30k (or everything, if fewer records were merged)
    final_sample_size = min(30000, len(merged_data))
    final_sample = random.sample(merged_data, final_sample_size)

    # Save to CSV; create the target directory first so a fresh checkout
    # without it does not fail on the write.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(final_sample)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} records to {output_file}")

# Run the sample -> metadata lookup -> merge -> CSV pipeline when this code is
# executed as the main module (skipped if it is imported from elsewhere).
if __name__ == "__main__":
    early_sampling() 


Sampling 50k reviews first...


Sampling reviews: 100%|██████████| 43886944/43886944 [00:14<00:00, 2931429.51it/s]


Sampled 49662 reviews
Need metadata for 33585 unique products
Loading relevant metadata...


Reading metadata: 1610012it [00:21, 75467.69it/s] 


Loaded metadata for 33585 products


Merging data: 100%|██████████| 49662/49662 [00:00<00:00, 358734.11it/s]


Saved 30000 records to electronics_merged_30k.csv
