In [11]:
import json
import pandas as pd
import random
from tqdm import tqdm
from datetime import datetime

# Input JSONL files (reviews and product metadata) and the CSV written at the end.
output_file = "electronics_merged_30k.csv"
meta_file = "/Users/ujjwalbhatta/Downloads/meta_Electronics.jsonl"
review_file = "/Users/ujjwalbhatta/Downloads/Electronics.jsonl"

# Seed the global RNG so every run draws the same random sample.
random.seed(42)

def _sample_reviews(path, sample_target):
    """Randomly keep roughly ``sample_target`` reviews from the JSONL file at ``path``.

    The file is read twice: once to count lines, once to keep each line with
    probability ``sample_target / total_lines``. Reviews without a truthy
    ``parent_asin`` are dropped, so the result may be smaller than the target.
    Returns a list of parsed review dicts (empty for an empty file).
    """
    with open(path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    if total_lines == 0:
        # Nothing to sample; also avoids dividing by zero below.
        return []

    sample_prob = min(1.0, sample_target / total_lines)

    sampled_reviews = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Sampling reviews", total=total_lines):
            # Draw for every line, blank or not, so the seeded RNG stream
            # (and therefore which lines get selected) does not shift.
            if random.random() >= sample_prob:
                continue
            line = line.strip()
            if not line:
                # json.loads("") would raise; tolerate stray blank lines.
                continue
            review = json.loads(line)
            if review.get('parent_asin'):
                sampled_reviews.append(review)
            if len(sampled_reviews) >= sample_target:
                break
    return sampled_reviews


def _load_metadata(path, wanted_asins):
    """Return ``{parent_asin: metadata dict}`` for records whose ASIN is in ``wanted_asins``.

    If the same ``parent_asin`` occurs more than once, the last record wins.
    """
    meta_dict = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="Reading metadata"):
            line = line.strip()
            if not line:
                continue
            item = json.loads(line)
            parent_asin = item.get('parent_asin')
            if parent_asin and parent_asin in wanted_asins:
                meta_dict[parent_asin] = item
    return meta_dict


def _review_date(review):
    """Return the review's calendar date in UTC, or None if missing or unconvertible."""
    # Local import: the file's import block only brings in `datetime`.
    from datetime import timezone

    # NOTE(review): the timestamp key appears to be spelled either
    # `sort_timestamp` or `timestamp` depending on the dataset release --
    # confirm against the actual files. The value is treated as milliseconds.
    timestamp_ms = review.get('sort_timestamp') or review.get('timestamp')
    if not timestamp_ms:
        return None
    try:
        # UTC keeps the derived date identical regardless of the machine's
        # local timezone, in line with the fixed random seed.
        return datetime.fromtimestamp(timestamp_ms / 1000, tz=timezone.utc).date()
    except (TypeError, ValueError, OverflowError, OSError):
        # Non-numeric or out-of-range timestamp: best effort, leave the date empty.
        return None


def _build_record(review, meta):
    """Flatten one review and its product metadata into a single output row (dict)."""
    # `details` may be absent, null, or not a mapping; only a dict is usable
    # for the brand lookup.
    details = meta.get('details') or {}
    brand_source = details if isinstance(details, dict) else {}
    brand = (
        brand_source.get("Brand")
        or brand_source.get("Manufacturer")
        or meta.get("store")
        or "UnknownBrand"
    )

    categories = meta.get('categories')

    # NOTE(review): the helpful-vote key appears to be spelled either
    # `helpful_votes` or `helpful_vote` depending on the dataset release --
    # confirm against the actual files. Try both so the column is not
    # silently left empty.
    helpful = review.get('helpful_votes')
    if helpful is None:
        helpful = review.get('helpful_vote')

    return {
        'parent_asin': review.get('parent_asin'),
        'main_category': meta.get('main_category'),
        'sub_category': categories[0] if categories else None,
        'product_title': meta.get('title'),
        'description': meta.get('description'),
        'price': meta.get('price'),
        'average_rating': meta.get('average_rating'),
        'rating_number': meta.get('rating_number'),

        # --- Brand / Store Info ---
        'brand': brand,
        'store': meta.get('store'),
        'details': details,

        # --- Review Info ---
        'rating': review.get('rating'),
        'review_title': review.get('title'),
        'text': review.get('text'),
        'helpful_vote': helpful,
        'verified_purchase': review.get('verified_purchase'),
        'review_date': _review_date(review),
    }


def simple_merge_with_meta():
    """Sample reviews, join them with product metadata, and write a CSV.

    Steps:
      1. Randomly sample about 50,000 reviews from ``review_file``.
      2. Load from ``meta_file`` only the metadata of the sampled products.
      3. Build one flat row per review that has matching metadata.
      4. Randomly pick at most 30,000 rows and write them to ``output_file``.

    Reads the module-level ``review_file``, ``meta_file`` and ``output_file``
    paths and uses the global ``random`` state. Returns None.

    Raises:
        json.JSONDecodeError: if a non-blank input line is not valid JSON.
        OSError: if an input file cannot be read or the output cannot be written.
    """
    print("Sampling 50k reviews first...")
    sampled_reviews = _sample_reviews(review_file, 50000)
    print(f"Sampled {len(sampled_reviews)} reviews")

    unique_asins = {review['parent_asin'] for review in sampled_reviews}
    print(f"Loading metadata for {len(unique_asins)} unique products...")

    meta_dict = _load_metadata(meta_file, unique_asins)
    print(f"Loaded metadata for {len(meta_dict)} products")

    merged_data = []
    for review in tqdm(sampled_reviews, desc="Merging data"):
        meta = meta_dict.get(review.get('parent_asin'))
        if meta is not None:
            merged_data.append(_build_record(review, meta))

    # Down-sample the merged rows to the final size (or keep all if fewer).
    final_sample_size = min(30000, len(merged_data))
    final_sample = random.sample(merged_data, final_sample_size)

    df = pd.DataFrame(final_sample)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(df)} records to {output_file}")

# Run the pipeline only when executed as a script (or notebook cell), not on import.
if __name__ == '__main__':
    simple_merge_with_meta()


Sampling 50k reviews first...


Sampling reviews: 100%|██████████| 43886944/43886944 [00:13<00:00, 3171601.51it/s]


Sampled 49662 reviews
Loading metadata for 33585 unique products...


Reading metadata: 1610012it [00:18, 85608.51it/s] 


Loaded metadata for 33585 products


Merging data: 100%|██████████| 49662/49662 [00:00<00:00, 317011.09it/s]


Saved 30000 records to electronics_merged_30k.csv
