# 1. Amazon Book Reviews EDA for Recommendation System

## 1. Load data and show samples

In [1]:
# Import necessary libraries
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os

# GCP project and Cloud Storage locations used by the pipelines in this notebook
PROJECT_ID = 'review-analysis-456008'
BUCKET_NAME = 'review-data-yu'
# Derive the input path from BUCKET_NAME so that switching buckets only
# requires editing a single constant (the resulting value is unchanged)
DATA_FILE = f'gs://{BUCKET_NAME}/raw-data/amazon_reviews_us_Books_v1_02.tsv'

# Beam pipeline options passed to every beam.Pipeline created in this notebook
pipeline_options = PipelineOptions(
    project=PROJECT_ID,
    temp_location=f'gs://{BUCKET_NAME}/temp',
    region='us-central1'
)

# Force the DirectRunner regardless of any runner configured elsewhere
pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'


# Convert one raw TSV line into a review record
def parse_tsv(line):
    """Turn a tab-separated review line into a dict keyed by column name.

    Args:
        line: One line of the reviews file, as a string.

    Returns:
        A dict mapping each of the 15 column names to its raw (string) value,
        or None when the line does not split into exactly 15 fields.
    """
    column_names = (
        'marketplace',
        'customer_id',
        'review_id',
        'product_id',
        'product_parent',
        'product_title',
        'product_category',
        'star_rating',
        'helpful_votes',
        'total_votes',
        'vine',
        'verified_purchase',
        'review_headline',
        'review_body',
        'review_date',
    )

    values = line.split('\t')
    # A well-formed row has exactly one value per column; reject anything else
    if len(values) == 15:
        return dict(zip(column_names, values))
    return None

# Execute initial EDA - data loading
def run_data_loading_eda(num_samples=2):
    """Load the reviews file, then print the record count and a few sample records.

    Builds and runs a Beam pipeline (with the module-level ``pipeline_options``)
    that reads ``DATA_FILE``, skips its header line, parses every line with
    ``parse_tsv`` and drops lines that fail to parse. Two branches consume the
    parsed records: one prints how many records were parsed, the other prints
    a fixed-size sample of them.

    Args:
        num_samples: Number of records to include in the printed sample.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        None. All results are written to stdout by the pipeline itself.
    """
    with beam.Pipeline(options=pipeline_options) as p:
        # Read and parse once; both branches below reuse this PCollection
        data = (
            p
            | 'ReadData' >> ReadFromText(DATA_FILE, skip_header_lines=1)
            | 'ParseTSV' >> beam.Map(parse_tsv)
            # parse_tsv returns None for lines without exactly 15 fields
            | 'FilterNone' >> beam.Filter(lambda x: x is not None)
        )

        # Branch 1: number of records that parsed successfully.
        # The resulting PCollection is never read again, hence the `_` binding.
        _ = (
            data
            | 'Count' >> beam.combiners.Count.Globally()
            | 'FormatCount' >> beam.Map(lambda count: f"===== LOADING DATASET =====\nDataset sample size: {count}")
            | 'PrintCount' >> beam.Map(print)
        )

        # Branch 2: a fixed-size sample, emitted by the combiner as a single list
        _ = (
            data
            | 'Sample' >> beam.combiners.Sample.FixedSizeGlobally(num_samples)
            | 'FormatSamples' >> beam.Map(lambda samples: f"\n===== SAMPLE DATA =====\n{samples}")
            | 'PrintSamples' >> beam.Map(print)
        )


# Run the data-loading EDA now; the pipeline prints the record count and the sample records
run_data_loading_eda()




===== SAMPLE DATA =====
[{'marketplace': 'US', 'customer_id': '31774188', 'review_id': 'R2S2R5AY1Q686F', 'product_id': '0060988649', 'product_parent': '696052457', 'product_title': 'Lost: A Novel', 'product_category': 'Books', 'star_rating': '1', 'helpful_votes': '9', 'total_votes': '12', 'vine': 'N', 'verified_purchase': 'N', 'review_headline': 'Lost my mind...literally!', 'review_body': "I had never read any Gregory Maguire books before this one, but heard nothing but praise for them. I picked up Lost based on the blurb on the back and the creepy cover art. I have a moderate interest in the Jack the Ripper period and settled into this book preparing to be entertained and engrossed by a work of fiction that wove this time period with fairy tales and came out with a unique product.     All I can say is that I was horribly and woefully misled...the most interesting and enthralling paragraph in the book is that blurb on the backside. And, what's more, that is about as close as you get t

## 2. Rating Distribution Analysis

In [2]:
# Import necessary libraries
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions
import os

# Rating distribution EDA
def run_rating_distribution_eda(sample_size=100000):
    """Print the star-rating distribution for a sample of the reviews.

    Builds and runs a Beam pipeline (with the module-level ``pipeline_options``)
    that reads ``DATA_FILE``, samples raw lines, parses the sampled lines with
    ``parse_tsv``, counts how often each ``star_rating`` value occurs and
    prints a table of counts and percentages.

    Note that the sample is drawn from the raw lines *before* parsing, so
    lines that fail to parse reduce the number of records analysed. The
    sampling transform picks lines at random, so the printed numbers may
    differ from run to run.

    Args:
        sample_size: Maximum number of raw lines to sample from the file.
            Defaults to 100,000, the value that was previously hard-coded.

    Returns:
        None. The table is written to stdout by the pipeline itself.
    """
    with beam.Pipeline(options=pipeline_options) as p:
        # Read raw lines, sample them, then parse only the sampled lines.
        # Sample.FixedSizeGlobally emits ONE element (a list of lines), so a
        # FlatMap is needed to get back to one record per element.
        data = (
            p
            | 'ReadData' >> ReadFromText(DATA_FILE, skip_header_lines=1)
            | 'Limit' >> beam.combiners.Sample.FixedSizeGlobally(sample_size)
            | 'ParseTSV' >> beam.FlatMap(lambda lines: [parse_tsv(line) for line in lines if line])
            # parse_tsv returns None for lines without exactly 15 fields
            | 'FilterNone' >> beam.Filter(lambda x: x is not None)
        )

        # Extract star ratings and count each distinct rating value
        rating_counts = (
            data
            | 'ExtractRating' >> beam.Map(lambda x: x['star_rating'])
            | 'CountByValue' >> beam.combiners.Count.PerElement()
        )

        def format_output(element):
            """Print the rating table for a list of (rating, count) tuples.

            Returns the input list unchanged so the step can be chained.
            """
            # Ratings are strings here, so this is a lexicographic sort
            sorted_ratings = sorted(element, key=lambda x: x[0])
            total_count = sum(count for _, count in sorted_ratings)

            print("\n===== RATING DISTRIBUTION =====")
            print("star_rating    Count    Percentage")
            print("--------------------------------")

            # With no ratings the loop body never runs, so total_count == 0
            # cannot cause a division by zero
            for rating, count in sorted_ratings:
                percentage = (count / total_count) * 100
                print(f"{rating:^11}  {count:^8}  {percentage:^10.3f}%")

            return element

        # Gather all (rating, count) pairs into one list and print the table.
        # The resulting PCollection is never read again, hence the `_` binding.
        _ = (
            rating_counts
            | 'GatherResults' >> beam.combiners.ToList()
            | 'FormatAndPrint' >> beam.Map(format_output)
        )
        

# Run the rating-distribution EDA; the table is printed from inside the pipeline
run_rating_distribution_eda()




===== RATING DISTRIBUTION =====
star_rating    Count    Percentage
--------------------------------
     1         7591      7.591   %
     2         5257      5.257   %
     3         8050      8.050   %
     4        19047      19.047  %
     5        60055      60.055  %


## 3. User Engagement Analysis

In [3]:
# User engagement analysis EDA
def run_user_engagement_eda(sample_size=100000):
    """Print review-count statistics per customer for a sample of the reviews.

    Builds and runs a Beam pipeline (with the module-level ``pipeline_options``)
    that reads ``DATA_FILE``, samples raw lines, parses the sampled lines with
    ``parse_tsv``, counts reviews per ``customer_id`` and prints summary
    metrics: unique customers, average and maximum reviews per customer, and
    the share of customers with more than one review.

    Note that the sample is drawn from the raw lines *before* parsing, so
    lines that fail to parse reduce the number of records analysed. The
    sampling transform picks lines at random, so the printed numbers may
    differ from run to run.

    Args:
        sample_size: Maximum number of raw lines to sample from the file.
            Defaults to 100,000, the value that was previously hard-coded.

    Returns:
        None. The metrics are written to stdout by the pipeline itself.
    """
    with beam.Pipeline(options=pipeline_options) as p:
        # Read raw lines, sample them, then parse only the sampled lines.
        # Sample.FixedSizeGlobally emits ONE element (a list of lines), so a
        # FlatMap is needed to get back to one record per element.
        data = (
            p
            | 'ReadData' >> ReadFromText(DATA_FILE, skip_header_lines=1)
            | 'Limit' >> beam.combiners.Sample.FixedSizeGlobally(sample_size)
            | 'ParseTSV' >> beam.FlatMap(lambda lines: [parse_tsv(line) for line in lines if line])
            # parse_tsv returns None for lines without exactly 15 fields
            | 'FilterNone' >> beam.Filter(lambda x: x is not None)
        )

        # Extract customer IDs and count reviews per customer
        customer_reviews = (
            data
            | 'ExtractCustomerId' >> beam.Map(lambda x: x['customer_id'])
            | 'CountPerCustomer' >> beam.combiners.Count.PerElement()
        )

        def analyze_user_engagement(customer_counts):
            """Print engagement metrics for a list of (customer_id, review_count) tuples.

            Returns the input list unchanged so the step can be chained.
            """
            total_unique_customers = len(customer_counts)
            total_reviews = sum(count for _, count in customer_counts)
            # Guard the divisions so an empty sample prints zeros instead of raising
            avg_reviews_per_customer = total_reviews / total_unique_customers if total_unique_customers > 0 else 0
            max_reviews = max((count for _, count in customer_counts), default=0)

            # Count customers with more than one review
            customers_with_multiple = sum(1 for _, count in customer_counts if count > 1)
            multiple_reviews_percentage = (customers_with_multiple / total_unique_customers * 100) if total_unique_customers > 0 else 0

            # Print results
            print("\n===== USER ENGAGEMENT ANALYSIS =====")
            print(f"Total Unique Customers: {total_unique_customers}")
            print(f"Average Reviews per Customer: {avg_reviews_per_customer:.2f}")
            print(f"Max Reviews by a Single Customer: {max_reviews}")
            print(f"Customers with Multiple Reviews: {customers_with_multiple} ({multiple_reviews_percentage:.2f}%)")

            return customer_counts

        # Gather all per-customer counts into one list and print the metrics.
        # The resulting PCollection is never read again, hence the `_` binding.
        _ = (
            customer_reviews
            | 'GatherCustomerData' >> beam.combiners.ToList()
            | 'AnalyzeUserEngagement' >> beam.Map(analyze_user_engagement)
        )
        

# Run the user-engagement EDA; the metrics are printed from inside the pipeline
run_user_engagement_eda()




===== USER ENGAGEMENT ANALYSIS =====
Total Unique Customers: 82425
Average Reviews per Customer: 1.21
Max Reviews by a Single Customer: 660
Customers with Multiple Reviews: 7192 (8.73%)


## 4. Product Analysis

In [4]:
# Product analysis EDA
def run_product_analysis_eda(sample_size=100000):
    """Print review-count statistics per product for a sample of the reviews.

    Builds and runs a Beam pipeline (with the module-level ``pipeline_options``)
    that reads ``DATA_FILE``, samples raw lines, parses the sampled lines with
    ``parse_tsv``, counts reviews per ``product_id`` and prints summary
    metrics: unique products, average and maximum reviews per product, and
    the share of products with more than one review.

    Note that the sample is drawn from the raw lines *before* parsing, so
    lines that fail to parse reduce the number of records analysed. The
    sampling transform picks lines at random, so the printed numbers may
    differ from run to run.

    Args:
        sample_size: Maximum number of raw lines to sample from the file.
            Defaults to 100,000, the value that was previously hard-coded.

    Returns:
        None. The metrics are written to stdout by the pipeline itself.
    """
    with beam.Pipeline(options=pipeline_options) as p:
        # Read raw lines, sample them, then parse only the sampled lines.
        # Sample.FixedSizeGlobally emits ONE element (a list of lines), so a
        # FlatMap is needed to get back to one record per element.
        data = (
            p
            | 'ReadData' >> ReadFromText(DATA_FILE, skip_header_lines=1)
            | 'Limit' >> beam.combiners.Sample.FixedSizeGlobally(sample_size)
            | 'ParseTSV' >> beam.FlatMap(lambda lines: [parse_tsv(line) for line in lines if line])
            # parse_tsv returns None for lines without exactly 15 fields
            | 'FilterNone' >> beam.Filter(lambda x: x is not None)
        )

        # Extract product IDs and count reviews per product
        product_reviews = (
            data
            | 'ExtractProductId' >> beam.Map(lambda x: x['product_id'])
            | 'CountPerProduct' >> beam.combiners.Count.PerElement()
        )

        def analyze_product_data(product_counts):
            """Print product metrics for a list of (product_id, review_count) tuples.

            Returns the input list unchanged so the step can be chained.
            """
            total_unique_products = len(product_counts)
            total_reviews = sum(count for _, count in product_counts)
            # Guard the divisions so an empty sample prints zeros instead of raising
            avg_reviews_per_product = total_reviews / total_unique_products if total_unique_products > 0 else 0
            max_reviews = max((count for _, count in product_counts), default=0)

            # Count products with more than one review
            products_with_multiple = sum(1 for _, count in product_counts if count > 1)
            multiple_reviews_percentage = (products_with_multiple / total_unique_products * 100) if total_unique_products > 0 else 0

            # Print results
            print("\n===== PRODUCT ANALYSIS =====")
            print(f"Total Unique Products: {total_unique_products}")
            print(f"Average Reviews per Product: {avg_reviews_per_product:.2f}")
            print(f"Max Reviews for a Single Product: {max_reviews}")
            print(f"Products with Multiple Reviews: {products_with_multiple} ({multiple_reviews_percentage:.2f}%)")

            return product_counts

        # Gather all per-product counts into one list and print the metrics.
        # The resulting PCollection is never read again, hence the `_` binding.
        _ = (
            product_reviews
            | 'GatherProductData' >> beam.combiners.ToList()
            | 'AnalyzeProductData' >> beam.Map(analyze_product_data)
        )
        

# Run the product-analysis EDA; the metrics are printed from inside the pipeline
run_product_analysis_eda()




===== PRODUCT ANALYSIS =====
Total Unique Products: 74348
Average Reviews per Product: 1.35
Max Reviews for a Single Product: 157
Products with Multiple Reviews: 12008 (16.15%)
