In [1]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import ast

In [None]:
#need to first retrieve the metadata, to correct label all the products. 
#so, the metadata was in the form of json file, for better processing, converted it to csv

def flatten_json(json_data):
    """
    Flatten JSON data for CSV conversion
    """
    def process_value(value):
        if isinstance(value, list):
            return '; '.join(str(v) for v in value if v)
        return value

    flattened = {}
    for key, value in json_data.items():
        flattened[key] = process_value(value)
    
    return flattened

import json
import pandas as pd
import csv

def convert_jsonl_to_csv(json_file_path, csv_file_path):
    """
    Convert JSON Lines file to CSV with proper escaping
    """
    processed_data = []
    
    with open(json_file_path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                json_obj = json.loads(line.strip())
                
                flattened_obj = {}
                for key, value in json_obj.items():
                    if isinstance(value, list):
                        flattened_obj[key] = '|'.join(str(v) for v in value if v)
                    else:
                        flattened_obj[key] = value
                
                processed_data.append(flattened_obj)
            except json.JSONDecodeError:
                continue
    
    df = pd.DataFrame(processed_data)
    
    df.to_csv(csv_file_path, 
              index=False, 
              encoding='utf-8',
              escapechar='\\',        
              doublequote=True,       
              quoting=csv.QUOTE_ALL,  
              sep=',')               
    
    print(f"Successfully converted {len(processed_data)} records to CSV")

    

In [None]:
import os
os.makedirs("../data/processed", exist_ok=True)

try:
    df = convert_jsonl_to_csv("../data/meta_Electronics.json", 
                             "../data/processed/final_processed_metadata.csv")
    
    print("\nFirst few rows of the converted data:")
    display(df.head())
    
except Exception as e:
    print(f"An error occurred: {str(e)}")

Successfully converted 786445 records to CSV

First few rows of the converted data:
An error occurred: 'NoneType' object has no attribute 'head'


In [None]:
import pandas as pd
import json

def load_and_merge_data(reviews_csv, metadata_csv, output_csv):
    try:
        reviews_df = pd.read_csv(reviews_csv)
        initial_reviews_count = len(reviews_df)
        
        metadata_df = pd.read_csv(metadata_csv)
        
        duplicate_asins = metadata_df['asin'].duplicated()
        if duplicate_asins.any():
            print(f"Found {duplicate_asins.sum()} duplicate ASINs in metadata. Keeping first occurrence.")
            metadata_df = metadata_df.drop_duplicates(subset='asin', keep='first')
        
        merged_df = reviews_df.merge(
            metadata_df,
            on='asin',
            how='left',
            validate='m:1'
        )
        
        if len(merged_df) != initial_reviews_count:
            print("WARNING: Merged dataset has a different number of rows than original reviews!")
            print(f"Original reviews: {initial_reviews_count}")
            print(f"Merged reviews: {len(merged_df)}")
        
        
        metadata_columns = [col for col in metadata_df.columns if col != 'asin']
        for col in metadata_columns:
            merged_df[col] = merged_df[col].fillna('Unknown')
        
        print(f"\nSaving merged data to {output_csv}")
        merged_df.to_csv(output_csv, index=False, encoding='utf-8')
        
        return merged_df
        
    except FileNotFoundError as e:
        print(f"Error: Could not find one of the input files: {str(e)}")
        return None
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return None

In [5]:
reviews_path = "../data/processed/final_cleaned_reviews.csv"
metadata_path = "../data/processed/final_processed_metadata.csv"
output_path = "../data/processed/merged_reviews_metadata_final.csv"

load_and_merge_data(reviews_path, metadata_path, output_path)


Loading reviews from ../data/processed/final_cleaned_reviews.csv
Initial reviews count: 4966
Loading metadata from ../data/processed/final_processed_metadata.csv


  metadata_df = pd.read_csv(metadata_csv)


Found 30368 duplicate ASINs in metadata. Keeping first occurrence.
Merging datasets...

Merge Statistics:
Number of unique products in metadata: 756077
Number of unique products in reviews: 2922
Number of reviews with matching product info: 3128
Number of reviews without product info: 1838

Saving merged data to ../data/processed/merged_reviews_metadata_final.csv


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,review_date,...,feature,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details
0,A2YQ9AX4GOTA0S,B00G4771KA,t. kyle baskett,"[0, 0]",Helps out tremendously.,5,GoPro fun.,1406073600,"07 23, 2014",2014-07-23,...,Product Compatible with both the Hero 3 and He...,Unknown,B00NIYNUBG|B072K25157|B01FQLGUR4|B00J9RO4CU|B0...,Camera & Photo,"class=""a-bordered a-horizontal-stripes a-spa...",Unknown,$99.99,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,{'\\n Product Dimensions: \\n ': '4.3 x ...
1,A2R1HUYHXV7H18,B00GMTN96U,Kristi Gilleland,"[0, 0]",I've got several of these bluetooth speakers n...,4,"Punchy bass, small size, big sound",1406073600,"07 23, 2014",2014-07-23,...,Unknown,">#316,045 in Cell Phones & Accessories (See To...",B016XTADG2|B01CQOV3YO|B010OYASRG,All Electronics,Unknown,"November 12, 2013",Unknown,Unknown,Unknown,{}
2,A2ITGW4E6U909T,B00KFAGCUM,Monkeybongo,"[0, 0]","For those on older Macbook and Macbook pros, I...",5,Great upgrade for Older Macbooks,1406073600,"07 23, 2014",2014-07-23,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
3,A3FMTE5CRT8BK9,B005B6O3LU,Anonymous,"[0, 0]","Son used for a bit, then wanted something else...",4,Four Stars,1406073600,"07 23, 2014",2014-07-23,...,stylish crocodile pattern leather exterior wit...,">#20,676 in Computers & Accessories > Tablet A...",B0772NB2PG|B073ZD7HBT|B06XKRRXR5|B07BTS2KWK,Computers,"class=""a-bordered a-horizontal-stripes a-spa...","July 7, 2011",Unknown,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,{}
4,A3CW0ZLUO5X2B1,B00JOS04PK,"35-year Technology Consumer ""8-tracks to 802.11""","[1, 1]",...between an iPod Nano (7th generation) and w...,5,Does what it's supposed to: moves data and pow...,1406073600,"07 23, 2014",2014-07-23,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4961,A3DKTNZ9A1DY0X,B0040IG1A6,Anonymous,"[0, 0]",This cable did not work upon receipt. It was ...,2,buyer beware,1405382400,"07 15, 2014",2014-07-15,...,PTC Brand High Speed USB2.0 Certified to suppo...,">#1,321 in Computers & Accessories > Computer ...",B00GLZYG6M|B018LYWWKM|B005LJKEXS|B01L81WQ2O|B0...,Computers,"class=""a-bordered a-horizontal-stripes a-spa...","June 29, 2017",$2.54,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,{}
4962,A2KZQEKUS3G02I,B00B588HY2,G. GOELZ,"[0, 0]",Great card fast plenty of storage...anyone usi...,5,fast card,1405382400,"07 15, 2014",2014-07-15,...,Up to 40MB/s transfer speed|Compatible with mu...,>#815 in Computers & Accessories > Computer Ac...,B00X13ZZ80|B013CP3JDO|B01LORO7Z6,Computers,"class=""a-bordered a-horizontal-stripes a-spa...","February 6, 2013",Unknown,Unknown,Unknown,{}
4963,A1FAETFXIWAELQ,B00H7Y3I4M,"Michael R. Curry ""Tenacious--M""","[0, 0]",I bought this for my new HTPC build.Pros:Every...,5,Every cable you need and more Silverstone qual...,1405382400,"07 15, 2014",2014-07-15,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
4964,APST59ZNX96IG,B004QQTXXY,Anonymous,"[0, 0]",I was looking for something to cut glare. I u...,5,I use my iPad for music and lighting condition...,1405382400,"07 15, 2014",2014-07-15,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


will do the queries to visualize