Reference: https://www.geeksforgeeks.org/nlp/amazon-product-reviews-sentiment-analysis-in-python/


In [None]:
!pip install datasets
from datasets import load_dataset



In [None]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import torch
import pickle
from datetime import datetime
import json


# Download required NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize once; these objects are shared by every worker thread
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# NLTK loads the WordNet corpus lazily, on first use. Trigger that first use
# here, on the main thread, so that several worker threads do not race to load
# the corpus at the same time (a failure there would be swallowed by the
# per-category error handler and silently yield an empty category).
lemmatizer.lemmatize('reviews')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Category names processed by this notebook, in submission order. Each name is
# substituted into the per-category JSONL URL when that category is loaded.
categories = [
    "All_Beauty", "Amazon_Fashion", "Appliances",
    "Arts_Crafts_and_Sewing", "Automotive", "Baby_Products",
    "Beauty_and_Personal_Care", "Books", "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories", "Clothing_Shoes_and_Jewelry", "Digital_Music",
    "Electronics", "Gift_Cards", "Grocery_and_Gourmet_Food",
    "Handmade_Products", "Health_and_Household", "Health_and_Personal_Care",
    "Home_and_Kitchen", "Industrial_and_Scientific", "Kindle_Store",
    "Magazine_Subscriptions", "Movies_and_TV", "Musical_Instruments",
    "Office_Products", "Patio_Lawn_and_Garden", "Pet_Supplies",
    "Software", "Sports_and_Outdoors", "Subscription_Boxes",
    "Tools_and_Home_Improvement", "Toys_and_Games", "Video_Games",
    "Unknown",
]

In [None]:
def clean_text(text):
    """
    Clean and preprocess text data
    - Convert to lowercase
    - Tokenize into words
    - Remove stopwords and every token that is not purely alphabetic
      (this also removes punctuation and numbers)
    - Lemmatize words

    Args:
        text: Raw review text. ``None`` and other non-string values are
            treated as empty text instead of raising.

    Returns:
        list of str: the lemmatized tokens that survived filtering; empty
        when there is nothing to tokenize.
    """
    # Missing review bodies must not raise: the caller would otherwise lose
    # the whole category to a single bad row.
    if not isinstance(text, str):
        return []

    text = text.lower().strip()
    if not text:
        return []

    tokens = word_tokenize(text)
    # `isalpha()` already rejects punctuation tokens, so no separate
    # punctuation test is needed.
    return [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w.isalpha() and w not in stop_words
    ]

In [None]:
# Load reviews for every category; each kept review records its category, cleaned
# tokens, star rating (1-5), sentiment code (0-2) and review metadata.

# For each category we read the first 5000 reviews, then sample up to 600 reviews
# with a balanced sentiment distribution (up to 200 per group):
# rating 1-2 -> sentiment 2, rating 3 -> sentiment 1, rating 4-5 -> sentiment 0
def _rating_to_sentiment(rating):
    """Map a star rating to this notebook's sentiment code, or None if unrecognised.

    Encoding: ratings 1-2 -> 2, rating 3 -> 1, ratings 4-5 -> 0.

    NOTE(review): earlier inline comments described code 2 as "positive" and
    code 0 as "negative", which contradicts the star ratings they are assigned
    to (1-2 stars are the low ratings). The numeric encoding is kept exactly as
    it was; confirm with downstream consumers which meaning they expect.
    """
    if rating in (1, 2):
        return 2
    if rating == 3:
        return 1
    if rating in (4, 5):
        return 0
    return None


def _timestamp_to_datetime(timestamp_ms):
    """Convert a millisecond timestamp to a naive local-time datetime, or None if missing/invalid."""
    if not timestamp_ms:
        return None
    try:
        return datetime.fromtimestamp(timestamp_ms / 1000)
    except (TypeError, ValueError, OSError, OverflowError):
        return None


def process_category(category, reservoir_size=5000, sample_per_group=200, seed=42):
    """Load, sample and tokenize the reviews of one category.

    Args:
        category: Category name; substituted into the dataset URL.
        reservoir_size: Number of leading rows of the stream to read. Despite
            the name this is a simple head of the stream, not reservoir sampling.
        sample_per_group: Maximum number of reviews drawn from each sentiment group.
        seed: Seed for the private random generator used for sampling.

    Returns:
        tuple: ``(category, processed_data)`` where ``processed_data`` is a list
        of dicts with the keys ``category``, ``tokens``, ``rating``,
        ``sentiment``, ``original_text``, ``title``, ``images``,
        ``verified_purchase``, ``asin``, ``parent_asin``, ``user_id``,
        ``datetime`` and ``helpful_vote``. Reviews whose text yields no tokens
        are dropped, so a group may end up with fewer than
        ``sample_per_group`` entries. On any error the message is printed and
        ``(category, [])`` is returned.
    """
    try:
        # load the data from hugging face
        data_url = f"https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/review_categories/{category}.jsonl"

        dataset = load_dataset("json", data_files=data_url, split="train", streaming=True)

        # Bucket the first `reservoir_size` rows by sentiment code. The key order
        # (2, 1, 0) is also the order in which the groups are sampled below.
        sentiment_groups = {2: [], 1: [], 0: []}
        for i, row in enumerate(dataset):
            if i >= reservoir_size:
                break

            rating = row.get('rating', row.get('overall', None))
            sentiment = _rating_to_sentiment(rating)
            if sentiment is None:
                # Missing or unexpected rating: the review cannot be labelled.
                continue

            sentiment_groups[sentiment].append({
                # `or ''` also covers keys that are present but hold None
                'text': row.get('text') or '',
                'rating': rating,
                'title': row.get('title') or '',
                'images': row.get('images', []),
                'verified_purchase': row.get('verified_purchase', False),
                'asin': row.get('asin', ''),
                'parent_asin': row.get('parent_asin', ''),
                'user_id': row.get('user_id', ''),
                'timestamp': row.get('timestamp', None),
                'helpful_vote': row.get('helpful_vote', 0)
            })

        # A private generator keeps sampling reproducible even though several
        # categories are processed concurrently: re-seeding the shared global
        # generator from multiple threads would interleave their draws.
        rng = random.Random(seed)

        # Sample from each sentiment group, then clean the data: tokenize each review
        processed_data = []
        for sentiment, group in sentiment_groups.items():
            if not group:
                continue
            for row in rng.sample(group, min(sample_per_group, len(group))):
                tokens = clean_text(row['text'])
                if not tokens:
                    # Nothing left after cleaning; skip the review.
                    continue

                processed_data.append({
                    'category': category,
                    'tokens': tokens,
                    'rating': row['rating'],
                    'sentiment': sentiment,
                    'original_text': row['text'],
                    'title': row['title'].lower().strip(),
                    'images': row['images'],
                    'verified_purchase': row['verified_purchase'],
                    'asin': row['asin'],
                    'parent_asin': row['parent_asin'],
                    'user_id': row['user_id'],
                    'datetime': _timestamp_to_datetime(row['timestamp']),
                    'helpful_vote': row['helpful_vote']
                })

        print(f"{category}: {len(processed_data)} reviews processed")
        return category, processed_data

    except Exception as e:
        # Deliberate best-effort: one failing category must not abort the others.
        print(f"{category}: Error - {str(e)}")
        return category, []
#

In [None]:
# Fetch the 'punkt_tab' tokenizer data as well (in addition to 'punkt' above).
# NOTE(review): presumably needed by word_tokenize on newer NLTK releases — confirm,
# and consider moving this next to the other nltk.download calls.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# execute
# Run process_category for every category on a pool of 4 worker threads and
# gather the (category, reviews) pairs into a dict, in completion order.
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(process_category, cat): cat for cat in categories}
    category_collection = dict(
        finished.result() for finished in as_completed(futures)
    )

Appliances: 499 reviews processed
Amazon_Fashion: 500 reviews processed
All_Beauty: 500 reviews processed
Arts_Crafts_and_Sewing: 499 reviews processed
Automotive: 498 reviews processed
Baby_Products: 499 reviews processed
Beauty_and_Personal_Care: 500 reviews processed
Books: 500 reviews processed
CDs_and_Vinyl: 500 reviews processed
Cell_Phones_and_Accessories: 494 reviews processed
Digital_Music: 498 reviews processed
Clothing_Shoes_and_Jewelry: 500 reviews processed
Gift_Cards: 494 reviews processed
Electronics: 494 reviews processed
Grocery_and_Gourmet_Food: 500 reviews processed
Handmade_Products: 499 reviews processed
Health_and_Personal_Care: 500 reviews processed
Health_and_Household: 500 reviews processed
Home_and_Kitchen: 500 reviews processed
Industrial_and_Scientific: 497 reviews processed
Kindle_Store: 500 reviews processed
Magazine_Subscriptions: 499 reviews processed
Movies_and_TV: 500 reviews processed
Musical_Instruments: 500 reviews processed
Office_Products: 499 rev

In [None]:
# Flatten the per-category results into one record per review. Key order below
# fixes the column order of the resulting DataFrame.
leading_fields = ('category', 'rating', 'sentiment', 'tokens')
trailing_fields = (
    'title', 'images', 'verified_purchase', 'asin',
    'parent_asin', 'user_id', 'datetime', 'helpful_vote',
)

final_list = [
    {
        **{field: review[field] for field in leading_fields},
        # Derived columns: space-joined tokens and their count
        'text_cleaned': ' '.join(review['tokens']),
        'original_text': review['original_text'],
        'token_count': len(review['tokens']),
        **{field: review[field] for field in trailing_fields},
    }
    for reviews in category_collection.values()
    for review in reviews
]

df = pd.DataFrame(final_list)

In [None]:
# Preview the first rows of the assembled DataFrame
df.head()

Unnamed: 0,category,rating,sentiment,tokens,text_cleaned,original_text,token_count,title,images,verified_purchase,asin,parent_asin,user_id,datetime,helpful_vote
0,Appliances,5.0,positive,"[exelent, save, coffee, water, waste, easy, cl...",exelent save coffee water waste easy clean act...,"Exelent, save on coffee, water, no waste, e...",14,savings,[],True,B00LGEKOMS,B07RNJY499,AEOVCZC77QZJQPBIAIKCFV7AS7PA,2017-10-16 22:46:40.529,0
1,Appliances,5.0,positive,"[ordered, wrong, part, quality, part, seemed, ...",ordered wrong part quality part seemed good,Ordered wrong part but the quality of this par...,7,check part numbers,[],True,B094YWPF68,B094YWPF68,AEU2V36H3G45EFVLASUPD56B7ATQ,2021-08-30 19:50:53.564,0
2,Appliances,5.0,positive,[described],described,as described,1,five stars,[],True,B00LQL043A,B00LQL043A,AE7FJMYY4AKWBDASLTMMQ5WASB7A,2016-09-28 15:08:18.000,0
3,Appliances,5.0,positive,"[used, replace, broken, door, bin, back, ice, ...",used replace broken door bin back ice dispense...,Used to replace a broken door bin on the back ...,18,fits,[],True,B00C29G3N0,B00C29G3N0,AGM4WN3EOAA3RUAUXGH2S2AUL6WA,2019-01-06 21:01:13.614,3
4,Appliances,5.0,positive,"[husband, us, broke, cleaning, replace, well, ...",husband us broke cleaning replace well say one...,My husband uses these. I broke it when cleanin...,35,easy peasy to use and clean.,[],True,B01DP1IWKU,B092LLM7H3,AEBXJRP4COCKP22LPKUDVCQ7JKVQ,2019-08-09 02:13:50.975,0


In [None]:
# Write the assembled DataFrame to a Parquet file (relative path, so it lands in
# the current working directory); index=False leaves the row index out of the file.
df.to_parquet('amazon_user_reviews_3cat.parquet', index=False)