Reference: https://www.geeksforgeeks.org/nlp/amazon-product-reviews-sentiment-analysis-in-python/


In [31]:
!pip install datasets
from datasets import load_dataset



In [32]:
from datasets import load_dataset
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import nltk
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import torch
import pickle
from datetime import datetime


# Download required NLTK data (run once). 'punkt_tab' is fetched here as well
# so that every tokenizer resource is in place before word_tokenize is called.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Initialize once; these objects are shared by all worker threads
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# NLTK loads corpora lazily on first access. Make that first WordNet access
# here, on the main thread, instead of concurrently inside the worker threads.
lemmatizer.lemmatize('warmup')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [33]:
# Review categories to process. Each name is interpolated into the download
# URL built by process_category (".../review_categories/{category}.jsonl"),
# so the spelling must match the remote file names exactly.
categories = [
    "All_Beauty",
    "Amazon_Fashion",
    "Appliances",
    "Arts_Crafts_and_Sewing",
    "Automotive",
    "Baby_Products",
    "Beauty_and_Personal_Care",
    "Books",
    "CDs_and_Vinyl",
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Digital_Music",
    "Electronics",
    "Gift_Cards",
    "Grocery_and_Gourmet_Food",
    "Handmade_Products",
    "Health_and_Household",
    "Health_and_Personal_Care",
    "Home_and_Kitchen",
    "Industrial_and_Scientific",
    "Kindle_Store",
    "Magazine_Subscriptions",
    "Movies_and_TV",
    "Musical_Instruments",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Pet_Supplies",
    "Software",
    "Sports_and_Outdoors",
    "Subscription_Boxes",
    "Tools_and_Home_Improvement",
    "Toys_and_Games",
    "Video_Games",
    "Unknown"
]

In [34]:
def clean_text(text):
    """
    Clean and preprocess a single review text.

    Steps:
    - Convert to lowercase
    - Tokenize into words
    - Drop stopwords and every token that is not purely alphabetic
      (this also removes punctuation and numbers)
    - Lemmatize the remaining words

    Args:
        text: Raw review text. ``None``, non-string values and blank strings
            are treated as "no text".

    Returns:
        list[str]: Lemmatized tokens in their original order; an empty list
        when there is nothing to tokenize.
    """
    # A caller may pass a missing/null text field; return an empty token list
    # instead of raising on ``None.lower()``.
    if not isinstance(text, str) or not text.strip():
        return []

    tokens = word_tokenize(text.lower())
    # str.isalpha() is False for any token containing punctuation, so no
    # separate punctuation check is needed.
    return [
        lemmatizer.lemmatize(w)
        for w in tokens
        if w.isalpha() and w not in stop_words
    ]

In [35]:
# Load the reviews of every category, keeping each review's category, text and star rating (1-5) together with its metadata.

# For each category we read the first 5000 reviews from the stream, then randomly sample 500 of them.
def process_category(category, sample_size=500, reservoir_size=5000):
    """
    Load, sample and clean the reviews of one category.

    Streams the category's JSONL file from the Hugging Face hub, keeps the
    first ``reservoir_size`` rows, draws a seeded random sample of at most
    ``sample_size`` of them, and tokenizes the sampled texts with ``clean_text``.

    Args:
        category: Category name; it is interpolated into the download URL.
        sample_size: Maximum number of reviews to sample.
        reservoir_size: Number of leading rows read from the stream; the
            sample is drawn from these rows only.

    Returns:
        tuple: ``(category, records)``. ``records`` is a list of dicts with the
        keys ``category``, ``tokens``, ``rating``, ``original_text``, ``title``,
        ``images``, ``verified_purchase``, ``asin``, ``parent_asin``,
        ``user_id``, ``datetime`` and ``helpful_vote``. Reviews whose text
        yields no tokens are skipped. If anything raises, the error is printed
        and ``(category, [])`` is returned.
    """
    try:
        # load the data from hugging face
        data_url = f"https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/review_categories/{category}.jsonl"

        dataset = load_dataset("json", data_files=data_url, split="train", streaming=True)

        raw_rows = []
        for i, row in enumerate(dataset):
            if i >= reservoir_size:
                break

            raw_rows.append({
                'text': row.get('text', ''),
                'rating': row.get('rating', row.get('overall', None)),
                'title': row.get('title', ''),
                'images': row.get('images', []),
                'verified_purchase': row.get('verified_purchase', False),
                'asin': row.get('asin', ''),
                'parent_asin': row.get('parent_asin', ''),
                'user_id': row.get('user_id', ''),
                'timestamp': row.get('timestamp', None),
                'helpful_vote': row.get('helpful_vote', 0)
            })

        # Use a private, seeded generator: this function runs in several
        # threads at once, and reseeding the shared module-level generator
        # would let the threads disturb each other's sampling.
        rng = random.Random(42)
        sampled_rows = rng.sample(raw_rows, min(sample_size, len(raw_rows)))

        # clean the data: tokenize each review
        processed_data = []
        for row in sampled_rows:
            # A present-but-null field must not abort the whole category.
            text = row['text'] if isinstance(row['text'], str) else ''
            tokens = clean_text(text)
            if not tokens:
                continue

            raw_title = row['title']
            title = raw_title.lower().strip() if isinstance(raw_title, str) else ''

            # The timestamp is divided by 1000 (milliseconds -> seconds).
            # NOTE(review): fromtimestamp() yields local time of the machine
            # running the notebook - confirm that is the intended reference.
            timestamp = row['timestamp']
            time = datetime.fromtimestamp(timestamp / 1000) if timestamp is not None else None

            processed_data.append({
                'category': category,
                'tokens': tokens,
                'rating': row['rating'],
                'original_text': text,
                'title': title,
                'images': row['images'],
                'verified_purchase': row['verified_purchase'],
                'asin': row['asin'],
                'parent_asin': row['parent_asin'],
                'user_id': row['user_id'],
                'datetime': time,
                'helpful_vote': row['helpful_vote']
            })

        print(f"{category}: {len(processed_data)} reviews processed")
        return category, processed_data

    except Exception as e:
        # Best effort: a failing category is reported and returned empty so
        # the remaining categories can still be processed.
        print(f"{category}: Error - {str(e)}")
        return category, []
#

In [36]:
# Fetch the 'punkt_tab' tokenizer data before the processing cell runs.
# NOTE(review): presumably needed by word_tokenize on the installed NLTK
# version - confirm, and consider moving this next to the other downloads.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [37]:
# execute
# Run the per-category pipeline on a small thread pool. executor.map yields
# the (category, records) pairs in the order of `categories`, so the
# collection - and every table built from it - has the same ordering on each
# run, no matter which download happens to finish first.
with ThreadPoolExecutor(max_workers=4) as executor:
    category_collection = dict(executor.map(process_category, categories))

All_Beauty: 500 reviews processed
Amazon_Fashion: 500 reviews processed
Appliances: 499 reviews processed
Arts_Crafts_and_Sewing: 499 reviews processed
Automotive: 498 reviews processed
Baby_Products: 499 reviews processed
Beauty_and_Personal_Care: 500 reviews processed
CDs_and_Vinyl: 500 reviews processed
Books: 500 reviews processed
Cell_Phones_and_Accessories: 494 reviews processed
Clothing_Shoes_and_Jewelry: 500 reviews processed
Digital_Music: 498 reviews processed
Electronics: 494 reviews processed
Gift_Cards: 494 reviews processed
Grocery_and_Gourmet_Food: 500 reviews processed
Health_and_Household: 500 reviews processed
Handmade_Products: 499 reviews processed
Health_and_Personal_Care: 500 reviews processed
Home_and_Kitchen: 500 reviews processed
Industrial_and_Scientific: 497 reviews processed
Magazine_Subscriptions: 499 reviews processed
Kindle_Store: 500 reviews processed
Movies_and_TV: 500 reviews processed
Musical_Instruments: 500 reviews processed
Office_Products: 499 rev

In [39]:
# Flatten the per-category record lists into one row dict per review, adding
# the joined token string and the token count as derived columns.
final_list = [
    {
        'category': record['category'],
        'rating': record['rating'],
        'tokens': record['tokens'],
        'text_cleaned': ' '.join(record['tokens']),
        'original_text': record['original_text'],
        'token_count': len(record['tokens']),
        'title': record['title'],
        'images': record['images'],
        'verified_purchase': record['verified_purchase'],
        'asin': record['asin'],
        'parent_asin': record['parent_asin'],
        'user_id': record['user_id'],
        'datetime': record['datetime'],
        'helpful_vote': record['helpful_vote'],
    }
    for records in category_collection.values()
    for record in records
]

df = pd.DataFrame(final_list)

In [None]:
# Show the first rows of the columns most useful for a quick sanity check.
preview_columns = ['category', 'rating', 'token_count', 'text_cleaned']
print(df[preview_columns].head())

In [40]:
# Persist the assembled reviews table to Parquet, leaving out the DataFrame index.
df.to_parquet(path='amazon_user_reviews.parquet', index=False)