In [2]:
# First cell - Essential imports
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Visualization settings
sns.set_theme()
%matplotlib inline

In [3]:
# Load the data
def load_data(path='../data/raw/reviews_Electronics_5.json.gz'):
    """Load the raw reviews file (JSON lines) into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the JSON-lines file to read. Defaults to the raw
        Electronics 5-core dump used by this notebook, so existing
        ``load_data()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame or None
        The loaded reviews, or ``None`` when reading fails. The error is
        printed rather than raised, so callers must check for ``None``.
    """
    # Only the read itself is guarded; the success message sits outside
    # the try block so it cannot be mistaken for part of the error path.
    try:
        df = pd.read_json(path, lines=True)
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

    print(f"Loaded {len(df)} reviews")
    return df

df = load_data()

Loaded 1689188 reviews


In [4]:
# Preprocessing class
class ReviewPreprocessor:
    """Text-cleaning and feature helpers for the Amazon review DataFrame.

    The text helpers (``clean_text``, ``remove_stopwords``) operate on a
    single string; the frame helpers (``process_dates``,
    ``create_features``) add columns to the DataFrame they are given and
    return that same object.
    """

    # Compiled once and shared by every instance; clean_text runs per review.
    _HTML_TAG_RE = re.compile(r'<[^>]+>')
    # The dot after "www" is escaped so that ordinary words containing
    # "www" (e.g. "awww!!") are not mistaken for URLs.
    _URL_RE = re.compile(r'http\S+|www\.\S+')
    _NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')

    def __init__(self):
        # Common English stopwords - we can define these ourselves
        self.stop_words = {
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
            'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
            'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
            'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
            'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
            'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
            'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
            'with', 'about', 'against', 'between', 'into', 'through', 'during',
            'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
            'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',
            'then', 'once'
        }
        # clean_text() deletes apostrophes, so "you're" reaches
        # remove_stopwords() as "youre". Register the apostrophe-free
        # spellings too, otherwise the contraction entries never match.
        self.stop_words |= {word.replace("'", "") for word in self.stop_words}

    def clean_text(self, text):
        """Normalise one review to lowercase, letters-only, single-spaced text.

        Parameters
        ----------
        text : object
            The raw review. Anything that is not a ``str`` (e.g. NaN/None)
            yields an empty string.

        Returns
        -------
        str
            The cleaned text.
        """
        if not isinstance(text, str):
            return ""

        # Convert to lowercase
        text = text.lower()

        # Replace HTML tags with a space (not nothing) so that words on
        # either side of a tag such as <br/> stay separate.
        text = self._HTML_TAG_RE.sub(' ', text)

        # Remove URLs
        text = self._URL_RE.sub('', text)

        # Remove special characters and numbers
        text = self._NON_LETTER_RE.sub('', text)

        # Collapse runs of whitespace and trim the ends
        return ' '.join(text.split())

    def remove_stopwords(self, text):
        """Drop every whitespace-separated token found in ``self.stop_words``.

        Parameters
        ----------
        text : str
            Text to filter; matching is exact, so it is expected to be
            already lowercased (as ``clean_text`` output is).

        Returns
        -------
        str
            The remaining tokens joined by single spaces.
        """
        # Simple word splitting (no need for complex tokenization)
        return ' '.join(
            word for word in text.split() if word not in self.stop_words
        )

    def process_dates(self, df):
        """Add ``review_date`` and ``formatted_date`` columns to ``df``.

        ``unixReviewTime`` is parsed as seconds since the epoch. The
        columns are written onto the frame passed in, which is also
        returned.
        """
        df['review_date'] = pd.to_datetime(df['unixReviewTime'], unit='s')
        df['formatted_date'] = df['review_date'].dt.strftime('%B %d, %Y')
        return df

    def create_features(self, df):
        """Add ``review_length`` and ``word_count`` columns to ``df``.

        Both are computed from the raw ``reviewText`` column (characters
        and whitespace-separated tokens respectively). The columns are
        written onto the frame passed in, which is also returned.
        """
        df['review_length'] = df['reviewText'].str.len()
        df['word_count'] = df['reviewText'].str.split().str.len()
        return df



In [5]:
# Cell 1 - Define preprocessing function
def preprocess_data(df, batch_size=10000):
    """Main preprocessing pipeline.

    Works on a copy of ``df``: drops rows whose ``reviewText`` duplicates
    an earlier row, cleans the text and removes stopwords batch by batch,
    then adds the date and length features.

    Parameters
    ----------
    df : pandas.DataFrame
        Reviews with at least ``reviewText`` and ``unixReviewTime``.
    batch_size : int, optional
        Number of rows handled per progress step. Must be positive.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated copy with ``cleaned_text``, ``processed_text``,
        ``review_date``, ``formatted_date``, ``review_length`` and
        ``word_count`` added.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive number.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    df = df.copy()
    preprocessor = ReviewPreprocessor()
    print(f"Starting preprocessing pipeline for {len(df)} reviews...")

    # Remove duplicates
    initial_size = len(df)
    df = df.drop_duplicates(subset=['reviewText'])
    print(f"Removed {initial_size - len(df)} duplicate reviews")

    # Process in batches.
    # Batches are cut by POSITION (.iloc). After drop_duplicates the index
    # labels have gaps, so label-based .loc[start:end] slices no longer
    # line up with row counts: rows whose label is >= len(df) would never
    # be visited and would be left as NaN.
    n_rows = len(df)
    total_batches = -(-n_rows // batch_size)  # ceiling division
    print("Cleaning text and removing stopwords...")
    cleaned_values = []
    processed_values = []
    for batch_no, start in enumerate(range(0, n_rows, batch_size), start=1):
        print(f"Processing batch {batch_no}/{total_batches}")

        # Clean and process text
        cleaned = df['reviewText'].iloc[start:start + batch_size].apply(
            preprocessor.clean_text
        )
        cleaned_values.extend(cleaned.tolist())
        processed_values.extend(
            cleaned.apply(preprocessor.remove_stopwords).tolist()
        )

    # Plain lists are assigned positionally, so the result does not depend
    # on what the index labels look like.
    df['cleaned_text'] = cleaned_values
    df['processed_text'] = processed_values

    # Process dates and create features
    df = preprocessor.process_dates(df)
    df = preprocessor.create_features(df)

    print("Preprocessing complete!")
    return df

In [6]:
# Cell 2 - Define verification function
def verify_preprocessing(df):
    """Print a short report on a preprocessed review DataFrame.

    Shows the row count, the first 200 characters of the first row's
    ``reviewText`` and ``processed_text``, and ``describe()`` statistics
    for ``review_length`` and ``word_count``.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of the preprocessing pipeline; must contain the four
        columns named above.

    Returns
    -------
    None
        Output is printed only.
    """
    def _preview(column):
        # The first row may not exist (empty frame) or may hold a
        # non-string such as None/NaN; slicing those would raise, so
        # return a placeholder instead.
        if len(df) == 0:
            return "<no rows>"
        value = df[column].iloc[0]
        return value[:200] if isinstance(value, str) else "<missing>"

    print("Preprocessing Verification:")
    print("-" * 50)

    print("\n1. Data Shape:")
    print(f"Number of reviews: {len(df)}")

    print("\n2. Text Cleaning Check:")
    print("Sample original text:")
    print(_preview('reviewText'))
    print("\nSample cleaned text:")
    print(_preview('processed_text'))

    print("\n3. Feature Statistics:")
    print(df[['review_length', 'word_count']].describe())

In [7]:
# Cell 3 - Define save function
def save_processed_data(df, filename='../data/processed/processed_reviews.csv'):
    """Save the processed dataset as CSV (without the index column).

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to write.
    filename : str or path-like, optional
        Destination file. Its parent directory is created if it does not
        exist yet, so a fresh checkout does not fail on the first save.
    """
    # Local import: pathlib is only needed here and the notebook's import
    # cell does not provide it.
    from pathlib import Path

    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(filename, index=False)
    print(f"Saved processed data to {filename}")

In [8]:
# Cell 4 - Process sample data
# Process a small sample first
# Smoke test: run the pipeline on the first 1000 rows before the full run.
sample_df = df.head(1000)  # Start with 1000 reviews
print("Testing preprocessing on sample...")
# preprocess_data works on its own copy, so sample_df and df are left untouched.
processed_sample = preprocess_data(sample_df)

Testing preprocessing on sample...
Starting preprocessing pipeline for 1000 reviews...
Removed 1 duplicate reviews
Cleaning text and removing stopwords...
Processing batch 1/1
Preprocessing complete!


In [None]:
# Cell 5 - Verify sample processing
verify_preprocessing(processed_sample)

Preprocessing Verification:
--------------------------------------------------

1. Data Shape:
Number of reviews: 999

2. Text Cleaning Check:
Sample original text:
We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started fr

Sample cleaned text:
got gps husband otr road trucker very impressed shipping time arrived few days earlier than expected within week use however started freezing could just glitch unit worked great when worked will work 

3. Feature Statistics:
       review_length   word_count
count     999.000000   999.000000
mean      789.534535   146.160160
std      1188.697647   215.970236
min         0.000000     0.000000
25%       165.500000    32.000000
50%       345.000000    67.000000
75%       871.000000   165.000000
max     11622.000000  2079.000000


: 

In [None]:
# Cell 6 - Process full dataset (only run if sample looks good) -- TODO: Check if need to do for whole dataset or only 5000 reviews?
# Full run over every loaded review; preprocess_data prints one progress
# line per batch and returns a new frame, leaving df unchanged.
print("Processing full dataset...")
processed_df = preprocess_data(df)

Processing full dataset...
Starting preprocessing pipeline for 1689188 reviews...
Removed 2019 duplicate reviews
Cleaning text and removing stopwords...
Processing batch 1/169
Processing batch 2/169
Processing batch 3/169
Processing batch 4/169
Processing batch 5/169
Processing batch 6/169
Processing batch 7/169
Processing batch 8/169
Processing batch 9/169
Processing batch 10/169
Processing batch 11/169
Processing batch 12/169
Processing batch 13/169
Processing batch 14/169
Processing batch 15/169
Processing batch 16/169
Processing batch 17/169
Processing batch 18/169
Processing batch 19/169
Processing batch 20/169
Processing batch 21/169
Processing batch 22/169
Processing batch 23/169
Processing batch 24/169
Processing batch 25/169
Processing batch 26/169
Processing batch 27/169
Processing batch 28/169
Processing batch 29/169
Processing batch 30/169
Processing batch 31/169
Processing batch 32/169
Processing batch 33/169
Processing batch 34/169
Processing batch 35/169
Processing batch

In [None]:
# Cell 7 - Verify full dataset
verify_preprocessing(processed_df)

In [None]:
# Cell 8 - Save processed data
save_processed_data(processed_df)