In [2]:
# First cell - Essential imports
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Visualization settings
sns.set_theme()
%matplotlib inline

In [3]:
# Load the data
def load_data():
    """Load the explored dataset from previous notebook"""
    try:
        df = pd.read_json('../data/raw/reviews_Electronics_5.json.gz', lines=True)
        print(f"Loaded {len(df)} reviews")
        return df
    except Exception as e:
        print(f"Error loading data: {e}")
        return None

df = load_data()

Loaded 1689188 reviews


In [None]:
# Preprocessing class
class ReviewPreprocessor:
    """Class to handle all preprocessing steps for Amazon reviews"""
    
    def __init__(self):
        # Common English stopwords - we can define these ourselves
        self.stop_words = {
            'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
            "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 
            'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 
            'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
            'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 
            'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 
            'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 
            'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 
            'with', 'about', 'against', 'between', 'into', 'through', 'during', 
            'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 
            'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 
            'then', 'once'
        }
    
    def clean_text(self, text):
        """Clean review text"""
        if not isinstan=ce(text, str):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        
        # Remove URLs
        text = re.sub(r'http\S+|www.\S+', '', text)
        
        # Remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        return text
    
    def remove_stopwords(self, text):
        """Remove stopwords without using NLTK"""
        # Simple word splitting (no need for complex tokenization)
        words = text.split()
        # Remove stopwords
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)
    
    def process_dates(self, df):
        """Process and format dates"""
        df['review_date'] = pd.to_datetime(df['unixReviewTime'], unit='s')
        df['formatted_date'] = df['review_date'].dt.strftime('%B %d, %Y')
        return df
    
    def create_features(self, df):
        """Create additional features"""
        df['review_length'] = df['reviewText'].str.len()
        df['word_count'] = df['reviewText'].str.split().str.len()
        return df



In [16]:
# Modified preprocessing function without batching (since we only have 5000 reviews)
def preprocess_data(df):
    """Main preprocessing pipeline"""
    # Create a copy and reset index
    df = df.copy()
    df = df.reset_index(drop=True)  # This fixes the indexing issue
    
    preprocessor = ReviewPreprocessor()
    print(f"Starting preprocessing pipeline for {len(df)} reviews...")
    
    # Remove duplicates
    initial_size = len(df)
    df = df.drop_duplicates(subset=['reviewText'])
    print(f"Removed {initial_size - len(df)} duplicate reviews")
    
    print("Cleaning text and removing stopwords...")
    # Process all reviews at once (no batching needed for 5000 reviews)
    df['cleaned_text'] = df['reviewText'].apply(preprocessor.clean_text)
    df['processed_text'] = df['cleaned_text'].apply(preprocessor.remove_stopwords)
    
    # Process dates and create features
    df = preprocessor.process_dates(df)
    df = preprocessor.create_features(df)
    
    print("Preprocessing complete!")
    return df

In [None]:
# Verification function
def verify_preprocessing(df):
    """Verify the preprocessing results"""
    print("Preprocessing Verification:")
    print("-" * 50)
    
    print("\n1. Data Shape:")
    print(f"Number of reviews: {len(df)}")
    
    print("\n2. Text Cleaning Check:")
    print("Sample original text:")
    print(df['reviewText'].iloc[0][:200])
    print("\nSample cleaned text:")
    print(df['processed_text'].iloc[0][:200])
    
    print("\n3. Feature Statistics:")
    print(df[['review_length', 'word_count']].describe())

In [None]:
# Save function to save data in csv
def save_processed_data(df, filename='../data/processed/processed_reviews.csv'):
    """Save the processed dataset"""
    df.to_csv(filename, index=False)
    print(f"Saved processed data to {filename}")

In [17]:
# Get the most recent 5000 reviews up until July 2014
df['review_date'] = pd.to_datetime(df['unixReviewTime'], unit='s')
cutoff_date = '2014-07-31'

# Filter and sort
recent_reviews = df[df['review_date'] <= cutoff_date].sort_values('review_date', ascending=False).head(5000)

print("Selected Reviews Information:")
print("-" * 50)
print(f"Date range: {recent_reviews['review_date'].min()} to {recent_reviews['review_date'].max()}")
print(f"Number of reviews: {len(recent_reviews)}")

# Now process these reviews
processed_sample = preprocess_data(recent_reviews)

Selected Reviews Information:
--------------------------------------------------
Date range: 2014-07-15 00:00:00 to 2014-07-23 00:00:00
Number of reviews: 5000
Starting preprocessing pipeline for 5000 reviews...
Removed 32 duplicate reviews
Cleaning text and removing stopwords...
Preprocessing complete!


In [18]:
# Cell 5 - Verify sample processing
verify_preprocessing(processed_sample)

Preprocessing Verification:
--------------------------------------------------

1. Data Shape:
Number of reviews: 4968

2. Text Cleaning Check:
Sample original text:
Helps out tremendously.

Sample cleaned text:
helps tremendously

3. Feature Statistics:
       review_length   word_count
count    4968.000000  4968.000000
mean      514.063607    93.649557
std       844.144106   150.748031
min         0.000000     0.000000
25%        85.000000    16.000000
50%       226.000000    43.000000
75%       564.250000   105.000000
max     15312.000000  2734.000000


In [19]:
# Save processed data
save_processed_data(processed_sample)

Saved processed data to ../data/processed/processed_reviews.csv


In [22]:
def preprocessing_completion_check(df):
    """Comprehensive check to verify preprocessing completion"""
    print("PREPROCESSING COMPLETION CHECKLIST")
    print("=" * 50)
    
    # 1. Data Volume and Date Range
    print("\n1. DATA REQUIREMENTS:")
    print("-" * 30)
    print(f"✓ Number of reviews: {len(df)} (~5000 required)")
    print(f"✓ Date range: {df['review_date'].min()} to {df['review_date'].max()}")
    print("  (Should be up to July 2014)")
    
    # 2. Data Quality
    print("\n2. DATA QUALITY:")
    print("-" * 30)
    # Check for missing values
    missing = df.isnull().sum()
    print("✓ Missing values:")
    print(missing[missing > 0] if any(missing > 0) else "None")
    
    # Check for empty reviews
    empty_reviews = len(df[df['processed_text'].str.len() == 0])
    print(f"✓ Empty reviews: {empty_reviews}")
    
    # 3. Text Processing
    print("\n3. TEXT PROCESSING:")
    print("-" * 30)
    # Sample a review to check cleaning
    sample = df.iloc[0]
    print("✓ Text cleaning verification:")
    print(f"  Original: {sample['reviewText'][:100]}...")
    print(f"  Cleaned:  {sample['cleaned_text'][:100]}...")
    print(f"  Final:    {sample['processed_text'][:100]}...")
    
    # 4. Required Features
    print("\n4. REQUIRED FEATURES:")
    print("-" * 30)
    required_columns = [
        'reviewText',        # Original text
        'cleaned_text',      # Cleaned text
        'processed_text',    # Final processed text
        'review_date',       # Datetime
        'formatted_date',    # Formatted date
        'review_length',     # Length features
        'word_count',
        'overall'           # Rating
    ]
    
    missing_cols = [col for col in required_columns if col not in df.columns]
    print("✓ Required columns:", "All present" if not missing_cols else f"Missing: {missing_cols}")
    
    # 5. Statistics
    print("\n5. BASIC STATISTICS:")
    print("-" * 30)
    print(f"✓ Average word count: {df['word_count'].mean():.1f}")
    print(f"✓ Rating distribution:\n{df['overall'].value_counts().sort_index()}")

# Run the completion check
preprocessing_completion_check(processed_sample)

PREPROCESSING COMPLETION CHECKLIST

1. DATA REQUIREMENTS:
------------------------------
✓ Number of reviews: 4968 (~5000 required)
✓ Date range: 2014-07-15 00:00:00 to 2014-07-23 00:00:00
  (Should be up to July 2014)

2. DATA QUALITY:
------------------------------
✓ Missing values:
reviewerName    1020
dtype: int64
✓ Empty reviews: 2

3. TEXT PROCESSING:
------------------------------
✓ Text cleaning verification:
  Original: Helps out tremendously....
  Cleaned:  helps out tremendously...
  Final:    helps tremendously...

4. REQUIRED FEATURES:
------------------------------
✓ Required columns: All present

5. BASIC STATISTICS:
------------------------------
✓ Average word count: 93.6
✓ Rating distribution:
overall
1     271
2     232
3     423
4     985
5    3057
Name: count, dtype: int64


In [28]:
# Clean missing values and empty reviews
def final_cleaning(df):
    """Final cleaning steps for the dataset"""
    print("FINAL CLEANING STEPS")
    print("-" * 50)
    
    # 1. Handle Missing Reviewer Names
    initial_missing = df['reviewerName'].isnull().sum()
    df['reviewerName'] = df['reviewerName'].fillna('Anonymous')
    print(f"✓ Filled {initial_missing} missing reviewer names with 'Anonymous'")
    
    # 2. Handle Empty Reviews
    initial_empty = df[df['processed_text'].str.len() == 0]
    print("\nEmpty Reviews Found:")
    print("-" * 30)
    for idx, review in initial_empty.iterrows():
        print(f"Review {idx}:")
        print(f"Original text: {review['reviewText']}")
        print(f"Cleaned text: {review['cleaned_text']}")
        print(f"Rating: {review['overall']}")
        print("-" * 30)
    
    # Remove empty reviews
    df = df[df['processed_text'].str.len() > 0]
    print(f"\n✓ Removed {len(initial_empty)} empty reviews")
    
    # Final verification
    print("\nFINAL VERIFICATION:")
    print("-" * 30)
    print(f"Final number of reviews: {len(df)}")
    print("\nMissing values check:")
    print(df.isnull().sum()[df.isnull().sum() > 0])
    print("\nEmpty reviews check:", len(df[df['processed_text'].str.len() == 0]))
    
    return df

# Apply final cleaning
processed_df = final_cleaning(processed_sample)

# Save the final cleaned dataset
processed_df.to_csv('../data/processed/final_cleaned_reviews.csv', index=False)
print("\nSaved final cleaned dataset")

# Run the completion check again
print("\nFINAL STATUS CHECK:")
preprocessing_completion_check(processed_df)

FINAL CLEANING STEPS
--------------------------------------------------
✓ Filled 0 missing reviewer names with 'Anonymous'

Empty Reviews Found:
------------------------------
Review 2017:
Original text: 
Cleaned text: 
Rating: 4
------------------------------
Review 3935:
Original text: I have about 8 of them.
Cleaned text: i have about of them
Rating: 5
------------------------------

✓ Removed 2 empty reviews

FINAL VERIFICATION:
------------------------------
Final number of reviews: 4966

Missing values check:
Series([], dtype: int64)

Empty reviews check: 0

Saved final cleaned dataset

FINAL STATUS CHECK:
PREPROCESSING COMPLETION CHECKLIST

1. DATA REQUIREMENTS:
------------------------------
✓ Number of reviews: 4966 (~5000 required)
✓ Date range: 2014-07-15 00:00:00 to 2014-07-23 00:00:00
  (Should be up to July 2014)

2. DATA QUALITY:
------------------------------
✓ Missing values:
None
✓ Empty reviews: 0

3. TEXT PROCESSING:
------------------------------
✓ Text cleaning ve