# Yelp Sentiment Analysis - Data Preprocessing Utilities

This notebook provides the `YelpDataProcessor` class and a set of helper functions that prepare Yelp restaurant and hotel reviews for sentiment analysis with DistilBERT.

## Project Overview
- **Dataset**: Yelp restaurant and hotel reviews
- **Task**: Sentiment analysis (positive, negative, neutral)
- **Model**: DistilBERT for text classification
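- **Preprocessing**: Row filtering (missing values, unknown labels) on an already-cleaned `cleaned_text` column, label encoding, stratified train/validation/test splitting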

## 1. Import Required Libraries

In [None]:
import logging
import os
import pickle
from typing import List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Configure logging at INFO level so the progress messages logged by the
# code below are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the class and helper functions in this notebook.
logger = logging.getLogger(__name__)

# Set the global style and colour palette for all plots created afterwards.
# NOTE(review): the 'seaborn-v0_8' style name presumably requires
# matplotlib >= 3.6 - confirm against the pinned matplotlib version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 2. YelpDataProcessor Class Definition

In [None]:
class YelpDataProcessor:
    """Prepare a Yelp review table for sentiment classification.

    The processor works on a DataFrame that has a ``cleaned_text`` column
    (the review text) and a ``star_sentiment`` column holding the string
    labels ``"negative"``, ``"neutral"`` or ``"positive"``.  It covers
    loading, row filtering, label encoding, stratified splitting, plotting
    and pickle-based persistence of the resulting splits.

    Attributes:
        label_encoder: ``LabelEncoder`` instance; fitted by ``encode_labels``.
        label_map: Mapping ``{label name: integer id}``; ``None`` until
            ``encode_labels`` has been called.
    """

    # Column names and label vocabulary shared by all methods.
    TEXT_COLUMN = "cleaned_text"
    SENTIMENT_COLUMN = "star_sentiment"
    LABEL_COLUMN = "label"
    VALID_SENTIMENTS = ("negative", "neutral", "positive")

    def __init__(self):
        """Create a processor with an unfitted label encoder."""
        self.label_encoder = LabelEncoder()
        # Populated by encode_labels(): {label name -> integer id}.
        self.label_map = None

    def load_data(self, file_path: str) -> pd.DataFrame:
        """Load the Yelp dataset from a CSV file.

        Args:
            file_path: Path to the CSV file.

        Returns:
            DataFrame with the file's contents, unmodified.
        """
        logger.info("Loading data from %s", file_path)
        df = pd.read_csv(file_path)
        logger.info("Loaded dataset with shape: %s", df.shape)
        return df

    def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Drop incomplete rows and rows with an unknown sentiment label.

        Args:
            df: Input DataFrame containing the ``cleaned_text`` and
                ``star_sentiment`` columns.  It is not modified.

        Returns:
            New DataFrame (index reset to ``0..n-1``) that only contains rows
            with non-missing text and a label in ``VALID_SENTIMENTS``.
        """
        logger.info("Cleaning data...")

        # Rows without text or without a label cannot be used for training.
        df_clean = df.dropna(subset=[self.TEXT_COLUMN, self.SENTIMENT_COLUMN])

        # Keep only the three supported sentiment classes.
        known_label = df_clean[self.SENTIMENT_COLUMN].isin(self.VALID_SENTIMENTS)
        df_clean = df_clean[known_label].reset_index(drop=True)

        logger.info("Cleaned dataset shape: %s", df_clean.shape)
        logger.info(
            "Label distribution:\n%s",
            df_clean[self.SENTIMENT_COLUMN].value_counts(),
        )

        return df_clean

    def encode_labels(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add an integer ``label`` column derived from ``star_sentiment``.

        Fits ``self.label_encoder`` on the sentiment column and stores the
        resulting name-to-id mapping in ``self.label_map``.

        Args:
            df: DataFrame with a ``star_sentiment`` column.  It is not
                modified; a copy is returned.

        Returns:
            Copy of ``df`` with the additional ``label`` column.
        """
        logger.info("Encoding labels...")

        df = df.copy()
        df[self.LABEL_COLUMN] = self.label_encoder.fit_transform(
            df[self.SENTIMENT_COLUMN]
        )

        # LabelEncoder assigns each class its position in ``classes_``, so
        # enumerate() reproduces the encoding with plain Python ints (which
        # log and serialise more cleanly than NumPy scalars).
        self.label_map = {
            name: index
            for index, name in enumerate(self.label_encoder.classes_)
        }

        logger.info("Label encoding: %s", self.label_map)
        return df

    def split_data(self, df: pd.DataFrame, test_size: float = 0.2,
                   val_size: Optional[float] = None,
                   random_state: int = 42) -> Tuple[pd.Series, pd.Series, pd.Series,
                                                   np.ndarray, np.ndarray, np.ndarray]:
        """Split data into stratified train, validation and test sets.

        The data is first split into a training part and a hold-out part of
        size ``test_size``; the hold-out is then divided into validation and
        test sets.  Both splits are stratified on the ``label`` column.

        Args:
            df: DataFrame with ``cleaned_text`` and ``label`` columns.
            test_size: Size of the hold-out (validation + test together), as
                accepted by ``train_test_split``.  The default ``0.2`` keeps
                80% of the rows for training.
            val_size: Part of the hold-out that goes to the validation set,
                expressed in the same unit as ``test_size`` (e.g. ``0.1`` of
                the whole dataset when ``test_size=0.2``).  ``None`` (the
                default) divides the hold-out evenly between validation and
                test, giving 10% each with the default ``test_size``.
            random_state: Seed used for both splits.

        Returns:
            Tuple of (train_texts, val_texts, test_texts, train_labels,
            val_labels, test_labels).  Texts are ``pd.Series``; labels are
            NumPy arrays.

        Raises:
            ValueError: If ``val_size`` is given but is not strictly between
                0 and ``test_size``.
        """
        if val_size is None:
            # No explicit request: share the hold-out equally.
            test_share = 0.5
        else:
            if not 0 < val_size < test_size:
                raise ValueError(
                    "val_size must be greater than 0 and smaller than "
                    f"test_size ({test_size}); got {val_size}"
                )
            # Fraction of the hold-out that remains for the test set.
            test_share = (test_size - val_size) / test_size

        logger.info("Splitting data into train/val/test sets...")

        # First split: training set vs. hold-out (validation + test).
        train_texts, temp_texts, train_labels, temp_labels = train_test_split(
            df[self.TEXT_COLUMN], df[self.LABEL_COLUMN],
            test_size=test_size, stratify=df[self.LABEL_COLUMN],
            random_state=random_state,
        )

        # Second split: divide the hold-out into validation and test.
        val_texts, test_texts, val_labels, test_labels = train_test_split(
            temp_texts, temp_labels,
            test_size=test_share, stratify=temp_labels,
            random_state=random_state,
        )

        logger.info("Train size: %d", len(train_texts))
        logger.info("Validation size: %d", len(val_texts))
        logger.info("Test size: %d", len(test_texts))

        return (
            train_texts, val_texts, test_texts,
            train_labels.to_numpy(), val_labels.to_numpy(), test_labels.to_numpy(),
        )

    def get_label_names(self) -> List[str]:
        """Return the label names in encoding order.

        Returns:
            The fitted encoder's classes as a list, or the default
            ``["negative", "neutral", "positive"]`` when the encoder has not
            been fitted yet.
        """
        # An unfitted encoder has no ``classes_`` attribute at all, so a
        # plain attribute access would raise instead of reaching the default.
        classes = getattr(self.label_encoder, "classes_", None)
        if classes is None:
            return list(self.VALID_SENTIMENTS)
        return classes.tolist()

    def visualize_label_distribution(self, df: pd.DataFrame, title: str = "Label Distribution"):
        """Show a bar chart and a pie chart of the sentiment label counts.

        Args:
            df: DataFrame with a ``star_sentiment`` column.
            title: Prefix used for both subplot titles.
        """
        palette = ['#ff7f7f', '#7fbf7f', '#7f7fff']
        # Count once and reuse for both charts.
        counts = df[self.SENTIMENT_COLUMN].value_counts()

        fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(10, 6))

        # Left: absolute counts.
        counts.plot(kind='bar', ax=bar_ax, color=palette, rot=45)
        bar_ax.set_title(f'{title} - Counts')
        bar_ax.set_xlabel('Sentiment')
        bar_ax.set_ylabel('Count')

        # Right: relative proportions.
        counts.plot(kind='pie', ax=pie_ax, autopct='%1.1f%%', colors=palette)
        pie_ax.set_title(f'{title} - Proportions')
        pie_ax.set_ylabel('')

        fig.tight_layout()
        plt.show()

    def save_processed_data(self, train_texts, val_texts, test_texts,
                           train_labels, val_labels, test_labels, output_dir: str):
        """Pickle the data splits and the label encoder into ``output_dir``.

        One file is written per object: ``train_texts.pkl``, ``val_texts.pkl``,
        ``test_texts.pkl``, ``train_labels.pkl``, ``val_labels.pkl``,
        ``test_labels.pkl`` and ``label_encoder.pkl``.

        Args:
            train_texts, val_texts, test_texts: Text data splits.
            train_labels, val_labels, test_labels: Label data splits.
            output_dir: Target directory; created if it does not exist.
        """
        os.makedirs(output_dir, exist_ok=True)

        artifacts = {
            'train_texts.pkl': train_texts,
            'val_texts.pkl': val_texts,
            'test_texts.pkl': test_texts,
            'train_labels.pkl': train_labels,
            'val_labels.pkl': val_labels,
            'test_labels.pkl': test_labels,
            'label_encoder.pkl': self.label_encoder,
        }
        for file_name, payload in artifacts.items():
            with open(os.path.join(output_dir, file_name), 'wb') as f:
                pickle.dump(payload, f)

        logger.info("Processed data saved to %s", output_dir)

## 3. Helper Functions

In [None]:
def process_yelp_data(file_path: str) -> Tuple[pd.Series, pd.Series, pd.Series,
                                               np.ndarray, np.ndarray, np.ndarray,
                                               YelpDataProcessor]:
    """Run the load, clean, encode and split steps on one CSV file.

    Args:
        file_path: Path to the Yelp dataset CSV

    Returns:
        The six values produced by ``YelpDataProcessor.split_data`` (called
        with its default settings), followed by the processor instance that
        was used, so the fitted label encoder stays available to the caller.
    """
    processor = YelpDataProcessor()

    # Each stage consumes the previous stage's DataFrame.
    raw_frame = processor.load_data(file_path)
    encoded_frame = processor.encode_labels(processor.clean_data(raw_frame))

    splits = processor.split_data(encoded_frame)
    return (*splits, processor)

def load_processed_data(data_dir: str):
    """Read the pickled data splits and label encoder back from disk.

    The files are unpickled as-is, so only point this at a directory whose
    contents you trust.

    Args:
        data_dir: Directory containing the processed data files

    Returns:
        Tuple of (train_texts, val_texts, test_texts, train_labels,
        val_labels, test_labels, label_encoder)
    """
    # File names in the order the objects are returned.
    file_names = (
        'train_texts.pkl',
        'val_texts.pkl',
        'test_texts.pkl',
        'train_labels.pkl',
        'val_labels.pkl',
        'test_labels.pkl',
        'label_encoder.pkl',
    )

    loaded = []
    for file_name in file_names:
        with open(os.path.join(data_dir, file_name), 'rb') as handle:
            loaded.append(pickle.load(handle))

    return tuple(loaded)

def display_data_info(train_texts, val_texts, test_texts, train_labels, val_labels, test_labels):
    """Print the size of each split and its per-label counts and percentages.

    Args:
        train_texts, val_texts, test_texts: Text splits; only their lengths
            are used.
        train_labels, val_labels, test_labels: Label splits; the distinct
            values of each are counted.
    """
    print("=== Data Split Information ===")
    print(f"Training set size: {len(train_texts)}")
    print(f"Validation set size: {len(val_texts)}")
    print(f"Test set size: {len(test_texts)}")
    print(f"Total samples: {len(train_texts) + len(val_texts) + len(test_texts)}")

    print("\n=== Label Distribution ===")
    sections = (
        ("Training set:", train_labels),
        ("Validation set:", val_labels),
        ("Test set:", test_labels),
    )
    for heading, split_labels in sections:
        print(heading)
        values, frequencies = np.unique(split_labels, return_counts=True)
        for value, frequency in zip(values, frequencies):
            # Percentage of this split that carries the given label.
            share = frequency / len(split_labels) * 100
            print(f"  Label {value}: {frequency} ({share:.1f}%)")

# Example usage function
def run_preprocessing_example():
    """Print a short step-by-step guide to the preprocessing utilities.

    Nothing is loaded or processed; the function only writes text to stdout.
    """
    guide = (
        "=== Yelp Data Preprocessing Example ===",
        "1. Load the data using: processor.load_data('path/to/data.csv')",
        "2. Clean the data using: processor.clean_data(df)",
        "3. Encode labels using: processor.encode_labels(df)",
        "4. Split data using: processor.split_data(df)",
        "5. Visualize using: processor.visualize_label_distribution(df)",
        "6. Save processed data using: processor.save_processed_data(...)",
        "\nOr use the complete pipeline: process_yelp_data('path/to/data.csv')",
    )
    for line in guide:
        print(line)

## 4. Example Usage

This section shows how to use the preprocessing utilities. Uncomment and run the cells below to test the functions.

In [None]:
# Print the step-by-step usage guide (this only writes text; no data is read)
run_preprocessing_example()

# Example: run the complete pipeline on the Yelp CSV (uncomment to run).
# NOTE: the paths below are examples; adjust them to your own directory layout.
# data_path = "../data/yelp_restaurants_hotels_ver2.csv"
# train_texts, val_texts, test_texts, train_labels, val_labels, test_labels, processor = process_yelp_data(data_path)

# Display split sizes and per-label counts for the processed data
# display_data_info(train_texts, val_texts, test_texts, train_labels, val_labels, test_labels)

# Save the splits and the fitted label encoder as pickle files
# processor.save_processed_data(train_texts, val_texts, test_texts, 
#                              train_labels, val_labels, test_labels, "../outputs/processed_data")