# Text Classification with Multiple Word Embeddings



**Dataset**: Amazon Fine Food Reviews (1-5 star ratings)

---



# 1. Setup & Installation

In [1]:
# Install required packages (run once)
!pip install pandas numpy matplotlib seaborn wordcloud nltk scikit-learn tensorflow gensim beautifulsoup4 -q

print("✅ Packages installed successfully!")

✅ Packages installed successfully!


ERROR: Exception:
Traceback (most recent call last):
  File "c:\Users\LENOVO\Cloned repos\text_classification\text_classification_with_multiple_embeddings\.venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 438, in _error_catcher
    yield
  File "c:\Users\LENOVO\Cloned repos\text_classification\text_classification_with_multiple_embeddings\.venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 561, in read
    data = self._fp_read(amt) if not fp_closed else b""
           ~~~~~~~~~~~~~^^^^^
  File "c:\Users\LENOVO\Cloned repos\text_classification\text_classification_with_multiple_embeddings\.venv\Lib\site-packages\pip\_vendor\urllib3\response.py", line 527, in _fp_read
    return self._fp.read(amt) if amt is not None else self._fp.read()
           ~~~~~~~~~~~~~^^^^^
  File "c:\Users\LENOVO\Cloned repos\text_classification\text_classification_with_multiple_embeddings\.venv\Lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 100, in read
    data: bytes =

In [10]:
import nltk
import ssl

# Handle SSL certificate issues (if any)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download required NLTK data
print("Downloading NLTK data...")
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('omw-1.4', quiet=True)
print("NLTK data downloaded successfully!")

ModuleNotFoundError: No module named 'nltk'

In [7]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import warnings
warnings.filterwarnings('ignore')

# NLP libraries
import re
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    classification_report, confusion_matrix, accuracy_score,
    precision_recall_fscore_support
)

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Embeddings
from gensim.models import Word2Vec, FastText
from gensim.models import KeyedVectors

# Utilities
from collections import Counter
from typing import List, Tuple, Dict
import json
import os
from datetime import datetime

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

print("✅ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

ModuleNotFoundError: No module named 'pandas'

# 2. Configuration Settings

In [None]:
# Configuration parameters
CONFIG = {
    # Data settings
    'DATA_PATH': '../Reviews.csv',
    'SAMPLE_SIZE': 50000,  # Use None for full dataset, or integer for testing
    'TEST_SIZE': 0.2,
    'VAL_SIZE': 0.1,
    'RANDOM_STATE': 42,
    
    # Text preprocessing
    'MAX_SEQUENCE_LENGTH': 200,
    'MIN_WORD_FREQ': 2,
    'MAX_VOCAB_SIZE': 50000,
    
    # Model hyperparameters
    'NUM_CLASSES': 5,
    'BATCH_SIZE': 32,
    'EPOCHS': 30,  # Reduced for faster training in notebook
    'LEARNING_RATE': 0.001,
    
    # GRU architecture
    'GRU_UNITS': 128,
    'DROPOUT_RATE': 0.5,
    'RECURRENT_DROPOUT': 0.2,
    'USE_BIDIRECTIONAL': True,
    'NUM_GRU_LAYERS': 2,
    
    # Embedding dimensions
    'EMBEDDING_DIM': 100,
    'TFIDF_MAX_FEATURES': 5000,
    
    # Training
    'EARLY_STOPPING_PATIENCE': 5,
    'REDUCE_LR_PATIENCE': 3,
    'USE_CLASS_WEIGHTS': True,
    
    # Class names
    'CLASS_NAMES': ['1 Star', '2 Stars', '3 Stars', '4 Stars', '5 Stars']
}

print("✅ Configuration loaded:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")

# 3. Helper Functions

We'll define all helper functions here that will be used throughout the notebook.

In [None]:
# Text Preprocessing Class
class TextPreprocessor:
    """Comprehensive text preprocessing pipeline."""
    
    def __init__(self, remove_stopwords=True, lemmatize=True):
        self.remove_stopwords = remove_stopwords
        self.lemmatize = lemmatize
        self.stop_words = set(stopwords.words('english')) if remove_stopwords else set()
        self.lemmatizer = WordNetLemmatizer() if lemmatize else None
    
    def remove_html_tags(self, text):
        """Remove HTML tags using BeautifulSoup."""
        if pd.isna(text):
            return ""
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()
    
    def clean_text(self, text):
        """Clean and normalize text."""
        # Remove URLs
        text = re.sub(r'https?://\S+|www\.\S+', '', text)
        # Remove special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
        return text
    
    def preprocess_text(self, text, return_string=False):
        """Complete preprocessing pipeline."""
        # Remove HTML
        text = self.remove_html_tags(text)
        # Clean
        text = self.clean_text(text)
        # Lowercase
        text = text.lower()
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords
        if self.remove_stopwords:
            tokens = [t for t in tokens if t not in self.stop_words]
        # Lemmatize
        if self.lemmatize and self.lemmatizer:
            tokens = [self.lemmatizer.lemmatize(t) for t in tokens]
        # Filter short tokens
        tokens = [t for t in tokens if len(t) >= 2]
        
        return ' '.join(tokens) if return_string else tokens

print("✅ TextPreprocessor class defined")