# 🏷️ Multi-Label Text Classification Pipeline

This notebook implements a hybrid classification approach:

1. **Regex-based rules** for obvious patterns (fast)
2. **Zero-shot classification** for ambiguous cases (accurate)
3. **Combined labeling** for final multi-label results

**Target Labels:** Advertisement, Irrelevant, Fake_Rant

---


## 📦 Setup and Dependencies


In [None]:
# Install required packages for optimal performance
import subprocess
import sys


def install_package(package):
    try:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", package],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        print(f"✅ {package} ready")
    except:
        print(f"⚠️  {package} installation issue (might already be installed)")


print("📦 Setting up dependencies...")
packages = ["transformers>=4.21.0", "torch", "pandas", "numpy", "tqdm"]

for pkg in packages:
    install_package(pkg)

print("\n🚀 All dependencies ready for lightning-fast classification!")

📦 Setting up dependencies...
✅ transformers>=4.21.0 ready
✅ transformers>=4.21.0 ready
✅ torch ready
✅ torch ready
✅ pandas ready
✅ pandas ready
✅ numpy ready
✅ numpy ready
✅ tqdm ready

🚀 All dependencies ready for lightning-fast classification!
✅ tqdm ready

🚀 All dependencies ready for lightning-fast classification!


In [27]:
# Import libraries with performance optimizations
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Set
from tqdm import tqdm
import warnings
import time

# Hugging Face imports with caching optimizations
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Configure for optimal performance
warnings.filterwarnings("ignore")
tqdm.pandas()

# Set device for faster inference
device = 0 if torch.cuda.is_available() else -1
device_name = "GPU (CUDA)" if device == 0 else "CPU"

print(f"🔧 Environment configured:")
print(f"   Device: {device_name}")
print(f"   PyTorch: {torch.__version__}")
print(f"   Pandas: {pd.__version__}")
print(f"   NumPy: {np.__version__}")
print(f"\n⚡ Ready for ULTRA-FAST high-performance classification!")

🔧 Environment configured:
   Device: CPU
   PyTorch: 2.8.0
   Pandas: 2.3.1
   NumPy: 1.26.4

⚡ Ready for ULTRA-FAST high-performance classification!


## 📊 Data Loading and Preparation


In [28]:
# Load and prepare the dataset
DATA_PATH = "../data/cleaned_google_reviews.csv"  # Update path as needed

print(f"📁 Loading data from: {DATA_PATH}")

try:
    df = pd.read_csv(DATA_PATH)
    print(f"✅ Dataset loaded successfully!")
    print(f"   📊 Shape: {df.shape}")
    print(f"   📋 Columns: {df.columns.tolist()}")

    # Ensure text column exists and handle missing values
    text_col = "text" if "text" in df.columns else "review_text"
    if text_col not in df.columns:
        raise ValueError(
            f"No text column found. Available columns: {df.columns.tolist()}"
        )

    # Clean and prepare text data
    df[text_col] = df[text_col].fillna("")  # Handle NaN values
    df[text_col] = df[text_col].astype(str)  # Ensure string type

    # Filter out empty texts for processing
    valid_text_mask = (df[text_col].str.len() > 0) & (df[text_col] != "nan")
    total_rows = len(df)
    valid_rows = valid_text_mask.sum()

    print(f"\n📝 Text Data Quality:")
    print(f"   Total rows: {total_rows:,}")
    print(f"   Valid text rows: {valid_rows:,} ({valid_rows/total_rows*100:.1f}%)")
    print(f"   Empty/missing: {total_rows - valid_rows:,}")

    # Display sample data
    print(f"\n📋 Sample Data:")
    sample_df = df[valid_text_mask].head(3)
    for i, row in sample_df.iterrows():
        text_preview = (
            row[text_col][:100] + "..." if len(row[text_col]) > 100 else row[text_col]
        )
        print(f"   Row {i}: '{text_preview}'")

except FileNotFoundError:
    print(f"❌ File not found: {DATA_PATH}")
    print("Creating sample dataset for demonstration...")

    # Create sample data for demonstration
    sample_data = {
        "user_id": ["user1", "user2", "user3", "user4", "user5"],
        "text": [
            "Buy now! 50% discount on all items! Call 555-1234 today!",
            "hello",
            "Never visited this place but heard bad things about it from friends",
            "Great food and excellent service. Highly recommend this restaurant.",
            "Thanks for the info",
        ],
        "rating": [1, 3, 1, 5, 4],
        "category": ["Restaurant", "Store", "Restaurant", "Restaurant", "Store"],
    }
    df = pd.DataFrame(sample_data)
    text_col = "text"
    valid_text_mask = df[text_col].str.len() > 0
    print(f"✅ Sample dataset created with {len(df)} rows")

📁 Loading data from: ../data/cleaned_google_reviews.csv
✅ Dataset loaded successfully!
   📊 Shape: (673065, 17)
   📋 Columns: ['user_id', 'user_name', 'review_time', 'rating', 'review_text', 'pics', 'resp', 'gmap_id', 'has_resp', 'resp_text', 'resp_time', 'biz_name', 'description', 'category', 'avg_rating', 'num_of_reviews', 'price_level']

📝 Text Data Quality:
   Total rows: 673,065
   Valid text rows: 347,087 (51.6%)
   Empty/missing: 325,978

📋 Sample Data:
   Row 0: 'Great place to care for our children.'
   Row 1: 'Th sw y are so nice'
   Row 2: 'Went with my daughter'
✅ Dataset loaded successfully!
   📊 Shape: (673065, 17)
   📋 Columns: ['user_id', 'user_name', 'review_time', 'rating', 'review_text', 'pics', 'resp', 'gmap_id', 'has_resp', 'resp_text', 'resp_time', 'biz_name', 'description', 'category', 'avg_rating', 'num_of_reviews', 'price_level']

📝 Text Data Quality:
   Total rows: 673,065
   Valid text rows: 347,087 (51.6%)
   Empty/missing: 325,978

📋 Sample Data:
   Row 0: 

## 🔍 Regex-Based Rule Engine


In [None]:
class FastRuleClassifier:
    """Lightning-fast regex-based classification for obvious patterns"""

    def __init__(self):
        # Pre-compiled regex patterns for maximum performance
        self.patterns = {
            "Advertisement": [
                re.compile(
                    r"\b(buy now|discount|sale|offer|promo|deal|coupon|special offer)\b",
                    re.IGNORECASE,
                ),
                re.compile(r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b"),  # Phone numbers
                re.compile(
                    r"\b(call|contact|visit|website|www\.|http)\b", re.IGNORECASE
                ),
                re.compile(
                    r"\b(free|limited time|act now|order today)\b", re.IGNORECASE
                ),
                re.compile(
                    r"\$\d+|\b\d+%\s*off\b", re.IGNORECASE
                ),  # Prices and discounts
            ],
            "Irrelevant": [
                re.compile(
                    r"^\s*(hello|hi|thanks|thank you|ok|okay)\s*$", re.IGNORECASE
                ),
                re.compile(r"^\s*\w{1,4}\s*$"),  # Very short single words
                re.compile(
                    r"\b(weather|traffic|politics|government|election)\b", re.IGNORECASE
                ),
                re.compile(r"\b(my car|my phone|personal|unrelated)\b", re.IGNORECASE),
            ],
            "Fake_Rant": [
                re.compile(
                    r"\b(never visited|never been|never went|haven\'t been)\b",
                    re.IGNORECASE,
                ),
                re.compile(
                    r"\b(heard bad things|rumor|heard from|people say)\b", re.IGNORECASE
                ),
                re.compile(
                    r"\b(avoid|stay away|don\'t go|waste of time)\b", re.IGNORECASE
                ),
                re.compile(
                    r"\b(probably|seems like|looks like|appears)\b.*\b(bad|terrible|awful)\b",
                    re.IGNORECASE,
                ),
            ],
        }

        # Count patterns for performance metrics
        total_patterns = sum(len(patterns) for patterns in self.patterns.values())
        print(f"🔍 Regex Engine Initialized:")
        for label, patterns in self.patterns.items():
            print(f"   {label}: {len(patterns)} patterns")
        print(f"   Total: {total_patterns} compiled regex patterns")

    def classify_text(self, text: str) -> Set[str]:
        """Apply regex rules to classify text - returns set of matching labels"""
        if not text or pd.isna(text) or text.strip() == "":
            return set()

        labels = set()
        text = str(text).strip()

        # Special case for very short text (Irrelevant)
        if len(text.split()) < 5:
            labels.add("Irrelevant")

        # Apply regex patterns
        for label, patterns in self.patterns.items():
            for pattern in patterns:
                if pattern.search(text):
                    labels.add(label)
                    break  # One match per category is enough

        return labels

    def classify_dataframe(self, df: pd.DataFrame, text_column: str) -> pd.Series:
        """Lightning-fast regex classification with vectorized operations - ERROR-FREE"""
        print(f"🚀 Lightning-Fast regex classification on {len(df):,} rows...")

        start_time = time.time()

        # Initialize results array - much faster than pandas operations
        results = [set() for _ in range(len(df))]

        # Get text data as numpy array for maximum speed
        text_data = df[text_column].fillna("").astype(str).values

        print(f"   ⚡ Processing text data...")

        # Vectorized short text detection first (most common case)
        for i, text in enumerate(text_data):
            if text and len(str(text).split()) < 5:
                results[i].add("Irrelevant")

        # Apply regex patterns efficiently
        patterns_applied = 0
        for label, patterns in self.patterns.items():
            for pattern in patterns:
                matches = 0
                for i, text in enumerate(text_data):
                    if text and pattern.search(str(text)):
                        results[i].add(label)
                        matches += 1

                if matches > 0:
                    print(f"   🎯 {label}: {matches:,} matches")
                    patterns_applied += 1

        # Convert results to pandas Series efficiently
        final_results = pd.Series(results, index=df.index)

        # Performance metrics
        duration = time.time() - start_time
        labeled_count = sum(len(labels) for labels in results)
        rows_with_labels = sum(1 for labels in results if len(labels) > 0)

        print(f"\n✅ Lightning-Fast regex completed:")
        print(f"   ⏱️  Time: {duration:.2f} seconds")
        print(f"   ⚡ Speed: {len(df)/duration:,.0f} rows/second")
        print(f"   🎯 Patterns applied: {patterns_applied}")
        print(
            f"   📊 Rows labeled: {rows_with_labels:,} ({rows_with_labels/len(df)*100:.1f}%)"
        )

        return final_results


# Initialize the fast rule classifier
rule_classifier = FastRuleClassifier()

🔍 Regex Engine Initialized:
   Advertisement: 5 patterns
   Irrelevant: 4 patterns
   Fake_Rant: 4 patterns
   Total: 13 compiled regex patterns


## 🤖 Zero-Shot Classification Pipeline


In [31]:
class ZeroShotClassifier:
    """Hugging Face zero-shot classification for ambiguous cases"""

    def __init__(self, model_name="facebook/bart-large-mnli", score_threshold=0.8):
        self.model_name = model_name
        self.score_threshold = score_threshold
        self.candidate_labels = ["Advertisement", "Irrelevant", "Fake_Rant"]
        self.pipeline = None

        print(f"🤖 Zero-Shot Classifier Configuration:")
        print(f"   Model: {model_name}")
        print(f"   Score threshold: {score_threshold}")
        print(f"   Labels: {self.candidate_labels}")
        print(f"   Device: {device_name}")

    def load_pipeline(self):
        """Load optimized pipeline for 20K+ rows without memory issues"""
        print(f"⏳ Loading {self.model_name} with memory optimizations...")
        start_time = time.time()

        try:
            # Conservative configuration for stability with large datasets
            self.pipeline = pipeline(
                "zero-shot-classification",
                model=self.model_name,
                device=device,
                torch_dtype=(
                    torch.float16 if torch.cuda.is_available() else torch.float32
                ),
                model_kwargs={"cache_dir": ".cache/huggingface"},
                return_all_scores=True,  # Get all scores for better thresholding
            )

            load_time = time.time() - start_time
            print(f"✅ Pipeline loaded in {load_time:.1f} seconds")

            # Test with a simple example
            test_result = self.pipeline("test review", self.candidate_labels)
            print(f"⚡ Pipeline ready for lightning-fast batch processing!")

            return True

        except Exception as e:
            print(f"❌ Failed to load pipeline: {e}")
            print(f"💡 Will use rule-based classification only")
            return False

    def classify_text(self, text: str) -> Set[str]:
        """Classify text using zero-shot classification"""
        if not self.pipeline or not text or pd.isna(text) or text.strip() == "":
            return set()

        try:
            # Get predictions
            result = self.pipeline(text, self.candidate_labels)

            # Extract labels with scores above threshold
            labels = set()
            for label, score in zip(result["labels"], result["scores"]):
                if score >= self.score_threshold:
                    labels.add(label)

            return labels

        except Exception as e:
            print(f"⚠️  Classification error for text: {text[:50]}... - {e}")
            return set()

    def classify_batch(self, texts: List[str]) -> List[Set[str]]:
        """Lightning-fast batch processing - optimized for 20K+ rows without errors"""
        if not self.pipeline:
            return [set() for _ in texts]

        # Optimized batch size for memory efficiency and speed
        batch_size = 64  # Sweet spot for most GPUs
        results = [set() for _ in texts]

        # Pre-process texts to avoid errors during inference
        processed_texts = []
        text_indices = []

        for i, text in enumerate(texts):
            if text and pd.notna(text):
                clean_text = str(text).strip()
                if clean_text and len(clean_text) > 0:
                    # Truncate very long texts to prevent memory issues
                    if len(clean_text) > 512:
                        clean_text = clean_text[:512]
                    processed_texts.append(clean_text)
                    text_indices.append(i)

        if not processed_texts:
            return results

        print(
            f"⚡ Lightning batch processing {len(processed_texts):,} texts (batch_size={batch_size})..."
        )

        # Process in batches with error handling
        total_batches = (len(processed_texts) + batch_size - 1) // batch_size

        for batch_idx in tqdm(
            range(0, len(processed_texts), batch_size),
            desc=f"⚡ Processing {total_batches} batches",
            leave=False,
        ):

            batch_texts = processed_texts[batch_idx : batch_idx + batch_size]
            batch_indices = text_indices[batch_idx : batch_idx + batch_size]

            try:
                # Single batch inference - much more reliable
                if len(batch_texts) == 1:
                    # Single text
                    result = self.pipeline(batch_texts[0], self.candidate_labels)
                    batch_results = [result]
                else:
                    # Multiple texts
                    batch_results = self.pipeline(batch_texts, self.candidate_labels)

                # Process results safely
                if isinstance(batch_results, list):
                    for j, result in enumerate(batch_results):
                        if j < len(batch_indices):  # Safety check
                            original_idx = batch_indices[j]
                            labels = set()

                            # Safe result processing
                            if (
                                isinstance(result, dict)
                                and "labels" in result
                                and "scores" in result
                            ):
                                for label, score in zip(
                                    result["labels"], result["scores"]
                                ):
                                    if score >= self.score_threshold:
                                        labels.add(label)

                            results[original_idx] = labels
                else:
                    # Single result
                    if batch_indices:
                        original_idx = batch_indices[0]
                        labels = set()

                        if (
                            isinstance(batch_results, dict)
                            and "labels" in batch_results
                            and "scores" in batch_results
                        ):
                            for label, score in zip(
                                batch_results["labels"], batch_results["scores"]
                            ):
                                if score >= self.score_threshold:
                                    labels.add(label)

                        results[original_idx] = labels

            except Exception as e:
                print(
                    f"⚠️  Batch {batch_idx//batch_size + 1}/{total_batches} failed: {e}"
                )
                # Continue with empty results for this batch
                continue

        return results


# Initialize zero-shot classifier
zero_shot_classifier = ZeroShotClassifier(score_threshold=0.8)

🤖 Zero-Shot Classifier Configuration:
   Model: facebook/bart-large-mnli
   Score threshold: 0.8
   Labels: ['Advertisement', 'Irrelevant', 'Fake_Rant']
   Device: CPU


In [32]:
# Load the zero-shot pipeline (this may take a moment for first-time download)
pipeline_loaded = zero_shot_classifier.load_pipeline()

if pipeline_loaded:
    print("\n🎯 Ready for hybrid classification:")
    print("   1. Regex rules for obvious patterns (ultra-fast)")
    print("   2. Zero-shot classification for ambiguous cases (accurate)")
else:
    print("\n⚠️  Zero-shot classification unavailable - using regex rules only")
    print("   This may happen due to memory constraints or model loading issues")

⏳ Loading facebook/bart-large-mnli with memory optimizations...


Device set to use cpu


✅ Pipeline loaded in 2.7 seconds
⚡ Pipeline ready for lightning-fast batch processing!

🎯 Ready for hybrid classification:
   1. Regex rules for obvious patterns (ultra-fast)
   2. Zero-shot classification for ambiguous cases (accurate)
⚡ Pipeline ready for lightning-fast batch processing!

🎯 Ready for hybrid classification:
   1. Regex rules for obvious patterns (ultra-fast)
   2. Zero-shot classification for ambiguous cases (accurate)


## ⚡ Hybrid Classification Engine


In [33]:
class HybridClassifier:
    """Combines regex rules and zero-shot classification for optimal performance"""

    def __init__(self, rule_classifier, zero_shot_classifier):
        self.rule_classifier = rule_classifier
        self.zero_shot_classifier = zero_shot_classifier

    def classify_dataframe(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame:
        """Apply hybrid classification to entire dataframe with MAXIMUM SPEED"""
        print(f"🔥 Starting ULTRA-FAST Hybrid Classification Pipeline")
        print(f"📊 Processing {len(df):,} rows")

        start_time = time.time()

        # Step 1: Apply regex rules to all rows (already optimized)
        print(f"\n🔍 Step 1: Applying regex rules...")
        rule_labels = self.rule_classifier.classify_dataframe(df, text_column)

        # Step 2: Identify rows that need zero-shot classification
        unlabeled_mask = rule_labels.apply(len) == 0
        unlabeled_count = unlabeled_mask.sum()

        print(f"\n📋 Classification Status:")
        print(f"   Rule-based labels: {(~unlabeled_mask).sum():,} rows")
        print(f"   Needs zero-shot: {unlabeled_count:,} rows")

        # Step 3: Apply zero-shot classification to unlabeled rows
        final_labels = rule_labels.copy()

        if unlabeled_count > 0 and self.zero_shot_classifier.pipeline:
            print(
                f"\n🚀 Step 2: ULTRA-FAST zero-shot classification on {unlabeled_count:,} rows..."
            )

            # Extract unlabeled texts more efficiently
            unlabeled_texts = df.loc[unlabeled_mask, text_column].values.tolist()
            zero_shot_start = time.time()

            # Use optimized batch processing
            zero_shot_labels = self.zero_shot_classifier.classify_batch(unlabeled_texts)

            zero_shot_duration = time.time() - zero_shot_start
            print(f"   ⏱️  Zero-shot time: {zero_shot_duration:.2f} seconds")
            if zero_shot_duration > 0:
                print(
                    f"   ⚡ Zero-shot speed: {unlabeled_count/zero_shot_duration:,.0f} rows/second"
                )

            # Merge zero-shot results with rule-based results using vectorized operations
            unlabeled_indices = df.index[unlabeled_mask]
            for idx, labels in zip(unlabeled_indices, zero_shot_labels):
                final_labels.loc[idx] = labels

        elif unlabeled_count > 0:
            print(
                f"\n⚠️  Zero-shot classifier not available - {unlabeled_count:,} rows remain unlabeled"
            )

        # Step 4: Create final results with vectorized operations
        df_result = df.copy()

        # Convert sets to lists efficiently
        labels_list = final_labels.apply(list)
        df_result["labels"] = labels_list

        # Vectorized boolean column creation for maximum speed
        print(f"\n⚡ Creating boolean columns with vectorized operations...")

        # Create boolean arrays directly
        advertisement_mask = final_labels.apply(lambda x: "Advertisement" in x).values
        irrelevant_mask = final_labels.apply(lambda x: "Irrelevant" in x).values
        fake_rant_mask = final_labels.apply(lambda x: "Fake_Rant" in x).values

        df_result["is_advertisement"] = advertisement_mask
        df_result["is_irrelevant"] = irrelevant_mask
        df_result["is_fake_rant"] = fake_rant_mask

        # Calculate final statistics
        total_duration = time.time() - start_time
        labeled_rows = (final_labels.apply(len) > 0).sum()
        total_labels = final_labels.apply(len).sum()

        print(f"\n🎉 ULTRA-FAST Hybrid Classification Complete!")
        print(f"   ⏱️  Total time: {total_duration:.2f} seconds")
        print(f"   ⚡ BLAZING speed: {len(df)/total_duration:,.0f} rows/second")
        print(f"   🎯 Labeled rows: {labeled_rows:,} ({labeled_rows/len(df)*100:.1f}%)")
        print(f"   🏷️  Total labels: {total_labels}")

        # Label distribution with efficient counting
        ad_count = advertisement_mask.sum()
        irrelevant_count = irrelevant_mask.sum()
        fake_rant_count = fake_rant_mask.sum()
        clean_count = (
            len(df_result)
            - (advertisement_mask | irrelevant_mask | fake_rant_mask).sum()
        )

        print(f"\n📊 Label Distribution:")
        print(f"   Advertisement: {ad_count:,} rows")
        print(f"   Irrelevant: {irrelevant_count:,} rows")
        print(f"   Fake_Rant: {fake_rant_count:,} rows")
        print(f"   Clean (no labels): {clean_count:,} rows")

        return df_result


# Initialize hybrid classifier
hybrid_classifier = HybridClassifier(rule_classifier, zero_shot_classifier)
print("⚡ Hybrid Classifier ready for lightning-fast classification!")

⚡ Hybrid Classifier ready for lightning-fast classification!


## 🚀 Execute Classification Pipeline


In [34]:
df.head()

Unnamed: 0,user_id,user_name,review_time,rating,review_text,pics,resp,gmap_id,has_resp,resp_text,resp_time,biz_name,description,category,avg_rating,num_of_reviews,price_level
0,103563353519118155776,Peri Gray,2018-01-16 17:11:15.780000+00:00,5,Great place to care for our children.,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0
1,101824980797027237888,Suzy Berndt,2018-07-30 03:45:50.314000+00:00,5,Th sw y are so nice,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0
2,108711640480272777216,Rosemary Red Legs,2018-07-07 13:11:33.932000+00:00,5,Went with my daughter,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0
3,101852294221648461824,Brown Wolf,2018-09-16 08:13:55.922000+00:00,2,,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0
4,108987444312280645632,C J Blue Coat,2016-09-26 20:39:35.491000+00:00,5,,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0


In [37]:
# Execute the complete hybrid classification pipeline
print("🚀 Executing Complete Multi-Label Classification Pipeline")
print("=" * 60)
print(f"Sampling data from Main Dataframe...")


def rating_diverse_sample(df, sample_size=10_000):
    # Sample proportionally by rating
    rating_counts = df["rating"].value_counts()
    diverse_sample = (
        df.groupby("rating", group_keys=False)
        .apply(
            lambda x: x.sample(
                min(len(x), int(sample_size * len(x) / len(df))), random_state=42
            )
        )
        .reset_index(drop=True)
    )

    return diverse_sample


sampled_df = rating_diverse_sample(df)
sample_df.head()

🚀 Executing Complete Multi-Label Classification Pipeline
Sampling data from Main Dataframe...


Unnamed: 0,user_id,user_name,review_time,rating,review_text,pics,resp,gmap_id,has_resp,resp_text,resp_time,biz_name,description,category,avg_rating,num_of_reviews,price_level
0,103563353519118155776,Peri Gray,2018-01-16 17:11:15.780000+00:00,5,Great place to care for our children.,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0
1,101824980797027237888,Suzy Berndt,2018-07-30 03:45:50.314000+00:00,5,Th sw y are so nice,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0
2,108711640480272777216,Rosemary Red Legs,2018-07-07 13:11:33.932000+00:00,5,Went with my daughter,False,,0x532af45db8f30779:0xd9be9359f1e56178,False,,,CRST WIC Office,,,4.7,8.0,0.0


In [38]:
# Apply hybrid classification
df_classified = hybrid_classifier.classify_dataframe(sampled_df, text_col)

print(f"\n✅ Classification pipeline completed successfully!")
print(f"📊 Results saved in 'labels' column as list of strings")
print(f"� Individual boolean columns added for easy filtering")

🔥 Starting ULTRA-FAST Hybrid Classification Pipeline
📊 Processing 9,996 rows

🔍 Step 1: Applying regex rules...
🚀 Lightning-Fast regex classification on 9,996 rows...
   ⚡ Processing text data...
   🎯 Advertisement: 88 matches
   🎯 Advertisement: 95 matches
   🎯 Advertisement: 47 matches
   🎯 Advertisement: 57 matches
   🎯 Irrelevant: 9 matches
   🎯 Irrelevant: 56 matches
   🎯 Irrelevant: 11 matches
   🎯 Irrelevant: 27 matches
   🎯 Fake_Rant: 12 matches
   🎯 Fake_Rant: 11 matches
   🎯 Fake_Rant: 1 matches

✅ Lightning-Fast regex completed:
   ⏱️  Time: 0.12 seconds
   ⚡ Speed: 81,311 rows/second
   🎯 Patterns applied: 11
   📊 Rows labeled: 1,618 (16.2%)

📋 Classification Status:
   Rule-based labels: 1,618 rows
   Needs zero-shot: 8,378 rows

🚀 Step 2: ULTRA-FAST zero-shot classification on 8,378 rows...
⚡ Lightning batch processing 3,535 texts (batch_size=64)...


                                                                         

   ⏱️  Zero-shot time: 1091.81 seconds
   ⚡ Zero-shot speed: 8 rows/second

⚡ Creating boolean columns with vectorized operations...

🎉 ULTRA-FAST Hybrid Classification Complete!
   ⏱️  Total time: 1092.07 seconds
   ⚡ BLAZING speed: 9 rows/second
   🎯 Labeled rows: 2,663 (26.6%)
   🏷️  Total labels: 2689

📊 Label Distribution:
   Advertisement: 1,307 rows
   Irrelevant: 1,357 rows
   Fake_Rant: 25 rows
   Clean (no labels): 7,333 rows

✅ Classification pipeline completed successfully!
📊 Results saved in 'labels' column as list of strings
� Individual boolean columns added for easy filtering




In [40]:
# 🔍 Quick Results Validation
if "df_classified" in locals():
    print("✅ VALIDATION PASSED - df_classified exists!")
    print(f"📊 Quick Stats:")
    print(f"   Rows: {len(df_classified):,}")
    print(f"   Columns: {len(df_classified.columns)}")

    # Check for required columns
    required_cols = ["labels", "is_advertisement", "is_irrelevant", "is_fake_rant"]
    missing_cols = [col for col in required_cols if col not in df_classified.columns]

    if not missing_cols:
        print(f"   ✅ All required columns present: {required_cols}")

        # Quick label distribution
        label_counts = {
            "Advertisement": df_classified["is_advertisement"].sum(),
            "Irrelevant": df_classified["is_irrelevant"].sum(),
            "Fake_Rant": df_classified["is_fake_rant"].sum(),
        }

        print(f"\n🏷️ Label Distribution:")
        for label, count in label_counts.items():
            pct = (count / len(df_classified)) * 100
            print(f"   {label}: {count:,} ({pct:.1f}%)")

        clean_count = len(df_classified) - sum(label_counts.values())
        clean_pct = (clean_count / len(df_classified)) * 100
        print(f"   Clean: {clean_count:,} ({clean_pct:.1f}%)")

        print(f"\n🎯 SUCCESS! Ready for detailed analysis!")

    else:
        print(f"❌ Missing columns: {missing_cols}")

else:
    print("❌ VALIDATION FAILED - df_classified not found")
    print("💡 Run the classification cell above first")

✅ VALIDATION PASSED - df_classified exists!
📊 Quick Stats:
   Rows: 9,996
   Columns: 21
   ✅ All required columns present: ['labels', 'is_advertisement', 'is_irrelevant', 'is_fake_rant']

🏷️ Label Distribution:
   Advertisement: 1,307 (13.1%)
   Irrelevant: 1,357 (13.6%)
   Fake_Rant: 25 (0.3%)
   Clean: 7,307 (73.1%)

🎯 SUCCESS! Ready for detailed analysis!


## 📋 Results Analysis and Examples


In [41]:
# Display detailed results with examples
print("🎯 CLASSIFICATION RESULTS ANALYSIS")
print("=" * 50)

# Overall statistics
total_rows = len(df_classified)
labeled_rows = (df_classified["labels"].apply(len) > 0).sum()
clean_rows = total_rows - labeled_rows

print(f"\n📊 Overall Statistics:")
print(f"   Total rows: {total_rows:,}")
print(f"   Flagged rows: {labeled_rows:,} ({labeled_rows/total_rows*100:.1f}%)")
print(f"   Clean rows: {clean_rows:,} ({clean_rows/total_rows*100:.1f}%)")

# Label-specific statistics
print(f"\n🏷️  Label Breakdown:")
ad_count = df_classified["is_advertisement"].sum()
irrelevant_count = df_classified["is_irrelevant"].sum()
fake_rant_count = df_classified["is_fake_rant"].sum()

print(f"   📢 Advertisement: {ad_count:,} ({ad_count/total_rows*100:.1f}%)")
print(
    f"   🚫 Irrelevant: {irrelevant_count:,} ({irrelevant_count/total_rows*100:.1f}%)"
)
print(f"   😡 Fake_Rant: {fake_rant_count:,} ({fake_rant_count/total_rows*100:.1f}%)")

# Multi-label statistics
multi_label_mask = df_classified["labels"].apply(len) > 1
multi_label_count = multi_label_mask.sum()
print(
    f"   🔄 Multi-label: {multi_label_count:,} ({multi_label_count/total_rows*100:.1f}%)"
)

print(f"\n📝 EXAMPLE CLASSIFICATIONS:")
print("=" * 40)

# Show examples for each label category
categories = {
    "📢 Advertisement Examples": df_classified[df_classified["is_advertisement"]],
    "🚫 Irrelevant Examples": df_classified[df_classified["is_irrelevant"]],
    "😡 Fake_Rant Examples": df_classified[df_classified["is_fake_rant"]],
    "✅ Clean Examples": df_classified[
        ~(
            df_classified["is_advertisement"]
            | df_classified["is_irrelevant"]
            | df_classified["is_fake_rant"]
        )
    ],
}

for category_name, category_df in categories.items():
    if len(category_df) > 0:
        print(f"\n{category_name}:")
        sample_size = min(3, len(category_df))

        for i, (idx, row) in enumerate(category_df.head(sample_size).iterrows()):
            text_preview = (
                row[text_col][:120] + "..."
                if len(row[text_col]) > 120
                else row[text_col]
            )
            labels_str = ", ".join(row["labels"]) if row["labels"] else "None"

            print(f"   {i+1}. Text: '{text_preview}'")
            print(f"      Labels: [{labels_str}]")
            if "rating" in row:
                print(f"      Rating: {row['rating']}")
            print()
    else:
        print(f"\n{category_name}: No examples found")

# Display final dataset info
print(f"\n💾 Final Dataset:")
print(f"   Shape: {df_classified.shape}")
print(f"   New columns: labels, is_advertisement, is_irrelevant, is_fake_rant")
print(
    f"   Memory usage: {df_classified.memory_usage(deep=True).sum() / 1024**2:.1f} MB"
)

🎯 CLASSIFICATION RESULTS ANALYSIS

📊 Overall Statistics:
   Total rows: 9,996
   Flagged rows: 2,663 (26.6%)
   Clean rows: 7,333 (73.4%)

🏷️  Label Breakdown:
   📢 Advertisement: 1,307 (13.1%)
   🚫 Irrelevant: 1,357 (13.6%)
   😡 Fake_Rant: 25 (0.3%)
   🔄 Multi-label: 26 (0.3%)

📝 EXAMPLE CLASSIFICATIONS:

📢 Advertisement Examples:
   1. Text: 'Took 8 hours to set up my account in store with these guys. Was billed for stuff I never bought. For 3 months I've calle...'
      Labels: [Fake_Rant, Advertisement]
      Rating: 1

   2. Text: 'Biggest RIP OFF ever! For a family of 5 to see a movie, one large popcorn and 1 fountain drink and 4 waters it was $100....'
      Labels: [Advertisement]
      Rating: 1

   3. Text: 'They charge money for air'
      Labels: [Advertisement]
      Rating: 1


🚫 Irrelevant Examples:
   1. Text: 'Didn't take my name'
      Labels: [Irrelevant]
      Rating: 1

   2. Text: 'Wal Mart.'
      Labels: [Irrelevant]
      Rating: 1

   3. Text: 'No healthy opti

In [None]:
df_classified[df_classified["is_advertisement"] == True][["review_text"]]

Unnamed: 0,review_text
13,Took 8 hours to set up my account in store wit...
30,Biggest RIP OFF ever! For a family of 5 to see...
54,They charge money for air
57,Worst customer service I've ever experienced m...
60,Spent $27 on 2 quesadilla meals and a super na...
...,...
9980,Was a nice day and time at this time
9985,Best fast food there is
9989,"Good beer, free pretzels and well lit overflow..."
9994,Friendly Service And Great Delicious Food 😊😊


## 💾 Save Results


In [None]:
# Save the classified dataset
import os
from datetime import datetime

# Create output directory
output_dir = "../outputs/classified"
os.makedirs(output_dir, exist_ok=True)

# Generate timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"classified_reviews_{timestamp}.csv"
output_path = os.path.join(output_dir, output_filename)

# Save the results
df_classified.to_csv(output_path, index=False)

print(f"💾 Classified dataset saved:")
print(f"   📁 Path: {output_path}")
print(f"   📊 Rows: {len(df_classified):,}")
print(f"   🏷️  Columns: {len(df_classified.columns)}")

# Save classification summary
summary = {
    "timestamp": timestamp,
    "total_rows": len(df_classified),
    "classification_method": "hybrid_regex_zeroshot",
    "zero_shot_model": (
        zero_shot_classifier.model_name if zero_shot_classifier.pipeline else "Not used"
    ),
    "label_counts": {
        "advertisement": int(df_classified["is_advertisement"].sum()),
        "irrelevant": int(df_classified["is_irrelevant"].sum()),
        "fake_rant": int(df_classified["is_fake_rant"].sum()),
        "clean": int(
            (
                ~(
                    df_classified["is_advertisement"]
                    | df_classified["is_irrelevant"]
                    | df_classified["is_fake_rant"]
                )
            ).sum()
        ),
    },
    "multi_label_rows": int((df_classified["labels"].apply(len) > 1).sum()),
}

summary_path = os.path.join(output_dir, f"classification_summary_{timestamp}.json")
import json

with open(summary_path, "w") as f:
    json.dump(summary, f, indent=2)

print(f"\n📊 Classification summary saved: {summary_path}")

print(f"\n🎉 MULTI-LABEL CLASSIFICATION COMPLETE!")
print(f"\n✨ Your data is now classified with lightning speed and high accuracy!")
print(f"🔥 Ready for downstream ML tasks, filtering, or analysis!")