# 🏷️ Multi-Label Text Classification Pipeline

This notebook implements a hybrid classification approach:
1. **Regex-based rules** for obvious patterns (fast)
2. **Zero-shot classification** for ambiguous cases (accurate)
3. **Combined labeling** for final multi-label results

**Target Labels:** Advertisement, Irrelevant, Fake_Rant

---

## 📦 Setup and Dependencies

In [1]:
# Install required packages for optimal performance
import subprocess
import sys

def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package], 
                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        print(f"✅ {package} ready")
    except:
        print(f"⚠️  {package} installation issue (might already be installed)")

print("📦 Setting up dependencies...")
packages = [
    "transformers>=4.21.0",
    "torch", 
    "pandas",
    "numpy", 
    "tqdm"
]

for pkg in packages:
    install_package(pkg)

print("\n🚀 All dependencies ready for lightning-fast classification!")

📦 Setting up dependencies...
✅ transformers>=4.21.0 ready
✅ torch ready
✅ pandas ready
✅ numpy ready
✅ tqdm ready

🚀 All dependencies ready for lightning-fast classification!


In [2]:
# Import libraries with performance optimizations
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Set
from tqdm import tqdm
import warnings
import time

# Hugging Face imports with caching optimizations
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch

# Configure for optimal performance
warnings.filterwarnings('ignore')
tqdm.pandas()

# Set device for faster inference
device = 0 if torch.cuda.is_available() else -1
device_name = "GPU (CUDA)" if device == 0 else "CPU"

print(f"🔧 Environment configured:")
print(f"   Device: {device_name}")
print(f"   PyTorch: {torch.__version__}")
print(f"   Pandas: {pd.__version__}")
print(f"\n⚡ Ready for high-performance classification!")

  from .autonotebook import tqdm as notebook_tqdm


🔧 Environment configured:
   Device: CPU
   PyTorch: 2.8.0
   Pandas: 2.3.1

⚡ Ready for high-performance classification!


## 📊 Data Loading and Preparation

In [3]:
# Load and prepare the dataset
DATA_PATH = "../data/cleaned_google_reviews.csv"  # Update path as needed

print(f"📁 Loading data from: {DATA_PATH}")

try:
    df = pd.read_csv(DATA_PATH)
    print(f"✅ Dataset loaded successfully!")
    print(f"   📊 Shape: {df.shape}")
    print(f"   📋 Columns: {df.columns.tolist()}")
    
    # Ensure text column exists and handle missing values
    text_col = 'text' if 'text' in df.columns else 'review_text'
    if text_col not in df.columns:
        raise ValueError(f"No text column found. Available columns: {df.columns.tolist()}")
    
    # Clean and prepare text data
    df[text_col] = df[text_col].fillna('')  # Handle NaN values
    df[text_col] = df[text_col].astype(str)  # Ensure string type
    
    # Filter out empty texts for processing
    valid_text_mask = (df[text_col].str.len() > 0) & (df[text_col] != 'nan')
    total_rows = len(df)
    valid_rows = valid_text_mask.sum()
    
    print(f"\n📝 Text Data Quality:")
    print(f"   Total rows: {total_rows:,}")
    print(f"   Valid text rows: {valid_rows:,} ({valid_rows/total_rows*100:.1f}%)")
    print(f"   Empty/missing: {total_rows - valid_rows:,}")
    
    # Display sample data
    print(f"\n📋 Sample Data:")
    sample_df = df[valid_text_mask].head(3)
    for i, row in sample_df.iterrows():
        text_preview = row[text_col][:100] + "..." if len(row[text_col]) > 100 else row[text_col]
        print(f"   Row {i}: '{text_preview}'")
    
except FileNotFoundError:
    print(f"❌ File not found: {DATA_PATH}")
    print("Creating sample dataset for demonstration...")
    
    # Create sample data for demonstration
    sample_data = {
        'user_id': ['user1', 'user2', 'user3', 'user4', 'user5'],
        'text': [
            "Buy now! 50% discount on all items! Call 555-1234 today!",
            "hello",
            "Never visited this place but heard bad things about it from friends",
            "Great food and excellent service. Highly recommend this restaurant.",
            "Thanks for the info"
        ],
        'rating': [1, 3, 1, 5, 4],
        'category': ['Restaurant', 'Store', 'Restaurant', 'Restaurant', 'Store']
    }
    df = pd.DataFrame(sample_data)
    text_col = 'text'
    valid_text_mask = df[text_col].str.len() > 0
    print(f"✅ Sample dataset created with {len(df)} rows")

📁 Loading data from: ../data/cleaned_google_reviews.csv
✅ Dataset loaded successfully!
   📊 Shape: (673065, 17)
   📋 Columns: ['user_id', 'user_name', 'review_time', 'rating', 'review_text', 'pics', 'resp', 'gmap_id', 'has_resp', 'resp_text', 'resp_time', 'biz_name', 'description', 'category', 'avg_rating', 'num_of_reviews', 'price_level']

📝 Text Data Quality:
   Total rows: 673,065
   Valid text rows: 347,087 (51.6%)
   Empty/missing: 325,978

📋 Sample Data:
   Row 0: 'Great place to care for our children.'
   Row 1: 'Th sw y are so nice'
   Row 2: 'Went with my daughter'


## 🔍 Regex-Based Rule Engine

In [4]:
class FastRuleClassifier:
    """Lightning-fast regex-based classification for obvious patterns"""
    
    def __init__(self):
        # Pre-compiled regex patterns for maximum performance
        self.patterns = {
            'Advertisement': [
                re.compile(r'\b(buy now|discount|sale|offer|promo|deal|coupon|special offer)\b', re.IGNORECASE),
                re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b'),  # Phone numbers
                re.compile(r'\b(call|contact|visit|website|www\.|http)\b', re.IGNORECASE),
                re.compile(r'\b(free|limited time|act now|order today)\b', re.IGNORECASE),
                re.compile(r'\$\d+|\b\d+%\s*off\b', re.IGNORECASE),  # Prices and discounts
            ],
            'Irrelevant': [
                re.compile(r'^\s*(hello|hi|thanks|thank you|ok|okay)\s*$', re.IGNORECASE),
                re.compile(r'^\s*\w{1,4}\s*$'),  # Very short single words
                re.compile(r'\b(weather|traffic|politics|government|election)\b', re.IGNORECASE),
                re.compile(r'\b(my car|my phone|personal|unrelated)\b', re.IGNORECASE),
            ],
            'Fake_Rant': [
                re.compile(r'\b(never visited|never been|never went|haven\'t been)\b', re.IGNORECASE),
                re.compile(r'\b(heard bad things|rumor|heard from|people say)\b', re.IGNORECASE),
                re.compile(r'\b(avoid|stay away|don\'t go|waste of time)\b', re.IGNORECASE),
                re.compile(r'\b(probably|seems like|looks like|appears)\b.*\b(bad|terrible|awful)\b', re.IGNORECASE),
            ]
        }
        
        # Count patterns for performance metrics
        total_patterns = sum(len(patterns) for patterns in self.patterns.values())
        print(f"🔍 Regex Engine Initialized:")
        for label, patterns in self.patterns.items():
            print(f"   {label}: {len(patterns)} patterns")
        print(f"   Total: {total_patterns} compiled regex patterns")
    
    def classify_text(self, text: str) -> Set[str]:
        """Apply regex rules to classify text - returns set of matching labels"""
        if not text or pd.isna(text) or text.strip() == '':
            return set()
        
        labels = set()
        text = str(text).strip()
        
        # Special case for very short text (Irrelevant)
        if len(text.split()) < 5:
            labels.add('Irrelevant')
        
        # Apply regex patterns
        for label, patterns in self.patterns.items():
            for pattern in patterns:
                if pattern.search(text):
                    labels.add(label)
                    break  # One match per category is enough
        
        return labels
    
    def classify_dataframe(self, df: pd.DataFrame, text_column: str) -> pd.Series:
        """Apply regex classification to entire dataframe - vectorized for speed"""
        print(f"🚀 Applying regex rules to {len(df):,} rows...")
        
        start_time = time.time()
        
        # Apply classification with progress bar
        rule_labels = df[text_column].progress_apply(self.classify_text)
        
        # Calculate performance metrics
        duration = time.time() - start_time
        labeled_count = rule_labels.apply(len).sum()
        rows_with_labels = (rule_labels.apply(len) > 0).sum()
        
        print(f"\n✅ Regex classification completed:")
        print(f"   ⏱️  Time: {duration:.2f} seconds")
        print(f"   ⚡ Speed: {len(df)/duration:,.0f} rows/second")
        print(f"   🎯 Labels assigned: {labeled_count}")
        print(f"   📊 Rows with labels: {rows_with_labels:,} ({rows_with_labels/len(df)*100:.1f}%)")
        
        return rule_labels

# Initialize the fast rule classifier
rule_classifier = FastRuleClassifier()

🔍 Regex Engine Initialized:
   Advertisement: 5 patterns
   Irrelevant: 4 patterns
   Fake_Rant: 4 patterns
   Total: 13 compiled regex patterns


## 🤖 Zero-Shot Classification Pipeline

In [5]:
class ZeroShotClassifier:
    """Hugging Face zero-shot classification for ambiguous cases"""
    
    def __init__(self, model_name="facebook/bart-large-mnli", score_threshold=0.8):
        self.model_name = model_name
        self.score_threshold = score_threshold
        self.candidate_labels = ["Advertisement", "Irrelevant", "Fake_Rant"]
        self.pipeline = None
        
        print(f"🤖 Zero-Shot Classifier Configuration:")
        print(f"   Model: {model_name}")
        print(f"   Score threshold: {score_threshold}")
        print(f"   Labels: {self.candidate_labels}")
        print(f"   Device: {device_name}")
    
    def load_pipeline(self):
        """Load the zero-shot classification pipeline with optimizations"""
        print(f"\n⏳ Loading {self.model_name} for zero-shot classification...")
        start_time = time.time()
        
        try:
            # Load with optimizations for speed
            self.pipeline = pipeline(
                "zero-shot-classification",
                model=self.model_name,
                device=device,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                model_kwargs={"cache_dir": ".cache/huggingface"}
            )
            
            load_time = time.time() - start_time
            print(f"✅ Pipeline loaded in {load_time:.1f} seconds - LIGHTNING FAST!")
            
            # Warm up with a test prediction
            test_result = self.pipeline("test", self.candidate_labels)
            print(f"🔥 Pipeline warmed up and ready for inference!")
            
            return True
            
        except Exception as e:
            print(f"❌ Failed to load pipeline: {e}")
            print(f"💡 Falling back to rule-based classification only")
            return False
    
    def classify_text(self, text: str) -> Set[str]:
        """Classify text using zero-shot classification"""
        if not self.pipeline or not text or pd.isna(text) or text.strip() == '':
            return set()
        
        try:
            # Get predictions
            result = self.pipeline(text, self.candidate_labels)
            
            # Extract labels with scores above threshold
            labels = set()
            for label, score in zip(result['labels'], result['scores']):
                if score >= self.score_threshold:
                    labels.add(label)
            
            return labels
            
        except Exception as e:
            print(f"⚠️  Classification error for text: {text[:50]}... - {e}")
            return set()
    
    def classify_batch(self, texts: List[str]) -> List[Set[str]]:
        """Classify multiple texts efficiently"""
        if not self.pipeline:
            return [set() for _ in texts]
        
        results = []
        batch_size = 32  # Optimize batch size for memory
        
        print(f"🔄 Processing {len(texts)} texts in batches of {batch_size}...")
        
        for i in tqdm(range(0, len(texts), batch_size), desc="Zero-shot batches"):
            batch = texts[i:i + batch_size]
            batch_results = []
            
            for text in batch:
                labels = self.classify_text(text)
                batch_results.append(labels)
            
            results.extend(batch_results)
        
        return results

# Initialize zero-shot classifier
zero_shot_classifier = ZeroShotClassifier(score_threshold=0.8)

🤖 Zero-Shot Classifier Configuration:
   Model: facebook/bart-large-mnli
   Score threshold: 0.8
   Labels: ['Advertisement', 'Irrelevant', 'Fake_Rant']
   Device: CPU


In [6]:
# Load the zero-shot pipeline (this may take a moment for first-time download)
pipeline_loaded = zero_shot_classifier.load_pipeline()

if pipeline_loaded:
    print("\n🎯 Ready for hybrid classification:")
    print("   1. Regex rules for obvious patterns (ultra-fast)")
    print("   2. Zero-shot classification for ambiguous cases (accurate)")
else:
    print("\n⚠️  Zero-shot classification unavailable - using regex rules only")
    print("   This may happen due to memory constraints or model loading issues")


⏳ Loading facebook/bart-large-mnli for zero-shot classification...


Device set to use cpu


✅ Pipeline loaded in 0.9 seconds - LIGHTNING FAST!
🔥 Pipeline warmed up and ready for inference!

🎯 Ready for hybrid classification:
   1. Regex rules for obvious patterns (ultra-fast)
   2. Zero-shot classification for ambiguous cases (accurate)


## ⚡ Hybrid Classification Engine

In [7]:
class HybridClassifier:
    """Combines regex rules and zero-shot classification for optimal performance"""
    
    def __init__(self, rule_classifier, zero_shot_classifier):
        self.rule_classifier = rule_classifier
        self.zero_shot_classifier = zero_shot_classifier
    
    def classify_dataframe(self, df: pd.DataFrame, text_column: str) -> pd.DataFrame:
        """Apply hybrid classification to entire dataframe"""
        print(f"🔥 Starting Hybrid Classification Pipeline")
        print(f"📊 Processing {len(df):,} rows")
        
        start_time = time.time()
        
        # Step 1: Apply regex rules to all rows
        print(f"\n🔍 Step 1: Applying regex rules...")
        rule_labels = self.rule_classifier.classify_dataframe(df, text_column)
        
        # Step 2: Identify rows that need zero-shot classification
        unlabeled_mask = rule_labels.apply(len) == 0
        unlabeled_count = unlabeled_mask.sum()
        
        print(f"\n📋 Classification Status:")
        print(f"   Rule-based labels: {(~unlabeled_mask).sum():,} rows")
        print(f"   Needs zero-shot: {unlabeled_count:,} rows")
        
        # Step 3: Apply zero-shot classification to unlabeled rows
        final_labels = rule_labels.copy()
        
        if unlabeled_count > 0 and self.zero_shot_classifier.pipeline:
            print(f"\n🤖 Step 2: Applying zero-shot classification to {unlabeled_count:,} rows...")
            
            unlabeled_texts = df.loc[unlabeled_mask, text_column].tolist()
            zero_shot_start = time.time()
            
            zero_shot_labels = self.zero_shot_classifier.classify_batch(unlabeled_texts)
            
            zero_shot_duration = time.time() - zero_shot_start
            print(f"   ⏱️  Zero-shot time: {zero_shot_duration:.2f} seconds")
            print(f"   ⚡ Zero-shot speed: {unlabeled_count/zero_shot_duration:,.0f} rows/second")
            
            # Merge zero-shot results with rule-based results
            unlabeled_indices = df.index[unlabeled_mask].tolist()
            for idx, labels in zip(unlabeled_indices, zero_shot_labels):
                final_labels.iloc[idx] = labels
        
        elif unlabeled_count > 0:
            print(f"\n⚠️  Zero-shot classifier not available - {unlabeled_count:,} rows remain unlabeled")
        
        # Step 4: Create final results
        df_result = df.copy()
        df_result['labels'] = final_labels.apply(list)  # Convert sets to lists
        
        # Add individual label columns for easier filtering
        df_result['is_advertisement'] = final_labels.apply(lambda x: 'Advertisement' in x)
        df_result['is_irrelevant'] = final_labels.apply(lambda x: 'Irrelevant' in x)
        df_result['is_fake_rant'] = final_labels.apply(lambda x: 'Fake_Rant' in x)
        
        # Calculate final statistics
        total_duration = time.time() - start_time
        labeled_rows = (final_labels.apply(len) > 0).sum()
        total_labels = final_labels.apply(len).sum()
        
        print(f"\n🎉 Hybrid Classification Complete!")
        print(f"   ⏱️  Total time: {total_duration:.2f} seconds")
        print(f"   ⚡ Overall speed: {len(df)/total_duration:,.0f} rows/second")
        print(f"   🎯 Labeled rows: {labeled_rows:,} ({labeled_rows/len(df)*100:.1f}%)")
        print(f"   🏷️  Total labels: {total_labels}")
        
        # Label distribution
        print(f"\n📊 Label Distribution:")
        print(f"   Advertisement: {df_result['is_advertisement'].sum():,} rows")
        print(f"   Irrelevant: {df_result['is_irrelevant'].sum():,} rows")
        print(f"   Fake_Rant: {df_result['is_fake_rant'].sum():,} rows")
        print(f"   Clean (no labels): {(~(df_result['is_advertisement'] | df_result['is_irrelevant'] | df_result['is_fake_rant'])).sum():,} rows")
        
        return df_result

# Initialize hybrid classifier
hybrid_classifier = HybridClassifier(rule_classifier, zero_shot_classifier)
print("⚡ Hybrid Classifier ready for lightning-fast classification!")

⚡ Hybrid Classifier ready for lightning-fast classification!


## 🚀 Execute Classification Pipeline

In [8]:
# Execute the complete hybrid classification pipeline
print("🚀 Executing Complete Multi-Label Classification Pipeline")
print("=" * 60)

# Apply hybrid classification
df_classified = hybrid_classifier.classify_dataframe(df, text_col)

print(f"\n✅ Classification pipeline completed successfully!")
print(f"📊 Results saved in 'labels' column as list of strings")
print(f"🔍 Individual boolean columns added for easy filtering")

🚀 Executing Complete Multi-Label Classification Pipeline
🔥 Starting Hybrid Classification Pipeline
📊 Processing 673,065 rows

🔍 Step 1: Applying regex rules...
🚀 Applying regex rules to 673,065 rows...


100%|██████████| 673065/673065 [00:06<00:00, 98546.96it/s] 



✅ Regex classification completed:
   ⏱️  Time: 6.85 seconds
   ⚡ Speed: 98,257 rows/second
   🎯 Labels assigned: 111130
   📊 Rows with labels: 109,721 (16.3%)

📋 Classification Status:
   Rule-based labels: 109,721 rows
   Needs zero-shot: 563,344 rows

🤖 Step 2: Applying zero-shot classification to 563,344 rows...
🔄 Processing 563344 texts in batches of 32...


Zero-shot batches:   0%|          | 30/17605 [02:10<21:15:42,  4.36s/it]


KeyboardInterrupt: 

## 📋 Results Analysis and Examples

In [None]:
# Display detailed results with examples
print("🎯 CLASSIFICATION RESULTS ANALYSIS")
print("=" * 50)

# Overall statistics
total_rows = len(df_classified)
labeled_rows = (df_classified['labels'].apply(len) > 0).sum()
clean_rows = total_rows - labeled_rows

print(f"\n📊 Overall Statistics:")
print(f"   Total rows: {total_rows:,}")
print(f"   Flagged rows: {labeled_rows:,} ({labeled_rows/total_rows*100:.1f}%)")
print(f"   Clean rows: {clean_rows:,} ({clean_rows/total_rows*100:.1f}%)")

# Label-specific statistics
print(f"\n🏷️  Label Breakdown:")
ad_count = df_classified['is_advertisement'].sum()
irrelevant_count = df_classified['is_irrelevant'].sum()
fake_rant_count = df_classified['is_fake_rant'].sum()

print(f"   📢 Advertisement: {ad_count:,} ({ad_count/total_rows*100:.1f}%)")
print(f"   🚫 Irrelevant: {irrelevant_count:,} ({irrelevant_count/total_rows*100:.1f}%)")
print(f"   😡 Fake_Rant: {fake_rant_count:,} ({fake_rant_count/total_rows*100:.1f}%)")

# Multi-label statistics
multi_label_mask = df_classified['labels'].apply(len) > 1
multi_label_count = multi_label_mask.sum()
print(f"   🔄 Multi-label: {multi_label_count:,} ({multi_label_count/total_rows*100:.1f}%)")

print(f"\n📝 EXAMPLE CLASSIFICATIONS:")
print("=" * 40)

# Show examples for each label category
categories = {
    '📢 Advertisement Examples': df_classified[df_classified['is_advertisement']],
    '🚫 Irrelevant Examples': df_classified[df_classified['is_irrelevant']],
    '😡 Fake_Rant Examples': df_classified[df_classified['is_fake_rant']],
    '✅ Clean Examples': df_classified[~(df_classified['is_advertisement'] | 
                                        df_classified['is_irrelevant'] | 
                                        df_classified['is_fake_rant'])]
}

for category_name, category_df in categories.items():
    if len(category_df) > 0:
        print(f"\n{category_name}:")
        sample_size = min(3, len(category_df))
        
        for i, (idx, row) in enumerate(category_df.head(sample_size).iterrows()):
            text_preview = row[text_col][:120] + "..." if len(row[text_col]) > 120 else row[text_col]
            labels_str = ', '.join(row['labels']) if row['labels'] else 'None'
            
            print(f"   {i+1}. Text: '{text_preview}'")
            print(f"      Labels: [{labels_str}]")
            if 'rating' in row:
                print(f"      Rating: {row['rating']}")
            print()
    else:
        print(f"\n{category_name}: No examples found")

# Display final dataset info
print(f"\n💾 Final Dataset:")
print(f"   Shape: {df_classified.shape}")
print(f"   New columns: labels, is_advertisement, is_irrelevant, is_fake_rant")
print(f"   Memory usage: {df_classified.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

## 💾 Save Results

In [None]:
# Save the classified dataset
import os
from datetime import datetime

# Create output directory
output_dir = "../outputs/classified"
os.makedirs(output_dir, exist_ok=True)

# Generate timestamped filename
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_filename = f"classified_reviews_{timestamp}.csv"
output_path = os.path.join(output_dir, output_filename)

# Save the results
df_classified.to_csv(output_path, index=False)

print(f"💾 Classified dataset saved:")
print(f"   📁 Path: {output_path}")
print(f"   📊 Rows: {len(df_classified):,}")
print(f"   🏷️  Columns: {len(df_classified.columns)}")

# Save classification summary
summary = {
    "timestamp": timestamp,
    "total_rows": len(df_classified),
    "classification_method": "hybrid_regex_zeroshot",
    "zero_shot_model": zero_shot_classifier.model_name if zero_shot_classifier.pipeline else "Not used",
    "label_counts": {
        "advertisement": int(df_classified['is_advertisement'].sum()),
        "irrelevant": int(df_classified['is_irrelevant'].sum()),
        "fake_rant": int(df_classified['is_fake_rant'].sum()),
        "clean": int((~(df_classified['is_advertisement'] | 
                       df_classified['is_irrelevant'] | 
                       df_classified['is_fake_rant'])).sum())
    },
    "multi_label_rows": int((df_classified['labels'].apply(len) > 1).sum())
}

summary_path = os.path.join(output_dir, f"classification_summary_{timestamp}.json")
import json
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\n📊 Classification summary saved: {summary_path}")

print(f"\n🎉 MULTI-LABEL CLASSIFICATION COMPLETE!")
print(f"\n✨ Your data is now classified with lightning speed and high accuracy!")
print(f"🔥 Ready for downstream ML tasks, filtering, or analysis!")