# Tokenization and Masking

Comprehensive tokenization module for the Knowledge vs. Reasoning Separation project.

This notebook consolidates all tokenization functionality including:
- **Enhanced Tokenizer** with structural hints preservation
- **ε-masking** with NER entity preservation
- **GCS Data Interface** for loading and processing large datasets
- **Tokenization Analysis** and visualization tools
- **Performance benchmarking** and optimization


## 1. Setup and Imports


from typing import List, Dict, Optional, Union, Tuple, Any, Set
from pathlib import Path
from collections import defaultdict, Counter
import json
import time
import math
import random
import re
import warnings
import tempfile
import os
import gc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM

# GCS and data processing imports
try:
    import gcsfs
    import pyarrow.parquet as pq
    import polars as pl
    GCS_AVAILABLE = True
except ImportError:
    GCS_AVAILABLE = False
    print("Warning: GCS dependencies not available. Install with: pip install gcsfs pyarrow polars")

# NLP imports
try:
    import spacy
    SPACY_AVAILABLE = True
except ImportError:
    SPACY_AVAILABLE = False
    print("Warning: spaCy not available. Install with: pip install spacy && python -m spacy download en_core_web_sm")

print("All imports successful!")
print(f"GCS support: {'✅' if GCS_AVAILABLE else '❌'}")
print(f"spaCy support: {'✅' if SPACY_AVAILABLE else '❌'}")

In [None]:
## 2. GCS Data Interface

Google Cloud Storage interface for loading and processing large datasets, matching the API style from masking.py.


In [None]:
class GCSDataLoader:
    """
    Google Cloud Storage data loader with interface matching masking.py API.

    Features:
    - List and load parquet files from a GCS bucket
    - Batch loading with progress messages
    - Row-batch streaming over a list of files
    - Best-effort error handling: failures are printed and the file is skipped
    """

    # Seconds for which a cached file listing is considered fresh.
    CACHE_TTL_SECONDS = 300

    def __init__(self, bucket_name: str, credentials_path: str = "eastern-bridge-credentials.json"):
        """
        Initialize GCS data loader.

        Args:
            bucket_name: GCS bucket name
            credentials_path: Path to service account credentials

        Raises:
            ImportError: If gcsfs / pyarrow / polars could not be imported.
            RuntimeError: If the GCS filesystem object cannot be created.
        """
        if not GCS_AVAILABLE:
            raise ImportError("GCS dependencies not available. Install with: pip install gcsfs pyarrow polars")

        self.bucket_name = bucket_name
        self.credentials_path = credentials_path

        # Initialize GCS filesystem
        try:
            self.fs = gcsfs.GCSFileSystem(token=credentials_path)
            print(f"✅ GCS connection established to bucket: {bucket_name}")
        except Exception as e:
            # Chain the original error so the root cause stays in the traceback.
            raise RuntimeError(f"Failed to connect to GCS: {e}") from e

        # Cache for file listings; the pattern is stored alongside the result
        # so a listing is only reused for the pattern that produced it.
        self._file_cache = None
        self._cache_timestamp = None
        self._cache_pattern = None

    def list_files(self, pattern: str = "**/*.parquet", refresh: bool = False) -> List[str]:
        """
        List files in the bucket matching the pattern.

        Results are cached for CACHE_TTL_SECONDS per pattern; asking for a
        different pattern always triggers a new listing.

        Args:
            pattern: Glob pattern for file matching
            refresh: Force refresh of file cache

        Returns:
            Sorted list of file paths (a copy, safe for the caller to mutate).
            An empty list is returned if the listing fails.
        """
        cache_valid = (
            self._file_cache is not None
            and self._cache_timestamp is not None
            and getattr(self, "_cache_pattern", None) == pattern
            and time.time() - self._cache_timestamp < self.CACHE_TTL_SECONDS
        )

        if refresh or not cache_valid:
            try:
                files = sorted(self.fs.glob(f"{self.bucket_name}/{pattern}"))
            except Exception as e:
                print(f"❌ Failed to list files: {e}")
                return []
            self._file_cache = files
            self._cache_timestamp = time.time()
            self._cache_pattern = pattern
            print(f"📁 Found {len(files)} files matching pattern: {pattern}")

        return list(self._file_cache)

    def load_parquet_file(self, file_path: str) -> Optional[pd.DataFrame]:
        """
        Load a single parquet file from GCS.

        The file is opened through ``self.fs`` so that the credentials given
        to the constructor are the ones used for the read.

        Args:
            file_path: Path to parquet file (as returned by list_files)

        Returns:
            DataFrame or None if failed
        """
        try:
            # Use polars for efficient loading, reading from the authenticated
            # filesystem handle rather than a bare gs:// URL.
            with self.fs.open(file_path, "rb") as handle:
                df = pl.read_parquet(handle).to_pandas()
            print(f"📊 Loaded {len(df)} rows from {file_path}")
            return df
        except Exception as e:
            print(f"❌ Failed to load {file_path}: {e}")
            return None

    def load_batch_files(self, file_paths: List[str], max_files: Optional[int] = None) -> pd.DataFrame:
        """
        Load multiple parquet files and combine into single DataFrame.

        Files that fail to load are skipped.

        Args:
            file_paths: List of file paths to load
            max_files: Maximum number of files to load (None for all)

        Returns:
            Combined DataFrame (empty if nothing could be loaded)
        """
        # `is not None` so that max_files=0 really means "load nothing".
        if max_files is not None:
            file_paths = file_paths[:max_files]

        print(f"🔄 Loading {len(file_paths)} files...")
        dfs = []

        for i, file_path in enumerate(file_paths):
            df = self.load_parquet_file(file_path)
            if df is not None:
                dfs.append(df)

            if (i + 1) % 10 == 0:
                print(f"   Processed {i + 1}/{len(file_paths)} files")

        if not dfs:
            print("❌ No files loaded successfully")
            return pd.DataFrame()

        combined_df = pd.concat(dfs, ignore_index=True)
        print(f"✅ Combined {len(dfs)} files into {len(combined_df)} total rows")
        return combined_df

    def stream_files(self, file_paths: List[str], batch_size: int = 1000):
        """
        Yield row batches from each file in turn.

        Each file is loaded in full before being sliced, so peak memory is
        bounded by the largest single file rather than by the whole list.

        Args:
            file_paths: List of file paths to process
            batch_size: Number of rows per batch

        Yields:
            DataFrame batches of at most batch_size rows
        """
        for file_path in file_paths:
            df = self.load_parquet_file(file_path)
            if df is None:
                continue
            for start in range(0, len(df), batch_size):
                yield df.iloc[start:start + batch_size]

    def get_file_info(self, file_paths: Optional[List[str]] = None, sample_size: int = 10) -> Dict[str, Any]:
        """
        Get information about files in the bucket.

        Size statistics are computed over the first ``sample_size`` files
        only, so ``total_size_mb`` is the total of that sample, not of every
        file counted in ``total_files``.

        Args:
            file_paths: Specific files to analyze (None for all)
            sample_size: Number of leading files whose size is queried

        Returns:
            Dictionary with file statistics
        """
        if file_paths is None:
            file_paths = self.list_files()

        info = {
            "total_files": len(file_paths),
            "total_size_mb": 0,
            "file_sizes": [],
            "sample_files": file_paths[:5] if file_paths else []
        }

        for file_path in file_paths[:sample_size]:
            try:
                stat = self.fs.stat(file_path)
                size_mb = stat['size'] / (1024 * 1024)
                info["file_sizes"].append(size_mb)
                info["total_size_mb"] += size_mb
            except Exception as e:
                print(f"Warning: Could not get size for {file_path}: {e}")

        if info["file_sizes"]:
            info["avg_file_size_mb"] = np.mean(info["file_sizes"])
            info["max_file_size_mb"] = np.max(info["file_sizes"])
            info["min_file_size_mb"] = np.min(info["file_sizes"])

        return info

# Module-level loader singleton: a connected GCSDataLoader when the optional
# GCS dependencies were imported, otherwise None so callers can test for it.
GCS_LOADER = GCSDataLoader("parquet_v2_openwebtext-with-pos-ner") if GCS_AVAILABLE else None
print("✅ GCS Data Loader initialized" if GCS_AVAILABLE else "❌ GCS Data Loader not available")


## 3. Enhanced Tokenizer with Structural Hints

Implementation of the enhanced tokenizer that preserves structural hints during ε-masking.


In [None]:
class NERExtractor:
    """
    Pluggable NER extraction supporting spaCy and regex fallback.
    Extracts named entities to preserve as structural hints during masking.
    """

    _VALID_METHODS = ("spacy", "regex", "auto")

    def __init__(self, method: str = "auto"):
        """
        Initialize NER extractor.

        Args:
            method: "spacy", "regex", or "auto" (try spaCy, fall back to regex)

        Raises:
            ValueError: If method is not one of the supported values.
            RuntimeError: If method is "spacy" but spaCy or its English model
                cannot be loaded.
        """
        if method not in self._VALID_METHODS:
            raise ValueError(f"Unknown NER method {method!r}; expected one of {self._VALID_METHODS}")

        # Start from the regex fallback and upgrade to spaCy only on success.
        self.method = "regex"
        self.nlp = None
        self.entity_counter = 0

        spacy_hint = "spaCy not available. Install with: pip install spacy && python -m spacy download en_core_web_sm"

        if method == "regex":
            print("⚠️ Using regex NER")
        elif not SPACY_AVAILABLE:
            # An explicit "spacy" request must not silently degrade.
            if method == "spacy":
                raise RuntimeError(spacy_hint)
            print("⚠️ Using regex NER (spaCy not available)")
        else:
            try:
                self.nlp = spacy.load("en_core_web_sm")
            except (ImportError, OSError) as e:
                if method == "spacy":
                    raise RuntimeError(spacy_hint) from e
                print("⚠️ Falling back to regex NER")
            else:
                self.method = "spacy"
                print("✅ spaCy NER model loaded")

        # Regex patterns for common entity types (fallback).
        # Order matters: the first pattern to match a given string wins, so
        # the specific patterns come first and the generic two-capitalised-
        # words PERSON pattern comes last (otherwise it would claim
        # "New York" before GPE gets a chance).
        self._regex_patterns = {
            "GPE": re.compile(r'\b(?:Paris|London|New York|Tokyo|Berlin|Rome|Madrid|Washington|Boston|Chicago)\b'),
            # The abbreviations end in "." so they must not be followed by \b:
            # a boundary after "." only exists when a word character follows,
            # which would make "Inc. " unmatchable.
            "ORG": re.compile(r'\b(?:(?:Inc|Corp|Ltd)\.|(?:Company|Corporation|University|Institute)\b)'),
            "DATE": re.compile(r'\b\d{4}\b|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b'),
            "MONEY": re.compile(r'\$\d+(?:\.\d{2})?\b|\b\d+(?:\.\d{2})?\s*(?:dollars?|USD|euros?|EUR)\b'),
            "PERSON": re.compile(r'\b[A-Z][a-z]+ [A-Z][a-z]+\b'),
        }

    def reset_counter(self):
        """Reset entity counter for new document."""
        self.entity_counter = 0

    def extract_entities(self, text: str) -> Dict[str, Tuple[str, str]]:
        """
        Extract named entities from text.

        The counter is reset on every call, so placeholder indices are unique
        within one call only. Each distinct entity string is recorded once,
        under the first type that matched it.

        Args:
            text: Input text

        Returns:
            Dict mapping entity_text -> (entity_type, replacement_token)
            Example: {"Paris": ("GPE", "<GPE_0>"), "John Smith": ("PERSON", "<PERSON_1>")}
        """
        self.reset_counter()
        entities = {}

        if self.method == "spacy" and self.nlp:
            doc = self.nlp(text)
            candidates = ((ent.text, ent.label_) for ent in doc.ents)
        else:
            # Regex fallback
            candidates = (
                (match.group(), entity_type)
                for entity_type, pattern in self._regex_patterns.items()
                for match in pattern.finditer(text)
            )

        for entity_text, entity_type in candidates:
            if entity_text in entities or not entity_text.strip():
                continue
            replacement = f"<{entity_type}_{self.entity_counter}>"
            entities[entity_text] = (entity_type, replacement)
            self.entity_counter += 1

        return entities

# Closed-class ("function") words that ε-masking leaves in place.
# The words are grouped by part of speech purely for readability; a few appear
# in more than one group ("for", "her", "so", "yet") and the set collapses them.
FUNCTION_WORDS = set(
    (
        # Articles
        "a an the "
        # Prepositions
        "in on at by for with without to from of about under over through "
        # Conjunctions
        "and or but so yet nor for because if when where while although "
        # Pronouns
        "i you he she it we they me him her us them "
        "my your his her its our their "
        "this that these those who whom whose which what "
        # Auxiliary verbs
        "am is are was were be been being have has had having "
        "do does did doing "
        "will would shall should can could may might must "
        # Common adverbs
        "not no yes very quite rather too so just only also even still already yet "
        # Determiners
        "all some any many much few little more most other another "
        "each every both either neither"
    ).split()
)

def is_function_word(word: str) -> bool:
    """Return True when the lower-cased word is listed in FUNCTION_WORDS."""
    lowered = word.lower()
    return lowered in FUNCTION_WORDS

def is_punctuation(token: str) -> bool:
    """
    Check if token consists solely of punctuation.

    A token qualifies when it is non-empty and every character is neither a
    word character nor whitespace. ``fullmatch`` is used instead of
    ``match`` with ``^...$`` because ``$`` also matches just before a trailing
    newline, which would wrongly accept tokens such as "!\\n".
    """
    return re.fullmatch(r'[^\w\s]+', token) is not None

# Progress marker printed once the NER extractor and the function-word /
# punctuation helpers above have been defined.
print("✅ NER Extractor and function word utilities loaded")


In [None]:
class EnhancedTokenizer:
    """
    Enhanced tokenizer that handles structural hints and masking.

    Features:
    - Base tokenizer (GPT-2 by default) with extensions
    - Special tokens for masking and entity types
    - Entity ID management per document
    - Structural hint preservation
    - ε-masking applied BEFORE tokenization
    """

    def __init__(self, base_model: str = "gpt2"):
        """
        Initialize enhanced tokenizer.

        Args:
            base_model: Base Hugging Face model name
        """
        self.base_model = base_model
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)

        # Add special tokens
        self.special_tokens = {
            "mask_token": "<mask>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "bos_token": "<|startoftext|>",
            "eos_token": "<|endoftext|>",
        }

        # Add special tokens to vocabulary
        self.tokenizer.add_special_tokens(self.special_tokens)

        # Entity mapping for current document
        self.entity_mapping = {}
        self.entity_counter = 0

        # NER extractor
        self.ner_extractor = NERExtractor()

        # ε-masker for pre-tokenization masking. It is created on first use
        # (see the `epsilon_masker` property) so that constructing the
        # tokenizer does not require EpsilonMasker to be defined yet.
        self._epsilon_masker = None

        print(f"✅ Enhanced Tokenizer initialized with {base_model}")
        # len() includes the special tokens added above; `vocab_size` does not.
        print(f"   Vocabulary size: {len(self.tokenizer)}")
        print(f"   Special tokens: {list(self.special_tokens.keys())}")

    @property
    def epsilon_masker(self):
        """ε-masker used before tokenization; built lazily on first access."""
        if getattr(self, "_epsilon_masker", None) is None:
            self._epsilon_masker = EpsilonMasker()
        return self._epsilon_masker

    @epsilon_masker.setter
    def epsilon_masker(self, masker):
        """Allow callers to inject a pre-configured masker."""
        self._epsilon_masker = masker

    def reset_entity_mapping(self):
        """Reset entity mapping for new document."""
        self.entity_mapping = {}
        self.entity_counter = 0

    def create_entity_token(self, entity_type: str) -> str:
        """
        Create a unique entity token for current document.

        Args:
            entity_type: Type of entity (PERSON, ORG, LOC, etc.)

        Returns:
            Unique entity token of the form "<TYPE_n>"
        """
        token = f"<{entity_type}_{self.entity_counter}>"
        self.entity_counter += 1
        return token

    def add_entity_tokens_to_vocab(self, entity_types: List[str], ids_per_type: int = 10):
        """
        Add numbered entity placeholder tokens to the vocabulary.

        For every entity type, tokens "<TYPE_0>" … "<TYPE_{ids_per_type-1}>"
        are added.

        NOTE(review): a model consuming these ids presumably needs its
        embedding matrix resized afterwards — confirm at the call site.

        Args:
            entity_types: List of entity types to add
            ids_per_type: How many numbered tokens to add per type
        """
        # dict.fromkeys keeps first-seen order while dropping duplicates that
        # arise when the same entity type is listed twice.
        entity_tokens = list(dict.fromkeys(
            f"<{entity_type}_{i}>"
            for entity_type in entity_types
            for i in range(ids_per_type)
        ))

        self.tokenizer.add_tokens(entity_tokens)
        print(f"✅ Added {len(entity_tokens)} entity tokens to vocabulary")

    @staticmethod
    def _replace_entities(text: str, entities: Dict[str, Tuple[str, str]]) -> str:
        """
        Substitute every entity string in text with its placeholder token.

        The substitution is done in a single regex pass so that:
        - text that already is a "<...>" token is copied through untouched;
        - inserted placeholders are never rescanned (an entity "0" cannot
          corrupt "<GPE_0>");
        - longer entities win over shorter ones starting at the same position
          ("New York" before "York");
        - entities are not replaced inside larger words ("May" in "Maybe").

        Args:
            text: Text to rewrite
            entities: Mapping entity_text -> (entity_type, replacement_token)

        Returns:
            Text with entities replaced by their tokens
        """
        ordered = sorted((e for e in entities if e), key=len, reverse=True)
        if not ordered:
            return text

        def _bounded(entity_text: str) -> str:
            # Only demand a word boundary on sides that end in a word
            # character; "$100" or "U.S." must still match next to letters.
            left = r"(?<!\w)" if re.match(r"\w", entity_text[0]) else ""
            right = r"(?!\w)" if re.match(r"\w", entity_text[-1]) else ""
            return left + re.escape(entity_text) + right

        pattern = re.compile(
            r"(<[^<>\s]+>)|(?:" + "|".join(_bounded(e) for e in ordered) + ")"
        )

        def _swap(match):
            existing_token = match.group(1)
            if existing_token is not None:
                return existing_token
            return entities[match.group(0)][1]

        return pattern.sub(_swap, text)

    def tokenize_with_structural_hints(self, text: str, epsilon: float = 0.0, 
                                     max_length: int = 512, seed: Optional[int] = None) -> Dict:
        """
        Tokenize text with structural hints and masking.

        IMPORTANT: ε-masking is applied BEFORE tokenization with the base tokenizer.

        NOTE(review): NER runs on the already-masked text with a fresh
        counter. If the masker inserted its own "<TYPE_n>" placeholders,
        those are left intact, but indices assigned here may repeat indices
        the masker used — confirm whether that matters downstream.

        Args:
            text: Input text
            epsilon: Masking level (0.0 = no masking) - applied BEFORE tokenization
            max_length: Maximum sequence length
            seed: Random seed for reproducible masking

        Returns:
            Dictionary with 'input_ids', 'attention_mask' (both 1-D tensors
            padded/truncated to max_length) and 'metadata'.
        """
        self.reset_entity_mapping()
        masker = self.epsilon_masker

        # STEP 1: Apply ε-masking BEFORE tokenization
        if epsilon > 0.0:
            masked_text, masking_stats = masker.apply_masking(text, epsilon, seed)
        else:
            masked_text = text
            masking_stats = masker._get_empty_statistics()

        # STEP 2: Extract entities from masked text
        entities = self.ner_extractor.extract_entities(masked_text)
        # Keep the per-document state in sync so create_entity_token()
        # continues numbering after the entities found here.
        self.entity_mapping = entities
        self.entity_counter = len(entities)

        # STEP 3: Replace entities with typed tokens
        final_text = self._replace_entities(masked_text, entities)

        # STEP 4: Tokenize with the base tokenizer
        encoding = self.tokenizer(
            final_text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'][0]
        attention_mask = encoding['attention_mask'][0]

        # STEP 5: Create comprehensive metadata
        metadata = {
            'original_text': text,
            'masked_text': masked_text,
            'final_text': final_text,
            'epsilon': epsilon,
            'entities_found': len(entities),
            'entity_mapping': entities,
            'mask_positions': self._get_mask_positions(input_ids),
            # Length of the padded sequence (equals max_length because of
            # padding='max_length').
            'sequence_length': len(input_ids),
            'masking_stats': masking_stats,
            'tokenizer_used': self.base_model
        }

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'metadata': metadata
        }

    def _get_mask_positions(self, input_ids: torch.Tensor) -> List[int]:
        """Get positions of masked tokens in a 1-D tensor of token ids."""
        mask_token_id = self.tokenizer.convert_tokens_to_ids(self.special_tokens['mask_token'])
        # Vectorised comparison instead of a Python loop with .item() per id.
        return torch.nonzero(input_ids == mask_token_id, as_tuple=True)[0].tolist()

    def decode_tokens(self, token_ids: Union[List[int], torch.Tensor]) -> str:
        """
        Decode token IDs back to text.

        Special tokens (including padding) are kept in the output.

        Args:
            token_ids: List or tensor of token IDs

        Returns:
            Decoded text
        """
        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.tolist()

        return self.tokenizer.decode(token_ids, skip_special_tokens=False)

    def get_vocab_size(self) -> int:
        """Get current vocabulary size, including added tokens."""
        return len(self.tokenizer)

    def save_tokenizer(self, save_path: Union[str, Path]):
        """Save tokenizer configuration."""
        self.tokenizer.save_pretrained(save_path)
        print(f"✅ Tokenizer saved to {save_path}")

    def load_tokenizer(self, load_path: Union[str, Path]):
        """Load tokenizer configuration, replacing the current tokenizer."""
        self.tokenizer = AutoTokenizer.from_pretrained(load_path)
        print(f"✅ Tokenizer loaded from {load_path}")

# Initialize enhanced tokenizer (module-level singleton using the default
# base model).
# NOTE(review): EnhancedTokenizer depends on EpsilonMasker, which is defined
# later in this file (section 4); verify the execution order so the masker
# class exists by the time the tokenizer needs it.
ENHANCED_TOKENIZER = EnhancedTokenizer()
print("✅ Enhanced Tokenizer ready for use")


## 4. ε-Masking Implementation

Core ε-masking functionality with structural hints preservation.


In [None]:
class EpsilonMasker:
    """
    Applies ε-masking while preserving structural hints.

    Preserves:
    - Function words (the, and, is, etc.)
    - Punctuation marks
    - NER entities (replaced with typed IDs)

    Masking works on whitespace-delimited tokens; the masked text is re-joined
    with single spaces.
    """

    MASK_TOKEN = "<mask>"

    # Leading/trailing characters that are neither word characters nor angle
    # brackets, i.e. punctuation glued to a token ("the," / "<GPE_0>.").
    _EDGE_PUNCT_RE = re.compile(r'^[^\w<>]+|[^\w<>]+$')

    def __init__(self, preserve_function_words: bool = True, 
                 preserve_punctuation: bool = True, 
                 preserve_ner: bool = True,
                 ner_method: str = "auto"):
        """
        Initialize ε-masker.

        Args:
            preserve_function_words: Whether to preserve function words
            preserve_punctuation: Whether to preserve punctuation
            preserve_ner: Whether to preserve NER entities
            ner_method: NER extraction method ("spacy", "regex", "auto")
        """
        self.preserve_function_words = preserve_function_words
        self.preserve_punctuation = preserve_punctuation
        self.preserve_ner = preserve_ner

        # Initialize NER extractor
        if preserve_ner:
            self.ner_extractor = NERExtractor(method=ner_method)
        else:
            self.ner_extractor = None

        print(f"✅ ε-Masker initialized:")
        print(f"   - Function words: {'✅' if preserve_function_words else '❌'}")
        print(f"   - Punctuation: {'✅' if preserve_punctuation else '❌'}")
        print(f"   - NER entities: {'✅' if preserve_ner else '❌'}")

    def apply_masking(self, text: str, epsilon: float, seed: Optional[int] = None) -> Tuple[str, Dict]:
        """
        Apply ε-masking to text while preserving structural hints.

        When a seed is given, a private ``random.Random(seed)`` drives the
        masking decisions, so the call is reproducible without reseeding the
        process-wide ``random`` / ``numpy`` generators. Without a seed the
        global ``random`` module is used (see set_random_seed).

        Args:
            text: Input text
            epsilon: Masking probability (0.0-1.0); values <= 0 disable masking
            seed: Random seed for reproducibility

        Returns:
            Tuple of (masked_text, statistics)
        """
        if epsilon <= 0.0:
            return text, self._get_empty_statistics()

        rng = random.Random(seed) if seed is not None else None

        # Extract entities if NER is enabled
        entities = {}
        if self.preserve_ner and self.ner_extractor:
            entities = self.ner_extractor.extract_entities(text)

        # Apply masking
        masked_text = self._mask_text(text, epsilon, entities, rng)

        # Get statistics
        stats = self.get_masking_statistics(text, masked_text, epsilon, entities)

        return masked_text, stats

    @staticmethod
    def _replace_entities(text: str, entities: Dict) -> str:
        """
        Substitute every entity string in text with its placeholder token.

        Done in one regex pass so inserted placeholders are never rescanned,
        existing "<...>" tokens are copied through, longer entities take
        precedence, and entities are not replaced inside larger words.
        """
        ordered = sorted((e for e in entities if e), key=len, reverse=True)
        if not ordered:
            return text

        def _bounded(entity_text: str) -> str:
            # Require a word boundary only on sides ending in a word character.
            left = r"(?<!\w)" if re.match(r"\w", entity_text[0]) else ""
            right = r"(?!\w)" if re.match(r"\w", entity_text[-1]) else ""
            return left + re.escape(entity_text) + right

        pattern = re.compile(
            r"(<[^<>\s]+>)|(?:" + "|".join(_bounded(e) for e in ordered) + ")"
        )

        def _swap(match):
            existing_token = match.group(1)
            if existing_token is not None:
                return existing_token
            return entities[match.group(0)][1]

        return pattern.sub(_swap, text)

    @classmethod
    def _strip_edge_punctuation(cls, word: str) -> str:
        """Return word without punctuation attached to its start or end."""
        return cls._EDGE_PUNCT_RE.sub('', word)

    def _mask_text(self, text: str, epsilon: float, entities: Dict, rng=None) -> str:
        """
        Replace entities with placeholders, then mask the remaining tokens.

        Args:
            text: Input text
            epsilon: Probability of masking each non-preserved token
            entities: Mapping entity_text -> (entity_type, replacement_token)
            rng: Optional ``random.Random``; the global ``random`` module is
                used when omitted

        Returns:
            Masked text, tokens joined by single spaces
        """
        draw = rng.random if rng is not None else random.random

        # Replace entities with typed tokens
        masked_text = self._replace_entities(text, entities) if entities else text

        # Tokenize into words for masking
        masked_words = []
        for word in re.findall(r'\S+', masked_text):
            # A random number is drawn only for tokens that may be masked.
            if not self._should_preserve_word(word) and draw() < epsilon:
                masked_words.append(self.MASK_TOKEN)
            else:
                masked_words.append(word)

        return ' '.join(masked_words)

    def _should_preserve_word(self, word: str) -> bool:
        """
        Check if word should be preserved during masking.

        Punctuation attached to the token is ignored for the placeholder and
        function-word checks, so "<GPE_0>." and "the," are preserved too.
        """
        core = self._strip_edge_punctuation(word)

        # Preserve entity tokens (and any other "<...>" token)
        if core.startswith('<') and core.endswith('>'):
            return True

        # Preserve function words
        if self.preserve_function_words and is_function_word(core):
            return True

        # Preserve punctuation
        if self.preserve_punctuation and is_punctuation(word):
            return True

        return False

    def get_masking_statistics(self, original_text: str, masked_text: str, 
                             epsilon: float, entities: Dict) -> Dict:
        """
        Get statistics about masking applied to text.

        Rates are relative to the number of whitespace-delimited tokens in the
        original text. ``<mask>`` tokens are counted as masked only, never as
        entity tokens.

        Args:
            original_text: Original text
            masked_text: Masked text
            epsilon: Masking level
            entities: Extracted entities

        Returns:
            Dictionary with masking statistics
        """
        original_words = re.findall(r'\S+', original_text)
        masked_words = re.findall(r'\S+', masked_text)
        cores = [self._strip_edge_punctuation(word) for word in masked_words]

        # Count different types of tokens
        total_tokens = len(original_words)
        masked_tokens = masked_words.count(self.MASK_TOKEN)
        entity_tokens = sum(
            1 for core in cores
            if core != self.MASK_TOKEN and core.startswith('<') and core.endswith('>')
        )
        function_word_tokens = sum(1 for core in cores if is_function_word(core))
        punctuation_tokens = sum(1 for word in masked_words if is_punctuation(word))
        preserved_tokens = entity_tokens + function_word_tokens + punctuation_tokens

        return {
            'total_tokens': total_tokens,
            'masked_tokens': masked_tokens,
            'entity_tokens': entity_tokens,
            'function_word_tokens': function_word_tokens,
            'punctuation_tokens': punctuation_tokens,
            'preserved_tokens': preserved_tokens,
            'masking_rate': masked_tokens / total_tokens if total_tokens > 0 else 0,
            'preservation_rate': preserved_tokens / total_tokens if total_tokens > 0 else 0,
            'epsilon': epsilon,
            'entities_found': len(entities),
            # Sorted so the list is deterministic across runs.
            'entity_types': sorted({entity_type for entity_type, _ in entities.values()})
        }

    def _get_empty_statistics(self) -> Dict:
        """Get the all-zero statistics returned when no masking is applied."""
        return {
            'total_tokens': 0,
            'masked_tokens': 0,
            'entity_tokens': 0,
            'function_word_tokens': 0,
            'punctuation_tokens': 0,
            'preserved_tokens': 0,
            'masking_rate': 0.0,
            'preservation_rate': 0.0,
            'epsilon': 0.0,
            'entities_found': 0,
            'entity_types': []
        }

    def set_random_seed(self, seed: int):
        """Seed the global ``random`` and ``numpy`` generators (used by unseeded masking calls)."""
        random.seed(seed)
        np.random.seed(seed)
        print(f"✅ Random seed set to {seed}")

# Initialize ε-masker (module-level singleton with the default settings:
# function words, punctuation and NER entities are all preserved, and the
# NER method is "auto").
EPSILON_MASKER = EpsilonMasker()
print("✅ ε-Masker ready for use")


## 5. Visualization and Analysis Tools

Tools for analyzing and visualizing tokenization results and masking effects.

class TokenizationAnalyzer:
    """
    Analysis and visualization tools for tokenization results.

    The methods use the module-level ``EPSILON_MASKER`` and
    ``ENHANCED_TOKENIZER`` objects. The DataFrame produced by the most recent
    ``analyze_text`` call is kept in ``results_cache`` so that
    ``plot_masking_effects`` and ``generate_report`` can be called without
    arguments.
    """

    # Epsilon sweep used by analyze_text when the caller does not pass one.
    # Kept as an immutable tuple so it can never be mutated between calls.
    DEFAULT_EPSILON_VALUES = (0.0, 0.1, 0.3, 0.5, 0.7)

    def __init__(self):
        # Must be a DataFrame (not a list): plot_masking_effects and
        # generate_report check `.empty` on it before any analysis has run.
        self.results_cache = pd.DataFrame()

    def analyze_text(self, text: str, epsilon_values: Optional[List[float]] = None) -> pd.DataFrame:
        """
        Analyze text with different epsilon values.

        For every epsilon the text is masked with ``EPSILON_MASKER`` (fixed
        seed 42) and the masked text is then tokenized with
        ``ENHANCED_TOKENIZER`` using ``epsilon=0.0`` so it is not masked twice.

        Args:
            text: Input text to analyze
            epsilon_values: List of epsilon values to test. Defaults to
                ``DEFAULT_EPSILON_VALUES`` (0.0, 0.1, 0.3, 0.5, 0.7).

        Returns:
            DataFrame with one row per epsilon value. The result is also
            stored in ``self.results_cache``.
        """
        if epsilon_values is None:
            epsilon_values = list(self.DEFAULT_EPSILON_VALUES)

        results = []

        for epsilon in epsilon_values:
            # Apply masking
            masked_text, stats = EPSILON_MASKER.apply_masking(text, epsilon, seed=42)

            # Tokenize the already-masked text; epsilon=0.0 avoids re-masking
            tokenized = ENHANCED_TOKENIZER.tokenize_with_structural_hints(masked_text, epsilon=0.0)
            metadata = tokenized['metadata']

            # Combine results. `stats` is unpacked last, so any key it shares
            # with the entries above (e.g. 'epsilon') takes the masker's value.
            results.append({
                'epsilon': epsilon,
                'original_text': text,
                'masked_text': masked_text,
                'tokenized_text': ENHANCED_TOKENIZER.decode_tokens(tokenized['input_ids']),
                'sequence_length': metadata['sequence_length'],
                'mask_positions': len(metadata['mask_positions']),
                **stats
            })

        df = pd.DataFrame(results)
        self.results_cache = df
        return df

    def plot_masking_effects(self, df: Optional[pd.DataFrame] = None):
        """
        Plot masking effects across different epsilon values.

        Draws a 2x2 figure: masking rate, preservation rate, per-type token
        counts and number of entities found, each against epsilon.

        Args:
            df: Analysis results DataFrame (uses cache if None)
        """
        if df is None:
            df = self.results_cache

        if df.empty:
            print("No analysis results available. Run analyze_text() first.")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('ε-Masking Effects Analysis', fontsize=16)

        # Plots 1 & 2: masking rate and preservation rate vs epsilon
        line_panels = (
            (axes[0, 0], 'masking_rate', 'bo-', 'Masking Rate', 'Masking Rate vs Epsilon'),
            (axes[0, 1], 'preservation_rate', 'go-', 'Preservation Rate', 'Structural Hints Preservation'),
        )
        for ax, column, fmt, y_label, title in line_panels:
            ax.plot(df['epsilon'], df[column], fmt, linewidth=2, markersize=8)
            ax.set_xlabel('Epsilon (ε)')
            ax.set_ylabel(y_label)
            ax.set_title(title)
            ax.grid(True, alpha=0.3)

        # Bar charts are drawn at integer positions (one slot per row) and
        # labelled with the epsilon value afterwards.
        x = np.arange(len(df))

        # Plot 3: Token type distribution (grouped bars)
        token_types = ['masked_tokens', 'entity_tokens', 'function_word_tokens', 'punctuation_tokens']
        width = 0.2

        for i, token_type in enumerate(token_types):
            axes[1, 0].bar(x + i*width, df[token_type], width, label=token_type.replace('_tokens', ''))

        axes[1, 0].set_xlabel('Epsilon Values')
        axes[1, 0].set_ylabel('Token Count')
        axes[1, 0].set_title('Token Type Distribution')
        # Centre the tick under each group of four bars
        axes[1, 0].set_xticks(x + width * 1.5)
        axes[1, 0].set_xticklabels(df['epsilon'])
        axes[1, 0].legend()
        axes[1, 0].grid(True, alpha=0.3)

        # Plot 4: Entities found.
        # Plotting at the raw epsilon positions with the default bar width
        # (0.8) makes bars that are only 0.1-0.2 apart overlap, so use the
        # integer slots here as well.
        axes[1, 1].bar(x, df['entities_found'], color='purple', alpha=0.7)
        axes[1, 1].set_xlabel('Epsilon (ε)')
        axes[1, 1].set_ylabel('Entities Found')
        axes[1, 1].set_title('Named Entities Preserved')
        axes[1, 1].set_xticks(x)
        axes[1, 1].set_xticklabels(df['epsilon'])
        axes[1, 1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def compare_texts(self, texts: List[str], epsilon: float = 0.3):
        """
        Compare masking effects across multiple texts.

        Each text is masked with ``EPSILON_MASKER`` (fixed seed 42) and the
        resulting masking rate, preservation rate and entity count are shown
        as three side-by-side bar charts.

        Args:
            texts: List of texts to compare
            epsilon: Epsilon value to use

        Returns:
            DataFrame with one row per text (columns: text_id, original,
            masked, masking_rate, preservation_rate, entities_found). When
            ``texts`` is empty an empty DataFrame with those columns is
            returned and nothing is plotted.
        """
        columns = ['text_id', 'original', 'masked', 'masking_rate', 'preservation_rate', 'entities_found']
        results = []

        for i, text in enumerate(texts):
            masked_text, stats = EPSILON_MASKER.apply_masking(text, epsilon, seed=42)
            results.append({
                'text_id': i,
                'original': text,
                'masked': masked_text,
                'masking_rate': stats['masking_rate'],
                'preservation_rate': stats['preservation_rate'],
                'entities_found': stats['entities_found']
            })

        # Explicit columns keep the frame well-formed even with no rows.
        df = pd.DataFrame(results, columns=columns)

        if df.empty:
            print("No texts provided for comparison.")
            return df

        # Create comparison plot: (column, y-axis label, title prefix, color)
        panels = (
            ('masking_rate', 'Masking Rate', 'Masking Rate', 'red'),
            ('preservation_rate', 'Preservation Rate', 'Preservation Rate', 'green'),
            ('entities_found', 'Entities Found', 'Named Entities', 'purple'),
        )
        positions = range(len(df))

        fig, axes = plt.subplots(1, 3, figsize=(18, 5))

        for ax, (column, y_label, title_prefix, color) in zip(axes, panels):
            ax.bar(positions, df[column], color=color, alpha=0.7)
            ax.set_xlabel('Text ID')
            ax.set_ylabel(y_label)
            ax.set_title(f'{title_prefix} Comparison (ε={epsilon})')
            ax.set_xticks(positions)
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        return df

    def generate_report(self, df: Optional[pd.DataFrame] = None) -> str:
        """
        Generate a text report of analysis results.

        Args:
            df: Analysis results DataFrame (uses cache if None)

        Returns:
            Formatted report string, or "No analysis results available." when
            there is nothing to report.
        """
        if df is None:
            df = self.results_cache

        if df.empty:
            return "No analysis results available."

        separator = "=" * 60

        # Union of the per-row entity type lists
        all_entity_types = set()
        for entity_types in df['entity_types']:
            all_entity_types.update(entity_types)

        report = [
            separator,
            "TOKENIZATION ANALYSIS REPORT",
            separator,
            f"\nAnalyzed {len(df)} different epsilon values:",
            f"Epsilon range: {df['epsilon'].min():.1f} - {df['epsilon'].max():.1f}",
            "\nMASKING STATISTICS:",
            f"  Average masking rate: {df['masking_rate'].mean():.3f}",
            f"  Average preservation rate: {df['preservation_rate'].mean():.3f}",
            f"  Total entities found: {df['entities_found'].sum()}",
            "\nTOKEN BREAKDOWN:",
            f"  Total tokens analyzed: {df['total_tokens'].sum()}",
            f"  Masked tokens: {df['masked_tokens'].sum()}",
            f"  Entity tokens: {df['entity_tokens'].sum()}",
            f"  Function word tokens: {df['function_word_tokens'].sum()}",
            f"  Punctuation tokens: {df['punctuation_tokens'].sum()}",
            "\nENTITY TYPES FOUND:",
            f"  {', '.join(sorted(all_entity_types))}",
            "\n" + separator,
        ]

        return "\n".join(report)

# Initialize analyzer
# Shared module-level TokenizationAnalyzer instance; the example cells below
# call ANALYZER.analyze_text, plot_masking_effects, compare_texts and
# generate_report on it.
ANALYZER = TokenizationAnalyzer()
print("✅ Tokenization Analyzer ready for use")

## 6. Testing Examples and Demonstrations

Comprehensive examples demonstrating the tokenization pipeline with ε-masking.


# Example 1: Basic ε-masking demonstration
# Runs one sentence through the enhanced tokenizer at several masking
# strengths and prints what the tokenizer reports for each.
print("=" * 80)
print("EXAMPLE 1: Basic ε-masking with GPT-2 tokenization")
print("=" * 80)

sample_text = "The quick brown fox jumps over the lazy dog in New York City."

print(f"Original text: {sample_text}")
print()

# Masking strengths to demonstrate (0.0 = no masking)
epsilon_values = [0.0, 0.2, 0.5, 0.8]

for epsilon in epsilon_values:
    print(f"ε = {epsilon}:")

    # Mask and tokenize in one call; the fixed seed keeps the output repeatable
    result = ENHANCED_TOKENIZER.tokenize_with_structural_hints(sample_text, epsilon=epsilon, seed=42)
    metadata = result['metadata']

    # (label, value) pairs printed as an indented summary
    summary_rows = (
        ("Masked text", metadata['masked_text']),
        ("Final text", metadata['final_text']),
        ("Tokens", len(result['input_ids'])),
        ("Mask positions", len(metadata['mask_positions'])),
        ("Entities found", metadata['entities_found']),
    )
    for label, value in summary_rows:
        print(f"  {label}: {value}")
    print()


# Example 2: GCS Data Loading (if available)
# Lists the bucket, loads the first two files and masks up to three sample
# texts. Any exception raised while talking to GCS is reported, not re-raised.
print("=" * 80)
print("EXAMPLE 2: GCS Data Loading and Processing")
print("=" * 80)

if GCS_LOADER is None:
    print("GCS not available - skipping GCS examples")
else:
    try:
        # List files in bucket
        files = GCS_LOADER.list_files()
        print(f"Found {len(files)} files in bucket")

        if not files:
            print("No files found in bucket")
        else:
            # Get file info for a sample of the first 5 files
            info = GCS_LOADER.get_file_info(files[:5])
            print("Sample file info:")
            print(f"  Total files: {info['total_files']}")
            print(f"  Sample files: {info['sample_files']}")
            print(f"  Average file size: {info.get('avg_file_size_mb', 0):.2f} MB")

            # Load a small sample (first 2 files)
            print("\nLoading sample data...")
            sample_df = GCS_LOADER.load_batch_files(files[:2])

            if sample_df.empty:
                print("No data loaded")
            else:
                print(f"Loaded {len(sample_df)} rows")
                print(f"Columns: {list(sample_df.columns)}")

                # Process a few samples, only if there is a 'text' column
                if 'text' in sample_df.columns:
                    sample_texts = sample_df['text'].head(3).tolist()
                    print("\nProcessing sample texts with ε-masking:")

                    for sample_no, text in enumerate(sample_texts, start=1):
                        # Skip non-string values and texts of 50 chars or fewer
                        if not (isinstance(text, str) and len(text) > 50):
                            continue

                        print(f"\nSample {sample_no}:")
                        print(f"Original: {text[:100]}...")

                        # Apply masking
                        masked_text, stats = EPSILON_MASKER.apply_masking(text, 0.3, seed=42)
                        print(f"Masked: {masked_text[:100]}...")
                        print(f"Masking rate: {stats['masking_rate']:.3f}")
                        print(f"Preservation rate: {stats['preservation_rate']:.3f}")
                        print(f"Entities found: {stats['entities_found']}")

    except Exception as e:
        print(f"Error accessing GCS: {e}")


# Example 3: Comprehensive Analysis with Visualization
# Sweeps epsilon over several texts, then plots the first text's sweep,
# compares all texts at ε=0.3 and prints a report for the first text.
print("=" * 80)
print("EXAMPLE 3: Comprehensive Analysis with Visualization")
print("=" * 80)

# Test texts with different characteristics
test_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "John Smith works at Microsoft Corporation in Seattle, Washington.",
    "The meeting is scheduled for January 15, 2024 at 3:00 PM.",
    "She bought a car for $25,000 from Toyota Motors Inc."
]

print("Analyzing texts with different epsilon values...")

# Analyze each text
for text_no, text in enumerate(test_texts, start=1):
    print(f"\nText {text_no}: {text}")

    # Run analysis
    df = ANALYZER.analyze_text(text, epsilon_values=[0.0, 0.1, 0.3, 0.5, 0.7])

    # Show one summary line per epsilon value
    print("Results:")
    for _, row in df.iterrows():
        metrics = ", ".join((
            f"masking_rate={row['masking_rate']:.3f}",
            f"preservation_rate={row['preservation_rate']:.3f}",
            f"entities={row['entities_found']}",
        ))
        print(f"  ε={row['epsilon']}: {metrics}")

print("\nGenerating visualization...")
# Create visualization for the first text
df = ANALYZER.analyze_text(test_texts[0], epsilon_values=[0.0, 0.1, 0.3, 0.5, 0.7])
ANALYZER.plot_masking_effects(df)

print("\nGenerating comparison across texts...")
# Compare texts
comparison_df = ANALYZER.compare_texts(test_texts, epsilon=0.3)

print("\nAnalysis Report:")
report = ANALYZER.generate_report(df)
print(report)


In [None]:
# Example 4: Performance Benchmarking
# Prints the section banner for the benchmarking example.
print("=" * 80)
print("EXAMPLE 4: Performance Benchmarking")
print("=" * 80)

# NOTE: `time` is already imported in the setup imports at the top of the
# file; this re-import is redundant but harmless.
import time

def benchmark_tokenization(text: str, epsilon_values: List[float], iterations: int = 10):
    """
    Benchmark tokenization performance across different epsilon values.

    For every epsilon, ``ENHANCED_TOKENIZER.tokenize_with_structural_hints``
    is called ``iterations`` times on ``text`` (fixed seed 42) and the
    wall-clock duration of each call is recorded.

    Args:
        text: Text to tokenize on every iteration.
        epsilon_values: Epsilon values to benchmark.
        iterations: Number of timed calls per epsilon value (must be >= 1).

    Returns:
        DataFrame with one row per epsilon and the columns ``epsilon``,
        ``avg_time_ms``, ``std_time_ms``, ``tokens_per_sec`` and
        ``sequence_length``. The last two are derived from the result of the
        final iteration.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    # Without at least one run there is no result and no timing to report.
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")

    results = []

    for epsilon in epsilon_values:
        times = []
        result = None

        for _ in range(iterations):
            # perf_counter is the high-resolution clock intended for measuring
            # short durations; time.time() can be coarse and may jump.
            start_time = time.perf_counter()

            # Apply masking and tokenization
            result = ENHANCED_TOKENIZER.tokenize_with_structural_hints(
                text,
                epsilon=epsilon,
                seed=42
            )

            times.append(time.perf_counter() - start_time)

        avg_time = float(np.mean(times))
        std_time = float(np.std(times))
        sequence_length = len(result['input_ids'])

        results.append({
            'epsilon': epsilon,
            'avg_time_ms': avg_time * 1000,
            'std_time_ms': std_time * 1000,
            # Guard against a zero mean (clock resolution) instead of dividing by 0
            'tokens_per_sec': sequence_length / avg_time if avg_time > 0 else float('inf'),
            'sequence_length': sequence_length
        })

    return pd.DataFrame(results)

# Benchmark with a longer text
long_text = """
The artificial intelligence revolution has transformed numerous industries and continues to shape our daily lives. 
Machine learning algorithms, powered by vast datasets and computational resources, have achieved remarkable breakthroughs 
in natural language processing, computer vision, and autonomous systems. Companies like Google, Microsoft, and OpenAI 
have invested billions of dollars in developing advanced AI models that can understand, generate, and manipulate human language 
with unprecedented sophistication. These developments raise important questions about the future of work, privacy, and 
the ethical implications of increasingly powerful AI systems.
"""

print("Benchmarking tokenization performance...")
print(f"Text length: {len(long_text)} characters")

# Run benchmark: 5 timed iterations for each epsilon value
benchmark_df = benchmark_tokenization(long_text, [0.0, 0.1, 0.3, 0.5, 0.7], iterations=5)

print("\nBenchmark Results:")
print(benchmark_df.to_string(index=False))

# Create performance visualization: latency on the left, throughput on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
time_ax, throughput_ax = axes
epsilon_column = benchmark_df['epsilon']

# Plot 1: Processing time vs epsilon (error bars show the standard deviation)
time_ax.errorbar(
    epsilon_column,
    benchmark_df['avg_time_ms'],
    yerr=benchmark_df['std_time_ms'],
    marker='o',
    capsize=5,
)

# Plot 2: Tokens per second vs epsilon
throughput_ax.plot(epsilon_column, benchmark_df['tokens_per_sec'], 'go-', linewidth=2, markersize=8)

# Both panels share the x label and grid styling
panel_labels = (
    (time_ax, 'Processing Time (ms)', 'Tokenization Performance vs Epsilon'),
    (throughput_ax, 'Tokens per Second', 'Throughput vs Epsilon'),
)
for ax, y_label, title in panel_labels:
    ax.set_xlabel('Epsilon (ε)')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Rows with the lowest / highest mean latency
avg_times = benchmark_df['avg_time_ms']
fastest_epsilon = benchmark_df.loc[avg_times.idxmin(), 'epsilon']
slowest_epsilon = benchmark_df.loc[avg_times.idxmax(), 'epsilon']

print("\nPerformance Summary:")
print(f"  Fastest processing: ε={fastest_epsilon:.1f}")
print(f"  Slowest processing: ε={slowest_epsilon:.1f}")
print(f"  Average tokens/sec: {benchmark_df['tokens_per_sec'].mean():.1f}")
