# SciTeX String Utilities (str) Module

This notebook demonstrates the powerful string manipulation utilities provided by the SciTeX `str` module. These utilities are essential for scientific computing tasks involving:

- **Text Formatting**: Color coding, formatting for terminal output
- **Scientific Notation**: LaTeX rendering, mathematical formatting
- **Plot Text**: Axis labels, titles, and scientific text formatting
- **Path Operations**: Clean path strings, remove special characters
- **Search and Replace**: Pattern matching, text manipulation
- **API Security**: Mask sensitive information in logs

The `str` module (accessed as `stx.str` throughout this notebook) provides specialized string operations tailored for scientific computing and publication-ready output.

## Installation and Setup

In [None]:
import re
import shutil
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scitex as stx

# Fix seeds through SciTeX's reproducibility helper before anything random runs
stx.repro.fix_seeds(42)

# Banner: which demo this is and which SciTeX version is installed
print(
    "SciTeX String Utilities (str) Module Demonstration",
    f"SciTeX version: {stx.__version__}",
    sep="\n",
)

# Scratch directory for files written by later cells
work_dir = Path('./temp_str_demo')
work_dir.mkdir(exist_ok=True)

## 1. Text Coloring and Terminal Output

Format text with colors for better visibility in terminal output and logs.

In [None]:
# Text coloring demo: colored status lines, the ct() shorthand, a colored
# p-value summary, and removal of the ANSI codes again.
print("=== Text Coloring and Formatting ===")

# 1) color_text(): one (message, color) pair per status line
print("\n1. Basic color formatting:")
status_lines = [
    ("SUCCESS: Experiment completed", "green"),
    ("WARNING: Memory usage high", "yellow"),
    ("ERROR: File not found", "red"),
    ("INFO: Processing data...", "blue"),
]
for message, color in status_lines:
    print(stx.str.color_text(message, color))

# 2) ct(): called here with the same (text, color) arguments as color_text()
print("\n2. Using shorthand ct() function:")
shorthand_lines = [
    ("✓ Test passed", "green"),
    ("⚠ Check results", "yellow"),
    ("✗ Test failed", "red"),
]
for message, color in shorthand_lines:
    print(stx.str.ct(message, color))

# 3) Pick label and color from the p-value thresholds, then color once
print("\n3. Scientific result formatting:")
p_value = 0.003
if p_value < 0.001:
    label, color = "p < 0.001 ***", "green"
elif p_value < 0.01:
    label, color = f"p = {p_value:.3f} **", "green"
elif p_value < 0.05:
    label, color = f"p = {p_value:.3f} *", "yellow"
else:
    label, color = f"p = {p_value:.3f} (n.s.)", "gray"
result = stx.str.color_text(label, color)
print(f"Statistical significance: {result}")

# 4) Strip the color codes again (e.g. before writing text to a file);
#    repr() makes the escape characters visible in the output
colored_text = stx.str.color_text("This is colored text", "blue")
clean_text = stx.str.remove_ansi(colored_text)
print("\n4. ANSI removal:")
print(f"   With color codes: {colored_text[:20]!r}...")
print(f"   Clean text: {clean_text!r}")

## 2. Scientific and LaTeX Text Formatting

Format mathematical expressions and scientific notation for publications.

In [None]:
# LaTeX demo: style conversion, hats, safe rendering, scientific notation,
# and a query of the LaTeX status dictionary.
print("=== LaTeX and Scientific Text Formatting ===")

# 1) Plain names such as 'alpha' passed through to_latex_style()
print("\n1. LaTeX style conversion:")
variable_names = ['alpha', 'beta', 'gamma', 'theta', 'lambda']
for var in variable_names:
    print(f"   {var} → {stx.str.to_latex_style(var)}")

# 2) Same idea with add_hat_in_latex_style()
print("\n2. Adding hats for vectors/estimators:")
estimators = ['x', 'theta', 'beta']
for est in estimators:
    print(f"   {est} → {stx.str.add_hat_in_latex_style(est)}")

# 3) Raw LaTeX expressions passed through safe_latex_render()
print("\n3. Safe LaTeX rendering with fallback:")
expressions = [
    r"\alpha + \beta",
    r"\sum_{i=1}^{n} x_i",
    r"\frac{\partial f}{\partial x}",
    r"\int_{0}^{\infty} e^{-x} dx"
]
for expr in expressions:
    print(f"   {expr} → {stx.str.safe_latex_render(expr)}")

# 4) Each number is first rendered as a '<mantissa>e<exponent>' string
#    (three decimals) and that string is handed to scientific_text()
print("\n4. Scientific text formatting:")
numbers = [1234567, 0.00012345, 1.23e-15, 9.87e23]
for num in numbers:
    exponent_form = f"{num:.3e}"
    print(f"   {num} → {stx.str.scientific_text(exponent_form)}")

# 5) Report two entries of the status dictionary
print("\n5. LaTeX system check:")
latex_status = stx.str.get_latex_status()
for caption, status_key in (
    ("LaTeX available", 'available'),
    ("Fallback mode", 'fallback_mode'),
):
    print(f"   {caption}: {latex_status[status_key]}")

## 3. Plot Text Formatting

Format axis labels, titles, and scientific notation for publication-quality plots.

In [None]:
# Plot-text demo: axis labels, titles, tick formatting, then one figure
# that uses all three and is saved into work_dir.
print("=== Plot Text Formatting ===")

# 1) Axis labels: each entry is (key, unit, label); only unit and label
#    are passed to format_axis_label()
print("\n1. Axis label formatting:")
axis_examples = [
    ('time', 's', 'Time'),
    ('frequency', 'Hz', 'Frequency'),
    ('power', 'mW', 'Power'),
    ('voltage', 'µV', 'Voltage'),
    ('temperature', '°C', 'Temperature')
]
for _key, unit, label in axis_examples:
    formatted_label = stx.str.format_axis_label(label, unit)
    print(f"   {label} with unit {unit} → {formatted_label}")

# 2) Titles built from snake_case identifiers
print("\n2. Plot title formatting:")
titles = [
    "neural_network_accuracy_vs_epochs",
    "power_spectral_density_analysis",
    "correlation_matrix_features"
]
for title in titles:
    formatted = stx.str.format_title(title)
    print(f"   {title}\n   → {formatted}")

# 3) Tick formatter applied directly to a few large values
print("\n3. Smart tick formatting:")
large_numbers = [1_000, 1_000_000, 1_500_000, 2.5e9, 3.7e12]
formatter = stx.str.smart_tick_formatter()
for num in large_numbers:
    print(f"   {num} → {formatter(num)}")

# 4) Example figure: exponentially decaying sine scaled by 1e6
print("\n4. Example plot with formatted text:")
fig, ax = plt.subplots(figsize=(8, 6))

x = np.linspace(0, 10, 100)
envelope = np.exp(-x / 3)
oscillation = np.sin(2 * np.pi * x)
y = 1e6 * envelope * oscillation
ax.plot(x, y)

# Build the three text elements first, then attach them to the axes
x_label = stx.str.format_axis_label('Time', 's')
y_label = stx.str.format_axis_label('Amplitude', 'µV')
plot_title = stx.str.format_title('exponential_decay_oscillation')
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.set_title(plot_title)

# A fresh smart tick formatter drives the y-axis tick labels
y_tick_formatter = plt.FuncFormatter(stx.str.smart_tick_formatter())
ax.yaxis.set_major_formatter(y_tick_formatter)

plt.tight_layout()
plt.savefig(work_dir / 'formatted_plot.png', dpi=150)
plt.show()

print("   Plot saved with formatted labels and smart tick formatting")

## 4. Path String Operations

Clean and manipulate path strings for cross-platform compatibility.

In [None]:
# Path-string demo, part 1: pass deliberately messy paths (doubled
# separators, '.' and '..' segments, backslashes) through clean_path().
print("=== Path String Operations ===")

print("\n1. Path cleaning:")
messy_paths = [
    "./data//raw/../processed/file.csv",
    "C:\\Users\\Name\\Documents\\..\\Projects\\data.txt",
    "/home/user//project/./scripts/../results/output.png",
    "data\\experiments\\\\trial_01\\\\results.json"
]

for path in messy_paths:
    cleaned = stx.str.clean_path(path)
    # One print per path: original, cleaned, then a blank separator line
    print(f"   Original: {path}\n   Cleaned:  {cleaned}\n")

# Extract components from paths
print("2. Path component extraction using string operations:")
file_paths = [
    "experiments/2024_03_15/results/accuracy_plot.png",
    "data/processed/subject_001_session_02.csv",
    "models/neural_network_v2.3.pkl"
]

for path in file_paths:
    # The file name is whatever follows the last '/'
    filename = path.rsplit('/', 1)[-1]

    # Split on the LAST dot only. Splitting on every dot and keeping the
    # first piece would truncate names that contain dots themselves, e.g.
    # "neural_network_v2.3.pkl" would lose the ".3" of its version.
    stem, dot, suffix = filename.rpartition('.')
    if dot and stem:
        base_name, extension = stem, suffix
    else:
        # No dot, or only a leading dot (hidden file): no extension
        base_name, extension = filename, ''

    print(f"   Path: {path}")
    print(f"   Base name: {base_name}")
    print(f"   Extension: {extension}")
    print()

## 5. Search and Pattern Matching

Search for patterns in text and code using grep-like functionality.

In [None]:
# Search demo: run grep() and search() over a small embedded code sample.
print("=== Search and Pattern Matching ===")

# Text to search; it is only ever treated as a string, never executed
sample_code = '''import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def process_data(data, normalize=True):
    """Process experimental data with optional normalization."""
    if normalize:
        data = (data - data.mean()) / data.std()
    return data

def train_model(X_train, y_train, model_type='svm'):
    """Train machine learning model."""
    if model_type == 'svm':
        from sklearn.svm import SVC
        model = SVC(kernel='rbf')
    elif model_type == 'rf':
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(n_estimators=100)
    
    model.fit(X_train, y_train)
    return model
'''

# 1) grep(): report the match count per keyword and preview two matches.
#    Each match is indexed by 'line_number' and 'line'.
print("\n1. Grep-like pattern search:")
patterns = ['def', 'import', 'model', 'data']
for pattern in patterns:
    matches = stx.str.grep(sample_code, pattern)
    print(f"   Pattern '{pattern}': {len(matches)} matches")
    for hit in matches[:2]:
        print(f"     Line {hit['line_number']}: {hit['line'].strip()}")

# 2) search() with regular expressions; results expose .group()
print("\n2. Advanced pattern search:")

# Function definitions: capture the name between 'def' and '('
func_pattern = r'def\s+(\w+)\s*\('
func_matches = stx.str.search(sample_code, func_pattern)
print(f"   Function definitions found: {len(func_matches)}")
for found in func_matches:
    print(f"     {found.group(1)}()")

# 'from X import Y' statements: capture module and imported names
import_pattern = r'from\s+([\w.]+)\s+import\s+([\w, ]+)'
import_matches = stx.str.search(sample_code, import_pattern)
print(f"\n   Import statements found: {len(import_matches)}")
for found in import_matches:
    print(f"     from {found.group(1)} import {found.group(2)}")

## 6. Text Parsing and Processing

Parse structured text data and perform advanced text processing.

In [None]:
# Parsing demo: split a log line, read key=value pairs, apply regex
# replacements, and collapse repeated spaces.
print("=== Text Parsing and Processing ===")

# 1) Log line -> (timestamp, level, message) via three capture groups
print("\n1. Parsing structured text:")
log_entry = "2024-03-15 14:23:45 [INFO] Experiment started - subject_id: 42, condition: A"
log_pattern = r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) \[(\w+)\] (.+)'
parsed = stx.str.parse(log_entry, pattern=log_pattern)
if parsed:
    timestamp, level, message = parsed.groups()
    for caption, captured in (
        ("Timestamp", timestamp),
        ("Level", level),
        ("Message", message),
    ):
        print(f"   {caption}: {captured}")

# 2) key=value pairs whose values consist of digits and dots
config_string = "epochs=100, batch_size=32, learning_rate=0.001, dropout=0.2"
kv_pattern = r'(\w+)=([\d.]+)'
kv_matches = stx.str.search(config_string, kv_pattern, all_matches=True)
print("\n2. Parsing configuration string:")
for kv_match in kv_matches:
    key, value = kv_match.groups()
    print(f"   {key}: {value}")

# 3) Apply each regex -> replacement rule in turn to the running text
print("\n3. Pattern-based string replacement:")
text = "The temperature was 25C and the pressure was 1013hPa"
replacements = {
    r'(\d+)C': r'\1°C',
    r'(\d+)hPa': r'\1 hPa',
}
modified_text = text
for pattern, replacement in replacements.items():
    modified_text = stx.str.replace(modified_text, pattern, replacement)
print(f"   Original: {text}\n   Modified: {modified_text}")

# 4) Runs of spaces passed through squeeze_spaces()
print("\n4. Space normalization:")
messy_text = "This   text    has     too    many      spaces"
clean_text = stx.str.squeeze_spaces(messy_text)
print(f"   Original: '{messy_text}'\n   Cleaned:  '{clean_text}'")

## 7. Security and API Masking

Mask sensitive information like API keys in logs and output.

In [None]:
# Masking demo, part 1: run mask_api() over log lines that embed
# (made-up) credentials and show the first 50 characters before and after.
print("=== Security and API Masking ===")

print("\n1. API key masking:")
log_messages = [
    "Connecting to API with key: sk-1234567890abcdef1234567890abcdef",
    "Authorization: Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxMjM0NTY3ODkwIiwibmFtZSI6IkpvaG4gRG9lIiwiaWF0IjoxNTE2MjM5MDIyfQ",
    "Database password: p@ssw0rd123!@#",
    "AWS Access Key: AKIAIOSFODNN7EXAMPLE"
]

for log in log_messages:
    masked = stx.str.mask_api(log)
    # Original, masked, then a blank separator line in a single print
    print(f"   Original: {log[:50]}...\n   Masked:   {masked[:50]}...\n")

# Custom masking patterns: one hand-written rule per kind of value
print("2. Custom sensitive data masking:")
sensitive_data = {
    "email": "john.doe@example.com",
    "phone": "+1-555-123-4567",
    "ssn": "123-45-6789",
    "credit_card": "4532-1234-5678-9012"
}


def _mask_email(address):
    """Keep the first two characters of the local part and the domain."""
    pieces = address.split('@')
    return f"{pieces[0][:2]}****@{pieces[1]}"


# data type -> function that turns the raw value into its masked form
maskers = {
    "email": _mask_email,
    # Keep the first five characters, hide the rest
    "phone": lambda number: f"{number[:5]}***-****",
    # Keep only the last four characters
    "ssn": lambda number: f"***-**-{number[-4:]}",
    "credit_card": lambda number: f"****-****-****-{number[-4:]}",
}

for data_type, value in sensitive_data.items():
    masked = maskers[data_type](value)
    print(f"   {data_type}: {value} → {masked}")

# Safe logging function
print("\n3. Safe logging example:")


def safe_log(message, sensitive_patterns=None):
    """Return a copy of ``message`` with sensitive parts masked.

    The message is first passed through ``stx.str.mask_api``. Every regular
    expression in ``sensitive_patterns`` is then substituted with the
    literal ``***MASKED***``. Nothing is written anywhere; the caller
    decides what to do with the returned string.

    Args:
        message: Text to sanitize.
        sensitive_patterns: Optional iterable of regex patterns; ``None``
            or an empty collection skips the substitution step.

    Returns:
        The sanitized text.
    """
    sanitized = stx.str.mask_api(message)

    # Falsy input (None, empty list) means there are no extra patterns
    for regex in sensitive_patterns or ():
        sanitized = re.sub(regex, '***MASKED***', sanitized)

    return sanitized

# Try safe_log() on a line containing an e-mail address and an API key;
# the extra pattern targets the e-mail address
test_log = "User john.doe@example.com logged in with API key sk-abc123def456"
email_pattern = r'\b[\w.]+@[\w.]+\b'
safe_version = safe_log(test_log, [email_pattern])
print(f"   Original: {test_log}\n   Safe:     {safe_version}")

## 8. Utility Functions

Additional string utility functions for scientific computing.

In [None]:
# Utility demo, part 1: byte counts passed through readable_bytes()
print("=== String Utility Functions ===")

print("\n1. Human-readable byte sizes:")
# Powers of 1024 plus one arbitrary value
file_sizes = [1024, 1024 ** 2, 1024 ** 3, 1024 ** 4, 1234567890]
for size in file_sizes:
    readable = stx.str.readable_bytes(size)
    # Right-align the raw count in a 15-character column
    print(f"   {size:>15} bytes → {readable}")

# Debug printing
print("\n2. Debug printing with context:")


def process_data(data):
    """Return the mean of ``data`` along axis 0, with debug output.

    ``stx.str.print_debug`` is called once before the computation with the
    input's ``shape`` and once afterwards with the result's ``shape``.

    Args:
        data: Object providing ``.shape`` and ``.mean(axis=0)``.

    Returns:
        The value of ``data.mean(axis=0)``.
    """
    stx.str.print_debug("Starting data processing", data.shape)

    axis0_mean = data.mean(axis=0)

    stx.str.print_debug("Processing complete", axis0_mean.shape)
    return axis0_mean

# Run process_data() on a random 100 x 10 array to trigger its debug output
test_data = np.random.randn(100, 10)
result = process_data(test_data)

# 3) Titles in different casings passed through decapitalize()
print("\n3. String case manipulation:")
titles = [
    "NEURAL NETWORK ARCHITECTURE",
    "Machine Learning Results",
    "DATA PREPROCESSING PIPELINE"
]
for title in titles:
    decap = stx.str.decapitalize(title)
    print(f"   {title}\n   → {decap}")

# 4) Frame a few result lines between two printc() banners
print("\n4. Block printing for emphasis:")
banner_options = dict(color="green", style="block")
stx.str.printc("EXPERIMENT RESULTS", **banner_options)
for metric_line in ("Accuracy: 95.3%", "F1 Score: 0.947", "AUC-ROC: 0.983"):
    print(metric_line)
stx.str.printc("END OF RESULTS", **banner_options)

## 9. Integration Example: Scientific Report Generation

Demonstrate how string utilities work together for report generation.

In [None]:
# Complete scientific report generation example
print("=== Scientific Report Generation Example ===")


class ScientificReportGenerator:
    """Accumulate formatted text sections and join them into a report.

    Each ``add_*`` method appends one string to ``self.sections``.
    ``display()`` joins the sections as stored (including whatever
    ``stx.str.color_text`` produced); ``generate()`` joins them after
    passing each one through ``stx.str.remove_ansi``.
    """

    # Horizontal rule placed above and below the header title
    _RULE = '=' * 70
    # Metrics whose values are color-coded by add_results()
    _SCORE_METRICS = ('accuracy', 'precision', 'recall', 'f1_score')

    def __init__(self, experiment_name):
        """Store the experiment name and start with no sections."""
        self.experiment_name = experiment_name
        self.sections = []

    def add_header(self):
        """Append a header: the upper-cased, blue title between two rules."""
        title = stx.str.format_title(self.experiment_name)
        banner = stx.str.color_text(title.upper(), 'blue')
        self.sections.append(f"\n{self._RULE}\n{banner}\n{self._RULE}\n")

    @staticmethod
    def _format_value(value):
        """Render one result value as text.

        Floats below 0.01 go through ``stx.str.scientific_text`` (from a
        three-decimal exponent string); other floats get four decimals;
        everything else is converted with ``str()``.
        """
        if not isinstance(value, float):
            return str(value)
        if value < 0.01:
            return stx.str.scientific_text(f"{value:.3e}")
        return f"{value:.4f}"

    def add_results(self, results_dict):
        """Append a RESULTS section with one line per metric.

        Args:
            results_dict: Mapping of metric name to value. For the metrics
                in ``_SCORE_METRICS`` (matched case-insensitively), numeric
                values above 0.9 are colored green and below 0.7 red.
        """
        lines = ["\n## RESULTS\n"]

        for metric, value in results_dict.items():
            metric_label = stx.str.format_title(metric)
            value_str = self._format_value(value)

            is_score = metric.lower() in self._SCORE_METRICS
            if is_score and isinstance(value, (int, float)):
                if value > 0.9:
                    value_str = stx.str.color_text(value_str, 'green')
                elif value < 0.7:
                    value_str = stx.str.color_text(value_str, 'red')

            lines.append(f"  {metric_label}: {value_str}")

        self.sections.append('\n'.join(lines))

    @staticmethod
    def _significance(p_value):
        """Return the p-value text with significance stars.

        Thresholds: below 0.001 (``***``, green), below 0.01 (``**``,
        green), below 0.05 (``*``, yellow); otherwise plain text marked
        ``(n.s.)``.
        """
        if p_value < 0.001:
            return stx.str.color_text("p < 0.001 ***", 'green')
        if p_value < 0.01:
            return stx.str.color_text(f"p = {p_value:.3f} **", 'green')
        if p_value < 0.05:
            return stx.str.color_text(f"p = {p_value:.3f} *", 'yellow')
        return f"p = {p_value:.3f} (n.s.)"

    def add_statistical_tests(self, test_results):
        """Append a STATISTICAL ANALYSIS section.

        Args:
            test_results: Mapping of test name to a dict with the keys
                ``'p_value'``, ``'statistic'`` and ``'effect_size'``.
        """
        lines = ["\n## STATISTICAL ANALYSIS\n"]

        for test_name, test_data in test_results.items():
            heading = stx.str.format_title(test_name)
            significance = self._significance(test_data['p_value'])
            lines.extend([
                f"\n### {heading}",
                f"  Statistical significance: {significance}",
                f"  Test statistic: {test_data['statistic']:.4f}",
                f"  Effect size: {test_data['effect_size']:.3f}",
            ])

        self.sections.append('\n'.join(lines))

    def add_file_info(self, file_paths):
        """Append an OUTPUT FILES section listing each path.

        Existing files are annotated with their size (via
        ``stx.str.readable_bytes``); paths that do not exist are marked
        ``pending``.
        """
        lines = ["\n## OUTPUT FILES\n"]

        for file_path in file_paths:
            shown_path = stx.str.clean_path(file_path)

            # Existence and size are checked on the path as given, not on
            # the cleaned display string
            target = Path(file_path)
            if target.exists():
                detail = stx.str.readable_bytes(target.stat().st_size)
            else:
                detail = "pending"

            lines.append(f"  • {shown_path} ({detail})")

        self.sections.append('\n'.join(lines))

    def generate(self):
        """Return the report with every section passed through remove_ansi()."""
        return '\n'.join(stx.str.remove_ansi(part) for part in self.sections)

    def display(self):
        """Return the report with the sections exactly as stored."""
        return '\n'.join(self.sections)

# Build an example report section by section, show the colored version,
# then write the color-free version to a text file in work_dir.
report = ScientificReportGenerator("neural_network_classification_analysis")
report.add_header()

# Results: metric name -> value (floats and one integer count)
results = {
    'accuracy': 0.9534,
    'precision': 0.9412,
    'recall': 0.9667,
    'f1_score': 0.9538,
    'auc_roc': 0.9823,
    'training_time': 145.67,
    'n_parameters': 1234567
}
report.add_results(results)

# Statistical tests: each entry needs 'p_value', 'statistic', 'effect_size'
stats_results = {
    'model_comparison_t_test': {
        'p_value': 0.0023,
        'statistic': 3.456,
        'effect_size': 0.82
    },
    'permutation_test': {
        'p_value': 0.0001,
        'statistic': 4.789,
        'effect_size': 1.23
    }
}
report.add_statistical_tests(stats_results)

# Output files: these paths are only listed here, never created by this
# cell; add_file_info() marks paths that do not exist as "pending"
output_files = [
    str(work_dir / 'model_weights.pkl'),
    str(work_dir / 'training_history.csv'),
    str(work_dir / 'confusion_matrix.png'),
    str(work_dir / 'roc_curve.pdf')
]
report.add_file_info(output_files)

# Display colored report
print(report.display())

# Save clean version. The report text contains non-ASCII characters (the
# "•" bullets of the file list), so the encoding is fixed to UTF-8 rather
# than left to the platform default, which may be unable to encode them.
report_path = work_dir / 'experiment_report.txt'
with open(report_path, 'w', encoding='utf-8') as f:
    f.write(report.generate())
print(f"\nReport saved to: {report_path}")

## 10. Summary and Best Practices

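The SciTeX `str` module provides comprehensive string utilities for scientific computing. The cell below prints the key functions grouped by category, followed by a list of best practices: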

In [None]:
# Summary: category -> one-line descriptions, printed as a bulleted list
# and followed by a list of best practices.
summary = {
    'Text Formatting': [
        'stx.str.color_text() - Terminal color formatting',
        'stx.str.remove_ansi() - Clean ANSI codes',
        'stx.str.printc() - Block printing with emphasis',
        'stx.str.print_debug() - Debug output with context'
    ],
    'Scientific Notation': [
        'stx.str.to_latex_style() - Convert to LaTeX',
        'stx.str.add_hat_in_latex_style() - Add vector notation',
        'stx.str.scientific_text() - Format scientific notation',
        'stx.str.safe_latex_render() - Safe LaTeX with fallback'
    ],
    'Plot Formatting': [
        'stx.str.format_axis_label() - Axis labels with units',
        'stx.str.format_title() - Clean plot titles',
        'stx.str.smart_tick_formatter() - Intelligent tick labels',
        'stx.str.factor_out_digits() - Factor large numbers'
    ],
    'Text Processing': [
        'stx.str.grep() - Pattern matching in text',
        'stx.str.search() - Advanced regex search',
        'stx.str.parse() - Structured text parsing',
        'stx.str.replace() - Pattern-based replacement'
    ],
    'Path Operations': [
        'stx.str.clean_path() - Normalize path strings',
        'stx.str.squeeze_spaces() - Remove extra spaces',
        'Cross-platform path handling',
        'Safe path string manipulation'
    ],
    'Security': [
        'stx.str.mask_api() - Hide sensitive data',
        'API key detection and masking',
        'Safe logging practices',
        'Data privacy protection'
    ]
}

best_practices = (
    "Use color_text() for important terminal output",
    "Apply mask_api() before logging sensitive data",
    "Format plot text for publication-ready figures",
    "Use LaTeX formatting with fallback for compatibility",
    "Clean paths for cross-platform compatibility",
    "Parse structured text with appropriate patterns",
    "Generate reports with proper formatting and colors",
)

# The same 60-character rule is used under the title and above the
# best-practices list
divider = "=" * 60

print("SciTeX String Utilities (str) Module - Summary")
print(divider)

for category, utilities in summary.items():
    print(f"\n{category}:")
    for utility in utilities:
        print(f"  • {utility}")

print(f"\n{divider}")
print("Best Practices:")
for practice in best_practices:
    print(f"  • {practice}")

print("\nDemo completed successfully! 🎉")

In [None]:
# Clean up: delete the scratch directory (work_dir) and everything the
# earlier cells wrote into it. `shutil` comes from the notebook's import
# cell at the top, so all dependencies are declared in one place.
if work_dir.exists():
    shutil.rmtree(work_dir)
    print(f"Cleaned up: {work_dir}")

print("\nNotebook cleanup completed.")