# SciTeX I/O Operations

This notebook demonstrates advanced I/O features in SciTeX, including:
- Multiple format support
- Compression options
- Caching mechanisms
- Batch operations
- HDF5 hierarchical storage

In [None]:
import scitex
import numpy as np
import pandas as pd
from pathlib import Path
import time

# Seed NumPy's global RNG so the random demo data drawn via np.random is the
# same on every "Restart Kernel -> Run All".
np.random.seed(42)

# Create output directory.
# The path is made absolute on purpose: code below calls stat()/glob()/load()
# on exactly the paths handed to scitex.io.save, so the location must not
# depend on how a relative path gets interpreted.
# NOTE(review): scitex.io.save appears to redirect *relative* paths into its
# own script/notebook output folder - confirm against the installed version.
output_dir = Path("io_demo").resolve()
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Working directory: {output_dir}")

## 1. Format Auto-Detection

SciTeX automatically detects the appropriate format from the file extension, so the same `scitex.io.save` call is used for every format below.

In [None]:
# Build one object per data kind. The pieces are created in the same order
# as they appear in the combined dict (array, then frame columns, then
# metadata), so the sequence of random draws is unchanged.
n_rows = 1000
array_part = np.random.randn(100, 50)
frame_part = pd.DataFrame({
    'x': np.random.randn(n_rows),
    'y': np.random.randn(n_rows),
    'category': np.random.choice(['A', 'B', 'C'], n_rows),
})
meta_part = {
    'experiment': 'io_demo',
    'timestamp': time.time(),
    'parameters': {'alpha': 0.5, 'beta': 1.0},
}
sample_data = {
    'array': array_part,
    'dataframe': frame_part,
    'metadata': meta_part,
}

# Extension -> object to store under that extension.
formats = {
    'pkl': sample_data,   # Pickle: arbitrary Python objects
    'json': meta_part,    # JSON: plain, human-readable metadata
    'npy': array_part,    # NumPy: a single array
    'csv': frame_part,    # CSV: tabular data
    'h5': sample_data,    # HDF5: nested / hierarchical data
}

# Write each object and report the resulting on-disk size in KB.
print("Saving in different formats:")
for suffix, payload in formats.items():
    target = output_dir / f"sample.{suffix}"
    scitex.io.save(payload, target)
    written_kb = target.stat().st_size / 1024
    print(f"  {suffix:4} - {written_kb:6.1f} KB")

## 2. Compression Support

SciTeX supports automatic compression for space-efficient storage: add a `.gz` suffix to the file name (for example, `.pkl.gz`).

In [None]:
# Number of bytes in one MB as used by every size printed in this cell.
bytes_per_mb = 1024**2

# A 10000 x 100 float array serves as the payload for the size comparison.
large_data = np.random.randn(10000, 100)
print(f"Data size in memory: {large_data.nbytes / bytes_per_mb:.1f} MB")

# Same array, two destinations: plain .npy and gzip-suffixed pickle.
uncompressed_path = output_dir / "large_data.npy"
compressed_path = output_dir / "large_data_compressed.pkl.gz"

# Save each variant and record its on-disk size (MB) under a short label.
compression_results = {}
for label, destination in (('none', uncompressed_path), ('gzip', compressed_path)):
    scitex.io.save(large_data, destination)
    compression_results[label] = destination.stat().st_size / bytes_per_mb

uncompressed_size = compression_results['none']
compressed_size = compression_results['gzip']

# Report every variant relative to the uncompressed file.
print("\nCompression results:")
for method, size in compression_results.items():
    if size > 0:
        ratio = uncompressed_size / size
    else:
        ratio = 0  # avoid dividing by an empty file's size
    print(f"  {method:12} - {size:6.2f} MB (ratio: {ratio:.1f}x)")

## 3. Caching for Performance

SciTeX provides caching to speed up repeated operations.

In [None]:
# Wrap a slow function with SciTeX's disk-cache decorator.
@scitex.decorators.cache_disk
def expensive_computation(n_samples=1000, n_features=50):
    """Stand-in for a slow job.

    Announces the call, sleeps for one second, then draws an
    (n_samples, n_features) standard-normal matrix and summarises it.

    Parameters
    ----------
    n_samples : int
        Number of rows to draw.
    n_features : int
        Number of columns to draw.

    Returns
    -------
    dict
        'mean' and 'std' hold the per-column mean and standard deviation;
        'correlation' holds np.corrcoef of the transposed matrix.
    """
    print(f"Computing with n_samples={n_samples}, n_features={n_features}...")
    time.sleep(1)  # placeholder for real work

    samples = np.random.randn(n_samples, n_features)
    column_means = samples.mean(axis=0)
    column_stds = samples.std(axis=0)
    # Transposed so that each column (feature) is treated as one variable.
    feature_corr = np.corrcoef(samples.T)
    return {
        'mean': column_means,
        'std': column_stds,
        'correlation': feature_corr,
    }

# Time two identical calls to show the effect of the disk cache.
# time.perf_counter() is used instead of time.time(): it is monotonic and has
# the highest available resolution, which matters because the cached call
# can be extremely short.

# First call - expected to run the function body.
# NOTE(review): if a cache entry for (500, 20) is already on disk from an
# earlier session, this call may be served from the cache as well.
print("First call (computing):")
start = time.perf_counter()
result1 = expensive_computation(500, 20)
time1 = time.perf_counter() - start
print(f"  Time: {time1:.3f} seconds")

# Second call - same arguments, so the result should come from the cache.
print("\nSecond call (from cache):")
start = time.perf_counter()
result2 = expensive_computation(500, 20)
time2 = time.perf_counter() - start
print(f"  Time: {time2:.3f} seconds")

# Guard the ratio: a measured duration of exactly 0 would otherwise raise
# ZeroDivisionError and abort the notebook run.
if time2 > 0:
    print(f"  Speedup: {time1/time2:.1f}x")
else:
    print("  Speedup: too fast to measure")

## 4. Batch Operations

Process multiple files efficiently.

In [None]:
# Dedicated sub-directory for the batch files.
batch_dir = output_dir / "batch"
batch_dir.mkdir(exist_ok=True)

# Write five small pickles, one record per file.
print("Creating batch files...")
for index in range(5):
    record = {
        'id': index,
        'values': np.random.randn(100),
        'timestamp': time.time() + index,
    }
    scitex.io.save(record, batch_dir / f"data_{index:03d}.pkl")

# Read the files back in name order, keeping every loaded record.
print("\nLoading batch files...")
batch_data = []
for pkl_path in sorted(batch_dir.glob("data_*.pkl")):
    loaded = scitex.io.load(pkl_path)
    batch_data.append(loaded)
    print(f"  Loaded {pkl_path.name} - ID: {loaded['id']}")

# Stack each record's 'values' vector as one row of a 2-D array.
value_rows = [entry['values'] for entry in batch_data]
all_values = np.vstack(value_rows)
print(f"\nCombined data shape: {all_values.shape}")

## 5. HDF5 for Hierarchical Data

HDF5 is perfect for storing complex, hierarchical scientific data.

In [None]:
# Per-condition parameters; the key order fixes the order in which the
# random arrays below are drawn (control, treatment_A, treatment_B).
condition_settings = {
    'control': {'temperature': 20, 'pressure': 1.0},
    'treatment_A': {'temperature': 25, 'pressure': 1.2},
    'treatment_B': {'temperature': 30, 'pressure': 1.5},
}

# Nested experiment record: metadata, one sub-tree per condition, and a
# summary table.
experiment_data = {
    'metadata': {
        'name': 'Multi-condition experiment',
        'date': '2025-01-25',
        'researcher': 'SciTeX User',
    },
    'conditions': {
        condition: {
            'raw_data': np.random.randn(1000, 50),
            'processed': np.random.randn(1000, 10),
            'parameters': dict(settings),  # copy, so each entry owns its dict
        }
        for condition, settings in condition_settings.items()
    },
    'analysis': {
        'summary_stats': pd.DataFrame({
            'condition': ['control', 'treatment_A', 'treatment_B'],
            'mean_response': [0.1, 0.5, 0.8],
            'std_response': [0.05, 0.08, 0.12],
        }),
    },
}

# Write the whole structure to a single HDF5 file and report its size.
h5_path = output_dir / "experiment.h5"
scitex.io.save(experiment_data, h5_path)
print(f"Saved hierarchical data to {h5_path}")
h5_size_mb = h5_path.stat().st_size / (1024**2)
print(f"File size: {h5_size_mb:.2f} MB")

# Read it back and print the top two levels of whatever was loaded.
loaded_exp = scitex.io.load(h5_path)
print("\nLoaded structure:")
for key in loaded_exp.keys():
    print(f"  {key}/")
    node = loaded_exp[key]
    if not isinstance(node, dict):
        continue  # nothing nested to list under this key
    for subkey in node.keys():
        print(f"    {subkey}")

## 6. Best Practices Summary

In [None]:
# Format selection guide, written one row per format so each line reads as a
# complete recommendation.
guide_columns = ['Format', 'Extension', 'Best For', 'Human Readable', 'Compression']
guide_rows = [
    ('pickle', '.pkl', 'Complex Python objects', 'No', 'Yes'),
    ('JSON', '.json', 'Configuration, metadata', 'Yes', 'Text'),
    ('CSV', '.csv', 'Tabular data, sharing', 'Yes', 'Text'),
    ('NumPy', '.npy', 'Numeric arrays', 'No', 'No'),
    ('HDF5', '.h5', 'Large hierarchical data', 'No', 'Yes'),
]
format_guide = pd.DataFrame(guide_rows, columns=guide_columns)

print("SciTeX I/O Format Guide:")
print(format_guide.to_string(index=False))

# Performance tips, printed one per line under a heading.
performance_tips = (
    "1. Use caching (@cache_disk) for expensive computations",
    "2. Use HDF5 for large hierarchical datasets",
    "3. Enable compression for large files (add .gz extension)",
    "4. Use batch operations for processing multiple files",
    "5. Choose the right format for your data type",
)
print("\n\nPerformance Tips:")
for tip_line in performance_tips:
    print(tip_line)

## 7. Cleanup

In [None]:
# Clean up demo files
import shutil

# Delete the demo directory tree; the confirmation line is printed only when
# there was something to delete.
demo_dir_present = output_dir.exists()
if demo_dir_present:
    shutil.rmtree(output_dir)
    print(f"✓ Cleaned up {output_dir}")

print("\n🎉 I/O operations demo complete!")