# SciTeX IO Module - Advanced Features

This notebook demonstrates advanced features of the `scitex.io` module for unified file operations.

In [None]:
import scitex as stx
import numpy as np
import pandas as pd
from pathlib import Path
import json
import yaml

## 1. Format Auto-Detection

SciTeX automatically detects file formats based on extensions.

In [None]:
# Small mixed-type table (ints, strings, floats) reused for every save below
test_data = dict(
    numbers=[1, 2, 3, 4, 5],
    letters=['a', 'b', 'c', 'd', 'e'],
    values=[10.1, 20.2, 30.3, 40.4, 50.5],
)
df = pd.DataFrame(test_data)

# Format label -> target path; only the file extension differs between entries
formats = dict(
    csv='./io_demo/data.csv',
    json='./io_demo/data.json',
    yaml='./io_demo/data.yaml',
    pickle='./io_demo/data.pkl',
    parquet='./io_demo/data.parquet',
    excel='./io_demo/data.xlsx',
)

# Write the same DataFrame once per format. A failure for one format is
# reported and the loop carries on with the next one.
for fmt, path in formats.items():
    try:
        stx.io.save(df, path)
    except Exception as e:
        print(f"✗ Failed to save {fmt}: {e}")
    else:
        print(f"✓ Saved as {fmt}: {path}")

In [None]:
# Read every file that was written and summarise what came back
for fmt, path in formats.items():
    if not Path(path).exists():
        # Nothing on disk for this format, so there is nothing to load
        continue
    try:
        loaded = stx.io.load(path)
        print(f"\n{fmt.upper()}:")
        if not isinstance(loaded, pd.DataFrame):
            # Non-tabular result: show its type and a 50-character preview
            print(f"  Type: {type(loaded)}")
            print(f"  Content: {str(loaded)[:50]}...")
        else:
            print(f"  Shape: {loaded.shape}")
            print(f"  Columns: {list(loaded.columns)}")
    except Exception as e:
        print(f"  Failed to load: {e}")

## 2. Numpy Array Operations

In [None]:
# Field layout for the structured-array example: int32, 1-char string, float32
record_dtype = [('x', 'i4'), ('y', 'U1'), ('z', 'f4')]

# Sample arrays covering several ranks and dtypes
arrays = {
    '1D array': np.array([1, 2, 3, 4, 5]),
    '2D array': np.random.randn(10, 5),
    '3D array': np.random.randn(4, 5, 6),
    'Complex array': np.array([1+2j, 3+4j, 5+6j]),
    'Structured array': np.array([(1, 'a', 2.5), (2, 'b', 3.5)], dtype=record_dtype),
}

# Write each array to its own .npy file (spaces in the label become underscores)
for label, array in arrays.items():
    target = "./io_demo/" + label.replace(' ', '_') + ".npy"
    stx.io.save(array, target)
    shape = getattr(array, 'shape', 'N/A')
    print(f"Saved {label}: shape={shape}, dtype={array.dtype}")

In [None]:
# Bundle two of the arrays plus a small metadata dict into a single .npz file
bundle = {
    'array1': arrays['1D array'],
    'array2': arrays['2D array'],
    'metadata': {'created': '2024-01-01', 'version': 1.0},
}
stx.io.save(bundle, './io_demo/multiple_arrays.npz')

# Read the archive back and report every stored entry.
# NOTE(review): `.files` assumes the loader returns an NpzFile-like object — confirm.
loaded_arrays = stx.io.load('./io_demo/multiple_arrays.npz')
print("Loaded arrays:")
for key in loaded_arrays.files:
    entry = loaded_arrays[key]
    shape = getattr(entry, 'shape', 'scalar')
    print(f"  {key}: shape={shape}, dtype={entry.dtype}")

## 3. Compressed Files

In [None]:
# 1000 x 1000 random matrix, kept both as a raw array and as a DataFrame
large_data = np.random.randn(1000, 1000)
large_df = pd.DataFrame(large_data)

# (target path, compression label) pairs. The label is informational only:
# the save call below receives just the data and the path.
compression_tests = [
    ('./io_demo/large_uncompressed.npy', None),
    ('./io_demo/large_compressed.npy.gz', 'gzip'),
    ('./io_demo/large_compressed.csv.gz', 'gzip'),
    ('./io_demo/large_compressed.pkl.bz2', 'bz2'),
]

for path, compression in compression_tests:
    # numpy targets get the raw array; every other target gets the DataFrame
    is_numpy_target = path.endswith(('.npy', '.npy.gz'))
    data_to_save = large_data if is_numpy_target else large_df

    stx.io.save(data_to_save, path)
    size_mb = Path(path).stat().st_size / 1024 / 1024
    print(f"{path}: {size_mb:.2f} MB")

In [None]:
# Compare on-disk sizes of the files written by the compression tests
import matplotlib.pyplot as plt

# Only files that actually exist contribute a bar
written = [Path(p) for p, _ in compression_tests if Path(p).exists()]
sizes = [f.stat().st_size / 1024 / 1024 for f in written]
labels = [f.name for f in written]

fig, ax = plt.subplots(figsize=(8, 6))
positions = range(len(sizes))
bars = ax.bar(positions, sizes)
ax.set_xticks(range(len(labels)))
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_ylabel('File Size (MB)')
ax.set_title('Compression Comparison')

# Annotate each bar with its size, centred horizontally and slightly above the top
for bar, size in zip(bars, sizes):
    x_center = bar.get_x() + bar.get_width()/2
    y_above = bar.get_height() + 0.1
    ax.text(x_center, y_above, f'{size:.1f}', ha='center', va='bottom')

plt.tight_layout()
stx.io.save(fig, './io_demo/compression_comparison.png')
plt.show()

## 4. Working with HDF5 Files

In [None]:
# Two experiments sharing one layout (data matrix + metadata) and a summary table
experiment1 = {
    'data': np.random.randn(100, 50),
    'metadata': {
        'date': '2024-01-01',
        'parameters': {'alpha': 0.1, 'beta': 0.2},
    },
}
experiment2 = {
    'data': np.random.randn(200, 50),
    'metadata': {
        'date': '2024-01-02',
        'parameters': {'alpha': 0.2, 'beta': 0.3},
    },
}
summary = pd.DataFrame({
    'exp_id': [1, 2],
    'mean_value': [0.05, -0.02],
    'std_value': [0.98, 1.01],
})

hdf5_data = {
    'experiment1': experiment1,
    'experiment2': experiment2,
    'summary': summary,
}

# Write the nested structure to a single HDF5 file
stx.io.save(hdf5_data, './io_demo/experiments.h5')
print("Saved hierarchical data to HDF5")

# Read it back and print the first two levels of the hierarchy
loaded_h5 = stx.io.load('./io_demo/experiments.h5')
print("\nHDF5 structure:")
for key in loaded_h5:
    print(f"  /{key}")
    node = loaded_h5[key]
    if not isinstance(node, dict):
        # Only dict nodes are expanded one level further
        continue
    for subkey in node:
        print(f"    /{key}/{subkey}")

## 5. Configuration Files

In [None]:
# Nested configuration, assembled section by section
project_section = {
    'name': 'SciTeX Demo',
    'version': '1.0.0',
    'authors': ['Alice', 'Bob'],
    'created': '2024-01-01',
}

experiment_section = {
    'parameters': {
        'learning_rate': 0.001,
        'batch_size': 32,
        'epochs': 100,
        'optimizer': {
            'type': 'adam',
            'betas': [0.9, 0.999],
        },
    },
    'data': {
        'train_size': 0.8,
        'validation_size': 0.1,
        'test_size': 0.1,
        'random_seed': 42,
    },
}

paths_section = {
    'data': './data/',
    'models': './models/',
    'results': './results/',
    'figures': './figures/',
}

config = {
    'project': project_section,
    'experiment': experiment_section,
    'paths': paths_section,
}

# Same dict written twice: YAML for people, JSON for programs
stx.io.save(config, './io_demo/config.yaml')
stx.io.save(config, './io_demo/config.json')

# Echo the YAML file exactly as it was written to disk
print("YAML format:")
print(Path('./io_demo/config.yaml').read_text())

In [None]:
# Reload the YAML config and pull a few nested values out of it
loaded_config = stx.io.load('./io_demo/config.yaml')

project = loaded_config['project']
print(f"Project: {project['name']} v{project['version']}")

params = loaded_config['experiment']['parameters']
print(f"Learning rate: {params['learning_rate']}")
print(f"Optimizer: {params['optimizer']['type']}")

print("\nPaths:")
for key, path in loaded_config['paths'].items():
    print(f"  {key}: {path}")

## 6. Symlinks and Output Organization

In [None]:
# Example metrics to persist
results = {
    'accuracy': 0.95,
    'loss': 0.05,
    'confusion_matrix': [[95, 5], [3, 97]]
}

# Each run gets its own timestamped directory (one-second resolution)
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
result_path = f'./io_demo/results/experiment_{timestamp}/metrics.json'

# NOTE(review): symlink_from_cwd=True presumably links ./metrics.json to the
# saved file; the check below looks for exactly that — confirm against scitex docs.
stx.io.save(results, result_path, symlink_from_cwd=True)

# Check if symlink was created
symlink_path = Path('metrics.json')
if symlink_path.exists() and symlink_path.is_symlink():
    print(f"✓ Symlink created: {symlink_path} -> {symlink_path.resolve()}")
else:
    print("✗ No symlink found")

# List all results, one block per experiment directory
results_dir = Path('./io_demo/results')
if results_dir.is_dir():
    print("\nAll results:")
    for exp_dir in sorted(results_dir.iterdir()):
        if not exp_dir.is_dir():
            # A stray file here would make iterdir() raise NotADirectoryError
            continue
        print(f"  {exp_dir.name}/")
        # Sorted so the listing does not depend on filesystem enumeration order
        for entry in sorted(exp_dir.iterdir()):
            print(f"    {entry.name}")

## 7. Custom Save/Load Handlers

In [None]:
# Create custom class
class ExperimentResult:
    """Named experiment payload that can round-trip through a plain dict.

    Instances carry ``name``, ``data`` and ``metadata`` exactly as passed in,
    plus a ``timestamp`` taken from ``pd.Timestamp.now()`` at construction.
    """

    def __init__(self, name, data, metadata):
        """Store the three fields and stamp the instance with the current time."""
        self.name, self.data, self.metadata = name, data, metadata
        self.timestamp = pd.Timestamp.now()

    def to_dict(self):
        """Return the instance as a dict with keys name/data/metadata/timestamp.

        An ndarray ``data`` is converted with ``tolist()``; any other value is
        passed through unchanged. The timestamp becomes an ISO-8601 string.
        """
        if isinstance(self.data, np.ndarray):
            payload = self.data.tolist()
        else:
            payload = self.data
        return dict(
            name=self.name,
            data=payload,
            metadata=self.metadata,
            timestamp=self.timestamp.isoformat(),
        )

    @classmethod
    def from_dict(cls, d):
        """Rebuild an instance from a dict shaped like the output of ``to_dict``.

        ``d['data']`` is always wrapped in ``np.array``; the construction-time
        timestamp is then replaced by the one parsed from ``d['timestamp']``.
        """
        restored = cls(d['name'], np.array(d['data']), d['metadata'])
        restored.timestamp = pd.Timestamp(d['timestamp'])
        return restored

# Build one result object holding a 10 x 5 random matrix
exp_result = ExperimentResult(
    'Test Experiment',
    np.random.randn(10, 5),
    {'version': 1, 'author': 'SciTeX User'},
)

# The object itself is not saved; its dict form is what goes into the JSON file
serialised = exp_result.to_dict()
stx.io.save(serialised, './io_demo/experiment_result.json')

# Read the JSON back and rebuild the object from the loaded dict
loaded_dict = stx.io.load('./io_demo/experiment_result.json')
reconstructed = ExperimentResult.from_dict(loaded_dict)

# Compare the original against the round-tripped copy
print(f"Original: {exp_result.name}, shape: {exp_result.data.shape}")
print(f"Loaded: {reconstructed.name}, shape: {reconstructed.data.shape}")
print(f"Timestamp preserved: {exp_result.timestamp == reconstructed.timestamp}")

## 8. Batch Operations

In [None]:
# Generate five small random datasets with two numeric columns and a group label
datasets = {
    f'dataset_{i}': pd.DataFrame({
        'x': np.random.randn(100),
        'y': np.random.randn(100),
        'group': np.random.choice(['A', 'B', 'C'], 100)
    })
    for i in range(5)
}

# Save all datasets, one CSV per entry.
# The loop variable is deliberately not called `df`, so the DataFrame created
# in Section 1 is not overwritten.
for name, dataset in datasets.items():
    stx.io.save(dataset, f'./io_demo/batch/{name}.csv')

# Load all CSVs from the directory, keyed by file stem.
# glob() yields files in filesystem order, so sort for a reproducible listing.
batch_dir = Path('./io_demo/batch')
loaded_datasets = {
    csv_file.stem: stx.io.load(csv_file)
    for csv_file in sorted(batch_dir.glob('*.csv'))
}

print(f"Loaded {len(loaded_datasets)} datasets:")
for name, frame in loaded_datasets.items():
    print(f"  {name}: shape={frame.shape}, columns={list(frame.columns)}")

## 9. Error Handling and Validation

In [None]:
# Edge cases to round-trip: (label, value) pairs
test_cases = [
    ('Empty DataFrame', pd.DataFrame()),
    ('Single value', 42),
    ('List of mixed types', [1, 'two', 3.0, None]),
    ('Nested structure', {'a': {'b': {'c': [1, 2, 3]}}})
]

for name, data in test_cases:
    print(f"\nTesting: {name}")
    stem = name.replace(" ", "_")

    # Try each value in every format
    for ext in ['.json', '.yaml', '.pkl']:
        path = f'./io_demo/edge_cases/{stem}{ext}'
        try:
            stx.io.save(data, path)
            loaded = stx.io.load(path)

            # DataFrames need .equals(); plain Python values compare with ==
            if isinstance(data, pd.DataFrame):
                success = data.equals(loaded)
            else:
                success = data == loaded

            print(f"  {ext}: {'✓' if success else '✗'}")
        except Exception as e:
            # Show the message as well as the type so the failure is diagnosable
            print(f"  {ext}: ✗ ({type(e).__name__}: {e})")

## 10. Performance Comparison

In [None]:
import time

# Benchmark input: 10000 x 100 random DataFrame.
# Names in this cell are kept distinct from earlier cells (`formats`, `results`,
# `large_df`) so that re-running those cells afterwards still works.
perf_data = pd.DataFrame(np.random.randn(10000, 100))

# Display name -> file extension for each format under test
formats_to_test = {
    'CSV': '.csv',
    'Pickle': '.pkl',
    'Parquet': '.parquet',
    'HDF5': '.h5'
}

perf_results = []

for name, ext in formats_to_test.items():
    path = f'./io_demo/perf_test{ext}'

    try:
        # perf_counter() is monotonic and high-resolution, unlike wall-clock time()
        start = time.perf_counter()
        stx.io.save(perf_data, path)
        save_time = time.perf_counter() - start

        start = time.perf_counter()
        stx.io.load(path)
        load_time = time.perf_counter() - start

        # File size
        size_mb = Path(path).stat().st_size / 1024 / 1024

        perf_results.append({
            'Format': name,
            'Save Time (s)': round(save_time, 3),
            'Load Time (s)': round(load_time, 3),
            'File Size (MB)': round(size_mb, 2)
        })
    except Exception as e:
        # Report why the format failed, then record a placeholder row for the table
        print(f"{name}: benchmark failed ({type(e).__name__}: {e})")
        perf_results.append({
            'Format': name,
            'Save Time (s)': 'Error',
            'Load Time (s)': 'Error',
            'File Size (MB)': 'Error'
        })

# Display results
perf_df = pd.DataFrame(perf_results)
print("\nPerformance Comparison:")
print(perf_df.to_string(index=False))

# Visualize (`plt` comes from the matplotlib import in the Section 3 plotting cell)
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Only plot rows that hold numeric values
valid_results = [r for r in perf_results if r['Save Time (s)'] != 'Error']
format_names = [r['Format'] for r in valid_results]

# One panel per metric: (axes, result key, panel title, y-axis label)
panels = [
    (ax1, 'Save Time (s)', 'Save Time', 'Time (seconds)'),
    (ax2, 'Load Time (s)', 'Load Time', 'Time (seconds)'),
    (ax3, 'File Size (MB)', 'File Size', 'Size (MB)'),
]
for axis, column, title, ylabel in panels:
    axis.bar(format_names, [r[column] for r in valid_results])
    axis.set_title(title)
    axis.set_ylabel(ylabel)

plt.tight_layout()
stx.io.save(fig, './io_demo/format_performance_comparison.png')
plt.show()

## Summary

The `scitex.io` module provides:

1. **Unified Interface**: A single pair of `save()` and `load()` functions for all formats
2. **Format Auto-detection**: Based on file extensions
3. **Compression Support**: Automatic handling of .gz, .bz2, etc.
4. **Hierarchical Data**: Support for HDF5 and nested structures
5. **Configuration Files**: YAML and JSON for settings
6. **Symlink Creation**: For easy access to latest results
7. **Batch Operations**: Process multiple files efficiently
8. **Robust Error Handling**: Graceful handling of edge cases

### Best Practices:

- Use **relative paths** starting with `./`
- **Organize outputs** by type (data/, results/, figures/)
- Choose **appropriate formats**:
  - CSV for data sharing
  - Pickle for Python objects
  - Parquet for large DataFrames
  - HDF5 for hierarchical data
  - YAML for human-readable configs
- Use **compression** for large files
- Create **symlinks** for important outputs

In [None]:
# Cleanup
import shutil

# Set to True to delete the demo artefacts; the default keeps them for inspection
REMOVE_DEMO_FILES = False

# Demo directory
demo_dir = Path('./io_demo')
if demo_dir.exists():
    if REMOVE_DEMO_FILES:
        shutil.rmtree(demo_dir)
        print("Removed ./io_demo/")
    else:
        print("Demo files kept in ./io_demo/")

# Symlink left in the working directory by the Section 6 save
metrics_link = Path('metrics.json')
if metrics_link.is_symlink():
    if REMOVE_DEMO_FILES:
        metrics_link.unlink()
        print("Symlink removed: metrics.json")
    else:
        print("Symlink kept: metrics.json")