# SciTeX Input/Output Operations

This notebook demonstrates the I/O capabilities provided by the `scitex.io` module, which offers unified file operations with automatic format detection, caching, and scientific data handling features.

## 1. Setup and Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import json
import scitex as stx
from pathlib import Path
import tempfile
import shutil

# Fix the random seeds through SciTeX (seed value 42) before any data is generated
stx.repro.fix_seeds(42)

# Every example file in this notebook is written under one throwaway directory
temp_dir = tempfile.mkdtemp(prefix='scitex_io_demo_')
print(
    f"Working directory: {temp_dir}",
    f"SciTeX version: {stx.__version__}",
    sep="\n",
)

## 2. Universal Save and Load

In [None]:
# Build one sample object per data family. The save/load cells that follow
# rely on the file extension alone to choose the on-disk format.

# --- NumPy array ---
numpy_data = np.random.randn(100, 50)
print(f"NumPy array shape: {numpy_data.shape}")

# --- Pandas DataFrame (keyword order determines the column order) ---
df_data = pd.DataFrame(dict(
    subject_id=range(100),
    measurement_1=np.random.randn(100),
    measurement_2=np.random.randn(100),
    category=np.random.choice(['A', 'B', 'C'], 100),
))
print(f"\nDataFrame shape: {df_data.shape}")
print(df_data.head())

# --- Nested dictionary mixing strings, numbers and a list of lists ---
dict_data = dict(
    experiment_name='Neural Recording Session 1',
    parameters=dict(sampling_rate=1000, n_channels=64, duration=300),
    results=dict(accuracy=0.95, f1_score=0.93),
    # .tolist() turns the 10x10 corner of the array into plain nested lists
    data_array=numpy_data[:10, :10].tolist(),
)
print(f"\nDictionary keys: {list(dict_data.keys())}")

# --- PyTorch tensor, laid out as batch x features x time ---
torch_data = torch.randn(32, 128, 1000)
print(f"\nTorch tensor shape: {torch_data.shape}")

In [None]:
# Write each sample object to disk; only the file suffix differs between calls.

# NumPy array -> .npy
npy_path = Path(temp_dir, 'array_data.npy')
stx.io.save(numpy_data, npy_path)
print(f"Saved NumPy array to: {npy_path}")

# DataFrame -> the same frame written once as CSV and once as pickle
csv_path = Path(temp_dir, 'dataframe.csv')
pkl_path = Path(temp_dir, 'dataframe.pkl')
for frame_target in (csv_path, pkl_path):
    stx.io.save(df_data, frame_target)
print(f"Saved DataFrame to: {csv_path} and {pkl_path}")

# Dictionary -> JSON
json_path = Path(temp_dir, 'experiment_config.json')
stx.io.save(dict_data, json_path)
print(f"Saved dictionary to: {json_path}")

# PyTorch tensor -> .pt
pt_path = Path(temp_dir, 'model_output.pt')
stx.io.save(torch_data, pt_path)
print(f"Saved PyTorch tensor to: {pt_path}")

# Write the array once more, this time asking for a symlink from the current
# working directory (optional; meant for organization and quick access)
stx.io.save(numpy_data, npy_path, symlink_from_cwd=True)

In [None]:
# Read every file back; no format argument is passed to `load`
loaded_numpy = stx.io.load(npy_path)
loaded_csv = stx.io.load(csv_path)
loaded_pkl = stx.io.load(pkl_path)
loaded_json = stx.io.load(json_path)
loaded_torch = stx.io.load(pt_path)

# Each entry pairs a label with a zero-argument check comparing the in-memory
# original to its round-tripped copy. Checks run lazily, one per printed line.
integrity_checks = [
    ("NumPy arrays match", lambda: np.allclose(numpy_data, loaded_numpy)),
    ("CSV DataFrame shape matches", lambda: df_data.shape == loaded_csv.shape),
    ("Pickle DataFrame equals", lambda: df_data.equals(loaded_pkl)),
    ("JSON keys match", lambda: set(dict_data.keys()) == set(loaded_json.keys())),
    ("Torch tensors match", lambda: torch.allclose(torch_data, loaded_torch)),
]

print("Data integrity checks:")
for check_label, run_check in integrity_checks:
    print(f"{check_label}: {run_check()}")

## 3. HDF5 Exploration and Hierarchical Data

In [None]:
def _make_subject_record(n_events):
    """Return one synthetic subject: random recordings plus an event table with `n_events` rows."""
    return {
        'neural_data': np.random.randn(64, 10000),  # channels x time
        'behavior': np.random.randn(1000, 3),       # time x xyz
        'timestamps': np.linspace(0, 10, 10000),
        'events': pd.DataFrame({
            'time': np.sort(np.random.uniform(0, 10, n_events)),
            'event_type': np.random.choice(['stimulus', 'response', 'reward'], n_events),
        }),
    }


# Nested structure: free-text metadata, per-subject recordings, group-level analysis
experiment_data = {
    'metadata': dict(
        date='2024-01-15',
        experimenter='Dr. Smith',
        protocol='Neural Recording v2.1',
    ),
    'subjects': {
        'subject_001': _make_subject_record(50),
        'subject_002': _make_subject_record(45),
    },
    'analysis': {
        'spike_rates': np.random.poisson(10, (2, 64)),  # subjects x channels
        'correlations': np.random.rand(64, 64),
        'summary_stats': dict(mean_rate=10.5, std_rate=2.3),
    },
}

# The .h5 suffix selects HDF5 (hierarchical data format) as the container
h5_path = Path(temp_dir) / 'experiment_data.h5'
stx.io.save(experiment_data, h5_path)
print(f"Saved hierarchical data to: {h5_path}")

In [None]:
# Show the layout of the HDF5 file written in the previous cell
print("HDF5 File Structure:", "=" * 50, sep="\n")
stx.io.explore_h5(h5_path)

# Ask whether one specific dataset path is present in the file
key_to_check = '/subjects/subject_001/neural_data'
exists = stx.io.has_h5_key(h5_path, key_to_check)
print(f"\nKey '{key_to_check}' exists: {exists}")

# Partial loads: pass `key` to read a single group instead of the whole file
subject_001_data = stx.io.load(h5_path, key='/subjects/subject_001')
subject_001_keys = list(subject_001_data.keys())
print(f"\nLoaded subject_001 keys: {subject_001_keys}")

analysis_data = stx.io.load(h5_path, key='/analysis')
analysis_keys = list(analysis_data.keys())
print(f"Loaded analysis keys: {analysis_keys}")

## 4. Configuration Management

In [None]:
# All configuration files for this section go into one sub-directory
config_dir = Path(temp_dir) / 'configs'
config_dir.mkdir(exist_ok=True)

# Experiment / model / training settings (YAML-style layout, stored as JSON here)
main_config = dict(
    experiment=dict(name='Neural Decoding Study', version='1.0.0', random_seed=42),
    model=dict(architecture='ResNet1D', n_layers=4, hidden_size=256, dropout=0.5),
    training=dict(batch_size=32, learning_rate=0.001, n_epochs=100, optimizer='Adam'),
)

# Signal preprocessing settings
preprocess_config = dict(
    sampling_rate=1000,
    filter=dict(type='bandpass', low_freq=1, high_freq=100, order=4),
    normalization='z-score',
    window_size=1000,
    overlap=0.5,
)

# One JSON file per config; the file stem becomes the config's name on disk
for config_stem, config_body in (
    ('main_config', main_config),
    ('preprocessing', preprocess_config),
):
    stx.io.save(config_body, config_dir / f'{config_stem}.json')

print("Saved configuration files:")
for config_file in config_dir.glob('*.json'):
    print(f"  - {config_file.name}")

In [None]:
# Read the whole config directory in a single call
configs = stx.io.load_configs(config_dir)

print("Loaded configurations:")
for config_name, config_body in configs.items():
    print(f"\n{config_name}:")
    # Preview only: the first 200 characters of the pretty-printed JSON
    preview = json.dumps(config_body, indent=2)[:200]
    print(preview + '...')

# Individual values are reached with ordinary nested indexing
print(f"\nModel architecture: {configs['main_config']['model']['architecture']}")
learning_rate = configs['main_config']['training']['learning_rate']
print(f"Learning rate: {learning_rate}")
filter_type = configs['preprocessing']['filter']['type']
print(f"Filter type: {filter_type}")

## 5. File Pattern Matching with Glob

In [None]:
# Directory that the pattern-matching examples will search
data_dir = Path(temp_dir) / 'data'
data_dir.mkdir(exist_ok=True)

subject_ids = ['S001', 'S002', 'S003']

# File names for every subject x session x data-type combination (3 x 3 x 3)
recording_names = [
    f"{subject}_session{session}_{data_type}.npy"
    for subject in subject_ids
    for session in range(1, 4)
    for data_type in ['neural', 'behavior', 'events']
]
for recording_name in recording_names:
    stx.io.save(np.random.randn(100, 10), data_dir / recording_name)

# One JSON file of random scores per subject
for subject in subject_ids:
    analysis_data = {'accuracy': np.random.rand(), 'f1_score': np.random.rand()}
    stx.io.save(analysis_data, data_dir / f"{subject}_analysis_results.json")

n_created = sum(1 for _ in data_dir.glob('*'))
print(f"Created {n_created} files in {data_dir}")

In [None]:
# Wildcard searches over the data directory

# Every neural recording, regardless of subject or session
neural_files = stx.io.glob(data_dir, '*neural.npy')
print(f"Found {len(neural_files)} neural data files:")
for neural_file in sorted(neural_files)[:5]:
    print(f"  - {Path(neural_file).name}")

# Everything belonging to subject S001
s001_files = stx.io.glob(data_dir, 'S001_*.npy')
print(f"\nFound {len(s001_files)} files for subject S001")

# Everything recorded in session 2
session2_files = stx.io.glob(data_dir, '*_session2_*.npy')
print(f"\nFound {len(session2_files)} files for session 2")

# Pull the named fields back out of the first three neural file names
pattern = '{subject}_session{session:d}_{dtype}.npy'
parsed_results = []

for filepath in neural_files[:3]:
    parsed = stx.io.parse_glob(filepath, pattern)
    if not parsed:
        # Skip names for which parsing yields nothing
        continue
    parsed_results.append(parsed)
    print(f"\nParsed {Path(filepath).name}:")
    print(f"  Subject: {parsed['subject']}")
    print(f"  Session: {parsed['session']}")
    print(f"  Data type: {parsed['dtype']}")

## 6. Caching for Expensive Operations

In [None]:
# Define an expensive computation
def expensive_analysis(data, n_components=10):
    """Stand-in for a slow decomposition (PCA-like): wait two seconds, then fabricate results.

    Args:
        data: 2-D array; only its column count and its overall variance are used.
        n_components: Number of fake components to generate.

    Returns:
        dict with
            'components': random matrix of shape (n_components, data.shape[1]),
            'explained_variance': random weights rescaled to sum to 1,
            'total_variance': np.var over all of `data`.
    """
    from time import sleep

    print("Running expensive analysis...")
    sleep(2)  # Simulate computation time

    # Draw order matters for reproducibility: components first, then weights
    n_features = data.shape[1]
    components = np.random.randn(n_components, n_features)
    raw_weights = np.random.rand(n_components)

    return dict(
        components=components,
        explained_variance=raw_weights / raw_weights.sum(),
        total_variance=np.var(data),
    )

# Directory handed to stx.io.cache for its cached result
cache_dir = Path(temp_dir) / '.cache'
cache_dir.mkdir(exist_ok=True)

test_data = np.random.randn(1000, 50)
cache_key = 'pca_analysis_v1'

import time


def _timed_cached_analysis():
    """Call the cached analysis once and return (result, elapsed_seconds).

    Uses time.perf_counter(), a high-resolution clock intended for measuring
    short durations, instead of the wall clock.
    """
    started = time.perf_counter()
    result = stx.io.cache(
        expensive_analysis,
        test_data,
        n_components=5,
        cache_key=cache_key,
        cache_dir=cache_dir,
    )
    return result, time.perf_counter() - started


# First call - expected to run the computation and store the result
result1, first_call_time = _timed_cached_analysis()
print(f"First call took: {first_call_time:.2f} seconds")

# Second call - same key and directory, expected to be served from the cache
result2, second_call_time = _timed_cached_analysis()
print(f"Second call (cached) took: {second_call_time:.2f} seconds")

# A cached call can finish within the timer's resolution; never divide by zero
if second_call_time > 0:
    print(f"Speedup: {first_call_time / second_call_time:.1f}x")
else:
    print("Speedup: not measurable (second call finished within timer resolution)")

# Verify results are identical
print(f"\nResults match: {np.allclose(result1['components'], result2['components'])}")

## 7. Specialized Save Functions

In [None]:
# Output directory for the CSV (and, in the next cell, figure) examples
results_dir = Path(temp_dir) / 'results'
results_dir.mkdir(exist_ok=True)

# One small metrics table per simulated experiment
dfs = [
    pd.DataFrame({
        'experiment_id': i,
        'accuracy': np.random.rand(10),
        'precision': np.random.rand(10),
        'recall': np.random.rand(10),
        'f1_score': np.random.rand(10),
    })
    for i in range(5)
]

# Look the optional helpers up with getattr. Accessing a missing attribute
# directly raises AttributeError, and an existing function is always truthy,
# so a bare `if stx.io.<name>:` test can never reach a "not available" path.
save_dfs = getattr(stx.io, 'save_listed_dfs_as_csv', None)
if callable(save_dfs):
    save_dfs(
        dfs,
        names=[f'experiment_{i}' for i in range(len(dfs))],
        save_dir=results_dir,
    )
    print(f"Saved {len(dfs)} DataFrames to {results_dir}")
else:
    print("stx.io.save_listed_dfs_as_csv is unavailable; skipped saving DataFrames")

# Synthetic per-epoch training curves (also plotted by the figure cell below)
epochs = list(range(100))
train_loss = [1.0 / (i + 1) + 0.1 * np.random.randn() for i in epochs]
val_loss = [1.2 / (i + 1) + 0.15 * np.random.randn() for i in epochs]
accuracy = [min(0.99, i / 100 + 0.1 * np.random.rand()) for i in epochs]

save_scalars = getattr(stx.io, 'save_listed_scalars_as_csv', None)
if callable(save_scalars):
    save_scalars(
        [train_loss, val_loss, accuracy],
        names=['train_loss', 'val_loss', 'accuracy'],
        index=epochs,
        index_name='epoch',
        save_path=results_dir / 'training_metrics.csv',
    )
    print("Saved training metrics")
else:
    print("stx.io.save_listed_scalars_as_csv is unavailable; skipped saving training metrics")

In [None]:
# Build a 2x2 summary figure and write it to the results directory
fig, axes = plt.subplots(2, 2, figsize=(10, 10))

# Panel 1: sum of a 5 Hz and a (half-amplitude) 20 Hz sine over two seconds
t = np.linspace(0, 2, 1000)
signal = np.sin(2 * np.pi * 5 * t) + 0.5 * np.sin(2 * np.pi * 20 * t)
axes[0, 0].plot(t, signal)
axes[0, 0].set_title('Multi-frequency Signal')
axes[0, 0].set_xlabel('Time (s)')
axes[0, 0].set_ylabel('Amplitude')

# Panel 2: random symmetric matrix with a unit diagonal, shown as a heatmap
corr_matrix = np.random.rand(10, 10)
corr_matrix = (corr_matrix + corr_matrix.T) / 2  # Make symmetric
np.fill_diagonal(corr_matrix, 1)
im = axes[0, 1].imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
axes[0, 1].set_title('Correlation Matrix')
plt.colorbar(im, ax=axes[0, 1])

# Panel 3: training curves from the previous cell. Require every series that
# is plotted, not just `epochs`; the panel stays empty if any is missing.
if all(series in globals() for series in ('epochs', 'train_loss', 'val_loss')):
    axes[1, 0].plot(epochs, train_loss, label='Train Loss')
    axes[1, 0].plot(epochs, val_loss, label='Val Loss')
    axes[1, 0].set_xlabel('Epoch')
    axes[1, 0].set_ylabel('Loss')
    axes[1, 0].set_title('Training Progress')
    axes[1, 0].legend()
    axes[1, 0].set_yscale('log')

# Panel 4: bar chart of random accuracies in [0.7, 1.0)
categories = ['Method A', 'Method B', 'Method C', 'Method D']
values = np.random.rand(4) * 0.3 + 0.7
axes[1, 1].bar(categories, values)
axes[1, 1].set_ylabel('Accuracy')
axes[1, 1].set_title('Method Comparison')
axes[1, 1].set_ylim([0, 1])

plt.tight_layout()

# Prefer stx.io.save_image when it exists, otherwise fall back to Matplotlib.
# getattr is required for the fallback to be reachable: `if stx.io.save_image:`
# raises AttributeError when the helper is missing.
fig_path = results_dir / 'analysis_results.png'
save_image = getattr(stx.io, 'save_image', None)
if callable(save_image):
    save_image(fig, fig_path, dpi=300)
else:
    fig.savefig(fig_path, dpi=300)

# Close this specific figure rather than whichever one happens to be current
plt.close(fig)
print(f"Saved figure to {fig_path}")

## 8. Hot Reload for Development

In [None]:
import importlib
import sys

# Write version 1 of a small module that will be edited and reloaded below
module_path = Path(temp_dir) / 'my_analysis.py'
module_content = '''
def analyze_data(data):
    """Simple analysis function."""
    return {
        'mean': data.mean(),
        'std': data.std(),
        'version': 1
    }
'''
module_path.write_text(module_content)

# Make the temp directory importable, without stacking duplicate entries when
# this cell is executed more than once
if str(temp_dir) not in sys.path:
    sys.path.insert(0, str(temp_dir))

# Forget any copy imported by an earlier run of this cell (it would still be
# version 2) and refresh the import system's caches, so the import below
# really executes the version 1 source that was just written
sys.modules.pop('my_analysis', None)
importlib.invalidate_caches()
import my_analysis

# Use the function
data = np.random.randn(100)
result1 = my_analysis.analyze_data(data)
print(f"First version result: {result1}")

# Overwrite the module source with version 2 (simulates editing during development)
updated_content = '''
import numpy as np

def analyze_data(data):
    """Enhanced analysis function."""
    return {
        'mean': data.mean(),
        'std': data.std(),
        'median': np.median(data),
        'q25': np.percentile(data, 25),
        'q75': np.percentile(data, 75),
        'version': 2
    }
'''
module_path.write_text(updated_content)

# Reload the already-imported module so it picks up the new source
stx.io.reload(my_analysis)

# Use the updated function
result2 = my_analysis.analyze_data(data)
print(f"\nUpdated version result: {result2}")
print(f"\nNew keys added: {set(result2.keys()) - set(result1.keys())}")

## 9. Batch Operations and Organization

In [None]:
# Lay out one run as <temp>/experiments/<experiment name>/<timestamp>/...
experiment_name = "neural_decoding_2024"
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
output_base = Path(temp_dir, 'experiments', experiment_name, timestamp)

# Mixed outputs: four arrays and one plain metadata dictionary.
# No directory is created here; the save calls below are relied on for that.
outputs = dict(
    raw_data=np.random.randn(1000, 64),
    processed_data=np.random.randn(1000, 32),
    features=np.random.randn(1000, 128),
    predictions=np.random.randint(0, 3, 1000),
    metadata=dict(n_subjects=10, n_trials=100, conditions=['A', 'B', 'C']),
)

# Arrays go to arrays/<name>.npy, everything else to configs/<name>.json
for name, payload in outputs.items():
    is_array = isinstance(payload, np.ndarray)
    subdir, suffix = ('arrays', 'npy') if is_array else ('configs', 'json')
    target = output_base / subdir / f'{name}.{suffix}'

    stx.io.save(payload, target)
    print(f"Saved {name} to {target.relative_to(temp_dir)}")

# Summary report: array shape per output, or the marker 'metadata' for non-arrays
data_shapes = {}
for output_name, output_value in outputs.items():
    if isinstance(output_value, np.ndarray):
        data_shapes[output_name] = output_value.shape
    else:
        data_shapes[output_name] = 'metadata'

summary = {
    'experiment': experiment_name,
    'timestamp': timestamp,
    'outputs': list(outputs.keys()),
    'data_shapes': data_shapes,
}

summary_path = output_base / 'summary.json'
stx.io.save(summary, summary_path)
print(f"\nSaved experiment summary to {summary_path.relative_to(temp_dir)}")

## 10. Integration Example: Complete Experiment Pipeline

In [None]:
def run_experiment_pipeline(config_path, data_path, output_dir):
    """Run a miniature load -> normalize -> featurize -> report pipeline with SciTeX I/O.

    Args:
        config_path: Path to a config file; its loaded content must provide
            config['experiment']['name'].
        data_path: Path to a file that loads as a 2-D array (rows are treated
            as samples, columns as features).
        output_dir: Root directory under which all outputs are written.

    Returns:
        dict: Report with keys 'experiment', 'data_info', 'results' and
        'output_structure'. The confusion matrix inside 'results' is stored
        as nested lists so the report can be passed to `json.dumps`.
    """
    # Load configuration
    config = stx.io.load(config_path)
    print(f"Loaded config: {config['experiment']['name']}")

    # Load data
    data = stx.io.load(data_path)
    print(f"Loaded data shape: {data.shape}")

    # Create the output layout up front: fig.savefig() at the end of this
    # function does not create missing parent directories
    output_dir = Path(output_dir)
    dirs = {
        'processed': output_dir / 'processed_data',
        'features': output_dir / 'features',
        'models': output_dir / 'models',
        'results': output_dir / 'results',
        'figures': output_dir / 'figures'
    }
    for directory in dirs.values():
        directory.mkdir(parents=True, exist_ok=True)

    # Column-wise z-score (epsilon guards against zero-variance columns).
    # Computed directly rather than through a memoizing decorator: a closure
    # defined inside this function would get a fresh cache on every call, and
    # hash-based memoizers cannot key on ndarray/dict arguments.
    processed_data = (data - data.mean(axis=0)) / (data.std(axis=0) + 1e-8)
    stx.io.save(processed_data, dirs['processed'] / 'normalized_data.npy')

    # Extract features (one value per sample, plus a simulated PCA projection)
    features = {
        'mean_features': processed_data.mean(axis=1),
        'std_features': processed_data.std(axis=1),
        'pca_features': np.random.randn(data.shape[0], 10)  # Simulated PCA
    }

    for name, feat in features.items():
        stx.io.save(feat, dirs['features'] / f'{name}.npy')

    # Simulate model training. The array form of the confusion matrix is kept
    # for plotting; the results dict gets plain nested lists.
    confusion_matrix = np.random.randint(0, 50, (3, 3))
    model_results = {
        'accuracy': 0.92,
        'precision': 0.91,
        'recall': 0.93,
        'f1_score': 0.92,
        'confusion_matrix': confusion_matrix.tolist()
    }

    stx.io.save(model_results, dirs['results'] / 'model_performance.json')

    # Generate and save report
    report = {
        'experiment': config['experiment'],
        'data_info': {
            'n_samples': data.shape[0],
            'n_features': data.shape[1]
        },
        'results': model_results,
        'output_structure': {k: str(v) for k, v in dirs.items()}
    }

    stx.io.save(report, output_dir / 'experiment_report.json')

    # Create visualization
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    im = ax.imshow(confusion_matrix, cmap='Blues')
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.colorbar(im, ax=ax)

    # Annotate each cell with its count; white text on the darker cells (> 25)
    n_rows, n_cols = confusion_matrix.shape
    for i in range(n_rows):
        for j in range(n_cols):
            count = confusion_matrix[i, j]
            ax.text(j, i, str(count), ha='center', va='center',
                    color='white' if count > 25 else 'black')

    fig_path = dirs['figures'] / 'confusion_matrix.png'
    fig.savefig(fig_path, dpi=150, bbox_inches='tight')
    plt.close(fig)  # close this figure explicitly, not just the current one

    print(f"\nExperiment complete! Results saved to {output_dir}")
    return report

# Run the pipeline on the config and array files written earlier in the notebook
pipeline_output = Path(temp_dir) / 'pipeline_output'
report = run_experiment_pipeline(
    config_path=config_dir / 'main_config.json',
    data_path=npy_path,
    output_dir=pipeline_output
)


def _json_default(value):
    """Fallback encoder for `json.dumps`: NumPy arrays/scalars via .tolist(), anything else via str()."""
    return value.tolist() if hasattr(value, 'tolist') else str(value)


# The report may carry NumPy values (e.g. a confusion matrix), which the json
# module cannot encode by itself, so supply a `default` converter
print("\nFinal report:")
print(json.dumps(report, indent=2, default=_json_default))

## Cleanup

In [None]:
import sys

# Undo the hot-reload demo's import setup so the kernel does not keep
# referring to a directory that is about to be deleted
sys.modules.pop('my_analysis', None)
while str(temp_dir) in sys.path:
    sys.path.remove(str(temp_dir))

# Clean up temporary directory (safe to run more than once)
if Path(temp_dir).exists():
    shutil.rmtree(temp_dir)
    print(f"Cleaned up temporary directory: {temp_dir}")
else:
    print(f"Temporary directory already removed: {temp_dir}")

## Summary

The `scitex.io` module provides comprehensive I/O functionality for scientific computing:

1. **Universal Save/Load**: Automatic format detection for NumPy, Pandas, PyTorch, JSON, HDF5, etc.
2. **HDF5 Support**: Hierarchical data storage with selective loading and exploration tools
3. **Configuration Management**: Easy loading of multiple config files from directories
4. **Pattern Matching**: Powerful glob functionality with pattern parsing
5. **Caching**: Speed up expensive computations with automatic caching
6. **Batch Operations**: Save multiple DataFrames, scalars, and images efficiently
7. **Hot Reload**: Reload modules during development without restarting
8. **Automatic Organization**: Creates directory structures automatically

These features enable:
- Reproducible data management
- Efficient experiment organization
- Fast prototyping with caching
- Clean separation of data, configs, and results