# SciTeX Reproduce Module Tutorial

This notebook demonstrates the reproducibility utilities in SciTeX, essential for ensuring consistent results in scientific computing and machine learning experiments.

## 1. Setup and Imports

In [None]:
import scitex as stx
import numpy as np
import random
import os
import time
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt

# For demonstrating deep learning reproducibility
try:
    import torch
    import torch.nn as nn
    TORCH_AVAILABLE = True
except ImportError:
    TORCH_AVAILABLE = False
    print("PyTorch not available. Some examples will be skipped.")

try:
    import tensorflow as tf
    TF_AVAILABLE = True
except ImportError:
    TF_AVAILABLE = False
    print("TensorFlow not available. Some examples will be skipped.")

## 2. Random Seed Management with fix_seeds

### 2.1 Basic Seed Fixing

In [None]:
# Demonstrate randomness without seed fixing, then show that re-fixing the
# same seed replays exactly the same draws.

def _draw_samples():
    """Draw one value per available RNG; the PyTorch slot is None without PyTorch."""
    py_value = random.random()
    np_value = np.random.random()
    torch_value = torch.rand(1).item() if TORCH_AVAILABLE else None
    return py_value, np_value, torch_value


def _print_samples(samples):
    """Print a (random, NumPy, PyTorch-or-None) triple in the tutorial's format."""
    py_value, np_value, torch_value = samples
    print(f"  Random: {py_value:.6f}")
    print(f"  NumPy: {np_value:.6f}")
    if torch_value is not None:
        print(f"  PyTorch: {torch_value:.6f}")


print("=== Without seed fixing ===")
print("Run 1:")
_print_samples(_draw_samples())

print("\nRun 2 (different values):")
_print_samples(_draw_samples())

# Hand the libraries to fix_seeds explicitly, the same way the selective
# seeding section of this notebook does.
# NOTE(review): confirm whether fix_seeds seeds anything when called without
# library arguments; passing them explicitly is correct either way.
seed_libs = {'random': random, 'np': np}
if TORCH_AVAILABLE:
    seed_libs['torch'] = torch

# Fix seeds
print("\n=== Fixing seeds ===")
stx.repro.fix_seeds(seed=42, verbose=True, **seed_libs)

# Now values are reproducible
print("\n=== After seed fixing ===")
print("Run 1:")
r1_random, r1_numpy, r1_torch = _draw_samples()
_print_samples((r1_random, r1_numpy, r1_torch))

# Reset seeds and generate again
stx.repro.fix_seeds(seed=42, verbose=False, **seed_libs)
print("\nRun 2 (after re-fixing seeds - same values):")
r2_random, r2_numpy, r2_torch = _draw_samples()
_print_samples((r2_random, r2_numpy, r2_torch))

# Verify reproducibility
print("\nReproducibility check:")
print(f"  Random values match: {r1_random == r2_random}")
print(f"  NumPy values match: {r1_numpy == r2_numpy}")
if TORCH_AVAILABLE:
    print(f"  PyTorch values match: {r1_torch == r2_torch}")

### 2.2 Selective Library Seed Fixing

In [None]:
# Seed only the libraries that are handed to fix_seeds.
print("=== Selective seed fixing ===")

# Only the NumPy module is passed here
stx.repro.fix_seeds(seed=123, np=np, verbose=True)

# Draw from NumPy once and from the stdlib generator twice
print("\nValues after fixing only NumPy:")
numpy_draw = np.random.random()
print(f"  NumPy (fixed): {numpy_draw:.6f}")
for _ in range(2):
    print(f"  Random (not fixed): {random.random():.6f}")

# Collect every module that is importable in this environment, then seed them together
print("\n=== Fixing all available libraries ===")
libs = dict(os=os, random=random, np=np)
if TORCH_AVAILABLE:
    libs.update(torch=torch)
if TF_AVAILABLE:
    libs.update(tf=tf)

stx.repro.fix_seeds(seed=999, verbose=True, **libs)

### 2.3 Reproducible Machine Learning

In [None]:
if TORCH_AVAILABLE:
    print("=== Reproducible Neural Network Training ===")

    class SimpleNet(nn.Module):
        """Single linear layer mapping 10 input features to 1 output."""

        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(10, 1)

        def forward(self, x):
            """Apply the linear layer to ``x``."""
            return self.fc(x)

    def train_model(seed=None, n_steps=10, lr=0.01):
        """Train a fresh SimpleNet on random data and return its loss history.

        Args:
            seed: If not None, seeds are fixed through ``stx.repro.fix_seeds``
                (PyTorch and NumPy) before the model and data are created.
            n_steps: Number of full-batch SGD steps to run.
            lr: SGD learning rate.

        Returns:
            ``(losses, weights)``: the loss of each step as a list of floats,
            and a copy of the linear layer's weights after training.
        """
        if seed is not None:
            stx.repro.fix_seeds(seed=seed, torch=torch, np=np)

        # Model initialisation and data both draw from PyTorch's RNG, so they
        # must be created after the seeds are fixed.
        model = SimpleNet()
        X = torch.randn(100, 10)
        y = torch.randn(100, 1)

        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        criterion = nn.MSELoss()  # built once instead of once per step
        losses = []

        for _ in range(n_steps):
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())

        return losses, model.fc.weight.detach().clone()

    # Train without seed fixing
    print("Without seed fixing:")
    losses1, weights1 = train_model()
    losses2, weights2 = train_model()
    print(f"  Final losses match: {losses1[-1] == losses2[-1]}")
    print(f"  Final loss 1: {losses1[-1]:.6f}")
    print(f"  Final loss 2: {losses2[-1]:.6f}")

    # Train with seed fixing
    print("\nWith seed fixing:")
    losses3, weights3 = train_model(seed=42)
    losses4, weights4 = train_model(seed=42)
    print(f"  Final losses match: {losses3[-1] == losses4[-1]}")
    print(f"  Final loss 1: {losses3[-1]:.6f}")
    print(f"  Final loss 2: {losses4[-1]:.6f}")
    print(f"  Weights match: {torch.allclose(weights3, weights4)}")

    # Plot training curves side by side using explicit axes
    fig, (ax_unseeded, ax_seeded) = plt.subplots(1, 2, figsize=(10, 4))

    ax_unseeded.plot(losses1, label='Run 1 (no seed)', alpha=0.7)
    ax_unseeded.plot(losses2, label='Run 2 (no seed)', alpha=0.7)
    ax_unseeded.set_xlabel('Iteration')
    ax_unseeded.set_ylabel('Loss')
    ax_unseeded.set_title('Without Seed Fixing')
    ax_unseeded.legend()

    # Dashed second curve keeps both runs visible when they coincide
    ax_seeded.plot(losses3, label='Run 1 (seed=42)', alpha=0.7)
    ax_seeded.plot(losses4, label='Run 2 (seed=42)', alpha=0.7, linestyle='--')
    ax_seeded.set_xlabel('Iteration')
    ax_seeded.set_ylabel('Loss')
    ax_seeded.set_title('With Seed Fixing (lines overlap)')
    ax_seeded.legend()

    fig.tight_layout()
    plt.show()
else:
    print("PyTorch not available. Skipping ML reproducibility example.")

## 3. Unique ID Generation

### 3.1 Basic ID Generation

In [None]:
# Draw a handful of IDs and look at how one of them is put together.
print("=== Unique ID Generation ===")

ids = []
for _ in range(5):
    ids.append(stx.repro.gen_id())

print("Generated IDs:")
for position, id_str in enumerate(ids, start=1):
    print(f"  {position}: {id_str}")

# A set drops duplicates, so equal sizes mean every ID is distinct
n_distinct = len(set(ids))
print(f"\nAll IDs unique: {len(ids) == n_distinct}")

# Split the first ID on underscores: field 0 is shown as the timestamp,
# field 1 as the random suffix
example_id = ids[0]
id_fields = example_id.split('_')
timestamp_part = id_fields[0]
random_part = id_fields[1]
print("\nID components:")
print(f"  Full ID: {example_id}")
print(f"  Timestamp: {timestamp_part}")
print(f"  Random suffix: {random_part}")

### 3.2 Custom ID Formats

In [None]:
# Vary the two gen_id options used in this tutorial: `timestr` and `N`.
print("=== Custom ID Formats ===")

# (strftime-style pattern, label printed next to the resulting ID)
formats = [
    ("%Y%m%d", "Date only"),
    ("%Y%m%d_%H%M%S", "Date and time"),
    ("%Y-%m-%d", "ISO date"),
    ("%Y%m%d%H%M%S", "Compact"),
]

for time_format, label in formats:
    generated = stx.repro.gen_id(timestr=time_format)
    print(f"{label:15} -> {generated}")

# Same call, now varying only the N argument
print("\n=== Different suffix lengths ===")
suffix_lengths = (4, 8, 12, 16)
for n in suffix_lengths:
    generated = stx.repro.gen_id(N=n)
    print(f"N={n:2d}: {generated}")

### 3.3 Practical Use Cases for IDs

In [None]:
# Experiment tracking
class ExperimentTracker:
    """In-memory registry of experiments, each keyed by a generated ID.

    Every experiment is a dict with the keys ``id``, ``name``, ``params``,
    ``created`` and ``results``; they are kept in creation order in
    ``self.experiments``.
    """

    # Column order of the summary table. Passing it explicitly keeps the
    # schema stable even when no experiment has been registered yet.
    SUMMARY_COLUMNS = ['ID', 'Name', 'LR', 'Batch', 'Result']

    def __init__(self):
        self.experiments = []

    def new_experiment(self, name, params):
        """Register an experiment and return its freshly generated ID.

        Args:
            name: Human-readable experiment name.
            params: Dict of hyperparameters; ``learning_rate`` and
                ``batch_size`` are the keys shown by ``get_summary``.

        Returns:
            The ID produced by ``stx.repro.gen_id()``.
        """
        exp_id = stx.repro.gen_id()
        self.experiments.append({
            'id': exp_id,
            'name': name,
            'params': params,
            'created': datetime.now(),
            'results': None,
        })
        return exp_id

    def update_results(self, exp_id, results):
        """Attach ``results`` to the first experiment whose ID is ``exp_id``.

        Returns:
            True if a matching experiment was updated, False if no experiment
            has that ID (nothing is modified in that case).
        """
        for exp in self.experiments:
            if exp['id'] == exp_id:
                exp['results'] = results
                return True
        return False

    def get_summary(self):
        """Return one DataFrame row per experiment, in creation order.

        Missing ``learning_rate`` / ``batch_size`` parameters are shown as
        ``'N/A'``. With no experiments the frame is empty but still has the
        columns listed in ``SUMMARY_COLUMNS``.
        """
        rows = [
            {
                'ID': exp['id'],
                'Name': exp['name'],
                'LR': exp['params'].get('learning_rate', 'N/A'),
                'Batch': exp['params'].get('batch_size', 'N/A'),
                'Result': exp['results'],
            }
            for exp in self.experiments
        ]
        return pd.DataFrame(rows, columns=self.SUMMARY_COLUMNS)

# Drive the tracker with three simulated runs and print the summary table.
tracker = ExperimentTracker()

# (experiment name, hyperparameters)
experiments = [
    ('baseline', dict(learning_rate=0.01, batch_size=32)),
    ('high_lr', dict(learning_rate=0.1, batch_size=32)),
    ('large_batch', dict(learning_rate=0.01, batch_size=128)),
]

print("Running experiments...")
for name, params in experiments:
    # Register the run first so that it has an ID to report against
    exp_id = tracker.new_experiment(name, params)
    print(f"\nExperiment: {name}")
    print(f"  ID: {exp_id}")

    # Stand-in for real work: a short pause and a random score in [0.9, 1.0)
    time.sleep(0.1)
    result = 0.9 + 0.1 * np.random.random()

    # Store the score as a 3-decimal string
    formatted_result = f"{result:.3f}"
    tracker.update_results(exp_id, formatted_result)

print("\nExperiment Summary:")
summary = tracker.get_summary()
print(summary.to_string(index=False))

## 4. Timestamp Generation

### 4.1 Basic Timestamps

In [None]:
# Produce timestamps through both entry points and slice one into fields.
print("=== Timestamp Generation ===")

ts = stx.repro.gen_timestamp()
print(f"Current timestamp: {ts}")

# `timestamp` is used here as an alias of `gen_timestamp`
ts2 = stx.repro.timestamp()
print(f"Using alias: {ts2}")

# Take three timestamps, pausing one second after each
print("\nTimestamps with delays:")
timestamps = []
for position in range(1, 4):
    ts = stx.repro.timestamp()
    timestamps.append(ts)
    print(f"  {position}: {ts}")
    time.sleep(1)

# Fixed-position slices: characters 0-3, 5-8 and 10-end.
# NOTE(review): this assumes a 'YYYY-MMDD-HHMM'-like layout - confirm against gen_timestamp.
example_ts = timestamps[0]
year, month_day, hour_min = example_ts[:4], example_ts[5:9], example_ts[10:]
print(f"\nParsed timestamp '{example_ts}':")
print(f"  Year: {year}")
print(f"  Month-Day: {month_day}")
print(f"  Hour-Min: {hour_min}")

### 4.2 File Versioning with Timestamps

In [None]:
# File versioning system
class FileVersionManager:
    """Save timestamp-suffixed copies of a file inside a single directory.

    Versions are named ``<base>_<timestamp><ext>``. When a file with that
    name already exists (two saves that receive the same timestamp), a
    zero-padded counter is appended (``<base>_<timestamp>_001<ext>``, ...)
    so that an earlier version is never overwritten.
    """

    def __init__(self, base_dir="./versioned_files"):
        """Remember ``base_dir`` and create it if it does not exist yet."""
        self.base_dir = base_dir
        os.makedirs(base_dir, exist_ok=True)

    def save_version(self, filename, content):
        """Write ``content`` to a new versioned file and return its name.

        Args:
            filename: Logical file name, e.g. ``"model_config.txt"``.
            content: Text to write.

        Returns:
            The versioned file name (relative to ``base_dir``).
        """
        base_name, ext = os.path.splitext(filename)
        timestamp = stx.repro.timestamp()
        stem = f"{base_name}_{timestamp}"

        attempt = 0
        while True:
            # The first attempt uses the plain timestamped name; later ones add a counter.
            suffix = f"_{attempt:03d}" if attempt else ""
            versioned_name = f"{stem}{suffix}{ext}"
            filepath = os.path.join(self.base_dir, versioned_name)
            try:
                # Mode 'x' fails instead of truncating when the file exists,
                # so a timestamp collision cannot destroy a previous version.
                with open(filepath, 'x') as f:
                    f.write(content)
            except FileExistsError:
                attempt += 1
                continue
            return versioned_name

    def list_versions(self, filename):
        """Return the sorted names of all saved versions of ``filename``.

        A directory entry counts as a version when it starts with
        ``<base>_`` and ends with the extension of ``filename``.
        """
        base_name, ext = os.path.splitext(filename)
        prefix = base_name + "_"
        return sorted(
            entry
            for entry in os.listdir(self.base_dir)
            if entry.startswith(prefix) and entry.endswith(ext)
        )

# Example usage: save three versions, list them, then remove the demo directory.
import shutil

vm = FileVersionManager()

try:
    # Save multiple versions
    print("Saving file versions...")
    for i in range(3):
        content = f"Model configuration version {i+1}\nlearning_rate: {0.01 * (i+1)}\n"
        versioned = vm.save_version("model_config.txt", content)
        print(f"  Saved: {versioned}")
        # Pause between saves. Whether this alone yields distinct file names
        # depends on the resolution of stx.repro.timestamp().
        time.sleep(1)

    # List versions
    print("\nAll versions:")
    versions = vm.list_versions("model_config.txt")
    for v in versions:
        print(f"  {v}")
finally:
    # Clean up the directory the manager actually used, even if the demo failed part-way
    shutil.rmtree(vm.base_dir)

## 5. Complete Reproducibility Workflow

In [None]:
# Complete reproducible experiment framework
class ReproducibleExperiment:
    """Context manager that tags a run with an ID/timestamp and fixes seeds.

    On construction the experiment receives an ID and a timestamp from
    ``stx.repro``. Entering the context prints the run header and fixes the
    seeds of NumPy, ``random`` and (when available) PyTorch. Leaving it
    prints the outcome and any logged results.
    """

    def __init__(self, name, seed=42):
        """Create the experiment record.

        Args:
            name: Human-readable experiment name.
            seed: Seed handed to ``stx.repro.fix_seeds`` on ``__enter__``.
        """
        self.name = name
        self.seed = seed
        self.exp_id = stx.repro.gen_id()
        self.timestamp = stx.repro.timestamp()
        self.results = {}
        self.metadata = {
            'name': name,
            'id': self.exp_id,
            'timestamp': self.timestamp,
            'seed': seed
        }

    def __enter__(self):
        """Print the run header, fix all available seeds and return ``self``."""
        print(f"\nStarting experiment: {self.name}")
        print(f"ID: {self.exp_id}")
        print(f"Timestamp: {self.timestamp}")
        print(f"Seed: {self.seed}")

        # Fix all available seeds
        libs = {'np': np, 'random': random}
        if TORCH_AVAILABLE:
            libs['torch'] = torch

        stx.repro.fix_seeds(seed=self.seed, verbose=False, **libs)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Report the outcome and logged results; exceptions are not suppressed."""
        if exc_type is None:
            print(f"\nExperiment {self.name} completed")
        else:
            # Do not claim success when the body raised
            print(f"\nExperiment {self.name} failed: {exc_type.__name__}: {exc_val}")
        if self.results:
            print("Results:")
            for key, value in self.results.items():
                print(f"  {key}: {value}")
        return False

    def log_result(self, key, value):
        """Store ``value`` under ``key`` in the experiment's results."""
        self.results[key] = value

    def save_config(self, params, save_dir=None):
        """Build the experiment configuration and optionally write it as JSON.

        Args:
            params: Experiment parameters, stored under ``'parameters'``.
            save_dir: Directory to write ``exp_<id>_config.json`` into. With
                the default ``None`` nothing is written and the message says so.

        Returns:
            A dict with the metadata, the parameters and a snapshot of the
            results logged so far. Later ``log_result`` calls do not alter it.
        """
        config = {
            **self.metadata,
            'parameters': params,
            # Shallow copy: the returned config must not track later log_result calls
            'results': dict(self.results)
        }
        filename = f"exp_{self.exp_id}_config.json"
        if save_dir is None:
            print(f"\nConfig prepared (not written to disk): {filename}")
        else:
            import json

            os.makedirs(save_dir, exist_ok=True)
            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'w') as f:
                # default=str keeps non-JSON values (e.g. datetimes) from aborting the save
                json.dump(config, f, indent=2, default=str)
            print(f"\nConfig saved to: {filepath}")
        return config

# Run the same analysis twice under the same seed and compare the statistics.
with ReproducibleExperiment("Data Analysis", seed=123) as exp:
    data = np.random.randn(1000)

    # Log each summary statistic as a plain Python float
    summary_stats = (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max))
    for stat_name, stat_fn in summary_stats:
        exp.log_result(stat_name, float(stat_fn(data)))

    config = exp.save_config(dict(n_samples=1000, distribution='normal'))

# Second pass under the same seed
print("\n" + 50 * "=")
print("Verifying reproducibility...")

with ReproducibleExperiment("Data Analysis (Repeat)", seed=123) as exp2:
    data2 = np.random.randn(1000)

    exp2.log_result('mean', float(np.mean(data2)))
    exp2.log_result('std', float(np.std(data2)))

    # Exact float equality is intended: identical seeds should give identical draws
    means_match = exp.results['mean'] == exp2.results['mean']
    stds_match = exp.results['std'] == exp2.results['std']
    print("\nReproducibility check:")
    print(f"  Means match: {means_match}")
    print(f"  Stds match: {stds_match}")

## 6. Advanced Patterns

### 6.1 Distributed Experiment Tracking

In [None]:
# Simulate distributed experiment tracking
class DistributedExperimentManager:
    """Track experiments that fan out to several workers.

    ``experiments`` maps a master ID to the experiment record;
    ``workers`` maps a worker ID to that worker's latest check-in.
    """

    def __init__(self):
        self.experiments = {}
        self.workers = {}

    def create_experiment(self, name, n_workers=4):
        """Register an experiment and derive one ID per worker.

        Worker IDs are the master ID followed by ``_W`` and a two-digit index.

        Returns:
            ``(master_id, worker_ids)``.
        """
        master_id = stx.repro.gen_id(N=12)
        worker_ids = [f"{master_id}_W{rank:02d}" for rank in range(n_workers)]

        record = dict(
            name=name,
            master_id=master_id,
            timestamp=stx.repro.timestamp(),
            n_workers=n_workers,
            worker_ids=worker_ids,
            status='created',
        )
        self.experiments[master_id] = record

        return master_id, worker_ids

    def worker_checkin(self, worker_id, status, metrics=None):
        """Record the latest status (and optional metrics) of a worker."""
        checkin = {
            'last_checkin': datetime.now(),
            'status': status,
            # A falsy `metrics` (None or empty) is stored as a fresh empty dict
            'metrics': metrics if metrics else {},
        }
        self.workers[worker_id] = checkin

    def get_experiment_status(self, master_id):
        """Summarise an experiment and the status of each of its workers.

        Returns:
            None when ``master_id`` is unknown; otherwise a dict with the
            keys ``experiment``, ``master_id`` and ``workers``. Workers that
            have not checked in are reported as ``'not started'``.
        """
        exp = self.experiments.get(master_id)
        if not exp:
            return None

        not_started = {'status': 'not started'}
        worker_statuses = [
            {'id': wid, 'status': self.workers.get(wid, not_started)['status']}
            for wid in exp['worker_ids']
        ]

        return {
            'experiment': exp['name'],
            'master_id': master_id,
            'workers': worker_statuses,
        }

# Walk through one distributed experiment: create it, let workers report, print status.
manager = DistributedExperimentManager()

master_id, worker_ids = manager.create_experiment("Distributed Training", n_workers=3)
print("Created distributed experiment:")
print(f"  Master ID: {master_id}")
print("  Worker IDs:")
for wid in worker_ids:
    print(f"    {wid}")

# Each worker reports a different phase, with metrics derived from its index
print("\nSimulating worker activity...")
statuses = ['initializing', 'training', 'completed']
for i, wid in enumerate(worker_ids):
    phase = statuses[i % len(statuses)]
    metrics = {'loss': 0.5 - i*0.1, 'accuracy': 0.8 + i*0.05}
    manager.worker_checkin(wid, phase, metrics)
    # The last three characters of a worker ID are its 'Wnn' tag
    print(f"  Worker {wid[-3:]} -> {phase}")

# Ask the manager for the combined view
status = manager.get_experiment_status(master_id)
print("\nExperiment Status:")
print(f"  Experiment: {status['experiment']}")
print(f"  Master ID: {status['master_id']}")
print("  Workers:")
for worker in status['workers']:
    print(f"    {worker['id'][-3:]}: {worker['status']}")

### 6.2 Reproducibility Metadata

In [None]:
import platform
import sys

def capture_environment():
    """Collect environment details that matter for reproducing a run.

    Returns:
        A dict of plain Python values with these keys:

        - ``timestamp`` / ``experiment_id``: from ``stx.repro``.
        - ``platform``: OS, machine and Python version strings.
        - ``packages``: versions of numpy, pandas, scitex (``'unknown'`` if
          the package exposes no ``__version__``) and torch when available.
        - ``seeds``: the first value of the internal state of the ``random``
          and NumPy global generators.
        - ``cuda`` (only when PyTorch is available): availability and version.
    """
    env_info = {
        'timestamp': stx.repro.timestamp(),
        'experiment_id': stx.repro.gen_id(),
        'platform': {
            'system': platform.system(),
            'release': platform.release(),
            'version': platform.version(),
            'machine': platform.machine(),
            'processor': platform.processor(),
            'python_version': sys.version,
        },
        'packages': {
            'numpy': np.__version__,
            'pandas': pd.__version__,
            'scitex': getattr(stx, '__version__', 'unknown'),
        },
        'seeds': {
            # First value of each generator's state vector. int() turns the
            # NumPy scalar into a plain int so the dict can be dumped to JSON.
            'random_state': int(random.getstate()[1][0]),
            'numpy_state': int(np.random.get_state()[1][0]),
        }
    }

    if TORCH_AVAILABLE:
        env_info['packages']['torch'] = torch.__version__
        cuda_available = torch.cuda.is_available()
        env_info['cuda'] = {
            'available': cuda_available,
            'version': torch.version.cuda if cuda_available else None
        }

    return env_info

# Snapshot the environment and print it section by section.
env = capture_environment()

print("=== Reproducibility Metadata ===")
print(f"\nExperiment ID: {env['experiment_id']}")
print(f"Timestamp: {env['timestamp']}")

print("\nPlatform:")
for key, value in env['platform'].items():
    # The full interpreter banner is long; a short form is printed after the loop
    if key == 'python_version':
        continue
    print(f"  {key}: {value}")
python_short_version = sys.version.split()[0]
print(f"  Python: {python_short_version}")

print("\nPackage versions:")
for pkg, version in env['packages'].items():
    print(f"  {pkg}: {version}")

show_cuda = TORCH_AVAILABLE and 'cuda' in env
if show_cuda:
    cuda_info = env['cuda']
    print(f"\nCUDA available: {cuda_info['available']}")
    if cuda_info['version']:
        print(f"CUDA version: {cuda_info['version']}")

## 7. Best Practices Summary

### Key Takeaways

1. **Seed Management**: Always fix seeds at the start of experiments
2. **Unique Identification**: Use IDs for tracking experiments and versions
3. **Timestamp Everything**: Add timestamps for temporal tracking
4. **Environment Capture**: Record complete environment information
5. **Structured Workflows**: Build reproducibility into your experiment design

### Best Practices

1. **Start Every Experiment**:
   ```python
   stx.repro.fix_seeds(seed=42)
   exp_id = stx.repro.gen_id()
   ```

2. **Timestamped File Versioning**:
   ```python
   # Include in filenames
   model_file = f"model_{stx.repro.timestamp()}.pkl"
   ```

3. **Experiment Tracking**:
   ```python
   # Unique ID for each run
   run_id = stx.repro.gen_id(N=12)
   ```

4. **Distributed Systems**:
   ```python
   # Hierarchical IDs
   master_id = stx.repro.gen_id()
   worker_id = f"{master_id}_W01"
   ```

In [None]:
# Closing message and recap of the recommended habits
closing_lines = (
    "\nReproducibility module tutorial completed!",
    "\nNext steps:",
    "1. Always start experiments with fix_seeds()",
    "2. Use gen_id() for experiment tracking",
    "3. Add timestamps to all output files",
    "4. Build reproducibility into your workflows",
)
for line in closing_lines:
    print(line)