# SciTeX Path Module Tutorial

This notebook demonstrates the path utilities in SciTeX for file system operations, versioning, and path management.

## 1. Setup and Imports

In [None]:
import scitex as stx
import os
import shutil
from pathlib import Path
import tempfile
import time

# Remember where the notebook started so the cleanup cell can return here.
original_dir = os.getcwd()

# All demo files are written into a throwaway directory created for this run.
temp_dir = tempfile.mkdtemp(prefix="scitex_path_demo_")
print(f"Working directory: {temp_dir}")

# From here on, relative paths in later cells resolve inside the scratch area.
os.chdir(temp_dir)

## 2. Basic Path Operations

### 2.1 Getting Current Path Information

In [None]:
# Ask SciTeX for the path of the file that is currently executing.
# (Per the tutorial's own note: under Jupyter this is the notebook's path,
# in a regular Python script it is the script's path.)
current_path = stx.path.this_path()
print(f"Current file path: {current_path}")

# Break the path apart with the standard library.
print("\nPath components:")
parent_dir, file_name = os.path.split(current_path)
extension = os.path.splitext(current_path)[1]
print(f"  Directory: {parent_dir}")
print(f"  Filename: {file_name}")
print(f"  Extension: {extension}")

### 2.2 Path Splitting

In [None]:
# Sample inputs: absolute, "./"-relative, parent-relative and double-suffix paths.
test_paths = [
    "/home/user/data/experiment.csv",
    "./results/model_v001.pkl",
    "../figures/plot.png",
    "data/raw/sample.txt.gz"
]

print("Path splitting examples:")
for sample in test_paths:
    # stx.path.split is unpacked as a (directory, filename, extension) triple.
    parent, stem, suffix = stx.path.split(sample)
    print(
        f"\nPath: {sample}",
        f"  Directory: {parent}",
        f"  Filename: {stem}",
        f"  Extension: {suffix}",
        sep="\n",
    )

### 2.3 Path Cleaning

In [None]:
# Inputs with redundant separators, "." / ".." segments, spaces and "~".
messy_paths = [
    "./data/../results//output.txt",
    "figures/./plots///graph.png",
    "my data/file with spaces.csv",
    "~/documents/../downloads/./file.pdf"
]

print("Path cleaning examples:")
# The generator is lazy, so each path is cleaned right before it is printed.
for messy, clean in ((raw, stx.path.clean(raw)) for raw in messy_paths):
    print(f"\nOriginal: {messy}", f"Cleaned:  {clean}", sep="\n")

## 3. Safe Path Generation

### 3.1 Creating Safe Paths

In [None]:
# Look up the "safe path" SciTeX associates with the current script/notebook.
spath = stx.path.get_spath()
print(f"Safe path: {spath}")

# Ask SciTeX to create that directory, then confirm it exists on disk.
stx.path.mk_spath()
print(f"\nCreated directory: {os.path.exists(spath)}")

# Write an example output file inside the safe path.
output_file = os.path.join(spath, "results.txt")
Path(output_file).write_text("Results from notebook\n")
print(f"\nCreated file: {output_file}")

## 4. File and Directory Search

### 4.1 Setting Up Test Directory Structure

In [None]:
# Files to create, grouped by top-level directory (paths are relative to it).
test_structure = {
    "data": [
        "raw/sample1.csv",
        "raw/sample2.csv",
        "processed/cleaned_data.pkl",
        "processed/features.npy"
    ],
    "models": [
        "v1/model_001.pkl",
        "v1/model_002.pkl",
        "v2/model_001.pkl",
        "best/final_model.pkl"
    ],
    "figures": [
        "plots/accuracy.png",
        "plots/loss.png",
        "plots/confusion_matrix.pdf",
        "report.tex"
    ],
    "logs": [
        "experiment_001.log",
        "experiment_002.log",
        "debug.log"
    ]
}


def format_tree(top="."):
    """Return an indented listing of everything below ``top``.

    Each directory is rendered as ``<name>/`` and each file as ``<name>``,
    indented by two spaces per nesting level. Directories and files are
    visited in sorted order so the listing is the same on every run.

    Args:
        top: Directory to walk. Defaults to the current directory.

    Returns:
        list[str]: One line per directory or file, in walk order.
    """
    lines = []
    for root, dirs, files in os.walk(top):
        # Sorting in place steers os.walk's top-down traversal order.
        dirs.sort()
        # Depth is measured relative to ``top``; counting separators in the
        # relative path is unaffected by dots inside directory names.
        rel = os.path.relpath(root, top)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        lines.append(f"{' ' * 2 * level}{os.path.basename(root)}/")
        subindent = ' ' * 2 * (level + 1)
        lines.extend(f"{subindent}{name}" for name in sorted(files))
    return lines


# Create the structure: one small dummy file per entry.
for base_dir, rel_files in test_structure.items():
    for file_path in rel_files:
        full_path = os.path.join(base_dir, file_path)
        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        with open(full_path, 'w') as f:
            f.write(f"Dummy content for {file_path}\n")

print("Created test directory structure:")
for line in format_tree('.'):
    print(line)

### 4.2 Finding Files

In [None]:
def _print_matches(header, matches):
    """Print ``header`` followed by every entry of ``matches``, indented by two spaces."""
    print(header)
    for f in matches:
        print(f"  {f}")


print("=== Finding Files ===")

# Every CSV file.
csv_files = stx.path.find_file("*.csv")
_print_matches("\nCSV files found:", csv_files)

# Every pickle whose name starts with "model".
model_files = stx.path.find_file("model*.pkl")
_print_matches("\nModel files found:", model_files)

# Log files with "experiment" somewhere in the name.
log_files = stx.path.find_file("*experiment*.log")
_print_matches("\nExperiment log files:", log_files)

# Anything found when the search is rooted at data/processed.
processed_files = stx.path.find_file("*", root="data/processed")
_print_matches("\nFiles in data/processed:", processed_files)

### 4.3 Finding Directories

In [None]:
def _print_dirs(header, directories):
    """Print ``header`` followed by every entry of ``directories``, indented by two spaces."""
    print(header)
    for d in directories:
        print(f"  {d}")


print("=== Finding Directories ===")

# Every directory; this listing is shown in sorted order.
all_dirs = stx.path.find_dir("*")
_print_dirs("\nAll directories:", sorted(all_dirs))

# Directories whose name starts with "v".
version_dirs = stx.path.find_dir("v*")
_print_dirs("\nVersion directories:", version_dirs)

# Directories named exactly "plots".
plot_dirs = stx.path.find_dir("plots")
_print_dirs("\nPlot directories:", plot_dirs)

## 5. Version Control for Files

### 5.1 Automatic Version Incrementing

In [None]:
print("=== Version Control ===")

# All versioned files go under results/.
base_name = "results/experiment"
os.makedirs("results", exist_ok=True)

# Build a chain of five files: the first is named by hand, every later name
# comes from passing the previous one to stx.path.increment_version.
versions = []
for i in range(5):
    filename = (
        stx.path.increment_version(versions[-1])
        if versions
        else f"{base_name}_v001.txt"
    )
    versions.append(filename)

    with open(filename, 'w') as handle:
        handle.write(f"Experiment version {i+1}\n")
        handle.write(f"Timestamp: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

    print(f"Created: {filename}")
    time.sleep(0.1)  # Small delay between files

# Ask SciTeX which of the files matching the pattern is the latest.
latest = stx.path.find_latest("results/experiment_v*.txt")
print(f"\nLatest version: {latest}")

### 5.2 Custom Version Formats

In [None]:
# File names that carry their version number in different ways.
version_examples = [
    "model_v1.pkl",
    "data_version_001.csv",
    "checkpoint_iter_1000.pt",
    "backup_2024_01_15_v1.zip"
]

print("Custom version increments:")
for example in version_examples:
    next_version = stx.path.increment_version(example)
    print(f"\n{example}", f"  -> {next_version}", sep="\n")

# Sequence that passes version_prefix="rev" to increment_version.
print("\nCustom version sequence:")
custom_base = "backup/daily_backup"
os.makedirs("backup", exist_ok=True)

filename = f"{custom_base}_rev001.tar.gz"
for step in range(3):
    if step:
        # Every file after the first is derived from the previous name.
        filename = stx.path.increment_version(filename, version_prefix="rev")

    Path(filename).touch()
    print(f"  {filename}")

## 6. Working with Git Repositories

In [None]:
# Try to locate the git root for the current working directory.
try:
    git_root = stx.path.find_git_root()
    print(f"Git root found: {git_root}")

    # Express the working directory relative to that root.
    rel_path = os.path.relpath(os.getcwd(), git_root)
    print(f"Current directory relative to git root: {rel_path}")
except Exception as err:
    print(f"Not in a git repository or git root not found: {err}")

    # Fall back to a mock layout: a directory holding a ".git" folder plus a
    # nested source directory to search from.
    print("\nCreating demo git structure...")
    demo_git = "demo_project"
    module_dir = os.path.join(demo_git, "src", "module")
    os.makedirs(os.path.join(demo_git, ".git"), exist_ok=True)
    os.makedirs(module_dir, exist_ok=True)

    # Step into the nested directory for the second lookup.
    original_cwd = os.getcwd()
    os.chdir(module_dir)

    try:
        git_root = stx.path.find_git_root()
        print(f"Demo git root: {git_root}")
        print(f"Current location: {os.getcwd()}")
    except Exception as demo_err:
        print(f"Error: {demo_err}")
    finally:
        # Always restore the working directory for the following cells.
        os.chdir(original_cwd)

## 7. File Size Utilities

In [None]:
print("=== File Sizes ===")

# Directory that holds the sample files.
size_test_dir = "size_test"
os.makedirs(size_test_dir, exist_ok=True)

# File name -> raw bytes to write.
test_files = {
    "small.txt": b"Small file content",
    "medium.dat": b"x" * 1024 * 10,  # 10 KB
    "large.bin": b"X" * 1024 * 1024,  # 1 MB
}

for filename, content in test_files.items():
    path = os.path.join(size_test_dir, filename)
    Path(path).write_bytes(content)

    # Query the size three ways: default call, unit='KB', unit='MB'.
    size_bytes, size_kb, size_mb = (
        stx.path.getsize(path),
        stx.path.getsize(path, unit='KB'),
        stx.path.getsize(path, unit='MB'),
    )

    print(
        f"\n{filename}:",
        f"  Bytes: {size_bytes:,}",
        f"  KB: {size_kb:.2f}",
        f"  MB: {size_mb:.4f}",
        sep="\n",
    )

# The same calls applied to the directory itself.
dir_size, dir_size_kb = (
    stx.path.getsize(size_test_dir),
    stx.path.getsize(size_test_dir, unit='KB'),
)
print(f"\nTotal directory size:")
print(f"  Bytes: {dir_size:,}")
print(f"  KB: {dir_size_kb:.2f}")

## 8. Practical Examples

### 8.1 Experiment Output Management

In [None]:
class ExperimentManager:
    """Create, populate and list versioned experiment directories.

    Every experiment lives in ``<base_dir>/<name>_vNNN.exp`` and gets the
    sub-directories ``data``, ``models``, ``figures`` and ``logs``.
    """

    def __init__(self, base_dir="experiments"):
        """Store ``base_dir`` and make sure it exists on disk."""
        os.makedirs(base_dir, exist_ok=True)
        self.base_dir = base_dir

    def new_experiment(self, name):
        """Create the next versioned directory for ``name`` and return its path.

        The newest match of ``<base_dir>/<name>_v*.exp`` (as reported by
        ``stx.path.find_latest``) is passed to ``stx.path.increment_version``;
        when nothing matches, the sequence starts at ``<name>_v001.exp``.

        Raises:
            OSError: from ``os.makedirs`` if the target directory already exists.
        """
        latest = stx.path.find_latest(
            os.path.join(self.base_dir, f"{name}_v*.exp")
        )
        new_dir = (
            stx.path.increment_version(latest)
            if latest
            else os.path.join(self.base_dir, f"{name}_v001.exp")
        )

        # No exist_ok here: the experiment root and its children must be new.
        os.makedirs(new_dir)
        for subdir in ('data', 'models', 'figures', 'logs'):
            os.makedirs(os.path.join(new_dir, subdir))

        return new_dir

    def save_results(self, exp_dir, results):
        """Write ``results`` as indented JSON to ``<exp_dir>/results.json``."""
        import json

        with open(os.path.join(exp_dir, "results.json"), 'w') as f:
            json.dump(results, f, indent=2)

    def list_experiments(self):
        """Return the sorted ``*.exp`` directories that ``stx.path.find_dir`` finds under ``base_dir``."""
        return sorted(stx.path.find_dir("*.exp", root=self.base_dir))

# Manager rooted at the default "experiments" directory.
manager = ExperimentManager()

# Three runs of the same experiment name; each gets its own versioned directory.
for i in range(3):
    exp_dir = manager.new_experiment("neural_network")
    print(f"\nCreated experiment: {exp_dir}")

    # Fake metrics that improve slightly with every run.
    manager.save_results(
        exp_dir,
        dict(
            accuracy=0.85 + i * 0.03,
            loss=0.3 - i * 0.05,
            parameters=dict(lr=0.001, batch_size=32),
        ),
    )

# Show what was created.
print("\nAll experiments:")
for exp in manager.list_experiments():
    print(f"  {exp}")

### 8.2 Data Pipeline with Auto-versioning

In [None]:
class DataPipeline:
    """Data processing pipeline with automatic versioning.

    Files are stored as ``<base_dir>/<stage>/<stage>_data_vNNN.<ext>``, where
    ``base_dir`` is ``stx.path.clean("./projects/<project_name>")``. Each data
    file gets a ``..._meta.json`` sidecar next to it.
    """

    def __init__(self, project_name):
        """Remember the project name and create the project directories."""
        self.project_name = project_name
        self.base_dir = stx.path.clean(f"./projects/{project_name}")
        self._setup_directories()

    def _setup_directories(self):
        """Create one directory per pipeline stage under ``base_dir``."""
        for d in ('raw', 'processed', 'features', 'models', 'reports'):
            os.makedirs(os.path.join(self.base_dir, d), exist_ok=True)

    @staticmethod
    def _metadata_path(data_path):
        """Return the sidecar path for ``data_path``: final extension swapped for ``_meta.json``."""
        # splitext touches only the final suffix; a plain str.replace would
        # also rewrite the extension text wherever it occurs earlier in the path.
        stem, _ = os.path.splitext(data_path)
        return f"{stem}_meta.json"

    @staticmethod
    def _describe_shape(data):
        """Return ``data.shape`` if present, else ``len(data)``, else ``None``."""
        shape = getattr(data, 'shape', None)
        if shape is not None:
            return shape
        try:
            return len(data)
        except TypeError:
            # Object is unsized.
            return None

    def save_data(self, data, stage, description=""):
        """Save ``data`` for ``stage`` under the next version number.

        The format depends on the type of ``data``: ``pandas.DataFrame`` is
        written as CSV (without the index), ``numpy.ndarray`` as ``.npy``, and
        anything else is pickled. A JSON sidecar holding a timestamp, the
        description, the shape/length and the type name is written alongside.

        Args:
            data: Object to persist.
            stage: Stage name; used both as sub-directory and file-name prefix.
            description: Free-text note stored in the metadata file.

        Returns:
            Path of the data file that was written.
        """
        import json
        import pickle

        import numpy as np
        import pandas as pd

        # Pick extension and writer from the runtime type of ``data``.
        if isinstance(data, pd.DataFrame):
            ext = ".csv"

            def save_func(path):
                data.to_csv(path, index=False)
        elif isinstance(data, np.ndarray):
            ext = ".npy"

            def save_func(path):
                np.save(path, data)
        else:
            ext = ".pkl"

            def save_func(path):
                # Context manager so the handle is flushed and closed promptly.
                with open(path, 'wb') as fh:
                    pickle.dump(data, fh)

        # Generate versioned filename: bump the latest match or start at v001.
        base_name = os.path.join(self.base_dir, stage, f"{stage}_data")
        pattern = f"{base_name}_v*{ext}"

        latest = stx.path.find_latest(pattern)
        if latest:
            new_path = stx.path.increment_version(latest)
        else:
            new_path = f"{base_name}_v001{ext}"

        # Save data
        save_func(new_path)

        # Save metadata next to the data file.
        metadata = {
            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "description": description,
            "shape": self._describe_shape(data),
            "type": type(data).__name__
        }
        with open(self._metadata_path(new_path), 'w') as f:
            json.dump(metadata, f, indent=2)

        return new_path

    def get_latest(self, stage):
        """Return the last data file for ``stage`` in lexicographic order, or ``None``.

        Metadata sidecars (``*_meta.json``) are ignored.
        """
        pattern = os.path.join(self.base_dir, stage, f"{stage}_data_v*.*")
        files = stx.path.find_file(os.path.basename(pattern),
                                   root=os.path.dirname(pattern))

        # Filter out metadata files
        data_files = [f for f in files if not f.endswith('_meta.json')]

        if data_files:
            return max(data_files)
        return None

# Use the pipeline
import pandas as pd
import numpy as np

# Fixed seed so the generated demo data is identical on every run.
rng = np.random.default_rng(42)

pipeline = DataPipeline("ml_project")

# Simulate data processing stages
print("Processing data through pipeline...\n")

# Stage 1: Raw data (two continuous features and a 0/1 target)
raw_data = pd.DataFrame({
    'feature1': rng.standard_normal(100),
    'feature2': rng.standard_normal(100),
    'target': rng.integers(0, 2, 100)
})
raw_path = pipeline.save_data(raw_data, 'raw', "Initial dataset")
print(f"Saved raw data: {raw_path}")

# Stage 2: Processed data (simulate multiple versions)
for i in range(3):
    # Work on a copy so raw_data is never modified.
    processed_data = raw_data.copy()
    processed_data['feature3'] = processed_data['feature1'] * 2 + i
    proc_path = pipeline.save_data(processed_data, 'processed',
                                   f"Added feature3 (version {i+1})")
    print(f"Saved processed data: {proc_path}")
    time.sleep(0.1)

# Stage 3: Features, taken from the last processed frame of the loop above
features = processed_data[['feature1', 'feature2', 'feature3']].values
feat_path = pipeline.save_data(features, 'features', "Extracted feature matrix")
print(f"Saved features: {feat_path}")

# Get latest versions
print("\nLatest versions:")
for stage in ['raw', 'processed', 'features']:
    latest = pipeline.get_latest(stage)
    if latest:
        print(f"  {stage}: {os.path.basename(latest)}")

## 9. Advanced Path Utilities

In [None]:
# Look up the data path for the 'scitex' package; report the error if the call fails.
try:
    data_path = stx.path.get_data_path_from_a_package('scitex')
    print(f"SciTeX data path: {data_path}")
except Exception as err:
    print(f"Could not get package data path: {err}")

print("\nComplex path examples:")

# Clean a messy nested path, then make sure its parent directory exists.
messy_path = "./output//results/../final///report.pdf"
clean_path = stx.path.clean(messy_path)
print(f"Cleaned path: {clean_path}")

output_dir = os.path.dirname(clean_path)
os.makedirs(output_dir, exist_ok=True)

# Search for Python files from the current directory.
print("\nFinding Python files...")
py_files = stx.path.find_file("*.py", root=".")
# Original tutorial note: find_file automatically excludes /lib/, /env/, /build/ directories
print(f"Found {len(py_files)} Python files")

# Batch rename simulation: create plain files, then give each a _v001 suffix.
print("\nBatch versioning example:")
batch_dir = "batch_test"
os.makedirs(batch_dir, exist_ok=True)

for i in range(3):
    old_name = os.path.join(batch_dir, f"data_{i}.txt")
    new_name = os.path.join(batch_dir, f"data_{i}_v001.txt")

    # Create the unversioned file, then move it to its versioned name.
    Path(old_name).touch()
    os.rename(old_name, new_name)
    print(f"  Renamed: {os.path.basename(old_name)} -> {os.path.basename(new_name)}")

## 10. Cleanup

In [None]:
# Return to the original directory first: the scratch directory must not be
# the current working directory while it is being removed.
os.chdir(original_dir)

# Guard the removal so re-running this cell does not raise once the
# directory is already gone.
if os.path.isdir(temp_dir):
    shutil.rmtree(temp_dir)
    print(f"Cleaned up temporary directory: {temp_dir}")
else:
    print(f"Temporary directory already removed: {temp_dir}")

## Summary and Best Practices

### Key Takeaways

1. **Path Management**: Clean and normalize paths for cross-platform compatibility
2. **File Versioning**: Automatic version incrementing for outputs
3. **Smart Search**: Find files and directories with pattern matching
4. **Safe Paths**: Generate unique output directories for scripts
5. **Git Integration**: Find repository roots programmatically

### Best Practices

1. **Always Clean Paths**:
   ```python
   path = stx.path.clean("./data/../results//output.txt")
   ```

2. **Use Versioning for Outputs**:
   ```python
   latest = stx.path.find_latest("model_v*.pkl")
   next_version = stx.path.increment_version(latest)
   ```

3. **Organize with Safe Paths**:
   ```python
   output_dir = stx.path.mk_spath()
   # Creates unique directory for current script
   ```

4. **Smart File Search**:
   ```python
   # Automatically excludes /lib/, /env/, and /build/ directories
   files = stx.path.find_file("*.py")
   ```

In [None]:
# Closing banner followed by the suggested next steps, one message per line.
for message in (
    "\nPath module tutorial completed!",
    "\nNext steps:",
    "1. Use path.clean() for all file paths",
    "2. Implement versioning for experiment outputs",
    "3. Organize outputs with safe paths",
    "4. Use find_file/find_dir for dynamic file discovery",
):
    print(message)