# SeaweedFS Integration Examples

This notebook demonstrates how to work with SeaweedFS storage for datasets and models.

In [None]:
import json
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## Storage Locations

JupyterHub provides several storage locations:
- `/home/jovyan/notebooks` - Persistent notebook storage (SeaweedFS)
- `/home/jovyan/datasets` - Shared datasets (SeaweedFS)
- `/home/jovyan/models` - Trained models (SeaweedFS)
- `/home/jovyan/scratch` - Fast local temporary storage (not persistent; contents are lost when the pod restarts)

In [None]:
# Report free and total capacity for each expected storage location
storage_paths = dict(
    notebooks='/home/jovyan/notebooks',
    datasets='/home/jovyan/datasets',
    models='/home/jovyan/models',
    scratch='/home/jovyan/scratch',
)

# Divisor used to express byte counts in the "GB" figures printed below
BYTES_PER_GB = 1024**3

for label, location in storage_paths.items():
    # A location that does not exist is reported and skipped
    if not os.path.exists(location):
        print(f"{label:10} - {location} (not mounted)\n")
        continue

    # statvfs reports block counts; multiplying by the fragment size gives bytes
    fs_info = os.statvfs(location)
    available_gb = (fs_info.f_bavail * fs_info.f_frsize) / BYTES_PER_GB
    capacity_gb = (fs_info.f_blocks * fs_info.f_frsize) / BYTES_PER_GB
    print(f"{label:10} - {location}")
    print(f"           Free: {available_gb:.2f} GB / Total: {capacity_gb:.2f} GB\n")

## Working with Datasets

In [None]:
# Build a small synthetic table (two numeric features, one 0/1 target)
# and write it to the datasets directory as CSV.
np.random.seed(42)  # fixed seed so every run generates the same values
n_samples = 1000

# Columns are drawn in this fixed order so the random stream is consumed
# the same way on every run.
synthetic_columns = {}
synthetic_columns['feature1'] = np.random.randn(n_samples)
synthetic_columns['feature2'] = np.random.randn(n_samples)
synthetic_columns['target'] = np.random.randint(0, 2, n_samples)
data = pd.DataFrame(synthetic_columns)

# Save to datasets directory (persistent across nodes); the index is not written
dataset_path = Path('/home/jovyan/datasets') / 'sample_data.csv'
data.to_csv(dataset_path, index=False)

print(f"Dataset saved to: {dataset_path}")
print(f"Dataset shape: {data.shape}")
size_kb = dataset_path.stat().st_size / 1024
print(f"File size: {size_kb:.2f} KB")

In [None]:
# Read the CSV back from the datasets directory and show its first rows
loaded_data = pd.read_csv(dataset_path)
print("Dataset loaded successfully")
preview = loaded_data.head()
print(preview)

## Working with Models

In [None]:
# Train a random-forest classifier on the reloaded dataset and save it.
# scikit-learn and joblib are imported in the notebook's import cell at the top,
# so a missing dependency fails before any earlier cell has done work.

# Split data: hold out 20% of the rows for evaluation (fixed random_state for repeatability)
X = loaded_data[['feature1', 'feature2']]
y = loaded_data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model: 100 trees, seeded so repeated runs build the same forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on the held-out rows
score = model.score(X_test, y_test)
print(f"Model accuracy: {score:.4f}")

# Save model to persistent storage
model_path = Path('/home/jovyan/models/sample_model.joblib')
joblib.dump(model, model_path)
print(f"Model saved to: {model_path}")

In [None]:
# Restore the classifier that was saved to the models directory.
# NOTE(review): joblib.load unpickles the file, so only load artifacts from a source you trust.
loaded_model = joblib.load(model_path)
print("Model loaded successfully")

# Predict on the first five held-out rows as a quick check
sample_rows = X_test.iloc[:5]
predictions = loaded_model.predict(sample_rows)
print(f"Sample predictions: {predictions}")

## Using Scratch Space for Temporary Work

In [None]:
# Stage a large throwaway array in the scratch directory
scratch_dir = Path('/home/jovyan/scratch')
scratch_file = scratch_dir / 'temp_large_data.npy'

# 10000 x 1000 samples drawn from the standard normal distribution
array_shape = (10000, 1000)
large_array = np.random.randn(*array_shape)
np.save(scratch_file, large_array)
print(f"Temporary file saved to scratch: {scratch_file}")

size_mb = scratch_file.stat().st_size / (1024**2)
print(f"File size: {size_mb:.2f} MB")

# Note: files under /home/jovyan/scratch are NOT persistent and will be lost when the pod restarts

## Creating Project Structure

In [None]:
# Create a project structure in notebooks directory.
# The cell is safe to re-run: existing directories are left alone and an
# existing README.md is never overwritten, so manual edits to it survive a
# "Restart & Run All".
project_name = "my_ml_project"
project_base = Path(f'/home/jovyan/notebooks/projects/{project_name}')

# Create directories
dirs_to_create = [
    project_base / 'data' / 'raw',
    project_base / 'data' / 'processed',
    project_base / 'notebooks',
    project_base / 'models',
    project_base / 'src',
    project_base / 'results'
]

for dir_path in dirs_to_create:
    # parents=True also creates project_base itself; exist_ok=True makes re-runs a no-op
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"Created: {dir_path}")

# Create a README
readme_content = f"""# {project_name}

## Project Structure
- `data/` - Dataset storage
- `notebooks/` - Jupyter notebooks
- `models/` - Trained models
- `src/` - Source code
- `results/` - Results and outputs

Created on: {pd.Timestamp.now()}
"""

readme_path = project_base / 'README.md'
try:
    # Mode 'x' creates the file only if it does not exist yet, so a README that
    # is already there (possibly hand-edited) is never clobbered. The encoding is
    # explicit so the result does not depend on the kernel's locale.
    with readme_path.open('x', encoding='utf-8') as readme_file:
        readme_file.write(readme_content)
except FileExistsError:
    print(f"README already exists, leaving it untouched: {readme_path}")

print(f"\nProject structure created at: {project_base}")

## Persistence Verification

Files saved to SeaweedFS-backed directories will persist even if:
- The notebook server restarts
- You switch to a different node
- The pod is rescheduled

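Because these files do not depend on any single node, GPU workloads can be scheduled on whichever node is available and still find the same notebooks, datasets, and models. The exception is `/home/jovyan/scratch`, which is local temporary storage and is not persistent.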