# Data Generation Demo

This notebook demonstrates the new modular data generation structure for the geospatial neural adapter.

## Overview

The package now has a clean, modular structure:
- `data.generators`: Synthetic data generation functions
- `data.preprocessing`: Data preprocessing utilities
- `metrics`: Evaluation metrics

This replaces the old monolithic `utils.py` approach.

In [None]:
# Third-party numerical and plotting libraries used throughout the notebook.
import numpy as np
import matplotlib.pyplot as plt

# Import from the new modular structure:
#   - data.generators    -> synthetic data generation functions
#   - data.preprocessing -> train/validation/test dataset preparation
#   - metrics            -> evaluation metrics
from geospatial_neural_adapter.data.generators import (
    generate_combined_synthetic_data,
    generate_time_synthetic_data,
)
from geospatial_neural_adapter.data.preprocessing import prepare_all
from geospatial_neural_adapter.metrics import fusion_score, frobenius_norm

# Only reached if every import above succeeded (a failed import raises first).
print("✅ All imports successful!")

## Example 1: Generate Combined Synthetic Data

Generate synthetic data with combined trend and spatial components.

In [None]:
# Combined synthetic data: N evenly spaced locations on [-3, 3], with
# n_samples samples requested from the generator.
N = 100
n_samples = 50
locations = np.linspace(-3, 3, N)

# A fixed seed is passed so the call is parameterised identically on every run.
cat_features, cont_features, targets = generate_combined_synthetic_data(
    location=locations,
    n_samples=n_samples,
    noise_std=1.0,
    eigenvalue=0.5,
    seed=42,
)

# Report the shape of each returned array.
print("Generated data shapes:")
for array_name, array in (
    ("Categorical features", cat_features),
    ("Continuous features", cont_features),
    ("Targets", targets),
):
    print(f"  {array_name}: {array.shape}")

# Summary statistics over all target values.
target_summary = {
    "Mean": targets.mean(),
    "Std": targets.std(),
    "Min": targets.min(),
    "Max": targets.max(),
}
print("\nTarget statistics:")
for stat_name, stat_value in target_summary.items():
    print(f"  {stat_name}: {stat_value:.3f}")

## Example 2: Generate Temporal Synthetic Data

Generate synthetic temporal data with spatial correlation.

In [None]:
# Temporal synthetic data: T time steps generated on the same `locations` grid
# that the combined-data example built.
T = 200
cat_temporal, cont_temporal, targets_temporal = generate_time_synthetic_data(
    locs=locations,
    n_time_steps=T,
    noise_std=1.0,
    eigenvalue=0.5,
    seed=42,
)

# Report the shape of each returned array.
print("Generated temporal data shapes:")
for array_name, array in (
    ("Categorical features", cat_temporal),
    ("Continuous features", cont_temporal),
    ("Targets", targets_temporal),
):
    print(f"  {array_name}: {array.shape}")

# Summary statistics over all temporal target values.
temporal_summary = {
    "Mean": targets_temporal.mean(),
    "Std": targets_temporal.std(),
    "Min": targets_temporal.min(),
    "Max": targets_temporal.max(),
}
print("\nTemporal target statistics:")
for stat_name, stat_value in temporal_summary.items():
    print(f"  {stat_name}: {stat_value:.3f}")

## Example 3: Data Preprocessing

Prepare train/validation/test datasets from the generated data.

In [None]:
# Split the temporal data into train/validation/test datasets.
# Only the train and validation ratios are passed; the test share is
# presumably the remainder (0.15) -- confirm against prepare_all.
dataset_splits = prepare_all(
    cat_features=cat_temporal,
    cont_features=cont_temporal,
    targets=targets_temporal,
    train_ratio=0.7,
    val_ratio=0.15,
)
train_dataset, val_dataset, test_dataset = dataset_splits

# Number of samples in each split.
print("Dataset sizes:")
split_names = ("Training", "Validation", "Test")
for split_name, split in zip(split_names, (train_dataset, val_dataset, test_dataset)):
    print(f"  {split_name}: {len(split)} samples")

# The training split exposes its three underlying tensors via `.tensors`.
train_cat, train_cont, train_y = train_dataset.tensors
print("\nTraining dataset shapes:")
for tensor_name, tensor in (
    ("Categorical", train_cat),
    ("Continuous", train_cont),
    ("Targets", train_y),
):
    print(f"  {tensor_name}: {tensor.shape}")

## Example 4: Using Utility Metrics

Demonstrate the evaluation metric functions `frobenius_norm` and `fusion_score`.

In [None]:
# Seeded generator so the random example matrices (and therefore the printed
# norm) are identical on every Restart & Run All, in line with the seed=42
# passed to the data generators earlier in the notebook.
rng = np.random.default_rng(42)

# Example matrices for Frobenius norm
A = rng.standard_normal((5, 5))
B = rng.standard_normal((5, 5))
fro_norm = frobenius_norm(A, B)
print(f"Frobenius norm between random matrices: {fro_norm:.4f}")

# Example fusion score
rmse = 0.5
proj_gap = 2.0
p = 3
fusion = fusion_score(rmse, proj_gap, p)
print(f"Fusion score (RMSE={rmse}, proj_gap={proj_gap}, p={p}): {fusion:.4f}")

# Test with different parameters.
# NOTE(review): the (None, None) and p=0 calls look like edge-case
# demonstrations; how fusion_score treats them is not visible from this
# notebook -- confirm against its implementation.
print("\nFusion score examples:")
print(f"  RMSE only: {fusion_score(0.3, None, None):.4f}")
print(f"  With projection gap: {fusion_score(0.3, 1.5, 5):.4f}")
print(f"  Zero p: {fusion_score(0.3, 1.5, 0):.4f}")

## Example 5: Extracted Spatial Dependency

Demonstrate the shared spatial basis function that both data generators use.

In [None]:
# Import the internal function to demonstrate spatial basis.
# The leading underscore marks it as a private helper of the generators module,
# so it is imported here, next to its only use, rather than with the public API.
from geospatial_neural_adapter.data.generators import _generate_spatial_basis


def _plot_spatial_overlay(ax, x, values, overlay, scatter_label, title):
    """Scatter `values` against `x` on `ax` and overlay the reference curve `overlay`."""
    ax.scatter(x, values, alpha=0.6, s=20, label=scatter_label)
    ax.plot(x, overlay, 'r--', linewidth=2, label='Spatial Component')
    ax.set_title(title)
    ax.set_xlabel('Location')
    ax.set_ylabel('Target Value')
    ax.legend()
    ax.grid(True, alpha=0.3)


# Generate spatial basis for visualization
spatial_basis = _generate_spatial_basis(locations)

print(f"Spatial basis shape: {spatial_basis.shape}")
print(f"Spatial basis norm: {np.linalg.norm(spatial_basis):.6f} (should be 1.0)")

# Flatten once and build the reference overlay once; both target panels reuse it.
# NOTE(review): 50 and 0.5 presumably mirror the generators' baseline offset and
# the eigenvalue=0.5 passed to them -- confirm against the generator code.
basis_curve = spatial_basis.flatten()
spatial_component = 50 + 0.5 * basis_curve

# Visualize spatial dependency
fig, (ax_basis, ax_sample, ax_time) = plt.subplots(1, 3, figsize=(15, 4))

ax_basis.plot(locations, basis_curve, 'b-', linewidth=2)
ax_basis.set_title('Spatial Basis Function')
ax_basis.set_xlabel('Location')
ax_basis.set_ylabel('φ(x)')
ax_basis.grid(True, alpha=0.3)

# Show how spatial dependency affects targets
sample_idx = 0
_plot_spatial_overlay(
    ax_sample,
    locations,
    targets[sample_idx, :],
    spatial_component,
    'Targets',
    f'Spatial Dependency (Sample {sample_idx})',
)

# Show temporal spatial dependency at the middle time step. Deriving the index
# from the data (instead of hard-coding 100) keeps the cell valid when T is
# changed; with T = 200 it is still 100.
time_idx = targets_temporal.shape[0] // 2
_plot_spatial_overlay(
    ax_time,
    locations,
    targets_temporal[time_idx, :],
    spatial_component,
    'Temporal Targets',
    f'Temporal Spatial Dependency (Time {time_idx})',
)

fig.tight_layout()
plt.show()

print("✅ Spatial dependency extracted and visualized!")

## Example 6: Visualization

Create visualizations of the generated data.

In [None]:
# Four-panel overview of the temporal data, drawn on explicit axes.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_target, ax_features, ax_spatial, ax_heatmap = axes.flat

# Plot 1: Temporal target (time series at the first location)
ax_target.plot(targets_temporal[:, 0], label='Target', alpha=0.7, linewidth=1)
ax_target.set_title('Temporal Target (Location 0)')
ax_target.set_xlabel('Time')
ax_target.set_ylabel('Value')
ax_target.legend()
ax_target.grid(True, alpha=0.3)

# Plot 2: Temporal features (one line per entry along the last axis of
# cont_temporal, again at the first location)
for i in range(cont_temporal.shape[2]):
    ax_features.plot(cont_temporal[:, 0, i], label=f'Feature {i+1}', alpha=0.7, linewidth=1)
ax_features.set_title('Temporal Features (Location 0)')
ax_features.set_xlabel('Time')
ax_features.set_ylabel('Value')
ax_features.legend()
ax_features.grid(True, alpha=0.3)

# Plot 3: Spatial distribution at a specific time.
# Middle of the time series, derived from the data so it stays the middle (and
# in bounds) if T is changed; with T = 200 this is still 100.
time_idx = targets_temporal.shape[0] // 2
ax_spatial.scatter(locations, targets_temporal[time_idx, :], alpha=0.6, s=20)
ax_spatial.set_title(f'Spatial Distribution (Time {time_idx})')
ax_spatial.set_xlabel('Location')
ax_spatial.set_ylabel('Target Value')
ax_spatial.grid(True, alpha=0.3)

# Plot 4: Heatmap of temporal-spatial data (transposed so time runs along x)
im = ax_heatmap.imshow(targets_temporal.T, aspect='auto', cmap='viridis')
fig.colorbar(im, ax=ax_heatmap, label='Target Value')
ax_heatmap.set_title('Temporal-Spatial Heatmap')
ax_heatmap.set_xlabel('Time')
ax_heatmap.set_ylabel('Location Index')

fig.tight_layout()
plt.show()

## Example 7: Comparison with Combined Data

Compare the two different data generation approaches.

In [None]:
# Side-by-side comparison: temporal series at the first location (left) versus
# the first combined sample across locations (right).
fig, (ax_temporal, ax_combined) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: temporal data (first location)
ax_temporal.plot(targets_temporal[:, 0], label='Temporal', alpha=0.7, linewidth=1)
ax_temporal.set_title('Temporal Data Generation')
ax_temporal.set_xlabel('Time')
ax_temporal.set_ylabel('Target Value')
ax_temporal.legend()
ax_temporal.grid(True, alpha=0.3)

# Right panel: combined data (first sample)
ax_combined.plot(targets[0, :], label='Combined', alpha=0.7, linewidth=1)
ax_combined.set_title('Combined Data Generation')
ax_combined.set_xlabel('Location')
ax_combined.set_ylabel('Target Value')
ax_combined.legend()
ax_combined.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Show key differences
print("Key differences between data generation approaches:")
print(f"  Temporal data shape: {targets_temporal.shape}")
print(f"  Combined data shape: {targets.shape}")

# Lag-1 correlation: each series against itself shifted by one step
# (consecutive time steps for the temporal data, neighbouring locations for
# the combined sample).
temporal_lag1 = np.corrcoef(targets_temporal[:-1, 0], targets_temporal[1:, 0])[0, 1]
print(f"  Temporal temporal correlation: {temporal_lag1:.3f}")
spatial_lag1 = np.corrcoef(targets[0, :-1], targets[0, 1:])[0, 1]
print(f"  Combined spatial correlation: {spatial_lag1:.3f}")