#  Autoencoder Experimentation Notebook

This notebook is designed for **systematic experimentation** with convolutional autoencoders trained on GEDI waveform data. 
It allows the user to quickly test different encoder architectures, embedding dimensions, and training parameters while logging results and saving models/embeddings.

🎯 Goals:
Build a compact autoencoder that:

- Reconstructs GEDI waveforms with moderate-to-high fidelity

- Outputs low-dimensional embeddings that can be regressed from Sentinel imagery

## 1. Import Libraries

In [1]:
# import standard libraries
import os
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# import ML libraries
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

# import visualization libraries
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import TensorBoard

## 2. Load and Split Data

In [None]:
# The notebook is assumed to run from a sub-directory of the project,
# so the project root is the parent of the current working directory
notebook_dir = Path.cwd()
project_root = notebook_dir.parent

# Build the full path to the waveform archive and open it
data_path = os.path.join(project_root, 'data/gedi_waveforms_tf.npz')
data = np.load(file=data_path)

In [3]:
# Pull the waveform array out of the archive and append a trailing channel
# axis, giving the (n_waveforms, n_samples, 1) layout the Conv1D model consumes
waveforms = np.expand_dims(data['waveforms'], axis=-1)

# Report the resulting shape, then echo the array itself
print(waveforms.shape)
print(waveforms)

(10546, 500, 1)
[[[-0.92182818]
  [-1.11135732]
  [-1.0882749 ]
  ...
  [-0.82720233]
  [-0.7545843 ]
  [-0.65852474]]

 [[-0.51685445]
  [-0.91077666]
  [-1.0163088 ]
  ...
  [-1.14346151]
  [-0.74755905]
  [-0.30184295]]

 [[-0.47643436]
  [-0.54564899]
  [-0.33103624]
  ...
  [ 0.23716828]
  [ 0.30925776]
  [ 0.10473613]]

 ...

 [[-1.01867713]
  [-1.48746914]
  [-1.54916126]
  ...
  [-0.18743011]
  [-0.19228905]
  [-0.06010556]]

 [[ 0.06117187]
  [ 0.00706989]
  [-0.20781391]
  ...
  [-0.20177718]
  [-0.17382068]
  [-0.17141481]]

 [[-0.78835739]
  [-0.72094595]
  [-0.28164888]
  ...
  [ 0.97619698]
  [ 0.64958933]
  [ 0.17167537]]]


In [4]:
# Hold out 20% of the waveforms, then halve that hold-out into test and
# validation sets: an 80/10/10 train/test/validation split overall
x_train, x_temp = train_test_split(waveforms, test_size=0.2, random_state=0)
x_test, x_val = train_test_split(x_temp, test_size=0.5, random_state=0)

# Report the size of each partition
print(f"Training data:  {x_train.shape}")
print(f"Testing data:  {x_test.shape}")
print(f"Validation data: {x_val.shape}")

Training data:  (8436, 500, 1)
Testing data:  (1055, 500, 1)
Validation data: (1055, 500, 1)


## 3. Build Autoencoder Model
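`build_autoencoder` constructs the encoder-decoder model using configurable parameters:
- `latent_dim`: size of the final embedding layer
- `use_global_avg`: whether to use GlobalAveragePooling or Flatten before the bottleneck
- `dropout_rate`: dropout applied just before the bottleneck (0 disables it)
- `use_batchnorm`: whether to apply BatchNormalization after each Conv1D
- `bottleneck_type`: `'dense'` for a single Dense layer, `'mlp'` for a two-layer bottleneck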

In [5]:
from tensorflow.keras import layers, models

def build_autoencoder(input_shape, latent_dim=16, use_global_avg=False,
                      dropout_rate=0.0, use_batchnorm=False, bottleneck_type='dense'):
    """
    Build a 1D convolutional autoencoder with a configurable bottleneck.

    The encoder applies two Conv1D -> (BatchNorm) -> ReLU -> MaxPooling1D(2)
    stages, collapses the feature map (Flatten or GlobalAveragePooling1D),
    optionally applies dropout, and projects to a linear bottleneck of size
    `latent_dim`. The decoder mirrors this with a Dense/Reshape followed by two
    Conv1D -> (BatchNorm) -> ReLU -> UpSampling1D(2) stages and a final linear
    Conv1D.

    The decoder size is derived from `input_shape`, so the reconstruction has
    the same length and channel count as the input for any sequence length
    (lengths not divisible by 4 are cropped back after upsampling). For the
    notebook's (500, 1) input the architecture is unchanged.

    Parameters:
        input_shape (tuple): Shape of one input waveform, (length, channels),
            e.g. (500, 1)
        latent_dim (int): Size of bottleneck representation
        use_global_avg (bool): Use GlobalAveragePooling1D before bottleneck if
            True, otherwise Flatten
        dropout_rate (float): Dropout rate before bottleneck; no Dropout layer
            is added unless this is > 0
        use_batchnorm (bool): If True, applies BatchNormalization after each Conv1D
        bottleneck_type (str): 'mlp' inserts a Dense(128, relu) layer ahead of
            the bottleneck; any other value (including 'dense') uses the single
            Dense bottleneck layer

    Returns:
        (autoencoder, encoder): Keras models sharing the same encoder layers;
        `encoder` maps an input to its bottleneck embedding
    """
    seq_len = input_shape[0]
    n_channels = input_shape[-1]

    if seq_len is None:
        # Variable-length input: keep the historical fixed decoder
        # (125 steps upsampled 4x to 500 samples).
        reduced_len, excess = 125, 0
    else:
        # Two 'same'-padded pooling stages of size 2 leave ceil(seq_len / 4) steps.
        reduced_len = (seq_len + 3) // 4
        # Upsampling restores 4 * reduced_len steps, which overshoots when
        # seq_len is not a multiple of 4.
        excess = 4 * reduced_len - seq_len

    def conv_stage(tensor, filters):
        # Conv1D -> optional BatchNorm -> ReLU, shared by encoder and decoder
        tensor = layers.Conv1D(filters, 3, padding='same')(tensor)
        if use_batchnorm:
            tensor = layers.BatchNormalization()(tensor)
        return layers.Activation('relu')(tensor)

    # Encoder
    inputs = layers.Input(shape=input_shape, name='input_layer')
    x = conv_stage(inputs, 32)
    x = layers.MaxPooling1D(2, padding='same')(x)
    x = conv_stage(x, 64)
    x = layers.MaxPooling1D(2, padding='same')(x)

    # Flatten or Global Average before bottleneck
    if use_global_avg:
        x = layers.GlobalAveragePooling1D()(x)  # → (64,)
    else:
        x = layers.Flatten()(x)                 # → (reduced_len * 64,)

    if dropout_rate > 0:
        x = layers.Dropout(dropout_rate)(x)

    # Bottleneck: the 'mlp' variant adds one hidden layer ahead of the same
    # linear projection used by the 'dense' variant
    if bottleneck_type == 'mlp':
        x = layers.Dense(128, activation='relu')(x)
    bottleneck = layers.Dense(latent_dim, activation='linear', name='bottleneck')(x)

    # Decoder
    x = layers.Dense(reduced_len * 64, activation='relu')(bottleneck)
    x = layers.Reshape((reduced_len, 64))(x)
    x = conv_stage(x, 64)
    x = layers.UpSampling1D(2)(x)
    x = conv_stage(x, 32)
    x = layers.UpSampling1D(2)(x)

    decoded = layers.Conv1D(n_channels, 3, activation='linear', padding='same')(x)
    if excess > 0:
        # Trim the trailing steps introduced by 'same' pooling so the
        # reconstruction length matches the input length
        decoded = layers.Cropping1D((0, excess))(decoded)

    autoencoder = models.Model(inputs, decoded, name='autoencoder')
    encoder = models.Model(inputs, bottleneck, name='encoder')

    return autoencoder, encoder

##  4. Run and Log Experiments
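The cells below define `save_reconstruction_plot`, which saves original-vs-reconstructed waveform plots, and `run_experiment`, which trains the autoencoder, saves the model, encoder, and embeddings, and logs performance metrics.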

In [None]:
def save_reconstruction_plot(model, data, experiment_id, test_loss, config, n=10, save_dir=None, seed=42):
    """
    Save a side-by-side plot of original vs reconstructed waveforms with experiment metadata.

    The same sample indices are reused across experiments: they are read from
    `selected_indices.npy` in `save_dir` when that file holds at least `n`
    valid indices for `data`, and are otherwise drawn with `seed` and written
    to that file. The figure is saved as `<experiment_id>_reconstructions.png`
    in `save_dir`.

    Parameters:
        model: Trained autoencoder (must provide `predict`)
        data: Input waveforms, indexable by an integer array
        experiment_id: UID, used in the title and the output file name
        test_loss: test MSE to annotate on plot
        config: Dict of model parameters; must contain 'latent_dim',
            'dropout_rate', 'use_batchnorm' and 'bottleneck_type'
        n: Number of waveform plots (capped at len(data))
        save_dir: Output directory; defaults to `models/plots` under the
            project root
        seed: Random seed used when new indices have to be drawn

    Raises:
        ValueError: if `data` is empty or `n` is less than 1
    """
    if save_dir is None:
        save_dir = os.path.join(project_root, 'models/plots')
    os.makedirs(save_dir, exist_ok=True)

    n_available = len(data)
    if n_available == 0 or n < 1:
        raise ValueError("save_reconstruction_plot needs non-empty data and n >= 1")
    n = min(n, n_available)

    index_path = os.path.join(save_dir, "selected_indices.npy")

    # Reuse cached indices only when they can actually index `data`;
    # a cache written for a different n or dataset would otherwise crash below
    indices = None
    if os.path.exists(index_path):
        cached = np.load(index_path)
        cache_usable = (
            cached.ndim == 1
            and len(cached) >= n
            and np.issubdtype(cached.dtype, np.integer)
            and cached.min() >= 0
            and cached.max() < n_available
        )
        if cache_usable:
            indices = cached[:n]

    if indices is None:
        # A private RandomState yields the same draw as np.random.seed(seed)
        # followed by np.random.choice, without reseeding the global NumPy RNG
        rng = np.random.RandomState(seed)
        indices = rng.choice(n_available, size=n, replace=False)
        np.save(index_path, indices)

    selected_data = data[indices]
    reconstructions = model.predict(selected_data)

    # Annotate with experiment config
    config_str = (
        f"Latent Dim: {config['latent_dim']} | "
        f"Dropout: {config['dropout_rate']} | "
        f"BatchNorm: {config['use_batchnorm']} | "
        f"Bottleneck: {config['bottleneck_type']} | "
        f"Test MSE: {test_loss:.4f}"
    )

    # squeeze=False keeps `axes` two-dimensional even when n == 1
    fig, axes = plt.subplots(n, 2, figsize=(12, 3 * n), squeeze=False)
    try:
        fig.suptitle(f"{experiment_id} — {config_str}", fontsize=12, y=1.02)

        for row, sample_idx in enumerate(indices):
            ax_original, ax_reconstructed = axes[row]

            ax_original.plot(selected_data[row].squeeze(), color='blue')
            ax_original.set_title(f"Original #{sample_idx}")
            ax_original.grid(True)

            ax_reconstructed.plot(reconstructions[row].squeeze(), color='orange')
            ax_reconstructed.set_title(f"Reconstructed #{sample_idx}")
            ax_reconstructed.grid(True)

        fig.tight_layout()
        save_path = os.path.join(save_dir, f"{experiment_id}_reconstructions.png")
        fig.savefig(save_path, bbox_inches='tight')
    finally:
        # Always release the figure so a sweep of experiments does not
        # accumulate open figures, even if plotting or saving fails
        plt.close(fig)

In [7]:
# In-memory list of result rows; every run appends to it and the whole list is
# rewritten to the results CSV. Re-running this cell resets it.
results_log = []

def run_experiment(
    experiment_id,
    input_shape,
    latent_dim=16,
    use_global_avg=False,
    learning_rate=1e-3,
    dropout_rate=0.0,
    use_batchnorm=False,
    bottleneck_type='dense',
    epochs=20,
    batch_size=64,
    results_path=os.path.join(project_root, "models/experiment_results.csv")
):
    """
    Train one autoencoder configuration, save its artifacts and log its metrics.

    Uses the notebook-level arrays `x_train`, `x_val`, `x_test` and `waveforms`.
    For the given configuration it:
      1. builds and compiles the model (Adam, MSE loss),
      2. trains it with a TensorBoard callback writing under `logs/`,
      3. saves the autoencoder, the encoder and the embeddings of all
         `waveforms` under `models/`,
      4. appends a row to `results_log` and rewrites `results_path` from it,
      5. saves a reconstruction plot of validation samples under `plots/`.

    Parameters:
        experiment_id (str): UID used in log, model and plot file names
        input_shape (tuple): Shape of one input waveform, e.g. (500, 1)
        latent_dim (int): Size of the bottleneck embedding
        use_global_avg (bool): Passed to `build_autoencoder`
        learning_rate (float): Adam learning rate
        dropout_rate (float): Passed to `build_autoencoder`
        use_batchnorm (bool): Passed to `build_autoencoder`
        bottleneck_type (str): Passed to `build_autoencoder`
        epochs (int): Number of training epochs
        batch_size (int): Training batch size
        results_path (str): CSV file that receives the accumulated results

    Returns:
        dict: The result row logged for this experiment (configuration plus
        final-epoch train/val loss and test loss)
    """
    print(f"\n🚀 Running {experiment_id} | Latent Dim: {latent_dim}, AvgPool: {use_global_avg}, "
          f"LR: {learning_rate}, Dropout: {dropout_rate}, BatchNorm: {use_batchnorm}, Bottleneck: {bottleneck_type}")

    # Create output directories up front so a missing folder fails before
    # training rather than after it
    os.makedirs(os.path.join(project_root, "models"), exist_ok=True)
    results_dir = os.path.dirname(results_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    # Drop Keras global state left by earlier experiments so a long sweep
    # does not keep accumulating memory
    tf.keras.backend.clear_session()

    # Build model
    autoencoder, encoder = build_autoencoder(
        input_shape=input_shape,
        latent_dim=latent_dim,
        use_global_avg=use_global_avg,
        dropout_rate=dropout_rate,
        use_batchnorm=use_batchnorm,
        bottleneck_type=bottleneck_type
    )
    autoencoder.compile(optimizer=optimizers.Adam(learning_rate), loss='mse')

    # Set up TensorBoard (timestamped so reruns of the same id do not collide)
    log_dir = os.path.join(project_root, f"logs/{experiment_id}_{datetime.now().strftime('%Y%m%d-%H%M%S')}")
    tensorboard_cb = TensorBoard(log_dir=log_dir)

    # Train model: inputs double as targets because this is an autoencoder
    history = autoencoder.fit(
        x_train, x_train,
        validation_data=(x_val, x_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0,
        callbacks=[tensorboard_cb]
    )

    # Evaluate: train/val losses are the final-epoch values from training
    train_loss = history.history['loss'][-1]
    val_loss = history.history['val_loss'][-1]
    test_loss = autoencoder.evaluate(x_test, x_test, verbose=0)

    # Save model + embeddings (embeddings cover the full dataset, not one split)
    autoencoder.save(os.path.join(project_root, f"models/autoencoder_{experiment_id}.keras"))
    encoder.save(os.path.join(project_root, f"models/encoder_{experiment_id}.keras"))
    embeddings = encoder.predict(waveforms)
    np.save(os.path.join(project_root, f"models/embeddings_{experiment_id}.npy"), embeddings)

    # Log results
    result = {
        'Experiment': experiment_id,
        'Latent Dim': latent_dim,
        'Global Avg?': use_global_avg,
        'LR': learning_rate,
        'Dropout': dropout_rate,
        'BatchNorm': use_batchnorm,
        'Bottleneck': bottleneck_type,
        'Epochs': epochs,
        'Train Loss': train_loss,
        'Val Loss': val_loss,
        'Test Loss': test_loss
    }
    results_log.append(result)

    # Save to CSV (rewritten from the full in-memory log on every run)
    df_results = pd.DataFrame(results_log)
    df_results.to_csv(results_path, index=False)

    # Save a reconstruction plot (10 samples with consistent indices)
    save_reconstruction_plot(
        model=autoencoder,
        data=x_val,
        experiment_id=experiment_id,
        test_loss=test_loss,
        config={
            'latent_dim': latent_dim,
            'dropout_rate': dropout_rate,
            'use_batchnorm': use_batchnorm,
            'bottleneck_type': bottleneck_type
        },
        n=10,
        save_dir=os.path.join(project_root, "plots")
    )

    print(f"{experiment_id} complete — Test MSE: {test_loss:.4f} — Logged to {results_path}")

    return result


## 5. Define and Run a Set of Experiments

In [8]:
# Hyperparameter values to sweep; every combination becomes one experiment
latent_dims = [4, 8, 16]
dropouts = [0.0, 0.2]
batchnorms = [False, True]
bottlenecks = ['dense', 'mlp']

experiments = []
exp_id = 1

# Walk the full factorial grid in a single loop; the last factor
# (bottleneck type) varies fastest, the first (latent dim) slowest
for ld, dr, bn, bt in (
    (latent, drop, norm, neck)
    for latent in latent_dims
    for drop in dropouts
    for norm in batchnorms
    for neck in bottlenecks
):
    experiments.append(dict(
        experiment_id=f"exp_{exp_id:02d}",
        latent_dim=ld,
        use_global_avg=False,         # Keep this off for now (better performance)
        learning_rate=1e-3,
        dropout_rate=dr,
        use_batchnorm=bn,
        bottleneck_type=bt,
    ))
    exp_id += 1

experiments

[{'experiment_id': 'exp_01',
  'latent_dim': 4,
  'use_global_avg': False,
  'learning_rate': 0.001,
  'dropout_rate': 0.0,
  'use_batchnorm': False,
  'bottleneck_type': 'dense'},
 {'experiment_id': 'exp_02',
  'latent_dim': 4,
  'use_global_avg': False,
  'learning_rate': 0.001,
  'dropout_rate': 0.0,
  'use_batchnorm': False,
  'bottleneck_type': 'mlp'},
 {'experiment_id': 'exp_03',
  'latent_dim': 4,
  'use_global_avg': False,
  'learning_rate': 0.001,
  'dropout_rate': 0.0,
  'use_batchnorm': True,
  'bottleneck_type': 'dense'},
 {'experiment_id': 'exp_04',
  'latent_dim': 4,
  'use_global_avg': False,
  'learning_rate': 0.001,
  'dropout_rate': 0.0,
  'use_batchnorm': True,
  'bottleneck_type': 'mlp'},
 {'experiment_id': 'exp_05',
  'latent_dim': 4,
  'use_global_avg': False,
  'learning_rate': 0.001,
  'dropout_rate': 0.2,
  'use_batchnorm': False,
  'bottleneck_type': 'dense'},
 {'experiment_id': 'exp_06',
  'latent_dim': 4,
  'use_global_avg': False,
  'learning_rate': 0.001,


In [None]:
# Train every configuration in the grid. Each config dict's keys are exactly
# run_experiment's keyword names, so it is unpacked straight into the call;
# input shape and epoch count are the same for all runs.
for exp in experiments:
    run_experiment(input_shape=(500, 1), epochs=10, **exp)

In [10]:
# Reload the logged metrics from disk and show them as the cell output
exp_results = pd.read_csv(
    os.path.join(project_root, "models/experiment_results.csv")
)
exp_results

Unnamed: 0,Experiment,Latent Dim,Global Avg?,LR,Dropout,BatchNorm,Bottleneck,Epochs,Train Loss,Val Loss,Test Loss
0,exp_01,4,False,0.001,0.0,False,dense,10,0.069322,0.066939,0.071382
1,exp_02,4,False,0.001,0.0,False,mlp,10,0.067434,0.065875,0.07025
2,exp_03,4,False,0.001,0.0,True,dense,10,0.064839,0.062457,0.067398
3,exp_04,4,False,0.001,0.0,True,mlp,10,0.062686,0.064171,0.068558
4,exp_05,4,False,0.001,0.2,False,dense,10,0.069728,0.066244,0.070839
5,exp_06,4,False,0.001,0.2,False,mlp,10,0.069151,0.067018,0.070095
6,exp_07,4,False,0.001,0.2,True,dense,10,0.065186,0.063029,0.067322
7,exp_08,4,False,0.001,0.2,True,mlp,10,0.068641,0.08618,0.090518
8,exp_09,8,False,0.001,0.0,False,dense,10,0.044398,0.044744,0.047653
9,exp_10,8,False,0.001,0.0,False,mlp,10,0.044359,0.044952,0.047301
