<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE4-Sensor-Pretraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Simple Sensor Pre-Training System
=================================

Trains LSTM Autoencoders for each sensor and saves models with baseline statistics.

Usage:
    python sensor_pretraining.py
"""

import numpy as np
import os
import pickle
from datetime import datetime
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping


def build_lstm_autoencoder(window_length: int, latent_dim: int = 32) -> Model:
    """Construct and compile a single-feature LSTM autoencoder.

    The network compresses a ``(window_length, 1)`` sequence into one
    ``latent_dim``-sized vector, repeats that vector once per timestep, and
    decodes it back into a sequence with the same shape as the input.

    Args:
        window_length: Number of timesteps in each input window.
        latent_dim: Number of units in both the encoder and decoder LSTM.

    Returns:
        A Keras ``Model`` compiled with Adam (learning rate 0.001) and MSE loss.
    """
    sequence_in = Input(shape=(window_length, 1))

    # Encoder: only the final LSTM state is kept as the latent code.
    latent = LSTM(latent_dim, activation='relu', return_sequences=False)(sequence_in)

    # Decoder: feed the latent code to every timestep, then project each
    # timestep's hidden state back to a single value.
    repeated = RepeatVector(window_length)(latent)
    unrolled = LSTM(latent_dim, activation='relu', return_sequences=True)(repeated)
    per_step_projection = TimeDistributed(Dense(1, activation='linear'))
    sequence_out = per_step_projection(unrolled)

    autoencoder = Model(inputs=sequence_in, outputs=sequence_out)
    autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    return autoencoder


def _summarize_reconstruction_errors(actual, reconstructed):
    """Return per-window MSE statistics between `actual` and `reconstructed`."""
    actual = np.asarray(actual, dtype=np.float64)
    reconstructed = np.asarray(reconstructed, dtype=np.float64)

    # Flatten everything except the leading (window) axis so one reduction
    # yields the mean squared error of each window.
    n_windows = len(actual)
    residuals = actual.reshape(n_windows, -1) - reconstructed.reshape(n_windows, -1)
    errors = np.mean(np.square(residuals), axis=1)

    return {
        'mean': float(np.mean(errors)),
        # Small epsilon keeps 'std' strictly positive for later division.
        'std': float(np.std(errors)) + 1e-8,
        'q95': float(np.percentile(errors, 95)),
        'q99': float(np.percentile(errors, 99)),
        'baseline_errors': errors.tolist()  # Store for drift detection
    }


def train_sensor_model(sensor_data: np.ndarray, sensor_id: int, base_path: str,
                       window_length: int) -> dict:
    """Train an LSTM autoencoder for one sensor and persist it with its baseline.

    The first 80% of the windows are used for training and the remaining 20%
    for validation (chronological split, no shuffling of the split itself).
    After training, the per-window reconstruction MSE on the validation split
    is summarised and saved next to the model as baseline statistics.

    Args:
        sensor_data: Windows for a single sensor, indexed by window along the
            first axis. ``main`` passes an array of shape
            ``(n_windows, window_length, 1)``, which matches the model input.
        sensor_id: Identifier used in log messages and output file names.
        base_path: Root output directory; files are written to
            ``<base_path>/sensor/model/``.
        window_length: Number of timesteps per window (model input length).

    Returns:
        Dict with keys ``'mean'``, ``'std'``, ``'q95'``, ``'q99'`` (floats) and
        ``'baseline_errors'`` (list of per-window validation MSE values).

    Raises:
        ValueError: If the 80/20 split leaves the training or validation
            partition empty.
    """

    print(f"\nTraining sensor {sensor_id}...")
    print(f"Data shape: {sensor_data.shape}")

    # Split data
    n_samples = len(sensor_data)
    n_train = int(0.8 * n_samples)

    X_train = sensor_data[:n_train]
    X_val = sensor_data[n_train:]

    print(f"Train: {len(X_train)}, Val: {len(X_val)}")

    # Fail fast with a clear message instead of an opaque error from Keras or
    # np.percentile after the model has already been built.
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError(
            f"Sensor {sensor_id}: not enough windows for an 80/20 split "
            f"(train={len(X_train)}, val={len(X_val)})"
        )

    # Build and train model
    model = build_lstm_autoencoder(window_length)

    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model.fit(
        X_train, X_train,
        validation_data=(X_val, X_val),
        epochs=100,
        batch_size=32,
        callbacks=[early_stop],
        verbose=1
    )

    # Compute baseline reconstruction errors on validation set
    print("Computing baseline errors...")
    val_predictions = model.predict(X_val, verbose=0)
    baseline_stats = _summarize_reconstruction_errors(X_val, val_predictions)

    # Save model
    sensor_dir = os.path.join(base_path, 'sensor', 'model')
    os.makedirs(sensor_dir, exist_ok=True)

    model_path = os.path.join(sensor_dir, f'sensor_{sensor_id}_model.h5')
    metadata_path = os.path.join(sensor_dir, f'sensor_{sensor_id}_metadata.pkl')

    model.save(model_path)

    val_losses = history.history['val_loss']
    metadata = {
        'sensor_id': sensor_id,
        'window_length': window_length,
        'baseline_stats': baseline_stats,
        'trained_at': datetime.now(),
        'epochs_trained': len(history.history['loss']),
        # Validation loss of the last epoch that ran.
        'final_val_loss': float(val_losses[-1]),
        # Lowest validation loss seen. Because EarlyStopping is configured
        # with restore_best_weights=True, the saved weights may come from this
        # epoch rather than the last one.
        'best_val_loss': float(np.min(val_losses))
    }

    with open(metadata_path, 'wb') as f:
        pickle.dump(metadata, f)

    print(f"Saved: {model_path}")
    print(f"Baseline error: {baseline_stats['mean']:.6f} ± {baseline_stats['std']:.6f}")

    return baseline_stats


def main(data_path=None, base_path=None, holdout_samples: int = 1000) -> None:
    """Train one autoencoder per sensor and write a pickled training summary.

    Loads a 3-D array indexed as ``[window, timestep, sensor]``, drops the last
    ``holdout_samples`` windows, and calls ``train_sensor_model`` once per
    sensor. A failure for one sensor is recorded and does not stop the others.
    The summary is written to ``<base_path>/sensor/model/training_summary.pkl``.

    Args:
        data_path: Path of the ``.npy`` file to load. Defaults to the Google
            Drive location used by this notebook.
        base_path: Root output directory. Defaults to the Google Drive
            location used by this notebook.
        holdout_samples: Number of trailing windows excluded from training
            (presumably reserved for later evaluation -- confirm with the
            downstream modules).

    Raises:
        ValueError: If ``holdout_samples`` is negative, or if the loaded array
            is not 3-dimensional (raised by the shape unpacking).
    """
    if data_path is None:
        data_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/multivariate_long_sequences-TRAIN-Daily-DIRECT-VAR.npy'
    if base_path is None:
        base_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'
    if holdout_samples < 0:
        raise ValueError(f"holdout_samples must be >= 0, got {holdout_samples}")

    # Load data
    print("Loading data...")
    data = np.load(data_path)
    print(f"Original data shape: {data.shape}")

    # Remove the trailing holdout windows. An explicit end index is used
    # because data[:-0] would be empty rather than "keep everything".
    n_keep = max(len(data) - holdout_samples, 0)
    training_data = data[:n_keep]
    print(f"Training data shape: {training_data.shape}")

    _, window_length, num_sensors = training_data.shape

    print(f"Training {num_sensors} sensors...")

    # Train each sensor
    results = {}
    for sensor_id in range(num_sensors):
        # Extract data for this sensor: [batch, timestep, sensor_id]
        sensor_data = training_data[:, :, sensor_id:sensor_id+1]  # Keep feature dim

        try:
            baseline_stats = train_sensor_model(sensor_data, sensor_id, base_path, window_length)
            results[sensor_id] = {'success': True, 'baseline_stats': baseline_stats}
        except Exception as e:
            # Best-effort: record the failure and continue with the next sensor.
            print(f"Failed training sensor {sensor_id}: {e}")
            results[sensor_id] = {'success': False, 'error': str(e)}

    # Summary
    successful = sum(1 for r in results.values() if r['success'])
    print(f"\nTraining complete: {successful}/{num_sensors} sensors successful")

    # Save summary. The directory is otherwise only created by a sensor that
    # reaches the model-saving step, so create it here to make sure an
    # all-failures run can still write its summary.
    output_dir = os.path.join(base_path, 'sensor', 'model')
    os.makedirs(output_dir, exist_ok=True)
    summary_path = os.path.join(output_dir, 'training_summary.pkl')
    with open(summary_path, 'wb') as f:
        pickle.dump({
            'results': results,
            'data_shape': training_data.shape,
            'timestamp': datetime.now()
        }, f)

    print(f"Summary saved: {summary_path}")


# Run the training pipeline only when this file is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Mount Google Drive at /content/drive (Colab only). The data and output paths
# used by main() point under this mount point, so the drive needs to be
# mounted before the training cell is run.
from google.colab import drive
drive.mount('/content/drive')