<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE4-Sensor-Pretraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Simple Sensor Pre-Training System
=================================

Trains LSTM Autoencoders for each sensor and saves models with baseline statistics.

Usage:
    python sensor_pretraining.py
"""

import numpy as np
import os
import pickle
from datetime import datetime
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

from multiprocessing import Process, Queue
import multiprocessing
import traceback

def train_sensor_worker(sensor_id, sensor_data, base_path, window_length, return_dict):
    """Train one sensor's LSTM autoencoder and persist it with baseline stats.

    Intended to run as the target of a ``multiprocessing.Process``. The
    windows are split 80/20 in their given order (no shuffling) into training
    and validation sets, an LSTM autoencoder is fitted to reconstruct its own
    input, and the per-window reconstruction MSE on the validation set is
    summarised into baseline statistics.

    Files written under ``<base_path>/sensor/model``:
        * ``checkpoints/sensor_<id>_best.h5`` - best-``val_loss`` checkpoint
        * ``sensor_<id>_model.h5`` - final model
        * ``sensor_<id>_metadata.pkl`` - sensor id, window length, baseline
          statistics and training timestamp

    Args:
        sensor_id: Identifier used in log lines, file names and as the key
            written to ``return_dict``.
        sensor_data: Array of windows whose first axis indexes windows; the
            model input shape is ``(window_length, 1)``.
        base_path: Root output directory.
        window_length: Number of time steps per window.
        return_dict: Mutable mapping shared with the parent process. Receives
            the baseline-statistics dict on success, or ``None`` on failure.

    Returns:
        None. The outcome is reported through ``return_dict``; any exception
        is caught, printed with its traceback, and recorded as ``None``.
    """
    try:
        # Imports are local so the TensorFlow setup happens inside this process.
        from tensorflow.keras.models import Model
        from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input
        from tensorflow.keras.optimizers import Adam
        from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
        import numpy as np
        import pickle
        import os
        from datetime import datetime

        def build_lstm_autoencoder(window_length: int, latent_dim: int = 4):
            """Build and compile the single-channel LSTM autoencoder."""
            inputs = Input(shape=(window_length, 1))
            encoded = LSTM(latent_dim, activation='relu', return_sequences=False)(inputs)
            # Repeat the latent vector once per time step so the decoder LSTM
            # can unroll it back into a sequence of the original length.
            decoded = RepeatVector(window_length)(encoded)
            decoded = LSTM(latent_dim, activation='relu', return_sequences=True)(decoded)
            outputs = TimeDistributed(Dense(1))(decoded)
            model = Model(inputs, outputs)
            model.compile(optimizer=Adam(0.001), loss='mse')
            return model

        print(f"[Worker {sensor_id}] Starting... Data shape: {sensor_data.shape}")

        # Train/validation split (ordered, 80/20).
        n = len(sensor_data)
        if n < 2:
            # With fewer than two windows one side of the split is empty.
            raise ValueError(
                f"need at least 2 windows to form a train/validation split, got {n}"
            )
        n_train = int(0.8 * n)
        X_train, X_val = sensor_data[:n_train], sensor_data[n_train:]

        # Output locations.
        sensor_dir = os.path.join(base_path, "sensor", "model")
        ckpt_dir = os.path.join(sensor_dir, "checkpoints")
        os.makedirs(sensor_dir, exist_ok=True)
        os.makedirs(ckpt_dir, exist_ok=True)

        ckpt_path = os.path.join(ckpt_dir, f"sensor_{sensor_id}_best.h5")
        model_path = os.path.join(sensor_dir, f"sensor_{sensor_id}_model.h5")
        meta_path = os.path.join(sensor_dir, f"sensor_{sensor_id}_metadata.pkl")

        model = build_lstm_autoencoder(window_length)

        callbacks = [
            ModelCheckpoint(ckpt_path, monitor='val_loss', save_best_only=True, verbose=0),
            EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True, verbose=0),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=6, min_lr=1e-6, verbose=0),
        ]

        print(f"[Worker {sensor_id}] Training AE...")
        model.fit(
            X_train, X_train,
            validation_data=(X_val, X_val),
            epochs=50,
            batch_size=64,
            verbose=1,
            callbacks=callbacks
        )

        # Baseline reconstruction error: one MSE per validation window,
        # computed in a single vectorised pass. Each window is flattened so
        # the result does not depend on a trailing channel axis.
        val_pred = model.predict(X_val, verbose=0)
        n_val = len(X_val)
        residual = (
            np.asarray(X_val, dtype=np.float64).reshape(n_val, -1)
            - np.asarray(val_pred, dtype=np.float64).reshape(n_val, -1)
        )
        errors = np.mean(np.square(residual), axis=1)
        median_error = np.median(errors)

        baseline = {
            "mean": float(np.mean(errors)),
            "std": float(np.std(errors)),
            "q95": float(np.percentile(errors, 95)),
            "q99": float(np.percentile(errors, 99)),
            "median": float(median_error),
            # Median absolute deviation around the median error.
            "mad": float(np.median(np.abs(errors - median_error))),
        }

        # Save model & metadata.
        model.save(model_path)
        with open(meta_path, "wb") as f:
            pickle.dump({
                "sensor_id": sensor_id,
                "window_length": window_length,
                "baseline_stats": baseline,
                "trained_at": datetime.now(),
            }, f)

        print(f"[Worker {sensor_id}] Done ‚úì")
        return_dict[sensor_id] = baseline

    except Exception as e:
        # Process boundary: report the failure to the parent instead of
        # letting the worker die without leaving a result.
        print(f"[Worker {sensor_id}] ERROR:", e)
        traceback.print_exc()
        return_dict[sensor_id] = None




def main():
    """Pre-train one LSTM autoencoder per sensor and write a run summary.

    Steps:
        1. Load the windowed data, the per-window labels and the train mask
           from the hard-coded Google Drive paths below.
        2. Keep only windows that are in the train mask and have label 0.
        3. For every sensor (last axis) that does not already have both a
           saved model and a checkpoint, launch ``train_sensor_worker`` in
           its own process, at most ``MAX_WORKERS`` at a time.
        4. Collect the workers' results and pickle a summary to
           ``<base_path>/sensor/model/training_summary.pkl``.

    Errors from loading the inputs or writing the summary propagate to the
    caller; failures inside a worker are reported as a failed sensor.
    """

    # Input and output locations (Google Drive mounted in Colab).
    data_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/multivariate_long_sequences-TRAIN-10Sec-DIRECT-VAR.npy'
    labelpath = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/window_labels_3class.npy'
    train_mask_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/train_mask.npy'

    base_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/'
    model_dir = os.path.join(base_path, "sensor", "model")

    print("üöÄ Simple Sensor Pre-Training")
    print("=" * 40)

    # Load data
    print("Loading data...")
    data = np.load(data_path)              # (windows, window_length, sensors)
    label = np.load(labelpath)             # one label per window
    train_mask = np.load(train_mask_path)  # one flag per window

    # Autoencoder training set: training windows whose label is 0.
    ae_mask = np.logical_and(train_mask, label == 0)
    print("Total AE windows:", np.sum(ae_mask))

    training_data = data[ae_mask]
    # Boolean indexing produced a copy, so the full array can be released
    # before worker processes are spawned.
    del data
    print(f"AE data shape: {training_data.shape}")

    _, window_length, num_sensors = training_data.shape
    print(f"Will train {num_sensors} sensors")

    print("\n‚ö° Launching parallel sensor training...\n")

    manager = multiprocessing.Manager()
    return_dict = manager.dict()

    MAX_WORKERS = 4   # upper bound on concurrently running worker processes

    def wait_for(batch):
        """Join every (sensor_id, process) pair and record silent deaths."""
        for sid, proc in batch:
            proc.join()
            if sid not in return_dict:
                # The worker ended without reporting (e.g. it was killed or
                # failed before its target ran); count it as a failure
                # rather than dropping it from the results.
                print(f"[Worker {sid}] exited with code {proc.exitcode} "
                      f"without reporting a result")
                return_dict[sid] = None

    running = []  # (sensor_id, process) pairs of the current batch

    for sensor_id in range(num_sensors):

        model_path = os.path.join(model_dir, f"sensor_{sensor_id}_model.h5")
        ckpt_path = os.path.join(model_dir, "checkpoints", f"sensor_{sensor_id}_best.h5")

        # Resume support: skip sensors whose artefacts already exist.
        if os.path.exists(model_path) and os.path.exists(ckpt_path):
            print(f"‚è≠Ô∏è Sensor {sensor_id} already trained. Skipping.")
            continue

        # Slice with a range so the trailing channel axis of size 1 is kept.
        sensor_data = training_data[:, :, sensor_id:sensor_id+1]
        print(f"\nüîß Launching Worker for Sensor {sensor_id} "
              f"({sensor_id+1}/{num_sensors}) ‚Äî remaining: {num_sensors - sensor_id - 1}")

        worker = Process(target=train_sensor_worker,
                         args=(sensor_id, sensor_data, base_path, window_length, return_dict))
        worker.start()
        running.append((sensor_id, worker))

        # Limit concurrency: once the batch is full, wait for all of it.
        if len(running) >= MAX_WORKERS:
            wait_for(running)
            running = []

    # Join whatever is left from the last, possibly partial, batch.
    print("‚è≥ Waiting for current batch of workers to finish...")
    wait_for(running)
    print("‚úÖ Batch completed.\n")

    print("\nüî• ALL PARALLEL TRAINING DONE\n")

    results = {}
    successful = 0

    print("üìä Worker Results:")
    for sid in sorted(return_dict.keys()):
        res = return_dict[sid]
        if res is None:
            print(f"‚ùå Sensor {sid} failed")
            results[sid] = {"success": False}
        else:
            print(f"‚úÖ Sensor {sid} baseline mean={res['mean']:.6f}")
            results[sid] = {"success": True, "baseline_stats": res}
            successful += 1

    print("\nüìä TRAINING SUMMARY")
    print("=" * 40)
    print(f"Trained this run: {successful}/{num_sensors}")
    print(f"Models saved to: {model_dir}")

    # The directory is normally created by a worker; create it here too so
    # the summary can be written even when every worker failed early.
    os.makedirs(model_dir, exist_ok=True)
    summary_path = os.path.join(model_dir, 'training_summary.pkl')
    with open(summary_path, "wb") as f:
        pickle.dump({
            "results": results,
            "training_data_shape": training_data.shape,
            "successful_sensors": successful,
            "timestamp": datetime.now()
        }, f)

    print(f"üíæ Summary saved: {summary_path}")
    print("‚úÖ Pre-training completed!")



# Script entry point: configure multiprocessing, then run the training.
if __name__ == "__main__":
    # "spawn" starts every worker in a fresh interpreter; the worker function
    # re-imports TensorFlow itself. force=True replaces any start method that
    # was already set in this session.
    # NOTE(review): presumably chosen to avoid sharing TensorFlow state
    # through fork - confirm.
    multiprocessing.set_start_method("spawn", force=True)
    main()


üöÄ Simple Sensor Pre-Training
Loading data...
Total AE windows: 432142
AE data shape: (432142, 100, 12)
Will train 12 sensors

‚ö° Launching parallel sensor training...


üîß Launching Worker for Sensor 0 (1/12) ‚Äî remaining: 11


In [None]:
# Mount Google Drive at /content/drive (Colab only). main() reads its inputs
# from and writes its outputs to /content/drive/MyDrive/..., so this cell has
# to be run before the training cell.
from google.colab import drive
drive.mount('/content/drive')