<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE4-Sensor-Pretraining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
"""
Sensor Model Pre-Training System
================================

This script handles the initial training of sensor models using your real dataset.
Models are trained once and saved to disk for use in the production agent system.

Usage:
    python sensor_pretraining.py --data_path /path/to/dataset --models_dir ./trained_models
"""

import numpy as np
import pandas as pd
import pickle
import os
import argparse
from datetime import datetime
from typing import Dict, List, Tuple
import warnings
warnings.filterwarnings('ignore')

# Deep learning
try:
    import tensorflow as tf
    from tensorflow.keras.models import Sequential, Model
    from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed, Input
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
    KERAS_AVAILABLE = True
except ImportError:
    print("⚠️ TensorFlow not available. Install with: pip install tensorflow")
    KERAS_AVAILABLE = False

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

class SensorModelTrainer:
    """
    Train, evaluate and persist one anomaly-detection model per sensor.

    A sensor's 1D series is cut into fixed-length windows, a Keras model
    (LSTM autoencoder or VAE) is fitted to reconstruct those windows, and the
    per-window reconstruction errors on the validation split are summarised
    as baseline statistics that are stored next to the saved model.

    Keras types are referenced through string annotations so this class can
    still be defined when TensorFlow is missing; the constructor then raises
    the intended ``ImportError`` instead of the module failing with a
    ``NameError`` while the class body is being evaluated.
    """

    def __init__(self,
                 window_length: int = 50,
                 model_type: str = 'lstm_autoencoder',
                 latent_dim: int = 32,
                 epochs: int = 100,
                 batch_size: int = 32,
                 validation_split: float = 0.2):
        """
        Store the training configuration.

        Args:
            window_length: Number of samples per training window.
            model_type: ``'lstm_autoencoder'`` or ``'vae'``.
            latent_dim: Number of LSTM units (the VAE latent space uses half).
            epochs: Maximum number of training epochs.
            batch_size: Batch size for fitting and for baseline prediction.
            validation_split: Fraction of windows held out for validation.

        Raises:
            ImportError: If TensorFlow could not be imported.
        """

        self.window_length = window_length
        self.model_type = model_type
        self.latent_dim = latent_dim
        self.epochs = epochs
        self.batch_size = batch_size
        self.validation_split = validation_split

        if not KERAS_AVAILABLE:
            raise ImportError("TensorFlow required for model training")

    def build_lstm_autoencoder(self) -> "Model":
        """Build and compile the LSTM autoencoder (MSE loss, MAE metric)."""

        # Encoder: compress the whole window into one latent vector
        inputs = Input(shape=(self.window_length, 1), name='encoder_input')
        encoded = LSTM(self.latent_dim, activation='relu', return_sequences=False, name='encoder_lstm')(inputs)

        # Decoder: repeat the latent vector per time step and reconstruct
        decoded = RepeatVector(self.window_length, name='repeat_vector')(encoded)
        decoded = LSTM(self.latent_dim, activation='relu', return_sequences=True, name='decoder_lstm')(decoded)
        outputs = TimeDistributed(Dense(1, activation='linear'), name='decoder_output')(decoded)

        model = Model(inputs, outputs, name='sensor_lstm_autoencoder')
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='mse',
            metrics=['mae']
        )

        return model

    def build_vae(self) -> "Model":
        """Build and compile the variational autoencoder.

        The loss (reconstruction MSE + 0.1 * KL term) is attached with
        ``add_loss``, so the model is compiled without a ``loss`` argument
        and fitted without targets.
        """

        # Encoder
        inputs = Input(shape=(self.window_length, 1))
        x = LSTM(self.latent_dim, return_sequences=False)(inputs)

        # Latent space (half the LSTM width)
        z_mean = Dense(self.latent_dim // 2, name='z_mean')(x)
        z_log_var = Dense(self.latent_dim // 2, name='z_log_var')(x)

        # Reparameterisation: z = mean + sigma * epsilon
        def sampling(args):
            z_mean, z_log_var = args
            batch = tf.shape(z_mean)[0]
            dim = tf.shape(z_mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            return z_mean + tf.exp(0.5 * z_log_var) * epsilon

        z = tf.keras.layers.Lambda(sampling, name='sampling')([z_mean, z_log_var])

        # Decoder
        decoder_input = RepeatVector(self.window_length)(z)
        decoded = LSTM(self.latent_dim, return_sequences=True)(decoder_input)
        outputs = TimeDistributed(Dense(1))(decoded)

        model = Model(inputs, outputs, name='sensor_vae')

        # NOTE(review): add_loss on symbolic tensors presumably relies on the
        # TF2 / Keras 2 functional API -- confirm before moving to Keras 3.
        reconstruction_loss = tf.reduce_mean(tf.square(inputs - outputs))
        kl_loss = -0.5 * tf.reduce_mean(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
        vae_loss = reconstruction_loss + 0.1 * kl_loss
        model.add_loss(vae_loss)

        model.compile(optimizer=Adam(learning_rate=0.001))

        return model

    def prepare_sequences(self, sensor_data: np.ndarray, overlap_ratio: float = 0.5) -> np.ndarray:
        """
        Convert time series data into sequences for training.

        Args:
            sensor_data: 1D time series data for one sensor
            overlap_ratio: Overlap between consecutive sequences (0.0 = no overlap, 0.9 = high overlap)

        Returns:
            Array of sequences [n_sequences, window_length]

        Raises:
            ValueError: If the series is shorter than one window.
        """

        if len(sensor_data) < self.window_length:
            raise ValueError(f"Data length ({len(sensor_data)}) < window_length ({self.window_length})")

        # Step between window starts; clamped to 1 so overlap >= 1 cannot stall
        step_size = max(1, int(self.window_length * (1 - overlap_ratio)))
        last_start = len(sensor_data) - self.window_length

        return np.array([
            sensor_data[start:start + self.window_length]
            for start in range(0, last_start + 1, step_size)
        ])

    def _reconstruction_errors(self, model: "Model", windows: np.ndarray) -> List[float]:
        """Return the per-window reconstruction MSE of ``model`` on ``windows``.

        One batched ``predict`` call and a vectorised mean replace a Python
        loop that called ``predict`` per batch and computed one MSE per window.
        """
        predictions = np.asarray(
            model.predict(windows, batch_size=self.batch_size, verbose=0)
        )
        # Average over the time and feature axes -> one error per window
        per_window_mse = np.mean(np.square(windows - predictions), axis=(1, 2))
        return [float(err) for err in per_window_mse]

    @staticmethod
    def _summarize_errors(baseline_errors) -> Dict:
        """Summarise reconstruction errors (mean, std, q95, q99, min, max)."""
        return {
            'mean': float(np.mean(baseline_errors)),
            # Small epsilon keeps the std strictly positive
            'std': float(np.std(baseline_errors)) + 1e-8,
            'q95': float(np.percentile(baseline_errors, 95)),
            'q99': float(np.percentile(baseline_errors, 99)),
            'min': float(np.min(baseline_errors)),
            'max': float(np.max(baseline_errors))
        }

    def train_sensor_model(self, sensor_data: np.ndarray, sensor_id: int) -> Tuple["Model", Dict]:
        """
        Train a model for a single sensor.

        Args:
            sensor_data: Pre-scaled 1D time series data
            sensor_id: Sensor identifier

        Returns:
            Trained model and a metadata dict (configuration, final losses,
            baseline error statistics, timestamps).

        Raises:
            ValueError: If fewer than 100 windows can be built or the
                configured model type is unknown.
        """

        print(f"Training model for sensor {sensor_id}...")
        print(f"  Data shape: {sensor_data.shape}")

        # Create sequences
        sequences = self.prepare_sequences(sensor_data, overlap_ratio=0.3)
        print(f"  Created {len(sequences)} sequences of length {self.window_length}")

        if len(sequences) < 100:
            raise ValueError(f"Insufficient sequences ({len(sequences)}) for training. Need at least 100.")

        # Keras expects [samples, time steps, features]
        X = sequences.reshape(len(sequences), self.window_length, 1)

        # NOTE(review): windows overlap and this split shuffles, so validation
        # windows can share samples with training windows -- confirm this is
        # acceptable for the baseline statistics derived below.
        X_train, X_val = train_test_split(X, test_size=self.validation_split, random_state=42)
        print(f"  Train sequences: {len(X_train)}, Validation sequences: {len(X_val)}")

        # Build model
        if self.model_type == 'lstm_autoencoder':
            model = self.build_lstm_autoencoder()
        elif self.model_type == 'vae':
            model = self.build_vae()
        else:
            raise ValueError(f"Unknown model type: {self.model_type}")

        print(f"  Model: {self.model_type} with {model.count_params():,} parameters")

        # Callbacks
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
        ]

        # Train model
        start_time = datetime.now()

        if self.model_type == 'vae':
            # VAE training (no target needed due to custom loss)
            history = model.fit(
                X_train, epochs=self.epochs, batch_size=self.batch_size,
                validation_data=(X_val,), callbacks=callbacks, verbose=1
            )
        else:
            # Autoencoder training: the input is its own target
            history = model.fit(
                X_train, X_train, epochs=self.epochs, batch_size=self.batch_size,
                validation_data=(X_val, X_val), callbacks=callbacks, verbose=1
            )

        training_time = datetime.now() - start_time
        print(f"  Training completed in {training_time}")

        # Compute baseline errors for anomaly detection
        print("  Computing baseline error statistics...")
        baseline_errors = self._reconstruction_errors(model, X_val)
        baseline_stats = self._summarize_errors(baseline_errors)

        training_info = {
            'sensor_id': sensor_id,
            'model_type': self.model_type,
            'window_length': self.window_length,
            'training_sequences': len(X_train),
            'validation_sequences': len(X_val),
            'training_time': str(training_time),
            'final_loss': float(history.history['loss'][-1]),
            'final_val_loss': float(history.history['val_loss'][-1]),
            'epochs_trained': len(history.history['loss']),
            'baseline_errors': baseline_errors[-100:],  # Store last 100 for drift detection
            'baseline_stats': baseline_stats,
            'trained_at': datetime.now()
        }

        print(f"  ✅ Training successful!")
        print(f"     Final loss: {training_info['final_loss']:.6f}")
        print(f"     Final val loss: {training_info['final_val_loss']:.6f}")
        print(f"     Baseline error: {baseline_stats['mean']:.6f} ± {baseline_stats['std']:.6f}")

        return model, training_info

    def save_model(self, model: "Model", training_info: Dict, models_dir: str) -> Tuple[str, str]:
        """Save the trained model (.h5) and its pickled metadata.

        Args:
            model: Trained Keras model.
            training_info: Metadata dict; must contain ``'sensor_id'``.
            models_dir: Output directory (created if missing).

        Returns:
            ``(model_path, metadata_path)``.
        """

        sensor_id = training_info['sensor_id']
        os.makedirs(models_dir, exist_ok=True)

        # Save model
        model_path = os.path.join(models_dir, f"sensor_{sensor_id}_model.h5")
        model.save(model_path)

        # Save metadata
        metadata_path = os.path.join(models_dir, f"sensor_{sensor_id}_metadata.pkl")
        with open(metadata_path, 'wb') as f:
            pickle.dump(training_info, f)

        print(f"  💾 Saved model and metadata for sensor {sensor_id}")

        return model_path, metadata_path


def _parse_sensor_id(token: str, source: str):
    """Convert the ID part of a ``sensor_<id>`` name to int; warn and return None if it is not numeric."""
    try:
        return int(token)
    except ValueError:
        print(f"    ⚠️ Skipping '{source}': '{token}' is not a numeric sensor ID")
        return None


def _store_finite_samples(sensor_data: Dict[int, np.ndarray], sensor_id: int, values) -> None:
    """Drop NaN/inf entries from ``values`` and register the rest under ``sensor_id`` (if any remain)."""
    samples = np.asarray(values)
    try:
        finite = np.isfinite(samples)
    except TypeError:
        # Object/string data (e.g. a CSV column containing stray text):
        # coerce unparseable entries to NaN so they are filtered out below.
        samples = pd.to_numeric(pd.Series(samples.ravel()), errors='coerce').to_numpy(dtype=float)
        finite = np.isfinite(samples)

    samples = samples[finite]
    if len(samples) > 0:
        sensor_data[sensor_id] = samples
        print(f"    Sensor {sensor_id}: {len(samples)} samples")


def load_your_dataset(data_path: str) -> Dict[int, np.ndarray]:
    """
    Load your real dataset and return sensor data.

    CUSTOMIZE THIS FUNCTION FOR YOUR DATA FORMAT

    Args:
        data_path: Path to your dataset

    Returns:
        Dictionary mapping sensor_id -> time series with NaN/inf samples
        removed (the values are expected to be pre-scaled; no scaling is
        applied here).

    Raises:
        ValueError: If the path has an unsupported format or no sensor
            series could be loaded from it.

    Example formats supported:
    - CSV with columns: timestamp, sensor_0, sensor_1, ...
    - Multiple CSV files: sensor_0.csv, sensor_1.csv, ...
    - HDF5 files with datasets named sensor_<id>
    - Numpy arrays: [timesteps, sensors], or 1D for a single sensor (ID 0)

    Names of the form ``sensor_<something non-numeric>`` are skipped with a
    warning instead of aborting the whole load.
    """

    print(f"📂 Loading dataset from: {data_path}")

    sensor_data = {}

    if data_path.endswith('.csv'):
        # Single CSV file with multiple sensors
        print("  Format: Single CSV file")
        df = pd.read_csv(data_path)

        # Assume columns are: timestamp, sensor_0, sensor_1, sensor_2, ...
        for col in df.columns:
            if not col.startswith('sensor_'):
                continue
            sensor_id = _parse_sensor_id(col.split('_')[1], col)
            if sensor_id is not None:
                _store_finite_samples(sensor_data, sensor_id, df[col].values)

    elif data_path.endswith('.h5') or data_path.endswith('.hdf5'):
        # HDF5 file
        print("  Format: HDF5 file")
        import h5py

        with h5py.File(data_path, 'r') as f:
            for key in f.keys():
                # Only datasets can be sliced; groups named sensor_* are ignored
                if not key.startswith('sensor_') or not isinstance(f[key], h5py.Dataset):
                    continue
                sensor_id = _parse_sensor_id(key.split('_')[1], key)
                if sensor_id is not None:
                    _store_finite_samples(sensor_data, sensor_id, f[key][:])

    elif os.path.isdir(data_path):
        # Directory with multiple files
        print("  Format: Directory with multiple sensor files")

        # Sorted so the load order (and the resulting dict order) is deterministic
        for filename in sorted(os.listdir(data_path)):
            if not (filename.startswith('sensor_') and filename.endswith('.csv')):
                continue
            sensor_id = _parse_sensor_id(filename.split('_')[1].split('.')[0], filename)
            if sensor_id is None:
                continue

            df = pd.read_csv(os.path.join(data_path, filename))
            # Assume single column or use first numeric column
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            if len(numeric_cols) > 0:
                _store_finite_samples(sensor_data, sensor_id, df[numeric_cols[0]].values)

    elif data_path.endswith('.npy'):
        # Numpy array [timesteps, sensors]
        print("  Format: Numpy array")
        data_array = np.load(data_path)

        if data_array.ndim == 1:
            # A single series is treated as one sensor with ID 0
            data_array = data_array.reshape(-1, 1)

        if data_array.ndim == 2:
            for sensor_id in range(data_array.shape[1]):
                _store_finite_samples(sensor_data, sensor_id, data_array[:, sensor_id])

    else:
        raise ValueError(f"Unsupported data format: {data_path}")

    if not sensor_data:
        raise ValueError("No sensor data loaded. Check your data format.")

    print(f"✅ Loaded {len(sensor_data)} sensors")
    return sensor_data


def validate_dataset(sensor_data: Dict[int, np.ndarray], window_length: int) -> Dict[int, np.ndarray]:
    """Filter out sensors that cannot be trained on and report basic statistics.

    A sensor is skipped when it has fewer than ``window_length * 10`` samples
    or when its standard deviation is below 1e-6 (effectively constant).
    Every remaining sensor is reported (length, range, mean/std, count of
    non-finite values, and whether its range lies within [-5, 5]) and passed
    through unchanged.

    Args:
        sensor_data: Mapping sensor_id -> time series.
        window_length: Window length that will be used for training.

    Returns:
        Mapping with only the sensors that passed both checks.
    """

    print("🔍 Validating dataset...")

    min_samples = window_length * 10
    accepted = {}

    for sensor_id, series in sensor_data.items():
        print(f"  Sensor {sensor_id}:")

        # Guard 1: enough samples to build a reasonable number of windows
        n_samples = len(series)
        if n_samples < min_samples:
            print(f"    ⚠️ Insufficient data ({n_samples} < {min_samples}) - SKIPPING")
            continue

        # Guard 2: a flat signal carries nothing to learn
        spread = np.std(series)
        if spread < 1e-6:
            print(f"    ⚠️ Constant values detected (std={spread:.2e}) - SKIPPING")
            continue

        lowest = np.min(series)
        highest = np.max(series)
        non_finite = np.sum(~np.isfinite(series))

        # Basic statistics
        print(f"    📊 Length: {n_samples}")
        print(f"    📊 Range: [{lowest:.3f}, {highest:.3f}]")
        print(f"    📊 Mean: {np.mean(series):.3f}, Std: {spread:.3f}")
        print(f"    📊 Missing values: {non_finite}")

        # Heuristic only: a range within [-5, 5] is taken as "already scaled"
        looks_scaled = abs(lowest) <= 5 and abs(highest) <= 5
        if looks_scaled:
            print("    ✅ Data appears pre-scaled")
        else:
            print(f"    ⚠️ Data may need scaling (range: [{lowest:.2f}, {highest:.2f}])")

        accepted[sensor_id] = series

    print(f"✅ Validated {len(accepted)} sensors for training")
    return accepted


def main(argv=None):
    """Main training pipeline: load, validate, train, save, summarise.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv``, as before; passing a list lets
            the pipeline be driven from a notebook or another script.

    The per-sensor loop is best-effort: a failure for one sensor is recorded
    in the summary and does not stop the remaining sensors.
    """

    parser = argparse.ArgumentParser(description="Train sensor anomaly detection models")
    parser.add_argument('--data_path', type=str, required=True, help='Path to dataset')
    parser.add_argument('--models_dir', type=str, default='./trained_models', help='Directory to save models')
    parser.add_argument('--window_length', type=int, default=50, help='Sequence window length')
    parser.add_argument('--model_type', type=str, default='lstm_autoencoder',
                       choices=['lstm_autoencoder', 'vae'], help='Model type')
    parser.add_argument('--epochs', type=int, default=100, help='Training epochs')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
    parser.add_argument('--sensors', type=str, default=None,
                       help='Comma-separated sensor IDs to train (default: all)')

    args = parser.parse_args(argv)

    print("🚀 SENSOR MODEL PRE-TRAINING SYSTEM")
    print("=" * 50)
    print(f"📂 Data path: {args.data_path}")
    print(f"💾 Models directory: {args.models_dir}")
    print(f"📏 Window length: {args.window_length}")
    print(f"🧠 Model type: {args.model_type}")
    print(f"🔄 Epochs: {args.epochs}")
    print()

    # Load dataset
    sensor_data = load_your_dataset(args.data_path)

    # Validate dataset
    validated_data = validate_dataset(sensor_data, args.window_length)

    if not validated_data:
        print("❌ No valid sensors found for training")
        return

    # Filter sensors if specified
    if args.sensors:
        # Empty tokens (e.g. a trailing comma) are ignored rather than
        # crashing in int('')
        requested_sensors = {
            int(token)
            for token in (part.strip() for part in args.sensors.split(','))
            if token
        }
        unavailable = sorted(requested_sensors - set(validated_data))
        if unavailable:
            print(f"⚠️ Requested sensors not available for training: {unavailable}")
        validated_data = {sid: data for sid, data in validated_data.items()
                         if sid in requested_sensors}
        print(f"🎯 Training only sensors: {list(validated_data.keys())}")

    # Initialize trainer
    trainer = SensorModelTrainer(
        window_length=args.window_length,
        model_type=args.model_type,
        epochs=args.epochs,
        batch_size=args.batch_size
    )

    # Train models
    print(f"\n🏋️ TRAINING {len(validated_data)} SENSOR MODELS")
    print("=" * 50)

    training_results = {}
    successful_training = 0

    for sensor_id, data in validated_data.items():
        try:
            print(f"\n🎯 SENSOR {sensor_id}")
            print("-" * 30)

            # Train model
            model, training_info = trainer.train_sensor_model(data, sensor_id)

            # Save model
            model_path, metadata_path = trainer.save_model(model, training_info, args.models_dir)

            training_results[sensor_id] = {
                'success': True,
                'model_path': model_path,
                'metadata_path': metadata_path,
                'training_info': training_info
            }

            successful_training += 1

        except Exception as e:
            # Deliberate best-effort boundary: record the failure, keep going
            print(f"  ❌ Training failed: {str(e)}")
            training_results[sensor_id] = {
                'success': False,
                'error': str(e)
            }

    # Final summary
    print(f"\n📊 TRAINING SUMMARY")
    print("=" * 50)
    print(f"✅ Successful: {successful_training}/{len(validated_data)} sensors")
    print(f"💾 Models saved to: {args.models_dir}")

    if successful_training > 0:
        print(f"\n🏆 TRAINED SENSORS:")
        for sensor_id, result in training_results.items():
            if result['success']:
                info = result['training_info']
                print(f"  Sensor {sensor_id}: {info['epochs_trained']} epochs, "
                      f"final loss: {info['final_loss']:.6f}")

    failed_sensors = [sid for sid, result in training_results.items() if not result['success']]
    if failed_sensors:
        print(f"\n❌ FAILED SENSORS: {failed_sensors}")

    # Save training summary. The directory is otherwise only created when a
    # model is saved, so it may not exist if every sensor failed or the
    # --sensors filter matched nothing.
    os.makedirs(args.models_dir, exist_ok=True)
    summary_path = os.path.join(args.models_dir, 'training_summary.pkl')
    with open(summary_path, 'wb') as f:
        pickle.dump({
            'training_results': training_results,
            'config': vars(args),
            'timestamp': datetime.now()
        }, f)

    print(f"\n💾 Training summary saved to: {summary_path}")
    print(f"✅ PRE-TRAINING COMPLETED!")


# Run the training pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
# Colab-only cell: mount Google Drive at /content/drive.
# NOTE(review): presumably so datasets/models kept in Drive are reachable from
# the notebook -- confirm; this import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')