<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE2-VAE-METROPM-LARGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import math
import plotly.graph_objects as go
import keras
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed, Input
from keras.models import Model
from keras import saving
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import EarlyStopping
from keras.callbacks import Callback
import plotly
from keras import losses
import plotly.express as px
import gc
from tqdm import tqdm   # nice progress bar

# Load data
data_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/multivariate_long_sequences-TRAIN-10Sec-DIRECT-VAR.npy'
labelpath = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/window_labels_3class.npy'
train_mask_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/train_mask.npy'
data = np.load(data_path)
label = np.load(labelpath)
train_mask = np.load(train_mask_path)
mlp_mask = np.logical_and(train_mask, label == 0)
train_data = data[mlp_mask]
n_seq = train_data.shape[0]
window_size = train_data.shape[1]
n_features = train_data.shape[2]

maxval = train_data.shape[0]
count_train = int(math.ceil(0.8*maxval))
x_train = train_data[:count_train]
x_test = train_data[count_train:]

# Always define the custom Sampling layer (needed for loading existing models)
saving.get_custom_objects().clear()

@saving.register_keras_serializable(package="MyLayers")
class Sampling(layers.Layer):
    """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

    def __init__(self, factor):
        super().__init__()
        self.factor = factor

    def call(self, inputs):
        z_mean, z_log_var = inputs
        batch = tf.shape(z_mean)[0]
        dim = tf.shape(z_mean)[1]
        epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    def get_config(self):
        return {"factor": self.factor}

def safe_decode(decoder, z, batch_size=32):
    outputs = []
    for i in range(0, len(z), batch_size):
        batch = z[i:i+batch_size]
        out = decoder(batch)
        outputs.append(out.numpy())
    return np.concatenate(outputs, axis=0)

# Check if trained VAE models already exist
encoder_path = r'/content/drive/MyDrive/PHD/2025/VAE_SIMULATION/METROPM_vae-encoder-latent5-dim256.keras'
decoder_path = r'/content/drive/MyDrive/PHD/2025/VAE_SIMULATION/METROPM_vae-decoder-latent5-dim256.keras'

if os.path.exists(encoder_path) and os.path.exists(decoder_path):
    print("Found existing trained VAE models. Loading...")
    encoder = keras.models.load_model(encoder_path)
    decoder = keras.models.load_model(decoder_path)
    print("VAE models loaded successfully!")
    history = None  # No training history since we didn't train

else:
    print("No existing VAE models found. Training new VAE...")

    # Build the encoder
    latent_dim = 5
    intermediate_dim = 256

    # Encoder
    encoder_inputs = layers.Input(shape=(window_size, n_features), name="encoder_input")
    x = layers.LSTM(intermediate_dim, activation='tanh', name="lstm1", return_sequences=True)(encoder_inputs)
    xx = layers.LSTM(int(intermediate_dim/2), activation='tanh', name="lstm2", return_sequences=False)(x)
    x1 = layers.Dense(int(intermediate_dim/2), name="dense")(xx)
    z_mean = layers.Dense(latent_dim, name="z_mean")(x1)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(x1)
    z = Sampling(1)([z_mean, z_log_var])
    encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
    encoder.summary()

    # Decoder
    inp_z = Input(shape=(latent_dim,), name="decoder")
    x1 = layers.RepeatVector(window_size, name="repeatvect")(inp_z)
    x2 = layers.Dense(int(intermediate_dim/2), name="Dense2")(x1)
    x22 = layers.LSTM(int(intermediate_dim/2), activation='tanh', return_sequences=True, name="lstm1")(x2)
    x3 = layers.LSTM(intermediate_dim, activation='tanh', return_sequences=True, name="lstm2")(x22)
    decode_out = layers.TimeDistributed(Dense(n_features), name="decodeout")(x3)
    decoder = keras.Model(inp_z, decode_out, name="decoder")
    decoder.summary()

    reduce_lr = tf.keras.callbacks.LearningRateScheduler(lambda x: 1e-3 * 0.90 ** x)

    # Parameters
    n_epochs = 20
    klstart = 5
    kl_annealtime = n_epochs - klstart
    weight = K.variable(0.0)

    # Define the VAE as a Model with a custom train_step
    class VAE(keras.Model):
        def __init__(self, encoder, decoder, **kwargs):
            super(VAE, self).__init__(**kwargs)
            self.encoder = encoder
            self.decoder = decoder
            self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
            self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
            self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

        @property
        def metrics(self):
            return [
                self.total_loss_tracker,
                self.reconstruction_loss_tracker,
                self.kl_loss_tracker,
            ]

        def train_step(self, data):
            with tf.GradientTape() as tape:
                z_mean, z_log_var, z = self.encoder(data)
                reconstruction = self.decoder(z)
                reconstruction_loss = tf.reduce_mean(
                    tf.reduce_sum(
                        losses.mean_squared_error(data, reconstruction), axis=-1), keepdims=True
                )

                kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
                kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                total_loss = reconstruction_loss + (weight * kl_loss)
            grads = tape.gradient(total_loss, self.trainable_weights)
            self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
            self.total_loss_tracker.update_state(total_loss)
            self.reconstruction_loss_tracker.update_state(reconstruction_loss)
            self.kl_loss_tracker.update_state(kl_loss)
            return {
                "loss": self.total_loss_tracker.result(),
                "reconstruction_loss": self.reconstruction_loss_tracker.result(),
                "kl_loss": self.kl_loss_tracker.result(),
            }

        def test_step(self, data):
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    losses.mean_squared_error(data, reconstruction), axis=-1), keepdims=True
            )

            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss

            return {
                "loss": self.total_loss_tracker.result(),
                "reconstruction_loss": self.reconstruction_loss_tracker.result(),
                "kl_loss": self.kl_loss_tracker.result(),
            }

    # CALLBACKS
    es = keras.callbacks.EarlyStopping(patience=50, verbose=1, min_delta=0.0001, monitor='loss', mode='auto', restore_best_weights=True)

    class AnnealingCallback(Callback):
        def __init__(self, weight):
            self.weight = weight

        def on_epoch_end(self, epoch, logs={}):
            if epoch > klstart and epoch < klstart * 1.2:
                new_weight = min(K.get_value(self.weight) + (1. / kl_annealtime), 1.)
                K.set_value(self.weight, new_weight)
            print("Current KL Weight is " + str(K.get_value(self.weight)))

    # Train the VAE
    vae = VAE(encoder, decoder)
    vae.compile(optimizer=keras.optimizers.Adam(clipnorm=1))
    history = vae.fit(x_train,
                      epochs=n_epochs,
                      batch_size=200,
                      validation_split=0.1,
                      callbacks=[AnnealingCallback(weight)])

    # Save models
    encoder.save(encoder_path)
    decoder.save(decoder_path)
    print("VAE training complete and models saved!")

    # Reload models to ensure consistency
    encoder = keras.models.load_model(encoder_path)
    decoder = keras.models.load_model(decoder_path)

# Plot training history (only if we actually trained)
if history is not None:
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['reconstruction_loss'], label='reconstruction_loss')
    plt.plot(history.history['kl_loss'], label='kl_Loss')
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)
    plt.ylim(0, 100)
    plt.show()

    # Just Loss
    plt.plot(history.history['loss'], label='Training Loss')
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True)
    plt.show()
else:
    print("Skipping training plots since VAE was loaded from existing models")

# PLOT TRAIN RECONSTRUCTION
x_train_small = x_train[:10000]
X_test_encoded = encoder.predict(x_train_small)
z = X_test_encoded[2]     # shape (10000, 5)
# Decode safely
X_test_predict = safe_decode(decoder, z, batch_size=32)


plt.suptitle('Example Reconstruction of Training Data')
plt.xlabel('Time', fontsize='10')
plt.ylabel('Feature 6', fontsize='10')
plt.plot(x_train[0:10000, :, 5], "r", label="Actual")
plt.plot(X_test_predict[:, :, 5], "b", label="reconstructed")
plt.show()

# PLOT TEST RECONSTRUCTION
x_test_small = x_test[:10000]
X_test_encoded = encoder.predict(x_test_small)
z = X_test_encoded[2]     # shape (10000, 5)
# Decode safely
X_test_predict = safe_decode(decoder, z, batch_size=32)
plt.suptitle('Example Reconstruction of Testing Data')
plt.xlabel('Time', fontsize='10')
plt.ylabel('Feature 6', fontsize='10')
plt.plot(x_test_small[0:1000, :, 5], "r")
plt.plot(X_test_predict[0:1000, :, 5], "b")
plt.show()

#ANother test - pick a specific example
i = 0  # pick a specific example
x_true  = x_test[i, :, 5]          # feature 6
z_mean, z_log_var, z = encoder.predict(x_test[i:i+1])
x_recon = decoder.predict(z)[0, :, 5]

plt.figure(figsize=(8,4))
plt.plot(x_true, 'r', label='True')
plt.plot(x_recon, 'b', label='Recon')
plt.legend()
plt.title(f'Window {i} â€“ Feature 6')
plt.show()



# Check if generated data already exists
generated_data_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/generated_large_subsquence2_data.npy'

if os.path.exists(generated_data_path):
    print("Found existing generated data. Loading...")
    results1 = np.load(generated_data_path)
    print(f"Loaded generated data with shape: {results1.shape}")
else:
    print("No existing generated data found. Generating new data...")
    # Generate data for MLP

    N_SAMPLES = 10000          # number of real windows to sample
    generator_multiply =   5            # synthetic per real window
    BATCH_SIZE = 2000           # safe batch size for VAE inference

    # =========================================================
    # 1. RANDOMLY SAMPLE 50,000 WINDOWS
    # =========================================================
    idx = np.random.choice(n_seq, N_SAMPLES, replace=False)
    sampled_data = train_data[idx]

        # =========================================================
    # 2. USE VAE (encoder + decoder) TO GENERATE SYNTHETIC WINDOWS
    # =========================================================

    generated_list = []

    num_batches = int(np.ceil(N_SAMPLES / BATCH_SIZE))
    print(f"Generating synthetic data in {num_batches} batches...")

    for b in range(num_batches):
        start = b * BATCH_SIZE
        end = min((b+1)*BATCH_SIZE, N_SAMPLES)
        batch = sampled_data[start:end]

        # Encode
        z_mean, z_log_var, _ = encoder.predict(batch, verbose=0)

        # Generate MULTIPLIER random z for each window
        sigma = tf.exp(0.5 * z_log_var)

        for m in range(generator_multiply):
            z_new = tf.random.normal(
                shape=z_mean.shape,
                mean=z_mean,
                stddev=sigma
            )
            decoded = decoder.predict(z_new, verbose=0)

            # decoded shape: (batch, window_size, n_features)
            generated_list.append(decoded)

        print(f" Batch {b+1}/{num_batches} complete")
          # Stack all generated batches
    generated_data = np.concatenate(generated_list, axis=0)
    print(f"Synthetic windows: {generated_data.shape}")
    # =========================================================
    # 3. COMBINE REAL + SYNTHETIC DATA
    # =========================================================
    combined_data = np.concatenate([train_data, generated_data], axis=0)
    results1 =combined_data
    np.save(generated_data_path, results1)
    print(f"Generated and saved new data with shape: {results1.shape}")

# Generate window labels using VAR analysis - Dynamic batching with checkpoints
import pickle
import gc

# Check if final VAR analysis results already exist
final_window_labels_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/generated-data-true-window2.npy'
final_data_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/generated-data2.npy'

if os.path.exists(final_window_labels_path):
    print("Found existing window labels...")
    optimal_K_for_ALL = np.load(final_window_labels_path)
    print(f"Loaded optimal K for ALL windows with shape: {optimal_K_for_ALL.shape}")
else:
    # ----------------------------------------------------------
    # 1) TRAIN LSTM NEXT-STEP PREDICTOR ON A SUBSET
    # ----------------------------------------------------------
    print("\n=== STEP 1: Training LSTM Next-Step predictor on subset ===")
    N_total = results1.shape[0]
    train_subset = min(50000, N_total)  # in case you have fewer than 50k

    idx = np.random.choice(N_total, train_subset, replace=False)
    nsp_train = results1[idx]       # shape: (train_subset, window_size, n_features)

    # VARIABLE LENGTH NSP MODEL
    inputs = keras.Input(shape=(None, n_features))
    x = layers.LSTM(128)(inputs)
    x = layers.Dense(64, activation="relu")(x)
    outputs = layers.Dense(n_features)(x)

    nsp_model = keras.Model(inputs, outputs)
    nsp_model.compile(optimizer='adam', loss='mse')
    nsp_model.summary()

    # TRAIN on full K-length windows
    X = nsp_train[:, :-1, :]   # (subset, window_size-1, n_features)
    y = nsp_train[:, -1, :]    # (subset, n_features)

    nsp_model.fit(X, y, epochs=5, batch_size=512)

    # ----------------------------------------------------------
    # 2) LABEL ALL WINDOWS WITH OPTIMAL K (CHECKPOINTED)
    # ----------------------------------------------------------
    print("\n=== STEP 2: Generating optimal K for ALL windows with checkpointing ===")

    CHECKPOINT_DIR = "/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/NEXTSTEP"
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    CHECKPOINT_FILE = f"{CHECKPOINT_DIR}/klabel_progress.pkl"

    BATCH_SIZE_SAVE = 20000       # save every 20k sequences
    K_MAX = window_size           # search K in range 2..window_size

    SEQ_DATA = results1           # full dataset
    N = SEQ_DATA.shape[0]

    print(f"Total sequences to label: {N:,}")
    print(f"Searching K in [2, {K_MAX}]")

    # -----------------------------
    # Helper function: compute optimal K for one sequence
    # -----------------------------
    def get_optimal_k(window, model, K):
        W = window.shape[0]      # window_size
        F = window.shape[1]      # n_features

        target = window[-1].reshape(1, F)

        num_k = K - 1
        past_batch = np.zeros((num_k, W-1, F), dtype=np.float32)

        for k in range(2, K+1):
            seq_len = k - 1
            past = window[-k:-1, :]      # (k-1, F)

            # IMPORTANT: reset row
            past_batch[k-2, :, :] = 0
            past_batch[k-2, -seq_len:, :] = past

        pred_batch = model.predict(past_batch, verbose=0)

        mse = np.mean((pred_batch - target)**2, axis=1)
        return int(np.argmin(mse) + 2)

    # --------------------------------
    # RESUME SUPPORT
    # --------------------------------
    if os.path.exists(CHECKPOINT_FILE):
        print("\nResuming from checkpoint...")
        with open(CHECKPOINT_FILE, "rb") as f:
            start_idx, collected_labels = pickle.load(f)
        print(f"Resuming at index {start_idx:,}")
    else:
        print("\nStarting fresh labeling run...")
        start_idx = 0
        collected_labels = []

    # Ensure it's a Python list (pickle-safe)
    if not isinstance(collected_labels, list):
        collected_labels = collected_labels.tolist()

    # --------------------------------
    # MAIN PROCESSING LOOP
    # --------------------------------
    for i in tqdm(range(start_idx, N), total=N, initial=start_idx):
        seq = SEQ_DATA[i]                      # one window: (window_size, n_features)
        klabel = get_optimal_k(seq, nsp_model, K=K_MAX)
        collected_labels.append(klabel)

        # ------- SAVE CHECKPOINT ----------
        if (i + 1) % BATCH_SIZE_SAVE == 0:
            with open(CHECKPOINT_FILE, "wb") as f:
                pickle.dump((i + 1, collected_labels), f)

            batch_id = (i + 1) // BATCH_SIZE_SAVE
            np.save(f"{CHECKPOINT_DIR}/KLABEL_BATCH_{batch_id}.npy",
                    np.array(collected_labels[-BATCH_SIZE_SAVE:]))

            print(f"\nBatch {batch_id} saved safely at index {i+1}")
            gc.collect()

    # ----------------------------
    # FINAL SAVE
    # ----------------------------
    print("\nFinal save...")
    optimal_K_for_ALL = np.array(collected_labels)
    np.save(final_window_labels_path, optimal_K_for_ALL)
    np.save(final_data_path, results1)


print("VAE training complete. Generated data and window labels saved.")


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!nvidia-smi


Mon Dec  1 04:31:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             51W /  400W |    1523MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
with tf.device('/GPU:0'):
    # If you need to reload the model:
    # nsp_model = keras.models.load_model('your_model_path')

    # Or just verify it's on GPU
    print(f"Model device: {nsp_model.layers[0].weights[0].device if nsp_model.layers else 'checking...'}")


NameError: name 'nsp_model' is not defined