<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE2-VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# VAE Data Generator Module - Your existing code separated
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import math
import plotly.graph_objects as go
import keras
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed, Input
from keras.models import Model
from keras import saving
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import EarlyStopping
from keras.callbacks import Callback
import plotly
from keras import losses
import plotly.express as px
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class VAEDataGenerator:
    """
    One-time VAE training and synthetic data generation.

    Workflow (see ``run_complete_data_generation``):
      1. ``train_vae_pipeline``         - fit an LSTM VAE on windowed sequences
                                          and save encoder/decoder to disk.
      2. ``generate_synthetic_dataset`` - sample around every encoded training
                                          window and decode the samples.
      3. ``compute_var_windows_batch``  - for each synthetic window, pick the
                                          VAR lag order with the lowest one-step
                                          forecast error on the first variable.

    Attributes:
        output_dir: Directory that receives every model / ``.npy`` artefact.
        encoder, decoder: Keras models; ``None`` until trained or loaded.
        vae: The trained VAE wrapper model; ``None`` until trained.
        window_size, n_features: Shape of one window; ``None`` until known.
    """

    # File names written below ``output_dir``.
    _ENCODER_FILE = 'METROPM_vae-encoder-latent5-dim256.keras'
    _DECODER_FILE = 'METROPM_vae-decoder-latent5-dim256.keras'
    _SYNTHETIC_FILE = 'generated_large_subsquence2_data_V2.npy'

    def __init__(self, output_dir='/content/drive/MyDrive/PHD/2024/TEMP_OUTPUT_METROPM/'):
        self.output_dir = output_dir
        self.encoder = None
        self.decoder = None
        self.vae = None
        # Initialised here so compute_var_windows_batch can fall back to its
        # defaults instead of raising AttributeError when training was skipped.
        self.window_size = None
        self.n_features = None

    def _path(self, filename):
        """Return ``filename`` located inside ``self.output_dir``."""
        return os.path.join(self.output_dir, filename)

    @staticmethod
    def _sampling_layer_cls():
        """
        Define the reparameterisation layer and register it with Keras.

        Registration is what allows ``load_model`` to rebuild a saved encoder,
        so this is called both before training and before loading from disk.
        Registering again under the same name simply replaces the entry.
        """
        @saving.register_keras_serializable(package="MyLayers")
        class Sampling(layers.Layer):
            """Uses (z_mean, z_log_var) to sample z, the latent vector of a window."""

            def __init__(self, factor, **kwargs):
                # **kwargs lets Keras pass name/dtype/trainable back in when
                # the layer is re-created from its saved config.
                super().__init__(**kwargs)
                self.factor = factor  # stored and serialised; not used in call()

            def call(self, inputs):
                z_mean, z_log_var = inputs
                batch = tf.shape(z_mean)[0]
                dim = tf.shape(z_mean)[1]
                epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
                return z_mean + tf.exp(0.5 * z_log_var) * epsilon

            def get_config(self):
                config = super().get_config()
                config["factor"] = self.factor
                return config

        return Sampling

    def train_vae_pipeline(self, train_data_path):
        """
        Train the LSTM VAE and save its encoder and decoder.

        Args:
            train_data_path: Path to a ``.npy`` array indexed as
                (sequence, time step, feature).

        Returns:
            The ``History`` object returned by ``vae.fit``.

        Side effects:
            Clears the Keras custom-object registry, writes the encoder and
            decoder ``.keras`` files into ``output_dir`` and stores the models
            and window shape on ``self``.
        """
        print("Starting VAE training pipeline...")

        # Load data
        train_data = np.load(train_data_path)
        window_size = train_data.shape[1]
        n_features = train_data.shape[2]

        # Only the first 80% (rounded up) of the sequences are used for fitting.
        count_train = int(math.ceil(0.8 * train_data.shape[0]))
        x_train = train_data[:count_train]

        # Clear all previously registered custom objects, then register ours.
        saving.get_custom_objects().clear()
        Sampling = self._sampling_layer_cls()

        latent_dim = 5
        intermediate_dim = 256
        half_dim = int(intermediate_dim / 2)

        # Encoder
        encoder_inputs = layers.Input(shape=(window_size, n_features), name="encoder_input")
        x = layers.LSTM(intermediate_dim, activation='tanh', name="lstm1", return_sequences=True)(encoder_inputs)
        xx = layers.LSTM(half_dim, activation='tanh', name="lstm2", return_sequences=False)(x)
        x1 = layers.Dense(half_dim, name="dense")(xx)
        z_mean = layers.Dense(latent_dim, name="z_mean")(x1)
        z_log_var = layers.Dense(latent_dim, name="z_log_var")(x1)
        z = Sampling(1)([z_mean, z_log_var])
        encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
        encoder.summary()

        # Decoder (layer names are kept as-is so saved models stay compatible)
        inp_z = layers.Input(shape=(latent_dim,), name="decoder")
        x1 = layers.RepeatVector(window_size, name="repeatvect")(inp_z)
        x2 = layers.Dense(half_dim, name="Dense2")(x1)
        x22 = layers.LSTM(half_dim, activation='tanh', return_sequences=True, name="lstm1")(x2)
        x3 = layers.LSTM(intermediate_dim, activation='tanh', return_sequences=True, name="lstm2")(x22)
        decode_out = layers.TimeDistributed(layers.Dense(n_features), name="decodeout")(x3)
        decoder = keras.Model(inp_z, decode_out, name="decoder")
        decoder.summary()

        # Parameters
        n_epochs = 150
        klstart = 20
        kl_annealtime = n_epochs - klstart
        weight = K.variable(0.0)  # KL weight, mutated by AnnealingCallback

        # NOTE: no EarlyStopping / LearningRateScheduler is passed to fit();
        # training always runs for n_epochs with the optimizer's default rate.

        class VAE(keras.Model):
            """VAE wrapper with custom train/test steps and three loss trackers."""

            def __init__(self, encoder, decoder, **kwargs):
                super(VAE, self).__init__(**kwargs)
                self.encoder = encoder
                self.decoder = decoder
                self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
                self.reconstruction_loss_tracker = keras.metrics.Mean(
                    name="reconstruction_loss"
                )
                self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

            @property
            def metrics(self):
                # Listing the trackers here lets Keras reset them between
                # epochs and before each evaluation pass.
                return [
                    self.total_loss_tracker,
                    self.reconstruction_loss_tracker,
                    self.kl_loss_tracker,
                ]

            def _compute_losses(self, data):
                """Return (reconstruction_loss, kl_loss) for one batch."""
                z_mean, z_log_var, z = self.encoder(data)
                reconstruction = self.decoder(z)
                reconstruction_loss = tf.reduce_mean(
                    tf.reduce_sum(
                        losses.mean_squared_error(data, reconstruction), axis=-1),
                    keepdims=True,
                )
                kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
                kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                return reconstruction_loss, kl_loss

            def _record(self, total_loss, reconstruction_loss, kl_loss):
                """Update the trackers and return their current averages."""
                self.total_loss_tracker.update_state(total_loss)
                self.reconstruction_loss_tracker.update_state(reconstruction_loss)
                self.kl_loss_tracker.update_state(kl_loss)
                return {
                    "loss": self.total_loss_tracker.result(),
                    "reconstruction_loss": self.reconstruction_loss_tracker.result(),
                    "kl_loss": self.kl_loss_tracker.result(),
                }

            def train_step(self, data):
                with tf.GradientTape() as tape:
                    reconstruction_loss, kl_loss = self._compute_losses(data)
                    # KL term is scaled by the annealed weight during training.
                    total_loss = reconstruction_loss + (weight * kl_loss)
                grads = tape.gradient(total_loss, self.trainable_weights)
                self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
                return self._record(total_loss, reconstruction_loss, kl_loss)

            def test_step(self, data):
                reconstruction_loss, kl_loss = self._compute_losses(data)
                # Validation uses the un-annealed loss (KL weight of 1).
                total_loss = reconstruction_loss + kl_loss
                # The trackers must be updated here; otherwise the reported
                # val_* values never reflect the validation batches.
                return self._record(total_loss, reconstruction_loss, kl_loss)

        class AnnealingCallback(Callback):
            """Raises the KL weight by 1/kl_annealtime after selected epochs."""

            def __init__(self, weight):
                super().__init__()
                self.weight = weight

            def on_epoch_end(self, epoch, logs=None):
                # NOTE(review): with klstart=20 this window only covers epochs
                # 21-23, so the weight stops at 3/kl_annealtime and never
                # reaches 1. Left unchanged because altering it changes the
                # trained model - confirm whether the upper bound is intended.
                if epoch > klstart and epoch < klstart * 1.2:
                    new_weight = min(K.get_value(self.weight) + (1. / kl_annealtime), 1.)
                    K.set_value(self.weight, new_weight)
                print("Current KL Weight is " + str(K.get_value(self.weight)))

        # Train the VAE
        vae = VAE(encoder, decoder)
        vae.compile(optimizer=keras.optimizers.Adam(clipnorm=1))
        history = vae.fit(x_train,
                          epochs=n_epochs,
                          batch_size=32,
                          validation_split=0.1,
                          callbacks=[AnnealingCallback(weight)])

        # Save models
        encoder.save(self._path(self._ENCODER_FILE))
        decoder.save(self._path(self._DECODER_FILE))

        # Store for data generation
        self.encoder = encoder
        self.decoder = decoder
        self.vae = vae
        self.window_size = window_size
        self.n_features = n_features

        print("VAE training complete and models saved!")
        return history

    def generate_synthetic_dataset(self, train_data_path, cohort_size=350000):
        """
        Generate synthetic windows by sampling around each encoded window.

        Every training window is encoded to (mu, logvar); 100 latent vectors
        are drawn from N(mu, exp(0.5 * logvar)^2) and decoded. Output rows are
        ordered so that the 100 samples of window 0 come first, then window 1,
        and so on.

        Args:
            train_data_path: Path to the ``.npy`` array of training windows.
            cohort_size: Only echoed in the progress message; the number of
                generated rows is always ``100 * number_of_windows``.
                NOTE(review): confirm whether this was meant to cap the output.

        Returns:
            2-D array with one flattened window
            (``window_size * n_features`` values) per row. The same array is
            saved into ``output_dir``.
        """
        print(f"Generating synthetic dataset of size {cohort_size}...")

        # Load encoder/decoder if not already loaded. The custom Sampling
        # layer has to be registered first or the encoder cannot be rebuilt.
        if self.encoder is None or self.decoder is None:
            self._sampling_layer_cls()
        if self.encoder is None:
            self.encoder = keras.models.load_model(self._path(self._ENCODER_FILE))
        if self.decoder is None:
            self.decoder = keras.models.load_model(self._path(self._DECODER_FILE))

        # Load original training data
        train_data = np.load(train_data_path)
        window_size = train_data.shape[1]
        n_features = train_data.shape[2]
        # Remember the window shape so the VAR step works without retraining.
        if self.window_size is None:
            self.window_size = window_size
        if self.n_features is None:
            self.n_features = n_features

        generator_multiply = 100  # synthetic samples drawn per training window
        rows_per_call = 250       # training windows decoded per predict() call

        mu, logvar, _ = self.encoder.predict(train_data)
        sigma = np.exp(0.5 * logvar)
        n_rows, dim = mu.shape
        store = []

        # Decode in chunks: one predict() call per training window is
        # dominated by per-call overhead and floods the output with progress
        # bars.
        for start in range(0, n_rows, rows_per_call):
            stop = min(start + rows_per_call, n_rows)
            noise = tf.random.normal(shape=(stop - start, generator_multiply, dim)).numpy()
            latent = mu[start:stop, None, :] + sigma[start:stop, None, :] * noise
            decoded = self.decoder.predict(latent.reshape((-1, dim)), verbose=0)
            store.append(decoded.reshape((decoded.shape[0], window_size * n_features)))

            if start % 1000 == 0:
                print(f"Generated {start * generator_multiply} samples...")

        results1 = np.concatenate(store, axis=0)

        # Save generated data
        np.save(self._path(self._SYNTHETIC_FILE), results1)
        print(f"Synthetic dataset saved: {results1.shape}")
        return results1

    def compute_var_windows_batch(self, data_path, start_idx=0, batch_size=50000):
        """
        Compute the best VAR lag order for one batch of flattened windows.

        For every sequence in ``[start_idx, start_idx + batch_size)`` a VAR
        model is fitted on all but the last time step for each lag order
        ``k`` in ``2 .. window_size - 1``; the ``k`` with the lowest squared
        one-step forecast error on the first variable (``V1``) wins. Lags
        whose fit or forecast raises are scored 99999.

        Args:
            data_path: Path to a ``.npy`` array of flattened windows.
            start_idx: Index of the first sequence to process.
            batch_size: Maximum number of sequences to process.

        Returns:
            1-D integer array with the selected lag per processed sequence.
            The same array is saved into ``output_dir``.
        """
        print(f"Computing VAR windows for batch starting at {start_idx}...")

        # Load data
        x = np.load(data_path)
        window_size = self.window_size if self.window_size else 50  # Default if not set
        n_features = self.n_features if self.n_features else 12     # Default if not set

        x_3d = x.reshape((x.shape[0], window_size, n_features))
        n_future = 1
        max_lag = window_size
        # One column label per feature: V1, V2, ...
        columns = [f'V{j}' for j in range(1, n_features + 1)]

        end_idx = min(start_idx + batch_size, x_3d.shape[0])
        best_window_for_long_seq = []

        for i in range(start_idx, end_idx):
            # The frame and its train/test split do not depend on the lag.
            df = pd.DataFrame(x_3d[i, :, :], columns=columns)
            df_train, df_test = df[0:-n_future], df[-n_future:]

            error_per_lag = []
            for k in range(2, max_lag):
                model = VAR(df_train)
                try:
                    model_fitted = model.fit(k)
                    forecast = model_fitted.forecast(y=df_train.values[-k:], steps=n_future)
                    mse = mean_squared_error(df_test['V1'], forecast[:, 0])
                    error_per_lag.append(mse)
                except Exception:
                    # Best effort: an unfittable lag is heavily penalised.
                    # Not a bare except, so Ctrl-C still stops a long run.
                    error_per_lag.append(99999)

            # Index 0 corresponds to lag 2.
            best_window_for_long_seq.append(error_per_lag.index(min(error_per_lag)) + 2)

            if (i - start_idx) % 1000 == 0:
                print(f'Processed {i - start_idx}/{end_idx - start_idx} sequences...')

        Window = np.array(best_window_for_long_seq)

        # Save batch results
        np.save(self._path(f'generated-data-true-window2-BATCH_{start_idx}_{end_idx}.npy'), Window)
        print(f"VAR windows computed and saved for batch {start_idx}-{end_idx}")
        return Window

    def run_complete_data_generation(self, train_data_path):
        """
        Run training, synthetic generation and VAR lag selection end to end.

        Args:
            train_data_path: Path to the ``.npy`` array of training windows.

        Returns:
            Dict with keys ``'synthetic_data'``, ``'var_windows'`` and
            ``'vae_history'``.
        """
        print("="*60)
        print("STARTING VAE DATA GENERATION PIPELINE")
        print("="*60)

        # Step 1: Train VAE
        history = self.train_vae_pipeline(train_data_path)

        # Step 2: Generate synthetic data
        synthetic_data = self.generate_synthetic_dataset(train_data_path, cohort_size=350000)

        # Step 3: Compute VAR windows in batches
        batch_size = 50000
        total_samples = synthetic_data.shape[0]
        synthetic_path = self._path(self._SYNTHETIC_FILE)
        all_windows = [
            self.compute_var_windows_batch(synthetic_path, start_idx, batch_size)
            for start_idx in range(0, total_samples, batch_size)
        ]

        # Combine all batches
        final_windows = np.concatenate(all_windows, axis=0)

        # Save final results
        np.save(self._path('generated-data-true-window2.npy'), final_windows)
        np.save(self._path('generated-data2.npy'), synthetic_data)

        print("="*60)
        print("VAE DATA GENERATION COMPLETE!")
        print(f"Generated {synthetic_data.shape[0]} synthetic samples")
        print(f"Computed {final_windows.shape[0]} VAR windows")
        print("="*60)

        return {
            'synthetic_data': synthetic_data,
            'var_windows': final_windows,
            'vae_history': history
        }

# Usage
if __name__ == "__main__":
    # One-off run: train the VAE, generate the synthetic windows and compute
    # the VAR lag for each of them. Everything is written to the generator's
    # default output directory.
    generator = VAEDataGenerator()
    results = generator.run_complete_data_generation(
        train_data_path=r'/content/drive/MyDrive/PHD/2024/TEMP_OUTPUT_METROPM/multivariate_long_sequences-TRAIN-Daily-DIRECT-VAR.npy',
    )

Epoch 1/1000
4430/4430 ━━━━━━━━━━━━━━━━━━━━ 15s 2ms/step - loss: 0.8964 - mean_squared_error: 0.8964 - val_loss: 0.7946 - val_mean_squared_error: 0.7946
Epoch 2/1000
4430/4430 ━━━━━━━━━━━━━━━━━━━━ 7s 2ms/step - loss: 0.8044 - mean_squared_error: 0.8044 - val_loss: 0.7901 - val_mean_squared_error: 0.7901
Epoch 3/1000
4430/4430 ━━━━━━━━━━━━━━━━━━━━ 7s 2ms/step - loss: 0.7842 - mean_squared_error: 0.7842 - val_loss: 0.7714 - val_mean_squared_error: 0.7714
Epoch 4/1000
4430/4430 ━━━━━━━━━━━━━━━━━━━━ 7s 2ms/step - loss: 0.7728 - mean_squared_error: 0.7728 - val_loss: 0.7566 - val_mean_squared_error: 0.7566
Epoch 5/1000
4430/4430 ━━━━━━━━━━━━━━━━━━━━ 8s 2ms/step - loss: 0.7643 - mean_squared_error: 0.7643 - val_loss: 0.7566 - val_mean_squared_error: 0.7566
Epoch 6/1000
4430/4430 ━━━━━━━━━━━━━━━━━━━━

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive. The generator's
# default output directory and the training-data path used in this notebook
# both live under that mount point.
# NOTE(review): this cell appears after the generator cell in the export;
# presumably it has to be executed first - confirm the intended cell order.
from google.colab import drive
drive.mount('/content/drive')