<a href="https://colab.research.google.com/github/supriyag123/PHD_Pub/blob/main/AGENTIC-MODULE2-VAE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# VAE Data Generator Module - Your code with resumability added
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf
import os
import math
import plotly.graph_objects as go
import keras
from tensorflow import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, RepeatVector, TimeDistributed, Input
from keras.models import Model
from keras import saving
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import *
from tensorflow.keras.callbacks import EarlyStopping
from keras.callbacks import Callback
import plotly
from keras import losses
import plotly.express as px
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

class VAEDataGenerator:
    """
    One-time VAE training and synthetic data generation with resumability
    """

    def register_custom_layers(self):
        """Define the custom ``Sampling`` layer, register it with Keras, and return it.

        The class is registered under the package ``"MyLayers"`` so that
        ``keras.models.load_model`` can resolve it when reading the saved
        encoder. Registering again simply overwrites the previous entry, so
        the global custom-object registry is not cleared here (clearing would
        also drop unrelated registrations made elsewhere).

        Returns:
            The ``Sampling`` layer class.
        """

        @saving.register_keras_serializable(package="MyLayers")
        class Sampling(layers.Layer):
            """Reparameterisation layer: z = z_mean + exp(0.5 * z_log_var) * eps."""

            def __init__(self, factor, **kwargs):
                # Forward name/dtype/trainable so a config produced by
                # get_config() can be fed straight back into the constructor.
                super().__init__(**kwargs)
                # Stored and serialised, but not used by call().
                self.factor = factor

            def call(self, inputs):
                z_mean, z_log_var = inputs
                batch = tf.shape(z_mean)[0]
                dim = tf.shape(z_mean)[1]
                epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
                return z_mean + tf.exp(0.5 * z_log_var) * epsilon

            def get_config(self):
                # Start from the base layer config so the layer name and dtype
                # survive a save/load round trip; configs saved earlier that
                # only contain "factor" still load.
                config = super().get_config()
                config["factor"] = self.factor
                return config

        return Sampling

    def load_vae_models(self):
        """Load VAE models with proper custom layer registration"""
        # Register custom layers first
        Sampling = self.register_custom_layers()

        encoder_path = f'{self.output_dir}METROPM_vae-encoder-latent5-dim256.keras'
        decoder_path = f'{self.output_dir}METROPM_vae-decoder-latent5-dim256.keras'

        if os.path.exists(encoder_path) and os.path.exists(decoder_path):
            print("Loading VAE models with custom layers...")
            self.encoder = keras.models.load_model(encoder_path)
            self.decoder = keras.models.load_model(decoder_path)
            print("VAE models loaded successfully!")
            return True
        else:
            print("VAE models not found!")
            return False
        self.output_dir = output_dir
        self.encoder = None
        self.decoder = None
        self.vae = None

        # Create checkpoints directory
        self.checkpoint_dir = f"{output_dir}checkpoints/"
        os.makedirs(self.checkpoint_dir, exist_ok=True)

    def train_vae_pipeline(self, train_data_path):
        """Train the LSTM VAE on windowed sequences and save encoder and decoder.

        Args:
            train_data_path: Path to a ``.npy`` array indexed as
                (sequence, time step, feature).

        Returns:
            The ``History`` object returned by ``fit``.

        Side effects:
            Saves the encoder and decoder under ``self.output_dir`` and sets
            ``self.encoder``, ``self.decoder``, ``self.window_size`` and
            ``self.n_features``.
        """
        print("Starting VAE training pipeline...")

        # Load data
        train_data = np.load(train_data_path)
        window_size = train_data.shape[1]
        n_features = train_data.shape[2]

        # Only the first 80% of the sequences are used for fitting.
        count_train = int(math.ceil(0.8 * train_data.shape[0]))
        x_train = train_data[:count_train]

        # Single source of truth for the custom layer, shared with
        # load_vae_models() so saved models and loaders always agree.
        Sampling = self.register_custom_layers()

        latent_dim = 5
        intermediate_dim = 256
        half_dim = int(intermediate_dim / 2)

        # Encoder: two stacked LSTMs -> dense -> (z_mean, z_log_var, sampled z)
        encoder_inputs = layers.Input(shape=(window_size, n_features), name="encoder_input")
        h = layers.LSTM(intermediate_dim, activation='tanh', name="lstm1", return_sequences=True)(encoder_inputs)
        h = layers.LSTM(half_dim, activation='tanh', name="lstm2", return_sequences=False)(h)
        h = layers.Dense(half_dim, name="dense")(h)
        z_mean = layers.Dense(latent_dim, name="z_mean")(h)
        z_log_var = layers.Dense(latent_dim, name="z_log_var")(h)
        z = Sampling(1)([z_mean, z_log_var])
        encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
        encoder.summary()

        # Decoder: repeat z over the window, then mirror the encoder LSTMs
        inp_z = Input(shape=(latent_dim,), name="decoder")
        d = layers.RepeatVector(window_size, name="repeatvect")(inp_z)
        d = layers.Dense(half_dim, name="Dense2")(d)
        d = layers.LSTM(half_dim, activation='tanh', return_sequences=True, name="lstm1")(d)
        d = layers.LSTM(intermediate_dim, activation='tanh', return_sequences=True, name="lstm2")(d)
        decode_out = layers.TimeDistributed(Dense(n_features), name="decodeout")(d)
        decoder = keras.Model(inp_z, decode_out, name="decoder")
        decoder.summary()

        # KL annealing parameters
        n_epochs = 150
        klstart = 20
        kl_annealtime = n_epochs - klstart
        weight = K.variable(0.0)

        # Define the VAE as a Model with custom train/test steps
        class VAE(keras.Model):
            """Encoder/decoder pair trained on reconstruction + weighted KL loss."""

            def __init__(self, encoder, decoder, **kwargs):
                super().__init__(**kwargs)
                self.encoder = encoder
                self.decoder = decoder
                self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
                self.reconstruction_loss_tracker = keras.metrics.Mean(
                    name="reconstruction_loss"
                )
                self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

            @property
            def metrics(self):
                # Listing the trackers lets Keras reset them between epochs
                # and before each validation pass.
                return [
                    self.total_loss_tracker,
                    self.reconstruction_loss_tracker,
                    self.kl_loss_tracker,
                ]

            def _compute_vae_losses(self, data):
                """Return (reconstruction_loss, kl_loss) as scalars for one batch."""
                # fit() may hand over the batch wrapped in a tuple.
                if isinstance(data, (tuple, list)):
                    data = data[0]
                z_mean, z_log_var, z = self.encoder(data)
                reconstruction = self.decoder(z)
                # MSE over features, summed over time steps, averaged over the batch.
                per_step_mse = losses.mean_squared_error(data, reconstruction)
                reconstruction_loss = tf.reduce_mean(tf.reduce_sum(per_step_mse, axis=-1))
                kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
                kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
                return reconstruction_loss, kl_loss

            def _update_and_report(self, total_loss, reconstruction_loss, kl_loss):
                """Feed the trackers and return the metrics dict Keras logs."""
                self.total_loss_tracker.update_state(total_loss)
                self.reconstruction_loss_tracker.update_state(reconstruction_loss)
                self.kl_loss_tracker.update_state(kl_loss)
                return {
                    "loss": self.total_loss_tracker.result(),
                    "reconstruction_loss": self.reconstruction_loss_tracker.result(),
                    "kl_loss": self.kl_loss_tracker.result(),
                }

            def train_step(self, data):
                with tf.GradientTape() as tape:
                    reconstruction_loss, kl_loss = self._compute_vae_losses(data)
                    # `weight` is the annealed KL coefficient from the enclosing scope.
                    total_loss = reconstruction_loss + (weight * kl_loss)
                grads = tape.gradient(total_loss, self.trainable_weights)
                self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
                return self._update_and_report(total_loss, reconstruction_loss, kl_loss)

            def test_step(self, data):
                reconstruction_loss, kl_loss = self._compute_vae_losses(data)
                # Validation reports the un-annealed sum of both terms.
                total_loss = reconstruction_loss + kl_loss
                # The trackers must be updated here; otherwise the returned
                # validation metrics do not reflect the validation batches.
                return self._update_and_report(total_loss, reconstruction_loss, kl_loss)

        class AnnealingCallback(Callback):
            """Raises the shared KL weight at the end of selected epochs."""

            def __init__(self, weight):
                super().__init__()
                self.weight = weight

            def on_epoch_end(self, epoch, logs=None):
                # NOTE(review): with klstart=20 this window only covers epochs
                # 21-23, so the weight stops at 3/kl_annealtime (~0.023) and
                # never approaches 1. Left unchanged because it determines the
                # trained model - confirm this is intended.
                if epoch > klstart and epoch < klstart * 1.2:
                    new_weight = min(K.get_value(self.weight) + (1. / kl_annealtime), 1.)
                    K.set_value(self.weight, new_weight)
                print("Current KL Weight is " + str(K.get_value(self.weight)))

        # Train the VAE. No learning-rate schedule or early stopping is
        # applied; add such callbacks to the list below to enable them.
        vae = VAE(encoder, decoder)
        vae.compile(optimizer=keras.optimizers.Adam(clipnorm=1))
        history = vae.fit(
            x_train,
            epochs=n_epochs,
            batch_size=32,
            validation_split=0.1,
            callbacks=[AnnealingCallback(weight)],
        )

        # Save models
        encoder.save(f'{self.output_dir}METROPM_vae-encoder-latent5-dim256.keras')
        decoder.save(f'{self.output_dir}METROPM_vae-decoder-latent5-dim256.keras')

        # Store for data generation
        self.encoder = encoder
        self.decoder = decoder
        self.window_size = window_size
        self.n_features = n_features

        print("VAE training complete and models saved!")
        return history

    def generate_synthetic_dataset_fast(self, train_data_path, cohort_size=350000, checkpoint_every=5000):
        """
        FASTER synthetic dataset generation with optimizations
        """
        print(f"Generating synthetic dataset FAST - {cohort_size} samples...")

        # Load models if needed with proper custom layer registration
        if self.encoder is None:
            if not self.load_vae_models():
                raise ValueError("VAE models not found! Train VAE first.")

        # Load and process data
        train_data = np.load(train_data_path)
        window_size = train_data.shape[1]
        n_features = train_data.shape[2]

        # Keep original generator_multiply = 100 (as requested)
        generator_multiply = 100
        print(f"Using generator_multiply = {generator_multiply}")

        # Process all base samples (no reduction)
        max_base_samples = min(cohort_size // generator_multiply, train_data.shape[0])
        if max_base_samples < train_data.shape[0]:
            print(f"Processing {max_base_samples} base samples to reach target {cohort_size}")
            train_data = train_data[:max_base_samples]

        # Get encodings
        print("Computing encodings...")
        X_train_encoded = self.encoder.predict(train_data, batch_size=64, verbose=1)  # Larger batch
        mu, logvar, z = X_train_encoded
        sigma = tf.exp(0.5 * logvar)

        # OPTIMIZATION: Batch processing for decoder
        print("Generating synthetic data in batches...")
        all_results = []
        batch_size = 50  # Process 50 samples at once

        for i in range(0, len(mu), batch_size):
            batch_end = min(i + batch_size, len(mu))
            batch_mu = mu[i:batch_end]
            batch_sigma = sigma[i:batch_end]

            # Generate all Z vectors for this batch
            batch_z = []
            for j in range(len(batch_mu)):
                z_samples = tf.random.normal(
                    shape=(generator_multiply, mu.shape[1]),
                    mean=batch_mu[j],
                    stddev=batch_sigma[j]
                )
                batch_z.append(z_samples)

            # Decode all at once
            all_z = tf.concat(batch_z, axis=0)
            decoded_batch = self.decoder.predict(all_z, batch_size=128, verbose=0)  # Large batch
            decoded_batch = decoded_batch.reshape((decoded_batch.shape[0], window_size * n_features))
            all_results.append(decoded_batch)

            if (i // batch_size + 1) % 10 == 0:
                print(f"Processed {i + batch_size}/{len(mu)} base samples...")

        results1 = np.concatenate(all_results, axis=0)
        np.save(f'{self.output_dir}generated_large_subsquence_data.npy', results1)
        print(f"Fast synthetic dataset complete: {results1.shape}")
        return results1
        """
        Generate synthetic dataset with resumability - MODIFIED FOR COLAB
        """
        print(f"Generating synthetic dataset of size {cohort_size} with checkpointing...")

        # Check for existing checkpoint
        checkpoint_file = f"{self.checkpoint_dir}generation_progress.npz"
        if os.path.exists(checkpoint_file):
            checkpoint_data = np.load(checkpoint_file, allow_pickle=True)
            start_idx = int(checkpoint_data['last_completed_idx'])
            existing_files = checkpoint_data['completed_files'].tolist()
            print(f"Resuming from checkpoint: {start_idx}/{cohort_size}")
        else:
            start_idx = 0
            existing_files = []
            print("Starting fresh generation...")

        # Load encoder/decoder if not already loaded
        if self.encoder is None:
            encoder_path = f'{self.output_dir}METROPM_vae-encoder-latent5-dim256.keras'
            decoder_path = f'{self.output_dir}METROPM_vae-decoder-latent5-dim256.keras'

            if os.path.exists(encoder_path) and os.path.exists(decoder_path):
                self.encoder = keras.models.load_model(encoder_path)
                self.decoder = keras.models.load_model(decoder_path)
                print("Loaded existing VAE models")
            else:
                raise ValueError("VAE models not found! Train VAE first.")

        # Load original training data
        train_data = np.load(train_data_path)
        window_size = train_data.shape[1]
        n_features = train_data.shape[2]
        generator_multiply = 100

        # Get encodings once (save to avoid recomputation)
        encodings_file = f"{self.checkpoint_dir}encodings.npz"
        if start_idx == 0 and not os.path.exists(encodings_file):
            print("Computing encodings...")
            X_train_encoded = self.encoder.predict(train_data, verbose=1)
            mu, logvar, z = X_train_encoded

            # Save encodings to avoid recomputation
            np.savez(encodings_file, mu=mu, logvar=logvar)
            print("Encodings saved to checkpoint")
        else:
            print("Loading encodings from checkpoint...")
            encodings_data = np.load(encodings_file)
            mu = encodings_data['mu']
            logvar = encodings_data['logvar']

        sigma = tf.exp(0.5 * logvar)
        batch = tf.shape(mu)[0]
        dim = tf.shape(mu)[1]

        # Generate data with checkpointing - SAVE CHUNKS TO SEPARATE FILES
        target_samples = min(cohort_size, batch.numpy())
        chunk_size = checkpoint_every  # Save every 5000 samples

        for chunk_start in range(start_idx, target_samples, chunk_size):
            chunk_end = min(chunk_start + chunk_size, target_samples)
            chunk_file = f"{self.output_dir}synthetic_chunk_{chunk_start}_{chunk_end}.npy"

            # Skip if chunk already exists
            if chunk_file in existing_files:
                print(f"Chunk {chunk_start}-{chunk_end} already exists, skipping...")
                continue

            print(f"Generating chunk {chunk_start}-{chunk_end}...")
            chunk_data = []

            for i in range(chunk_start, chunk_end):
                try:
                    all_Z_i = tf.random.normal(
                        shape=(generator_multiply, dim),
                        mean=mu[i, :],
                        stddev=sigma[i, :]
                    )
                    X_train_decoded = self.decoder.predict(all_Z_i, verbose=0)
                    X_train_decoded = X_train_decoded.reshape((X_train_decoded.shape[0], window_size * n_features))
                    chunk_data.append(X_train_decoded)

                    # Progress update
                    if (i - chunk_start + 1) % 100 == 0:
                        print(f"  Progress: {i - chunk_start + 1}/{chunk_end - chunk_start}")

                except Exception as e:
                    print(f"Error at sample {i}: {e}")
                    # Save emergency checkpoint
                    emergency_checkpoint = {
                        'last_completed_idx': i,
                        'completed_files': existing_files,
                        'error_at': i
                    }
                    np.savez(f"{checkpoint_file}_emergency", **emergency_checkpoint)
                    raise

            # Save chunk
            chunk_array = np.concatenate(chunk_data, axis=0)
            np.save(chunk_file, chunk_array)
            existing_files.append(chunk_file)

            # Update checkpoint
            checkpoint = {
                'last_completed_idx': chunk_end,
                'completed_files': np.array(existing_files),
                'total_target': target_samples
            }
            np.savez(checkpoint_file, **checkpoint)
            print(f"Chunk {chunk_start}-{chunk_end} saved. Progress: {chunk_end}/{target_samples}")

        # Combine all chunks into final file
        print("Combining all chunks...")
        all_data = []
        for chunk_file in existing_files:
            chunk_data = np.load(chunk_file)
            all_data.append(chunk_data)

        results1 = np.concatenate(all_data, axis=0)

        # Save final result
        np.save(f'{self.output_dir}generated_large_subsquence_data.npy', results1)

        # Clean up
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)
        if os.path.exists(encodings_file):
            os.remove(encodings_file)
        print(f"Synthetic dataset complete: {results1.shape}")
        return results1

    def compute_var_windows_fast(self, data_path, start_idx=0, batch_size=5000):
        """
        FASTER VAR computation with optimizations
        """
        print(f"Computing VAR windows FAST for batch starting at {start_idx}...")

        # OPTIMIZATION: Extended window range 2->30 (as requested)
        max_var_lag = 30
        print(f"Testing VAR lags from 2 to {max_var_lag}")

        x = np.load(data_path)
        window_size = self.window_size if self.window_size else 50
        n_features = self.n_features if self.n_features else 13

        x_3d = x.reshape((x.shape[0], window_size, n_features))
        n_future = 1

        end_idx = min(start_idx + batch_size, x_3d.shape[0])
        best_window_for_long_seq = []

        # OPTIMIZATION: Batch processing
        for i in range(start_idx, end_idx):
            rmse_list = []

            # Test lags from 2 to 30 (as requested)
            for k in range(2, min(max_var_lag + 1, window_size//2)):
                cur_seq = x_3d[i,:,:]
                df = pd.DataFrame(cur_seq, columns=['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13'])
                df_train, df_test = df[0:-n_future], df[-n_future:]

                try:
                    model = VAR(df_train)
                    model_fitted1 = model.fit(k)
                    forecast_input1 = df_train.values[-k:]
                    fc1 = model_fitted1.forecast(y=forecast_input1, steps=n_future)
                    df_forecast1 = pd.DataFrame(fc1, index=df.index[-n_future:], columns=df.columns)
                    mse = mean_squared_error(df_test['V1'], df_forecast1['V1'].values)
                    rmse_list.append(mse)
                except:
                    rmse_list.append(99999)

            min_index = rmse_list.index(min(rmse_list))
            min_sw = min_index + 2
            best_window_for_long_seq.append(min_sw)

            if (i - start_idx) % 500 == 0:
                print(f'FAST processed {i - start_idx}/{end_idx - start_idx} sequences...')

        Window = np.array(best_window_for_long_seq)
        batch_file = f'{self.output_dir}generated-data-true-window-BATCH_{start_idx}_{end_idx}.npy'
        np.save(batch_file, Window)
        print(f"FAST VAR windows computed for batch {start_idx}-{end_idx}")
        return Window
        """
        Compute VAR windows with resumability - SMALLER BATCHES FOR COLAB
        """
        print(f"Computing VAR windows for batch starting at {start_idx}...")

        # Load data
        x = np.load(data_path)
        window_size = self.window_size if self.window_size else 50
        n_features = self.n_features if self.n_features else 13  # Updated for your 13 features

        x_3d = x.reshape((x.shape[0], window_size, n_features))
        n_future = 1
        K = window_size

        end_idx = min(start_idx + batch_size, x_3d.shape[0])

        # Check if this batch was already completed
        batch_file = f'{self.output_dir}generated-data-true-window-BATCH_{start_idx}_{end_idx}.npy'
        if os.path.exists(batch_file):
            print(f"Batch {start_idx}-{end_idx} already completed, loading...")
            return np.load(batch_file)

        best_window_for_long_seq = []

        # Process with mini-checkpoints
        mini_checkpoint_every = 1000
        checkpoint_file = f"{self.checkpoint_dir}var_batch_{start_idx}_{end_idx}_progress.npy"

        # Check for existing progress
        if os.path.exists(checkpoint_file):
            progress_data = np.load(checkpoint_file, allow_pickle=True).item()
            completed_idx = progress_data['completed_idx']
            best_window_for_long_seq = progress_data['results']
            print(f"Resuming batch from index {completed_idx}")
        else:
            completed_idx = start_idx

        # Process remaining samples
        for i in range(completed_idx, end_idx):
            rmse_list = []
            for k in range(2, round(K)):
                cur_seq = x_3d[i,:,:]
                df = pd.DataFrame(cur_seq, columns=['V1','V2','V3','V4','V5','V6','V7','V8','V9','V10','V11','V12','V13'])
                df_train, df_test = df[0:-n_future], df[-n_future:]
                model= VAR(df_train)
                try:
                    model_fitted1 = model.fit(k)
                    forecast_input1 = df_train.values[-k:]
                    fc1 = model_fitted1.forecast(y=forecast_input1, steps=n_future)
                    df_forecast1 = pd.DataFrame(fc1, index=df.index[-n_future:], columns=df.columns)
                    mse =  mean_squared_error(df_test['V1'], df_forecast1['V1'].values)
                    rmse_list.append(mse)
                except:
                    rmse_list.append(99999)

            min_index = rmse_list.index(min(rmse_list))
            min_sw = min_index + 2
            best_window_for_long_seq.append(min_sw)

            # Mini-checkpoint
            if (i - start_idx) % mini_checkpoint_every == 0:
                progress = {
                    'completed_idx': i + 1,
                    'results': best_window_for_long_seq,
                    'start_idx': start_idx,
                    'end_idx': end_idx
                }
                np.save(checkpoint_file, progress)
                print(f'Mini-checkpoint saved at {i + 1}/{end_idx}')

            if (i - start_idx) % 1000 == 0:
                print(f'Processed {i - start_idx}/{end_idx - start_idx} sequences...')

        Window = np.array(best_window_for_long_seq)

        # Save final batch results
        np.save(batch_file, Window)

        # Clean up checkpoint
        if os.path.exists(checkpoint_file):
            os.remove(checkpoint_file)

        print(f"VAR windows computed and saved for batch {start_idx}-{end_idx}")
        return Window

    def run_complete_data_generation_resumable(self, train_data_path):
        """Run the full pipeline, skipping every stage whose output already exists.

        Stages: (1) load or train the VAE, (2) generate the synthetic
        dataset, (3) compute the best VAR lag per synthetic sequence in
        batches of 10000.

        Args:
            train_data_path: Path to a ``.npy`` array indexed as
                (sequence, time step, feature).

        Returns:
            Dict with ``'synthetic_data'`` and ``'var_windows'`` arrays.
        """
        print("="*60)
        print("STARTING RESUMABLE VAE DATA GENERATION PIPELINE")
        print("="*60)

        # Step 1: Reuse saved models when both exist, otherwise train.
        # load_vae_models() registers the custom Sampling layer before
        # deserialising; calling keras.models.load_model directly fails with
        # "Could not locate class 'Sampling'".
        if self.load_vae_models():
            print("VAE models already exist, skipping training...")
            # Get data properties
            train_data = np.load(train_data_path)
            self.window_size = train_data.shape[1]
            self.n_features = train_data.shape[2]
        else:
            print("Training VAE...")
            self.train_vae_pipeline(train_data_path)

        # Step 2: Generate synthetic data (resumable)
        synthetic_file = f'{self.output_dir}generated_large_subsquence_data.npy'
        if not os.path.exists(synthetic_file):
            print("Generating synthetic data...")
            synthetic_data = self.generate_synthetic_dataset_resumable(train_data_path, cohort_size=350000)
        else:
            print("Synthetic data already exists, loading...")
            synthetic_data = np.load(synthetic_file)

        # Step 3: Compute VAR windows in batches (resumable)
        final_windows_file = f'{self.output_dir}generated-data-true-window.npy'
        if not os.path.exists(final_windows_file):
            print("Computing VAR windows...")
            batch_size = 10000  # Smaller batches for Colab
            total_samples = synthetic_data.shape[0]
            all_windows = []

            for start_idx in range(0, total_samples, batch_size):
                batch_windows = self.compute_var_windows_batch_resumable(
                    synthetic_file,
                    start_idx,
                    batch_size
                )
                all_windows.append(batch_windows)

            # Combine all batches
            final_windows = np.concatenate(all_windows, axis=0)

            # Save final results
            np.save(final_windows_file, final_windows)
            np.save(f'{self.output_dir}generated-data.npy', synthetic_data)
        else:
            print("VAR windows already computed, loading...")
            final_windows = np.load(final_windows_file)

        print("="*60)
        print("VAE DATA GENERATION COMPLETE!")
        print(f"Generated {synthetic_data.shape[0]} synthetic samples")
        print(f"Computed {final_windows.shape[0]} VAR windows")
        print("="*60)

        # Same result shape as run_fast_generation().
        return {
            'synthetic_data': synthetic_data,
            'var_windows': final_windows
        }

    def run_fast_generation(self, train_data_path, target_samples=350000):
        """Run the fast pipeline: load or train the VAE, generate data, pick VAR lags.

        Args:
            train_data_path: Path to the ``.npy`` training array.
            target_samples: Passed as ``cohort_size`` to
                ``generate_synthetic_dataset_fast``.

        Returns:
            Dict with ``'synthetic_data'`` and ``'var_windows'`` arrays; both
            are also saved with an ``-OPTIMIZED`` suffix in the output dir.
        """
        rule = "=" * 60
        print(rule)
        print("STARTING OPTIMIZED VAE DATA GENERATION")
        print(rule)

        # Stage 1: reuse saved models when available, otherwise train.
        if self.load_vae_models():
            print("VAE models loaded successfully!")
            # Window dimensions come from the training array itself.
            shape = np.load(train_data_path).shape
            self.window_size = shape[1]
            self.n_features = shape[2]
        else:
            print("Training VAE...")
            self.train_vae_pipeline(train_data_path)

        # Stage 2: synthetic dataset.
        print(f"Generating {target_samples} synthetic samples...")
        synthetic_data = self.generate_synthetic_dataset_fast(train_data_path, cohort_size=target_samples)

        # Stage 3: best VAR lag per synthetic sequence, in batches of 5000.
        print("Computing VAR windows (testing lags 2-30)...")
        batch_size = 5000
        generated_path = f'{self.output_dir}generated_large_subsquence_data.npy'
        per_batch = [
            self.compute_var_windows_fast(generated_path, offset, batch_size)
            for offset in range(0, synthetic_data.shape[0], batch_size)
        ]

        # Merge the batches and persist both arrays.
        final_windows = np.concatenate(per_batch, axis=0)
        np.save(f'{self.output_dir}generated-data-true-window-OPTIMIZED.npy', final_windows)
        np.save(f'{self.output_dir}generated-data-OPTIMIZED.npy', synthetic_data)

        print(rule)
        print("OPTIMIZED VAE DATA GENERATION COMPLETE!")
        print(f"Generated {synthetic_data.shape[0]} synthetic samples")
        print(f"Computed {final_windows.shape[0]} VAR windows")
        print("VAR lag range: 2-30")
        print("Generator multiply: 100 (kept original)")
        print(rule)

        return {'synthetic_data': synthetic_data, 'var_windows': final_windows}

# Script entry point: run the optimized pipeline on the METROPM training set.
if __name__ == "__main__":
    # Settings used by this run:
    #   - generator_multiply stays at 100 (no reduction)
    #   - VAR lag range 2->30 (extended from 15)
    #   - batch processing optimization kept
    #   - full dataset size
    train_path = r'/content/drive/MyDrive/PHD/2025/TEMP_OUTPUT_METROPM/multivariate_long_sequences-TRAIN-AUTO.npy'

    generator = VAEDataGenerator()
    print("Running optimized generation...")
    results = generator.run_fast_generation(train_path, target_samples=350000)  # Full size

Running optimized generation...
STARTING OPTIMIZED VAE DATA GENERATION
Loading existing VAE...


TypeError: <class 'keras.src.models.functional.Functional'> could not be deserialized properly. Please ensure that components that are Python object instances (layers, models, etc.) returned by `get_config()` are explicitly deserialized in the model's `from_config()` method.

config={'module': 'keras.src.models.functional', 'class_name': 'Functional', 'config': {}, 'registered_name': 'Functional', 'build_config': {'input_shape': None}, 'compile_config': {}}.

Exception encountered: Could not locate class 'Sampling'. Make sure custom classes are decorated with `@keras.saving.register_keras_serializable()`. Full object config: {'module': None, 'class_name': 'Sampling', 'config': {'factor': 1}, 'registered_name': 'MyLayers>Sampling', 'build_config': {'input_shape': [[None, 5], [None, 5]]}, 'name': 'sampling_1', 'inbound_nodes': [{'args': [[{'class_name': '__keras_tensor__', 'config': {'shape': [None, 5], 'dtype': 'float32', 'keras_history': ['z_mean', 0, 0]}}, {'class_name': '__keras_tensor__', 'config': {'shape': [None, 5], 'dtype': 'float32', 'keras_history': ['z_log_var', 0, 0]}}]], 'kwargs': {}}]}

In [None]:
# Mount Google Drive at /content/drive (Colab only). The training-data path
# used in the cell above lives under /content/drive/MyDrive, so this has to
# run before the pipeline can read or write any files there.
from google.colab import drive
drive.mount('/content/drive')