# VAE

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sdv.metadata import SingleTableMetadata
from sdv.single_table import TVAESynthesizer
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
from sklearn.mixture import BayesianGaussianMixture
from scipy.stats import multivariate_normal
import joblib
from itertools import product
import numpy as np
from scipy.spatial.distance import cdist
import optuna
from functools import partial
from sklearn.model_selection import ShuffleSplit

## 1. Vanilla TVAE
Using plain TVAE generate synthetic data as baseline model.

In [14]:
# Read the file and select numerical column
train_real = pd.read_csv("data/merged_data/train_set.csv").select_dtypes(include="number")
test_real  = pd.read_csv("data/merged_data/test_set.csv").select_dtypes(include="number")
df_copu  = pd.read_csv ("data/Copula_data/synthetic_clayton_split.csv").select_dtypes(include="number")

In [15]:
# Train / Test data split
train_copu, test_copu = train_test_split(df_copu, test_size=0.2,
                                            shuffle=True, random_state=42)

In [16]:
# Generate metadata
meta = SingleTableMetadata()
for c in train_real.columns:
    meta.add_column(c, sdtype="numerical")

In [17]:
# Define TVAE model and training
def train_tvae(df, meta, epochs=300, batch=256):
    tvae = TVAESynthesizer(metadata=meta, epochs=epochs,
                           batch_size=batch, cuda=True)
    tvae.fit(df)
    return tvae

# 1. Train with only real data
tvae_real = train_tvae(train_real, meta)
fake_real = tvae_real.sample(10_000)

# 2. Train with pure copula 
tvae_copu = train_tvae(train_copu, meta)
fake_copu = tvae_copu.sample(10_000)


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.



In [18]:
# 3. Train with mixing data (mixing ratio = [1:1, 1:2, 1:4, 1:8])
def build_mixes(train_real: pd.DataFrame,
                train_copu: pd.DataFrame,
                ratios=(1, 2, 4, 8),
                random_state: int = 42):
    n_real = len(train_real)
    mixes = {}

    for r in ratios:
        n_fake = n_real * r
        fake_subset = train_copu.sample(
            n_fake, replace=False, random_state=random_state
        )
        df_mix = pd.concat(
            [train_real, fake_subset], ignore_index=True
        ).sample(frac=1, random_state=random_state)  
        mixes[r] = df_mix.reset_index(drop=True)

    return mixes

mix_dict = build_mixes(train_real, train_copu)  # Default: 1,2,4,8
df_1x = mix_dict[1]   # 1 : 1
df_2x = mix_dict[2]   # 1 : 2
df_4x = mix_dict[4]   # 1 : 4
df_8x = mix_dict[8]   # 1 : 8

tvae_1x = train_tvae(df_1x, meta)
fake_1x = tvae_1x.sample(10_000)

tvae_2x = train_tvae(df_2x, meta)
fake_2x = tvae_2x.sample(10_000)

tvae_4x = train_tvae(df_4x, meta)
fake_4x = tvae_4x.sample(10_000)

tvae_8x = train_tvae(df_8x, meta)
fake_8x = tvae_8x.sample(10_000)


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.


The 'SingleTableMetadata' is deprecated. Please use the new 'Metadata' class for synthesizers.



In [19]:
# Test the results
TARGET = "fragmentation_index"          # Our prediction target

def xgb_score(df_train, df_test):
    X_tr, y_tr = df_train.drop(columns=[TARGET]), df_train[TARGET]
    X_te, y_te = df_test.drop(columns=[TARGET]), df_test[TARGET]
    mdl = XGBRegressor(
        n_estimators=400, max_depth=4, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, random_state=0
    )
    mdl.fit(X_tr, y_tr)
    pred = mdl.predict(X_te)
    return r2_score(y_te, pred), mean_squared_error(y_te, pred, squared=False)

In [20]:
# TSTR
r2_tstr_baseline, rmse_tstr_baseline = xgb_score(train_real, test_real)   # always-real baseline
r2_tstr_real,     rmse_tstr_real     = xgb_score(fake_real,  test_real)   # TVAE-real
r2_tstr_copu,     rmse_tstr_copu     = xgb_score(fake_copu,  test_real)   # Copula + real

print(f"TSTR Baseline : R^2={r2_tstr_baseline:.3f}  RMSE={rmse_tstr_baseline:.3f}")
print(f"TSTR RealOnly : R^2={r2_tstr_real    :.3f}  RMSE={rmse_tstr_real    :.3f}")
print(f"TSTR CopuOnly : R^2={r2_tstr_copu    :.3f}  RMSE={rmse_tstr_copu    :.3f}")

# TRTS
r2_trts_baseline, rmse_trts_baseline = xgb_score(train_real, test_copu)   # real to copula split
r2_trts_real,     rmse_trts_real     = xgb_score(train_real, fake_real)   # real to TVAE-real
r2_trts_copu,     rmse_trts_copu     = xgb_score(train_copu, fake_copu)   # copula to copula

print(f"TRTS Baseline : R^2 ={r2_trts_baseline:.3f}  RMSE={rmse_trts_baseline:.3f}")
print(f"TRTS RealOnly : R^2 ={r2_trts_real    :.3f}  RMSE={rmse_trts_real    :.3f}")
print(f"TRTS CopuOnly : R^2 ={r2_trts_copu    :.3f}  RMSE={rmse_trts_copu    :.3f}")

TSTR Baseline : R^2=0.983  RMSE=0.425
TSTR RealOnly : R^2=0.434  RMSE=2.489
TSTR CopuOnly : R^2=-0.613  RMSE=4.203
TRTS Baseline : R^2 =-0.388  RMSE=3.496
TRTS RealOnly : R^2 =-0.012  RMSE=2.702
TRTS CopuOnly : R^2 =-0.050  RMSE=1.749


In [21]:
# Mix data test:
# TSTR
r2_tstr_1x, rmse_tstr_1x = xgb_score(fake_1x, test_real)   
r2_tstr_2x, rmse_tstr_2x = xgb_score(fake_2x, test_real)   
r2_tstr_4x, rmse_tstr_4x = xgb_score(fake_4x, test_real)  
r2_tstr_8x, rmse_tstr_8x = xgb_score(fake_8x, test_real)  

print(f"TSTR 1x : R^2={r2_tstr_1x:.3f}  RMSE={rmse_tstr_1x:.3f}")
print(f"TSTR 2x : R^2={r2_tstr_2x:.3f}  RMSE={rmse_tstr_2x:.3f}")
print(f"TSTR 4x : R^2={r2_tstr_4x:.3f}  RMSE={rmse_tstr_4x:.3f}")
print(f"TSTR 8x : R^2={r2_tstr_8x:.3f}  RMSE={rmse_tstr_8x:.3f}")

TSTR 1x : R^2=0.289  RMSE=2.791
TSTR 2x : R^2=0.385  RMSE=2.596
TSTR 4x : R^2=0.253  RMSE=2.859
TSTR 8x : R^2=-0.359  RMSE=3.857


## 2. TAVE + Annealing
Basing on the logic of paper “Generating Sentences from a Continuous Space”, we can apply annealing system in TVAE.

In [22]:
# Data preprocessing
class NumScaler:
    """Simple packaging for fit / transform / inverse_transform"""
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, df: pd.DataFrame):
        self.columns = df.columns
        self.scaler.fit(df.values)

    def transform(self, df: pd.DataFrame) -> torch.Tensor:
        return torch.tensor(self.scaler.transform(df[self.columns].values),
                            dtype=torch.float32)

    def inverse_transform(self, tensor: torch.Tensor) -> pd.DataFrame:
        arr = self.scaler.inverse_transform(tensor.detach().cpu().numpy())
        return pd.DataFrame(arr, columns=self.columns)


# Define VAE structure
class VAE(nn.Module):
    def __init__(self, input_dim: int, latent_dim=32,
                 h=[128, 64]):
        super().__init__()
        h1, h2 = h
        # Encoder
        self.enc_fc1 = nn.Linear(input_dim, h[0])
        self.enc_fc2 = nn.Linear(h[0], h[1])
        self.mu      = nn.Linear(h[1], latent_dim)
        self.logvar  = nn.Linear(h[1], latent_dim)
        # Decoder
        self.dec_fc1 = nn.Linear(latent_dim, h[1])
        self.dec_fc2 = nn.Linear(h[1], h[0])
        self.recon   = nn.Linear(h[0], input_dim)

    def encode(self, x):
        h = F.relu(self.enc_fc1(x))
        h = F.relu(self.enc_fc2(h))
        return self.mu(h), self.logvar(h)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h = F.relu(self.dec_fc1(z))
        h = F.relu(self.dec_fc2(h))
        return self.recon(h)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z          = self.reparameterize(mu, logvar)
        recon_x    = self.decode(z)
        return recon_x, mu, logvar

# 3)  KL‑Annealing
class TVAEAnneal:
    def __init__(self,
                 metadata,
                 latent_dim=32,
                 hidden_sizes=(128, 64),
                 beta_max=1.0,
                 batch_size=256,
                 device=None):

        self.metadata = metadata
        self.latent   = latent_dim
        self.h_sizes  = hidden_sizes
        self.beta_max = beta_max
        self.batch    = batch_size
        self.device   = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.scaler   = NumScaler()

    # fit
    def fit(self, df: pd.DataFrame,
            epochs=300, warmup_epochs=30, lr=2e-3,
            sched='linear'):

        # 1) scale
        self.scaler.fit(df)
        tensor_data = self.scaler.transform(df)

        loader = DataLoader(
            TensorDataset(tensor_data),
            batch_size=min(self.batch, len(tensor_data)),  # Protect batch size > row number
            shuffle=True, drop_last=False)

        # 2) model / opt
        dim = tensor_data.shape[1]
        self.model = VAE(dim,
                         latent_dim=self.latent,
                         h=list(self.h_sizes)).to(self.device)

        opt = torch.optim.Adam(self.model.parameters(), lr=lr)

        total_steps = epochs * len(loader)
        warm_steps  = warmup_epochs * len(loader)       # Unify Variable name

        # Beta Annealing
        def beta(step: int) -> float:
            if sched == 'linear':
                core = min(1.0, step / warm_steps)
            elif sched == 'sigmoid':
                k = 6 / warm_steps
                core = 1 / (1 + np.exp(-k * (step - warm_steps/2)))
            elif sched == 'cyclic':
                cycles = 4
                cycle_len = total_steps // cycles
                pos = step % cycle_len
                core = min(1.0, pos / warm_steps)
            else:                     
                core = 1.0
            return core * self.beta_max                # Add beta_max

        # 3) train loop
        global_step = 0
        self.model.train()
        for epoch in range(epochs):
            for (x_batch,) in loader:
                global_step += 1

                x_batch = x_batch.to(self.device)
                recon, mu, logvar = self.model(x_batch)

                mse = F.mse_loss(recon, x_batch, reduction='mean')
                kl  = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
                loss = mse + beta(global_step) * kl

                opt.zero_grad()
                loss.backward()
                opt.step()

            if (epoch + 1) % 50 == 0:
                print(f"Epoch {epoch+1}/{epochs} | "
                      f"Loss={loss.item():.4f} | "
                      f"MSE={mse.item():.4f} | KL={kl.item():.4f} | "
                      f"β={beta(global_step):.3f}")

    # sample
    def sample(self, n: int) -> pd.DataFrame:
        self.model.eval()
        with torch.no_grad():
            z = torch.randn(n, self.latent).to(self.device)
            gen = self.model.decode(z)
        return self.scaler.inverse_transform(gen)


# Train & generate
def train_tvae_anneal(df_train, metadata,
                      epochs=300, warmup=30):
    synthesizer = TVAEAnneal(metadata, batch_size=256)
    synthesizer.fit(df_train, epochs=epochs, warmup_epochs=warmup)
    return synthesizer


In [23]:
# Train with real data only
tvae_real = TVAEAnneal(
    metadata=meta,
    batch_size=256,      
    device="cpu")         # CPU is enought for the data size

tvae_real.fit(
    train_real,
    epochs=300,           # Train epochs from fit
    warmup_epochs=30)     # KL-annealing

fake_real = tvae_real.sample(10_000)

# Train with Copula data only
tvae_copu = TVAEAnneal(
    metadata=meta,
    batch_size=256,      
    device="cpu")         

tvae_copu.fit(
    train_copu,
    epochs=300,           
    warmup_epochs=30)     

fake_copu = tvae_copu.sample(10_000)


Epoch 50/300 | Loss=0.4476 | MSE=0.2705 | KL=0.1771 | β=1.000
Epoch 100/300 | Loss=0.3812 | MSE=0.2009 | KL=0.1804 | β=1.000
Epoch 150/300 | Loss=0.3507 | MSE=0.1678 | KL=0.1829 | β=1.000
Epoch 200/300 | Loss=0.3271 | MSE=0.1512 | KL=0.1759 | β=1.000
Epoch 250/300 | Loss=0.2989 | MSE=0.1218 | KL=0.1772 | β=1.000
Epoch 300/300 | Loss=0.2835 | MSE=0.1198 | KL=0.1637 | β=1.000
Epoch 50/300 | Loss=0.3530 | MSE=0.1217 | KL=0.2313 | β=1.000
Epoch 100/300 | Loss=0.3584 | MSE=0.1283 | KL=0.2300 | β=1.000
Epoch 150/300 | Loss=0.3467 | MSE=0.1136 | KL=0.2332 | β=1.000
Epoch 200/300 | Loss=0.3422 | MSE=0.1103 | KL=0.2320 | β=1.000
Epoch 250/300 | Loss=0.3446 | MSE=0.1159 | KL=0.2287 | β=1.000
Epoch 300/300 | Loss=0.3481 | MSE=0.1154 | KL=0.2327 | β=1.000


In [24]:
# TSTR
r2_tstr_real, rmse_tstr_real = xgb_score(fake_real, test_real)
r2_tstr_copu, rmse_tstr_copu = xgb_score(fake_copu, test_real)

# TRTS
r2_trts_real, rmse_trts_real = xgb_score(train_real, fake_real)
r2_trts_copu, rmse_trts_copu = xgb_score(train_copu, fake_copu)

# Print
print(f"TSTR RealOnly : R^2={r2_tstr_real:.3f}  RMSE={rmse_tstr_real:.3f}")
print(f"TSTR CopuOnly : R^2={r2_tstr_copu:.3f}  RMSE={rmse_tstr_copu:.3f}")

print(f"TRTS RealOnly : R^2={r2_trts_real:.3f}  RMSE={rmse_trts_real:.3f}")
print(f"TRTS CopuOnly : R^2={r2_tstr_copu:.3f}  RMSE={rmse_tstr_copu:.3f}")


TSTR RealOnly : R^2=0.688  RMSE=1.849
TSTR CopuOnly : R^2=-0.079  RMSE=3.437
TRTS RealOnly : R^2=0.231  RMSE=1.589
TRTS CopuOnly : R^2=-0.079  RMSE=3.437


In [25]:
# Mix Data
tvae_1x = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu")

# 1 : 1
tvae_1x.fit(
    df_1x,
    epochs=300,
    warmup_epochs=30)

fake_1x = tvae_1x.sample(10_000)

# 1 : 2
tvae_2x = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu")

tvae_2x.fit(
    df_2x,
    epochs=300,
    warmup_epochs=30)

fake_2x = tvae_2x.sample(10_000)

# 1 : 4
tvae_4x = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu")

tvae_4x.fit(
    df_4x,
    epochs=300,
    warmup_epochs=30)

fake_4x = tvae_4x.sample(10_000)

# 1 : 8
tvae_8x = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu")

tvae_8x.fit(
    df_8x,
    epochs=300,
    warmup_epochs=30)

fake_8x = tvae_8x.sample(10_000)

Epoch 50/300 | Loss=0.4501 | MSE=0.2544 | KL=0.1957 | β=1.000
Epoch 100/300 | Loss=0.4017 | MSE=0.1950 | KL=0.2067 | β=1.000
Epoch 150/300 | Loss=0.3656 | MSE=0.1655 | KL=0.2001 | β=1.000
Epoch 200/300 | Loss=0.3321 | MSE=0.1362 | KL=0.1959 | β=1.000
Epoch 250/300 | Loss=0.3290 | MSE=0.1268 | KL=0.2022 | β=1.000
Epoch 300/300 | Loss=0.3211 | MSE=0.1138 | KL=0.2073 | β=1.000
Epoch 50/300 | Loss=0.4639 | MSE=0.2475 | KL=0.2164 | β=1.000
Epoch 100/300 | Loss=0.4289 | MSE=0.2124 | KL=0.2166 | β=1.000
Epoch 150/300 | Loss=0.3763 | MSE=0.1686 | KL=0.2077 | β=1.000
Epoch 200/300 | Loss=0.3406 | MSE=0.1282 | KL=0.2124 | β=1.000
Epoch 250/300 | Loss=0.3270 | MSE=0.1236 | KL=0.2034 | β=1.000
Epoch 300/300 | Loss=0.3422 | MSE=0.1223 | KL=0.2200 | β=1.000
Epoch 50/300 | Loss=0.5067 | MSE=0.2887 | KL=0.2180 | β=1.000
Epoch 100/300 | Loss=0.3978 | MSE=0.1645 | KL=0.2333 | β=1.000
Epoch 150/300 | Loss=0.3988 | MSE=0.1616 | KL=0.2372 | β=1.000
Epoch 200/300 | Loss=0.3263 | MSE=0.1037 | KL=0.2226 | β=1

In [26]:
# TSTR
r2_tstr_1x, rmse_tstr_1x = xgb_score(fake_1x, test_real)
r2_tstr_2x, rmse_tstr_2x = xgb_score(fake_2x, test_real)
r2_tstr_4x, rmse_tstr_4x = xgb_score(fake_4x, test_real)
r2_tstr_8x, rmse_tstr_8x = xgb_score(fake_8x, test_real)

# Print
print(f"TSTR 1x : R^2={r2_tstr_1x:.3f}  RMSE={rmse_tstr_1x:.3f}")
print(f"TSTR 2x : R^2={r2_tstr_2x:.3f}  RMSE={rmse_tstr_2x:.3f}")
print(f"TSTR 4x : R^2={r2_tstr_4x:.3f}  RMSE={rmse_tstr_4x:.3f}")
print(f"TSTR 8x : R^2={r2_tstr_8x:.3f}  RMSE={rmse_tstr_8x:.3f}")

TSTR 1x : R^2=0.524  RMSE=2.283
TSTR 2x : R^2=0.516  RMSE=2.302
TSTR 4x : R^2=0.285  RMSE=2.799
TSTR 8x : R^2=0.304  RMSE=2.760


In [27]:
# Only consider R^2 score for model validation
def downstream_r2(fake_df: pd.DataFrame,
                  real_val_df: pd.DataFrame,
                  target_col: str = None,
                  seed: int = 0) -> float:
    """

    • fake_df      : Synthetic data generated by TVAE.
    • real_val_df  : Data set for test.
    • target_col   : Target column name.
    """
    target_col = target_col or real_val_df.columns[-1]

    X_train = fake_df.drop(columns=[target_col]).values
    y_train = fake_df[target_col].values
    X_val   = real_val_df.drop(columns=[target_col]).values
    y_val   = real_val_df[target_col].values

    model = XGBRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=seed,                  
    )

    model.fit(X_train, y_train)
    preds = model.predict(X_val)

    return r2_score(y_val, preds)


In [28]:
# Test the distribution difference between synthetic data and real data

def _rbf_kernel(X, Y, gamma):
    """RBF kernel k(x,y) = exp(-gamma‖x-y‖²)"""
    dists = cdist(X, Y, 'sqeuclidean')
    return np.exp(-gamma * dists)

def compute_mmd(fake_df: pd.DataFrame,
                real_df: pd.DataFrame,
                gamma: float = None) -> float:
    """
    return Maximum Mean Discrepancy (MMD): The smaller value means more similar distribution.
    gamma is set to 1/(2 sigma^2) where sigma^2 is the mean characeristic variance of the merge of real & syn data
    """
    X = fake_df.values.astype(np.float64)
    Y = real_df.values.astype(np.float64)

    if gamma is None:
        var = np.var(np.vstack([X, Y]), axis=0).mean()
        gamma = 1.0 / (2 * var + 1e-8)

    Kxx = _rbf_kernel(X, X, gamma)
    Kyy = _rbf_kernel(Y, Y, gamma)
    Kxy = _rbf_kernel(X, Y, gamma)

    m = len(X)
    n = len(Y)

    mmd2 = (Kxx.sum() - np.trace(Kxx)) / (m * (m - 1)) \
         + (Kyy.sum() - np.trace(Kyy)) / (n * (n - 1)) \
         - 2 * Kxy.mean()

    return float(np.sqrt(max(mmd2, 0)))        


In [29]:
def objective(trial, df_train, meta):
    # Validation set
    ss = ShuffleSplit(n_splits=1, test_size=0.25, random_state=trial.number)
    tr_sub_idx, val_idx = next(ss.split(df_train))
    df_sub = df_train.iloc[tr_sub_idx].reset_index(drop=True)
    df_val = df_train.iloc[val_idx].reset_index(drop=True)

    # Hyperparameter Sampling
    h1 = trial.suggest_int('h1', 64, 256, step=64)
    h2 = trial.suggest_int('h2', 32, 128, step=32)
    latent = trial.suggest_int('latent', 16, 128, step=16)
    lr = trial.suggest_float('lr', 1e-4, 5e-3, log=True)   
    warm = trial.suggest_int('warmup', 5, 40)
    beta_max = trial.suggest_float('beta_max', 1.0, 4.0)
    sched = trial.suggest_categorical('sched', ['linear', 'sigmoid', 'cyclic'])

    # Build up models
    model = TVAEAnneal(
        metadata=meta,
        batch_size=256,
        device="cpu",
        latent_dim=latent,
        hidden_sizes=(h1, h2),
        beta_max=beta_max)

    model.fit(df_train,
              epochs=200,
              warmup_epochs=warm,
              lr=lr,
              sched=sched)

    # Evaluate and return metrics
    fake = model.sample(len(df_train))
    r2   = downstream_r2(fake, df_val)    
    mmd  = compute_mmd(fake, df_val)
    trial.set_user_attr("r2",  r2)
    trial.set_user_attr("mmd", mmd)
    score = r2 - 0.1 * mmd
    return score


In [30]:
study = optuna.create_study(
    study_name="anneal_tvae_search",
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),  
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10)  # Early-stop for obvious wrong trial
)

study.optimize(
    partial(objective,
            df_train=train_real,
            meta=meta),
    n_trials=50,          # Try 50 different combination
    timeout=60*60,        # Time limitation
    n_jobs=1,             
    show_progress_bar=True
)


[I 2025-08-18 00:58:28,732] A new study created in memory with name: anneal_tvae_search
  0%|          | 0/50 [00:00<?, ?it/s]

Epoch 50/200 | Loss=0.3898 | MSE=0.2628 | KL=0.0865 | β=1.468
Epoch 100/200 | Loss=0.3228 | MSE=0.1831 | KL=0.0952 | β=1.468
Epoch 150/200 | Loss=0.2910 | MSE=0.1545 | KL=0.0930 | β=1.468
Epoch 200/200 | Loss=0.2783 | MSE=0.1388 | KL=0.0950 | β=1.468


Best trial: 0. Best value: 0.20783:   2%|▏         | 1/50 [00:00<00:26,  1.83it/s, 0.55/3600 seconds]

[I 2025-08-18 00:58:29,279] Trial 0 finished with value: 0.20783043233758203 and parameters: {'h1': 128, 'h2': 128, 'latent': 96, 'lr': 0.0010401663679887319, 'warmup': 10, 'beta_max': 1.4679835610086078, 'sched': 'sigmoid'}. Best is trial 0 with value: 0.20783043233758203.
Epoch 50/200 | Loss=0.2003 | MSE=0.2003 | KL=0.0749 | β=0.000
Epoch 100/200 | Loss=0.1618 | MSE=0.1618 | KL=0.0796 | β=0.000
Epoch 150/200 | Loss=0.1532 | MSE=0.1532 | KL=0.0782 | β=0.000
Epoch 200/200 | Loss=0.1225 | MSE=0.1225 | KL=0.0798 | β=0.000


Best trial: 1. Best value: 0.228629:   4%|▍         | 2/50 [00:01<00:26,  1.84it/s, 1.09/3600 seconds]

[I 2025-08-18 00:58:29,823] Trial 1 finished with value: 0.22862945095189863 and parameters: {'h1': 192, 'h2': 32, 'latent': 128, 'lr': 0.002595942550311264, 'warmup': 12, 'beta_max': 1.5454749016213019, 'sched': 'cyclic'}. Best is trial 1 with value: 0.22862945095189863.
Epoch 50/200 | Loss=0.9974 | MSE=0.9888 | KL=0.0041 | β=2.099
Epoch 100/200 | Loss=0.9638 | MSE=0.9429 | KL=0.0100 | β=2.099
Epoch 150/200 | Loss=0.7722 | MSE=0.6676 | KL=0.0498 | β=2.099
Epoch 200/200 | Loss=0.5810 | MSE=0.4382 | KL=0.0680 | β=2.099


Best trial: 1. Best value: 0.228629:   6%|▌         | 3/50 [00:01<00:23,  1.96it/s, 1.56/3600 seconds]

[I 2025-08-18 00:58:30,290] Trial 2 finished with value: 0.19741379671176829 and parameters: {'h1': 128, 'h2': 64, 'latent': 80, 'lr': 0.00017258215396625024, 'warmup': 15, 'beta_max': 2.099085529881075, 'sched': 'sigmoid'}. Best is trial 1 with value: 0.22862945095189863.
Epoch 50/200 | Loss=0.6198 | MSE=0.4019 | KL=0.1823 | β=1.195
Epoch 100/200 | Loss=0.5220 | MSE=0.2725 | KL=0.2088 | β=1.195
Epoch 150/200 | Loss=0.5017 | MSE=0.2606 | KL=0.2017 | β=1.195
Epoch 200/200 | Loss=0.4650 | MSE=0.2262 | KL=0.1998 | β=1.195


Best trial: 3. Best value: 0.487488:   8%|▊         | 4/50 [00:02<00:22,  2.03it/s, 2.02/3600 seconds]

[I 2025-08-18 00:58:30,755] Trial 3 finished with value: 0.4874883279636331 and parameters: {'h1': 192, 'h2': 96, 'latent': 16, 'lr': 0.001076962247826313, 'warmup': 11, 'beta_max': 1.1951547789558385, 'sched': 'sigmoid'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=0.9661 | MSE=0.9338 | KL=0.0130 | β=2.486
Epoch 100/200 | Loss=0.6901 | MSE=0.5780 | KL=0.0451 | β=2.486
Epoch 150/200 | Loss=0.5297 | MSE=0.3861 | KL=0.0578 | β=2.486
Epoch 200/200 | Loss=0.4413 | MSE=0.2852 | KL=0.0628 | β=2.486


Best trial: 3. Best value: 0.487488:  10%|█         | 5/50 [00:02<00:21,  2.07it/s, 2.49/3600 seconds]

[I 2025-08-18 00:58:31,225] Trial 4 finished with value: 0.32371403491526207 and parameters: {'h1': 128, 'h2': 32, 'latent': 96, 'lr': 0.0005595074635794797, 'warmup': 9, 'beta_max': 2.4855307303338106, 'sched': 'sigmoid'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=0.7886 | MSE=0.6895 | KL=0.0254 | β=3.909
Epoch 100/200 | Loss=0.5611 | MSE=0.3679 | KL=0.0494 | β=3.909
Epoch 150/200 | Loss=0.5096 | MSE=0.3048 | KL=0.0524 | β=3.909
Epoch 200/200 | Loss=0.4966 | MSE=0.2874 | KL=0.0535 | β=3.909


Best trial: 3. Best value: 0.487488:  12%|█▏        | 6/50 [00:03<00:21,  2.02it/s, 3.00/3600 seconds]

[I 2025-08-18 00:58:31,739] Trial 5 finished with value: 0.245590130306807 and parameters: {'h1': 192, 'h2': 64, 'latent': 80, 'lr': 0.0008488762161408717, 'warmup': 11, 'beta_max': 3.9087538832936755, 'sched': 'sigmoid'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=0.9819 | MSE=0.9819 | KL=0.0075 | β=0.000
Epoch 100/200 | Loss=0.7548 | MSE=0.7548 | KL=0.0585 | β=0.000
Epoch 150/200 | Loss=0.5612 | MSE=0.5612 | KL=0.1029 | β=0.000
Epoch 200/200 | Loss=0.5092 | MSE=0.5092 | KL=0.1246 | β=0.000


Best trial: 3. Best value: 0.487488:  14%|█▍        | 7/50 [00:03<00:20,  2.08it/s, 3.46/3600 seconds]

[I 2025-08-18 00:58:32,193] Trial 6 finished with value: 0.4292577634278079 and parameters: {'h1': 192, 'h2': 128, 'latent': 16, 'lr': 0.000215262809722153, 'warmup': 6, 'beta_max': 1.975990992289793, 'sched': 'cyclic'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=0.9890 | MSE=0.9781 | KL=0.0089 | β=1.224
Epoch 100/200 | Loss=0.8568 | MSE=0.8079 | KL=0.0400 | β=1.224
Epoch 150/200 | Loss=0.6391 | MSE=0.5396 | KL=0.0813 | β=1.224
Epoch 200/200 | Loss=0.4710 | MSE=0.3351 | KL=0.1110 | β=1.224


Best trial: 3. Best value: 0.487488:  16%|█▌        | 8/50 [00:03<00:20,  2.07it/s, 3.95/3600 seconds]

[I 2025-08-18 00:58:32,682] Trial 7 finished with value: 0.12306285316239395 and parameters: {'h1': 128, 'h2': 64, 'latent': 80, 'lr': 0.000173550564698551, 'warmup': 33, 'beta_max': 1.2236519310393126, 'sched': 'linear'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=0.2219 | MSE=0.2219 | KL=0.0979 | β=0.000
Epoch 100/200 | Loss=0.1335 | MSE=0.1335 | KL=0.1094 | β=0.000
Epoch 150/200 | Loss=0.1419 | MSE=0.1419 | KL=0.1070 | β=0.000
Epoch 200/200 | Loss=0.1239 | MSE=0.1239 | KL=0.1077 | β=0.000


Best trial: 3. Best value: 0.487488:  18%|█▊        | 9/50 [00:04<00:20,  2.04it/s, 4.45/3600 seconds]

[I 2025-08-18 00:58:33,185] Trial 8 finished with value: 0.21447345734762707 and parameters: {'h1': 64, 'h2': 128, 'latent': 96, 'lr': 0.001732053535845956, 'warmup': 32, 'beta_max': 1.222133955202271, 'sched': 'cyclic'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=1.0045 | MSE=0.9981 | KL=0.0020 | β=3.189
Epoch 100/200 | Loss=0.9941 | MSE=0.9749 | KL=0.0060 | β=3.189
Epoch 150/200 | Loss=0.9160 | MSE=0.8065 | KL=0.0343 | β=3.189
Epoch 200/200 | Loss=0.8789 | MSE=0.7174 | KL=0.0507 | β=3.189


Best trial: 3. Best value: 0.487488:  20%|██        | 10/50 [00:04<00:19,  2.08it/s, 4.91/3600 seconds]

[I 2025-08-18 00:58:33,643] Trial 9 finished with value: 0.27590592625368854 and parameters: {'h1': 192, 'h2': 64, 'latent': 16, 'lr': 0.0003375589571206087, 'warmup': 16, 'beta_max': 3.1888185350141924, 'sched': 'sigmoid'}. Best is trial 3 with value: 0.4874883279636331.
Epoch 50/200 | Loss=0.5684 | MSE=0.3370 | KL=0.0754 | β=3.071
Epoch 100/200 | Loss=0.4969 | MSE=0.2709 | KL=0.0736 | β=3.071
Epoch 150/200 | Loss=0.4693 | MSE=0.2422 | KL=0.0739 | β=3.071
Epoch 200/200 | Loss=0.4458 | MSE=0.2276 | KL=0.0711 | β=3.071


Best trial: 10. Best value: 0.49955:  22%|██▏       | 11/50 [00:05<00:19,  2.02it/s, 5.44/3600 seconds]

[I 2025-08-18 00:58:34,170] Trial 10 finished with value: 0.49954979608069194 and parameters: {'h1': 256, 'h2': 96, 'latent': 48, 'lr': 0.0038364834250811503, 'warmup': 23, 'beta_max': 3.070741220707436, 'sched': 'linear'}. Best is trial 10 with value: 0.49954979608069194.
Epoch 50/200 | Loss=0.5700 | MSE=0.3318 | KL=0.0797 | β=2.990
Epoch 100/200 | Loss=0.5468 | MSE=0.3022 | KL=0.0818 | β=2.990
Epoch 150/200 | Loss=0.4725 | MSE=0.2495 | KL=0.0746 | β=2.990
Epoch 200/200 | Loss=0.4356 | MSE=0.2119 | KL=0.0748 | β=2.990


Best trial: 10. Best value: 0.49955:  24%|██▍       | 12/50 [00:05<00:19,  1.98it/s, 5.97/3600 seconds]

[I 2025-08-18 00:58:34,699] Trial 11 finished with value: 0.1821921606590728 and parameters: {'h1': 256, 'h2': 96, 'latent': 48, 'lr': 0.004881460763000736, 'warmup': 25, 'beta_max': 2.989668990292367, 'sched': 'linear'}. Best is trial 10 with value: 0.49954979608069194.
Epoch 50/200 | Loss=0.5774 | MSE=0.3498 | KL=0.0700 | β=3.250
Epoch 100/200 | Loss=0.5329 | MSE=0.2952 | KL=0.0731 | β=3.250
Epoch 150/200 | Loss=0.5027 | MSE=0.2563 | KL=0.0758 | β=3.250


Best trial: 12. Best value: 0.583464:  26%|██▌       | 13/50 [00:06<00:19,  1.91it/s, 6.53/3600 seconds]

Epoch 200/200 | Loss=0.4534 | MSE=0.2230 | KL=0.0709 | β=3.250
[I 2025-08-18 00:58:35,263] Trial 12 finished with value: 0.583464366069942 and parameters: {'h1': 256, 'h2': 96, 'latent': 48, 'lr': 0.004970573667308987, 'warmup': 24, 'beta_max': 3.2498939638387294, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.6006 | MSE=0.3613 | KL=0.0699 | β=3.423
Epoch 100/200 | Loss=0.5672 | MSE=0.3161 | KL=0.0733 | β=3.423
Epoch 150/200 | Loss=0.5378 | MSE=0.3074 | KL=0.0673 | β=3.423


Best trial: 12. Best value: 0.583464:  28%|██▊       | 14/50 [00:07<00:18,  1.92it/s, 7.04/3600 seconds]

Epoch 200/200 | Loss=0.4725 | MSE=0.2166 | KL=0.0748 | β=3.423
[I 2025-08-18 00:58:35,776] Trial 13 finished with value: 0.49723573918496744 and parameters: {'h1': 256, 'h2': 96, 'latent': 48, 'lr': 0.004839368871662966, 'warmup': 22, 'beta_max': 3.4233340985941108, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.6338 | MSE=0.4009 | KL=0.0643 | β=3.620
Epoch 100/200 | Loss=0.6082 | MSE=0.3865 | KL=0.0612 | β=3.620
Epoch 150/200 | Loss=0.5535 | MSE=0.3140 | KL=0.0661 | β=3.620


Best trial: 12. Best value: 0.583464:  30%|███       | 15/50 [00:07<00:18,  1.92it/s, 7.56/3600 seconds]

Epoch 200/200 | Loss=0.5286 | MSE=0.2815 | KL=0.0683 | β=3.620
[I 2025-08-18 00:58:36,297] Trial 14 finished with value: 0.5641328358915929 and parameters: {'h1': 256, 'h2': 96, 'latent': 48, 'lr': 0.0025235748876857762, 'warmup': 25, 'beta_max': 3.619935018788723, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.7270 | MSE=0.4661 | KL=0.0653 | β=3.998
Epoch 100/200 | Loss=0.7086 | MSE=0.4416 | KL=0.0668 | β=3.998
Epoch 150/200 | Loss=0.6672 | MSE=0.3991 | KL=0.0671 | β=3.998


Best trial: 12. Best value: 0.583464:  32%|███▏      | 16/50 [00:08<00:17,  1.93it/s, 8.08/3600 seconds]

Epoch 200/200 | Loss=0.6663 | MSE=0.4124 | KL=0.0635 | β=3.998
[I 2025-08-18 00:58:36,810] Trial 15 finished with value: 0.40421856950617924 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.002199353581717668, 'warmup': 40, 'beta_max': 3.997552053668846, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.5383 | MSE=0.3285 | KL=0.0604 | β=3.476
Epoch 100/200 | Loss=0.4984 | MSE=0.2801 | KL=0.0628 | β=3.476
Epoch 150/200 | Loss=0.4929 | MSE=0.2690 | KL=0.0644 | β=3.476
Epoch 200/200 | Loss=0.4416 | MSE=0.2242 | KL=0.0626 | β=3.476


Best trial: 12. Best value: 0.583464:  34%|███▍      | 17/50 [00:08<00:17,  1.85it/s, 8.67/3600 seconds]

[I 2025-08-18 00:58:37,400] Trial 16 finished with value: 0.4307004783510339 and parameters: {'h1': 256, 'h2': 128, 'latent': 64, 'lr': 0.002862080283426596, 'warmup': 28, 'beta_max': 3.475940740092566, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.8610 | MSE=0.6982 | KL=0.0444 | β=3.663
Epoch 100/200 | Loss=0.7244 | MSE=0.4920 | KL=0.0634 | β=3.663
Epoch 150/200 | Loss=0.6899 | MSE=0.4561 | KL=0.0638 | β=3.663
Epoch 200/200 | Loss=0.6742 | MSE=0.4269 | KL=0.0675 | β=3.663


Best trial: 12. Best value: 0.583464:  36%|███▌      | 18/50 [00:09<00:16,  1.94it/s, 9.13/3600 seconds]

[I 2025-08-18 00:58:37,860] Trial 17 finished with value: 0.36755897227467255 and parameters: {'h1': 64, 'h2': 96, 'latent': 32, 'lr': 0.001377958327791997, 'warmup': 19, 'beta_max': 3.663295649205324, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.6718 | MSE=0.5120 | KL=0.0589 | β=2.713
Epoch 100/200 | Loss=0.5135 | MSE=0.3130 | KL=0.0739 | β=2.713
Epoch 150/200 | Loss=0.4887 | MSE=0.2854 | KL=0.0749 | β=2.713
Epoch 200/200 | Loss=0.4682 | MSE=0.2592 | KL=0.0770 | β=2.713


Best trial: 12. Best value: 0.583464:  38%|███▊      | 19/50 [00:09<00:16,  1.89it/s, 9.69/3600 seconds]

[I 2025-08-18 00:58:38,422] Trial 18 finished with value: 0.44592488686954307 and parameters: {'h1': 256, 'h2': 64, 'latent': 64, 'lr': 0.0006226844812931184, 'warmup': 29, 'beta_max': 2.7131384122454323, 'sched': 'linear'}. Best is trial 12 with value: 0.583464366069942.
Epoch 50/200 | Loss=0.6272 | MSE=0.3761 | KL=0.0958 | β=2.620
Epoch 100/200 | Loss=0.5597 | MSE=0.3144 | KL=0.0936 | β=2.620
Epoch 150/200 | Loss=0.5170 | MSE=0.2673 | KL=0.0953 | β=2.620
Epoch 200/200 | Loss=0.4861 | MSE=0.2363 | KL=0.0953 | β=2.620


Best trial: 19. Best value: 0.636991:  40%|████      | 20/50 [00:10<00:15,  1.91it/s, 10.20/3600 seconds]

[I 2025-08-18 00:58:38,928] Trial 19 finished with value: 0.6369914046792744 and parameters: {'h1': 256, 'h2': 128, 'latent': 32, 'lr': 0.0033215769881425887, 'warmup': 34, 'beta_max': 2.620285060340782, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5846 | MSE=0.3309 | KL=0.0994 | β=2.551
Epoch 100/200 | Loss=0.5382 | MSE=0.2938 | KL=0.0958 | β=2.551
Epoch 150/200 | Loss=0.5153 | MSE=0.2534 | KL=0.1027 | β=2.551
Epoch 200/200 | Loss=0.4846 | MSE=0.2348 | KL=0.0979 | β=2.551


Best trial: 19. Best value: 0.636991:  42%|████▏     | 21/50 [00:10<00:15,  1.91it/s, 10.72/3600 seconds]

[I 2025-08-18 00:58:39,447] Trial 20 finished with value: 0.43456268485372196 and parameters: {'h1': 192, 'h2': 128, 'latent': 32, 'lr': 0.0032089670536255364, 'warmup': 39, 'beta_max': 2.550959408359579, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5540 | MSE=0.3290 | KL=0.0800 | β=2.812
Epoch 100/200 | Loss=0.5092 | MSE=0.2857 | KL=0.0795 | β=2.812
Epoch 150/200 | Loss=0.5257 | MSE=0.2995 | KL=0.0804 | β=2.812
Epoch 200/200 | Loss=0.5023 | MSE=0.2670 | KL=0.0837 | β=2.812


Best trial: 19. Best value: 0.636991:  44%|████▍     | 22/50 [00:11<00:15,  1.84it/s, 11.31/3600 seconds]

[I 2025-08-18 00:58:40,041] Trial 21 finished with value: 0.3843526437245136 and parameters: {'h1': 256, 'h2': 128, 'latent': 48, 'lr': 0.0019541117349883164, 'warmup': 35, 'beta_max': 2.8117220730963055, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6694 | MSE=0.4417 | KL=0.0682 | β=3.340
Epoch 100/200 | Loss=0.6238 | MSE=0.3606 | KL=0.0788 | β=3.340
Epoch 150/200 | Loss=0.5831 | MSE=0.3043 | KL=0.0835 | β=3.340
Epoch 200/200 | Loss=0.5395 | MSE=0.2652 | KL=0.0821 | β=3.340


Best trial: 19. Best value: 0.636991:  46%|████▌     | 23/50 [00:11<00:14,  1.88it/s, 11.82/3600 seconds]

[I 2025-08-18 00:58:40,552] Trial 22 finished with value: 0.5812868005102138 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.0037864630497272548, 'warmup': 28, 'beta_max': 3.339512083663391, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.9901 | MSE=0.9812 | KL=0.0038 | β=2.339
Epoch 100/200 | Loss=0.9846 | MSE=0.9593 | KL=0.0108 | β=2.339
Epoch 150/200 | Loss=0.8694 | MSE=0.7730 | KL=0.0412 | β=2.339
Epoch 200/200 | Loss=0.7652 | MSE=0.6170 | KL=0.0634 | β=2.339


Best trial: 19. Best value: 0.636991:  48%|████▊     | 24/50 [00:12<00:13,  1.92it/s, 12.31/3600 seconds]

[I 2025-08-18 00:58:41,047] Trial 23 finished with value: 0.2917488966303102 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.00010605242119436795, 'warmup': 29, 'beta_max': 2.339007929219062, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6564 | MSE=0.3970 | KL=0.0785 | β=3.304
Epoch 100/200 | Loss=0.6520 | MSE=0.3885 | KL=0.0798 | β=3.304
Epoch 150/200 | Loss=0.5929 | MSE=0.3483 | KL=0.0740 | β=3.304
Epoch 200/200 | Loss=0.5606 | MSE=0.2696 | KL=0.0881 | β=3.304


Best trial: 19. Best value: 0.636991:  50%|█████     | 25/50 [00:12<00:13,  1.89it/s, 12.86/3600 seconds]

[I 2025-08-18 00:58:41,590] Trial 24 finished with value: 0.5888014322265388 and parameters: {'h1': 256, 'h2': 128, 'latent': 32, 'lr': 0.004197569711602385, 'warmup': 36, 'beta_max': 3.3035059061995864, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.2790 | MSE=0.2790 | KL=0.0717 | β=0.000
Epoch 100/200 | Loss=0.2300 | MSE=0.2300 | KL=0.0758 | β=0.000
Epoch 150/200 | Loss=0.1615 | MSE=0.1615 | KL=0.0720 | β=0.000


Best trial: 19. Best value: 0.636991:  52%|█████▏    | 26/50 [00:13<00:13,  1.84it/s, 13.44/3600 seconds]

Epoch 200/200 | Loss=0.1705 | MSE=0.1705 | KL=0.0672 | β=0.000
[I 2025-08-18 00:58:42,172] Trial 25 finished with value: 0.1722473141559416 and parameters: {'h1': 192, 'h2': 128, 'latent': 64, 'lr': 0.004702121368886932, 'warmup': 36, 'beta_max': 2.888890523219173, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.8135 | MSE=0.5444 | KL=0.0845 | β=3.183
Epoch 100/200 | Loss=0.7447 | MSE=0.4716 | KL=0.0858 | β=3.183
Epoch 150/200 | Loss=0.7686 | MSE=0.4774 | KL=0.0915 | β=3.183


Best trial: 19. Best value: 0.636991:  54%|█████▍    | 27/50 [00:13<00:12,  1.87it/s, 13.95/3600 seconds]

Epoch 200/200 | Loss=0.7437 | MSE=0.4595 | KL=0.0893 | β=3.183
[I 2025-08-18 00:58:42,684] Trial 26 finished with value: 0.4475683010265898 and parameters: {'h1': 256, 'h2': 128, 'latent': 16, 'lr': 0.001550393385677788, 'warmup': 36, 'beta_max': 3.1832126610136324, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5282 | MSE=0.3042 | KL=0.1146 | β=1.955
Epoch 100/200 | Loss=0.5205 | MSE=0.2937 | KL=0.1160 | β=1.955
Epoch 150/200 | Loss=0.4682 | MSE=0.2503 | KL=0.1114 | β=1.955


Best trial: 19. Best value: 0.636991:  56%|█████▌    | 28/50 [00:14<00:11,  1.88it/s, 14.48/3600 seconds]

Epoch 200/200 | Loss=0.4025 | MSE=0.1754 | KL=0.1162 | β=1.955
[I 2025-08-18 00:58:43,214] Trial 27 finished with value: 0.5349637258179889 and parameters: {'h1': 192, 'h2': 128, 'latent': 32, 'lr': 0.0034118816578705604, 'warmup': 31, 'beta_max': 1.955028078793357, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.4827 | MSE=0.3258 | KL=0.0702 | β=2.237
Epoch 100/200 | Loss=0.3736 | MSE=0.2267 | KL=0.0657 | β=2.237
Epoch 150/200 | Loss=0.3627 | MSE=0.2118 | KL=0.0675 | β=2.237
Epoch 200/200 | Loss=0.3181 | MSE=0.1697 | KL=0.0663 | β=2.237


Best trial: 19. Best value: 0.636991:  58%|█████▊    | 29/50 [00:15<00:12,  1.73it/s, 15.16/3600 seconds]

[I 2025-08-18 00:58:43,891] Trial 28 finished with value: 0.2919036391019343 and parameters: {'h1': 256, 'h2': 128, 'latent': 128, 'lr': 0.0004278227374286041, 'warmup': 38, 'beta_max': 2.2367582860386768, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.3620 | MSE=0.3620 | KL=0.0593 | β=0.000
Epoch 100/200 | Loss=0.3318 | MSE=0.3318 | KL=0.0629 | β=0.000
Epoch 150/200 | Loss=0.2921 | MSE=0.2921 | KL=0.0610 | β=0.000


Best trial: 19. Best value: 0.636991:  60%|██████    | 30/50 [00:15<00:11,  1.74it/s, 15.73/3600 seconds]

Epoch 200/200 | Loss=0.2981 | MSE=0.2981 | KL=0.0625 | β=0.000
[I 2025-08-18 00:58:44,461] Trial 29 finished with value: 0.4622819884098974 and parameters: {'h1': 256, 'h2': 128, 'latent': 64, 'lr': 0.0012575300380977747, 'warmup': 34, 'beta_max': 3.7248451108651777, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5521 | MSE=0.3595 | KL=0.0742 | β=2.597
Epoch 100/200 | Loss=0.4837 | MSE=0.2728 | KL=0.0812 | β=2.597
Epoch 150/200 | Loss=0.4729 | MSE=0.2322 | KL=0.0927 | β=2.597


Best trial: 19. Best value: 0.636991:  62%|██████▏   | 31/50 [00:16<00:10,  1.74it/s, 16.30/3600 seconds]

Epoch 200/200 | Loss=0.4458 | MSE=0.2235 | KL=0.0856 | β=2.597
[I 2025-08-18 00:58:45,033] Trial 30 finished with value: 0.49982708960049455 and parameters: {'h1': 192, 'h2': 128, 'latent': 48, 'lr': 0.0021820174387397855, 'warmup': 20, 'beta_max': 2.5966384236721223, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6378 | MSE=0.3713 | KL=0.0803 | β=3.318
Epoch 100/200 | Loss=0.6362 | MSE=0.3899 | KL=0.0743 | β=3.318
Epoch 150/200 | Loss=0.5884 | MSE=0.3240 | KL=0.0797 | β=3.318


Best trial: 19. Best value: 0.636991:  64%|██████▍   | 32/50 [00:16<00:10,  1.80it/s, 16.82/3600 seconds]

Epoch 200/200 | Loss=0.5432 | MSE=0.2504 | KL=0.0883 | β=3.318
[I 2025-08-18 00:58:45,551] Trial 31 finished with value: 0.47273180775913837 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.0037761422354041464, 'warmup': 27, 'beta_max': 3.317531771242, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6628 | MSE=0.4018 | KL=0.0788 | β=3.313
Epoch 100/200 | Loss=0.6127 | MSE=0.3541 | KL=0.0781 | β=3.313
Epoch 150/200 | Loss=0.5869 | MSE=0.3136 | KL=0.0825 | β=3.313


Best trial: 19. Best value: 0.636991:  66%|██████▌   | 33/50 [00:17<00:09,  1.80it/s, 17.36/3600 seconds]

Epoch 200/200 | Loss=0.5626 | MSE=0.2817 | KL=0.0848 | β=3.313
[I 2025-08-18 00:58:46,099] Trial 32 finished with value: 0.4790192373233517 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.003839120831451692, 'warmup': 31, 'beta_max': 3.312668022467409, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6713 | MSE=0.4033 | KL=0.1588 | β=1.687
Epoch 100/200 | Loss=0.6206 | MSE=0.3458 | KL=0.1629 | β=1.687
Epoch 150/200 | Loss=0.5888 | MSE=0.3168 | KL=0.1612 | β=1.687


Best trial: 19. Best value: 0.636991:  68%|██████▊   | 34/50 [00:17<00:08,  1.92it/s, 17.81/3600 seconds]

Epoch 200/200 | Loss=0.5756 | MSE=0.2988 | KL=0.1640 | β=1.687
[I 2025-08-18 00:58:46,545] Trial 33 finished with value: 0.526600873244973 and parameters: {'h1': 256, 'h2': 32, 'latent': 16, 'lr': 0.002863448436653836, 'warmup': 26, 'beta_max': 1.6874641401323758, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6481 | MSE=0.3972 | KL=0.0810 | β=3.096
Epoch 100/200 | Loss=0.5813 | MSE=0.3115 | KL=0.0871 | β=3.096
Epoch 150/200 | Loss=0.5759 | MSE=0.3195 | KL=0.0828 | β=3.096


Best trial: 19. Best value: 0.636991:  70%|███████   | 35/50 [00:18<00:07,  1.92it/s, 18.33/3600 seconds]

Epoch 200/200 | Loss=0.5471 | MSE=0.2708 | KL=0.0892 | β=3.096
[I 2025-08-18 00:58:47,064] Trial 34 finished with value: 0.5651775990461239 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.004079257033538504, 'warmup': 37, 'beta_max': 3.0958934979831794, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.8741 | MSE=0.6177 | KL=0.0728 | β=3.521
Epoch 100/200 | Loss=0.8460 | MSE=0.5326 | KL=0.0889 | β=3.524
Epoch 150/200 | Loss=0.7951 | MSE=0.5091 | KL=0.0812 | β=3.524


Best trial: 19. Best value: 0.636991:  72%|███████▏  | 36/50 [00:18<00:07,  1.98it/s, 18.80/3600 seconds]

Epoch 200/200 | Loss=0.7580 | MSE=0.4413 | KL=0.0899 | β=3.524
[I 2025-08-18 00:58:47,534] Trial 35 finished with value: 0.3879288657037154 and parameters: {'h1': 192, 'h2': 128, 'latent': 16, 'lr': 0.0024283588751329243, 'warmup': 30, 'beta_max': 3.523734063206364, 'sched': 'sigmoid'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6176 | MSE=0.4983 | KL=0.0425 | β=2.807
Epoch 100/200 | Loss=0.4400 | MSE=0.2786 | KL=0.0575 | β=2.807
Epoch 150/200 | Loss=0.3990 | MSE=0.2376 | KL=0.0575 | β=2.807
Epoch 200/200 | Loss=0.3798 | MSE=0.2115 | KL=0.0600 | β=2.807


Best trial: 19. Best value: 0.636991:  74%|███████▍  | 37/50 [00:19<00:06,  1.90it/s, 19.37/3600 seconds]

[I 2025-08-18 00:58:48,108] Trial 36 finished with value: 0.28585328647836733 and parameters: {'h1': 128, 'h2': 64, 'latent': 112, 'lr': 0.0008652499262834052, 'warmup': 23, 'beta_max': 2.806948476112449, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.3858 | MSE=0.3858 | KL=0.0635 | β=0.000
Epoch 100/200 | Loss=0.3129 | MSE=0.3129 | KL=0.0688 | β=0.000
Epoch 150/200 | Loss=0.2817 | MSE=0.2817 | KL=0.0677 | β=0.000
Epoch 200/200 | Loss=0.2450 | MSE=0.2450 | KL=0.0714 | β=0.000


Best trial: 19. Best value: 0.636991:  76%|███████▌  | 38/50 [00:19<00:06,  1.87it/s, 19.92/3600 seconds]

[I 2025-08-18 00:58:48,658] Trial 37 finished with value: 0.4725159591563257 and parameters: {'h1': 192, 'h2': 96, 'latent': 48, 'lr': 0.0030606009317921457, 'warmup': 33, 'beta_max': 3.739465178493889, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.8931 | MSE=0.6831 | KL=0.0634 | β=3.312
Epoch 100/200 | Loss=0.7972 | MSE=0.5492 | KL=0.0749 | β=3.312


Best trial: 19. Best value: 0.636991:  78%|███████▊  | 39/50 [00:20<00:05,  1.97it/s, 19.92/3600 seconds]

Epoch 150/200 | Loss=0.8364 | MSE=0.5668 | KL=0.0814 | β=3.312
Epoch 200/200 | Loss=0.8078 | MSE=0.5303 | KL=0.0838 | β=3.312
[I 2025-08-18 00:58:49,104] Trial 38 finished with value: 0.34923030712080616 and parameters: {'h1': 256, 'h2': 32, 'latent': 16, 'lr': 0.0018829510885774907, 'warmup': 17, 'beta_max': 3.312100996949383, 'sched': 'sigmoid'}. Best is trial 19 with value: 0.6369914046792744.


Best trial: 19. Best value: 0.636991:  78%|███████▊  | 39/50 [00:20<00:05,  1.97it/s, 20.37/3600 seconds]

Epoch 50/200 | Loss=0.7167 | MSE=0.4766 | KL=0.0628 | β=3.825
Epoch 100/200 | Loss=0.7165 | MSE=0.4403 | KL=0.0722 | β=3.825
Epoch 150/200 | Loss=0.6943 | MSE=0.3949 | KL=0.0783 | β=3.825


Best trial: 19. Best value: 0.636991:  80%|████████  | 40/50 [00:20<00:05,  1.99it/s, 20.86/3600 seconds]

Epoch 200/200 | Loss=0.6201 | MSE=0.3381 | KL=0.0737 | β=3.825
[I 2025-08-18 00:58:49,597] Trial 39 finished with value: 0.49117537501220815 and parameters: {'h1': 256, 'h2': 64, 'latent': 32, 'lr': 0.004997056295291269, 'warmup': 34, 'beta_max': 3.825016438768745, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5856 | MSE=0.4310 | KL=0.0644 | β=2.401
Epoch 100/200 | Loss=0.4718 | MSE=0.2828 | KL=0.0787 | β=2.401
Epoch 150/200 | Loss=0.4377 | MSE=0.2512 | KL=0.0776 | β=2.401


Best trial: 19. Best value: 0.636991:  82%|████████▏ | 41/50 [00:21<00:04,  1.97it/s, 21.38/3600 seconds]

Epoch 200/200 | Loss=0.4313 | MSE=0.2351 | KL=0.0817 | β=2.401
[I 2025-08-18 00:58:50,114] Trial 40 finished with value: 0.37189842631507464 and parameters: {'h1': 64, 'h2': 128, 'latent': 64, 'lr': 0.0011561098246113235, 'warmup': 13, 'beta_max': 2.401097815777663, 'sched': 'sigmoid'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6355 | MSE=0.3832 | KL=0.0831 | β=3.034
Epoch 100/200 | Loss=0.5810 | MSE=0.3430 | KL=0.0784 | β=3.034
Epoch 150/200 | Loss=0.5353 | MSE=0.2731 | KL=0.0864 | β=3.034


Best trial: 19. Best value: 0.636991:  84%|████████▍ | 42/50 [00:21<00:04,  1.97it/s, 21.88/3600 seconds]

Epoch 200/200 | Loss=0.5182 | MSE=0.2506 | KL=0.0882 | β=3.034
[I 2025-08-18 00:58:50,619] Trial 41 finished with value: 0.40108277480771315 and parameters: {'h1': 256, 'h2': 96, 'latent': 32, 'lr': 0.003916864807241258, 'warmup': 37, 'beta_max': 3.0343782648261732, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5582 | MSE=0.3194 | KL=0.0773 | β=3.088
Epoch 100/200 | Loss=0.5733 | MSE=0.3475 | KL=0.0731 | β=3.088
Epoch 150/200 | Loss=0.4864 | MSE=0.2419 | KL=0.0792 | β=3.088
Epoch 200/200 | Loss=0.4404 | MSE=0.2009 | KL=0.0776 | β=3.088


Best trial: 19. Best value: 0.636991:  86%|████████▌ | 43/50 [00:22<00:03,  1.86it/s, 22.49/3600 seconds]

[I 2025-08-18 00:58:51,226] Trial 42 finished with value: 0.5484263763697979 and parameters: {'h1': 256, 'h2': 96, 'latent': 48, 'lr': 0.00407685257297692, 'warmup': 38, 'beta_max': 3.0876604163132892, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.8384 | MSE=0.5575 | KL=0.0871 | β=3.224
Epoch 100/200 | Loss=0.7722 | MSE=0.4882 | KL=0.0881 | β=3.224
Epoch 150/200 | Loss=0.7799 | MSE=0.4590 | KL=0.0995 | β=3.224
Epoch 200/200 | Loss=0.7314 | MSE=0.4053 | KL=0.1012 | β=3.224


Best trial: 19. Best value: 0.636991:  88%|████████▊ | 44/50 [00:22<00:03,  1.92it/s, 22.98/3600 seconds]

[I 2025-08-18 00:58:51,709] Trial 43 finished with value: 0.3248314953984587 and parameters: {'h1': 256, 'h2': 96, 'latent': 16, 'lr': 0.0032664122183853673, 'warmup': 40, 'beta_max': 3.2240461569521766, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.6145 | MSE=0.3573 | KL=0.0895 | β=2.873
Epoch 100/200 | Loss=0.5879 | MSE=0.3277 | KL=0.0906 | β=2.873
Epoch 150/200 | Loss=0.5249 | MSE=0.2636 | KL=0.0909 | β=2.873
Epoch 200/200 | Loss=0.4882 | MSE=0.2208 | KL=0.0931 | β=2.873


Best trial: 19. Best value: 0.636991:  90%|█████████ | 45/50 [00:23<00:02,  1.94it/s, 23.48/3600 seconds]

[I 2025-08-18 00:58:52,212] Trial 44 finished with value: 0.4747313704921229 and parameters: {'h1': 192, 'h2': 96, 'latent': 32, 'lr': 0.0041795249964392, 'warmup': 32, 'beta_max': 2.8726882154174014, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5140 | MSE=0.2884 | KL=0.0845 | β=2.671
Epoch 100/200 | Loss=0.4945 | MSE=0.2768 | KL=0.0815 | β=2.671
Epoch 150/200 | Loss=0.4946 | MSE=0.2847 | KL=0.0786 | β=2.671
Epoch 200/200 | Loss=0.4529 | MSE=0.2251 | KL=0.0853 | β=2.671


Best trial: 19. Best value: 0.636991:  92%|█████████▏| 46/50 [00:23<00:02,  1.95it/s, 23.99/3600 seconds]

[I 2025-08-18 00:58:52,721] Trial 45 finished with value: 0.4322775146686698 and parameters: {'h1': 256, 'h2': 64, 'latent': 48, 'lr': 0.002805019614240476, 'warmup': 35, 'beta_max': 2.6712679039269984, 'sched': 'linear'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5573 | MSE=0.5573 | KL=0.0780 | β=0.000
Epoch 100/200 | Loss=0.5126 | MSE=0.5126 | KL=0.0842 | β=0.000
Epoch 150/200 | Loss=0.5117 | MSE=0.5117 | KL=0.0905 | β=0.000
Epoch 200/200 | Loss=0.4518 | MSE=0.4518 | KL=0.0896 | β=0.000


Best trial: 19. Best value: 0.636991:  94%|█████████▍| 47/50 [00:24<00:01,  1.94it/s, 24.51/3600 seconds]

[I 2025-08-18 00:58:53,242] Trial 46 finished with value: 0.6120581238964519 and parameters: {'h1': 256, 'h2': 96, 'latent': 16, 'lr': 0.0022558133960227625, 'warmup': 37, 'beta_max': 3.525471041333585, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.5754 | MSE=0.5754 | KL=0.0749 | β=0.000
Epoch 100/200 | Loss=0.5269 | MSE=0.5269 | KL=0.0757 | β=0.000
Epoch 150/200 | Loss=0.4899 | MSE=0.4899 | KL=0.0819 | β=0.000
Epoch 200/200 | Loss=0.4880 | MSE=0.4880 | KL=0.0826 | β=0.000


Best trial: 19. Best value: 0.636991:  96%|█████████▌| 48/50 [00:25<00:01,  1.93it/s, 25.03/3600 seconds]

[I 2025-08-18 00:58:53,768] Trial 47 finished with value: 0.49798092607745115 and parameters: {'h1': 256, 'h2': 128, 'latent': 16, 'lr': 0.0022537473566420817, 'warmup': 25, 'beta_max': 3.6044813573892815, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.7179 | MSE=0.7179 | KL=0.0593 | β=0.000
Epoch 100/200 | Loss=0.5847 | MSE=0.5847 | KL=0.0744 | β=0.000
Epoch 150/200 | Loss=0.5226 | MSE=0.5226 | KL=0.0885 | β=0.000
Epoch 200/200 | Loss=0.4322 | MSE=0.4322 | KL=0.0912 | β=0.000


Best trial: 19. Best value: 0.636991:  98%|█████████▊| 49/50 [00:25<00:00,  1.97it/s, 25.52/3600 seconds]

[I 2025-08-18 00:58:54,251] Trial 48 finished with value: 0.3421152467360458 and parameters: {'h1': 128, 'h2': 96, 'latent': 16, 'lr': 0.001580250388004995, 'warmup': 22, 'beta_max': 3.420691246193001, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.
Epoch 50/200 | Loss=0.4667 | MSE=0.4667 | KL=0.0624 | β=0.000
Epoch 100/200 | Loss=0.4333 | MSE=0.4333 | KL=0.0693 | β=0.000
Epoch 150/200 | Loss=0.4160 | MSE=0.4160 | KL=0.0714 | β=0.000
Epoch 200/200 | Loss=0.3573 | MSE=0.3573 | KL=0.0701 | β=0.000


Best trial: 19. Best value: 0.636991: 100%|██████████| 50/50 [00:25<00:00,  1.92it/s, 25.99/3600 seconds]

[I 2025-08-18 00:58:54,727] Trial 49 finished with value: 0.4370231801534821 and parameters: {'h1': 192, 'h2': 64, 'latent': 32, 'lr': 0.0025717083589782094, 'warmup': 33, 'beta_max': 3.9040303515974992, 'sched': 'cyclic'}. Best is trial 19 with value: 0.6369914046792744.





In [31]:
# 1) Overview Trial
best = study.best_trial
print("Best score:", best.value)
print("Params :", best.params)
print("R^2 =", best.user_attrs['r2'])

# 2) Results to DataFrame
df_trials = study.trials_dataframe(attrs=('number', 'value', 'params', 'user_attrs'))
df_trials.to_csv("TAVE_annealing_trials.csv", index=False)

Best score: 0.6369914046792744
Params : {'h1': 256, 'h2': 128, 'latent': 32, 'lr': 0.0033215769881425887, 'warmup': 34, 'beta_max': 2.620285060340782, 'sched': 'linear'}
R^2 = 0.6578663299118668


In [334]:
best_params = best.params
model_best = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (best_params['h1'], best_params['h2']),
    beta_max       = best_params['beta_max']
)

model_best.fit(
    train_real,
    epochs=300,
    warmup_epochs=best_params['warmup'],
    lr=best_params['lr'],
    sched=best_params['sched']
)

fake_best = model_best.sample(len(train_real))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing with real data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)


Epoch 50/300 | Loss=0.6195 | MSE=0.3516 | KL=0.1714 | β=1.563
Epoch 100/300 | Loss=0.5864 | MSE=0.3034 | KL=0.1811 | β=1.563
Epoch 150/300 | Loss=0.5270 | MSE=0.2463 | KL=0.1796 | β=1.563
Epoch 200/300 | Loss=0.5214 | MSE=0.2356 | KL=0.1829 | β=1.563
Epoch 250/300 | Loss=0.4931 | MSE=0.2127 | KL=0.1794 | β=1.563
Epoch 300/300 | Loss=0.5047 | MSE=0.2226 | KL=0.1805 | β=1.563
TSTR results of TVAE_annealing with real data input (R^2, RMSE): 0.5338771358006065 , 2.25905003475542
MMD: 0.17815335897210818


In [335]:
best_params = best.params
model_best = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (best_params['h1'], best_params['h2']),
    beta_max       = best_params['beta_max']
)

model_best.fit(
    train_copu,
    epochs=300,
    warmup_epochs=best_params['warmup'],
    lr=best_params['lr'],
    sched=best_params['sched']
)

fake_best = model_best.sample(len(train_copu))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing with Copula synthetic data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.7055 | MSE=0.3917 | KL=0.2008 | β=1.563
Epoch 100/300 | Loss=0.6761 | MSE=0.3601 | KL=0.2022 | β=1.563
Epoch 150/300 | Loss=0.6295 | MSE=0.3134 | KL=0.2022 | β=1.563
Epoch 200/300 | Loss=0.6678 | MSE=0.3422 | KL=0.2083 | β=1.563
Epoch 250/300 | Loss=0.7126 | MSE=0.3757 | KL=0.2156 | β=1.563
Epoch 300/300 | Loss=0.6763 | MSE=0.3551 | KL=0.2055 | β=1.563
TSTR results of TVAE_annealing with Copula synthetic data input (R^2, RMSE): -0.18233631132338068 , 3.59787574513955
MMD: 0.19360975761986343


In [336]:
best_params = best.params
model_best = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (best_params['h1'], best_params['h2']),
    beta_max       = best_params['beta_max']
)

model_best.fit(
    df_1x,
    epochs=300,
    warmup_epochs=best_params['warmup'],
    lr=best_params['lr'],
    sched=best_params['sched']
)

fake_best = model_best.sample(len(df_1x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing with 1:1 mixing data (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.7487 | MSE=0.4404 | KL=0.1973 | β=1.563
Epoch 100/300 | Loss=0.6041 | MSE=0.3183 | KL=0.1829 | β=1.563
Epoch 150/300 | Loss=0.6305 | MSE=0.3384 | KL=0.1869 | β=1.563
Epoch 200/300 | Loss=0.6653 | MSE=0.3599 | KL=0.1954 | β=1.563
Epoch 250/300 | Loss=0.6427 | MSE=0.3366 | KL=0.1958 | β=1.563
Epoch 300/300 | Loss=0.6577 | MSE=0.3390 | KL=0.2039 | β=1.563
TSTR results of TVAE_annealing with 1:1 mixing data (R^2, RMSE): 0.3936032191950072 , 2.5766420517909436
MMD: 0.19916061871709542


In [337]:
best_params = best.params
model_best = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (best_params['h1'], best_params['h2']),
    beta_max       = best_params['beta_max']
)

model_best.fit(
    df_2x,
    epochs=300,
    warmup_epochs=best_params['warmup'],
    lr=best_params['lr'],
    sched=best_params['sched']
)

fake_best = model_best.sample(len(df_2x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing with 1:2 mixing data (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6886 | MSE=0.4091 | KL=0.1788 | β=1.563
Epoch 100/300 | Loss=0.6738 | MSE=0.3688 | KL=0.1952 | β=1.563
Epoch 150/300 | Loss=0.7393 | MSE=0.3898 | KL=0.2236 | β=1.563
Epoch 200/300 | Loss=0.6984 | MSE=0.3496 | KL=0.2231 | β=1.563
Epoch 250/300 | Loss=0.6325 | MSE=0.3226 | KL=0.1983 | β=1.563
Epoch 300/300 | Loss=0.5924 | MSE=0.3063 | KL=0.1830 | β=1.563
TSTR results of TVAE_annealing with 1:2 mixing data (R^2, RMSE): 0.5469189822966607 , 2.2272223727543934
MMD: 0.19951628045129266


In [338]:
best_params = best.params
model_best = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (best_params['h1'], best_params['h2']),
    beta_max       = best_params['beta_max']
)

model_best.fit(
    df_4x,
    epochs=300,
    warmup_epochs=best_params['warmup'],
    lr=best_params['lr'],
    sched=best_params['sched']
)

fake_best = model_best.sample(len(df_4x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing with 1:4 mixing data (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.7782 | MSE=0.4654 | KL=0.2001 | β=1.563
Epoch 100/300 | Loss=0.6295 | MSE=0.3692 | KL=0.1666 | β=1.563
Epoch 150/300 | Loss=0.6462 | MSE=0.3345 | KL=0.1994 | β=1.563
Epoch 200/300 | Loss=0.7032 | MSE=0.3899 | KL=0.2005 | β=1.563
Epoch 250/300 | Loss=0.7333 | MSE=0.4456 | KL=0.1841 | β=1.563
Epoch 300/300 | Loss=0.6011 | MSE=0.3588 | KL=0.1550 | β=1.563
TSTR results of TVAE_annealing with 1:4 mixing data (R^2, RMSE): 0.23747032056712192 , 2.889375745779571
MMD: 0.22796218212881475


In [340]:
best_params = best.params
model_best = TVAEAnneal(
    metadata=meta,
    batch_size=256,
    device="cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (best_params['h1'], best_params['h2']),
    beta_max       = best_params['beta_max']
)

model_best.fit(
    df_8x,
    epochs=300,
    warmup_epochs=best_params['warmup'],
    lr=best_params['lr'],
    sched=best_params['sched']
)

fake_best = model_best.sample(len(df_8x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing with 1:8 mixing data (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.7140 | MSE=0.4130 | KL=0.1926 | β=1.563
Epoch 100/300 | Loss=0.6977 | MSE=0.3863 | KL=0.1992 | β=1.563
Epoch 150/300 | Loss=0.7024 | MSE=0.3880 | KL=0.2012 | β=1.563
Epoch 200/300 | Loss=0.6108 | MSE=0.3108 | KL=0.1920 | β=1.563
Epoch 250/300 | Loss=0.6573 | MSE=0.3474 | KL=0.1983 | β=1.563
Epoch 300/300 | Loss=0.7004 | MSE=0.3679 | KL=0.2127 | β=1.563
TSTR results of TVAE_annealing with 1:8 mixing data (R^2, RMSE): 0.17223897014135148 , 3.0104272397656606
MMD: 0.23099350770856247


In [341]:

# Define for training and saving hyperparameters
def train_and_save(name, latent_dim, beta_max,
                   epochs=300, warmup=32, lr=0.0017,
                   hidden_sizes=(256, 96), device="cpu"):

    model = TVAEAnneal(
        metadata     = meta,
        latent_dim   = latent_dim,
        hidden_sizes = hidden_sizes,
        beta_max     = beta_max,
        batch_size   = 128,
        device       = device)

    model.fit(
        train_real,
        epochs        = epochs,
        warmup_epochs = warmup,
        lr            = lr,
        sched         = "sigmoid" )

    # Save hyperparameters
    weight_path = f"tvae_{name}.pt"
    torch.save(model.model.state_dict(), weight_path)
    print(f"Saved weights: {weight_path}")

    # Generate synthetic data
    fake = model.sample(10_000)
    csv_path = f"Synthetic_{name}.csv"
    fake.to_csv(csv_path, index=False)
    print(f"Saved samples: {csv_path}")

# Best Result
train_and_save(
    name       = "TAVE_annealing",  
    latent_dim  = 32,
    beta_max    = 2.0229147092990662,
    lr          = 0.004568164246533539,
    warmup      = 40,
    hidden_sizes= (256, 128),    
    device      = "cpu")       


Epoch 50/300 | Loss=0.5183 | MSE=0.2824 | KL=0.1179 | β=2.001
Epoch 100/300 | Loss=0.4666 | MSE=0.2114 | KL=0.1262 | β=2.023
Epoch 150/300 | Loss=0.4487 | MSE=0.2112 | KL=0.1174 | β=2.023
Epoch 200/300 | Loss=0.3852 | MSE=0.1635 | KL=0.1096 | β=2.023
Epoch 250/300 | Loss=0.3625 | MSE=0.1269 | KL=0.1164 | β=2.023
Epoch 300/300 | Loss=0.3941 | MSE=0.1354 | KL=0.1279 | β=2.023
Saved weights: tvae_TAVE_annealing.pt
Saved samples: Synthetic_TAVE_annealing.csv


In [342]:
class TVAEAnnealGMM(TVAEAnneal):
    """
    Inherit from TVAE-annealing and adding extra Bayesian GMM to latent space
    sample() with TVAE-GMM by default
    """
    def __init__(self, *args,
                 gmm_components=50,     # Max combination number
                 gmm_cov='diag',        # 'full' or 'diag'
                 **kws):
        super().__init__(*args, **kws)
        self.gmm_components = gmm_components
        self.gmm_cov = gmm_cov
        self.gmm = None                

    # fit function
    def fit(self, df: pd.DataFrame,
            epochs=300, warmup_epochs=30, lr=2e-3,
            sched='linear'):
        # Regular TVAE 
        super().fit(df, epochs, warmup_epochs, lr, sched)

        # Collect latent vector muon as GMM training data
        tensor_data = self.scaler.transform(df)
        dl = DataLoader(TensorDataset(tensor_data),
                        batch_size=1024, shuffle=False)
        self.model.eval()
        mu_list = []
        with torch.no_grad():
            for (x,) in dl:
                x = x.to(self.device)
                mu, _ = self.model.encode(x)
                mu_list.append(mu.cpu())
        Z = torch.cat(mu_list, dim=0).numpy()

        # Simulate Bayesian GMM
        self.gmm = BayesianGaussianMixture(
            n_components = self.gmm_components,
            covariance_type = self.gmm_cov,
            weight_concentration_prior_type="dirichlet_process",
            max_iter=500,
            random_state=0
        ).fit(Z)

        print(f"BGM fitted: active comps = "
              f"{np.sum(self.gmm.weights_ > 1e-3)}/{self.gmm_components}")

    # sample function
    def sample(self, n: int, use_gmm=True) -> pd.DataFrame:
        self.model.eval()
        with torch.no_grad():
            if use_gmm and self.gmm is not None:
                z_np = self.gmm.sample(n)[0].astype(np.float32)
                z = torch.from_numpy(z_np).to(self.device)
            else:
                z = torch.randn(n, self.latent).to(self.device)

            gen = self.model.decode(z)
        return self.scaler.inverse_transform(gen)

    # Save and load
    def save(self, path_prefix: str):
        torch.save(self.model.state_dict(), f"{path_prefix}_vae.pt")
        if self.gmm is not None:
            joblib.dump(self.gmm, f"{path_prefix}_gmm.pkl")
        self.scaler.scaler.mean_.tofile(f"{path_prefix}_mean.bin")
        self.scaler.scaler.scale_.tofile(f"{path_prefix}_scale.bin")

    def load(self, path_prefix: str):
        self.model.load_state_dict(torch.load(f"{path_prefix}_vae.pt"))
        self.gmm = joblib.load(f"{path_prefix}_gmm.pkl")
        

In [343]:
# Training
tvae_gmm = TVAEAnnealGMM(
    metadata      = meta,
    latent_dim    = 32,
    hidden_sizes  = (256, 128),
    beta_max      = 1.80,
    batch_size    = 128,
    device        = "cpu",          
    gmm_components= 50,              # Important hyperparameter for GMM
    gmm_cov       ='diag')

tvae_gmm.fit(train_real,
             epochs=300,
             warmup_epochs=32,
             lr=0.0040,
             sched='sigmoid')

# Sampling
fake_gmm = tvae_gmm.sample(10_000)          

Epoch 50/300 | Loss=0.5095 | MSE=0.2752 | KL=0.1304 | β=1.797
Epoch 100/300 | Loss=0.4805 | MSE=0.2538 | KL=0.1259 | β=1.800
Epoch 150/300 | Loss=0.3973 | MSE=0.1930 | KL=0.1136 | β=1.800
Epoch 200/300 | Loss=0.3642 | MSE=0.1416 | KL=0.1236 | β=1.800
Epoch 250/300 | Loss=0.3430 | MSE=0.1364 | KL=0.1148 | β=1.800
Epoch 300/300 | Loss=0.3569 | MSE=0.1445 | KL=0.1180 | β=1.800
BGM fitted: active comps = 50/50


In [344]:
target_col = "fragmentation_index" 
r2  = downstream_r2(fake_gmm,  test_real, target_col)
mmd = compute_mmd  (fake_gmm.drop(columns=[target_col]),
                    test_real.drop(columns=[target_col]))
print(f"GMM-TVAE: R^2={r2:.3f}, MMD={mmd:.3f}")


GMM-TVAE: R^2=0.681, MMD=0.101


In [None]:
def gmm_objective(trial, df_train, meta):
    # Validation set
    ss = ShuffleSplit(n_splits=1, test_size=0.25, random_state=trial.number)
    tr_sub_idx, val_idx = next(ss.split(df_train))
    df_sub = df_train.iloc[tr_sub_idx].reset_index(drop=True)  
    df_val = df_train.iloc[val_idx].reset_index(drop=True) 

    latent = trial.suggest_categorical("latent", [24,28,32,36,40])
    beta   = trial.suggest_float("beta", 1.4, 2.2)
    K      = trial.suggest_categorical("gmm_K", [12,20,28,36])
    lr     = trial.suggest_float("lr", 3e-3, 6e-3, log=True)
    warm   = trial.suggest_categorical("warm", [16,20,24,28])

    model = TVAEAnnealGMM(
        metadata=meta,
        latent_dim=latent,
        hidden_sizes=(256,128),
        beta_max=beta,
        gmm_components=K,
        gmm_cov='diag',
        batch_size=128,
        device='cpu')

    model.fit(df_sub,
              epochs=250,
              warmup_epochs=warm,
              lr=lr,
              sched='sigmoid')

    fake = model.sample(len(df_train))
    r2   = downstream_r2(fake, df_val)
    mmd  = compute_mmd(fake, df_val)

    score = r2 - 0.1*mmd            # Similar metrics combining r^2 and mmd
    trial.set_user_attr("r2", r2)
    trial.set_user_attr("mmd", mmd)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(lambda t: gmm_objective(t, train_real, meta),
               n_trials=25, timeout=15*60, show_progress_bar=True)
print("Best params:", study.best_trial.params,
      "R^2 =", study.best_trial.user_attrs['r2'],
      "MMD =", study.best_trial.user_attrs['mmd'])


[I 2025-08-06 23:59:42,217] A new study created in memory with name: no-name-1ba54a31-276d-4e28-8186-ff0b83316e12
  0%|          | 0/25 [00:00<?, ?it/s]

Epoch 50/250 | Loss=0.4443 | MSE=0.2345 | KL=0.1391 | β=1.508
Epoch 100/250 | Loss=0.3574 | MSE=0.1688 | KL=0.1250 | β=1.508
Epoch 150/250 | Loss=0.2875 | MSE=0.1075 | KL=0.1193 | β=1.508
Epoch 200/250 | Loss=0.2707 | MSE=0.1095 | KL=0.1069 | β=1.508


Best trial: 0. Best value: 0.505622:   4%|▍         | 1/25 [00:00<00:19,  1.20it/s, 0.83/900 seconds]

Epoch 250/250 | Loss=0.2625 | MSE=0.1034 | KL=0.1054 | β=1.508
BGM fitted: active comps = 28/28
[I 2025-08-06 23:59:43,048] Trial 0 finished with value: 0.5056215843593223 and parameters: {'latent': 40, 'beta': 1.5082506317057154, 'gmm_K': 28, 'lr': 0.0032490854875309, 'warm': 20}. Best is trial 0 with value: 0.5056215843593223.
Epoch 50/250 | Loss=0.5765 | MSE=0.3204 | KL=0.1262 | β=2.030
Epoch 100/250 | Loss=0.4899 | MSE=0.2293 | KL=0.1284 | β=2.030
Epoch 150/250 | Loss=0.4067 | MSE=0.1683 | KL=0.1174 | β=2.030
Epoch 200/250 | Loss=0.3894 | MSE=0.1579 | KL=0.1140 | β=2.030


Best trial: 1. Best value: 0.563253:   8%|▊         | 2/25 [00:01<00:19,  1.20it/s, 1.67/900 seconds]

Epoch 250/250 | Loss=0.4020 | MSE=0.1486 | KL=0.1248 | β=2.030
BGM fitted: active comps = 36/36
[I 2025-08-06 23:59:43,885] Trial 1 finished with value: 0.5632532833708964 and parameters: {'latent': 28, 'beta': 2.0300441014053323, 'gmm_K': 36, 'lr': 0.00510080903814947, 'warm': 24}. Best is trial 1 with value: 0.5632532833708964.
Epoch 50/250 | Loss=0.5147 | MSE=0.3204 | KL=0.1099 | β=1.767
Epoch 100/250 | Loss=0.3883 | MSE=0.1972 | KL=0.1081 | β=1.767
Epoch 150/250 | Loss=0.3524 | MSE=0.1364 | KL=0.1222 | β=1.767
Epoch 200/250 | Loss=0.2913 | MSE=0.1066 | KL=0.1046 | β=1.767


Best trial: 1. Best value: 0.563253:  12%|█▏        | 3/25 [00:02<00:18,  1.18it/s, 2.53/900 seconds]

Epoch 250/250 | Loss=0.2982 | MSE=0.1228 | KL=0.0993 | β=1.767
BGM fitted: active comps = 12/12
[I 2025-08-06 23:59:44,749] Trial 2 finished with value: 0.35673558115440673 and parameters: {'latent': 40, 'beta': 1.7671267176627157, 'gmm_K': 12, 'lr': 0.004450886821584596, 'warm': 16}. Best is trial 1 with value: 0.5632532833708964.
Epoch 50/250 | Loss=0.4752 | MSE=0.2350 | KL=0.1381 | β=1.739
Epoch 100/250 | Loss=0.3443 | MSE=0.1440 | KL=0.1151 | β=1.739
Epoch 150/250 | Loss=0.3365 | MSE=0.1142 | KL=0.1278 | β=1.739
Epoch 200/250 | Loss=0.3406 | MSE=0.1352 | KL=0.1181 | β=1.739


Best trial: 1. Best value: 0.563253:  16%|█▌        | 4/25 [00:03<00:18,  1.15it/s, 3.44/900 seconds]

Epoch 250/250 | Loss=0.3134 | MSE=0.1167 | KL=0.1131 | β=1.739
BGM fitted: active comps = 12/12
[I 2025-08-06 23:59:45,656] Trial 3 finished with value: 0.45457196746116824 and parameters: {'latent': 36, 'beta': 1.7394143041040397, 'gmm_K': 12, 'lr': 0.005682303457728934, 'warm': 20}. Best is trial 1 with value: 0.5632532833708964.
Epoch 50/250 | Loss=0.4348 | MSE=0.2567 | KL=0.1228 | β=1.450
Epoch 100/250 | Loss=0.3283 | MSE=0.1618 | KL=0.1148 | β=1.450
Epoch 150/250 | Loss=0.3256 | MSE=0.1449 | KL=0.1246 | β=1.450
Epoch 200/250 | Loss=0.2930 | MSE=0.1184 | KL=0.1204 | β=1.450


Best trial: 1. Best value: 0.563253:  20%|██        | 5/25 [00:04<00:17,  1.15it/s, 4.30/900 seconds]

Epoch 250/250 | Loss=0.2967 | MSE=0.1127 | KL=0.1268 | β=1.450
BGM fitted: active comps = 12/12
[I 2025-08-06 23:59:46,516] Trial 4 finished with value: 0.4860381640875452 and parameters: {'latent': 36, 'beta': 1.450327655325641, 'gmm_K': 12, 'lr': 0.0034397920565473793, 'warm': 16}. Best is trial 1 with value: 0.5632532833708964.
Epoch 50/250 | Loss=0.4102 | MSE=0.2111 | KL=0.1129 | β=1.763
Epoch 100/250 | Loss=0.3913 | MSE=0.1539 | KL=0.1346 | β=1.763
Epoch 150/250 | Loss=0.3613 | MSE=0.1480 | KL=0.1210 | β=1.763
Epoch 200/250 | Loss=0.3608 | MSE=0.1601 | KL=0.1138 | β=1.763


Best trial: 5. Best value: 0.582404:  24%|██▍       | 6/25 [00:05<00:16,  1.16it/s, 5.14/900 seconds]

Epoch 250/250 | Loss=0.3204 | MSE=0.1119 | KL=0.1183 | β=1.763
BGM fitted: active comps = 36/36
[I 2025-08-06 23:59:47,359] Trial 5 finished with value: 0.5824043493139308 and parameters: {'latent': 32, 'beta': 1.7631215720297482, 'gmm_K': 36, 'lr': 0.004699155315161513, 'warm': 24}. Best is trial 5 with value: 0.5824043493139308.
Epoch 50/250 | Loss=0.5939 | MSE=0.3042 | KL=0.1325 | β=2.187
Epoch 100/250 | Loss=0.5049 | MSE=0.2423 | KL=0.1201 | β=2.187
Epoch 150/250 | Loss=0.4121 | MSE=0.1672 | KL=0.1120 | β=2.187
Epoch 200/250 | Loss=0.3933 | MSE=0.1503 | KL=0.1111 | β=2.187


Best trial: 6. Best value: 0.66093:  28%|██▊       | 7/25 [00:05<00:15,  1.18it/s, 5.97/900 seconds] 

Epoch 250/250 | Loss=0.3856 | MSE=0.1465 | KL=0.1093 | β=2.187
BGM fitted: active comps = 36/36
[I 2025-08-06 23:59:48,191] Trial 6 finished with value: 0.660929600151592 and parameters: {'latent': 28, 'beta': 2.186918712363912, 'gmm_K': 36, 'lr': 0.004404653491275035, 'warm': 24}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5003 | MSE=0.2447 | KL=0.1212 | β=2.109
Epoch 100/250 | Loss=0.5171 | MSE=0.2695 | KL=0.1174 | β=2.109
Epoch 150/250 | Loss=0.4359 | MSE=0.1913 | KL=0.1160 | β=2.109
Epoch 200/250 | Loss=0.4084 | MSE=0.1630 | KL=0.1164 | β=2.109


Best trial: 6. Best value: 0.66093:  32%|███▏      | 8/25 [00:06<00:14,  1.20it/s, 6.77/900 seconds]

Epoch 250/250 | Loss=0.4274 | MSE=0.1813 | KL=0.1167 | β=2.109
BGM fitted: active comps = 20/20
[I 2025-08-06 23:59:48,993] Trial 7 finished with value: 0.4594773392876241 and parameters: {'latent': 28, 'beta': 2.10900536939987, 'gmm_K': 20, 'lr': 0.004669390888507499, 'warm': 16}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.4684 | MSE=0.2455 | KL=0.1477 | β=1.509
Epoch 100/250 | Loss=0.3452 | MSE=0.1581 | KL=0.1239 | β=1.509
Epoch 150/250 | Loss=0.3570 | MSE=0.1472 | KL=0.1390 | β=1.509
Epoch 200/250 | Loss=0.3492 | MSE=0.1544 | KL=0.1291 | β=1.509


Best trial: 6. Best value: 0.66093:  36%|███▌      | 9/25 [00:07<00:13,  1.22it/s, 7.57/900 seconds]

Epoch 250/250 | Loss=0.3245 | MSE=0.1232 | KL=0.1334 | β=1.509
BGM fitted: active comps = 36/36
[I 2025-08-06 23:59:49,790] Trial 8 finished with value: 0.5566283594838034 and parameters: {'latent': 32, 'beta': 1.5092729054468674, 'gmm_K': 36, 'lr': 0.00495373626660881, 'warm': 16}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5454 | MSE=0.3273 | KL=0.1032 | β=2.114
Epoch 100/250 | Loss=0.4665 | MSE=0.2461 | KL=0.1043 | β=2.114
Epoch 150/250 | Loss=0.4726 | MSE=0.2396 | KL=0.1102 | β=2.114
Epoch 200/250 | Loss=0.3974 | MSE=0.1491 | KL=0.1175 | β=2.114


Best trial: 6. Best value: 0.66093:  40%|████      | 10/25 [00:08<00:12,  1.22it/s, 8.38/900 seconds]

Epoch 250/250 | Loss=0.3819 | MSE=0.1448 | KL=0.1122 | β=2.114
BGM fitted: active comps = 20/20
[I 2025-08-06 23:59:50,598] Trial 9 finished with value: 0.595686438168824 and parameters: {'latent': 32, 'beta': 2.1136357739119718, 'gmm_K': 20, 'lr': 0.0030419408547740466, 'warm': 16}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5107 | MSE=0.2467 | KL=0.1366 | β=1.933
Epoch 100/250 | Loss=0.4792 | MSE=0.2006 | KL=0.1441 | β=1.934
Epoch 150/250 | Loss=0.4672 | MSE=0.2236 | KL=0.1260 | β=1.934
Epoch 200/250 | Loss=0.4472 | MSE=0.1833 | KL=0.1365 | β=1.934


Best trial: 6. Best value: 0.66093:  44%|████▍     | 11/25 [00:09<00:11,  1.24it/s, 9.16/900 seconds]

Epoch 250/250 | Loss=0.4290 | MSE=0.1815 | KL=0.1280 | β=1.934
BGM fitted: active comps = 28/28
[I 2025-08-06 23:59:51,376] Trial 10 finished with value: 0.5354332953342172 and parameters: {'latent': 24, 'beta': 1.9336154416213611, 'gmm_K': 28, 'lr': 0.003847022459988682, 'warm': 28}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5633 | MSE=0.3246 | KL=0.1095 | β=2.180
Epoch 100/250 | Loss=0.4535 | MSE=0.1989 | KL=0.1167 | β=2.181
Epoch 150/250 | Loss=0.4597 | MSE=0.2316 | KL=0.1046 | β=2.181
Epoch 200/250 | Loss=0.4357 | MSE=0.1976 | KL=0.1092 | β=2.181


Best trial: 6. Best value: 0.66093:  48%|████▊     | 12/25 [00:09<00:10,  1.23it/s, 9.99/900 seconds]

Epoch 250/250 | Loss=0.3863 | MSE=0.1564 | KL=0.1054 | β=2.181
BGM fitted: active comps = 20/20
[I 2025-08-06 23:59:52,210] Trial 11 finished with value: 0.6174352192588316 and parameters: {'latent': 28, 'beta': 2.180530461304339, 'gmm_K': 20, 'lr': 0.003929132080884614, 'warm': 24}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.6275 | MSE=0.3740 | KL=0.1156 | β=2.192
Epoch 100/250 | Loss=0.5018 | MSE=0.2605 | KL=0.1101 | β=2.193
Epoch 150/250 | Loss=0.4354 | MSE=0.1988 | KL=0.1079 | β=2.193
Epoch 200/250 | Loss=0.4457 | MSE=0.1903 | KL=0.1165 | β=2.193


Best trial: 6. Best value: 0.66093:  52%|█████▏    | 13/25 [00:10<00:09,  1.21it/s, 10.85/900 seconds]

Epoch 250/250 | Loss=0.4287 | MSE=0.1608 | KL=0.1222 | β=2.193
BGM fitted: active comps = 20/20
[I 2025-08-06 23:59:53,068] Trial 12 finished with value: 0.5568977051462602 and parameters: {'latent': 28, 'beta': 2.192553788363022, 'gmm_K': 20, 'lr': 0.003871545725494156, 'warm': 24}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5557 | MSE=0.3006 | KL=0.1313 | β=1.943
Epoch 100/250 | Loss=0.4842 | MSE=0.2518 | KL=0.1195 | β=1.944
Epoch 150/250 | Loss=0.4487 | MSE=0.2080 | KL=0.1238 | β=1.944
Epoch 200/250 | Loss=0.3970 | MSE=0.1642 | KL=0.1198 | β=1.944
Epoch 250/250 | Loss=0.4029 | MSE=0.1677 | KL=0.1210 | β=1.944
BGM fitted: active comps = 20/20


Best trial: 6. Best value: 0.66093:  56%|█████▌    | 14/25 [00:11<00:09,  1.18it/s, 11.74/900 seconds]

[I 2025-08-06 23:59:53,958] Trial 13 finished with value: 0.40233066245592264 and parameters: {'latent': 28, 'beta': 1.943584022842486, 'gmm_K': 20, 'lr': 0.0039965234662445006, 'warm': 24}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5079 | MSE=0.2677 | KL=0.1251 | β=1.920
Epoch 100/250 | Loss=0.4336 | MSE=0.1965 | KL=0.1235 | β=1.920
Epoch 150/250 | Loss=0.4501 | MSE=0.1732 | KL=0.1442 | β=1.920
Epoch 200/250 | Loss=0.3964 | MSE=0.1618 | KL=0.1222 | β=1.920
Epoch 250/250 | Loss=0.3896 | MSE=0.1462 | KL=0.1268 | β=1.920
BGM fitted: active comps = 36/36


Best trial: 6. Best value: 0.66093:  60%|██████    | 15/25 [00:12<00:08,  1.18it/s, 12.60/900 seconds]

[I 2025-08-06 23:59:54,816] Trial 14 finished with value: 0.5569151015292708 and parameters: {'latent': 28, 'beta': 1.920021822490035, 'gmm_K': 36, 'lr': 0.00419589801222366, 'warm': 24}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5561 | MSE=0.2835 | KL=0.1241 | β=2.197
Epoch 100/250 | Loss=0.5036 | MSE=0.2330 | KL=0.1231 | β=2.198
Epoch 150/250 | Loss=0.5202 | MSE=0.2538 | KL=0.1212 | β=2.198
Epoch 200/250 | Loss=0.4869 | MSE=0.2298 | KL=0.1170 | β=2.198
Epoch 250/250 | Loss=0.4811 | MSE=0.2200 | KL=0.1188 | β=2.198
BGM fitted: active comps = 20/20


Best trial: 6. Best value: 0.66093:  64%|██████▍   | 16/25 [00:13<00:07,  1.19it/s, 13.41/900 seconds]

[I 2025-08-06 23:59:55,627] Trial 15 finished with value: 0.6362469529216229 and parameters: {'latent': 24, 'beta': 2.198003182869964, 'gmm_K': 20, 'lr': 0.003616645177935128, 'warm': 28}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5090 | MSE=0.2795 | KL=0.1416 | β=1.621
Epoch 100/250 | Loss=0.3972 | MSE=0.1835 | KL=0.1318 | β=1.621
Epoch 150/250 | Loss=0.3900 | MSE=0.1535 | KL=0.1459 | β=1.621
Epoch 200/250 | Loss=0.4048 | MSE=0.1693 | KL=0.1452 | β=1.621
Epoch 250/250 | Loss=0.4024 | MSE=0.1647 | KL=0.1466 | β=1.621
BGM fitted: active comps = 36/36


Best trial: 6. Best value: 0.66093:  68%|██████▊   | 17/25 [00:14<00:06,  1.20it/s, 14.24/900 seconds]

[I 2025-08-06 23:59:56,453] Trial 16 finished with value: 0.5533383034908218 and parameters: {'latent': 24, 'beta': 1.6213860126805804, 'gmm_K': 36, 'lr': 0.0035270432658442287, 'warm': 28}. Best is trial 6 with value: 0.660929600151592.
Epoch 50/250 | Loss=0.5594 | MSE=0.2989 | KL=0.1280 | β=2.034
Epoch 100/250 | Loss=0.5684 | MSE=0.3019 | KL=0.1309 | β=2.035
Epoch 150/250 | Loss=0.4455 | MSE=0.2062 | KL=0.1176 | β=2.035
Epoch 200/250 | Loss=0.4436 | MSE=0.2011 | KL=0.1192 | β=2.035
Epoch 250/250 | Loss=0.4611 | MSE=0.2122 | KL=0.1223 | β=2.035
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  72%|███████▏  | 18/25 [00:15<00:05,  1.19it/s, 15.08/900 seconds]

[I 2025-08-06 23:59:57,301] Trial 17 finished with value: 0.7095161834382396 and parameters: {'latent': 24, 'beta': 2.0352837502306653, 'gmm_K': 28, 'lr': 0.0059558380505119754, 'warm': 28}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.5892 | MSE=0.3257 | KL=0.1300 | β=2.027
Epoch 100/250 | Loss=0.4901 | MSE=0.2068 | KL=0.1397 | β=2.028
Epoch 150/250 | Loss=0.4861 | MSE=0.2253 | KL=0.1287 | β=2.028
Epoch 200/250 | Loss=0.4504 | MSE=0.1914 | KL=0.1277 | β=2.028
Epoch 250/250 | Loss=0.4803 | MSE=0.1849 | KL=0.1457 | β=2.028
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  76%|███████▌  | 19/25 [00:15<00:04,  1.21it/s, 15.89/900 seconds]

[I 2025-08-06 23:59:58,105] Trial 18 finished with value: 0.6413256151272138 and parameters: {'latent': 24, 'beta': 2.027578192351747, 'gmm_K': 28, 'lr': 0.005839340905115049, 'warm': 28}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.5388 | MSE=0.2544 | KL=0.1522 | β=1.869
Epoch 100/250 | Loss=0.4447 | MSE=0.1758 | KL=0.1438 | β=1.870
Epoch 150/250 | Loss=0.4286 | MSE=0.1617 | KL=0.1427 | β=1.870
Epoch 200/250 | Loss=0.4208 | MSE=0.1483 | KL=0.1457 | β=1.870
Epoch 250/250 | Loss=0.4000 | MSE=0.1648 | KL=0.1258 | β=1.870
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  80%|████████  | 20/25 [00:16<00:04,  1.22it/s, 16.69/900 seconds]

[I 2025-08-06 23:59:58,910] Trial 19 finished with value: 0.6099726561328385 and parameters: {'latent': 24, 'beta': 1.8696433411074893, 'gmm_K': 28, 'lr': 0.005448156361922831, 'warm': 28}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.4731 | MSE=0.2562 | KL=0.1064 | β=2.039
Epoch 100/250 | Loss=0.4352 | MSE=0.2126 | KL=0.1092 | β=2.039
Epoch 150/250 | Loss=0.3787 | MSE=0.1633 | KL=0.1057 | β=2.039
Epoch 200/250 | Loss=0.3325 | MSE=0.1364 | KL=0.0962 | β=2.039
Epoch 250/250 | Loss=0.3519 | MSE=0.1337 | KL=0.1070 | β=2.039
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  84%|████████▍ | 21/25 [00:17<00:03,  1.17it/s, 17.62/900 seconds]

[I 2025-08-06 23:59:59,840] Trial 20 finished with value: 0.6573567330672615 and parameters: {'latent': 40, 'beta': 2.0386834769276643, 'gmm_K': 28, 'lr': 0.005308362812797066, 'warm': 20}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.4817 | MSE=0.2371 | KL=0.1197 | β=2.043
Epoch 100/250 | Loss=0.3943 | MSE=0.1737 | KL=0.1080 | β=2.043
Epoch 150/250 | Loss=0.3190 | MSE=0.1253 | KL=0.0948 | β=2.043
Epoch 200/250 | Loss=0.3444 | MSE=0.1357 | KL=0.1021 | β=2.043
Epoch 250/250 | Loss=0.3238 | MSE=0.1204 | KL=0.0996 | β=2.043
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  88%|████████▊ | 22/25 [00:18<00:02,  1.16it/s, 18.51/900 seconds]

[I 2025-08-07 00:00:00,729] Trial 21 finished with value: 0.5353760726024127 and parameters: {'latent': 40, 'beta': 2.0427583970335332, 'gmm_K': 28, 'lr': 0.00541391227944334, 'warm': 20}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.4775 | MSE=0.2313 | KL=0.1172 | β=2.101
Epoch 100/250 | Loss=0.3807 | MSE=0.1699 | KL=0.1003 | β=2.101
Epoch 150/250 | Loss=0.3631 | MSE=0.1451 | KL=0.1037 | β=2.101
Epoch 200/250 | Loss=0.3459 | MSE=0.1233 | KL=0.1059 | β=2.101
Epoch 250/250 | Loss=0.3301 | MSE=0.1313 | KL=0.0946 | β=2.101
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  92%|█████████▏| 23/25 [00:19<00:01,  1.15it/s, 19.39/900 seconds]

[I 2025-08-07 00:00:01,609] Trial 22 finished with value: 0.6384157089640426 and parameters: {'latent': 40, 'beta': 2.101331835117563, 'gmm_K': 28, 'lr': 0.005994241580606686, 'warm': 20}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.4443 | MSE=0.2562 | KL=0.1017 | β=1.850
Epoch 100/250 | Loss=0.3878 | MSE=0.1858 | KL=0.1091 | β=1.850
Epoch 150/250 | Loss=0.3460 | MSE=0.1598 | KL=0.1006 | β=1.850
Epoch 200/250 | Loss=0.3204 | MSE=0.1357 | KL=0.0998 | β=1.850
Epoch 250/250 | Loss=0.3135 | MSE=0.1175 | KL=0.1060 | β=1.850
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516:  96%|█████████▌| 24/25 [00:20<00:00,  1.17it/s, 20.22/900 seconds]

[I 2025-08-07 00:00:02,442] Trial 23 finished with value: 0.41406744200828943 and parameters: {'latent': 40, 'beta': 1.8501943093573596, 'gmm_K': 28, 'lr': 0.005254065370991711, 'warm': 20}. Best is trial 17 with value: 0.7095161834382396.
Epoch 50/250 | Loss=0.4692 | MSE=0.2477 | KL=0.1106 | β=2.002
Epoch 100/250 | Loss=0.4026 | MSE=0.2090 | KL=0.0966 | β=2.003
Epoch 150/250 | Loss=0.3699 | MSE=0.1318 | KL=0.1189 | β=2.003
Epoch 200/250 | Loss=0.3715 | MSE=0.1332 | KL=0.1190 | β=2.003
Epoch 250/250 | Loss=0.3345 | MSE=0.1270 | KL=0.1036 | β=2.003
BGM fitted: active comps = 28/28


Best trial: 17. Best value: 0.709516: 100%|██████████| 25/25 [00:21<00:00,  1.19it/s, 21.03/900 seconds]

[I 2025-08-07 00:00:03,245] Trial 24 finished with value: 0.38235469437716185 and parameters: {'latent': 36, 'beta': 2.002814562968468, 'gmm_K': 28, 'lr': 0.004852601129281566, 'warm': 28}. Best is trial 17 with value: 0.7095161834382396.
Best params: {'latent': 24, 'beta': 2.0352837502306653, 'gmm_K': 28, 'lr': 0.0059558380505119754, 'warm': 28} R^2 = 0.7180610162478024 MMD = 0.0854483280956274





In [349]:
best = study.best_trial
print("Best score :", f"{best.value:.5f}")
print("Best params:", best.params)
print("Best R^2    :", f"{best.user_attrs['r2']:.4f}")
print("Best MMD   :", f"{best.user_attrs['mmd']:.4f}")

df_trials = study.trials_dataframe(attrs=("number", "state", "value", "params", "user_attrs"))
df_trials.to_csv("TVAE_anneal_gmm_trials.csv", index=False)

Best score : 0.70952
Best params: {'latent': 24, 'beta': 2.0352837502306653, 'gmm_K': 28, 'lr': 0.0059558380505119754, 'warm': 28}
Best R^2    : 0.7181
Best MMD   : 0.0854


In [351]:
best_params = best.params
model_best = TVAEAnnealGMM(
    metadata       = meta,
    batch_size     = 256,
    device         = "cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (256, 128),             
    beta_max       = best_params['beta'],
    gmm_components = best_params['gmm_K'],
    gmm_cov        = 'diag'                  
)

model_best.fit(
    train_real,
    epochs        = 300,
    warmup_epochs = best_params['warm'],
    lr            = best_params['lr'],
    sched         = 'sigmoid'                
)

fake_best = model_best.sample(len(train_real))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing_GMM with real data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6149 | MSE=0.3676 | KL=0.1215 | β=2.034
Epoch 100/300 | Loss=0.5268 | MSE=0.2562 | KL=0.1329 | β=2.035
Epoch 150/300 | Loss=0.4796 | MSE=0.2323 | KL=0.1215 | β=2.035
Epoch 200/300 | Loss=0.4479 | MSE=0.1682 | KL=0.1374 | β=2.035
Epoch 250/300 | Loss=0.4393 | MSE=0.1931 | KL=0.1210 | β=2.035
Epoch 300/300 | Loss=0.4300 | MSE=0.1705 | KL=0.1275 | β=2.035
BGM fitted: active comps = 28/28
TSTR results of TVAE_annealing_GMM with real data input (R^2, RMSE): 0.6353864838325292 , 1.9979841044735147
MMD: 0.10509308064636685


In [352]:
best_params = best.params
model_best = TVAEAnnealGMM(
    metadata       = meta,
    batch_size     = 256,
    device         = "cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (256, 128),             
    beta_max       = best_params['beta'],
    gmm_components = best_params['gmm_K'],
    gmm_cov        = 'diag'                  
)

model_best.fit(
    train_copu,
    epochs        = 300,
    warmup_epochs = best_params['warm'],
    lr            = best_params['lr'],
    sched         = 'sigmoid'                
)

fake_best = model_best.sample(len(train_copu))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing_GMM with Copula data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6516 | MSE=0.3373 | KL=0.1545 | β=2.034
Epoch 100/300 | Loss=0.6428 | MSE=0.3126 | KL=0.1622 | β=2.035
Epoch 150/300 | Loss=0.6338 | MSE=0.3158 | KL=0.1562 | β=2.035
Epoch 200/300 | Loss=0.6523 | MSE=0.3287 | KL=0.1590 | β=2.035
Epoch 250/300 | Loss=0.6365 | MSE=0.3163 | KL=0.1573 | β=2.035
Epoch 300/300 | Loss=0.6428 | MSE=0.3171 | KL=0.1600 | β=2.035
BGM fitted: active comps = 27/28
TSTR results of TVAE_annealing_GMM with Copula data input (R^2, RMSE): -0.3078293224555666 , 3.784000472766599
MMD: 0.24768935276694826


In [353]:
best_params = best.params
model_best = TVAEAnnealGMM(
    metadata       = meta,
    batch_size     = 256,
    device         = "cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (256, 128),             
    beta_max       = best_params['beta'],
    gmm_components = best_params['gmm_K'],
    gmm_cov        = 'diag'                  
)

model_best.fit(
    df_1x,
    epochs        = 300,
    warmup_epochs = best_params['warm'],
    lr            = best_params['lr'],
    sched         = 'sigmoid'                
)

fake_best = model_best.sample(len(df_1x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing_GMM with 1:1 mixing data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6872 | MSE=0.4078 | KL=0.1373 | β=2.034
Epoch 100/300 | Loss=0.6461 | MSE=0.3477 | KL=0.1466 | β=2.035
Epoch 150/300 | Loss=0.5867 | MSE=0.2805 | KL=0.1504 | β=2.035
Epoch 200/300 | Loss=0.6221 | MSE=0.2969 | KL=0.1598 | β=2.035
Epoch 250/300 | Loss=0.5836 | MSE=0.2827 | KL=0.1479 | β=2.035
Epoch 300/300 | Loss=0.5837 | MSE=0.2769 | KL=0.1507 | β=2.035
BGM fitted: active comps = 28/28
TSTR results of TVAE_annealing_GMM with 1:1 mixing data input (R^2, RMSE): 0.5970392688251108 , 2.1004241826269796
MMD: 0.1589433402744021


In [354]:
best_params = best.params
model_best = TVAEAnnealGMM(
    metadata       = meta,
    batch_size     = 256,
    device         = "cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (256, 128),             
    beta_max       = best_params['beta'],
    gmm_components = best_params['gmm_K'],
    gmm_cov        = 'diag'                  
)

model_best.fit(
    df_2x,
    epochs        = 300,
    warmup_epochs = best_params['warm'],
    lr            = best_params['lr'],
    sched         = 'sigmoid'                
)

fake_best = model_best.sample(len(df_2x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing_GMM with 1:2 mixing data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6514 | MSE=0.3584 | KL=0.1440 | β=2.034
Epoch 100/300 | Loss=0.6456 | MSE=0.3410 | KL=0.1497 | β=2.035
Epoch 150/300 | Loss=0.6523 | MSE=0.3122 | KL=0.1671 | β=2.035
Epoch 200/300 | Loss=0.5670 | MSE=0.2658 | KL=0.1480 | β=2.035
Epoch 250/300 | Loss=0.5846 | MSE=0.2895 | KL=0.1450 | β=2.035
Epoch 300/300 | Loss=0.5755 | MSE=0.2743 | KL=0.1480 | β=2.035
BGM fitted: active comps = 28/28
TSTR results of TVAE_annealing_GMM with 1:2 mixing data input (R^2, RMSE): 0.4454384671026135 , 2.464055681512016
MMD: 0.1850200714138146


In [355]:
best_params = best.params
model_best = TVAEAnnealGMM(
    metadata       = meta,
    batch_size     = 256,
    device         = "cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (256, 128),             
    beta_max       = best_params['beta'],
    gmm_components = best_params['gmm_K'],
    gmm_cov        = 'diag'                  
)

model_best.fit(
    df_4x,
    epochs        = 300,
    warmup_epochs = best_params['warm'],
    lr            = best_params['lr'],
    sched         = 'sigmoid'                
)

fake_best = model_best.sample(len(df_4x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing_GMM with 1:4 mixing data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6785 | MSE=0.3831 | KL=0.1452 | β=2.034
Epoch 100/300 | Loss=0.5495 | MSE=0.2964 | KL=0.1244 | β=2.035
Epoch 150/300 | Loss=0.6835 | MSE=0.3691 | KL=0.1545 | β=2.035
Epoch 200/300 | Loss=0.5797 | MSE=0.2596 | KL=0.1573 | β=2.035
Epoch 250/300 | Loss=0.6440 | MSE=0.3805 | KL=0.1295 | β=2.035
Epoch 300/300 | Loss=0.7075 | MSE=0.3537 | KL=0.1738 | β=2.035
BGM fitted: active comps = 28/28
TSTR results of TVAE_annealing_GMM with 1:4 mixing data input (R^2, RMSE): 0.41068748320061965 , 2.5400863434894134
MMD: 0.2762530813192122


In [356]:
best_params = best.params
model_best = TVAEAnnealGMM(
    metadata       = meta,
    batch_size     = 256,
    device         = "cpu",
    latent_dim     = best_params['latent'],
    hidden_sizes   = (256, 128),             
    beta_max       = best_params['beta'],
    gmm_components = best_params['gmm_K'],
    gmm_cov        = 'diag'                  
)

model_best.fit(
    df_8x,
    epochs        = 300,
    warmup_epochs = best_params['warm'],
    lr            = best_params['lr'],
    sched         = 'sigmoid'                
)

fake_best = model_best.sample(len(df_8x))

# Check R^2 score and RMSE
r2_best, rmse_best = xgb_score(fake_best, test_real)
print("TSTR results of TVAE_annealing_GMM with 1:8 mixing data input (R^2, RMSE):", r2_best, ",", rmse_best)

# Check the distribution discrepancy
mmd_best = compute_mmd(fake_best, test_real)
print("MMD:", mmd_best)

Epoch 50/300 | Loss=0.6806 | MSE=0.3180 | KL=0.1783 | β=2.034
Epoch 100/300 | Loss=0.6842 | MSE=0.3501 | KL=0.1642 | β=2.035
Epoch 150/300 | Loss=0.6347 | MSE=0.3226 | KL=0.1533 | β=2.035
Epoch 200/300 | Loss=0.6184 | MSE=0.3082 | KL=0.1525 | β=2.035
Epoch 250/300 | Loss=0.6010 | MSE=0.2863 | KL=0.1546 | β=2.035
Epoch 300/300 | Loss=0.6415 | MSE=0.3429 | KL=0.1467 | β=2.035
BGM fitted: active comps = 28/28
TSTR results of TVAE_annealing_GMM with 1:8 mixing data input (R^2, RMSE): 0.3409601273776601 , 2.6861576155860787
MMD: 0.2588595462283399


In [357]:
def train_and_save_gmm(name,
                       latent_dim,
                       beta_max,
                       gmm_components,
                       gmm_cov        ='diag',
                       epochs         =300,
                       warmup         =32,
                       lr             =0.0035,
                       hidden_sizes   =(256, 128),
                       batch_size     =128,
                       device         ="cpu"):

    model = TVAEAnnealGMM(
        metadata       = meta,
        latent_dim     = latent_dim,
        hidden_sizes   = hidden_sizes,
        beta_max       = beta_max,
        batch_size     = batch_size,
        device         = device,
        gmm_components = gmm_components,
        gmm_cov        = gmm_cov
    )

    model.fit(train_real,
              epochs        = epochs,
              warmup_epochs = warmup,
              lr            = lr,
              sched         = "sigmoid")

    # Save hyperparameters weights
    weight_path = f"tvae_gmm_{name}.pt"
    torch.save(model.model.state_dict(), weight_path)
    print(f"Saved weights: {weight_path}")

    # Save synthetic data
    fake = model.sample(10_000)        
    csv_path = f"Synthetic_TVAE_gmm_{name}.csv"
    fake.to_csv(csv_path, index=False)
    print(f"Saved samples to {csv_path}")


# Best Results
train_and_save_gmm(
    name            = "best",
    latent_dim      = 36,
    beta_max        = 1.45178048,
    gmm_components  = 36,
    lr              = 0.0054798077,
    warmup          = 20
)


Epoch 50/300 | Loss=0.4290 | MSE=0.2129 | KL=0.1489 | β=1.452
Epoch 100/300 | Loss=0.3444 | MSE=0.1632 | KL=0.1248 | β=1.452
Epoch 150/300 | Loss=0.2920 | MSE=0.1039 | KL=0.1295 | β=1.452
Epoch 200/300 | Loss=0.2925 | MSE=0.1163 | KL=0.1214 | β=1.452
Epoch 250/300 | Loss=0.2674 | MSE=0.0849 | KL=0.1257 | β=1.452
Epoch 300/300 | Loss=0.2837 | MSE=0.1035 | KL=0.1241 | β=1.452
BGM fitted: active comps = 36/36
Saved weights: tvae_gmm_best.pt
Saved samples to Synthetic_TVAE_gmm_best.csv


To check the robustness of TVAE-annealing and TVAE-GMM, we need to do more tests.

In [358]:
N_SAMPLE   = 10_000     # Sampling
N_SEEDS    = 10         # Try 10 different seed
BATCH_EVAL = 256

# Load in data
train_df = train_real
test_df  = test_real

In [359]:
# Best Hypereparameters of searching
cfg_anneal = dict(latent_dim=32, hidden_sizes=(256,128),
                  beta_max=2.023, 
                  batch_size=BATCH_EVAL)

cfg_gmm    = dict(latent_dim=36, hidden_sizes=(256,128),
                  beta_max=1.452, gmm_components=36,
                  batch_size=BATCH_EVAL)

In [360]:
# Build up models
def load_model(path, cfg, train_data,is_gmm=False):
    # instantiate TVAE models
    if is_gmm:
        m = TVAEAnnealGMM(metadata=meta,
                          latent_dim     = cfg['latent_dim'],
                          hidden_sizes   = cfg['hidden_sizes'],
                          beta_max       = cfg['beta_max'],
                          gmm_components = cfg['gmm_components'],
                          batch_size     = cfg['batch_size'],
                          device         ="cpu")
    else:
        m = TVAEAnneal(metadata=meta,
                       latent_dim   = cfg['latent_dim'],
                       hidden_sizes = cfg['hidden_sizes'],
                       beta_max     = cfg['beta_max'],
                       batch_size   = cfg['batch_size'],
                       device       ="cpu")

    # Create same VAE model 
    input_dim = len(train_data.columns)
    m.model = VAE(input_dim=input_dim,
                  latent_dim=cfg['latent_dim'],
                  h=list(cfg['hidden_sizes'])).to("cpu")
    state = torch.load(path, map_location="cpu")
    m.model.load_state_dict(state)
    m.model.eval()
    m.scaler.fit(train_data) 

    return m


In [361]:
# Evaluation
def eval_once(model, seed):
    torch.manual_seed(seed)
    np.random.seed(seed)
    fake = model.sample(N_SAMPLE)
    r2, rmse = xgb_score(fake, test_df)
    mmd      = compute_mmd(fake, test_df)
    return dict(r2=r2, rmse=rmse, mmd=mmd)

def robustness_summary(path, cfg, train_data, is_gmm=False):
    model = load_model(path, cfg, train_data, is_gmm)
    stats = [eval_once(model, s) for s in range(N_SEEDS)]
    df    = pd.DataFrame(stats)
    return df.agg(['mean', 'std']).T      

ann_df_real = robustness_summary("tvae_annealing_best.pt", cfg_anneal, train_df, is_gmm=False)
gmm_df_real = robustness_summary("tvae_gmm_best.pt",       cfg_gmm   , train_df, is_gmm=True)

summary = pd.concat({'Anneal_real': ann_df_real, 'GMM_real': gmm_df_real}, axis=1)
print(summary)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

     Anneal_real            GMM_real          
            mean       std      mean       std
r2      0.646359  0.016250  0.736284  0.018010
rmse    1.967217  0.045515  1.698295  0.058361
mmd     0.164021  0.002898  0.160267  0.002881


In [362]:
ann_df_1x = robustness_summary("tvae_annealing_best.pt", cfg_anneal, df_1x, is_gmm=False)
gmm_df_1x = robustness_summary("tvae_gmm_best.pt",       cfg_gmm   , df_1x, is_gmm=True)

summary = pd.concat({'Anneal_1x': ann_df_real, 'GMM_1x': gmm_df_real}, axis=1)
print(summary)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

     Anneal_1x              GMM_1x          
          mean       std      mean       std
r2    0.646359  0.016250  0.736284  0.018010
rmse  1.967217  0.045515  1.698295  0.058361
mmd   0.164021  0.002898  0.160267  0.002881


In [363]:
ann_df_2x = robustness_summary("tvae_annealing_best.pt", cfg_anneal, df_2x, is_gmm=False)
gmm_df_2x = robustness_summary("tvae_gmm_best.pt",       cfg_gmm   , df_2x, is_gmm=True)

summary = pd.concat({'Anneal_2x': ann_df_2x, 'GMM_2x': gmm_df_2x}, axis=1)
print(summary)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

     Anneal_2x              GMM_2x          
          mean       std      mean       std
r2    0.634801  0.012507  0.731600  0.016150
rmse  1.999322  0.034446  1.713526  0.051494
mmd   0.174965  0.002582  0.175812  0.002877


In [364]:
ann_df_4x = robustness_summary("tvae_annealing_best.pt", cfg_anneal, df_4x, is_gmm=False)
gmm_df_4x = robustness_summary("tvae_gmm_best.pt",       cfg_gmm   , df_4x, is_gmm=True)

summary = pd.concat({'Anneal_4x': ann_df_4x, 'GMM_4x': gmm_df_4x}, axis=1)
print(summary)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

     Anneal_4x              GMM_4x          
          mean       std      mean       std
r2    0.630149  0.013018  0.728528  0.018949
rmse  2.011995  0.035786  1.723047  0.060526
mmd   0.173634  0.002665  0.173954  0.002829


In [365]:
ann_df_8x = robustness_summary("tvae_annealing_best.pt", cfg_anneal, df_8x, is_gmm=False)
gmm_df_8x = robustness_summary("tvae_gmm_best.pt",       cfg_gmm   , df_8x, is_gmm=True)

summary = pd.concat({'Anneal_8x': ann_df_8x, 'GMM_8x': gmm_df_8x}, axis=1)
print(summary)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

     Anneal_8x              GMM_8x          
          mean       std      mean       std
r2    0.636911  0.020213  0.727856  0.017429
rmse  1.993093  0.056013  1.725319  0.055933
mmd   0.170316  0.002850  0.168751  0.002826


In [366]:
ann_copu = robustness_summary("tvae_annealing_best.pt", cfg_anneal, train_copu, is_gmm=False)
gmm_copu = robustness_summary("tvae_gmm_best.pt",       cfg_gmm   , train_copu, is_gmm=True)

summary = pd.concat({'Anneal_copu': ann_copu, 'GMM_copu': gmm_copu}, axis=1)
print(summary)


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is poss

     Anneal_copu            GMM_copu          
            mean       std      mean       std
r2      0.636461  0.012168  0.728746  0.019223
rmse    1.994787  0.033370  1.722324  0.061486
mmd     0.166216  0.002877  0.163160  0.002897


In [374]:
# Save data generated by TAVE_gmm + Copula
gmm_copu = load_model("tvae_gmm_best.pt", cfg_gmm, train_copu, is_gmm=True)
fake_gmm_copu  = gmm_copu.sample(10_000)
fake_gmm_copu.to_csv("Synthetic_TAVE_gmm_Copula.csv", index=False)
print("Saved 10k GMM-synthetic rows to Synthetic_TAVE_gmm_Copula.csv")

Saved 10k GMM-synthetic rows to Synthetic_TAVE_gmm_Copula.csv



You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



Basing on the paper "Evaluating Generative Models for Tabular Data: Novel Metrics and Benchmarking", we can apply `FAED` to check the quality of synthetic data. 

In [375]:
df_gen = pd.read_csv("Synthetic_TAVE_gmm_Copula.csv")
df_gen = df_gen[train_real.columns]

df_gen_real = pd.read_csv("Synthetic_TVAE_gmm_best.csv")
df_gen_real = df_gen_real[train_real.columns]

In [376]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_real_scaled = scaler.fit_transform(train_real)
X_gen_scaled = scaler.transform(df_gen)
X_gen_real_scaled =  scaler.transform(df_gen_real)

In [377]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

class AutoEncoder(nn.Module):
    def __init__(self, input_dim, latent_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, latent_dim)
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, input_dim)
        )

    def forward(self, x):
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat, z

X_real_tensor = torch.tensor(X_real_scaled, dtype=torch.float32)
loader = DataLoader(TensorDataset(X_real_tensor), batch_size=64, shuffle=True)

model = AutoEncoder(input_dim=X_real_tensor.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

model.train()
for epoch in range(100):
    for batch in loader:
        x = batch[0]
        x_hat, _ = model(x)
        loss = criterion(x_hat, x)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [378]:
model.eval()
with torch.no_grad():
    z_real = model.encoder(torch.tensor(X_real_scaled, dtype=torch.float32))
    z_gen = model.encoder(torch.tensor(X_gen_scaled, dtype=torch.float32))
    z_gen_real = model.encoder(torch.tensor(X_gen_real_scaled, dtype=torch.float32))

In [379]:
import scipy.linalg

def compute_frechet_distance(mu1, sigma1, mu2, sigma2, eps=1e-6):
    covmean, _ = scipy.linalg.sqrtm(sigma1 @ sigma2, disp=False)
    if np.iscomplexobj(covmean):
        covmean = covmean.real
    return np.sum((mu1 - mu2) ** 2) + np.trace(sigma1 + sigma2 - 2 * covmean)

z_real_np = z_real.numpy()
z_gen_np = z_gen.numpy()
z_gen_real_np = z_gen_real.numpy()

mu_r, sigma_r = np.mean(z_real_np, axis=0), np.cov(z_real_np, rowvar=False)
mu_g, sigma_g = np.mean(z_gen_np, axis=0), np.cov(z_gen_np, rowvar=False)
mu_gr, sigma_gr = np.mean(z_gen_real_np, axis=0), np.cov(z_gen_real_np, rowvar=False)

copu_faed_score = compute_frechet_distance(mu_r, sigma_r, mu_g, sigma_g)
real_faed_score = compute_frechet_distance(mu_r, sigma_r, mu_gr, sigma_gr)
print("Copu_FAED score =", copu_faed_score)
print("real_FAED score =", real_faed_score)

Copu_FAED score = 1.6605847251875836
real_FAED score = 0.43598867447289397
