## Imports


In [1]:
import csv
import os
import random
import zipfile

from pathlib import Path 
from functools import partial
from typing import Callable, Any,List

import numpy as np 
import torch

from tqdm.notebook import tqdm
from data import get_challenge_points
from metrics import get_tpr_at_fpr
from midst_models.single_table_TabDDPM.wb_pipeline_utils import quantile_normalize_sklearn

In [2]:
import json

from midst_models.single_table_TabDDPM.wb_complex_pipeline import (
    clava_clustering,
    clava_training,
    clava_load_pretrained,
    clava_synthesizing,
    load_configs,
)
from midst_models.single_table_TabDDPM.wb_pipeline_modules import load_multi_table
from midst_models.single_table_TabDDPM.tab_ddpm.gaussian_multinomial_diffsuion import GaussianMultinomialDiffusion

In [3]:
TABDDPM_DATA_DIR = "tabddpm_white_box"
TABSYN_DATA_DIR = "tabsyn_white_box"

## Loading Model


In [62]:
# Load config
config_path = "/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1/trans.json"
configs, save_dir = load_configs(config_path)

# Display config
json_str = json.dumps(configs, indent=4)
print(json_str)

{
    "general": {
        "data_dir": "/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1",
        "exp_name": "",
        "workspace_dir": "/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1",
        "sample_prefix": "",
        "test_data_dir": "/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1"
    },
    "clustering": {
        "parent_scale": 1.0,
        "num_clusters": 50,
        "clustering_method": "both"
    },
    "diffusion": {
        "d_layers": [
            512,
            1024,
            1024,
            1024,
            1024,
            512
        ],
        "dropout": 0.0,
        "num_timesteps": 2000,
        "model_type": "mlp",
        "iterations": 200000,
        "batch_size": 4096,
        "lr": 0.0006,
        "gaussian_loss_type": "mse",
        "weight_decay": 1e-05,
        "scheduler": "cosine"
    },
    "classifier": {
        "d_layers": [


In [63]:
configs

{'general': {'data_dir': '/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1',
  'exp_name': '',
  'workspace_dir': '/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1',
  'sample_prefix': '',
  'test_data_dir': '/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1'},
 'clustering': {'parent_scale': 1.0,
  'num_clusters': 50,
  'clustering_method': 'both'},
 'diffusion': {'d_layers': [512, 1024, 1024, 1024, 1024, 512],
  'dropout': 0.0,
  'num_timesteps': 2000,
  'model_type': 'mlp',
  'iterations': 200000,
  'batch_size': 4096,
  'lr': 0.0006,
  'gaussian_loss_type': 'mse',
  'weight_decay': 1e-05,
  'scheduler': 'cosine'},
 'classifier': {'d_layers': [128, 256, 512, 1024, 512, 256, 128],
  'lr': 0.0001,
  'dim_t': 128,
  'batch_size': 4096,
  'iterations': 20000},
 'sampling': {'batch_size': 20000, 'classifier_scale': 1.0},
 'matching': {'num_matching_clusters': 1,
  'matching_batch_si

In [64]:
save_dir

'/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1/'

In [65]:
relation_order = [[None,'trans']]
models = clava_load_pretrained(relation_order,save_dir)

None -> trans checkpoint found, loading...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [66]:
model = models[None,'trans']['diffusion']

In [67]:
device = "cuda"

In [68]:
model.to(device)

GaussianMultinomialDiffusion(
  (_denoise_fn): MLPDiffusion(
    (mlp): MLP(
      (blocks): ModuleList(
        (0): Block(
          (linear): Linear(in_features=128, out_features=512, bias=True)
          (activation): ReLU()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (1): Block(
          (linear): Linear(in_features=512, out_features=1024, bias=True)
          (activation): ReLU()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (2): Block(
          (linear): Linear(in_features=1024, out_features=1024, bias=True)
          (activation): ReLU()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (3): Block(
          (linear): Linear(in_features=1024, out_features=1024, bias=True)
          (activation): ReLU()
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (4): Block(
          (linear): Linear(in_features=1024, out_features=1024, bias=True)
          (activation): ReLU()
          (dropout): Dro

## Making Functions for Quantile Regressors

In [18]:
# Imports 
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [19]:
##############################
# Quantile Regressor Network #
##############################
class QuantileRegressor(nn.Module):
    def __init__(self, input_dim, hidden_dim=64):
        super(QuantileRegressor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)  # outputs the predicted threshold
        )
        
    def forward(self, x):
        return self.model(x)


In [20]:
##############################
# Pinball Loss (Quantile Loss)#
##############################
def pinball_loss(q_pred, target, alpha):
    """
    Computes the pinball (quantile) loss.
    For each sample:
      if target <= q_pred: loss = (q_pred - target) * (1 - alpha)
      else:                loss = (q_pred - target) * (-alpha)
    """
    indicator = (target <= q_pred).float()
    loss = (q_pred - target) * (indicator - alpha)
    return loss.mean()

In [90]:
##############################
# Training Loop for a Model  #
##############################
def train_model(model, dataloader, optimizer, alpha, num_epochs=100):
    model.train()
    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for X_batch, y_batch in dataloader:
            q_pred = model(X_batch)  # predicted quantile threshold
            loss = pinball_loss(q_pred, y_batch, alpha)
            #loss.backward()
            optimizer.step()
            epoch_loss += loss.item() * X_batch.size(0)
            optimizer.zero_grad()
        epoch_loss /= len(dataloader.dataset)
        if epoch % 20 == 0:
            print(f"Epoch {epoch:3d}: Loss = {epoch_loss:.4f}")

In [91]:
def compute_t_error(model, x, t):
    """
    Computes the reconstruction error at a specific timestep t.

    Args:
        model: Pretrained GaussianMultinomialDiffusion model.
        x: Input data tensor.
        t: Diffusion timestep (int or tensor).

    Returns:
        Reconstruction error tensor for each sample in x.
    """
    # l = [2.1900000e+03, 2.0000000e+00, 5.0000000e+00, 8.0100000e+04,
    #    1.4015810e+05, 8.0000000e+00, 1.3000000e+01, 9.7259157e+07] # this list was supposed to be the max values in each coulmn of challenge_with_id

    #l = torch.tensor(l).to(device="cuda")
    # Simulate the diffusion process
    #x = x/l 
    x_t = model.gaussian_q_sample(x_start=x, t=torch.tensor([t]).to(x.device))
    
    #x_t = x_t/l
    # Perform reverse denoising
    x_recon = model._denoise_fn(x_t, torch.tensor([t]).to(x.device))
    
    # Compute reconstruction error (e.g., L2 norm)
    t_error = torch.norm(x - x_recon, p=2, dim=1)
    return t_error

In [92]:
def perform_membership_inference(models, x, threshold_alpha):
    """
    Perform membership inference using reconstruction errors and a quantile regression model.
    """
    errors = compute_t_error(model, x, t=5)  # Example timestep
    thresholds = models(x)
    predictions = (errors <= thresholds).float()  # Membership decision
    return predictions

## Training Quantile Regressor

In [22]:
""" 
first write the code for getting normalized train_with_id data and then do the same for holdout_with_id then train two quantile regressor models, one on holdout error
and the other on train error, then if the error on the cp is less than holdout error then it is train point, or otherwise you can train the two quantile models and check
which distribution has the higher probability of containing the data point.
"""

' \nfirst write the code for getting normalized train_with_id data and then do the same for holdout_with_id then train two quantile regressor models, one on holdout error\nand the other on train error, then if the error on the cp is less than holdout error then it is train point, or otherwise you can train the two quantile models and check\nwhich distribution has the higher probability of containing the data point.\n'

### Loading & Normalizing Data

In [57]:
import pandas as pd
import torch
from tqdm import tqdm  # Import tqdm for progress tracking

def load_csv_as_tensor(file_path, nrows=None):
    df = pd.read_csv(file_path, header=None, nrows=nrows)  # Read without headers
    df = df.iloc[1:, 2:]  # Drop first two columns
    df = df.apply(pd.to_numeric, errors='coerce')  # Convert all values to numeric
    df = df.fillna(0)  # Replace NaNs with 0 (or use another strategy)
    return torch.tensor(df.values, dtype=torch.float32)  # Convert to tensor

# Load and process the first CSV file
file1 = "/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1/train_with_id.csv"
tensor1 = load_csv_as_tensor(file1)

# Load and process the second CSV file (only first 20,000 rows)
file2 = "/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1/holdout_with_id.csv"  
tensor2 = load_csv_as_tensor(file2, nrows=20000)

In [58]:
print(tensor1.shape, tensor2.shape)

torch.Size([20000, 8]) torch.Size([19999, 8])


In [59]:
tensor1_nor, transformers_train = quantile_normalize_sklearn(tensor1.cpu().numpy())
tensor2_nor, transformers_holdout = quantile_normalize_sklearn(tensor1.cpu().numpy())

final_tensor1_nor = torch.tensor(tensor1_nor,dtype = torch.float32).to(device)
final_tensor2_nor = torch.tensor(tensor2_nor,dtype = torch.float32).to(device)

In [60]:
print(final_tensor1_nor.shape)

torch.Size([20000, 8])


In [69]:
error_train = compute_t_error(model, final_tensor1_nor, t= 100)
error_holdout = compute_t_error(model,final_tensor2_nor,t = 100)

In [99]:
# training first quantile regressor on train_with_id

input_dim = 8
X_train  = final_tensor1_nor
y_train = error_train.to(device)
train_dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

alpha = 0.1        # target quantile level (e.g., 0.1 gives the 10% quantile threshold)
num_epochs = 30   # epochs for training
quantile_regressor_train = QuantileRegressor(input_dim, hidden_dim=64).to(device)
optimizer = optim.Adam(quantile_regressor_train.parameters(), lr=1e-3)
print("Training Single Quantile Regressor on All Training Data")
train_model(quantile_regressor_train, dataloader, optimizer, alpha, num_epochs=num_epochs)

Training Single Quantile Regressor on All Training Data
Epoch   0: Loss = 2.1484
Epoch  20: Loss = 2.1484


In [100]:
# training quantile regressor on holdout_with_id

X_train = final_tensor2_nor
y_train = error_holdout.to(device)
train_dataset = TensorDataset(X_train, y_train)
dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

alpha = 0.9        # target quantile level (e.g., 0.1 gives the 10% quantile threshold)
num_epochs = 30   # epochs for training
quantile_regressor_holdout = QuantileRegressor(input_dim, hidden_dim=64).to(device)
optimizer = optim.Adam(quantile_regressor_holdout.parameters(), lr=1e-3)
print("Training Single Quantile Regressor on Holdout Data")
train_model(quantile_regressor_holdout, dataloader, optimizer, alpha, num_epochs=num_epochs)

Training Single Quantile Regressor on Holdout Data
Epoch   0: Loss = 19.4127
Epoch  20: Loss = 19.4127


In [107]:
test_data= load_csv_as_tensor("/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1/challenge_with_id.csv").to(device='cuda')
type(test_data)

torch.Tensor

In [108]:
test_data_nor_train = np.empty((test_data.shape[0], 0))
test_data = test_data.cpu().numpy()
for i in range(8):
    transformed_column = transformers_train[i].transform(test_data[:,i].reshape(-1,1))
    test_data_nor_train = np.concatenate((test_data_nor_train, transformed_column), axis=1)

final_test_data_nor_train = torch.tensor(test_data_nor_train,dtype = torch.float32).to(device)

test_data_nor_holdout = np.empty((test_data.shape[0], 0))
test_data = test_data
for i in range(8):
    transformed_column = transformers_holdout[i].transform(test_data[:,i].reshape(-1,1))
    test_data_nor_holdout = np.concatenate((test_data_nor_holdout, transformed_column), axis=1)

final_test_data_nor_holdout = torch.tensor(test_data_nor_holdout,dtype = torch.float32).to(device)

In [109]:
print(final_test_data_nor_train.shape, final_test_data_nor_holdout.shape)

torch.Size([200, 8]) torch.Size([200, 8])


In [116]:
pred_error_train = quantile_regressor_train(final_test_data_nor_train)
pred_error_holdout = quantile_regressor_holdout(final_test_data_nor_holdout)
actual_error = compute_t_error(model,final_test_data_nor_train,t = 100)

print(pred_error_train.shape,pred_error_train.shape,actual_error.shape)

torch.Size([200, 1]) torch.Size([200, 1]) torch.Size([200])


In [None]:

print("\nControl Point Predictions:")
print("Training model prediction:", pred_error_train.item())
print("Holdout model prediction: ", pred_error_holdout.item())

if pred_error_train.item() < pred_error_holdout.item():
    print("Control point is considered a training point (lower error).")
else:
    print("Control point is not a training point (higher holdout error).")

In [118]:
challenge_labels = pd.read_csv("/home/vidit/Desktop/SaTML/MIDSTModels/starter_kits/tabddpm_white_box/train/tabddpm_1/challenge_label.csv")
challenge_ground_truth = challenge_labels["is_train"].values

(200,)

In [122]:
predictions = []
for i in range(test_data.shape[0]):
    train_err = pred_error_train[i].item()
    holdout_err = pred_error_holdout[i].item()
    actual_err = actual_error[i].item()
    classification = 0 if (abs(actual_err - train_err) > abs(actual_err - holdout_err)) else 1
    predictions.append(classification)

print(predictions)

[0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1]


In [123]:
from sklearn.metrics import accuracy_score, roc_curve

# Example inputs (replace with actual data)
y_true = challenge_ground_truth  # Binary labels (0 or 1)
y_pred = predictions  # Predicted scores (continuous)

# Compute Accuracy
y_pred_labels = (y_pred >= 0.5).astype(int)  # Convert scores to binary labels
accuracy = accuracy_score(y_true, y_pred_labels)

# Compute TPR at the closest FPR to 10%
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
closest_idx = np.argmin(np.abs(fpr - 0.1))  # Find the index where FPR is closest to 10%
tpr_at_10_fpr = tpr[closest_idx]

# Print Results
print(f"Accuracy: {accuracy:.4f}")
print(f"TPR at 10% FPR: {tpr_at_10_fpr:.4f}")

Accuracy: 0.5750
TPR at 10% FPR: 0.0000


In [None]:

    
    # # =============================================================================
    # # Inference: For each test sample, use the trained model to predict a threshold.
    # # Compare the actual score to the threshold to decide membership:
    # # If actual score <= predicted threshold, output 1 (member); else, output 0 (nonmember).
    # # =============================================================================
    # output_list = []
    # for sample in test_tensor:
    #     x_sample = sample[:input_dim].unsqueeze(0)  # shape: [1, input_dim]
    #     actual_score = sample[input_dim].item()       # the observed score for this sample
    #     model.eval()
    #     with torch.no_grad():
    #         pred_threshold = model(x_sample).item()
    #     decision = 1 if actual_score <= pred_threshold else 0
    #     output_list.append(decision)
    
    # # Create a tensor of output predictions.
    # output = torch.tensor(output_list)
    
    # print("\nFinal membership predictions (1 = member, 0 = nonmember):")
    # print(output)
