In [1]:
# Set the working directory to the parent directory
import sys
sys.path.append('..')
sys.dont_write_bytecode = True

# Import relevant custom libraries
from src.eda import data_info
from src.evaluation import ValidationEvaluation

# Import relevant libraries
import pandas as pd
import warnings
from IPython.display import display
import matplotlib.pyplot as plt
import math
import os
import glob
import numpy as np

# Config
pd.set_option('display.max_columns', None) # Ensure all columns are displayed
warnings.filterwarnings("ignore")

# Read relevant files
X_train = pd.read_feather("../data/processed/X_train.feather")
X_train_validate = pd.read_feather("../data/processed/X_train_validate.feather")

# Get data info
var_info = data_info(X_train)
all_cols = X_train.columns
real_cols = var_info[var_info["var_type"]=="numerical"]["var_name"].tolist()
binary_cols = var_info[var_info["var_type"]=="binary"]["var_name"].tolist()

# Read relevant files
X_validate = pd.read_feather("../data/processed/X_validate.feather")
y_validate = pd.read_feather("../data/processed/y_validate.feather")

# Initialize the validation evaluation
valeval = ValidationEvaluation(X_validate, y_validate, real_cols, binary_cols, all_cols, dp_sgd=True)
    
# Read the log file
log_path = "../logs/dpsgd_tune_log.txt"

# Extract the latest successful Bayesian versions
latest_successful_versions = valeval.extract_latest_successful_bayesian_versions(log_path)
print(latest_successful_versions)

# Evaluate the model performance
eval_results = valeval.evaluate_model_performance(latest_successful_versions)

{'202505161637': ('AUC', 1.0, 1e-05, datetime.datetime(2025, 5, 16, 18, 54, 18, 622859)), '202505070419': ('Precision', 1.0, 1e-05, datetime.datetime(2025, 5, 15, 20, 15, 45, 649085)), '202505170232': ('F1-Score', 1.0, 1e-05, datetime.datetime(2025, 5, 17, 5, 26, 3, 436569)), '202505170910': ('Recall', 1.0, 1e-05, datetime.datetime(2025, 5, 17, 15, 36, 10, 805811)), '202505161903': ('AUC', 3.0, 1e-05, datetime.datetime(2025, 5, 17, 1, 21, 4, 994784)), '202505170526': ('F1-Score', 3.0, 1e-05, datetime.datetime(2025, 5, 17, 7, 32, 36, 458159)), '202505150921': ('Precision', 3.0, 1e-05, datetime.datetime(2025, 5, 15, 20, 44, 23, 491416)), '202505150606': ('Recall', 3.0, 1e-05, datetime.datetime(2025, 5, 16, 0, 37, 31, 384408)), '202505150349': ('AUC', 5.0, 1e-05, datetime.datetime(2025, 5, 16, 1, 29, 3, 290500)), '202505170732': ('F1-Score', 5.0, 1e-05, datetime.datetime(2025, 5, 17, 9, 10, 19, 355577)), '202505151015': ('Precision', 5.0, 1e-05, datetime.datetime(2025, 5, 15, 21, 3, 33, 6

In [2]:
selected_versions = eval_results[eval_results['epsilon']>=0.5].sort_values(by=["tuned_by", "epsilon"], ascending=True)[['epsilon', 'tuned_by']].reset_index()
display(selected_versions)

Unnamed: 0,version,epsilon,tuned_by
0,202505070419,1.0,Precision
1,202505070955,3.0,Precision
2,202505071004,5.0,Precision
3,202505071236,1.0,Recall
4,202505071332,3.0,Recall
5,202505071358,5.0,Recall
6,202505071022,1.0,F1-Score
7,202505101353,3.0,F1-Score
8,202505101605,5.0,F1-Score
9,202505070326,1.0,AUC


In [3]:
def get_convergence_status(version, dpsgd=True):
    # Read result file
    folder = "../experiments/tracking"

    # Read the existing files with the same version prefix
    if dpsgd:
        pattern = os.path.join(folder, f"{version}_noise*.csv")
        for file in glob.glob(pattern):
            results_df = pd.read_csv(file)
    else:
        results_df = pd.read_csv(os.path.join(folder, f"{version}_baseline.csv"))

    # Check convergence
    loss_history = np.array(results_df["val_loss"].tolist())
    n = len(loss_history)
    tail_len = 20 #int(0.1 * n)

    # Ensure tail_len is at least 2 to compute slope
    if n < 20:
        raise ValueError("Too few points in the tail to compute slope.")

    # Compute the slope of the last 10% of the loss history
    y = loss_history[-tail_len:]
    x = np.arange(tail_len)

    # Fit a line to the last 10% of the loss history
    slope = np.polyfit(x, y, 1)[0]  # degree 1 polynomial fit

    rel_change = abs(y[-1] - y[0]) / max(abs(y[0]), 1e-8)  # avoid divide-by-zero

    return slope, rel_change

In [4]:
slope_rel_change = selected_versions["version"].apply(get_convergence_status)
selected_versions["slope"] = slope_rel_change.apply(lambda x: x[0])
selected_versions["rel_change"] = slope_rel_change.apply(lambda x: x[1])

In [5]:
selected_versions.sort_values(["slope"], ascending=True)

Unnamed: 0,version,epsilon,tuned_by,slope,rel_change
6,202505071022,1.0,F1-Score,-0.002061,0.005957
0,202505070419,1.0,Precision,-0.000787,0.0029
2,202505071004,5.0,Precision,-0.000672,0.01721
10,202505070348,3.0,AUC,-0.00062,0.008685
9,202505070326,1.0,AUC,-0.000596,0.005145
11,202505070407,5.0,AUC,-0.000565,0.021841
1,202505070955,3.0,Precision,-0.00043,0.003036
5,202505071358,5.0,Recall,-0.000308,0.00299
3,202505071236,1.0,Recall,-0.000277,0.003426
4,202505071332,3.0,Recall,-0.000234,0.004991


In [6]:
eval_results.reset_index().sort_values("version", ascending=True)["version"]

11    202504222233
8     202504230006
10    202504230052
15    202504242314
12    202504242353
13    202504250018
14    202504250026
3     202504250252
0     202504250426
1     202504250447
2     202504250457
7     202504250502
4     202504250552
6     202504250625
5     202504252301
9     202505042114
Name: version, dtype: object

In [2]:
# Initialize the validation evaluation
valeval = ValidationEvaluation(X_validate, y_validate, real_cols, binary_cols, all_cols, dp_sgd=False)
    
# Read the log file
log_path = "../logs/baseline_tune_log.txt"

# Extract the latest successful Bayesian versions
latest_successful_versions = valeval.extract_latest_successful_bayesian_versions(log_path)
print(latest_successful_versions)

# Evaluate the model performance
eval_results = valeval.evaluate_model_performance(latest_successful_versions)

{'202505080012': ('AUC', datetime.datetime(2025, 5, 13, 3, 21, 57, 186961)), '202505080154': ('Precision', datetime.datetime(2025, 5, 8, 2, 50, 26, 588021)), '202505112013': ('F1-Score', datetime.datetime(2025, 5, 13, 3, 5, 23, 711939)), '202505080250': ('Recall', datetime.datetime(2025, 5, 13, 2, 41, 53, 651435))}
Evaluating version 202505080012
Metric: AUC
Evaluating version 202505080154
Metric: Precision
Evaluating version 202505112013
Metric: F1-Score
Evaluating version 202505080250
Metric: Recall


In [16]:
selected_versions = eval_results.reset_index()
slope_rel_change = selected_versions["version"].apply(get_convergence_status, dpsgd=False)
selected_versions["slope"] = slope_rel_change.apply(lambda x: x[0])
selected_versions["rel_change"] = slope_rel_change.apply(lambda x: x[1])

In [17]:
selected_versions

Unnamed: 0,version,precision,recall,f1_score,auc,accuracy,hidden_dims,batch_size,dropout_rate,learning_rate,lam,gamma,threshold,tuned_by,slope,rel_change
0,202504180314,0.771379,0.191757,0.307158,0.699887,0.825958,[64],108.0,0.237561,0.070543,0.044082,0.084559,16.706953,Precision,-0.001706,0.003224
1,202504180327,0.432645,0.645119,0.517938,0.796319,0.758401,[64],123.0,0.378855,0.059096,0.0001,0.999,0.038421,Recall,-1.5e-05,0.010256
2,202504180259,0.496141,0.585683,0.537207,0.809764,0.79698,[64],64.0,0.352388,0.02,0.0001,0.999,0.045073,F1-Score,-6.6e-05,0.024831
3,202504180245,0.499449,0.589588,0.540788,0.812854,0.798551,[64],128.0,0.0,0.1,0.0001,0.999,0.039729,AUC,-0.000215,0.086768


In [3]:
from src.dp_utils import DPSGDSanitizer

san = DPSGDSanitizer(len(X_train), 64, 3, 500, 1e-5)
print(san.compute_noise_from_eps())

4.796703935700338


In [4]:
from src.dp_utils_poisson import DPSGDSanitizer

san = DPSGDSanitizer(len(X_train), 64, 3, 500, 1e-5)
print(san.compute_noise_from_eps())

2.468514516045322


In [3]:
import tensorflow as tf
import pickle
import os
os.chdir("/Users/trinhha/Documents/VU AMSTERDAM/STUDY/Thesis/Code/")
from src.models import AnomalyDetector

X_test = pd.read_feather("data/processed/X_test.feather")

for version in eval_results.index.tolist():
    print(version)
    # Load model and hyperparameters
    model = tf.keras.models.load_model(f"models/baseline/{version}")
    with open(f"hyperparams/baseline/{version}.pkl", "rb") as f:
        params = pickle.load(f)
    detector = AnomalyDetector(
                model=model,
                real_cols=real_cols,
                binary_cols=binary_cols,
                all_cols=all_cols,
                lam=params["lam"],
                gamma=params["gamma"],
            )
    # Compute scores
    scores, x_hat = detector._compute_anomaly_scores(X_test, test_set=True)

    # Save reconstructed data
    pd.DataFrame(x_hat, columns=all_cols).to_feather(f"experiments/predictions/baseline/{version}_recons.feather")

    # Detect
    y_pred = detector._detect(scores, params['threshold'])

    # Save predictions
    pd.DataFrame(y_pred, columns=["anomaly"]).to_feather(f"experiments/predictions/baseline/{version}_pred.feather")

202505080154
202505080250
202505112013
202505080012


In [1]:
import pandas as pd

pd.read_feather("../experiments/scores/posthoc_dp/202505282206_test.feather")

Unnamed: 0,score
0,1.078168
1,1.507936
2,5.899250
3,-4.176338
4,2.635825
...,...
11452,7.965775
11453,-0.024553
11454,-0.008538
11455,-9.573600


In [14]:
import sys
sys.path.append('..')
sys.dont_write_bytecode = True

# Import relevant custom libraries
from src.eda import data_info
from src.evaluation import ValidationEvaluation
from src.dp_utils import DPSGDSanitizer

import pandas as pd

# Read relevant files
x_train = pd.read_feather("../data/processed/X_train.feather")
n = len(x_train)
DPSGDSanitizer(n, 128, 50, 1000, 1e-5).compute_noise_from_eps()

1.2490533073166674

In [4]:
import numpy as np

np.random.choice([1, 2, 3, 5, 6], size=(2, 2), replace=False)

array([[5, 3],
       [6, 1]])

In [11]:
import tensorflow as tf 
batch_size = 150
max_epochs = 300
train_data = []
while len(train_data) < max_epochs:
    # Simulate training process
    indices = np.random.choice(len(x_train), size=(batch_size, min(len(x_train) // batch_size, max_epochs - len(train_data))), replace=False)
    for i in indices:
        # Convert the sampled batch to tf.Tensor
        x_batch_np = x_train.sample(frac=1).reset_index(drop=True).iloc[i].values.astype(np.float32)
        x_batch = tf.convert_to_tensor(x_batch_np)
        tf.print(x_batch.shape)
        train_data.append(x_batch)

TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShape([85, 64])
TensorShap

In [10]:
train_data

[<tf.Tensor: shape=(85, 64), dtype=float32, numpy=
 array([[4.41295564e-01, 7.98448801e-01, 1.84284016e-01, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [2.41240293e-01, 7.58877754e-01, 1.88946858e-01, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [1.18015595e-01, 7.65066683e-01, 1.40953249e-10, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        ...,
        [4.19709027e-01, 8.94207895e-01, 6.72770994e-08, ...,
         1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.55322164e-01, 7.70118117e-01, 1.62231534e-10, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [3.17728251e-01, 8.95207644e-01, 2.99471259e-01, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00]], dtype=float32)>,
 <tf.Tensor: shape=(85, 64), dtype=float32, numpy=
 array([[3.1696302e-01, 7.9090852e-01, 1.8428725e-01, ..., 0.0000000e+00,
         1.0000000e+00, 0.0000000e+00],
        [4.2885509e-01, 8.2786500

In [1]:
import pickle

with open("../hyperparams/dpsgd/202506070611.pkl", "rb") as f:
    params = pickle.load(f)

In [3]:
params['threshold'] = 0.0705775535106659
params['q'] = 0.74

In [4]:
with open("../hyperparams/dpsgd/202506070611.pkl", "wb") as f:
    pickle.dump(params, f)

In [2]:
params

{'hidden_dims': [64],
 'batch_size': 121,
 'dropout_rate': 0.0831694884767021,
 'learning_rate': 0.005602586183223,
 'lam': 0.0001,
 'gamma': 0.999,
 'l2norm_pct': 76,
 'max_epochs': 150,
 'threshold': 0.07274726003408431,
 'q': 0.76}