In [1]:
# Set the working directory to the parent directory
import sys
sys.path.append('..')
sys.dont_write_bytecode = True

# Import relevant custom libraries
from src.eda import data_info
from src.evaluation import ValidationEvaluation

# Import relevant libraries
import pandas as pd
import warnings
from IPython.display import display
import matplotlib.pyplot as plt
import math
import os
import glob
import numpy as np

# Notebook-wide display / warning configuration
pd.set_option('display.max_columns', None)  # never truncate wide frames when displaying
warnings.filterwarnings("ignore")  # NOTE: silences every warning for the rest of the session

# Load the processed feature/label splits used in this cell
X_train = pd.read_feather("../data/processed/X_train.feather")
X_train_validate = pd.read_feather("../data/processed/X_train_validate.feather")
X_validate = pd.read_feather("../data/processed/X_validate.feather")
y_validate = pd.read_feather("../data/processed/y_validate.feather")

# Derive the column groups from the training features
var_info = data_info(X_train)
all_cols = X_train.columns
is_numerical = var_info["var_type"] == "numerical"
is_binary = var_info["var_type"] == "binary"
real_cols = var_info.loc[is_numerical, "var_name"].tolist()
binary_cols = var_info.loc[is_binary, "var_name"].tolist()

# Validation evaluator, configured for the DP-SGD runs (dp_sgd=True)
valeval = ValidationEvaluation(
    X_validate,
    y_validate,
    real_cols,
    binary_cols,
    all_cols,
    dp_sgd=True,
)

# Pick the latest successful Bayesian-tuning versions out of the DP-SGD tuning log
log_path = "../logs/dpsgd_tune_log.txt"
latest_successful_versions = valeval.extract_latest_successful_bayesian_versions(log_path)
print(latest_successful_versions)

# Evaluate those versions on the validation split
eval_results = valeval.evaluate_model_performance(latest_successful_versions)

{'202505161637': ('AUC', 1.0, 1e-05, datetime.datetime(2025, 5, 16, 18, 54, 18, 622859)), '202505070419': ('Precision', 1.0, 1e-05, datetime.datetime(2025, 5, 15, 20, 15, 45, 649085)), '202505170232': ('F1-Score', 1.0, 1e-05, datetime.datetime(2025, 5, 17, 5, 26, 3, 436569)), '202505170910': ('Recall', 1.0, 1e-05, datetime.datetime(2025, 5, 17, 15, 36, 10, 805811)), '202505161903': ('AUC', 3.0, 1e-05, datetime.datetime(2025, 5, 17, 1, 21, 4, 994784)), '202505170526': ('F1-Score', 3.0, 1e-05, datetime.datetime(2025, 5, 17, 7, 32, 36, 458159)), '202505150921': ('Precision', 3.0, 1e-05, datetime.datetime(2025, 5, 15, 20, 44, 23, 491416)), '202505150606': ('Recall', 3.0, 1e-05, datetime.datetime(2025, 5, 16, 0, 37, 31, 384408)), '202505150349': ('AUC', 5.0, 1e-05, datetime.datetime(2025, 5, 16, 1, 29, 3, 290500)), '202505170732': ('F1-Score', 5.0, 1e-05, datetime.datetime(2025, 5, 17, 9, 10, 19, 355577)), '202505151015': ('Precision', 5.0, 1e-05, datetime.datetime(2025, 5, 15, 21, 3, 33, 6

In [4]:
from src.dp_utils_poisson import DPSGDSanitizer

# Ask the sanitizer for the noise level that matches the configured privacy target.
# NOTE(review): the positional arguments are presumably
# (n_samples, batch_size, target_epsilon, epochs, delta) - confirm against DPSGDSanitizer.
san = DPSGDSanitizer(len(X_train), 64, 3, 500, 1e-5)
noise_from_eps = san.compute_noise_from_eps()
print(noise_from_eps)

2.468514516045322


In [3]:
import tensorflow as tf
import pickle
import os
os.chdir("/Users/trinhha/Documents/VU AMSTERDAM/STUDY/Thesis/Code/")
from src.models import AnomalyDetector

# Score the test split with every version listed in `eval_results` and store
# both the reconstructions and the anomaly predictions.
# NOTE(review): `eval_results` is produced by an evaluator created with dp_sgd=True,
# while the artifacts below are read from the `baseline/` folders - confirm that the
# intended `eval_results` is in scope before running this cell.
X_test = pd.read_feather("data/processed/X_test.feather")

versions = eval_results.index.tolist()
for version in versions:
    print(version)

    # Restore the trained network and the hyperparameters saved for this version
    model = tf.keras.models.load_model(f"models/baseline/{version}")
    with open(f"hyperparams/baseline/{version}.pkl", "rb") as params_file:
        params = pickle.load(params_file)

    detector_kwargs = dict(
        model=model,
        real_cols=real_cols,
        binary_cols=binary_cols,
        all_cols=all_cols,
        lam=params["lam"],
        gamma=params["gamma"],
    )
    detector = AnomalyDetector(**detector_kwargs)

    # With test_set=True the scorer hands back the scores and the reconstruction
    scores, x_hat = detector._compute_anomaly_scores(X_test, test_set=True)

    # Persist the reconstruction first, then the thresholded predictions
    output_prefix = f"experiments/predictions/baseline/{version}"
    reconstruction = pd.DataFrame(x_hat, columns=all_cols)
    reconstruction.to_feather(f"{output_prefix}_recons.feather")

    y_pred = detector._detect(scores, params['threshold'])
    predictions = pd.DataFrame(y_pred, columns=["anomaly"])
    predictions.to_feather(f"{output_prefix}_pred.feather")

202505080154
202505080250
202505112013
202505080012


In [1]:
import pickle

# Read the stored hyperparameters of DP-SGD version 202506070611
with open("../hyperparams/dpsgd/202506070611.pkl", "rb") as params_file:
    params = pickle.load(params_file)

In [2]:
# Manually overwrite the stored `threshold` and `q` entries
params.update({'threshold': 0.06004473716020584, 'q': 0.7})

In [3]:
# Write the edited hyperparameters back to the same pickle (overwrites the file)
with open("../hyperparams/dpsgd/202506070611.pkl", "wb") as params_file:
    pickle.dump(params, params_file)

In [4]:
import pickle
import tensorflow as tf
import pandas as pd
import sys
sys.path.append('..')
sys.dont_write_bytecode = True

# Import relevant custom libraries
from src.eda import data_info
from src.evaluation import AnomalyDetector

import os
os.chdir("/Users/trinhha/Documents/VU AMSTERDAM/STUDY/Thesis/Code/")

# Held-out test split
X_test = pd.read_feather("data/processed/X_test.feather")
y_test = pd.read_feather("data/processed/y_test.feather")

# Column groups, derived from the test features
var_info = data_info(X_test)
all_cols = X_test.columns
numerical_mask = var_info["var_type"] == "numerical"
binary_mask = var_info["var_type"] == "binary"
real_cols = var_info.loc[numerical_mask, "var_name"].tolist()
binary_cols = var_info.loc[binary_mask, "var_name"].tolist()

# Stored hyperparameters and trained model of DP-SGD version 202506070611
with open("hyperparams/dpsgd/202506070611.pkl", "rb") as params_file:
    params = pickle.load(params_file)
model = tf.keras.models.load_model("models/dpsgd/202506070611")



In [5]:
# Wrap the loaded model in a detector; lam/gamma come from the stored
# hyperparameters, target_epsilon/delta are passed as fixed literals.
detector = AnomalyDetector(
    model=model,
    real_cols=real_cols,
    binary_cols=binary_cols,
    all_cols=all_cols,
    lam=params["lam"],
    gamma=params["gamma"],
    target_epsilon=3,
    delta=1e-5,
)

In [6]:
# Score the test split, apply the stored threshold, then evaluate against y_test
scores = detector._compute_anomaly_scores(X_test)
threshold = params["threshold"]
y_pred = detector._detect(scores, threshold)
perf = detector._evaluate(y_pred, y_test, scores)

In [7]:
# Display the test-set metrics held in `perf`
perf

{'accuracy': 0.7446102819237148,
 'precision': 0.41311379006174653,
 'recall': 0.6017130620985011,
 'f1_score': 0.4898884239888424,
 'auc': 0.7593880854671883}

In [1]:
import os
os.chdir("/Users/trinhha/Documents/VU AMSTERDAM/STUDY/Thesis/Code/")
import re
from datetime import datetime
import pandas as pd
import tensorflow as tf
import pickle
from src.models import AutoencoderTrainer, AnomalyDetector
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from math import pi
from src.dp_utils import *
from src.eda import data_info
from tqdm import tqdm
import random

class StatisticalEval():
    """Retrain the selected baseline and DP-SGD autoencoders under a fixed seed.

    The constructor loads the processed data splits from ``data/processed`` and
    derives the column groups from the test features. ``final_eval`` looks up the
    versions selected by a given tuning metric, retrains each of them via
    ``_single_eval`` and saves the result under ``models/<model_type>/<version>_final``.
    All paths are relative to the current working directory.
    """

    def __init__(self):
        """Load the train / train-validation / test splits and the column metadata."""

        # Training features
        self.X_train = pd.read_feather("data/processed/X_train.feather")

        # Train-validation features (second argument of trainer.train)
        self.X_train_val = pd.read_feather("data/processed/X_train_validate.feather")

        # Test features and labels
        self.X_test = pd.read_feather("data/processed/X_test.feather")
        self.y_test = pd.read_feather("data/processed/y_test.feather")

        # Column groups, derived from the test features via data_info
        self.var_info = data_info(self.X_test)
        self.all_cols = self.X_test.columns
        self.real_cols = self.var_info[self.var_info["var_type"] == "numerical"]["var_name"].tolist()
        self.binary_cols = self.var_info[self.var_info["var_type"] == "binary"]["var_name"].tolist()

        # Display names of the reported metrics
        self.metric_labels = {
            "precision": "Precision",
            "recall": "Recall",
            "f1_score": "F1-Score",
            "auc": "AUC"
        }

    def _single_eval(self, model_type, version, epsilon, delta, seed=None):
        """Train one autoencoder with the hyperparameters stored for ``version``.

        Args:
            model_type: Sub-folder of ``hyperparams/`` to read from; DP-SGD training
                is switched on only when this equals ``"dpsgd"``.
            version: Version id; selects ``hyperparams/<model_type>/<version>.pkl``
                and is forwarded to the trainer.
            epsilon: Forwarded to the trainer as ``target_epsilon``.
            delta: Forwarded to the trainer as ``delta``.
            seed: Passed unchanged to the TensorFlow, NumPy and ``random`` seeders.

        Returns:
            Whatever ``AutoencoderTrainer.train`` returns (``final_eval`` calls
            ``.save`` on it).
        """
        # NOTE: pickle executes arbitrary code on load - only read trusted project files.
        with open(f"hyperparams/{model_type}/{version}.pkl", "rb") as f:
            params = pickle.load(f)

        # Seed every random number generator in use before the trainer is created
        tf.random.set_seed(seed)
        np.random.seed(seed)
        random.seed(seed)

        # 'threshold' and 'q' are stored alongside the training hyperparameters
        # but are not passed to the trainer.
        trainer_params = {
            key: value for key, value in params.items() if key not in ('threshold', 'q')
        }

        trainer = AutoencoderTrainer(
            input_dim=self.X_train.shape[1],
            real_cols=self.real_cols,
            binary_cols=self.binary_cols,
            all_cols=self.all_cols,
            activation='relu',
            patience_limit=10,
            verbose=False,
            dp_sgd=(model_type == "dpsgd"),
            post_hoc=False,
            target_epsilon=epsilon,
            delta=delta,
            version=version,
            save_tracking=True,
            raise_convergence_error=True,
            **trainer_params
        )
        model = trainer.train(self.X_train, self.X_train_val)

        return model

    def final_eval(self, metric_used, seed):
        """Retrain and save every model version that was tuned by ``metric_used``.

        Args:
            metric_used: Value matched exactly against the ``tuned_by`` column of
                the two validation summaries in ``experiments/perf_summary``.
            seed: Seed forwarded to ``_single_eval`` for every retraining run.
        """
        # Select the rows with a plain boolean mask: building a query string from
        # `metric_used` would break (or change meaning) if it contained a quote.
        baseline = pd.read_csv("experiments/perf_summary/baseline_val_results.csv")
        baseline_model = baseline[baseline["tuned_by"] == metric_used]
        dpsgd = pd.read_csv("experiments/perf_summary/dpsgd_val_results.csv")
        dpsgd_models = dpsgd[dpsgd["tuned_by"] == metric_used]

        # Retrain the baseline version(s); epsilon and delta are passed as 0
        for version in baseline_model["version"].tolist():
            model = self._single_eval("baseline", version, 0, 0, seed)
            model.save(f"models/baseline/{version}_final")

        # Retrain the DP-SGD version(s) with their own privacy parameters.
        # Column-wise iteration keeps each value in its column's own type.
        dpsgd_runs = zip(
            dpsgd_models["version"].tolist(),
            dpsgd_models["epsilon"].tolist(),
            dpsgd_models["delta"].tolist(),
        )
        for version, epsilon, delta in dpsgd_runs:
            model = self._single_eval("dpsgd", version, epsilon, delta, seed)
            model.save(f"models/dpsgd/{version}_final")

In [None]:
# Display names for the metrics to be plotted
metric_labels = {
    "precision": "Precision",
    "recall": "Recall",
    "f1_score": "F1-Score",
    "auc": "AUC",
    "fidelity": "Fidelity",
}
metric_used = "AUC"

# Baseline: version(s) selected on validation by `metric_used`, with their test metrics.
# A boolean mask is used instead of a string-built query so the metric name is
# compared literally.
baseline = pd.read_csv("experiments/perf_summary/baseline_val_results.csv")
baseline_model = baseline.loc[baseline["tuned_by"] == metric_used, "version"].astype(str).tolist()
baseline_test_perf = pd.read_csv("results/metrics/baseline.csv")
baseline_test_perf["version"] = baseline_test_perf["version"].astype(str)
baseline_best = baseline_test_perf[baseline_test_perf["version"].isin(baseline_model)]
display(baseline_best)

# DP-SGD: same selection, test metrics ordered by increasing eps
dpsgd = pd.read_csv("experiments/perf_summary/dpsgd_val_results.csv")
dpsgd_models = dpsgd.loc[dpsgd["tuned_by"] == metric_used, "version"].astype(str).tolist()
dpsgd_test_perf = pd.read_csv("results/metrics/dpsgd.csv")
dpsgd_test_perf["version"] = dpsgd_test_perf["version"].astype(str)
dpsgd_best = dpsgd_test_perf[dpsgd_test_perf["version"].isin(dpsgd_models)].sort_values(by="eps", ascending=True)
display(dpsgd_best)

# Map a display label to the per-seed results file of each selected model.
# NOTE: every baseline version maps to the same "Baseline" key, so only the last one is kept.
model_list = {"Baseline": f"baseline/{m}" for m in baseline_best["version"].tolist()}
dpsgd_by_eps_desc = dpsgd_best.sort_values("eps", ascending=False)
model_list.update({
    r"$\varepsilon=$" + f"{eps:.0f}": f"dpsgd/{version}"
    for eps, version in zip(dpsgd_by_eps_desc["eps"], dpsgd_by_eps_desc["version"])
})

# Read every per-seed results file once; all models are truncated to the shortest run
raw_perf = {
    label: pd.read_csv(f"results/stats_eval/{rel_path}.csv")
    for label, rel_path in model_list.items()
}
min_len = min(len(frame) for frame in raw_perf.values())

# Seed values, one per results row (aligned on the row index below)
seeds = pd.read_csv("results/stats_eval/seeds.txt", header=None).rename(columns={0:"seed"})
# NOTE(review): the median position is derived from the number of seeds, not from min_len
median_ind = len(seeds) // 2

# Collect the per-model frames and concatenate once at the end
perf_frames = []
for key, value in model_list.items():
    print("Model:", key)
    perf = raw_perf[key][:min_len] * 100  # scale the metrics to percentages
    perf.insert(0, "Model", key)
    perf["seed"] = seeds["seed"]
    # Row sitting at the median position when this model's runs are ordered by AUC
    display(perf.sort_values(by="AUC").iloc[median_ind:median_ind + 1])
    perf_frames.append(perf)

perf_stats = pd.concat(perf_frames, ignore_index=True)

# Seed whose AUC, averaged over all models, sits at the median position
mean_by_seed = perf_stats.drop(columns=["Model"]).groupby("seed").mean().sort_values(by="AUC")
median_seed = mean_by_seed.index[median_ind]
print(median_seed)

Unnamed: 0,accuracy,precision,recall,f1_score,auc,version,timestamp
24,0.780134,0.472909,0.687794,0.560461,0.81813,202505080012,2025-06-10 00:25:54


Unnamed: 0,accuracy,precision,recall,f1_score,auc,version,eps,delta,timestamp
108,0.742254,0.411054,0.611563,0.491651,0.7555,202506071334,1.0,1e-05,2025-06-10 03:54:53
107,0.753775,0.425643,0.595717,0.49652,0.764574,202506070611,3.0,1e-05,2025-06-11 01:29:12
106,0.754473,0.430076,0.62955,0.511038,0.788205,202506070329,5.0,1e-05,2025-06-11 04:26:30


Model: Baseline


Unnamed: 0,Model,Precision,Recall,F1-Score,AUC,seed
50,Baseline,47.512365,69.93576,56.583507,82.496151,140891


Model: $\varepsilon=$5


Unnamed: 0,Model,Precision,Recall,F1-Score,AUC,Fidelity,seed
27,$\varepsilon=$5,43.729997,64.368308,52.079002,79.178241,84.289081,361524


Model: $\varepsilon=$3


Unnamed: 0,Model,Precision,Recall,F1-Score,AUC,Fidelity,seed
50,$\varepsilon=$3,44.146855,62.826552,51.855779,78.095394,84.533473,140891


Model: $\varepsilon=$1


Unnamed: 0,Model,Precision,Recall,F1-Score,AUC,Fidelity,seed
35,$\varepsilon=$1,41.344196,60.856531,49.237699,75.384653,81.984813,58224


943413


In [3]:
# Build the evaluator, then retrain and save the AUC-selected models with the median seed.
# NOTE(review): the variable name `eval` shadows Python's built-in `eval`.
eval = StatisticalEval()
final_seed = int(median_seed)
eval.final_eval(metric_used="AUC", seed=final_seed)

INFO:tensorflow:Assets written to: models/baseline/202505080012_final/assets


INFO:tensorflow:Assets written to: models/baseline/202505080012_final/assets






INFO:tensorflow:Assets written to: models/dpsgd/202506071334_final/assets


INFO:tensorflow:Assets written to: models/dpsgd/202506071334_final/assets






INFO:tensorflow:Assets written to: models/dpsgd/202506070611_final/assets


INFO:tensorflow:Assets written to: models/dpsgd/202506070611_final/assets






INFO:tensorflow:Assets written to: models/dpsgd/202506070329_final/assets


INFO:tensorflow:Assets written to: models/dpsgd/202506070329_final/assets


In [10]:
# Top the existing seed list up to 100 unique values. Each round reseeds the
# generator with an increasing counter so the added seeds are reproducible.
exist_seeds = seeds["seed"].tolist()
seed_no = 1
while len(exist_seeds) < 100:
    random.seed(seed_no)
    n_missing = 100 - len(exist_seeds)
    candidates = random.sample(range(1000000), n_missing)
    already_used = set(exist_seeds)
    # Keep only candidates that are not in the list yet, preserving their order
    exist_seeds.extend([cand for cand in candidates if cand not in already_used])
    seed_no += 1

In [17]:
# Store the extended seed list, one seed per line
with open("results/stats_eval/seeds_new.txt", "w") as seed_file:
    seed_file.writelines(f"{seed}\n" for seed in exist_seeds)