In [1]:
import sys, math
import numpy as np
import pandas as pd

import tensorflow as tf
print("Python:", sys.version)
print("TF:", tf.__version__)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasSGDOptimizer
from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy

Python: 3.11.11 | packaged by conda-forge | (main, Dec  5 2024, 08:47:03) [Clang 18.1.8 ]
TF: 2.15.0


In [2]:
CSV_V2 = "../data/processed/athletes_v2_with_total.csv"
RANDOM_STATE = 42
TEST_SIZE = 0.20

df = pd.read_csv(CSV_V2)

# Features & target
X = df[['age','height','weight']].astype(float)
y = df['total_lift'].astype(float)

# Drop rows with missing values in features or target
mask = X.notna().all(axis=1) & y.notna()
X, y = X[mask], y[mask]

# Train/test split
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Standardize features
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)

# Make everything float32 & strictly finite
X_tr_s = X_tr_s.astype("float32"); X_te_s = X_te_s.astype("float32")
y_tr    = y_tr.astype("float32");   y_te   = y_te.astype("float32")

tr_ok = np.isfinite(X_tr_s).all(axis=1)
te_ok = np.isfinite(X_te_s).all(axis=1)
X_tr_s, y_tr = X_tr_s[tr_ok], y_tr.iloc[tr_ok].to_numpy()
X_te_s, y_te = X_te_s[te_ok], y_te.iloc[te_ok].to_numpy()

print("Shapes:", X_tr_s.shape, X_te_s.shape)


Shapes: (24665, 3) (6167, 3)


In [3]:
from tensorflow.keras.optimizers.legacy import SGD

tf.keras.utils.set_random_seed(42)

non_dp = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(3,)),
    tf.keras.layers.Dense(
        16, activation='relu',
        kernel_initializer='he_normal',
        kernel_regularizer=tf.keras.regularizers.l2(1e-4)
    ),
    tf.keras.layers.Dense(1),
])

non_dp.compile(
    optimizer=SGD(learning_rate=0.001, momentum=0.9, nesterov=True, clipnorm=1.0),
    loss=tf.keras.losses.Huber(delta=50.0),
)

es = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=8, restore_best_weights=True)
non_dp.fit(X_tr_s, y_tr, epochs=200, batch_size=128, verbose=0, callbacks=[es])

pred_non_dp = non_dp.predict(X_te_s, verbose=0).ravel()
mae_ndp  = mean_absolute_error(y_te, pred_non_dp)
rmse_ndp = math.sqrt(mean_squared_error(y_te, pred_non_dp))
r2_ndp   = r2_score(y_te, pred_non_dp)
print(f"[v2 non-DP Keras] MAE={mae_ndp:.2f}  RMSE={rmse_ndp:.2f}  R2={r2_ndp:.3f}")


[v2 non-DP Keras] MAE=151.04  RMSE=195.47  R2=0.511


In [4]:
# DP Training
from tensorflow_privacy.privacy.optimizers.dp_optimizer_keras import DPKerasAdamOptimizer
from tensorflow.keras.losses import Reduction

# Hyperparams
batch_size       = 64
epochs           = 30        
l2_norm_clip     = 1.0
noise_multiplier = 0.6
learning_rate    = 5e-4

train_ds = (
    tf.data.Dataset.from_tensor_slices((X_tr_s, y_tr))
      .shuffle(len(X_tr_s), seed=42, reshuffle_each_iteration=True)
      .batch(batch_size, drop_remainder=True)
      .cache()
      .prefetch(tf.data.AUTOTUNE)
)

dp_opt = DPKerasAdamOptimizer(
    l2_norm_clip=l2_norm_clip,
    noise_multiplier=noise_multiplier,
    num_microbatches=batch_size,
    learning_rate=learning_rate,
)

dp = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(3,)),
    tf.keras.layers.Dense(16, activation='relu', kernel_initializer='he_normal'),
    tf.keras.layers.Dense(1),
])

dp.compile(
    optimizer=dp_opt,
    loss=tf.keras.losses.MeanSquaredError(reduction=Reduction.NONE),
    run_eagerly=False,
)

history = dp.fit(train_ds, epochs=epochs, verbose=1)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [5]:
# Evaluate DP Model
import numpy as np, math
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

for name in ["dp", "X_te_s", "y_te"]:
    assert name in globals(), f"Missing variable: {name} (run the previous cells that define it)"

pred_dp = dp.predict(X_te_s, batch_size=1024, verbose=0).ravel()

good = np.isfinite(pred_dp)
if not good.all():
    pred_dp = pred_dp[good]
    y_te_eval = y_te[good]
else:
    y_te_eval = y_te

mae_dp  = mean_absolute_error(y_te_eval, pred_dp)
rmse_dp = math.sqrt(mean_squared_error(y_te_eval, pred_dp))
r2_dp   = r2_score(y_te_eval, pred_dp)
print(f"[v2 DP] MAE={mae_dp:.2f}  RMSE={rmse_dp:.2f}  R2={r2_dp:.3f}")

[v2 DP] MAE=399.94  RMSE=469.34  R2=-1.816


In [6]:
# Privacy Accounting
try:
    from tensorflow_privacy.privacy.analysis.compute_dp_sgd_privacy_lib import compute_dp_sgd_privacy
except Exception:
    
    from tensorflow_privacy.privacy.analysis import compute_dp_sgd_privacy as _mod
    compute_dp_sgd_privacy = _mod.compute_dp_sgd_privacy

for name in ["X_tr_s", "batch_size", "epochs", "noise_multiplier"]:
    assert name in globals(), f"Missing variable: {name} (run the DP setup cell)"

delta = 1e-5
n = len(X_tr_s)

eps, opt_order = compute_dp_sgd_privacy(
    n=n,
    batch_size=batch_size,
    noise_multiplier=noise_multiplier,
    epochs=epochs,
    delta=delta,
)
print(f"DP-SGD privacy: ε={eps:.2f}, δ={delta} (opt_order={opt_order})")




DP-SGD privacy: ε=6.98, δ=1e-05 (opt_order=3.0)


In [7]:
# Saving comparison to a CSV for slides later
import pandas as pd
rows = []

if {"mae_ndp","rmse_ndp","r2_ndp"} <= set(globals()):
    rows.append(dict(model="v2 non-DP (Keras)", MAE=mae_ndp, RMSE=rmse_ndp, R2=r2_ndp,
                     dp=False, eps=None, delta=None))

rows.append(dict(model="v2 DP", MAE=mae_dp, RMSE=rmse_dp, R2=r2_dp,
                 dp=True, eps=eps, delta=delta))

cmp_df = pd.DataFrame(rows)
display(cmp_df)

out = "../data/processed/metrics_v2_dp_vs_nodp.csv"
cmp_df.to_csv(out, index=False)
print("Saved →", out)


Unnamed: 0,model,MAE,RMSE,R2,dp,eps,delta
0,v2 non-DP (Keras),151.041626,195.471615,0.511492,False,,
1,v2 DP,399.936523,469.341294,-1.816319,True,6.978008,1e-05


Saved → ../data/processed/metrics_v2_dp_vs_nodp.csv
