In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
import time
import matplotlib.pyplot as plt

# Location of the raw data file. It is read with header=None, so the column
# names are supplied explicitly below.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
cols = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

# " ?" (with the leading space) is parsed as a missing value; every row that
# contains at least one missing value is then removed from df in place.
df = pd.read_csv(url, header=None, names=cols, na_values=" ?")
df.dropna(axis=0, how="any", inplace=True)

# Encode categoricals
# Every object-dtype column is replaced by integer codes from a LabelEncoder
# fitted on that column alone (presumably this includes the string-valued
# "income" target -- verify against the loaded dtypes).
categorical = df.select_dtypes(include='object').columns
for column_name in categorical:
    df[column_name] = LabelEncoder().fit_transform(df[column_name])

# Target vector and feature matrix
y = df["income"]
X = df.drop(columns="income")

# Normalize
# The scaler is fitted on the training rows only, so that test-set statistics do
# not leak into the features the models are trained on. Splitting row positions
# with the same test_size/random_state selects the same rows as splitting the
# data itself, which keeps this split aligned with the min-max split used for
# the DP trees.
row_positions = np.arange(len(X))
train_rows, test_rows = train_test_split(row_positions, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)  # all rows, scaled with training-set mean / std

x_train, x_test = X_scaled[train_rows], X_scaled[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# For DP trees: scale to [0,1] and save bounds
# The min-max scaler is fitted on the training rows only (the same rows that an
# 80/20 split with random_state=42 selects), so test-set extremes do not leak
# into training. Values are then clipped so that held-out rows falling outside
# the training range still respect the declared [0, 1] bounds.
mm_train_rows, mm_test_rows = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
minmax_scaler = MinMaxScaler().fit(X.iloc[mm_train_rows])
X_minmax = np.clip(minmax_scaler.transform(X), 0.0, 1.0)
x_train_mm, x_test_mm = X_minmax[mm_train_rows], X_minmax[mm_test_rows]

# diffprivlib documents `bounds` as a single (min, max) tuple, where scalar
# entries apply to every feature; a list of per-feature (min, max) pairs is not
# that format. NOTE(review): confirm against the installed diffprivlib version.
bounds = (0.0, 1.0)

# Privacy budgets to sweep, from strongest privacy (smallest) to weakest.
epsilons = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]

# Baseline models

In [None]:

# Logistic Regression
# Non-private reference model. The timed window covers fitting, prediction and
# both metric computations.
start = time.time()
baseline_lr = LogisticRegression(max_iter=1000)
baseline_lr.fit(x_train, y_train)
y_pred_lr = baseline_lr.predict(x_test)
baseline_acc_lr, baseline_f1_lr = (
    accuracy_score(y_test, y_pred_lr),
    f1_score(y_test, y_pred_lr),
)
runtime_baseline_lr = time.time() - start

In [None]:
# Decision Tree
# Non-private reference model. random_state is fixed because scikit-learn
# permutes the candidate features at every split, so an unseeded tree can differ
# from run to run and would make the baseline (and every "drop from baseline"
# figure derived from it) non-reproducible.
# The timed window covers fitting, prediction and both metric computations.
start = time.time()
baseline_dt = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
y_pred_dt = baseline_dt.predict(x_test)
baseline_acc_dt = accuracy_score(y_test, y_pred_dt)
baseline_f1_dt = f1_score(y_test, y_pred_dt)
runtime_baseline_dt = time.time() - start

In [None]:
# One line per baseline model: label, accuracy, F1, runtime in seconds.
baseline_report = (
    ("Baseline LR:", baseline_acc_lr, baseline_f1_lr, runtime_baseline_lr),
    ("Baseline DT:", baseline_acc_dt, baseline_f1_dt, runtime_baseline_dt),
)
for report_row in baseline_report:
    print(*report_row)

# Input Perturbation (Add noise to features)

In [None]:
def add_laplace_noise(data, epsilon, sensitivity=1.0, rng=None):
    """Return a copy of ``data`` with i.i.d. Laplace noise added to every entry.

    The noise has location 0 and scale ``sensitivity / epsilon``. The input is
    not modified.

    Parameters
    ----------
    data : array-like
        Values to perturb; the noise is drawn with the same shape.
    epsilon : float
        Privacy budget. Must be strictly positive.
    sensitivity : float, default 1.0
        Numerator of the noise scale.
        NOTE(review): whether 1.0 is a valid sensitivity bound depends on how
        ``data`` was scaled/clipped by the caller -- confirm before claiming a
        formal privacy guarantee.
    rng : None, int or numpy.random.Generator, default None
        ``None`` draws from NumPy's global legacy RNG (so ``np.random.seed``
        still controls the output). An int seed or a ``Generator`` is passed to
        ``np.random.default_rng`` for an isolated, reproducible noise stream.

    Returns
    -------
    ``data + noise``, with the type produced by that addition.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive (this also rejects NaN).
    """
    # Written as "not > 0" so that NaN is rejected as well as 0 and negatives.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be a positive number, got {epsilon!r}")

    scale = sensitivity / epsilon
    sampler = np.random if rng is None else np.random.default_rng(rng)
    return data + sampler.laplace(loc=0, scale=scale, size=np.shape(data))

# Seed NumPy's global RNG, which add_laplace_noise draws from by default, so
# that re-running this cell reproduces the same noisy training sets and metrics.
np.random.seed(42)

# One entry per epsilon, in the same order as `epsilons`.
acc_input_lr, f1_input_lr, runtime_input_lr = [], [], []
acc_input_dt, f1_input_dt, runtime_input_dt = [], [], []

for eps in epsilons:
    # Noise is added to the training features only; both models below share the
    # same noisy copy and are evaluated on the un-noised x_test. Generating the
    # noise happens before either timer starts, so it is not part of the
    # recorded runtimes.
    noisy = add_laplace_noise(x_train, eps)

    # LR
    start = time.time()
    lr = LogisticRegression(max_iter=1000).fit(noisy, y_train)
    y_pred = lr.predict(x_test)
    acc_input_lr.append(accuracy_score(y_test, y_pred))
    f1_input_lr.append(f1_score(y_test, y_pred))
    runtime_input_lr.append(time.time() - start)

    # DT
    # random_state is fixed so differences between epsilons come from the noise,
    # not from the tree's own random feature ordering.
    start = time.time()
    dt = DecisionTreeClassifier(random_state=42).fit(noisy, y_train)
    y_pred = dt.predict(x_test)
    acc_input_dt.append(accuracy_score(y_test, y_pred))
    f1_input_dt.append(f1_score(y_test, y_pred))
    runtime_input_dt.append(time.time() - start)

Input Perturbation Accuracy: 0.7598209845847836
F1-score: 0.11158798283261803


# Internal Perturbation (IBM Differential Privacy Library, diffprivlib)

In [None]:
from diffprivlib.models import LogisticRegression as DPLogReg
from diffprivlib.models import DecisionTreeClassifier as DPDecisionTree

# One entry per epsilon, in the same order as `epsilons`.
acc_internal_lr, f1_internal_lr, runtime_internal_lr = [], [], []
acc_internal_dt, f1_internal_dt, runtime_internal_dt = [], [], []

for eps in epsilons:
    # DP Logistic Regression: trained and scored on the standardized features.
    # The timed window covers fitting, prediction and both metric computations.
    start = time.time()
    dp_lr = DPLogReg(epsilon=eps, data_norm=10.0)
    dp_lr.fit(x_train, y_train)
    y_pred = dp_lr.predict(x_test)
    lr_accuracy = accuracy_score(y_test, y_pred)
    lr_f1 = f1_score(y_test, y_pred)
    acc_internal_lr.append(lr_accuracy)
    f1_internal_lr.append(lr_f1)
    runtime_internal_lr.append(time.time() - start)

    # DP Decision Tree: trained and scored on the min-max features, with the
    # matching `bounds` passed to the model.
    start = time.time()
    dp_dt = DPDecisionTree(epsilon=eps, bounds=bounds)
    dp_dt.fit(x_train_mm, y_train)
    y_pred = dp_dt.predict(x_test_mm)
    dt_accuracy = accuracy_score(y_test, y_pred)
    dt_f1 = f1_score(y_test, y_pred)
    acc_internal_dt.append(dt_accuracy)
    f1_internal_dt.append(dt_f1)
    runtime_internal_dt.append(time.time() - start)


ModuleNotFoundError: No module named 'diffprivlib'

# Output Perturbation (Add noise to model parameters)

In [None]:
def perturb_weights(model, epsilon, sensitivity=1.0, rng=None):
    """Add Laplace noise to a fitted linear model's ``coef_`` and ``intercept_``.

    The noise has location 0 and scale ``sensitivity / epsilon``. ``model`` is
    modified IN PLACE and the same object is returned, so a caller that needs
    the unperturbed weights must fit or copy a separate model first.

    Parameters
    ----------
    model : object
        Any object exposing NumPy-array attributes ``coef_`` and ``intercept_``.
    epsilon : float
        Privacy budget. Must be strictly positive.
    sensitivity : float, default 1.0
        Numerator of the noise scale.
        NOTE(review): 1.0 is a placeholder, not a derived sensitivity of the
        trained weights -- confirm before claiming a formal privacy guarantee.
    rng : None, int or numpy.random.Generator, default None
        ``None`` draws from NumPy's global legacy RNG (so ``np.random.seed``
        still controls the output). An int seed or a ``Generator`` is passed to
        ``np.random.default_rng`` for an isolated, reproducible noise stream.

    Returns
    -------
    The same ``model`` instance, with perturbed parameters.

    Raises
    ------
    ValueError
        If ``epsilon`` is not strictly positive (this also rejects NaN).
    """
    # Written as "not > 0" so that NaN is rejected as well as 0 and negatives.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be a positive number, got {epsilon!r}")

    scale = sensitivity / epsilon
    # A single sampler serves both draws, so one seed fixes coef_ and intercept_.
    sampler = np.random if rng is None else np.random.default_rng(rng)
    model.coef_ += sampler.laplace(0, scale, model.coef_.shape)
    model.intercept_ += sampler.laplace(0, scale, model.intercept_.shape)
    return model

# Seed NumPy's global RNG, which perturb_weights draws from by default, so that
# re-running this cell reproduces the same perturbed weights and metrics.
np.random.seed(42)

# One entry per epsilon, in the same order as `epsilons`.
acc_output_lr, f1_output_lr, runtime_output_lr = [], [], []

for eps in epsilons:
    # A fresh model is fitted for every epsilon because perturb_weights changes
    # coef_/intercept_ in place; reusing one model would stack noise across
    # iterations. The timed window covers fitting, perturbation, prediction and
    # both metric computations.
    start = time.time()
    clf = LogisticRegression(max_iter=1000).fit(x_train, y_train)
    clf = perturb_weights(clf, epsilon=eps)
    y_pred = clf.predict(x_test)
    acc_output_lr.append(accuracy_score(y_test, y_pred))
    f1_output_lr.append(f1_score(y_test, y_pred))
    runtime_output_lr.append(time.time() - start)

Output Perturbation Accuracy: 0.5995358859605503
F1-score: 0.30294287362954414


# Results

In [None]:

# Column name -> per-epsilon values, listed in the order the columns should
# appear in the table.
result_columns = [
    ("Epsilon", epsilons),
    ("Input LR Acc", acc_input_lr),
    ("Input LR F1", f1_input_lr),
    ("Input DT Acc", acc_input_dt),
    ("Input DT F1", f1_input_dt),
    ("Internal LR Acc", acc_internal_lr),
    ("Internal LR F1", f1_internal_lr),
    ("Internal DT Acc", acc_internal_dt),
    ("Internal DT F1", f1_internal_dt),
    ("Output LR Acc", acc_output_lr),
    ("Output LR F1", f1_output_lr),
]
results = pd.DataFrame(dict(result_columns))

# Show the table rounded to four decimals; `results` itself keeps full precision.
print(results.round(4))

In [None]:
# Runtime Table
# One row per epsilon, one column per perturbation method / model.
runtime = pd.DataFrame({
    "Epsilon": epsilons,
    "Runtime Input LR": runtime_input_lr,
    "Runtime Input DT": runtime_input_dt,
    "Runtime Internal LR": runtime_internal_lr,
    "Runtime Internal DT": runtime_internal_dt,
    "Runtime Output LR": runtime_output_lr
})

# Baseline row: the unperturbed LR / DT runtimes are placed in the
# "Runtime Input ..." columns; the other methods have no baseline counterpart.
# NaN is used for those cells instead of None so the columns are float rather
# than all-NA object columns, which pandas' concat warns about (deprecated
# handling of all-NA entries) and which would change the result dtype in
# future pandas versions.
no_baseline = float("nan")
baseline_runtime_row = pd.DataFrame({
    "Epsilon": ["Baseline"],
    "Runtime Input LR": [runtime_baseline_lr],
    "Runtime Input DT": [runtime_baseline_dt],
    "Runtime Internal LR": [no_baseline],
    "Runtime Internal DT": [no_baseline],
    "Runtime Output LR": [no_baseline]
})
runtime_summary = pd.concat([baseline_runtime_row, runtime], ignore_index=True)
print(runtime_summary)

In [None]:
# Accuracy Plot
# One curve per perturbation method / model, plus dashed horizontal reference
# lines for the two unperturbed baselines.
plt.figure(figsize=(10, 5))

accuracy_curves = [
    (acc_input_lr, 'Input (LogReg)'),
    (acc_input_dt, 'Input (DT)'),
    (acc_internal_lr, 'Internal (LogReg)'),
    (acc_internal_dt, 'Internal (DT)'),
    (acc_output_lr, 'Output (LogReg)'),
]
for curve_values, curve_label in accuracy_curves:
    plt.plot(epsilons, curve_values, marker='o', label=curve_label)

accuracy_baselines = [
    (baseline_acc_lr, 'black', 'Baseline (LogReg)'),
    (baseline_acc_dt, 'gray', 'Baseline (DT)'),
]
for level, line_color, line_label in accuracy_baselines:
    plt.axhline(level, color=line_color, linestyle='--', label=line_label)

plt.xlabel("ε")
plt.ylabel("Accuracy")
plt.title("ε vs Accuracy (Adult Dataset)")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# F1 Score Plot
# One curve per perturbation method / model, plus dashed horizontal reference
# lines for the two unperturbed baselines.
plt.figure(figsize=(10, 5))

f1_curves = [
    (f1_input_lr, 'Input (LogReg)'),
    (f1_input_dt, 'Input (DT)'),
    (f1_internal_lr, 'Internal (LogReg)'),
    (f1_internal_dt, 'Internal (DT)'),
    (f1_output_lr, 'Output (LogReg)'),
]
for curve_values, curve_label in f1_curves:
    plt.plot(epsilons, curve_values, marker='o', label=curve_label)

f1_baselines = [
    (baseline_f1_lr, 'black', 'Baseline (LogReg)'),
    (baseline_f1_dt, 'gray', 'Baseline (DT)'),
]
for level, line_color, line_label in f1_baselines:
    plt.axhline(level, color=line_color, linestyle='--', label=line_label)

plt.xlabel("ε")
plt.ylabel("F1 Score")
plt.title("ε vs F1 Score (Adult Dataset)")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
def _drop_from(baseline, scores):
    """Return ``baseline - score`` for each score, in the original order."""
    return [baseline - score for score in scores]


# Accuracy Drop
# Each method is compared with the baseline of the same model family
# (LR against the LR baseline, DT against the DT baseline).
acc_drop_input_lr = _drop_from(baseline_acc_lr, acc_input_lr)
acc_drop_input_dt = _drop_from(baseline_acc_dt, acc_input_dt)
acc_drop_internal_lr = _drop_from(baseline_acc_lr, acc_internal_lr)
acc_drop_internal_dt = _drop_from(baseline_acc_dt, acc_internal_dt)
acc_drop_output_lr = _drop_from(baseline_acc_lr, acc_output_lr)

# F1 Drop
f1_drop_input_lr = _drop_from(baseline_f1_lr, f1_input_lr)
f1_drop_input_dt = _drop_from(baseline_f1_dt, f1_input_dt)
f1_drop_internal_lr = _drop_from(baseline_f1_lr, f1_internal_lr)
f1_drop_internal_dt = _drop_from(baseline_f1_dt, f1_internal_dt)
f1_drop_output_lr = _drop_from(baseline_f1_lr, f1_output_lr)

In [None]:
# Accuracy Drop Plot
# One curve per perturbation method / model; values are baseline minus achieved
# accuracy at each epsilon.
plt.figure(figsize=(10, 5))

accuracy_drop_curves = [
    (acc_drop_input_lr, 'Input (LogReg)'),
    (acc_drop_input_dt, 'Input (DT)'),
    (acc_drop_internal_lr, 'Internal (LogReg)'),
    (acc_drop_internal_dt, 'Internal (DT)'),
    (acc_drop_output_lr, 'Output (LogReg)'),
]
for curve_values, curve_label in accuracy_drop_curves:
    plt.plot(epsilons, curve_values, marker='o', label=curve_label)

plt.xlabel("ε")
plt.ylabel("Accuracy Drop from Baseline")
plt.title("Privacy-Utility Tradeoff: Accuracy Drop")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# F1 Drop Plot
# One curve per perturbation method / model; values are baseline minus achieved
# F1 at each epsilon.
plt.figure(figsize=(10, 5))

f1_drop_curves = [
    (f1_drop_input_lr, 'Input (LogReg)'),
    (f1_drop_input_dt, 'Input (DT)'),
    (f1_drop_internal_lr, 'Internal (LogReg)'),
    (f1_drop_internal_dt, 'Internal (DT)'),
    (f1_drop_output_lr, 'Output (LogReg)'),
]
for curve_values, curve_label in f1_drop_curves:
    plt.plot(epsilons, curve_values, marker='o', label=curve_label)

plt.xlabel("ε")
plt.ylabel("F1 Drop from Baseline")
plt.title("Privacy-Utility Tradeoff: F1 Score Drop")
plt.legend()
plt.grid(True)
plt.show()