In [None]:
import numpy as np
import pandas as pd

# Build a synthetic patient cohort, numerically encode its categorical columns,
# and derive two noisy integer risk scores clipped to [0, 100].
#
# Every np.random call below consumes the global legacy stream seeded here, so
# the ORDER of the calls determines the generated values.
np.random.seed(42)
n_rows = 10000


def _draw_levels(*levels):
    """Pick one of `levels` at random for each of the n_rows patients."""
    return np.random.choice(list(levels), n_rows)


def _draw_measurement(low, high):
    """Draw n_rows uniform values between `low` and `high`, rounded to 1 decimal."""
    return np.round(np.random.uniform(low, high, n_rows), 1)


def _add_in_order(terms):
    """Add the terms strictly left to right, like a chained `a + b + c + ...`."""
    running = terms[0]
    for term in terms[1:]:
        running = running + term
    return running


# Raw columns: categorical values are still string labels at this point.
data = {
    "PatientID": np.arange(1, n_rows + 1),
    "Age": np.random.randint(20, 80, n_rows),
    "Gender": _draw_levels("M", "F"),
    "BMI": _draw_measurement(18.5, 40.0),
    "SmokingStatus": _draw_levels("Current", "Former", "Never"),
    "BloodPressure": _draw_levels("120/80", "130/85", "140/90", "150/95", "125/82"),
    "Cholesterol": _draw_levels("Normal", "Borderline", "High"),
    "FamilyHistory": _draw_levels("Yes", "No"),
    "PhysicalActivity": _draw_levels("Low", "Moderate", "High"),
    "AlcoholIntake": _draw_levels("Low", "Moderate", "High"),
    "DietQuality": _draw_levels("Poor", "Moderate", "Good", "Excellent"),
    "StressLevel": _draw_levels("Low", "Medium", "High"),
    "GlucoseLevel": _draw_measurement(70, 180),
    "HbA1c": _draw_measurement(4.5, 8.0),
}

df = pd.DataFrame(data)

# Label -> numeric code for every categorical column.
mappings = {
    "Gender": {"M": 0, "F": 1},
    "SmokingStatus": {"Current": 2, "Former": 1, "Never": 0},
    "BloodPressure": {
        "120/80": 1,
        "130/85": 2,
        "140/90": 3,
        "150/95": 4,
        "125/82": 1.5,
    },
    "Cholesterol": {"Normal": 1, "Borderline": 2, "High": 3},
    "FamilyHistory": {"Yes": 1, "No": 0},
    "PhysicalActivity": {"Low": 1, "Moderate": 2, "High": 3},
    "AlcoholIntake": {"Low": 0, "Moderate": 1, "High": 2},
    "DietQuality": {"Poor": 1, "Moderate": 2, "Good": 3, "Excellent": 4},
    "StressLevel": {"Low": 1, "Medium": 2, "High": 3},
}

for col in mappings:
    mapping = mappings[col]
    df[col] = df[col].map(mapping)

# Diabetes score: weighted feature terms plus Gaussian noise (sd = 2).
diabetes_terms = [
    0.1 * df["Age"],
    0.2 * df["BMI"] ** 1.5,
    0.25 * np.log1p(df["GlucoseLevel"]),
    0.15 * df["SmokingStatus"],
    0.3 * df["BloodPressure"],
    0.25 * df["Cholesterol"] * df["DietQuality"],
    0.2 * df["FamilyHistory"],
    0.1 * df["PhysicalActivity"],
    np.random.normal(0, 2, n_rows),
]
# Truncate to an integer first, then clamp into [0, 100].
df["DiabetesRiskScore"] = _add_in_order(diabetes_terms).astype(int).clip(0, 100)

# Heart-disease score: weighted feature terms plus Gaussian noise (sd = 3).
heart_terms = [
    0.2 * np.sqrt(df["Age"]),
    0.15 * df["BMI"],
    0.3 * df["BloodPressure"],
    0.25 * df["Cholesterol"] ** 2,
    0.2 * df["StressLevel"],
    0.15 * df["PhysicalActivity"] * df["DietQuality"],
    0.25 * df["AlcoholIntake"],
    np.random.normal(0, 3, n_rows),
]
df["HeartDiseaseRiskScore"] = _add_in_order(heart_terms).astype(int).clip(0, 100)

df.head()


Unnamed: 0,PatientID,Age,Gender,BMI,SmokingStatus,BloodPressure,Cholesterol,FamilyHistory,PhysicalActivity,AlcoholIntake,DietQuality,StressLevel,GlucoseLevel,HbA1c,DiabetesRiskScore,HeartDiseaseRiskScore
0,1,58,0,27.5,0,2.0,1,0,1,1,3,3,84.7,6.4,35,6
1,2,71,0,26.0,0,1.5,3,0,3,1,2,2,141.6,5.1,37,10
2,3,48,0,37.0,2,3.0,3,1,3,0,4,1,177.5,5.5,56,14
3,4,34,0,21.2,2,3.0,1,0,3,0,2,1,119.8,6.3,26,8
4,5,62,0,30.7,1,1.5,3,0,3,2,1,1,86.4,7.9,42,7


In [None]:
pip install diffprivlib

Collecting diffprivlib
  Downloading diffprivlib-0.6.5-py3-none-any.whl.metadata (9.6 kB)
Downloading diffprivlib-0.6.5-py3-none-any.whl (176 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.5/176.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: diffprivlib
Successfully installed diffprivlib-0.6.5


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from diffprivlib.models import LinearRegression as DPLinearRegression
import numpy as np

# Compare an ordinary least-squares model with a differentially private one on the
# diabetes risk score, looking at the prediction and per-feature contributions
# (coefficient * feature value) for a single training record.

# Features exclude the identifier and both target columns.
X = df.drop(columns=["PatientID", "DiabetesRiskScore", "HeartDiseaseRiskScore"])
y = df["DiabetesRiskScore"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The training record inspected under both models.
individual_idx = 0
individual_row = X_train.iloc[[individual_idx]]  # one-row DataFrame, keeps column names
individual_data = X_train.iloc[individual_idx].values.reshape(1, -1)
true_label = y_train.iloc[individual_idx]
print(f"True label for the individual: {true_label}")

# --- Baseline: no privacy protection ---
model_no_dp = LinearRegression()
model_no_dp.fit(X_train, y_train)

# Predict from the DataFrame row: the model was fitted with column names, and
# passing a bare array would trigger scikit-learn's feature-name warning.
prediction_no_dp = model_no_dp.predict(individual_row)[0]
print(f"Without Differential Privacy - Individual Prediction: {prediction_no_dp}")

coeff_contribution_no_dp = model_no_dp.coef_ * individual_data[0]
print(f"Without Differential Privacy - Coefficient Contributions: {coeff_contribution_no_dp}")

# --- Differentially private model ---
# Bounds must come from domain knowledge, not from the data: deriving them from
# X.min()/X.max() would itself leak information. These are the value ranges of the
# synthetic generator / numeric encodings used to build `df`.
feature_bounds = {
    "Age": (20, 79),
    "Gender": (0, 1),
    "BMI": (18.5, 40.0),
    "SmokingStatus": (0, 2),
    "BloodPressure": (1, 4),
    "Cholesterol": (1, 3),
    "FamilyHistory": (0, 1),
    "PhysicalActivity": (1, 3),
    "AlcoholIntake": (0, 2),
    "DietQuality": (1, 4),
    "StressLevel": (1, 3),
    "GlucoseLevel": (70.0, 180.0),
    "HbA1c": (4.5, 8.0),
}
# Align the bounds with the actual column order; a KeyError here means a feature
# was added without declaring its range.
lower_X = np.array([feature_bounds[name][0] for name in X.columns], dtype=float)
upper_X = np.array([feature_bounds[name][1] for name in X.columns], dtype=float)

epsilon = 1.0
# diffprivlib expects `bounds_X` and `bounds_y`; a `bounds=` keyword is not one of
# its parameters, so the model would fall back to data-derived bounds and warn
# about additional privacy leakage. The target is clipped to [0, 100] upstream.
dp_model = DPLinearRegression(
    epsilon=epsilon,
    bounds_X=(lower_X, upper_X),
    bounds_y=(0, 100),
    random_state=42,  # make the injected noise reproducible across runs
)
dp_model.fit(X_train.values, y_train.values)

prediction_with_dp = dp_model.predict(individual_data)[0]
print(f"\nWith Differential Privacy - Individual Prediction: {prediction_with_dp}")

coeff_contribution_with_dp = dp_model.coef_ * individual_data[0]
print(f"With Differential Privacy - Coefficient Contributions: {coeff_contribution_with_dp}")


Without Differential Privacy - Individual Prediction: 26.000340506008
Without Differential Privacy - Coefficient Contributions: [ 3.08426049e+00  2.22372404e-02  3.53179327e+01  1.44486500e-01
  5.80423521e-01  1.17741050e+00  2.33188700e-01  2.51500057e-01
  0.00000000e+00  1.02347770e+00 -3.96474780e-03  2.72219672e-01
  5.07696935e-02]

With Differential Privacy - Individual Prediction: 24.339957933188565
With Differential Privacy - Coefficient Contributions: [ 7.58955013  0.26442778 32.75851016  2.05761096 -4.56995764  2.77350084
  1.04099412 -7.31320488 -0.          6.33014009  2.50254341  8.52122969
 -6.49583209]


This will result in additional privacy leakage. To ensure differential privacy with no additional privacy loss, specify `bounds_X` and `bounds_y`.


# Without Differential Privacy

Coefficient Contributions: `[ 3.08, 0.02, 35.31, ..., 0.27, 0.05]`

The model makes a highly accurate prediction for the individual because it directly fits the data without privacy-preserving noise.
Each feature's contribution (its coefficient multiplied by the individual's feature value, e.g. for Age or BMI) adds up, together with the intercept, to the final prediction. Because the coefficients are exact, an adversary who knows them and observes the model's output may be able to reverse-engineer sensitive feature values, potentially compromising the individual’s privacy.

# With Differential Privacy

Coefficient Contributions: `[ 7.59, 0.26, 32.76, ..., 8.52, -6.50]` (from the run shown above; the exact values depend on the random noise added by the DP mechanism)

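Although the differentially private model is less accurate, this trade-off provides better protection of individual data. Note that the formal ε-differential-privacy guarantee holds only when `bounds_X` and `bounds_y` are supplied to the model independently of the data; otherwise diffprivlib warns of additional privacy leakage.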
The contributions from features include significant noise, making it much harder to deduce the exact influence of each feature. This obfuscation protects the privacy of the individual by reducing the risk of inferring sensitive details.