In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import laplace

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
data = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values="?")

# Data preprocessing
data = data.dropna()
data['income'] = (data['income'] == '>50K').astype(int)

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# One-hot encode categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_cols)

# Split the data
X = data_encoded.drop('income', axis=1)
y = data_encoded['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to add Laplace noise (differential privacy)
def add_laplace_noise(data, epsilon):
    sensitivity = 1.0  # Assuming normalized data
    noise_scale = sensitivity / epsilon
    return data + np.random.laplace(0, noise_scale, data.shape)

# Apply differential privacy to sensitive numerical attributes
sensitive_attrs = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
epsilon = 1.0  # Privacy budget

for attr in sensitive_attrs:
    if attr in X_train.columns:
        X_train[attr] = add_laplace_noise(X_train[attr], epsilon)
        X_test[attr] = add_laplace_noise(X_test[attr], epsilon)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8596

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      4503
           1       0.76      0.64      0.70      1530

    accuracy                           0.86      6033
   macro avg       0.83      0.79      0.80      6033
weighted avg       0.85      0.86      0.86      6033



In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy.stats import laplace

class DifferentialPrivacy:
    def __init__(self, epsilon, delta=1e-5):
        self.epsilon = epsilon
        self.delta = delta
        self.numerical_columns = []
        self.categorical_columns = []
        self.global_sensitivity = {}

    def fit(self, data):
        self.numerical_columns = data.select_dtypes(include=['int64', 'float64']).columns
        self.categorical_columns = data.select_dtypes(include=['object']).columns
        
        # Calculate global sensitivity for numerical columns
        for col in self.numerical_columns:
            self.global_sensitivity[col] = self._calculate_sensitivity(data[col])
        
        # Calculate global sensitivity for categorical columns
        for col in self.categorical_columns:
            self.global_sensitivity[col] = 1  # For categorical data, sensitivity is typically 1
    
    def transform(self, data):
        privacy_protected_data = data.copy()
        
        # Apply differential privacy to numerical columns
        for col in self.numerical_columns:
            privacy_protected_data[col] = self._add_laplace_noise(data[col], self.global_sensitivity[col])
        
        # Apply differential privacy to categorical columns
        for col in self.categorical_columns:
            privacy_protected_data[col] = self._randomized_response(data[col])
        
        return privacy_protected_data
    
    def _calculate_sensitivity(self, column):
        return (column.max() - column.min()) / 10  # A heuristic approach; can be adjusted based on domain knowledge
    
    def _add_laplace_noise(self, column, sensitivity):
        noise_scale = sensitivity / self.epsilon
        return column + np.random.laplace(0, noise_scale, column.shape)
    
    def _randomized_response(self, column):
        p = np.exp(self.epsilon) / (1 + np.exp(self.epsilon))
        mask = np.random.random(column.shape) < p
        return np.where(mask, column, np.random.choice(column, column.shape))

# Load and preprocess the data (same as before)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
data = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values="?")
data = data.dropna()
data['income'] = (data['income'] == '>50K').astype(int)

# Split the data
from sklearn.model_selection import train_test_split
X = data.drop('income', axis=1)
y = data['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply differential privacy
dp = DifferentialPrivacy(epsilon=1.0)
dp.fit(X_train)
X_train_private = dp.transform(X_train)
X_test_private = dp.transform(X_test)

# One-hot encode categorical variables
X_train_encoded = pd.get_dummies(X_train_private, columns=dp.categorical_columns)
X_test_encoded = pd.get_dummies(X_test_private, columns=dp.categorical_columns)

# Ensure both datasets have the same columns
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Train and evaluate the model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with Differential Privacy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Compare with non-private model
X_train_nonprivate = pd.get_dummies(X_train, columns=dp.categorical_columns)
X_test_nonprivate = pd.get_dummies(X_test, columns=dp.categorical_columns)
X_test_nonprivate = X_test_nonprivate.reindex(columns=X_train_nonprivate.columns, fill_value=0)

scaler_nonprivate = StandardScaler()
X_train_scaled_nonprivate = scaler_nonprivate.fit_transform(X_train_nonprivate)
X_test_scaled_nonprivate = scaler_nonprivate.transform(X_test_nonprivate)

model_nonprivate = RandomForestClassifier(n_estimators=100, random_state=42)
model_nonprivate.fit(X_train_scaled_nonprivate, y_train)

y_pred_nonprivate = model_nonprivate.predict(X_test_scaled_nonprivate)

accuracy_nonprivate = accuracy_score(y_test, y_pred_nonprivate)
print(f"\nAccuracy without Differential Privacy: {accuracy_nonprivate:.4f}")
print("\nClassification Report (Non-private):")
print(classification_report(y_test, y_pred_nonprivate))

Accuracy with Differential Privacy: 0.8008

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.93      0.87      4503
           1       0.68      0.41      0.51      1530

    accuracy                           0.80      6033
   macro avg       0.75      0.67      0.69      6033
weighted avg       0.79      0.80      0.78      6033


Accuracy without Differential Privacy: 0.8526

Classification Report (Non-private):
              precision    recall  f1-score   support

           0       0.89      0.92      0.90      4503
           1       0.74      0.65      0.69      1530

    accuracy                           0.85      6033
   macro avg       0.81      0.79      0.80      6033
weighted avg       0.85      0.85      0.85      6033

