In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import laplace

# Load the Adult dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]
data = pd.read_csv(url, names=column_names, skipinitialspace=True, na_values="?")

# Data preprocessing
data = data.dropna()
data['income'] = (data['income'] == '>50K').astype(int)

# Identify numerical and categorical columns
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data.select_dtypes(include=['object']).columns

# One-hot encode categorical variables
data_encoded = pd.get_dummies(data, columns=categorical_cols)

# Split the data
X = data_encoded.drop('income', axis=1)
y = data_encoded['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to add Laplace noise (differential privacy)
def add_laplace_noise(data, epsilon):
    sensitivity = 1.0  # Assuming normalized data
    noise_scale = sensitivity / epsilon
    return data + np.random.laplace(0, noise_scale, data.shape)

# Apply differential privacy to sensitive numerical attributes
sensitive_attrs = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
epsilon = 1.0  # Privacy budget

for attr in sensitive_attrs:
    if attr in X_train.columns:
        X_train[attr] = add_laplace_noise(X_train[attr], epsilon)
        X_test[attr] = add_laplace_noise(X_test[attr], epsilon)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8596

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      4503
           1       0.76      0.64      0.70      1530

    accuracy                           0.86      6033
   macro avg       0.83      0.79      0.80      6033
weighted avg       0.85      0.86      0.86      6033

