In [8]:
!pip install opacus



In [9]:
import pandas as pd

# Read the training CSV into a DataFrame and preview its last rows
DATA_PATH = 'InsuraceTrain.csv'
train_data = pd.read_csv(DATA_PATH)
train_data.tail()


Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
58587,ID58588,0.355089,0.13,0.644231,C8,8794,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0
58588,ID58589,1.199642,0.02,0.519231,C14,7788,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
58589,ID58590,1.162273,0.05,0.451923,C5,34738,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
58590,ID58591,1.236307,0.14,0.557692,C8,8794,1,B2,M6,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
58591,ID58592,0.124429,0.02,0.442308,C8,8794,3,C2,M4,Diesel,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,3,0


In [10]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Separate target and features.
# `train_data` is NOT mutated here, so this cell can be re-run without a
# KeyError on an already-dropped 'policy_id' column.
X = train_data.drop(columns=['policy_id', 'is_claim'])
y = train_data['is_claim']

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Dense one-hot encoder. Newer scikit-learn releases use `sparse_output` and
# reject the old `sparse` keyword with a TypeError; older releases only know
# `sparse`. Try the new spelling first and fall back to the old one.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

# One-hot encode the categorical columns
encoded_features = encoder.fit_transform(X[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,  # keep rows aligned with X even if its index is not 0..n-1
)

# Replace categorical columns with their encoded counterparts
# (numeric columns first, encoded columns after, as before).
X = pd.concat([X.drop(columns=categorical_columns), encoded_df], axis=1)

# Split data into training and validation sets BEFORE scaling, so that the
# scaler's mean/variance are estimated from training rows only.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features: fit on the training split, apply to both splits.
# Both results are NumPy arrays; `X` itself stays an unscaled DataFrame.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

X_train.shape, X_val.shape





((46873, 100), (11719, 100))

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

import time

# Define the PyTorch model
class SimpleNN(nn.Module):
    """Fully connected binary classifier: input_dim -> 128 -> 64 -> 32 -> 1.

    The three hidden layers use ReLU; the single output unit is passed
    through a sigmoid, so `forward` returns values in [0, 1].
    """

    def __init__(self, input_dim):
        """Create the four linear layers.

        Args:
            input_dim: number of input features per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for the batch `x`."""
        hidden = x
        # Hidden stack: linear layer followed by ReLU, three times over.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        return torch.sigmoid(self.fc4(hidden))

# Seed PyTorch's RNG so the weight initialisation, and therefore the reported
# accuracy, is the same on every "Restart & Run All".
torch.manual_seed(42)

# Convert data to PyTorch tensors (labels reshaped to a column to match the
# (N, 1) output of the model).
X_train_torch = torch.FloatTensor(X_train)
y_train_torch = torch.FloatTensor(y_train.values).view(-1, 1)
X_val_torch = torch.FloatTensor(X_val)
y_val_torch = torch.FloatTensor(y_val.values).view(-1, 1)

model = SimpleNN(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

# Start the training time measurement. perf_counter() is a monotonic clock
# meant for measuring intervals; time.time() can jump if the system clock is
# adjusted.
start_time = time.perf_counter()

# Training loop: 10 full-batch gradient steps over the whole training set
for epoch in range(10):
    optimizer.zero_grad()
    outputs = model(X_train_torch)
    loss = criterion(outputs, y_train_torch)
    loss.backward()
    optimizer.step()

# End the training time measurement
end_time = time.perf_counter()

# Calculate total training time
training_time = end_time - start_time

# Evaluation: threshold the sigmoid output at 0.5 and compare with the labels
with torch.no_grad():
    val_outputs = model(X_val_torch)
    val_predictions = (val_outputs > 0.5).float()
    correct = (val_predictions == y_val_torch).sum().item()
    non_dp_accuracy = correct / len(y_val)

print(f"Non-DP Model Accuracy: {non_dp_accuracy:.4f}")
print(f"Training Time: {training_time:.2f} seconds")

Non-DP Model Accuracy: 0.9355
Training Time: 1.58 seconds


In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from opacus import PrivacyEngine
from torch.utils.data import DataLoader, TensorDataset
import time

# Define the PyTorch model
class SimpleNN(nn.Module):
    """Fully connected binary classifier: input_dim -> 128 -> 64 -> 32 -> 1.

    The three hidden layers use ReLU; the single output unit is passed
    through a sigmoid, so `forward` returns values in [0, 1].
    """

    def __init__(self, input_dim):
        """Create the four linear layers.

        Args:
            input_dim: number of input features per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        """Return the sigmoid output of the network for the batch `x`."""
        hidden = x
        # Hidden stack: linear layer followed by ReLU, three times over.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        return torch.sigmoid(self.fc4(hidden))

# Convert data to PyTorch tensors (labels reshaped to a column to match the
# (N, 1) output of the model).
X_train_torch = torch.FloatTensor(X_train)
y_train_torch = torch.FloatTensor(y_train.values).view(-1, 1)
X_val_torch = torch.FloatTensor(X_val)
y_val_torch = torch.FloatTensor(y_val.values).view(-1, 1)

# Hyperparameters
noise_multipliers = [0.5, 1.0, 1.5, 2.0, 10]
num_epochs = 10
batch_size = 64
results = []  # one (noise, accuracy, epsilon, training_time) tuple per run

# The dataset wrapper is the same for every run, so build it once.
train_dataset = TensorDataset(X_train_torch, y_train_torch)

for noise in noise_multipliers:
    # Re-seed before every run so each noise level starts from the same
    # initial weights; otherwise differences between runs mix the effect of
    # the noise multiplier with the effect of a different random init.
    torch.manual_seed(42)

    # Start the training time measurement (monotonic clock for intervals)
    start_time = time.perf_counter()

    # Create a fresh model, optimizer and DataLoader for each iteration
    model_dp = SimpleNN(X_train.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model_dp.parameters())
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Attach PrivacyEngine: make_private returns wrapped versions of the
    # model, optimizer and data loader, which replace the plain ones.
    privacy_engine = PrivacyEngine()
    model_dp, optimizer, train_loader = privacy_engine.make_private(
        module=model_dp,
        optimizer=optimizer,
        data_loader=train_loader,
        noise_multiplier=noise,
        max_grad_norm=1.0,
    )

    # Training loop
    for epoch in range(num_epochs):
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model_dp(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    # End the training time measurement
    end_time = time.perf_counter()

    # Calculate total training time (includes model and engine setup)
    training_time = end_time - start_time

    # Evaluation: threshold the sigmoid output at 0.5 and compare with labels
    with torch.no_grad():
        val_outputs = model_dp(X_val_torch)
        val_predictions = (val_outputs > 0.5).float()
        correct = (val_predictions == y_val_torch).sum().item()
        dp_accuracy = correct / len(y_val)

    # Compute the privacy budget (epsilon) spent
    epsilon = privacy_engine.get_epsilon(delta=1e-5)

    results.append((noise, dp_accuracy, epsilon, training_time))

# Display results
for noise, accuracy, epsilon, time_spent in results:
    print(f"Noise: {noise}, Accuracy: {accuracy:.4f}, Epsilon: {epsilon:.4f}, Training Time: {time_spent:.2f} seconds")


  z = np.log((np.exp(t) + q - 1) / q)


Noise: 0.5, Accuracy: 0.9355, Epsilon: 5.9284, Training Time: 94.45 seconds
Noise: 1.0, Accuracy: 0.9355, Epsilon: 0.5777, Training Time: 91.72 seconds
Noise: 1.5, Accuracy: 0.9355, Epsilon: 0.3096, Training Time: 90.91 seconds
Noise: 2.0, Accuracy: 0.9355, Epsilon: 0.2157, Training Time: 102.11 seconds
Noise: 10, Accuracy: 0.9355, Epsilon: 0.0426, Training Time: 98.75 seconds




In [29]:
# Keep only the first `subset_size` validation rows (features and labels)
subset_size = 100
subset_X, subset_y = X_val_torch[:subset_size], y_val_torch[:subset_size]



In [30]:
class ActivationExtractor(nn.Module):
    """Expose the post-ReLU output of `fc3` of an existing model.

    The extractor reuses the `fc1`, `fc2` and `fc3` layer objects of
    `original_model` (no copy is made), so it shares their weights.
    """

    def __init__(self, original_model):
        """Borrow the first three layers of `original_model`.

        Args:
            original_model: module exposing `fc1`, `fc2` and `fc3` attributes.
        """
        super().__init__()
        self.fc1 = original_model.fc1
        self.fc2 = original_model.fc2
        self.fc3 = original_model.fc3

    def forward(self, x):
        """Run `x` through fc1 -> fc2 -> fc3, applying ReLU after each layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        return hidden

# Wrap the first three layers of the most recently trained DP model.
extractor = ActivationExtractor(model_dp)

# Only the activation values are needed: nothing trained on them should
# backpropagate into the DP model. Computing them under no_grad() avoids
# building an autograd graph, so the result is safe to train on even if it is
# not detached afterwards.
with torch.no_grad():
    activations = extractor(subset_X)


In [31]:
# Cut the activations loose from the autograd graph so that training on them
# does not backpropagate into the layers that produced them.
activations = activations.detach()


In [32]:
class Reconstructor(nn.Module):
    """MLP that maps activations back to inputs: input_dim -> 64 -> 128 -> output_dim.

    ReLU follows the first two layers; the last layer is linear, so the
    output is unbounded.
    """

    def __init__(self, input_dim, output_dim):
        """Create the three linear layers.

        Args:
            input_dim: size of each activation vector fed to the network.
            output_dim: size of each reconstructed vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return the reconstruction for the batch of activations `x`."""
        widened = torch.relu(self.fc2(torch.relu(self.fc1(x))))
        return self.fc3(widened)

# Seed the RNG so the reconstructor's initial weights, and hence the reported
# reconstruction error, are reproducible.
torch.manual_seed(42)

# Take the layer sizes from the tensors themselves instead of hard-coding 32
# (the width of fc3 in the original model), so this cell keeps working if the
# model architecture changes.
reconstructor = Reconstructor(activations.shape[1], subset_X.shape[1])
criterion = nn.MSELoss()
optimizer = optim.Adam(reconstructor.parameters())

# Fit the reconstructor with full-batch steps: activations in, original
# feature rows out.
num_epochs = 10
for epoch in range(num_epochs):
    optimizer.zero_grad()
    reconstructed_X = reconstructor(activations)
    loss = criterion(reconstructed_X, subset_X)
    loss.backward()
    optimizer.step()


In [33]:
# Report the reconstructor's mean squared error between its output and the
# original feature rows. NOTE: `activations` / `subset_X` are the same samples
# the reconstructor was fitted on, so this is a training error, not a
# held-out estimate.
with torch.no_grad():
    reconstructed_X = reconstructor(activations)
    mse = criterion(reconstructed_X, subset_X)
print(f"Reconstruction MSE: {mse.item()}")


Reconstruction MSE: 0.8499842882156372


In [None]:
# It's also worth noting that differential privacy provides a probabilistic guarantee, not an absolute one.
# The goal of DP is to ensure that the inclusion (or exclusion) of any single data point doesn't significantly affect the model's outputs.
# However, some aggregate information about the dataset as a whole might still be learned, which is what allows models to remain useful while preserving privacy.
# The challenge is balancing utility (model accuracy) with privacy (protection against information leakage).