In [5]:
import pandas as pd
import xgboost as xgb
import numpy as np
import time
from sklearn.metrics import accuracy_score


# Read the insurance training CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer="InsuraceTrain.csv")
data.head(5)

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [None]:
# Print pandas' structural summary of `data` (columns, dtypes, non-null counts).
data.info()

In [6]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Drop the row identifier without mutating in place. errors='ignore' keeps this
# cell re-runnable once `policy_id` has already been removed from `data`.
data = data.drop(columns=['policy_id'], errors='ignore')

# Separate target and features
X = data.drop(columns=['is_claim'])
y = data['is_claim']

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Dense one-hot encoder. scikit-learn 1.2 renamed `sparse` to `sparse_output`
# and 1.4 removed the old keyword, so try the new name first and fall back
# for older installations.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

# One-hot encode the categorical columns. Reusing X's index guarantees the
# concat below aligns row-for-row even if `data` does not carry a default
# RangeIndex.
encoded_features = encoder.fit_transform(X[categorical_columns])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=X.index,
)

# Numeric columns (original order) followed by the encoded columns
X_encoded = pd.concat([X.drop(columns=categorical_columns), encoded_df], axis=1)

# Split BEFORE scaling so the scaler's mean/variance are estimated from the
# training rows only; fitting on all rows would leak validation statistics.
X_train_unscaled, X_val_unscaled, y_train, y_val = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42
)

# Standardize features using training-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_val = scaler.transform(X_val_unscaled)

# `X` stays a fully scaled array (now using the train-fitted scaler) so any
# later cell that reads it still receives the same kind of object.
X = scaler.transform(X_encoded)

X_train.shape, X_val.shape




((46873, 100), (11719, 100))

In [7]:
def standard_xgboost(X_train, y_train, X_val, y_val, num_boost_round=5, threshold=0.5):
    """Train a binary-logistic XGBoost model and score it on the validation set.

    Parameters
    ----------
    X_train, y_train : training features and labels, passed to ``xgb.DMatrix``.
    X_val, y_val : validation features and labels, passed to ``xgb.DMatrix``.
    num_boost_round : int, default 5
        Number of boosting rounds handed to ``xgb.train``.
    threshold : float, default 0.5
        Predicted probabilities strictly above this value are labelled 1,
        everything else 0.

    Returns
    -------
    (accuracy, training_time) : tuple
        Validation accuracy from ``accuracy_score`` and the wall-clock seconds
        spent inside ``xgb.train`` only (DMatrix construction and prediction
        are not timed).
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'
    }

    # Time only the boosting itself
    start_time = time.time()
    bst = xgb.train(params, dtrain, num_boost_round=num_boost_round)
    training_time = time.time() - start_time

    # Convert predicted probabilities into hard 0/1 labels
    y_pred_proba = bst.predict(dval)
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy, training_time

# Fit the standard model once, then unpack and report its validation metrics
baseline_metrics = standard_xgboost(X_train, y_train, X_val, y_val)
accuracy, training_time = baseline_metrics
print(f"Standard XGBoost Accuracy: {accuracy:.4f}, Training Time: {training_time:.2f} seconds")

Standard XGBoost Accuracy: 0.9355, Training Time: 0.53 seconds


In [8]:

def dp_xgboost(X_train, y_train, X_val, y_val, epsilon=1.0, delta=0.01,
               sensitivity=1.0, num_boost_round=5, threshold=0.5,
               random_state=None):
    """Train XGBoost on Gaussian-perturbed training features and score it on
    the unperturbed validation set.

    The noise standard deviation is

        sigma = sensitivity * sqrt(2 * ln(1.25 / delta)) / epsilon

    With the defaults (``delta=0.01``, ``sensitivity=1.0``) this is exactly the
    value the function always used.

    NOTE(review): noise is added to the feature matrix only; labels are used
    as-is and no clipping is applied here. A formal (epsilon, delta) guarantee
    presumably requires each row's contribution to be bounded by
    ``sensitivity`` before calling this function -- confirm before reporting
    these runs as differentially private.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels (never perturbed).
    epsilon : float, default 1.0
        Privacy parameter; must be > 0. Smaller values mean more noise.
    delta : float, default 0.01
        Second privacy parameter of the noise formula; must lie in (0, 1).
    sensitivity : float, default 1.0
        Multiplier on the noise scale; must be >= 0.
    num_boost_round : int, default 5
        Number of boosting rounds handed to ``xgb.train``.
    threshold : float, default 0.5
        Predicted probabilities strictly above this value are labelled 1.
    random_state : int, numpy Generator/SeedSequence or None, default None
        Seed for the noise. ``None`` draws from NumPy's global RNG (the
        original behaviour, controllable with ``np.random.seed``); any other
        value is passed to ``np.random.default_rng``.

    Returns
    -------
    (accuracy, training_time) : tuple
        Validation accuracy and the wall-clock seconds spent in ``xgb.train``.

    Raises
    ------
    ValueError
        If ``epsilon`` is not > 0, ``delta`` is outside (0, 1), or
        ``sensitivity`` is negative.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if not 0 < delta < 1:
        raise ValueError(f"delta must be in (0, 1), got {delta!r}")
    if not sensitivity >= 0:
        raise ValueError(f"sensitivity must be >= 0, got {sensitivity!r}")

    # Compute standard deviation for Gaussian noise
    sigma = sensitivity * np.sqrt(2 * np.log(1.25 / delta)) / epsilon

    # Add Gaussian noise to the training data
    if random_state is None:
        noise = np.random.normal(0, sigma, X_train.shape)
    else:
        noise = np.random.default_rng(random_state).normal(0, sigma, X_train.shape)
    noisy_X_train = X_train + noise

    # Train on noisy features, evaluate on clean validation features
    dtrain_noisy = xgb.DMatrix(noisy_X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss'
    }

    # Time only the boosting itself (noise generation is excluded)
    start_time = time.time()
    bst = xgb.train(params, dtrain_noisy, num_boost_round=num_boost_round)
    training_time = time.time() - start_time

    # Convert predicted probabilities into hard 0/1 labels
    y_pred_proba = bst.predict(dval)
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy, training_time

# Hyperparameters
epsilons = [0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0]
results = []

# Seed NumPy's global RNG so the Gaussian noise drawn inside dp_xgboost is the
# same on every Restart & Run All; without this the sweep is not reproducible.
NOISE_SEED = 42
np.random.seed(NOISE_SEED)

for eps in epsilons:
    accuracy, training_time = dp_xgboost(X_train, y_train, X_val, y_val, epsilon=eps)
    results.append((eps, accuracy, training_time))
    print(f"DP XGBoost Accuracy with epsilon {eps}: {accuracy:.4f}, Training Time: {training_time:.2f} seconds")

# Display results
for eps, accuracy, training_time in results:
    print(f"Epsilon: {eps}, Accuracy: {accuracy:.4f}, Training Time: {training_time:.2f} seconds")


DP XGBoost Accuracy with epsilon 0.5: 0.9355, Training Time: 0.85 seconds
DP XGBoost Accuracy with epsilon 1.0: 0.9355, Training Time: 0.89 seconds
DP XGBoost Accuracy with epsilon 1.5: 0.9355, Training Time: 2.16 seconds
DP XGBoost Accuracy with epsilon 2.0: 0.9355, Training Time: 4.54 seconds
DP XGBoost Accuracy with epsilon 3.0: 0.9355, Training Time: 1.53 seconds
DP XGBoost Accuracy with epsilon 4.0: 0.9355, Training Time: 0.88 seconds
DP XGBoost Accuracy with epsilon 5.0: 0.9355, Training Time: 0.86 seconds
DP XGBoost Accuracy with epsilon 6.0: 0.9355, Training Time: 0.87 seconds
DP XGBoost Accuracy with epsilon 10.0: 0.9355, Training Time: 0.89 seconds
Epsilon: 0.5, Accuracy: 0.9355, Training Time: 0.85 seconds
Epsilon: 1.0, Accuracy: 0.9355, Training Time: 0.89 seconds
Epsilon: 1.5, Accuracy: 0.9355, Training Time: 2.16 seconds
Epsilon: 2.0, Accuracy: 0.9355, Training Time: 4.54 seconds
Epsilon: 3.0, Accuracy: 0.9355, Training Time: 1.53 seconds
Epsilon: 4.0, Accuracy: 0.9355, T