In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import torch

# Load data
data = pd.read_csv("InsuraceTrain.csv")
data.head()

Unnamed: 0,policy_id,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
0,ID00001,0.515874,0.05,0.644231,C1,4990,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
1,ID00002,0.672619,0.02,0.375,C2,27003,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
2,ID00003,0.84111,0.02,0.384615,C3,4076,1,A,M1,CNG,...,No,No,No,Yes,No,No,No,Yes,0,0
3,ID00004,0.900277,0.11,0.432692,C4,21622,1,C1,M2,Petrol,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,2,0
4,ID00005,0.596403,0.11,0.634615,C5,34738,2,A,M3,Petrol,...,No,Yes,Yes,Yes,No,Yes,Yes,Yes,2,0


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58592 entries, 0 to 58591
Data columns (total 44 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   policy_id                         58592 non-null  object 
 1   policy_tenure                     58592 non-null  float64
 2   age_of_car                        58592 non-null  float64
 3   age_of_policyholder               58592 non-null  float64
 4   area_cluster                      58592 non-null  object 
 5   population_density                58592 non-null  int64  
 6   make                              58592 non-null  int64  
 7   segment                           58592 non-null  object 
 8   model                             58592 non-null  object 
 9   fuel_type                         58592 non-null  object 
 10  max_torque                        58592 non-null  object 
 11  max_power                         58592 non-null  object 
 12  engi

In [38]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler


data.drop('policy_id', axis=1, inplace=True)

# Separate target and features
X = data.drop('is_claim', axis=1)
y = data['is_claim']

# Identify categorical columns
categorical_columns = X.select_dtypes(include=['object']).columns.tolist()

# Re-define the one-hot encoder
encoder = OneHotEncoder(drop='first', sparse=False)

# One-hot encode categorical columns using the updated method
encoded_features = encoder.fit_transform(X[categorical_columns])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

# Replace categorical columns with encoded columns in original dataframe
X = pd.concat([X, encoded_df], axis=1)
X.drop(categorical_columns, axis=1, inplace=True)

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_val.shape



((46873, 100), (11719, 100))

In [39]:

X_train_tensor = torch.tensor(X_train.astype('float32'))
y_train_tensor = torch.tensor(y_train.values.astype('float32'))
X_val_tensor = torch.tensor(X_val.astype('float32'))
y_val_tensor = torch.tensor(y_val.values.astype('float32'))


In [42]:
import time
def knn_pytorch(X_train, y_train, X_val, k=3):
    y_preds = []
    # Start timing
    start_time = time.time()
    for val_point in X_val:
        distances = torch.norm(X_train - val_point, dim=1)
        _, indices = distances.topk(k, largest=False)
        k_nearest_labels = y_train[indices]
        prediction = torch.mode(k_nearest_labels).values.item()
        y_preds.append(prediction)
    # End timing
    end_time = time.time()
    classification_time = end_time - start_time
    return torch.tensor(y_preds), classification_time

# Apply PyTorch-based KNN and evaluate
k = 3
y_pred_tensor, classification_time = knn_pytorch(X_train_tensor, y_train_tensor, X_val_tensor, k=k)

# Convert predictions to numpy and calculate accuracy
y_pred = y_pred_tensor.numpy()
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Classification Time: {classification_time:.2f} seconds")

Accuracy: 92.25%
Classification Time: 75.16 seconds


In [18]:

def dp_knn(X_train, y_train, X_val, y_val, epsilon=1.0):
    # Compute standard deviation based on epsilon for Gaussian noise
    sigma = np.sqrt(2 * np.log(1.25 / 0.01)) / epsilon

    # Add Gaussian noise to the data
    noisy_X_train = X_train + np.random.normal(0, sigma, X_train.shape)

    # Train and evaluate KNN
    knn = KNeighborsClassifier(n_neighbors=3)

    # Start timing
    start_time = time.time()

    knn.fit(noisy_X_train, y_train)
    y_pred = knn.predict(X_val)

    # End timing
    end_time = time.time()
    training_time = end_time - start_time

    accuracy = accuracy_score(y_val, y_pred)
    return accuracy, training_time

# Hyperparameters
epsilons = [0.5, 1.0, 1.5, 2.0, 3.0, 4.0]
results = []

for eps in epsilons:
    accuracy, training_time = dp_knn(X_train, y_train, X_val, y_val, epsilon=eps)
    results.append((eps, accuracy, training_time))
    print(f"DP KNN Accuracy with epsilon {eps}: {accuracy:.4f}, Training Time: {training_time:.2f} seconds")

# Display results
for eps, accuracy, training_time in results:
    print(f"Epsilon: {eps}, Accuracy: {accuracy:.4f}, Training Time: {training_time:.2f} seconds")


DP KNN Accuracy with epsilon 0.5: 0.9355, Training Time: 8.28 seconds
DP KNN Accuracy with epsilon 1.0: 0.7922, Training Time: 5.76 seconds
DP KNN Accuracy with epsilon 1.5: 0.9269, Training Time: 8.09 seconds
DP KNN Accuracy with epsilon 2.0: 0.9252, Training Time: 5.69 seconds
DP KNN Accuracy with epsilon 3.0: 0.9335, Training Time: 7.74 seconds
DP KNN Accuracy with epsilon 4.0: 0.9182, Training Time: 5.75 seconds
Epsilon: 0.5, Accuracy: 0.9355, Training Time: 8.28 seconds
Epsilon: 1.0, Accuracy: 0.7922, Training Time: 5.76 seconds
Epsilon: 1.5, Accuracy: 0.9269, Training Time: 8.09 seconds
Epsilon: 2.0, Accuracy: 0.9252, Training Time: 5.69 seconds
Epsilon: 3.0, Accuracy: 0.9335, Training Time: 7.74 seconds
Epsilon: 4.0, Accuracy: 0.9182, Training Time: 5.75 seconds
