In [6]:
import pandas as pd
from diffprivlib.models import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load your dataset
df = pd.read_csv("C:/Users/yadap/Desktop/DPS/Project/dataset.csv")

# Define the features and target
X = df[["Id", "Income", 'Age', 'Experience', 'Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'State', 'Current_Job_Years', 'Current_House_Years']]
y = df['Risk_Flag']

# Encode categorical features
label_encoders = {}
for column in ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'State']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
    label_encoders[column] = le

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the differentially private Naive Bayes model
epsilon = 0.5
nb_model_dp = GaussianNB(epsilon=epsilon)
nb_model_dp.fit(X_train, y_train)

# Make predictions
y_pred_dp = nb_model_dp.predict(X_test)

# Evaluate the model
accuracy_dp = accuracy_score(y_test, y_pred_dp)
classification_rep_dp = classification_report(y_test, y_pred_dp)
confusion_mat_dp = confusion_matrix(y_test, y_pred_dp)

print(f'Accuracy with DP %: {round(100 * accuracy_dp, 2)}')
print('Classification Report with DP:')
print(classification_rep_dp)
print('Confusion Matrix with DP:')
print(confusion_mat_dp)


Accuracy with DP %: 83.07
Classification Report with DP:
              precision    recall  f1-score   support

           0       0.85      0.83      0.84      7243
           1       0.81      0.83      0.82      6199

    accuracy                           0.83     13442
   macro avg       0.83      0.83      0.83     13442
weighted avg       0.83      0.83      0.83     13442

Confusion Matrix with DP:
[[6012 1231]
 [1045 5154]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[column] = le.fit_transform(X[column])


In [17]:
import numpy as np

# Define epsilon values to test
epsilons = [0.5, 1, 2, 4]

# Initialize lists to store results
accuracy_list = []
precision_list = []
recall_list = []

for epsilon in epsilons:
    nb_model_dp = GaussianNB(epsilon=epsilon)
    nb_model_dp.fit(X_train, y_train)
    y_pred_dp = nb_model_dp.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred_dp)
    report = classification_report(y_test, y_pred_dp, output_dict=True)
    
    accuracy_list.append(accuracy)
    precision_list.append(report['weighted avg']['precision'])
    recall_list.append(report['weighted avg']['recall'])

# Create a DataFrame to display the results
results_df = pd.DataFrame({
    'Epsilon': epsilons,
    'Accuracy': accuracy_list,
    'Precision': precision_list,
    'Recall': recall_list
})

print(results_df)




   Epsilon  Accuracy  Precision    Recall
0      0.5  0.826588   0.826901  0.826588
1      1.0  0.840128   0.840322  0.840128
2      2.0  0.836185   0.836315  0.836185
3      4.0  0.841021   0.841156  0.841021
