In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import time
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('train.csv')

# Exploratory Data Analysis (EDA)
# Display basic info and summary statistics
print(data.info())
print(data.describe())

# Checking for missing values
print(data.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67463 entries, 0 to 67462
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            67463 non-null  int64  
 1   Loan Amount                   67463 non-null  int64  
 2   Funded Amount                 67463 non-null  int64  
 3   Funded Amount Investor        67463 non-null  float64
 4   Term                          67463 non-null  int64  
 5   Batch Enrolled                67463 non-null  object 
 6   Interest Rate                 67463 non-null  float64
 7   Grade                         67463 non-null  object 
 8   Sub Grade                     67463 non-null  object 
 9   Employment Duration           67463 non-null  object 
 10  Home Ownership                67463 non-null  float64
 11  Verification Status           67463 non-null  object 
 12  Payment Plan                  67463 non-null  object 
 13  L

In [2]:

# Encoding categorical variables
label_encoders = {}
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    data[col] = label_encoders[col].fit_transform(data[col])

# Separating features and target variable
X = data.drop('Loan Status', axis=1)
y = data['Loan Status']

# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [6]:
# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Measuring training time
start_time = time.time()
knn.fit(X_train, y_train)
training_time = time.time() - start_time
print(f"Training time: {training_time} seconds")

# Making predictions and calculating accuracy
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Training time: 0.015697240829467773 seconds
Accuracy: 0.9047654339287038


In [10]:
def dp_knn(X_train, y_train, X_test, y_test, epsilon=1.0):
    # Compute standard deviation based on epsilon for Gaussian noise
    sigma = np.sqrt(2 * np.log(1.25 / 0.01)) / epsilon

    # Add Gaussian noise to the data
    noisy_X_train = X_train + np.random.normal(0, sigma, X_train.shape)

    # Train KNN
    knn = KNeighborsClassifier(n_neighbors=3)
    start_time = time.time()
    knn.fit(noisy_X_train, y_train)
    training_time = time.time() - start_time

    # Predict and evaluate
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy, training_time

#

In [12]:
# Epsilon values for differential privacy
epsilons = [0.5, 1.0, 1.5, 2.0, 3.0, 4.0]

results = []
for epsilon in epsilons:
    accuracy, training_time = dp_knn(X_train, y_train, X_test, y_test, epsilon)
    results.append((epsilon, accuracy, training_time))

# Print results
for epsilon, accuracy, training_time in results:
    print(f"Epsilon: {epsilon}, Accuracy: {accuracy:.2f}, Training Time: {training_time:.2f} seconds")


Epsilon: 0.5, Accuracy: 0.91, Training Time: 0.01 seconds
Epsilon: 1.0, Accuracy: 0.90, Training Time: 0.01 seconds
Epsilon: 1.5, Accuracy: 0.90, Training Time: 0.01 seconds
Epsilon: 2.0, Accuracy: 0.89, Training Time: 0.01 seconds
Epsilon: 3.0, Accuracy: 0.90, Training Time: 0.01 seconds
Epsilon: 4.0, Accuracy: 0.89, Training Time: 0.01 seconds
