In [44]:
# imported necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

In [45]:
# Dataset is loaded using pandas
data = pd.read_csv("UniversalBank.csv")

In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [47]:
# one hot encoding for the "Education" variable
data = pd.get_dummies(data, columns=["Education"], prefix="Education")

In [48]:
data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3
0,1,25,1,49,91107,4,1.6,0,0,1,0,0,0,1,0,0
1,2,45,19,34,90089,3,1.5,0,0,1,0,0,0,1,0,0
2,3,39,15,11,94720,1,1.0,0,0,0,0,0,0,1,0,0
3,4,35,9,100,94112,1,2.7,0,0,0,0,0,0,0,1,0
4,5,35,8,45,91330,4,1.0,0,0,0,0,0,1,0,1,0


In [49]:
# Seperating independent variable and removing 'ID' and 'ZIP Code' variables
X = data.drop(['ID', 'ZIP Code', 'Personal Loan'], axis=1)
# Seperating dependent variable
y = data['Personal Loan']

In [50]:
X.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3
0,25,1,49,4,1.6,0,1,0,0,0,1,0,0
1,45,19,34,3,1.5,0,1,0,0,0,1,0,0
2,39,15,11,1,1.0,0,0,0,0,0,1,0,0
3,35,9,100,1,2.7,0,0,0,0,0,0,1,0
4,35,8,45,4,1.0,0,0,0,0,1,0,1,0


In [51]:
# Split the data into training 75% and validation 25% sets
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.25, random_state=123)

In [52]:
# Creating the new customer's data
customer_inp = pd.DataFrame({
    'Age': [40],
    'Experience': [10],
    'Income': [84],
    'Family': [2],
    'CCAvg': [2],
    'Mortgage': [0],
    'Securities Account': [1],
    'CD Account': [1],
    'Online': [1],
    'CreditCard': [1],
    'Education_1': [0],
    'Education_2': [1],
    'Education_3': [0],
})

In [53]:
# Standardize the data for consistency
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation = scaler.transform(X_validation)
customer_inp = scaler.transform(customer_inp)

In [54]:
# Performing k-NN classification with k = 1
knn_classifier = KNeighborsClassifier(n_neighbors=1)
# Fitting the training data to k-NN
knn_classifier.fit(X_train, y_train)

In [55]:
# Predicting whether the customer will accept a loan offer
customer_classification = knn_classifier.predict(customer_inp)
print(customer_classification)

[0]


In [56]:
k_values = range(1, 21)
validation_scores = []

In [57]:
# Finding the optimal value of k using the validation data set with range of values
for k in k_values:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train, y_train)
    validation_scores.append(knn_classifier.score(X_validation, y_validation))

optimal_k = k_values[np.argmax(validation_scores)]

In [58]:
# The optimal k is typically chosen based on the best performance metric, such as accuracy or F1-score, on the validation data.
print(optimal_k)

1


In [59]:
# Print the confusion matrix for the validation data using the optimal k
knn_classifier = KNeighborsClassifier(n_neighbors=optimal_k)
knn_classifier.fit(X_train, y_train)
validation_predictions = knn_classifier.predict(X_validation)
confusion_matrix_validation = confusion_matrix(y_validation, validation_predictions)
print("Confusion Matrix (Validation Data):\n", confusion_matrix_validation)

Confusion Matrix (Validation Data):
 [[1108   17]
 [  28   97]]


In [60]:
# Classifying the new customer using the best k
new_customer_classification = knn_classifier.predict(customer_inp)
print("Classification of the new customer:", new_customer_classification)

Classification of the new customer: [0]


In [61]:
# Repartition the data into training, validation, and test sets with 50%, 30%, 20% respectively
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.5, random_state=123)
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.6, random_state=123)

In [62]:
# Standardize the data for the new partitions
train_data_std = scaler.fit_transform(X_train)
validation_data_std = scaler.transform(X_validation)
test_data_std = scaler.transform(X_test)

In [63]:
# Applying the k-NN method with the optimal k chosen above on the test set.
knn_classifier = KNeighborsClassifier(n_neighbors=optimal_k)
knn_classifier.fit(X_train, y_train)
test_predictions = knn_classifier.predict(X_test)

In [64]:
# Computing the confusion matrices for training, validation, and test sets
train_confusion = confusion_matrix(y_train, knn_classifier.predict(X_train))
validation_confusion = confusion_matrix(y_validation, knn_classifier.predict(X_validation))
test_confusion = confusion_matrix(y_test, test_predictions)

print("Confusion Matrix for Training Data:")
print(train_confusion)

print("Confusion Matrix for Validation Data:")
print(validation_confusion)

print("Confusion Matrix for Test Data:")
print(test_confusion)


Confusion Matrix for Training Data:
[[2258    0]
 [   0  242]]
Confusion Matrix for Validation Data:
[[861  40]
 [ 56  43]]
Confusion Matrix for Test Data:
[[1295   66]
 [  83   56]]


---> From the above metrices, with no false positives or false negatives,
 the model gave excellent performance on the training set. The model may not be generalizing effectively to new data because it has overfitted to the training set, which indicates that it has learnt the training
 set too well.

--->Compared to the training data, the model's performance on the
 validation data was marginally poorer. Even though there are a few false
 positives and false negatives, the accuracy is still very high overall.

--->Compared to the validation data, the model's performance was
 considerably poorer on the test data. The accuracy is lower overall and there are more false positives and false negatives.

--->The number of false positives and false negatives is the main
 difference between the confusion matrices for the training, validation,
 and test data. From the training data to the validation data to the test data, there are more false positives and false negatives. This shows that the model is not well generalizing to new data and is overfitting to the training set.

--->The main possible reason for the model's perfect performance on
 training data but its poor performance on validation and test data could be because it has memorized(overfitting) the training data and is unable to generalize to unknown data.