1.Importing necessary library

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

2. Load the dataset and cleaning the dataset to eliminate missing values and duplicate values

In [2]:
df = pd.read_csv("adult.csv")
print(df)

       age workclass  fnlwgt     education  education.num      marital.status  \
0       90         ?   77053       HS-grad              9             Widowed   
1       82   Private  132870       HS-grad              9             Widowed   
2       66         ?  186061  Some-college             10             Widowed   
3       54   Private  140359       7th-8th              4            Divorced   
4       41   Private  264663  Some-college             10           Separated   
...    ...       ...     ...           ...            ...                 ...   
32556   22   Private  310152  Some-college             10       Never-married   
32557   27   Private  257302    Assoc-acdm             12  Married-civ-spouse   
32558   40   Private  154374       HS-grad              9  Married-civ-spouse   
32559   58   Private  151910       HS-grad              9             Widowed   
32560   22   Private  201490       HS-grad              9       Never-married   

              occupation   

3.Preprocessing the data

In [3]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)


4.Encoding the categorical variable


In [4]:
df = pd.get_dummies(df, drop_first=True)


5.Encoding the target variables

In [5]:
df["income"] = df["income_>50K"].apply(lambda x: 1 if x else 0)

6.Separating the features(X) and target(y)

In [6]:
X = df.drop("income", axis=1)
y = df["income"]

7.Splitting the dataset into training and testing sets, here 80% of the data is assigned as the training and 20% as testing 

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

8.Standardising the feature

In [8]:
scaler = StandardScaler()


9.Transform and fit the training data and testing data

In [9]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

10.Impelmenting the k-NN algorithm with the default parameters 
a)initialization

In [10]:
knn = KNeighborsClassifier()



b)training the model

In [11]:
knn.fit(X_train, y_train)

c)making predictions on the test set

In [12]:
y_pred = knn.predict(X_test)

d)Evaluation of the default model

In [13]:
print("The default performance of k-NN")
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("The Confusion Matrix is: ")
print(confusion_matrix(y_test, y_pred))
print("The Classification Report is: ")
print(classification_report(y_test, y_pred))

The default performance of k-NN
Accuracy:  0.9505224339274739
The Confusion Matrix is: 
[[4885  103]
 [ 219 1301]]
The Classification Report is: 
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4988
           1       0.93      0.86      0.89      1520

    accuracy                           0.95      6508
   macro avg       0.94      0.92      0.93      6508
weighted avg       0.95      0.95      0.95      6508



11.Hyperparameter tuning done using the gridsearchcv

a)Defining the parameter grid

In [14]:
params = {
    "n_neighbors": range(1, 21),
    "weights": ["uniform", "distance"],
    "metric": ["euclidean", "manhattan", "minkowski"]
}

b)Initialization of gridsearchcv

In [15]:
search_grid = GridSearchCV(knn, params, n_jobs=-1, cv=5, scoring="accuracy")


c)Fitting the gridsearchcv to the data

In [16]:
search_grid.fit(X_train, y_train)

d)Retrieving the best parameters and estimators

In [17]:
params_best = search_grid.best_params_
best_kNN = search_grid.best_estimator_

e)Making the prediction using the optimized model

In [18]:
optimized_model_y_pred = best_kNN.predict(X_test)

f)Evaluating the optimized model

In [19]:
print("The optimized model performance: ")
print("Accuracy: ", accuracy_score(y_test, optimized_model_y_pred))
print("The Confusion Matrix is: ")
print(confusion_matrix(y_test, optimized_model_y_pred))
print("The Classification Report is: ")
print(classification_report(y_test, optimized_model_y_pred))
print("The best parameters are: ", params_best)


The optimized model performance: 
Accuracy:  0.9652735095267363
The Confusion Matrix is: 
[[4945   43]
 [ 183 1337]]
The Classification Report is: 
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      4988
           1       0.97      0.88      0.92      1520

    accuracy                           0.97      6508
   macro avg       0.97      0.94      0.95      6508
weighted avg       0.97      0.97      0.96      6508

The best parameters are:  {'metric': 'manhattan', 'n_neighbors': 8, 'weights': 'distance'}
