# K Nearest Neighbors (KNN) 

## Classifying personal income into:

- less than or equal to 50,000

- greater than 50,000

### Importing necessary packages

In [1]:
import os
#to work with df
import pandas as pd
#to perform numerical operations
import numpy as np
#to visualize the data
import seaborn as sns
#to partition the data
from sklearn.model_selection import train_test_split
#importing the library of KNN
from sklearn.neighbors import KNeighborsClassifier
#importing performance metrics - accuracy score and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

### Importing the data 

In [2]:
os.chdir('F:/DS tut/datasets')

In [3]:
data = pd.read_csv('income.csv', na_values=[" ?"])

### Data pre-processing 

In [4]:
data.isnull().sum()

age                 0
JobType          1809
EdType              0
maritalstatus       0
occupation       1816
relationship        0
race                0
gender              0
capitalgain         0
capitalloss         0
hoursperweek        0
nativecountry       0
SalStat             0
dtype: int64

In [5]:
missing = data[data.isnull().any(axis=1)]
#axis=1 => to consider at least one column value is missing in a row
print(missing)

       age JobType         EdType        maritalstatus occupation  \
8       17     NaN           11th        Never-married        NaN   
17      32     NaN   Some-college   Married-civ-spouse        NaN   
29      22     NaN   Some-college        Never-married        NaN   
42      52     NaN           12th        Never-married        NaN   
44      63     NaN        1st-4th   Married-civ-spouse        NaN   
...    ...     ...            ...                  ...        ...   
31892   59     NaN      Bachelors   Married-civ-spouse        NaN   
31934   20     NaN        HS-grad        Never-married        NaN   
31945   28     NaN   Some-college   Married-civ-spouse        NaN   
31967   80     NaN        HS-grad              Widowed        NaN   
31968   17     NaN           11th        Never-married        NaN   

          relationship    race   gender  capitalgain  capitalloss  \
8            Own-child   White   Female            0            0   
17             Husband   White   

1. Missing values in Jobtype    = 1809

2. Missing values in Occupation = 1816 

3. There are 1809 rows where two specific columns i.e. occupation & JobType have missing values
   
4. (1816-1809) = 7 => You still have occupation unfilled for these 7 rows. Because, jobtype is Never worked

In [6]:
#dropping na values
data2 = data.dropna(axis=0)

In [7]:
#reindexing salary status names to 0, 1
data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000':0, ' greater than 50,000':1})
print(data2['SalStat'])

0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['SalStat'] = data2['SalStat'].map({' less than or equal to 50,000':0, ' greater than 50,000':1})


In [8]:
new_data = pd.get_dummies(data2, drop_first=True)

In [9]:
#storing column names
columns_list = list(new_data.columns)
print(columns_list)

['age', 'capitalgain', 'capitalloss', 'hoursperweek', 'SalStat', 'JobType_ Local-gov', 'JobType_ Private', 'JobType_ Self-emp-inc', 'JobType_ Self-emp-not-inc', 'JobType_ State-gov', 'JobType_ Without-pay', 'EdType_ 11th', 'EdType_ 12th', 'EdType_ 1st-4th', 'EdType_ 5th-6th', 'EdType_ 7th-8th', 'EdType_ 9th', 'EdType_ Assoc-acdm', 'EdType_ Assoc-voc', 'EdType_ Bachelors', 'EdType_ Doctorate', 'EdType_ HS-grad', 'EdType_ Masters', 'EdType_ Preschool', 'EdType_ Prof-school', 'EdType_ Some-college', 'maritalstatus_ Married-AF-spouse', 'maritalstatus_ Married-civ-spouse', 'maritalstatus_ Married-spouse-absent', 'maritalstatus_ Never-married', 'maritalstatus_ Separated', 'maritalstatus_ Widowed', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protective-s

In [10]:
#seperating the input names from data
features = list(set(columns_list)-set(['SalStat']))
print(features)

['JobType_ Local-gov', 'race_ Other', 'maritalstatus_ Never-married', 'occupation_ Armed-Forces', 'maritalstatus_ Married-spouse-absent', 'race_ White', 'nativecountry_ Japan', 'nativecountry_ Mexico', 'nativecountry_ Greece', 'nativecountry_ France', 'EdType_ 7th-8th', 'relationship_ Own-child', 'JobType_ State-gov', 'nativecountry_ Cuba', 'nativecountry_ South', 'nativecountry_ Iran', 'maritalstatus_ Married-civ-spouse', 'nativecountry_ Germany', 'nativecountry_ Taiwan', 'occupation_ Farming-fishing', 'nativecountry_ Puerto-Rico', 'nativecountry_ Canada', 'nativecountry_ Holand-Netherlands', 'nativecountry_ Nicaragua', 'EdType_ 11th', 'EdType_ HS-grad', 'occupation_ Transport-moving', 'nativecountry_ Columbia', 'maritalstatus_ Widowed', 'age', 'occupation_ Handlers-cleaners', 'occupation_ Tech-support', 'EdType_ 1st-4th', 'occupation_ Priv-house-serv', 'JobType_ Self-emp-inc', 'EdType_ Doctorate', 'nativecountry_ Hungary', 'occupation_ Prof-specialty', 'EdType_ Preschool', 'nativecou

In [11]:
#storing teh output values in y
y = new_data['SalStat'].values
print(y)

[0 0 1 ... 0 0 0]


In [12]:
#storing values form input features
x = new_data[features].values
print(x)

[[0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]]


In [13]:
#splitting the data into train and test
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.3, random_state=0)

# KNN 

In [14]:
#storing k nearest neighbors classifier
KNN_classifier = KNeighborsClassifier(n_neighbors = 5)

In [15]:
#fitting the values for X and Y
KNN_classifier.fit(train_x, train_y)

KNeighborsClassifier()

In [16]:
#predicting the test values with model
prediction = KNN_classifier.predict(test_x)

In [17]:
#performance matrix check
confusionMatrix = confusion_matrix(test_y, prediction)
print(confusionMatrix)

[[6173  650]
 [ 843 1383]]


In [18]:
#calculating the accuracy
accuracy_score = accuracy_score(test_y, prediction)
print(accuracy_score)

0.8350093933031274


In [19]:
print('Misclassified samples: %d' % (test_y!=prediction).sum())

Misclassified samples: 1493


**Effect of K value on classifier**

In [21]:
Misclassified_sample = []
#calculating error for K values between 1 and 20
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(train_x, train_y)
    pred_i = knn.predict(test_x)
    Misclassified_sample.append((test_y!=pred_i).sum())
print(Misclassified_sample)

[1766, 1516, 1515, 1436, 1493, 1438, 1451, 1432, 1458, 1436, 1441, 1447, 1451, 1423, 1413, 1390, 1424, 1396, 1434]
