In [127]:
'''
https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/
'''

'''
Classification using K-Nearest Neighbors with Scikit-Learn
'''
'''
The Scikit-Learn California Housing Dataset

We are going to use the California housing dataset to illustrate how the KNN algorithm works. The dataset was derived from the 1990 U.S. census. One row of the dataset represents the census of one block group.

In this section, we'll go over the details of the California Housing Dataset, so you can gain an intuitive understanding of the data we'll be working with. It's very important to get to know your data before you start working on it.

A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data. Besides block group, another term used is household; a household is a group of people residing within a home.

The dataset consists of nine attributes:

MedInc - median income in block group
HouseAge - median house age in a block group
AveRooms - the average number of rooms (provided per household)
AveBedrms - the average number of bedrooms (provided per household)
Population - block group population
AveOccup - the average number of household members
Latitude - block group latitude
Longitude - block group longitude
MedHouseVal - median house value for California districts (hundreds of thousands of dollars)

In this guide, we will use MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude to predict MedHouseVal, which is similar to our motivation narrative.

Let's now jump right into the implementation of the KNN algorithm for classification.
'''
from pathlib import Path

from sklearn.datasets import fetch_california_housing

# as_frame=True loads the data in a dataframe format, with other metadata besides it
california_housing = fetch_california_housing(as_frame=True)
# Select only the dataframe part and assign it to the df variable
df = california_housing.frame

import pandas as pd

# Save a CSV snapshot of the raw frame. The destination is built from the
# current user's home directory rather than a hard-coded /Users/<name> path,
# so the cell also runs on machines other than the original author's.
csv_path = Path.home() / 'Downloads' / 'internet_example_3_knn_classification.csv'
# Make sure the target directory exists before writing.
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

# Last expression of the cell: show the first rows as a quick sanity check.
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [128]:
'Preprocessing Data for Classification'
# Turn the continuous MedHouseVal target into four quantile-based classes
# (labels 1..4, assigned to the bins in ascending order of house value) and
# store them on the frame as MedHouseValCat.
df["MedHouseValCat"] = pd.qcut(
    df["MedHouseVal"],
    q=4,
    labels=[1, 2, 3, 4],
    retbins=False,
)

# Target: the class label. Features: every column except the raw value and
# the class derived from it.
y = df["MedHouseValCat"]
X = df.drop(columns=["MedHouseVal", "MedHouseValCat"])

'Splitting Data into Train and Test Sets'
from sklearn.model_selection import train_test_split

In [129]:
# Display the target Series (the MedHouseValCat class labels) for inspection.
y

0        4
1        4
2        4
3        4
4        4
        ..
20635    1
20636    1
20637    1
20638    1
20639    1
Name: MedHouseValCat, Length: 20640, dtype: category
Categories (4, int64): [1 < 2 < 3 < 4]

In [130]:
# Display the feature frame (df without MedHouseVal and MedHouseValCat).
X


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [131]:
# Fixed seed so the shuffle performed by the split is reproducible.
SEED = 42

# Hold out 25% of the rows for testing; the remaining 75% are used for training.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=SEED,
)


'Feature Scaling for Classification'
from sklearn.preprocessing import StandardScaler

In [132]:
# Take a small sample of test rows to inspect.
# NOTE: iloc slicing is end-exclusive, so [2:5] selects positions 2, 3 and 4
# (three rows), despite what the variable name suggests.
rows_2_to_5 = X_test.iloc[2:5]
print(rows_2_to_5)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
15663  3.4801      52.0  3.977155   1.185877      1310.0  1.360332     37.80   
20484  5.7376      17.0  6.163636   1.020202      1705.0  3.444444     34.28   
9814   3.7250      34.0  5.492991   1.028037      1063.0  2.483645     36.62   

       Longitude  
15663    -122.44  
20484    -118.72  
9814     -121.93  


In [133]:
# Fit the scaler on the training split only, then reuse that fitted scaler
# for every other split so they are all transformed with the same statistics.
scaler = StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# The sample rows were taken from the unscaled X_test, so scale them as well.
rows_2_to_5 = scaler.transform(rows_2_to_5)

'Training and Predicting for Classification'
from sklearn.neighbors import KNeighborsClassifier

# Baseline model with the library-default hyperparameters; fit() returns the
# estimator itself, so construction and fitting can be chained.
classifier = KNeighborsClassifier().fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
'Evaluating KNN for Classification'
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#importing Seaborn's to use the heatmap
import seaborn as sns

# Accuracy computed from the predictions we already have. This is the same
# quantity classifier.score(X_test, y_test) returns, but it avoids running a
# second full nearest-neighbour prediction over the test set.
acc = accuracy_score(y_test, y_pred)
# Print explicitly: a bare `acc` in the middle of a cell is never displayed.
print(f'Accuracy: {acc}')  # 0.6191860465116279 in the original run

# Adding classes names for better interpretation
classes_names = ['class 1','class 2','class 3', 'class 4']
cm = pd.DataFrame(confusion_matrix(y_test, y_pred),
                  columns=classes_names, index = classes_names)

# Seaborn's heatmap to better visualize the confusion matrix
sns.heatmap(cm, annot=True, fmt='d');

# Per-class precision / recall / F1 for the baseline classifier.
print(classification_report(y_test, y_pred))

'Finding the Best K for KNN Classification'
from sklearn.metrics import f1_score

# f1s[j] holds the weighted F1 score obtained with K = j + 1.
f1s = []

# Calculating f1 score for K values from 1 to 39 (range's upper bound is exclusive)
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # using average='weighted' to calculate a weighted average for the 4 classes
    f1s.append(f1_score(y_test, pred_i, average='weighted'))

              precision    recall  f1-score   support

           1       0.75      0.78      0.76      1292
           2       0.49      0.56      0.53      1283
           3       0.51      0.51      0.51      1292
           4       0.76      0.62      0.69      1293

    accuracy                           0.62      5160
   macro avg       0.63      0.62      0.62      5160
weighted avg       0.63      0.62      0.62      5160



In [None]:
import matplotlib.pyplot as plt

# One wide figure with a single axes, driven through the explicit Axes API.
fig, ax = plt.subplots(figsize=(12, 6))

# x-values mirror the K values used to fill f1s (1..39).
ax.plot(
    range(1, 40),
    f1s,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=10,
)
ax.set_title('F1 Score K Value')
ax.set_xlabel('K Value')
ax.set_ylabel('F1 Score')

In [None]:
'''
From the output, we can see that the F1 score is highest when K is 15.
Let's retrain our classifier with 15 neighbors and see what it does to our classification report results:
'''
# Retrain with K=15 and report per-class metrics on the same test split.
# fit() returns the estimator, so construction and fitting are chained.
classifier15 = KNeighborsClassifier(n_neighbors=15).fit(X_train, y_train)
y_pred15 = classifier15.predict(X_test)

report15 = classification_report(y_test, y_pred15)
print(report15)