In [1]:
from sklearn.datasets import fetch_california_housing

'''
The Scikit-Learn California Housing Dataset

We are going to use the California housing dataset to illustrate how the KNN algorithm works. The dataset was derived from the 1990 U.S. census. One row of the dataset represents the census of one block group.

In this section, we'll go over the details of the California Housing Dataset, so you can gain an intuitive understanding of the data we'll be working with. It's very important to get to know your data before you start working on it.

A block group is the smallest geographical unit for which the U.S. Census Bureau publishes sample data. Besides block group, another term used is household, a household is a group of people residing within a home.

The dataset consists of nine attributes:

MedInc - median income in block group
HouseAge - median house age in a block group
AveRooms - the average number of rooms (provided per household)
AveBedrms - the average number of bedrooms (provided per household)
Population - block group population
AveOccup - the average number of household members
Latitude - block group latitude
Longitude - block group longitude
MedHouseVal - median house value for California districts (hundreds of thousands of dollars)

In this guide, we will use MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup, Latitude, Longitude to predict MedHouseVal. Something similar to our motivation narrative.

Let's now jump right into the implementation of the KNN algorithm for the regression.
'''

# as_frame=True loads the data in a dataframe format, with other metadata besides it
california_housing = fetch_california_housing(as_frame=True)
# Select only the dataframe part and assign it to the df variable
df = california_housing.frame

import pandas as pd
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [2]:
'''
Regression with K-Nearest Neighbors with Scikit-Learn
'''
'Preprocessing Data for KNN Regression'
y = df['MedHouseVal']
X = df.drop(['MedHouseVal'], axis = 1)

In [3]:
'By looking at our variables descriptions, we can see that we have differences in measurements. To avoid guessing, let's use the describe() method to check:'
# .T transposes the results, transforming rows into columns
X.describe().T

SyntaxError: invalid syntax (3438588800.py, line 1)

In [None]:
'Splitting Data into Train and Test Sets'

from sklearn.model_selection import train_test_split

SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SEED)
len(X)       # 20640
len(X_train) # 15480
len(X_test)  # 5160

In [None]:
'Feature Scaling for KNN Regression'
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit only on X_train
scaler.fit(X_train)

# Scale both X_train and X_test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

col_names=['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
scaled_df = pd.DataFrame(X_train, columns=col_names)
scaled_df.describe().T

In [None]:
'Training and Predicting KNN Regression'

from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=5)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [None]:
'Evaluating the Algorithm for KNN Regression'
from sklearn.metrics import mean_absolute_error, mean_squared_error

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f'mae: {mae}')
print(f'mse: {mse}')
print(f'rmse: {rmse}')
regressor.score(X_test, y_test)