In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [2]:
# Load the housing data; the path is relative, so it resolves against the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../week_02/data/Housing.csv")

In [3]:
# Keep the columns whose dtype is exactly int64 or float64.
# Note that this list still contains the target column ("price") at this point.
numerical_features = [
    name
    for name, dtype in df.dtypes.items()
    if dtype in ("int64", "float64")
]
numerical_features

['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking']

In [4]:
# Split first, so the test rows are set aside before any encoder or scaler is fitted.
target = "price"
features = df.columns.drop([target])
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    train_size=0.85,
    test_size=0.15,
    random_state=1234,  # fixed seed -> the same split on every run
)

In [5]:
# Columns stored with the generic "object" dtype are the ones treated as categorical.
categorical_features = [
    name
    for name, dtype in df.dtypes.items()
    if dtype == "object"
]

In [6]:
# Fit the one-hot encoder on the training split only, then encode that split.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop="if_binary")
ohe.fit(X_train[categorical_features])
encoded_array = ohe.transform(X_train[categorical_features])
encoded_df = pd.DataFrame(encoded_array, columns=ohe.get_feature_names_out(categorical_features))

# Non-mutating drop: the frame returned by train_test_split is not modified in place
# (in-place edits on such frames are a common source of SettingWithCopyWarning).
# The index is reset so the remaining columns line up row-by-row with encoded_df,
# which carries a default 0..n-1 index.
# NOTE: y_train is not re-indexed here, so X_train and y_train match by position only,
# not by index label.
X_train = pd.concat(
    [
        X_train.drop(columns=categorical_features).reset_index(drop=True),
        encoded_df,
    ],
    axis=1,
)

In [7]:
def apply_ohe(ohe: OneHotEncoder, X: pd.DataFrame, cat_features: list):
    """Replace the categorical columns of ``X`` with their one-hot encoding.

    ``X`` itself is not modified; a new DataFrame is returned, so callers do
    not have to pass a defensive copy (passing one remains harmless).

    Parameters
    ----------
    ohe
        An already-fitted encoder. Only ``transform`` and
        ``get_feature_names_out`` are called on it. Its ``transform`` output
        is assumed to be a dense 2-D array (e.g. ``sparse_output=False``).
    X
        Frame containing at least the columns listed in ``cat_features``.
    cat_features
        Names of the columns to encode, in the order the encoder expects.

    Returns
    -------
    pd.DataFrame
        The non-categorical columns of ``X`` followed by the encoded columns.
        Row order is preserved, but the index is reset to ``0..n-1`` (the
        original index labels of ``X`` are discarded).
    """
    encoded_df = pd.DataFrame(
        ohe.transform(X[cat_features]),
        columns=ohe.get_feature_names_out(cat_features),
    )
    # drop() without inplace returns a new frame, leaving the caller's X intact.
    # Resetting the index aligns the rows with encoded_df's default RangeIndex.
    remaining = X.drop(columns=cat_features).reset_index(drop=True)
    return pd.concat([remaining, encoded_df], axis=1)

In [8]:
# Encode the held-out features with the encoder fitted on the training split.
# A copy is handed over so the original X_test object is not modified by the call.
X_test = apply_ohe(ohe=ohe, X=X_test.copy(), cat_features=categorical_features)

In [9]:
# Baseline: k-NN regression (k=25) on the one-hot encoded features, without any scaling.
knn_model = KNeighborsRegressor(n_neighbors=25)
knn_model.fit(X=X_train, y=y_train)

In [10]:
# Score the unscaled baseline on the held-out split.
y_test_pred = knn_model.predict(X=X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_test_pred)

In [11]:
# The target is not a predictor, so take it out of the list of columns to scale.
# Rebinding with a filter (instead of list.remove) keeps this cell re-runnable:
# a second execution is a no-op rather than a ValueError for a missing "price".
numerical_features = [col for col in numerical_features if col != "price"]

# Min-max experiment: the scaler is fitted on the training rows only.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X_train[numerical_features])

# Scale copies so the unscaled frames stay available for the other experiments.
X_train_mm_scaled = X_train.copy()
X_train_mm_scaled[numerical_features] = min_max_scaler.transform(X_train_mm_scaled[numerical_features])

X_test_mm_scaled = X_test.copy()
X_test_mm_scaled[numerical_features] = min_max_scaler.transform(X_test_mm_scaled[numerical_features])

# Same model configuration as the baseline, so only the scaling differs.
knn_mm_model = KNeighborsRegressor(n_neighbors=25)
knn_mm_model.fit(X_train_mm_scaled, y_train)

y_test_mm_pred = knn_mm_model.predict(X_test_mm_scaled)
rmse_mm = root_mean_squared_error(y_test, y_test_mm_pred)

In [12]:
# Standardisation experiment: scaler statistics are learned from the training rows only.
std_scaler = StandardScaler().fit(X_train[numerical_features])

# Work on copies so the unscaled frames remain available to the other experiments.
X_train_std_scaled = X_train.copy()
X_test_std_scaled = X_test.copy()
X_train_std_scaled[numerical_features] = std_scaler.transform(X_train[numerical_features])
X_test_std_scaled[numerical_features] = std_scaler.transform(X_test[numerical_features])

# Same k as the baseline, so only the scaling differs.
knn_std_model = KNeighborsRegressor(n_neighbors=25).fit(X_train_std_scaled, y_train)

y_test_std_pred = knn_std_model.predict(X_test_std_scaled)
rmse_std = root_mean_squared_error(y_true=y_test, y_pred=y_test_std_pred)

In [13]:
# Robust-scaling experiment: the scaler is fitted on the training rows only.
rb_scaler = RobustScaler().fit(X_train[numerical_features])

# Transform the numeric columns of fresh copies; X_train / X_test stay unscaled.
X_train_rb_scaled = X_train.copy()
X_test_rb_scaled = X_test.copy()
X_train_rb_scaled[numerical_features] = rb_scaler.transform(X_train[numerical_features])
X_test_rb_scaled[numerical_features] = rb_scaler.transform(X_test[numerical_features])

# Same k as the baseline, so only the scaling differs.
knn_rb_model = KNeighborsRegressor(n_neighbors=25).fit(X_train_rb_scaled, y_train)

y_test_rb_pred = knn_rb_model.predict(X_test_rb_scaled)
rmse_rb = root_mean_squared_error(y_true=y_test, y_pred=y_test_rb_pred)

“How large is my typical prediction error, relative to the average house price?”

In [14]:
# Report each RMSE together with its ratio to the mean test-set price,
# i.e. the typical error relative to the average house price.
mean_test_price = y_test.mean()  # computed once and shared by all four lines
print(f"For None Scaled RMSE = {rmse},  Ratio: {rmse / mean_test_price}")
print(f"For Min Max Scaled RMSE = {rmse_mm},  Ratio: {rmse_mm / mean_test_price}")
print(f"For Standard Scaled RMSE = {rmse_std},  Ratio: {rmse_std / mean_test_price}")
# Label typo fixed ("Rebust" -> "Robust"); re-run the cell to refresh the stored output.
print(f"For Robust Scaled RMSE = {rmse_rb},  Ratio: {rmse_rb / mean_test_price}")

For None Scaled RMSE = 1434921.188371389,  Ratio: 0.29802270802568775
For Min Max Scaled RMSE = 1461268.4525123732,  Ratio: 0.30349484340984517
For Standard Scaled RMSE = 1166659.5823537894,  Ratio: 0.24230672102055836
For Rebust Scaled RMSE = 1241223.9619216016,  Ratio: 0.25779320104548303
