# Hello there! Let's do some KNN on real estate

The basic idea is to use King County real estate information, freely available from the King County Assessor's office, and see whether we can figure out what a house is worth.

We do this by looking at the K nearest neighbors of a house (K is a number we choose), where "nearest" is determined by a distance measure.

In [None]:
# First step is to load the packages that are important

import pandas as pd
import numpy as np
import random
from scipy.spatial import KDTree
from sklearn.metrics import mean_absolute_error,  mean_squared_error
import sys
import matplotlib.pyplot as plt

In [None]:
# Load the geocoded King County data from CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.

df = pd.read_csv('../../data/king_county_data_geocoded.csv')
df

In [None]:
# Show the column names of the loaded data (candidates for features/target).
df.columns

In [None]:
from sklearn.neighbors import KNeighborsRegressor

def regress(X, y, X_test, y_test, n_neighbors=2, metric='minkowski', p=2):
    """Fit a KNN regressor and return its mean absolute error on a test set.

    Args:
        X: Training features, passed to ``KNeighborsRegressor.fit``.
        y: Training targets, passed to ``KNeighborsRegressor.fit``.
        X_test: Features the fitted model predicts on.
        y_test: True targets the predictions are compared against.
        n_neighbors: Number of neighbors (the K in KNN).
        metric: Distance metric name forwarded to the regressor.
        p: Power parameter forwarded to the regressor; with 'minkowski',
            p=1 is manhattan distance and p=2 is euclidean distance.

    Returns:
        The mean absolute error between ``y_test`` and the predictions,
        rounded to 2 decimal places.
    """
    reg = KNeighborsRegressor(n_neighbors=n_neighbors, metric=metric, p=p)
    reg.fit(X, y)

    predictions = reg.predict(X_test)
    return round(mean_absolute_error(y_test, predictions), 2)

from sklearn.preprocessing import MinMaxScaler

# Target: the appraised value, in dollar amounts.
y = df['AppraisedValue']

# Features: location plus lot size.
# Feel free to experiment with other columns, e.g. everything but the target:
# X = df[df.columns[~df.columns.isin(['AppraisedValue'])]]
#
# The data comes in biased towards Seattle and towards houses larger than
# 900 sqft, so the features are normalized before measuring distances.
# MinMaxScaler is used here; other scalers are listed at
# https://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
#
# fit_transform below is exactly the same as
# X = (X - X.min()) / (X.max() - X.min())
scaler = MinMaxScaler()
X = scaler.fit_transform(df[['lat', 'long', 'SqFtLot']])

# Experiment with n_neighbors and with the distance metric.
# For minkowski, p=2 is euclidean distance and p=1 is manhattan distance.
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
# The model is scored on the very rows it was fit on, so n_neighbors=1
# would be cheating in this case -- no cheating!
regress(X, y, X, y, n_neighbors=2, metric='minkowski', p=2)

In [None]:
from sklearn.model_selection import train_test_split

# So far:  train(X), predict(X)
# Now:     X -> X_train, X_test; train(X_train), predict(X_test)
#
# Training set: what the model is built on (typically 80 - 90% of the data).
# Testing set:  what we evaluate on (typically 10 - 20% of the data).
#
# What happens when you change the test_size to 0.5 or 0.8 or 0.01?

X = df[['lat', 'long', 'SqFtLot']]

# NOTE(review): the scaler is fit on all rows before the split, so the test
# rows also influence the scaling range. Kept as-is here; fit on X_train only
# if a strictly leak-free evaluation is wanted.
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One row per (p, k) combination with the resulting mean absolute error.
z = {'p': [], 'k': [], 'mae': []}

for p in range(1, 10):
    for k in range(1, 10):
        mae = regress(X_train, y_train, X_test, y_test, n_neighbors=k, metric='minkowski', p=p)
        z['p'].append(p)
        z['k'].append(k)
        z['mae'].append(mae)
        print(mae)

pd.DataFrame(z)

In [None]:
# Let's get more clever

from sklearn.model_selection import StratifiedKFold

# n_splits=2 -> two folds, each roughly 50% of the data
# n_splits=3 -> three folds, each roughly 33% of the data

# For every fold count from 2 to 10, record the MAE of each individual fold,
# then plot the best and worst fold error per fold count.
# NOTE(review): StratifiedKFold stratifies on y, which is designed for class
# labels; for a price target plain KFold is the usual choice -- confirm intent.
error_data = []
for splits in range(2, 11):
    skf = StratifiedKFold(n_splits=splits)

    for train_index, test_index in skf.split(X, y):
        print('.', end='')
        X_train, X_test = X[train_index], X[test_index]
        # split() yields positional indices, so select from the Series by
        # position rather than by index label.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        error_data.append({
            'k_folds': splits,
            'MAE': regress(X_train, y_train, X_test, y_test, n_neighbors=6, metric='minkowski', p=2),
        })

# String aggregation names avoid the pandas deprecation of passing NumPy
# callables such as np.min / np.max to agg.
pd.DataFrame(error_data).groupby('k_folds').agg({'MAE': ['min', 'max']}).plot()