# W6_Nearest Neighbors & Kernel Regression

In [1]:
import numpy as np
import pandas as pd

In [2]:
# load the full data set and its train / validation / test splits,
# forcing an explicit dtype for every column that is read
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

data = pd.read_csv('kc_house_data_small.csv', dtype=dtype_dict)
train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
validation = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)

# drop the columns that are not used as features
unused_columns = ['zipcode', 'date', 'id']
for frame in (data, train, validation, test):
    frame.drop(columns=unused_columns, inplace=True)

data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3.0,1.0,1180.0,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340.0,5650.0
1,538000.0,3.0,2.25,2570.0,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690.0,7639.0
2,180000.0,2.0,1.0,770.0,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720.0,8062.0
3,604000.0,4.0,3.0,1960.0,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360.0,5000.0
4,510000.0,3.0,2.0,1680.0,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800.0,7503.0


In [3]:
# pull the feature matrix and the output vector out of a dataframe
def get_numpy_data(dataframe, features, output):
    """Return ``(features_matrix, output_array)`` extracted from ``dataframe``.

    ``features`` selects the columns whose values form the feature matrix;
    ``output`` names the single column whose values form the output array.
    """
    return dataframe[features].values, dataframe[output].values

In [4]:
# normalize the features
def feature_normalization(X):
    """Scale every column of ``X`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    X : 2-D array
        Rows are observations, columns are features.

    Returns
    -------
    (X_normalized, norms) : tuple
        ``X_normalized`` is ``X`` divided column-wise by ``norms``.
        ``norms`` holds the per-column 2-norms, except that a norm of 0
        (an all-zero column) is reported as 1.0. That keeps the column at 0
        instead of turning it into NaN, and makes it safe to reuse ``norms``
        as a divisor for other data (e.g. validation / test features).
    """
    norms = np.linalg.norm(X, axis=0)
    # dividing by a zero norm would yield NaN (0/0) and poison every
    # distance computed later, so substitute a neutral divisor of 1.0
    norms = np.where(norms == 0, 1.0, norms)
    X_normalized = X / norms
    return (X_normalized, norms)

In [5]:
# the target column, and every column after the first as a feature
output = 'price'
features = data.columns[1:].tolist()

# feature matrix / output vector for each split
train_X, train_y = get_numpy_data(train, features, output)
val_X, val_y = get_numpy_data(validation, features, output)
test_X, test_y = get_numpy_data(test, features, output)

# normalize with the training-set column norms and apply those same
# norms to the validation and test features
train_X, norms = feature_normalization(train_X)
val_X, test_X = val_X / norms, test_X / norms

In [6]:
# Euclidean distance
def compute_one_distance(point_a, point_b):
    """Return the Euclidean distance between ``point_a`` and ``point_b``."""
    difference = point_a - point_b
    return np.sqrt(np.sum(np.square(difference)))

# example: print the query point, one training point, and the distance between them
print(test_X[0]) # query point
print(train_X[9]) # training point it is compared against
print(compute_one_distance(test_X[0], train_X[9]))

[ 0.01551285  0.01807473  0.01759212  0.00160518  0.017059    0.
  0.05102365  0.0116321   0.01564352  0.01362084  0.02481682  0.01350306
  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01163464  0.00602491  0.0083488   0.00050756  0.01279425  0.          0.
  0.01938684  0.01390535  0.0096309   0.          0.01302544  0.
  0.01346821 -0.01346251  0.01195898  0.00156612]
0.059723593714


In [7]:
# distance from the query house to each of the first 10 houses in the training set,
# computed one pair at a time (this is an inefficient way)
query = test_X[0]
distance_list = [compute_one_distance(query, train_X[i]) for i in range(10)]

# index of minimum distance
np.argmin(distance_list)

8

In [8]:
# vectorized distance computation: one query against every training row
def compute_distances(train_matrix, query):
    """Return the Euclidean distance from ``query`` to each row of ``train_matrix``."""
    differences = query - train_matrix
    squared_sums = np.sum(np.square(differences), axis=1)
    return np.sqrt(squared_sums)

In [9]:
# 1-nearest neighbor regression: position of the training row closest to the query
query = test_X[2]
index = compute_distances(train_X, query).argmin()
index

382

In [10]:
# prediction: the output value of the nearest training row found above
train_y[index]

249000.0

In [11]:
# k-nearest neighbor regression
def KNN(k, train_matrix, query):
    """Return the indices of the ``k`` rows of ``train_matrix`` closest to ``query``.

    Parameters
    ----------
    k : int
        Number of neighbors to return; must be at least 1. If ``k`` exceeds
        the number of rows, every row index is returned.
    train_matrix : 2-D array
        One row per training point.
    query : 1-D array
        The point whose neighbors are wanted.

    Returns
    -------
    numpy.ndarray
        Row indices ordered from nearest to farthest (Euclidean distance).
        Rows at exactly the same distance are ordered by increasing row index.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # k == 0 would slice to an empty result and a negative k would
        # silently return "all but the last |k|" rows
        raise ValueError("k must be a positive integer, got %r" % (k,))
    distances = np.sqrt(np.sum((query - train_matrix) ** 2, axis=1))
    # a stable sort keeps tied distances in row order, so the result is
    # reproducible and, for k=1, agrees with np.argmin (first minimum)
    neighbors = np.argsort(distances, kind='stable')[:k]
    return neighbors

In [12]:
# example: indices of the 4 training rows nearest to the current query
KNN(4, train_X, query)

array([ 382, 1149, 4087, 3142], dtype=int64)

In [13]:
def compute_one_prediction(k, train_matrix, train_output, query):
    """Predict the output for ``query`` as the mean of its k nearest neighbors.

    The neighbors are the row indices returned by ``KNN(k, train_matrix, query)``;
    the prediction is the average of ``train_output`` at those indices.
    """
    nearest = KNN(k, train_matrix, query)
    neighbor_outputs = train_output[nearest]
    return np.average(neighbor_outputs)

In [14]:
# example: 4-nearest-neighbor prediction for the current query
compute_one_prediction(4, train_X, train_y, query)

413987.5

In [15]:
def compute_predictions(k, train_matrix, train_output, queries):
    """Return an array holding one k-nearest-neighbor prediction per row of ``queries``.

    Each row is passed to ``compute_one_prediction`` together with ``k``,
    ``train_matrix`` and ``train_output``; results keep the order of the rows.
    """
    return np.array([
        compute_one_prediction(k, train_matrix, train_output, queries[row])
        for row in range(queries.shape[0])
    ])

In [16]:
# example: 10-nearest-neighbor predictions for the first 10 rows of the test set
queries = test_X[:10]
result = compute_predictions(10, train_X, train_y, queries)
print(result)
# smallest prediction and its position among the 10 queries
print(min(result))
print(np.argmin(result))

[ 881300.   431860.   460595.   430200.   766750.   667420.   350032.
  512800.7  484000.   457235. ]
350032.0
6


In [17]:
# choose the best value of k using the validation set:
# keep the k in 1..15 whose predictions give the smallest residual sum of squares
queries = val_X
min_RSS = float('inf')
best_k = None
for k in range(1, 16):
    predictions = compute_predictions(k, train_X, train_y, queries)
    # np.sum reduces the array in compiled code; the builtin sum() would
    # iterate over the ndarray element by element in Python
    # (the result may differ from the builtin in the last few bits)
    RSS = np.sum((predictions - val_y) ** 2)
    if RSS < min_RSS:
        min_RSS = RSS
        best_k = k
print(min_RSS, best_k)

6.73616787355e+13 8


In [18]:
# compute RSS on test data using the k selected on the validation set
queries = test_X
predictions = compute_predictions(best_k, train_X, train_y, queries)
# np.sum instead of the builtin sum(): a single vectorized reduction rather
# than a Python-level loop over the ndarray (last few bits may differ)
np.sum((predictions - test_y) ** 2)

133118823551516.81