In [1]:
import pandas as pd
import numpy as np

In [3]:
sales = pd.read_csv('kc_house_data_small.csv')
train = pd.read_csv('kc_house_data_small_train.csv')
validation = pd.read_csv('kc_house_data_validation.csv')
test = pd.read_csv('kc_house_data_small_test.csv')

In [4]:
def get_numpy_data(df, features, output):
    df['constant'] = 1 
    features = ['constant'] + features
    features_df = df[features]
    
    feature_matrix = features_df.to_numpy()
    output_array = df['price'].to_numpy()

    return feature_matrix, output_array


def normalize_features(features):
    """
    normalize columns of a given feature matrix.
    :param features: columns of a feature matrix
    :return normalized features: normalized columns of given feature matrix
    :return norms: norms of orignal features
    """
    
    norms = np.linalg.norm(features, axis=0)
    normalized_features = features / norms
    
    return normalized_features, norms

In [5]:
# get numpy arrays
feature_list = [
    'bedrooms',  
    'bathrooms',  
    'sqft_living',  
    'sqft_lot',  
    'floors',
    'waterfront',  
    'view',  
    'condition',  
    'grade',  
    'sqft_above',  
    'sqft_basement',
    'yr_built',  
    'yr_renovated',  
    'lat',  
    'long',  
    'sqft_living15',  
    'sqft_lot15'
]

features_train, output_train = get_numpy_data(train, feature_list, 'price')
features_test, output_test = get_numpy_data(test, feature_list, 'price')
features_valid, output_valid = get_numpy_data(validation, feature_list, 'price')

In [6]:
# normalize features
features_train, norms = normalize_features(features_train)
features_test = features_test / norms
features_valid = features_valid / norms

#### Compute a single distance

To start, let's just explore computing the “distance” between two given houses. We will take our query house to be the first house of the test set and look at the distance between this house and the 10th house of the training set.

To see the features associated with the query house, print the first row (index 0) of the test feature matrix. You should get an 18-dimensional vector whose components are between 0 and 1. Similarly, print the 10th row (index 9) of the training feature matrix.

In [9]:
print("Query house: \n", features_test[0])
print("House in training set: \n", features_train[9])

Query house: 
 [ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
House in training set: 
 [ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [10]:
# Euclidean Distance
np.sqrt(np.sum((features_test[0] - features_train[9])**2))

0.05972359371398078

To visualize this nearest-neighbor search, let's first compute the distance from our query house (features_test[0]) to the first 10 houses of the training set (features_train[0:10]) and then search for the nearest neighbor within this small set of houses.  Through restricting ourselves to a small set of houses to begin with, we can visually scan the list of 10 distances to verify that our code for finding the nearest neighbor is working.

Write a loop to compute the Euclidean distance from the query house to each of the first 10 houses in the training set.

In [11]:
for i in range(10):
    print(f"Distance to house {i}", np.sqrt(np.sum((features_test[0] - features_train[i])**2)))

Distance to house 0 0.06027470916295592
Distance to house 1 0.08546881147643746
Distance to house 2 0.06149946435279315
Distance to house 3 0.05340273979294363
Distance to house 4 0.05844484060170442
Distance to house 5 0.059879215098128345
Distance to house 6 0.05463140496775461
Distance to house 7 0.055431083236146074
Distance to house 8 0.052383627840220305
Distance to house 9 0.05972359371398078


#### Perform 1-nearest neighbor regression
Write a single-line expression to define a variable ‘diff’ such that ‘diff[i]’ gives the element-wise difference between the features of the query house and the i-th training house.(Using vectorized numpy functions).

The next step in computing the Euclidean distances is to take these feature-by-feature differences in ‘diff’, square each, and take the sum over feature indices.  That is, compute the sum of squared feature differences for each training house (row in ‘diff’).

By default, ‘np.sum’ sums up everything in the matrix and returns a single number. To instead sum only over a row or column, we need to specifiy the ‘axis’ parameter described in the np.sum documentation. In particular, ‘axis=1’ computes the sum across each row.

In [82]:
def compute_distances(features_instances, features_query):
    diff = features_instances - features_query
    distances = np.sqrt(np.sum(diff**2, axis=1))
    return distances

In [80]:
compute_distances(features_train[4], features_test[2])
# np.sum((features_train[4] - features_test[2])**1, axis=1)
# features_train[4].reshape(1,-1).shape

(1, 18)

In [71]:
# Take the query house to be third house of the test set (features_test[2]).  
# What is the index of the house in the training set that is closest to this query house?
quiz_distance = compute_distances(features_train, features_test[2])
print("Closest neighbor: ", np.where(quiz_distance == min(quiz_distance)))
# What is the predicted value of the query house based on 1-nearest neighbor regression?
output_train[382]

Closest neighbor:  (array([382]),)


249000

#### Perform k-nearest neighbor regression
Using the functions above, implement a function that takes in:
- the value of k;
- the feature matrix for the instances; and
- the feature of the query
- and returns the indices of the k closest training houses. For instance, with 2-nearest neighbor, a return value of [5, 10] would indicate that the 6th and 11th training houses are closest to the query house.

In [122]:
def k_nearest_neighbors(k, feature_train, features_query):
    # initialize
    dist2kNN = np.sort(compute_distances(feature_train[:k], features_query))
    neighbors = np.argsort(dist2kNN)
    
    for i in range(k, len(feature_train)):
        new_distance = compute_distances(feature_train[i].reshape(1,-1), features_query)
        if new_distance < dist2kNN[k-1]:
            # find j such that δ > dist2kNN[j-1] but δ < dist2kNN[j]
            j = max(np.max(np.where(dist2kNN < new_distance), initial=0), np.min(np.where(dist2kNN > new_distance)))
            # remove furthest house and shift queue
            neighbors[j+1:k] = neighbors[j:k-1]
            dist2kNN[j+1:k] = dist2kNN[j:k-1]
            # set the new distance at j
            dist2kNN[j] = new_distance
            neighbors[j] = i
    
    return neighbors

In [123]:
# Take the query house to be third house of the test set (features_test[2]).  
# What are the indices of the 4 training houses closest to the query house?
k_nearest_neighbors(4, features_train, features_test[2])

array([ 382, 1149, 4087, 3142])

Write a function that predicts the value of a given query house. For simplicity, take the average of the prices of the k nearest neighbors in the training set. The function should have the following parameters:
- the value of k;
- the feature matrix for the instances;
- the output values (prices) of the instances; and
- the feature of the query, whose price we’re predicting.
- The function should return a predicted value of the query house.



In [126]:
def predict_output_of_query(k, features_train, output_train, features_query):
    neighbors = k_nearest_neighbors(k, features_train, features_query)
    prediction = 1/k * np.sum(output_train[neighbors])
    return prediction

In [127]:
# Again taking the query house to be third house of the test set (features_test[2]), 
# predict the value of the query house using k-nearest neighbors with k=4 
# and the simple averaging method described and implemented above.
predict_output_of_query(4, features_train, output_train, features_test[2])

413987.5

Finally, write a function to predict the value of each and every house in a query set. (The query set can be any subset of the dataset, be it the test set or validation set.) The idea is to have a loop where we take each house in the query set as the query house and make a prediction for that specific house. The new function should take the following parameters:
- the value of k;
- the feature matrix for the training set;
- the output values (prices) of the training houses; and
- the feature matrix for the query set.
- The function should return a set of predicted values, one for each house in the query set.

In [132]:
def predict_output(k, features_train, output_train, features_query):
    predictions = np.array([predict_output_of_query(k, features_train, output_train, i) for i in features_query])
    return predictions

In [133]:
# Make predictions for the first 10 houses in the test set, using k=10. 
# What is the index of the house in this query set that has the lowest predicted value?  
# What is the predicted value of this house?
predict_output(10, features_train, output_train, features_test[:10])

array([881300. , 431860. , 460595. , 430200. , 766750. , 667420. ,
       350032. , 512800.7, 484000. , 457235. ])

#### Choosing the best value of k using a validation set

There remains a question of choosing the value of k to use in making predictions. Here, we use a validation set to choose this value. Write a loop that does the following:

For k in [1, 2, … 15]:
- Make predictions for the VALIDATION data using the k-nearest neighbors from the TRAINING data.
- Compute the RSS on VALIDATION data
- Report which k produced the lowest RSS on validation data.


In [137]:
import math
smallest_rss = math.inf

for k in range(1, 16):
    print(k)
    predictions_valid = predict_output(k, features_train, output_train, features_valid)
    rss = sum((output_valid-predictions_valid)**2)
    if rss < smallest_rss:
        smallest_rss = rss
        best_k = k

print(best_k)

    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
8


In [138]:
# What is the RSS on the TEST data using the value of k found above?  
# To be clear, sum over all houses in the TEST set.
predictions_test = predict_output(8, features_train, output_train, features_test)
rss = sum((output_test-predictions_test)**2)

133154528199016.81


In [139]:
"{:e}".format(rss)

'1.331545e+14'