In [8]:
# Column name -> Python type used when parsing the house-sales CSV files.
# Identifier-like columns (id, zipcode, date) are read as strings.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': float,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

In [9]:
import pandas as pd
import numpy as np

In [58]:
# Load the full small data set plus its train / validation / test splits,
# parsing every file with the same column dtypes.
df, training, validation, testing = (
    pd.read_csv(csv_name, dtype=dtype_dict)
    for csv_name in (
        'kc_house_data_small.csv',
        'kc_house_data_small_train.csv',
        'kc_house_data_validation.csv',
        'kc_house_data_small_test.csv',
    )
)

In [59]:
def get_numpy_data(df, features, output):
    """Split a DataFrame into a float feature matrix and an output array.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    features : list-like of column labels
        Columns to use as features; they are cast to float.
    output : column label
        Column holding the target values.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The feature matrix (one row per row of ``df``, one column per
        feature) and the output values.
    """
    feature_frame = df[features].astype(float)
    feature_matrix = np.array(feature_frame)
    output_values = np.array(df[output])
    return feature_matrix, output_values

In [60]:
def normalize_features(feature_matrix):
    """Scale every column of ``feature_matrix`` to unit Euclidean (L2) norm.

    Parameters
    ----------
    feature_matrix : array-like
        Feature matrix with one column per feature.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The column-normalized matrix and the per-column divisors that were
        applied. The divisors are the column L2 norms, except that a norm of
        exactly 0 (an all-zero column) is replaced by 1.0. That keeps the
        column at 0 instead of turning it into NaN, and lets the returned
        array be reused safely as a divisor for other data splits.
    """
    norms = np.linalg.norm(feature_matrix, axis=0)
    zero_norm = norms == 0
    if np.any(zero_norm):
        # 0 / 0 would yield NaN (and x / 0 inf for other splits); dividing an
        # all-zero column by 1 leaves it unchanged.
        norms = np.where(zero_norm, 1.0, norms)
    return feature_matrix / norms, norms

In [61]:
# Use every column from position 3 onward as a feature and 'price' as the target.
# `DataFrame.ix` was removed in pandas 1.0; slicing `.columns` performs the same
# positional selection that `.ix[:, 3:].columns` did.
# NOTE(review): assumes the first three columns of each file are non-feature
# columns (presumably id, date, price) - confirm against the CSV header.
features_train, output_train = get_numpy_data(training, training.columns[3:], 'price')
features_validation, output_validation = get_numpy_data(validation, validation.columns[3:], 'price')
features_test, output_test = get_numpy_data(testing, testing.columns[3:], 'price')

In [62]:
# Normalize the training features, then scale the other two splits by the
# *training* norms so that all three share the same feature scaling.
# NOTE: this cell rebinds features_train / features_validation / features_test
# to their scaled versions, so running it a second time rescales data that is
# already normalized; re-run the feature-extraction cell first.
features_train, norms = normalize_features(features_train)
features_validation = np.divide(features_validation, norms)
features_test = np.divide(features_test, norms)

In [63]:
features_test[0]

array([ 0.01551285,  0.01807473,  0.01759212,  0.00160518,  0.017059  ,
        0.        ,  0.05102365,  0.0116321 ,  0.01564352,  0.01362084,
        0.02481682,  0.01350306,  0.        ,  0.01345762,  0.01345387,
       -0.01346922,  0.01375926,  0.0016225 ])

In [68]:
features_train[9]

array([ 0.01163464,  0.00602491,  0.0083488 ,  0.00050756,  0.01279425,
        0.        ,  0.        ,  0.01938684,  0.01390535,  0.0096309 ,
        0.        ,  0.01302544,  0.        ,  0.0134557 ,  0.01346821,
       -0.01346251,  0.01195898,  0.00156612])

In [75]:
# Euclidean distance between the first test house and the 10th training house (index 9).
np.sqrt(np.square(features_test[0] - features_train[9]).sum())

0.059723593744844518

In [77]:
# Distance from the query house (first test house) to each of the first 10 training houses.
for i in range(10):
    print('The Euclidean distance between the query house and the {} house is {}'.format(
        i,
        np.sqrt(np.square(features_test[0] - features_train[i]).sum()),
    ))

The Euclidean distance between the query house and the 0 house is 0.06027470958485634
The Euclidean distance between the query house and the 1 house is 0.0854688114765475
The Euclidean distance between the query house and the 2 house is 0.061499465821446185
The Euclidean distance between the query house and the 3 house is 0.05340273981055426
The Euclidean distance between the query house and the 4 house is 0.05844484103681425
The Euclidean distance between the query house and the 5 house is 0.0598792159123212
The Euclidean distance between the query house and the 6 house is 0.05463140503661286
The Euclidean distance between the query house and the 7 house is 0.055431085638732065
The Euclidean distance between the query house and the 8 house is 0.05238362784901739
The Euclidean distance between the query house and the 9 house is 0.05972359374484452


In [80]:
# Broadcasting check: subtracting one query row from a block of training rows
# keeps one row per training house. Show only the shape, not the whole array.
(features_train[0:3] - features_test[0]).shape

(3, 18)

In [81]:
# Row-wise difference between every training house and the first test house (via broadcasting).
diff = features_train - features_test[0]

In [84]:
diff.shape

(5527, 18)

In [87]:
np.sum(diff**2, axis=1)[15]

0.0033070591154297987

In [88]:
np.sum(diff[15]**2)

0.0033070591154297992

In [94]:
# Euclidean distance per training house: square the differences, sum across features, take the root.
distances = np.sqrt(np.square(diff).sum(axis=1))

In [95]:
distances[100]

0.023708238128855204

In [96]:
def compute_distances(features_instances, features_query):
    """Euclidean distance from each row of ``features_instances`` to one query.

    Parameters
    ----------
    features_instances : numpy.ndarray
        2-D array with one instance per row.
    features_query : numpy.ndarray
        Query feature vector, broadcast against every row.

    Returns
    -------
    numpy.ndarray
        1-D array with one distance per row of ``features_instances``.
    """
    deltas = features_instances - features_query
    squared_lengths = np.sum(np.square(deltas), axis=1)
    return np.sqrt(squared_lengths)

In [97]:
# Distances from every training house to the third test house (index 2);
# the nearest neighbour is looked up from this array in the next cell.
distances = compute_distances(features_train, features_test[2])

In [99]:
np.argmin(distances)

382

In [155]:
output_train[382]

249000.0

In [104]:
def k_nearest_neighbors(k, feature_train, features_query):
    """Order training rows so that the k nearest to the query come first.

    Parameters
    ----------
    k : int
        Number of neighbours of interest.
    feature_train : numpy.ndarray
        2-D array of training features, one row per training instance.
    features_query : numpy.ndarray
        Feature vector of the query point.

    Returns
    -------
    numpy.ndarray
        Array of ALL training-row indices (same length as ``feature_train``).
        Its first ``k`` entries are the k nearest rows, sorted from nearest to
        farthest; the order of the remaining entries is unspecified. Callers
        take ``result[:k]`` to obtain the neighbours.

    Notes
    -----
    ``np.argpartition(distances, k)`` raises ``ValueError`` when ``k`` equals
    the number of rows, so asking for every training row as a neighbour is
    handled with a full sort instead.
    """
    distances = np.sqrt(np.sum((feature_train - features_query)**2, axis=1))
    n_train = distances.shape[0]
    if k >= n_train:
        # Every row is a neighbour; a plain sort gives nearest-first order.
        return np.argsort(distances, kind='stable')
    order = np.argpartition(distances, k)
    nearest = order[:k]
    # argpartition leaves the leading k indices unordered; sort just those.
    nearest = nearest[np.argsort(distances[nearest], kind='stable')]
    return np.concatenate((nearest, order[k:]))

In [111]:
# Keep only the first 4 entries of the returned index array: the 4 training
# houses closest to the third test house (index 2).
idx = k_nearest_neighbors(4, features_train, features_test[2])[:4]

In [112]:
idx

array([ 382, 1149, 4087, 3142], dtype=int64)

In [125]:
def predict_output_of_query(k, features_train, output_train, features_query):
    """Predict the output for one query as the mean of its k nearest neighbours.

    Parameters
    ----------
    k : int
        Number of neighbours to average; must satisfy
        ``1 <= k <= len(features_train)``.
    features_train : numpy.ndarray
        2-D array of training features, one row per training instance.
    output_train : array-like
        Training outputs aligned with the rows of ``features_train``.
    features_query : numpy.ndarray
        Feature vector of the query point.

    Returns
    -------
    float
        Average training output of the k nearest training rows
        (Euclidean distance).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training rows.
    """
    distances = np.sqrt(np.sum((features_train - features_query)**2, axis=1))
    n_train = distances.shape[0]
    if k < 1 or k > n_train:
        raise ValueError(
            'k must be between 1 and the number of training rows '
            '({}), got {}'.format(n_train, k))
    if k == n_train:
        # argpartition rejects kth == n; every row is a neighbour here anyway.
        idx = np.arange(n_train)
    else:
        idx = np.argpartition(distances, k)[:k]
    prediction = np.sum(np.asarray(output_train)[idx]) / k
    return prediction

In [126]:
# 4-nearest-neighbour price prediction for the third test house (index 2):
# the average training price of its 4 closest training houses.
prediction = predict_output_of_query(4, features_train, output_train, features_test[2])

In [127]:
prediction

413987.5

In [128]:
output_test[2]

438000.0

In [129]:
def predict_output(k, features_train, output_train, features_query):
    """Predict outputs for several queries with k-nearest-neighbour averaging.

    Parameters
    ----------
    k : int
        Number of neighbours to average; must satisfy
        ``1 <= k <= len(features_train)``.
    features_train : numpy.ndarray
        2-D array of training features, one row per training instance.
    output_train : array-like
        Training outputs aligned with the rows of ``features_train``.
    features_query : iterable of feature vectors
        One query per element (e.g. the rows of a 2-D array).

    Returns
    -------
    list
        One prediction per query, in query order; each is the mean training
        output of that query's k nearest training rows (Euclidean distance).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training rows.
    """
    features_train = np.asarray(features_train)
    output_train = np.asarray(output_train)
    n_train = features_train.shape[0]
    # Validate once up front instead of failing inside the loop.
    if k < 1 or k > n_train:
        raise ValueError(
            'k must be between 1 and the number of training rows '
            '({}), got {}'.format(n_train, k))

    predictions = []
    for query in features_query:
        distances = np.sqrt(np.sum((features_train - query)**2, axis=1))
        if k == n_train:
            # argpartition rejects kth == n; every row is a neighbour here anyway.
            idx = np.arange(n_train)
        else:
            idx = np.argpartition(distances, k)[:k]
        predictions.append(np.sum(output_train[idx]) / k)
    return predictions

In [131]:
# 10-nearest-neighbour predictions for the first 10 test houses.
predictions = predict_output(10, features_train, output_train, features_test[:10])

In [133]:
np.argmin(predictions)

6

In [134]:
predictions[np.argmin(predictions)]

350032.0

In [143]:
# Residual sum of squares on the validation set for k = 1..15.
# rss[j] holds the RSS for k = j + 1.
rss = []
for k in range(1, 16):
    predictions = predict_output(k, features_train, output_train, features_validation)
    rss.append(np.square(np.subtract(predictions, output_validation)).sum())

In [145]:
# rss[0] corresponds to k = 1, so shift the position of the smallest RSS by one to get k.
best_k = np.argmin(rss) + 1

In [147]:
best_k

8

In [156]:
# Residual sum of squares on the test set, using the k selected on the validation set.
np.sum(np.square(predict_output(best_k, features_train, output_train, features_test) - output_test))

133097270973391.81