In [292]:
import pandas as pd
import numpy as np
import math
import sklearn

In [293]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}

In [294]:
train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)

In [295]:
validation = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)

In [296]:
test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)

In [297]:
np.array(train.columns)

array(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype=object)

In [298]:
def get_numpy_data(data_frame, features, output):
    selected_data_frame = data_frame[features]
    output_array = data_frame[output].to_numpy()
    np_selected_data_frame = selected_data_frame.to_numpy()
    total_row = np_selected_data_frame.shape[0]
    np_ones = np.ones(total_row, dtype=int).reshape(total_row, 1)
    
    features_array = np.append(np_ones, np_selected_data_frame, axis=1)
    
    return (features_array, output_array)

In [299]:
def normalize_features(features):
    norms = np.linalg.norm(features, axis=0)
    return (features/norms, norms)

In [300]:
features = ['bedrooms',  
                'bathrooms',  
                'sqft_living',  
                'sqft_lot',  
                'floors',
                'waterfront',  
                'view',  
                'condition',  
                'grade',  
                'sqft_above',  
                'sqft_basement',
                'yr_built',  
                'yr_renovated',  
                'lat',  
                'long',  
                'sqft_living15',  
                'sqft_lot15']
output_column = ['price']

In [301]:
len(features)

17

In [302]:
train[features].head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,3.0,1.0,1180.0,5650,1.0,0,0,3,7,1180,0,1955,0,47.5112,-122.257,1340.0,5650.0
1,3.0,2.25,2570.0,7242,2.0,0,0,3,7,2170,400,1951,1991,47.721,-122.319,1690.0,7639.0
2,2.0,1.0,770.0,10000,1.0,0,0,3,6,770,0,1933,0,47.7379,-122.233,2720.0,8062.0
3,4.0,3.0,1960.0,5000,1.0,0,0,5,7,1050,910,1965,0,47.5208,-122.393,1360.0,5000.0
4,3.0,2.0,1680.0,8080,1.0,0,0,3,8,1680,0,1987,0,47.6168,-122.045,1800.0,7503.0


In [303]:
train.iloc[0]

id                    7129300520
date             20141013T000000
price                     221900
bedrooms                       3
bathrooms                      1
sqft_living                 1180
sqft_lot                    5650
floors                         1
waterfront                     0
view                           0
condition                      3
grade                          7
sqft_above                  1180
sqft_basement                  0
yr_built                    1955
yr_renovated                   0
zipcode                    98178
lat                      47.5112
long                    -122.257
sqft_living15               1340
sqft_lot15                  5650
Name: 0, dtype: object

In [304]:
features_train, train_output = get_numpy_data(train, features, output_column)



In [305]:
features_test, test_output = get_numpy_data(test, features, output_column)
features_valid, valid_output = get_numpy_data(validation, features, output_column)

In [306]:
features_train, norms = normalize_features(features_train)

features_test = features_test/norms

features_valid = features_valid/norms

### Compute a single distance

In [307]:
print (features_test[0])
print (features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [308]:
def euclidean_distance(house_1, house_2):
    return np.sqrt(np.sum((house_1 - house_2) ** 2))

In [309]:
print ("%.3f" % euclidean_distance(features_test[0], features_train[9]))

0.060


### 10. Quiz Question: Among the first 10 training houses, which house is the closest to the query house?

In [310]:
min_distance = None
nearest_house = None
id = 0
best_id = None

for house in features_train[0:10]:
    distance = euclidean_distance(features_test[0], house)
    print (id, distance)
    if (min_distance == None or distance < min_distance):
        min_distance = distance
        nearest_house = house
        best_id = id
    id += 1

print ("Best house with distance = %f, feature of this house: %s, id=%d" % 
       (min_distance, nearest_house, best_id))
    

0 0.06027470916295592
1 0.08546881147643746
2 0.06149946435279315
3 0.05340273979294363
4 0.05844484060170442
5 0.059879215098128345
6 0.05463140496775461
7 0.055431083236146074
8 0.052383627840220305
9 0.05972359371398078
Best house with distance = 0.052384, feature of this house: [ 0.01345102  0.01163464  0.01054359  0.01461041  0.00086391  0.017059
  0.          0.          0.0116321   0.01390535  0.01203862  0.01790904
  0.01306638  0.          0.01347288 -0.01346757  0.01131603  0.0014493 ], id=8


In [311]:
for i in range(3):
    print (features_train[i] - features_test[0])

[ 0.00000000e+00 -3.87821276e-03 -1.20498190e-02 -1.05552733e-02
  2.08673616e-04 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03 -5.50336860e-03 -2.48168183e-02
 -1.63756198e-04  0.00000000e+00 -1.70254220e-05  1.29876855e-05
 -5.14364795e-03  6.69281453e-04]
[ 0.00000000e+00 -3.87821276e-03 -4.51868214e-03 -2.26610387e-03
  7.19763456e-04  0.00000000e+00  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -3.47633726e-03  1.30705004e-03 -1.45830788e-02
 -1.91048898e-04  6.65082271e-02  4.23090220e-05  6.16364736e-06
 -2.89330197e-03  1.47606982e-03]
[ 0.00000000e+00 -7.75642553e-03 -1.20498190e-02 -1.30002801e-02
  1.60518166e-03 -8.52950206e-03  0.00000000e+00 -5.10236549e-02
  0.00000000e+00 -5.21450589e-03 -8.32384500e-03 -2.48168183e-02
 -3.13866046e-04  0.00000000e+00  4.70885840e-05  1.56292487e-05
  3.72914476e-03  1.64764925e-03]


In [312]:
def compute_distance(features_instances, features_query):
    diff = features_instances - features_query
    print (diff[-1].sum())
    distances = np.sqrt(np.sum(diff ** 2, axis=1))
    
    return distances

In [313]:
distances_100 = compute_distance(features_train, features_test[0])

-0.09343399874654643


distances[100] should contain 0.0237082324496

In [314]:
a = 0.0237082324496
print (abs(distances_100[abs(distances_100 - a).argmin()] - a) < 1e-6)


True


In [315]:
a = np.array([1,2])
b = np.array([[5,60], [7, 8], [9, 10]])

In [316]:
b - a

array([[ 4, 58],
       [ 6,  6],
       [ 8,  8]])

In [317]:
np.sum(b-a, axis=1)

array([62, 12, 16])

### 16. Quiz Question: Take the query house to be third house of the test set (features_test[2]).  What is the index of the house in the training set that is closest to this query house?

In [318]:
distances_3 = compute_distance(features_train, features_test[2])

0.006664529159452478


In [319]:
distances_3.argmin()

382

In [320]:
distances_3[distances_3.argmin()]

0.0028604955575117085

### 17. Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?



In [321]:
# Predicted
train_output[distances_3.argmin()]

array([249000.])

In [322]:
# Actual
test_output[distances_3.argmin()]

array([600000.])

## Perform k-nearest neighbor regression

In [323]:
np.argsort

<function numpy.argsort(a, axis=-1, kind='quicksort', order=None)>

In [324]:
def k_nearest_neighbors(k, features_train, features_query):
    diff = features_train - features_query
    distances = np.sqrt(np.sum(diff ** 2, axis=1))
    
#     print (len(features_train), len(distances))
#     print (distances)
    
    return np.argsort(distances)[0:k]

In [325]:
nearest_2_nb = k_nearest_neighbors(2, features_train, features_test[2])
nearest_2_nb

array([ 382, 1149])

In [326]:
train_output[nearest_2_nb]

array([[249000.],
       [477000.]])

## 19. Quiz Question: Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?

In [327]:
nearest_4_nb = k_nearest_neighbors(4, features_train, features_test[2])
nearest_4_nb

array([ 382, 1149, 4087, 3142])

In [328]:
def predict_output_of_query(k, features_train, output_train, features_query):
    nearest_neighbors = k_nearest_neighbors(k, features_train, features_query)
    predicted = 1.0/k * np.sum(output_train[nearest_neighbors])
    
    return predicted

## 21. Quiz Question: Again taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.



In [329]:

predict_output_of_query(4, features_train, train_output, features_test[2])

413987.5

In [330]:
# Actual
test_output[2]

array([438000.])

In [331]:
def predict_output(k, features_train, output_train, features_query):
    total_query = features_query.shape[0]
    predictions = np.zeros([total_query, 1])
    
    for i in range(total_query):
        predictions[i] = predict_output_of_query(k, features_train,
                                                 output_train, features_query[i]
                                                )
    
    return predictions

## 23. Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?



In [332]:
predicted_first_10_houses = predict_output(10, features_train, train_output, features_test[0:10])
predicted_first_10_houses

array([[881300. ],
       [431860. ],
       [460595. ],
       [430200. ],
       [766750. ],
       [667420. ],
       [350032. ],
       [512800.7],
       [484000. ],
       [457235. ]])

In [333]:
# Actual
test_output[0:10]

array([[650000.],
       [485000.],
       [438000.],
       [535000.],
       [785000.],
       [975000.],
       [287000.],
       [355000.],
       [305000.],
       [518500.]])

In [342]:
predicted_first_10_houses[predicted_first_10_houses.argmin()]

array([350032.])

In [335]:
test_output[predicted_first_10_houses.argmin()]

array([287000.])

## Choose the best value of k using a validation set

In [339]:
rss_list = np.zeros(15)

for k in range(1, 16):
    predictions = predict_output(k, features_train, train_output,
                                 features_valid
                                )
    print (predictions.shape)
    rss = np.sum((valid_output - predictions) ** 2)
    print (valid_output.shape)
    rss_list[k-1] = rss

print (rss_list)
    

(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
(1435, 1)
[1.05453830e+14 8.34450735e+13 7.26920960e+13 7.19467217e+13
 6.98465174e+13 6.88995444e+13 6.83419735e+13 6.73616787e+13
 6.83727280e+13 6.93350487e+13 6.95238552e+13 6.90499696e+13
 7.00112545e+13 7.09086989e+13 7.11069284e+13]


## Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.

In [340]:
best_k = rss_list.argmin() + 1
print (best_k)

8


In [341]:
test_predictions = predict_output(best_k, features_train, train_output,
                                  features_test
                                 )
rss_test = np.sum((test_predictions - test_output) ** 2)
rss_test

133118823551516.81