In [1]:
import pandas as pd
import numpy as np

In [2]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 
              'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 
              'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float,
              'floors':float, 'condition':int, 'lat':float, 'date':str, 
              'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int
             }

In [3]:
def get_data(data, features, target):
    out_df = pd.DataFrame(data[features])
    out_df['constant'] = 1
    
    return out_df[['constant'] + features].values, np.array(data[target])

In [4]:
def normalise(features):
    
    norms = np.linalg.norm(features, axis=0)
    normalised_features = features / norms
    
    return normalised_features, norms

In [5]:
df_sales = pd.read_csv('kc_house_data_small.csv', dtype=dtype_dict)
df_train = pd.read_csv('kc_house_data_small_train.csv', dtype=dtype_dict)
df_test = pd.read_csv('kc_house_data_small_test.csv', dtype=dtype_dict)
df_valid = pd.read_csv('kc_house_data_validation.csv', dtype=dtype_dict)

In [6]:
feature_list = ['bedrooms',
                'bathrooms',
                'sqft_living',
                'sqft_lot',
                'floors',
                'waterfront',
                'view',
                'condition',
                'grade',
                'sqft_above',
                'sqft_basement',
                'yr_built',
                'yr_renovated',
                'lat',
                'long',
                'sqft_living15',
                'sqft_lot15',
               ]

In [7]:
features_train, target_train = get_data(df_train, feature_list, 'price')
features_test, target_test = get_data(df_test, feature_list, 'price')
features_valid, target_valid = get_data(df_valid, feature_list, 'price')

features_train, norms = normalise(features_train)

features_test = features_test / norms
features_valid = features_valid / norms

In [8]:
print(features_test[0])
print(features_train[9])

[ 0.01345102  0.01551285  0.01807473  0.01759212  0.00160518  0.017059
  0.          0.05102365  0.0116321   0.01564352  0.01362084  0.02481682
  0.01350306  0.          0.01345387 -0.01346922  0.01375926  0.0016225 ]
[ 0.01345102  0.01163464  0.00602491  0.0083488   0.00050756  0.01279425
  0.          0.          0.01938684  0.01390535  0.0096309   0.
  0.01302544  0.          0.01346821 -0.01346251  0.01195898  0.00156612]


In [9]:
def euclidean_dist(x_j, x_q):
    return np.sqrt(np.sum((x_j-x_q)**2))

<hr>
**Quiz Question: What is the Euclidean distance between the query house and the 10th house of the training set?**

In [10]:
euclidean_dist(features_train[9], features_test[0])

0.05972359371398078

<hr>
**Quiz Question: Among the first 10 training houses, which house is the closest to the query house?**

In [11]:
distances = {}

for i, j in enumerate(features_train[0:10]):
    distances[i] = euclidean_dist(j, features_test[0])

print("""
The closest house is house {}
""".format(min(distances, key=distances.get)+1))


The closest house is house 9



<hr>

In [12]:
def euclidean_dist_vector(x_j, x_q):
    return np.sqrt(np.sum((x_j-x_q)**2, axis=1))

<hr>
**Quiz Question: Take the query house to be third house of the test set (features_test[2]). What is the index of the house in the training set that is closest to this query house?**

In [13]:
np.argmin(euclidean_dist_vector(features_train, features_test[2]))

382

**Quiz Question: What is the predicted value of the query house based on 1-nearest neighbor regression?**

In [14]:
print("""
The expected value of the query house is: {}""".format(target_train[382]))


The expected value of the query house is: 249000.0


<hr>

In [15]:
def k_nearest_neighbors(k, feature_train, query):
    return np.argsort(euclidean_dist_vector(features_train, query))[:k]

<hr>
**Quiz Question: Take the query house to be third house of the test set (features_test[2]). What are the indices of the 4 training houses closest to the query house?**

In [16]:
k_nearest_neighbors(4, features_train, features_test[2])

array([ 382, 1149, 4087, 3142])

<hr>

In [17]:
def predict_one(k, feature_train, target, query):
    neighbours = k_nearest_neighbors(k, feature_train, query)
    
    return np.mean(target[neighbours])

<hr>
**Quiz Question: Again taking the query house to be third house of the test set (features_test[2]), predict the value of the query house using k-nearest neighbors with k=4 and the simple averaging method described and implemented above.**

In [18]:
predict_one(4, features_test, target_train, features_test[2])

413987.5

<hr>

In [19]:
def predict(k, feature_train, target, features_query):
    predictions = np.zeros(len(features_query))
    
    for i in range(len(features_query)):
        predictions[i] = predict_one(k, feature_train, target, features_query[i])
        
    return predictions

<hr>
**Quiz Question: Make predictions for the first 10 houses in the test set, using k=10. What is the index of the house in this query set that has the lowest predicted value? What is the predicted value of this house?**

In [20]:
predictions_10 = predict(10, features_train, target_train, features_test[:10])

print("""
The minimum predict value occurs at index {}
with a predicted value of {}
""".format(np.argmin(predictions_10),
           target_train[np.argmin(predictions_10)]
          )
     )


The minimum predict value occurs at index 6
with a predicted value of 229500.0



<hr>

In [21]:
rss = lambda y, yhat: np.dot((y.T-yhat).T, (y.T-yhat))

In [22]:
calculated_rss = {}

for k in range(1,16):
    predictions_valid = predict(k, features_train, target_train, features_valid)
    
    calculated_rss[k] = rss(target_valid, predictions_valid)

print("""
The minimum RSS: {}
occurs with a k of: {}
""".format(min(calculated_rss.values()),
      min(calculated_rss, key=calculated_rss.get)
         )
     )    


The minimum RSS: 67361678735491.5
occurs with a k of: 8



<hr>
**Quiz Question: What is the RSS on the TEST data using the value of k found above? To be clear, sum over all houses in the TEST set.**

In [23]:
predictions = predict(8, features_train, target_train, features_test)

rss(target_test, predictions)

133118823551516.81