# K-Nearest Neighbors Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### The Model

In [2]:
def knn_regression(X_train, y_train, X_test, k=10):
    '''Predict a target for every row of X_test as the mean target of its k nearest rows in X_train.

    Nearness is the Euclidean distance between a query row and each training row.

    Parameters
    ----------
    X_train : array-like, shape (n_train, n_features)
        Training features.
    y_train : array-like, shape (n_train, 1) or (n_train,)
        Training targets. For a 2-D array only the first column is used.
    X_test : array-like, shape (n_test, n_features)
        Query features.
    k : int, default 10
        Number of neighbours to average. If k exceeds n_train, every training row is averaged.

    Returns
    -------
    numpy.ndarray, shape (n_test, 1)
        Predicted targets, one row per query.

    Raises
    ------
    ValueError
        If k is smaller than 1 (k = 0 would average an empty set, a negative k would slice
        the sorted neighbours from the wrong end).
    '''
    if k < 1:
        raise ValueError(f'k must be a positive integer, got {k}')

    X_train = np.asarray(X_train)
    X_test = np.asarray(X_test)
    y_train = np.asarray(y_train)
    # Accept both column-vector and flat targets; 2-D input keeps the column-0 behaviour.
    targets = y_train[:, 0] if y_train.ndim > 1 else y_train

    y_cap_test = np.zeros((X_test.shape[0], 1))

    for i, x_query in enumerate(X_test):
        # One distance per training row (broadcast subtraction, then a row-wise norm).
        distances = np.linalg.norm(X_train - x_query, axis=1)
        # Slicing past the end is harmless, so k > n_train simply keeps every training row.
        nearest = np.argsort(distances)[:k]
        y_cap_test[i, 0] = np.mean(targets[nearest])

    return y_cap_test


def score(y, y_cap):
    '''Return the coefficient of determination (R^2) of the predictions y_cap against the targets y.

    Computed as 1 - SSE / SST, where SSE is the sum of squared residuals and SST is the sum of
    squared deviations of y from its own mean.

    Parameters
    ----------
    y : array-like
        True target values, as a column vector (n, 1) or a flat array (n,).
    y_cap : array-like
        Predicted values holding the same number of elements as y.

    Returns
    -------
    float
        The R^2 value: 1.0 for perfect predictions, 0.0 for predictions no better than always
        answering mean(y). If y is constant, SST is zero and the result is not finite.

    Raises
    ------
    ValueError
        If y and y_cap hold a different number of elements.
    '''
    # Flatten both inputs first: subtracting an (n,) array from an (n, 1) array would otherwise
    # broadcast to an (n, n) matrix and silently yield a wrong score.
    y = np.asarray(y, dtype=float).ravel()
    y_cap = np.asarray(y_cap, dtype=float).ravel()
    if y.size != y_cap.size:
        raise ValueError(
            f'y and y_cap must have the same number of elements, got {y.size} and {y_cap.size}'
        )

    error = y - y_cap
    cost = np.dot(error, error)                      # sum of squared residuals
    total = ((y - y.mean()) ** 2).sum()              # total sum of squares

    return 1 - cost / total

### Housing Data

In [3]:
# Load the housing CSV; the path is relative, so it resolves against the current working directory.
df = pd.read_csv('data/LinearRegression_kc_house_data.csv')
# Preview the first five rows to check the columns and spot missing values.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long
0,7129300520,20141013T000000,221900,3,1.0,1180.0,5650,1.0,0,0,3,7,1180,0,1955.0,,98178.0,47.5112,-122.257
1,6414100192,20141209T000000,538000,3,2.25,2570.0,7242,2.0,0,0,3,7,2170,400,1951.0,1991.0,98125.0,47.721,-122.319
2,5631500400,20150225T000000,180000,2,1.0,770.0,10000,1.0,0,0,3,6,770,0,1933.0,,98028.0,47.7379,-122.233
3,2487200875,20141209T000000,604000,4,3.0,1960.0,5000,1.0,0,0,5,7,1050,910,1965.0,,98136.0,47.5208,-122.393
4,1954400510,20150218T000000,510000,3,2.0,1680.0,8080,1.0,0,0,3,8,1680,0,1987.0,,98074.0,47.6168,-122.045


#### Filling Missing Values

In [4]:
# A missing renovation year is replaced by the year the house was built.
df['yr_renovated'] = df['yr_renovated'].fillna(df['yr_built'])
# Forward fill: a missing zipcode takes the value of the nearest preceding row that has one.
# Series.ffill() is used because fillna(method='ffill') is deprecated in current pandas.
df['zipcode'] = df['zipcode'].ffill()
# A missing living area is replaced by the mean of the column.
df['sqft_living'] = df['sqft_living'].fillna(df['sqft_living'].mean())

#### Selecting Features

In [5]:
# Every column other than 'id', 'date' and 'price' is kept as a feature.
columns_to_select = [
    name for name in df.columns
    if name not in ('id', 'date', 'price')
]
columns_to_select

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long']

#### Dropping Rows with null values in any column

In [6]:
# Restrict the frame to the features plus the target, then discard every row that still
# contains a null (only 6 rows dropped).
kept_columns = columns_to_select + ['price']
df = df.loc[:, kept_columns].dropna()

# Feature matrix of shape (rows, features) and target column vector of shape (rows, 1).
X = df[columns_to_select].to_numpy()
y = df['price'].to_numpy().reshape(-1, 1)
print(X.shape, y.shape)

(21607, 16) (21607, 1)


#### Train Test Split and normalization

In [21]:
# Fixed seed so the shuffle, and every score that depends on it, is reproducible after a kernel restart.
rng = np.random.default_rng(42)

# 80% of the shuffled rows form the training set, the remaining rows the test set.
train_size = int(0.8*X.shape[0])
random_indices = rng.permutation(X.shape[0])
X_train = X[random_indices[:train_size], :]
X_test = X[random_indices[train_size:], :]
y_train = y[random_indices[:train_size], :]
y_test = y[random_indices[train_size:], :]
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Standardise with statistics taken from the training rows only. X_test is deliberately left
# unscaled here; it has to be scaled with these same `means` and `stds` wherever it is used.
means = np.mean(X_train, axis=0)
stds = np.std(X_train, axis=0)
# A constant feature has zero spread; dividing by 1 keeps it at 0 instead of producing NaN.
stds[stds == 0] = 1
X_train = (X_train - means) / stds

(17285, 16) (4322, 16) (17285, 1) (4322, 1)


#### Predicting Labels for Test Data with k = 3

In [30]:
# The test features are standardised with the training means/stds before the 3 nearest neighbours are looked up.
y_cap_test = knn_regression(X_train, y_train, (X_test - means) / stds, k=3)

#### Evaluating Test Set Performance

In [32]:
# R^2 of the predictions on the held-out test rows (1.0 would be a perfect fit).
score(y_test, y_cap_test)

0.7554671995798563

### How to choose k

Model complexity decreases as k increases: with small values of k the model tends to have high variance (it overfits), while with large values of k it tends to have high bias (it underfits). We can choose k by cross-validation or by using a single held-out validation set.

#### k-fold cross validation

In [36]:
def k_fold_cross_validation(X, y, k_list_for_knn, k=10):
    '''Rate several neighbour counts for KNN regression with k-fold cross validation.

    The rows are cut, in the order given, into k contiguous folds of near-equal size (no
    shuffling is done here). For each candidate neighbour count, every fold is predicted by
    knn_regression from the rows of the other folds and rated with score; the candidate's
    result is the mean of those k fold scores.

    Parameters
    ----------
    X : numpy.ndarray, shape (m, n_features)
        Features.
    y : numpy.ndarray, shape (m, 1)
        Targets, row-aligned with X.
    k_list_for_knn : iterable of int
        Candidate neighbour counts to evaluate.
    k : int, default 10
        Number of folds; must satisfy 2 <= k <= m.

    Returns
    -------
    list
        One mean validation score per entry of k_list_for_knn, in the same order.

    Raises
    ------
    ValueError
        If k is outside 2..m. With a single fold the training part would be empty, and with
        more folds than rows some validation folds would be empty.
    '''
    m = X.shape[0]                            # No. of examples

    if not 2 <= k <= m:
        raise ValueError(f'number of folds k must be between 2 and {m}, got {k}')

    # Fold boundaries depend only on m and k, so they are computed once and reused for every
    # candidate; fold i covers rows bounds[i] .. bounds[i + 1] - 1.
    bounds = [(m * i) // k for i in range(k + 1)]

    validation_scores = []

    for k_for_knn in k_list_for_knn:

        validation_score = 0

        for start, stop in zip(bounds[:-1], bounds[1:]):

            X_valid = X[start:stop]
            y_valid = y[start:stop]

            # Everything outside the validation fold is used for fitting.
            X_fit = np.concatenate((X[:start], X[stop:]), axis=0)
            y_fit = np.concatenate((y[:start], y[stop:]), axis=0)

            y_cap_valid = knn_regression(X_fit, y_fit, X_valid, k_for_knn)

            validation_score += score(y_valid, y_cap_valid)

        validation_scores.append(validation_score / k)

    return validation_scores

#### Running k-fold cross validation for different values of k

In [37]:
# Candidate neighbour counts 1..19 (np.arange excludes the stop value 20).
k_list = np.arange(1, 20)
# Cross-validate on the standardised training split only, with the default 10 folds; the test split is not touched.
validation_scores = k_fold_cross_validation(X_train, y_train, k_list)

In [38]:
# Neighbour count with the highest mean validation score (np.argmax returns the first one on ties).
k_list[np.argmax(validation_scores)]

7

#### Predicting Test Set Labels with best value of k determined by k-fold cross validation

In [43]:
# Predict the test rows again, this time with the neighbour count that scored best in cross validation;
# the test features are standardised with the training means/stds as before.
y_cap_test = knn_regression(X_train, y_train, (X_test - means) / stds, k=k_list[np.argmax(validation_scores)])

#### Evaluating Test Set Performance for best k

In [44]:
# R^2 on the held-out test rows for the predictions made with the cross-validated k.
score(y_test, y_cap_test)

0.7750296793223103