# K-Nearest Neighbors Regression with scikit-learn and a Custom Distance Metric

In [14]:
import numpy as np # fast numerical arrays and math
import scipy as sp # scientific routines (statistics, among other things)
import matplotlib as mpl # top-level matplotlib package
import matplotlib.cm as cm # easy access to colormaps
import matplotlib.pyplot as plt # plotting interface, available as plt
import pandas as pd # tabular data handling via DataFrames
# Configure how pandas renders tables in the notebook.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns # plot styling and additional plot types

# Render floats with two decimal places when tables are displayed.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
# Load the cleaned San Jose housing listings and display the resulting table.
housing_csv = '../data/processed/housing-san-jose-cleaned.csv'
df = pd.read_csv(housing_csv)
df

Unnamed: 0,price,sq ft (living),sq ft (lot),bd,ba,year built,year renov,stories,rooms,address
0,998888,1333,164,2,2,2008,2009,1,4,38 N Almaden Blvd UNIT 911
1,550000,852,4809,2,1,1930,0,1,0,"255 S 24th St,"
2,1200000,2316,5519,4,3,1901,2016,2,13,"755 E Saint James St,"
3,419000,645,1772,1,1,1990,0,1,0,2463 Jubilee Ln
4,2699000,2723,244807,4,3,1949,1949,1,9,11321 Canon Vista Ave
5,889000,1553,5000,4,3,1980,1980,1,7,1851 Pine Hollow Cir
6,675000,1008,5998,3,1,1954,0,1,0,3432 San Pablo Ave


In [3]:
# Select the model inputs (living area and bedroom count) and the target (price).
# The disabled single-feature scatter plot that used to sit here as a string
# literal was removed: it was echoed as cell output and no longer matched the
# two-column feature frame.

x = df[['sq ft (living)','bd']]
y = df['price']

In [4]:
# Build 7 query points: living areas evenly spaced between the smallest and
# largest values in the data, each paired with a hand-picked bedroom count.

living_area = df['sq ft (living)']
a = np.linspace(living_area.min(), living_area.max(), num=7)
b = [1, 2, 1, 3, 4, 3, 2]

query_table = dict([('query sq ft (living)', a), ('query bd', b)])
x2 = pd.DataFrame(query_table).loc[:, ['query sq ft (living)', 'query bd']]

x2

Unnamed: 0,query sq ft (living),query bd
0,645.0,1
1,991.333333,2
2,1337.666667,1
3,1684.0,3
4,2030.333333,4
5,2376.666667,3
6,2723.0,2


In [5]:
# custom distance function

def mydist(x, y, sqft_weight=0.5, bd_weight=0.5, bd_scale=1000):
    """Weighted L1-style distance between two two-component points.

    Component 0 and component 1 of each point are compared separately by
    absolute difference. The component-1 gap is multiplied by ``bd_scale``
    before the two gaps are combined as a weighted sum.

    Parameters
    ----------
    x, y : indexable of two numbers
        The points to compare; only ``[0]`` and ``[1]`` are read.
    sqft_weight : float, default 0.5
        Weight applied to the component-0 gap.
    bd_weight : float, default 0.5
        Weight applied to the (scaled) component-1 gap.
    bd_scale : float, default 1000
        Multiplier applied to the component-1 gap.

    Returns
    -------
    number
        ``sqft_weight*|x0-y0| + bd_weight*|x1-y1|*bd_scale``. With the
        defaults this is exactly the original hard-coded formula.
    """
    sqft_gap = abs(x[0] - y[0])
    bd_gap = abs(x[1] - y[1])
    # Same operand order as the original expression so results are bit-identical.
    return sqft_weight * sqft_gap + bd_weight * bd_gap * bd_scale

#mydist(x,x2)

In [6]:
# Living area of the training row labelled 0. A single .loc[row, column] lookup
# replaces the chained x.loc[0][0], which relied on an integer key falling back
# to positional access on a string-labelled Series (deprecated in pandas 2.1).
x.loc[0, 'sq ft (living)']

1333

In [7]:
# Display the feature frame (living area and bedroom count columns).
x

Unnamed: 0,sq ft (living),bd
0,1333,2
1,852,2
2,2316,4
3,645,1
4,2723,4
5,1553,4
6,1008,3


In [23]:
# Fit a 2-nearest-neighbour regressor that measures closeness with `mydist`,
# then predict a price for every query point.

from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=2,metric=mydist)
# Pass plain arrays rather than DataFrames: x and x2 carry different column
# labels, and recent scikit-learn releases reject predict() input whose
# feature names differ from those seen in fit().
knn.fit(x.values,y)
preds=pd.DataFrame(knn.predict(x2.values))
preds

Unnamed: 0,0
0,484500.0
1,774444.0
2,708944.0
3,782000.0
4,1044500.0
5,1949500.0
6,774444.0


In [24]:
# Assemble a side-by-side summary table. The frames are joined column-wise on
# their shared index, so row i shows training listing i next to query point i
# and its predicted price.
column_order = [
    'sq ft (living)', 'bd', 'price',
    'query sq ft (living)', 'query bd', 'predicted price ($)',
]
result_table = pd.concat([x, x2, df['price'].to_frame()], axis=1)
result_table['predicted price ($)'] = preds
result_table = result_table[column_order]
result_table

Unnamed: 0,sq ft (living),bd,price,query sq ft (living),query bd,predicted price ($)
0,1333,2,998888,645.0,1,484500.0
1,852,2,550000,991.33,2,774444.0
2,2316,4,1200000,1337.67,1,708944.0
3,645,1,419000,1684.0,3,782000.0
4,2723,4,2699000,2030.33,4,1044500.0
5,1553,4,889000,2376.67,3,1949500.0
6,1008,3,675000,2723.0,2,774444.0
