In [20]:
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
dc_listings = pd.read_csv('./data/dc_airbnb.csv')
# Strip the dollar signs ($) and commas (,) from the price column so that it
# can be converted to float. `regex=True` is passed explicitly: pandas >= 2.0
# treats the pattern as a literal string by default, in which case nothing
# would be removed and the float conversion would fail.
dc_listings = dc_listings.assign(
    price=dc_listings.price.str.replace(r'[$,]', '', regex=True).astype('float')
)

# Positional (unshuffled) split:
#   training set: first 75 % of the rows
#   test set:     remaining 25 %
train_rows = int(dc_listings.shape[0] * 0.75)
train_df = dc_listings.iloc[:train_rows]
# The test frame is copied because later cells add prediction columns to it.
test_df = dc_listings.iloc[train_rows:].copy()

In [42]:
def predict_price(value, feature, k=5):
    """Predict a price as the mean price of the k nearest training listings.

    Distance is the absolute difference between `value` and each row's
    `feature` value in the module-level `train_df`.

    Params:
        value: feature value of the listing whose price is being predicted.
        feature: name of the column to compare on (accommodates, bathrooms, ...).
        k: number of nearest neighbours to average over. Defaults to 5.

    Returns:
        Mean of the `price` column over the `k` rows with the smallest distance.
    """
    # `assign` returns a new frame, so `train_df` itself is never modified and
    # no explicit copy is needed on each call.
    by_distance = train_df.assign(
        distance=np.abs(value - train_df[feature])
    ).sort_values('distance')

    return by_distance['price'].iloc[:k].mean()

### Evaluating model performance

In [35]:
# Predict every test listing's price from its `accommodates` value alone.
accommodates_predictions = test_df['accommodates'].apply(
    lambda accommodates: predict_price(accommodates, 'accommodates')
)
test_df = test_df.assign(predicted_price=accommodates_predictions)

# Residuals (actual minus predicted), shared by both metrics below.
residuals = test_df['price'] - test_df['predicted_price']

# Mean absolute error
mae = residuals.abs().mean()

# Mean squared error
mse = residuals.pow(2).mean()

In [48]:
# Repeat the univariate prediction with `bathrooms` as the feature; this
# overwrites the `predicted_price` column in `test_df`.
bathrooms_predictions = test_df['bathrooms'].apply(
    lambda bathrooms: predict_price(bathrooms, 'bathrooms')
)
test_df.loc[:, 'predicted_price'] = bathrooms_predictions

# Keep the per-row squared error as a column, then average it into the MSE.
test_df.loc[:, 'squared_error'] = (test_df['price'] - test_df['predicted_price']).pow(2)
mse = test_df['squared_error'].mean()

In [50]:
# Root mean squared error: the square root of the MSE computed above.
rmse = mse ** 0.5

In [52]:
# Two synthetic error series that are identical except for the last value:
# the second one ends in 1000 instead of 10.
alternating_errors = [5, 10] * 9
errors_one = pd.Series(alternating_errors)
errors_two = pd.Series(alternating_errors[:-1] + [1000])

In [61]:
# MAE is the plain mean of the errors; RMSE is the square root of the mean of
# their squares. Both are computed for each synthetic series.
mae_one = errors_one.mean()
mae_two = errors_two.mean()
rmse_one = errors_one.pow(2).mean() ** 0.5
rmse_two = errors_two.pow(2).mean() ** 0.5

In [62]:
# Display MAE and RMSE of the first series side by side.
(mae_one, rmse_one)

(7.5, 7.905694150420948)

In [63]:
# Display MAE and RMSE of the second series (the one ending in 1000).
(mae_two, rmse_two)

(62.5, 235.82302686548658)

### Multivariate K-nearest Neighbors

In [76]:
# Columns excluded from the multivariate model.
columns_to_drop = [
    'host_response_rate', 'host_acceptance_rate', 'host_listings_count',
    'room_type', 'city', 'state',
    'latitude', 'longitude', 'zipcode',
    'cleaning_fee', 'security_deposit',
]

# Drop those columns, then discard rows missing any of the three listed features.
dc_listings = (
    dc_listings
    .drop(columns=columns_to_drop)
    .dropna(subset=['bedrooms', 'bathrooms', 'beds'])
)

# Standardise every remaining column: subtract its mean, divide by its
# standard deviation.
column_means = dc_listings.mean()
column_stds = dc_listings.std()
normalized_listings = (dc_listings - column_means) / column_stds

# The target stays in its original units, so restore the raw price column.
normalized_listings['price'] = dc_listings['price']

In [99]:
# Euclidean distance between the standardised `accommodates` and `bathrooms`
# columns, each passed as one vector over all listings.
distance.euclidean(
    normalized_listings['accommodates'],
    normalized_listings['bathrooms'],
)

55.536699517309025

In [109]:
# Euclidean distance between the first and the fifth listing, using only the
# columns at positions 0 and 2.
# NOTE(review): presumably `accommodates` and `bathrooms` -- confirm against
# the column order of `normalized_listings`.
feature_positions = [0, 2]
first_listing = normalized_listings.iloc[0, feature_positions]
fifth_listing = normalized_listings.iloc[4, feature_positions]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)

In [15]:
# Rebuild the dataset from the CSV so the multivariate models below start
# from a known state: parse price, drop unused columns and incomplete rows,
# standardise the features, then split into train and test.
dc_listings = pd.read_csv('./data/dc_airbnb.csv')
# Strip the dollar signs ($) and commas (,) from the price column so that it
# can be converted to float. `regex=True` is passed explicitly: pandas >= 2.0
# treats the pattern as a literal string by default, in which case nothing
# would be removed and the float conversion would fail.
dc_listings = dc_listings.assign(
    price=dc_listings.price.str.replace(r'[$,]', '', regex=True).astype('float')
)
columns_to_drop = ['host_response_rate', 'host_acceptance_rate', 'host_listings_count',
                   'room_type', 'city', 'state',
                   'latitude', 'longitude', 'zipcode',
                   'cleaning_fee', 'security_deposit']
dc_listings = dc_listings.drop(columns_to_drop, axis=1)
dc_listings = dc_listings.dropna(subset=['bedrooms', 'bathrooms', 'beds'])
# Standardise every column (subtract mean, divide by standard deviation),
# then restore the raw price so the target stays in its original units.
normalized_listings = (dc_listings - dc_listings.mean()) / dc_listings.std()
normalized_listings['price'] = dc_listings['price']
# Training set: 75 %
# Test set: 25 %
# NOTE(review): 2792 is a hard-coded split position; presumably it matches
# 75 % of the rows in the CSV -- confirm against the dataset size.
train_df = normalized_listings.iloc[0:2792]
test_df = normalized_listings.iloc[2792:]

2 variables prediction

In [26]:
# Fit a 5-nearest-neighbours regressor on two standardised features and
# score it on the held-out listings.
two_features = ['accommodates', 'bathrooms']

knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
knn.fit(train_df[two_features], train_df['price'])
predictions = knn.predict(test_df[two_features])

two_features_mse = mean_squared_error(test_df['price'], predictions)
two_features_rmse = two_features_mse ** 0.5

(two_features_mse, two_features_rmse)

(12880.785483503983, 113.493548202107)

4 variables prediction

In [30]:
# Same 5-nearest-neighbours setup, now with four standardised features.
four_features = ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']

knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
knn.fit(train_df[four_features], train_df['price'])
four_predictions = knn.predict(test_df[four_features])

four_mse = mean_squared_error(test_df['price'], four_predictions)
four_rmse = four_mse ** 0.5

(four_mse, four_rmse)

(12542.466120591584, 111.99315211472344)

All variables prediction

In [None]:
# Use every column except the target as a feature.
all_features = train_df.columns.drop('price')
knn = KNeighborsRegressor(n_neighbors=5, algorithm='brute')
knn.fit(train_df[all_features], train_df['price'])
all_features_predictions = knn.predict(test_df[all_features])
all_features_mse = mean_squared_error(test_df['price'], all_features_predictions)
# RMSE is the square root of the MSE, so the exponent must be 0.5, matching
# the two- and four-feature cells.
all_features_rmse = all_features_mse ** 0.5
all_features_mse, all_features_rmse