In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
sb.set()

In [2]:
data = pd.read_csv("cleaned_data.csv", index_col = 0)
variables = pd.DataFrame(data.drop(columns=['host_name','neighbourhood','room_type','neighbourhood_group','price']))
price=pd.DataFrame(data['price'])

 ## K-Nearest Neighbors

In [3]:
#split the data and scaling data
sc_X = StandardScaler()
sc_y = StandardScaler()
X = sc_X.fit_transform(variables)
y = sc_y.fit_transform(price)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# check the sample sizes
print("Train Set :", X_train.shape, y_train.shape)
print("Test Set  :", X_test.shape, y_test.shape)

Train Set : (6324, 55) (6324, 1)
Test Set  : (1581, 55) (1581, 1)


## Using KFold to compare

In [4]:
from sklearn.neighbors import KNeighborsRegressor
knn_model = KNeighborsRegressor(n_neighbors=3)

In [5]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import sklearn.metrics as metrics

cv = KFold(n_splits = 10, random_state = 1, shuffle = True)

scores = cross_val_score(knn_model, X_train, y_train.ravel(), scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

print("Mean of MSE = ", np.mean(scores)*-1)

Mean of MSE =  1.1242013343944455


In [6]:
from sklearn.model_selection import GridSearchCV

parameters = {"n_neighbors": range(1, 50)}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(1, 50)})

## Best performing value of K:

In [7]:
gridsearch.best_params_

{'n_neighbors': 28}

In [8]:
scores1 = cross_val_score(gridsearch, X_train, y_train.ravel(), scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

print("Mean of MSE = ", np.mean(scores1)*-1)

Mean of MSE =  0.9305231909448446


## Adding Weighted Average of Neighbors Based on Distance

In [9]:
parameters = {
"n_neighbors": range(1, 50),
"weights": ["uniform", "distance"],
}
w_gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
w_gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': range(1, 50),
                         'weights': ['uniform', 'distance']})

In [10]:
w_gridsearch.best_params_

{'n_neighbors': 28, 'weights': 'uniform'}

In [11]:
scores2 = cross_val_score(w_gridsearch, X_train, y_train.ravel(), scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

print("Mean of MSE = ", np.mean(scores2)*-1)

Mean of MSE =  0.9305231909448446


## Further Improving on kNN in scikit-learn With Bagging

In [12]:
best_k = w_gridsearch.best_params_["n_neighbors"]
best_weights = w_gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)

In [13]:
from sklearn.ensemble import BaggingRegressor
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)

In [14]:
scores3 = cross_val_score(bagging_model, X_train, y_train.ravel(), scoring='neg_mean_squared_error', cv=cv, n_jobs=1)

print("Mean of MSE = ", np.mean(scores3)*-1)

Mean of MSE =  0.9256605438136576


In [15]:
print("Comparison of MSE of the Four Models:")

print("Arbitrary k: ", np.mean(scores)*-1)

print("GridSearchCV for k: ", np.mean(scores1)*-1)

print("GridSearchCV for k and weights: ", np.mean(scores2)*-1)

print("Bagging and GridSearchCV: ", np.mean(scores3)*-1)

Comparison of MSE of the Four Models:
Arbitrary k:  1.1242013343944455
GridSearchCV for k:  0.9305231909448446
GridSearchCV for k and weights:  0.9305231909448446
Bagging and GridSearchCV:  0.9256605438136576


## Thus, based on the results, we use kNN with bagging to predict the price.

In [16]:
bagging_model.fit(X_train, y_train.ravel())
test_preds_grid = bagging_model.predict(X_test)
price_pred = sc_y.inverse_transform(test_preds_grid.reshape(-1,1)) 

In [17]:
y_test = sc_y.inverse_transform(y_test)
df1 = pd.DataFrame(y_test, columns = ['price'])

# rounding up to nearest whole no.
price_pred = np.rint(price_pred)

df = pd.DataFrame(price_pred, columns = ['pred_price'])

# combining total cases test and the predicted total cases into a dataframe
all_data = pd.concat([df1, df], axis = 1)
all_data.head()

Unnamed: 0,price,pred_price
0,168.0,204.0
1,81.0,199.0
2,97.0,87.0
3,142.0,172.0
4,58.0,51.0


In [18]:
print("MAE of model on test data: ", metrics.mean_absolute_error(y_test, price_pred))
print("Mean Squared Error (MSE) \t:", mean_squared_error(y_test, price_pred))
print("Root Mean Squared Error (RMSE)\t:", (mean_squared_error(y_test, price_pred,squared=False)))

MAE of model on test data:  85.33017077798861
Mean Squared Error (MSE) 	: 121315.39974699558
Root Mean Squared Error (RMSE)	: 348.30360283378576
