# Predicting House Prices using KNN Regression

 Loading libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
pd.set_option('display.max_rows', 200)

In [None]:
TT_SPLIT = 0.2     # fraction of the rows held out as the test set (passed as test_size to train_test_split)
RAND_STATE = 42   # random seed passed to train_test_split so the split is repeatable

In [None]:
# read data
data = pd.read_csv("DATA_Housing_Prices.csv")

 Preprocessing function

In [None]:
def preprocessing(df):
    """
    Transform a raw housing dataframe into the dataset used for modelling.

    The input dataframe is not modified; a new dataframe is returned. Steps:

    1. Add a column 'Antiquity' = 'YrSold' - 'YearRemodAdd'.
    2. Keep only the rows with 'LotArea' < 25000 (rows with 'LotArea' >= 25000 are dropped).
    3. Keep only the columns
       ['OverallQual','TotalBsmtSF','GrLivArea','FullBath','KitchenQual','GarageCars','Antiquity','SalePrice']
       (this also removes every year-related column and 'LotArea').
    4. Replace the values of 'KitchenQual' using {'Fa': 1, 'TA': 1.61, 'Gd': 1.61, 'Ex': 3.14};
       any other value is left unchanged.

    :param df: input dataframe; must contain 'YrSold', 'YearRemodAdd', 'LotArea' and the kept columns
    :return df2: processed dataframe
    """
    # NOTE(review): 'TA' and 'Gd' map to the same number - confirm this is intended.
    kitchen_qual_values = {'Fa': 1, 'TA': 1.61, 'Gd': 1.61, 'Ex': 3.14}

    cols_to_keep = [
        'OverallQual',
        'TotalBsmtSF',
        'GrLivArea',
        'FullBath',
        'KitchenQual',
        'GarageCars',
        'Antiquity',
        'SalePrice',
    ]

    # Computing Antiquity (assign returns a new frame, so the caller's df is untouched)
    df2 = df.assign(Antiquity=df['YrSold'] - df['YearRemodAdd'])

    # Filtering by LotArea and dropping unwanted columns in a single .loc selection.
    # The explicit copy makes df2 an independent frame, so the assignment below
    # does not trigger pandas' SettingWithCopyWarning.
    df2 = df2.loc[df2['LotArea'] < 25000, cols_to_keep].copy()

    # Replacing KitchenQual values by numerical ones.
    df2['KitchenQual'] = df2['KitchenQual'].replace(kitchen_qual_values)

    return df2

## Model evaluation

In [None]:
def _error_metrics(actual, predicted):
    """Return [ME, MAE, MSE, RMSE, MAPE, R2] for one pair of real/predicted values."""
    residuals = actual - predicted
    mse = mean_squared_error(actual, predicted)
    return [
        np.abs(np.mean(residuals)),                     # absolute value of the mean error
        mean_absolute_error(actual, predicted),
        mse,
        np.sqrt(mse),
        np.mean((np.abs(residuals) / actual) * 100.),   # mean absolute percentage error
        r2_score(actual, predicted),
    ]


def model_performance(y_train, y_pred_train, y_test, y_pred_test):
    """
    Compute error metrics for the train and test sets after undoing a log transform.

    All four inputs are passed through np.exp before any metric is computed, so they
    are expected to be on a natural-log scale.

    Side effect: sets pd.options.display.float_format to two decimals.

    :param y_train: real target values of the train set (log scale)
    :param y_pred_train: predicted target values of the train set (log scale)
    :param y_test: real target values of the test set (log scale)
    :param y_pred_test: predicted target values of the test set (log scale)
    :return: tuple (performance, df_train, df_test): performance has one row per metric
             with columns 'Error_metric', 'Train', 'Test'; df_train and df_test hold the
             exponentiated 'Real' and 'Predicted' values of each set
    """
    real_train, real_test = np.exp(y_train), np.exp(y_test)
    pred_train, pred_test = np.exp(y_pred_train), np.exp(y_pred_test)

    metric_names = ['Mean error', 'Mean absolute error', 'Mean squared error',
                    'Root mean squared error', 'Mean absolute percentual error', 'R2']

    performance = pd.DataFrame({
        'Error_metric': metric_names,
        'Train': _error_metrics(real_train, pred_train),
        'Test': _error_metrics(real_test, pred_test),
    })

    pd.options.display.float_format = '{:.2f}'.format

    df_train = pd.DataFrame({'Real': real_train, 'Predicted': pred_train})
    df_test = pd.DataFrame({'Real': real_test, 'Predicted': pred_test})

    return performance, df_train, df_test

## Creating our transformer

Check more information about pipelines [here](https://towardsdatascience.com/pipeline-columntransformer-and-featureunion-explained-f5491f815f)

In [None]:
from sklearn.preprocessing import FunctionTransformer

# FunctionTransformer creates a transformer from a user defined function
procTransformer = FunctionTransformer(preprocessing)

data = procTransformer.transform(data)
data

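# Sketch (not executed) of how the same steps could be chained in a scikit-learn Pipeline:
# data = preprocessing(data)
# pipeline = Pipeline([('drop_columns', drop_cols_transformer), ('standard_scaler', StandardScaler()), ('linear_model', LinearRegression())])
# pipeline.fit(X_train, y_train)
# pipeline[:-1].transform(X_train)   # transformed features; the final estimator step has no transform
# pipeline[:-1].transform(X_test)
# pipeline.predict(X_train)
# pipeline.predict(X_test)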

In [None]:
##data2 = preprocessing(data)
##data2

### Defining X, y

In [None]:
# define X and y (features and target)
X = data.drop(columns=["SalePrice"], axis = 1)
y = np.log(data["SalePrice"])

### Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TT_SPLIT, random_state=RAND_STATE)

# training datasets to DataFrame again to manipulate them
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

### Modelling

In [None]:

import pickle

trans = PowerTransformer()
model = LinearRegression()

# fitting the transformer to the train data only, so nothing about the test set
# leaks into the transformation
trans.fit(X_train)

# transformation
# transform() returns plain arrays; rebuild the DataFrames with the ORIGINAL row
# index so X_train / X_test stay aligned by label with y_train / y_test.
# (The right-hand side is evaluated before the name is rebound, so .index still
# refers to the untransformed frame.)
X_train = pd.DataFrame(trans.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(trans.transform(X_test), columns=X.columns, index=X_test.index)

# model fitting
model.fit(X_train, y_train)

# model prediction (on the log(SalePrice) scale, like y_train / y_test)
y_pred_train_lm = model.predict(X_train)
y_pred_test_lm = model.predict(X_test)

### Evaluating the model performance

In [None]:
performance_lm, _, _= model_performance(y_train, y_pred_train_lm, y_test, y_pred_test_lm)
performance_lm

### KNN

In [None]:
# initialize model (set parameters)
neigh = KNeighborsRegressor(n_neighbors=2) # n_neighbors = K

In [None]:
neigh.fit(X_train, y_train) # Minkowski distance with p = 2 -> Euclidean distance

## Making predictions

In [None]:
# make predictions
y_pred_train_knn = neigh.predict(X_train)
y_pred_test_knn  = neigh.predict(X_test)

performance_knn, _, _ = model_performance(y_train, y_pred_train_knn, y_test, y_pred_test_knn)
performance_knn

In [None]:
def run_kmeans(k_min,k_max):
    """ Fit a KNN regressor for every k (# of neighbors) in [k_min, k_max] and collect the error metrics for each k.

    Despite its name (kept so existing calls keep working) this performs
    k-nearest-neighbors regression, not k-means clustering.
    It reads the module-level X_train, X_test, y_train, y_test and calls model_performance.

    :param k_min: smallest number of neighbors to try (inclusive)
    :param k_max: largest number of neighbors to try (inclusive)
    :return: DataFrame with columns 'k', 'Error_metric', 'Train', 'Test', one row per
             metric per k (the per-k row index is kept, so index values repeat);
             an empty DataFrame when k_min > k_max
    """
    per_k_frames = []

    for k in range(k_min, k_max + 1):

        neigh = KNeighborsRegressor(n_neighbors=k) # define model object
        neigh.fit(X_train, y_train)                # train the model

        y_pred_train_knn = neigh.predict(X_train)  # prediction (train)
        y_pred_test_knn  = neigh.predict(X_test)   # prediction (test)

        performance_knn, _, _ = model_performance(y_train, y_pred_train_knn, y_test, y_pred_test_knn) # error metrics

        # tag every metric row with its k; no assumption on how many metrics there are
        per_k_frames.append(performance_knn.assign(k=k)[['k', 'Error_metric', 'Train', 'Test']])

    # pd.concat raises on an empty list, so keep the "no k requested" result explicit
    if not per_k_frames:
        return pd.DataFrame()

    # a single concat at the end instead of growing a DataFrame inside the loop
    return pd.concat(per_k_frames, axis=0)


In [None]:
k_st = 2 #  minimum number of neighbors to use
k_end = 21 # maximum number of neighbors to use
results=run_kmeans(k_st,k_end)

Checking for overfitting

In [None]:
R2results_df = results[results['Error_metric'] == 'R2'].reset_index(drop=True)

In [None]:
results2 = results.melt(id_vars=['k','Error_metric']) # format nicely
results2

Display error metrics as functions of k for test and train results

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# One panel per error metric, filled row by row on the 2x3 grid
metrics = ['Mean error', 'Mean absolute error', 'Mean squared error',
           'Root mean squared error', 'Mean absolute percentual error', 'R2']

# Put a tick on every k that was actually evaluated (instead of a hard-coded range,
# which left out the largest k)
k_values = sorted(results2['k'].unique())

fig, ax = plt.subplots(2,3, figsize=(20,10))
for axis, metric in zip(ax.flat, metrics):
    sns.lineplot(x = 'k', y = 'value', data = results2[results2['Error_metric'] == metric], hue = 'variable', ax = axis)
    axis.set_xticks(k_values)
    axis.set_title(metric)
    axis.legend(loc='lower right')

## Part 2 - Cross-validation and hyperparameter optimization

### Cross Validation

In [None]:
# initialize model (only parameter is n_jobs=-1, i.e., use all available CPU cores)
neigh = KNeighborsRegressor(n_jobs=-1) # model

# define grid search with K-Fold cross validation
neigh_search = GridSearchCV(estimator=neigh, #  model to use
                            param_grid={"n_neighbors":range(k_st,k_end + 1), # parameters to try; +1 because k_end is the (inclusive) maximum
                                        "weights":["uniform", "distance"]}, # in a given neighborhood, how we weight points relative to query point
                            scoring="r2", # metric used to evaluate the model on the held-out fold of each split
                            cv=10) # cv = num of folds

# all available scoring names (sklearn.metrics.SCORERS was removed in scikit-learn 1.3):
# from sklearn.metrics import get_scorer_names
# sorted(get_scorer_names())

In [None]:
from sklearn.model_selection import  cross_val_score
# Cross-validate the default-parameter KNN model and print one score per fold.
# NOTE(review): this relies on cross_val_score's defaults (presumably 5 folds, scored with the
# estimator's own score method, i.e. R2 for a regressor - confirm for the installed scikit-learn);
# the hard-coded range(5) must match the number of folds or the DataFrame construction fails.
print(pd.DataFrame(cross_val_score(neigh,X_train, y_train),columns=['r2 score'], index=[ "run " + str(l) for l in range(5)])) # print r2 scores

In [None]:
neigh_search.fit(X_train, y_train) # run the grid search for optimal learning parameters

In [None]:
neigh_search.cv_results_ # examine the scores of each our search cases

In [None]:
print(pd.DataFrame(neigh_search.cv_results_["mean_test_score"],columns=['mean test r2 score'], index=[ "run " + str(l) for l in range(len((neigh_search.cv_results_["mean_test_score"])))])) # print the results

In [None]:
neigh_search.cv_results_["params"] # all the parameters we used

In [None]:
cv_res = neigh_search.cv_results_
# Rank the parameter settings from best to worst mean test score.
# Sort on the score only: without a key, two equal scores would make sorted()
# compare the params dicts, which raises TypeError (dicts are not orderable).
sorted(zip(cv_res["mean_test_score"], cv_res["params"]), key=lambda pair: pair[0], reverse=True)

#### Randomized search

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# all the parameters we want to optimize and the values each one may take
param_distributions = {"n_neighbors":range(k_st,k_end + 1), # +1 because k_end is the (inclusive) maximum
                       "weights":["distance", "uniform"]}
# with k_st=2 and k_end=21: 20 K values * 2 possible weights = 40 combinations
# NOTE(review): if n_iter equals the number of combinations, the randomized search
# presumably tries every one of them, like a grid search - confirm against the scikit-learn docs

In [None]:
neigh_randsearch = RandomizedSearchCV(estimator=neigh, # model to use
                                      param_distributions=param_distributions, # parameters to try
                                      n_iter=40,    # number of parameter settings that are sampled
                                      scoring="r2", # strategy to evaluate the performance of the cross-validated model on the test set.
                                      cv=10)

In [None]:
neigh_randsearch.fit(X_train, y_train) # apply model to data

In [None]:
cv_res = neigh_randsearch.cv_results_ # results : r2 scores for the different runs
# Rank the sampled parameter settings from best to worst mean test score.
# Sort on the score only: without a key, two equal scores would make sorted()
# compare the params dicts, which raises TypeError (dicts are not orderable).
sorted(zip(cv_res["mean_test_score"], cv_res["params"]), key=lambda pair: pair[0], reverse=True)