# Machine learning with GIS-based housing data

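Experiments with housing data enriched with GIS-based features: for each point-of-interest category, the distance to the nearest instance (``_dist``) and the number of instances within a fixed radius (``_count``).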

### Import packages

In [1]:
import json
import math
import warnings
# Blanket filter: suppresses every warning category from here on.
# NOTE(review): this may also hide deprecation or convergence warnings raised by the libraries imported below.
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn import linear_model

from utils import make_train_test, get_metrics, cross_validation, soos_validation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
from tqdm.notebook import tqdm
from IPython.display import display_html

Define constants.

- ``PATH``: Path to the base data folder
- ``COUNT_RADIUS``: Maximum distance for count based features
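- ``K_FOLDS``: Number of folds to perform for cross validation
- ``DUMMIES``: Columns passed to ``make_train_test`` as ``dummies`` (categorical columns to be dummy coded)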

In [2]:
# NOTE(review): absolute, machine-specific path; adjust it before running on another machine.
PATH = "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
COUNT_RADIUS = 3500  # in meters
K_FOLDS = 5
# Columns handed to make_train_test as `dummies` (presumably dummy coded there; confirm in utils).
DUMMIES = ["MUNICODE"]

Load structured data with added GIS features.

In [3]:
# The file name encodes the count radius used for the ``_count`` features.
gis_csv_name = f"structured_gis_category_features_{COUNT_RADIUS}_radius.csv"
structured_gis = pd.read_csv(PATH + gis_csv_name)
print(structured_gis.shape)
structured_gis.head(10)

(9554, 137)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,apartment_buildings_dist,apartment_buildings_count,faith-based_facilities_dist,faith-based_facilities_count,restaurants_dist,restaurants_count,community_nonprofit_orgs_dist,community_nonprofit_orgs_count,bus_stops_dist,bus_stops_count
0,230053,15025,878,44,87803,9900,04-21-2015,170000.0,142400,1.0,...,2688.731014,3,2345.385039,3,2062.585612,4,231.140135,28,3030.85734,10
1,59600,15044,938,3,93801,157687,02-06-2017,715000.0,624900,2.0,...,3266.415457,1,1192.377424,5,2993.958582,13,988.415027,41,5957.45266,0
2,59872,15106,812,7,81203,5760,12-13-2019,180000.0,137500,2.0,...,350.030769,17,358.605655,22,271.033288,65,199.170062,143,146.934459,187
3,233732,15215,801,17,80102,4000,08-17-2017,445000.0,388900,2.0,...,400.925709,23,72.114896,34,208.885548,87,86.770573,190,289.644717,365
4,245949,15108,817,11,81702,7500,02-10-2017,149500.0,130700,2.0,...,352.99811,7,490.160829,12,67.865382,32,173.833546,59,170.112583,75
5,245988,15236,873,44,87302,2800,06-15-2017,132000.0,115400,1.0,...,754.395334,28,297.717978,20,485.603501,81,71.868207,104,496.138025,204
6,257252,15228,926,26,92601,31263,07-13-2017,495000.0,329000,2.0,...,535.446337,59,455.585543,39,641.186114,104,39.431604,237,701.447824,155
7,246320,15132,409,23,40005,2500,05-25-2017,47000.0,40600,2.0,...,1601.435649,21,716.901283,30,82.472518,70,194.883164,126,800.367569,167
8,246781,15221,828,9,82801,13620,06-06-2018,170000.0,150400,2.0,...,564.82474,54,499.314807,50,241.708847,94,320.049787,265,117.035343,496
9,247147,15068,880,31,88008,4897,11-25-2015,60000.0,67700,1.0,...,5443.429505,0,2482.600144,5,1763.161382,14,154.624213,16,1595.881571,28


Create an empty results DataFrame; each model's cross-validation metrics are added to it as one column.

In [4]:
# Starts empty; every cross_validation call below adds one column of metrics.
results_df = pd.DataFrame()

## Only distance to nearest

Filter out all ``_count`` features.

In [5]:
# Keep every column whose name does not contain "_count".
non_count_columns = [name for name in structured_gis.columns if "_count" not in name]
structured_gis_dist = structured_gis[non_count_columns]
structured_gis_dist.shape

(9554, 101)

In [6]:
# Build the splits from the distance-only frame.
# Judging by the names and the printed shapes, data_sets bundles the full data,
# the train/test split and a further train/validation split of the training
# part (confirm in utils.make_train_test).
X_columns, data_sets, error_df = make_train_test(structured_gis_dist, DUMMIES)
(
    X, y,
    X_train, X_test, y_train, y_test,
    X_train_train, X_train_val, y_train_train, y_train_val,
) = data_sets

removed column MUNICODE_861, first occurence in test

(7165, 251): (5373, 252) + (1792, 252)
(7165,): (5373,) + (1792,)
(2389, 251)
(2389,)


### Linear regression

In [7]:
# Linear baseline on structured + distance features.
# Other candidates left for experimentation:
#   linear_model.LinearRegression(), linear_model.Ridge()
# NOTE(review): the S+D+C linear model (model_03) uses Ridge while this one uses
# Lasso, so the two "Linear" result columns also differ in regularisation.
# Confirm that this is intended.
model_01 = linear_model.Lasso()
model_01.fit(X=X_train, y=y_train)

Lasso()

In [8]:
# Evaluate the fitted linear model on the held-out test split.
y_pred_01 = model_01.predict(X_test)
# get_metrics prints MAE, RMSE, MAPE and R^2 (see the cell output); its return value is kept in `metrics`.
metrics = get_metrics(y_test, y_pred_01)

MAE:  32438
RMSE: 46144
MAPE: 19.62%
R^2:  0.874


Cross validation

In [9]:
# K_FOLDS-fold cross-validation on the full data. The first return value becomes
# this model's column in results_df (rows are later labelled MAE, RMSE, MAPE, R^2).
cv_metrics_01, X_01_columns = cross_validation(model_01, X, y, K_FOLDS)
results_df["Linear: S+D"] = cv_metrics_01

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_302, first occurence in test
removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_123, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_822, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  32756
RMSE: 47635
MAPE: 20.21%
R^2:  0.878


### Catboost

In [10]:
# CatBoost with default hyperparameters on structured + distance features.
model_02 = CatBoostRegressor()
# verbose=False silences the per-iteration training log.
model_02.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x2bc0a4ed048>

In [11]:
# Evaluate the fitted CatBoost model on the held-out test split.
y_pred_02 = model_02.predict(X_test)
# get_metrics prints MAE, RMSE, MAPE and R^2 (see the cell output); its return value is kept in `metrics`.
metrics = get_metrics(y_test, y_pred_02)

MAE:  28619
RMSE: 41418
MAPE: 16.5%
R^2:  0.899


Cross validation

In [12]:
# K_FOLDS-fold cross-validation on the full data. The first return value becomes
# this model's column in results_df (rows are later labelled MAE, RMSE, MAPE, R^2).
cv_metrics_02, X_02_columns = cross_validation(model_02, X, y, K_FOLDS)
results_df["Catboost: S+D"] = cv_metrics_02

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_302, first occurence in test
removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_123, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_822, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  29182
RMSE: 43473
MAPE: 16.98%
R^2:  0.899


## Distance to nearest and count in radius

Make train/test set for ``_dist`` and ``_count`` features

In [13]:
# Rebuild the splits from the full frame, so that both ``_dist`` and ``_count``
# features are included. This rebinds X, y and all split variables.
X_columns, data_sets, error_df = make_train_test(structured_gis, DUMMIES)
(
    X, y,
    X_train, X_test, y_train, y_test,
    X_train_train, X_train_val, y_train_train, y_train_val,
) = data_sets

removed column MUNICODE_861, first occurence in test

(7165, 287): (5373, 288) + (1792, 288)
(7165,): (5373,) + (1792,)
(2389, 287)
(2389,)


### Linear regression

In [14]:
# Linear baseline on structured + distance + count features.
# Other candidates left for experimentation:
#   linear_model.LinearRegression(), linear_model.Lasso()
# NOTE(review): the S+D linear model (model_01) uses Lasso while this one uses
# Ridge, so the two "Linear" result columns also differ in regularisation.
# Confirm that this is intended.
model_03 = linear_model.Ridge()
model_03.fit(X=X_train, y=y_train)

Ridge()

In [15]:
# Evaluate the fitted linear model on the held-out test split.
y_pred_03 = model_03.predict(X_test)
# get_metrics prints MAE, RMSE, MAPE and R^2 (see the cell output); its return value is kept in `metrics`.
metrics = get_metrics(y_test, y_pred_03)

MAE:  32271
RMSE: 45933
MAPE: 19.6%
R^2:  0.875


Cross validation

In [16]:
# K_FOLDS-fold cross-validation on the full data. The first return value becomes
# this model's column in results_df (rows are later labelled MAE, RMSE, MAPE, R^2).
cv_metrics_03, X_03_columns = cross_validation(model_03, X, y, K_FOLDS)
results_df["Linear: S+D+C"] = cv_metrics_03

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_302, first occurence in test
removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_123, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_822, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  32502
RMSE: 47146
MAPE: 20.04%
R^2:  0.881


### Catboost

In [17]:
# CatBoost with default hyperparameters on structured + distance + count features.
model_04 = CatBoostRegressor()
# verbose=False silences the per-iteration training log.
model_04.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x2bc0a46ec48>

In [18]:
# Evaluate the fitted CatBoost model on the held-out test split.
y_pred_04 = model_04.predict(X_test)
# get_metrics prints MAE, RMSE, MAPE and R^2 (see the cell output); its return value is kept in `metrics`.
metrics = get_metrics(y_test, y_pred_04)

MAE:  28192
RMSE: 41126
MAPE: 16.06%
R^2:  0.9


In [19]:
# Signed residuals (actual minus predicted) of the S+D+C CatBoost model on the
# test split. Both operands are converted to plain arrays, so the subtraction is
# positional and ignores any pandas index.
error_df["catboost"] = np.asarray(y_test) - np.asarray(y_pred_04)

Cross validation

In [20]:
# K_FOLDS-fold cross-validation on the full data. The first return value becomes
# this model's column in results_df (rows are later labelled MAE, RMSE, MAPE, R^2).
cv_metrics_04, X_04_columns = cross_validation(model_04, X, y, K_FOLDS)
results_df["Catboost: S+D+C"] = cv_metrics_04

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_302, first occurence in test
removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_123, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_822, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  28975
RMSE: 43292
MAPE: 16.86%
R^2:  0.899


## Results

In [21]:
# Label the four metric rows returned by cross_validation.
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
# Reorder columns so the two linear and the two CatBoost variants sit side by side.
results_df = results_df[["Linear: S+D", "Linear: S+D+C", "Catboost: S+D", "Catboost: S+D+C"]]
# Write the index as well: it carries the metric names, and without it the rows
# of the saved CSV cannot be told apart.
results_df.to_csv(
    PATH + f"results/structured_gis_{COUNT_RADIUS}_results.csv", index_label="metric")
print(f"Results for a count radius of {COUNT_RADIUS}m.")
results_df.head()

Results for a count radius of 3500m.


Unnamed: 0,Linear: S+D,Linear: S+D+C,Catboost: S+D,Catboost: S+D+C
MAE,32756.0,32502.0,29182.0,28975.0
RMSE,47635.0,47146.0,43473.0,43292.0
MAPE,20.21,20.04,16.98,16.86
R^2,0.878,0.881,0.899,0.899


## Feature selection

Create train/test split.

In [22]:
# Rebuild the splits from the full frame (``_dist`` and ``_count`` features).
# NOTE(review): this rebinds error_df, so the "catboost" residual column added
# to the previous error_df is no longer reachable through that name.
X_columns_text, data_sets, error_df = make_train_test(structured_gis, dummies=DUMMIES)
(
    X, y,
    X_train, X_test, y_train, y_test,
    X_train_train, X_train_val, y_train_train, y_train_val,
) = data_sets

removed column MUNICODE_861, first occurence in test

(7165, 287): (5373, 288) + (1792, 288)
(7165,): (5373,) + (1792,)
(2389, 287)
(2389,)


Use [Recursive Feature Elimination](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html) to eliminate features and find out the optimal number of features.

In [23]:
# Recursive feature elimination with cross-validation, left disabled; the next
# cell loads an already fitted selector from 'features/rfecv_gis.gz' instead.
# To refit, uncomment:
# estimator = CatBoostRegressor(iterations=500, verbose=False)
# selector = RFECV(estimator, step=5, cv=5, verbose=1)
# selector = selector.fit(X_train, y_train)

In [24]:
# To (re)create the cached file after fitting a selector:
#   _ = joblib.dump(selector, 'features/rfecv_gis.gz')
# joblib.load unpickles the file, so only load selector files from a trusted source.
selector = joblib.load('features/rfecv_gis.gz')

# Boolean mask over the columns the selector was fitted on; True marks a kept feature.
feature_mask = selector.support_
print(f"optimal number of features: {selector.n_features_}")

optimal number of features: 207


Create list of features to drop.

In [25]:
# Columns the RFECV selector rejected (mask entry is False).
# NOTE(review): assumes X_train has the same columns, in the same order, as the
# data the loaded selector was fitted on.
rejected_columns = np.array(X_train.columns)[~feature_mask]
# Never drop dummy-coded spatial membership columns. Every entry of DUMMIES is
# checked (not only the first), and an empty DUMMIES list simply keeps all
# rejected columns in the drop list.
to_drop = [
    col for col in rejected_columns
    if not any(dummy in col for dummy in DUMMIES)
]

## Spatial out-of-sample test

In [26]:
# Spatial out-of-sample validation with a default CatBoost model.
# The log output shows one prediction round per district.
estimator = CatBoostRegressor()
# To exclude the features rejected by RFECV as well, call instead:
#   soos_validation(estimator, structured_gis, additional_drops=to_drop)
soos_results = soos_validation(estimator, structured_gis)
error_df_soos, col_names, avg_fis, metrics = soos_results
# Four parallel sequences of metric values (presumably one entry per district; confirm in utils).
maes, rmses, mapes, r_squareds = metrics

Predicting district 1/13
Predicting district 2/13
Predicting district 3/13
Predicting district 4/13
Predicting district 5/13
Predicting district 6/13
Predicting district 7/13
Predicting district 8/13
Predicting district 9/13
Predicting district 10/13
Predicting district 11/13
Predicting district 12/13
Predicting district 13/13

Weighted metrics:
MAE:  41785
RMSE: 58712
MAPE: 24.08%
R^2:  0.662


In [27]:
# Persist the per-sample out-of-sample errors, then show the frame.
soos_errors_csv = PATH + "results/errors_soos_gis.csv"
error_df_soos.to_csv(soos_errors_csv)
error_df_soos

Unnamed: 0,id,lat,long,district,prediction,error
0,77497,40.362407,-80.046209,district_5,130251.188782,55723.811218
1,396546,40.450605,-80.217889,district_1,178578.547591,16421.452409
2,255751,40.508760,-80.082683,district_1,141684.252826,53315.747174
3,461687,40.606085,-79.930633,district_3,167037.700206,-7037.700206
4,141966,40.423962,-79.788868,district_8,355096.294911,-116096.294911
...,...,...,...,...,...,...
9549,282852,40.326328,-80.027753,district_5,117503.279462,2496.720538
9550,243406,40.360855,-80.016675,district_6,139277.370589,-44277.370589
9551,263392,40.607721,-79.930186,district_3,197501.037332,-10501.037332
9552,536570,40.373072,-79.811263,district_9,229219.590980,-154819.590980


In [28]:
# One label per metric entry returned by soos_validation, instead of a
# hard-coded district count.
# NOTE(review): assumes the metric lists are ordered district_1, district_2, ...;
# confirm against utils.soos_validation.
districts = ["district_" + str(i) for i in range(1, len(maes) + 1)]
metrics_df = pd.DataFrame(data={"district": districts, "mae": maes, "rmse": rmses, "mapes": mapes, "R^2": r_squareds})
metrics_df = metrics_df.set_index("district")
metrics_df.to_csv(PATH + "results/errors_soos_district_gis.csv")

# Add more information about each district to characterize it.
# The columns are selected before averaging: taking the mean of the whole frame
# would include string columns such as SALEDATE, which raises a TypeError on
# pandas >= 2.0.
district_profile_columns = ["SALEPRICE", "LOTAREA", "YEARBLT", "STORIES"]
df_agg = structured_gis.groupby(by="DISTRICT")[district_profile_columns].mean()
# Both frames are indexed by district label, so concat aligns them row by row.
metrics_df_agg = pd.concat([metrics_df, df_agg], axis=1)

metrics_df_agg

Unnamed: 0,mae,rmse,mapes,R^2,SALEPRICE,LOTAREA,YEARBLT,STORIES
district_1,35582.802193,46640.240934,17.081168,0.732878,219888.788503,18311.494577,1959.376356,1.557158
district_2,58834.105142,85598.164165,16.732823,0.717353,347195.02963,28928.46455,1968.440212,1.703175
district_3,54193.279748,85840.940349,19.495726,0.728172,261041.727106,22902.147436,1959.855311,1.500916
district_4,28101.696112,39433.652083,17.209548,0.818003,191448.015873,16155.658009,1956.578644,1.48557
district_5,41636.032657,58226.095003,14.799241,0.772069,279481.883882,13725.758232,1956.907279,1.650347
district_6,23741.28951,30867.701533,17.788632,0.739094,159027.268519,10129.287037,1955.130658,1.43107
district_7,42371.404043,51764.411389,42.613141,0.503466,125234.332921,11437.024752,1950.638614,1.465099
district_8,36303.365586,47532.264605,27.35191,0.639433,158730.732,13534.752,1956.425333,1.457333
district_9,27906.774273,35315.193527,35.671461,0.353656,100983.590308,12213.361233,1951.126285,1.278267
district_10,58865.83976,83173.56385,50.227775,0.586973,190825.944681,7570.417021,1936.978723,1.878723


## Exploring the feature importances of the solution

In [29]:
# NOTE(review): 54 is presumably the number of leading non-GIS columns in
# col_names; confirm against utils.soos_validation.
first_gis_feature = 54
cols_sliced = col_names[first_gis_feature:]
fis_sliced = avg_fis[first_gis_feature:]

# Show the 15 features with the highest importance.
fi_df = pd.DataFrame(data={"feature": cols_sliced, "importance": fis_sliced})
fi_ranked = fi_df.sort_values(by=["importance"], ascending=False)
fi_ranked.head(15)

Unnamed: 0,feature,importance
1,senior_centers_dist,1.852932
24,nursing_homes_count,1.699187
38,banks_count,1.559306
3,health_centers_dist,1.511459
16,bars_count,1.186636
17,bike_share_stations_dist,1.127477
62,apartment_buildings_count,1.114907
51,parks_and_facilities_dist,1.104522
64,faith-based_facilities_count,1.045127
19,affordable_housing_dist,1.042578
