# Machine learning with GIS based housing data

Experimenting with GIS based housing data.

### Import packages

In [1]:
import json
import math
import warnings
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

from utils import make_train_test, get_metrics, cross_validation, soos_validation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display_html

Definde constants.

- ``PATH``: Path to the base data folder
- ``COUNT_RADIUS``: Maximum distance for count based features
- ``K_FOLDS``: Number of folds to perform for cross validation

In [2]:
PATH = "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
COUNT_RADIUS = 3500  # in meters
K_FOLDS = 5
DUMMIES = []

Load structured data with added GIS features.

In [3]:
structured_gis = pd.read_csv(PATH+f"structured_gis_category_features_{COUNT_RADIUS}_radius.csv")
print(structured_gis.shape)
structured_gis.head(10)

(9556, 137)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,apartment_buildings_dist,apartment_buildings_count,faith-based_facilities_dist,faith-based_facilities_count,restaurants_dist,restaurants_count,community_nonprofit_orgs_dist,community_nonprofit_orgs_count,bus_stops_dist,bus_stops_count
0,427021,15037,908,16,90803,10276,04-21-2017,179900.0,157200,2.0,...,1295.712181,2,430.854222,3,766.161991,11,430.131485,34,4100.223256,0
1,428296,15106,850,7,85002,9375,12-27-2019,185000.0,180400,1.5,...,1855.096394,29,1418.878172,23,556.515602,79,132.952974,140,952.974317,193
2,428307,15237,927,27,92705,14827,01-17-2017,226400.0,167900,1.0,...,720.786679,21,921.939028,16,625.040058,73,322.297063,105,982.293948,122
3,230894,15236,877,4,87702,8206,08-17-2018,140500.0,122800,1.0,...,755.689725,30,924.038244,27,761.305188,99,23.971638,145,151.018551,174
4,231082,15241,950,42,95002,13050,07-17-2020,365000.0,255300,2.0,...,1274.896936,6,1823.857016,20,890.492128,54,31.019148,118,1132.092745,90
5,429294,15220,120,47,12001,10553,04-04-2017,203500.0,152900,1.0,...,836.578377,62,736.642125,47,349.591633,173,345.953494,293,335.095565,336
6,429419,15237,940,28,94002,11266,08-12-2019,226000.0,133700,1.0,...,286.150889,23,236.638991,21,335.390755,109,617.593485,125,160.819866,182
7,430193,15122,870,45,87002,12445,09-08-2020,80000.0,68500,2.0,...,1346.471821,15,531.013014,57,529.601447,84,342.584995,153,36.344021,371
8,430698,15143,884,27,88405,100188,04-04-2018,295000.0,250800,1.0,...,1869.882119,1,1839.573113,9,2127.553934,6,783.373282,50,5551.757438,0
9,431954,15136,919,24,91901,11587,01-04-2017,280000.0,244700,2.0,...,1002.117208,6,895.534897,8,850.532377,39,268.493993,60,373.789015,165


Create results df

In [4]:
results_df = pd.DataFrame()

## Only distance to nearest

Filter out all ``_count`` features.

In [5]:
structured_gis_dist = structured_gis[[col for col in structured_gis.columns if "_count" not in col]]
structured_gis_dist.shape

(9556, 101)

In [6]:
X_columns, data_sets, error_df = make_train_test(structured_gis_dist, DUMMIES)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets


(7167, 89): (5375, 89) + (1792, 89)
(7167,): (5375,) + (1792,)
(2389, 89)
(2389,)


### Linear regression

In [7]:
# model_01 = linear_model.LinearRegression()
model_01 = linear_model.Lasso()
# model_01 = linear_model.Ridge()
model_01.fit(X_train, y_train)

Lasso()

In [8]:
y_pred_01 = model_01.predict(X_test)
metrics = get_metrics(y_test, y_pred_01)

MAE:  39367
RMSE: 55089
MAPE: 24.25%
R^2:  0.829


Cross validation

In [9]:
results_df["Linear: S+D"], X_01_columns = cross_validation(model_01, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  41501
RMSE: 58637
MAPE: 26.38%
R^2:  0.815


### Catboost

In [10]:
model_02 = CatBoostRegressor()
model_02.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x1a6b5afeb48>

In [11]:
y_pred_02 = model_02.predict(X_test)
metrics = get_metrics(y_test, y_pred_02)

MAE:  28725
RMSE: 41493
MAPE: 16.59%
R^2:  0.903


Cross validation

In [12]:
results_df["Catboost: S+D"], X_02_columns = cross_validation(model_02, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  29852
RMSE: 44221
MAPE: 17.72%
R^2:  0.895


## Distance to nearest and count in radius

Make train/test set for ``_dist`` and ``_count`` features

In [13]:
X_columns, data_sets, error_df = make_train_test(structured_gis, DUMMIES)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets


(7167, 125): (5375, 125) + (1792, 125)
(7167,): (5375,) + (1792,)
(2389, 125)
(2389,)


### Linear regression

In [14]:
# model_03 = linear_model.LinearRegression()
# model_03 = linear_model.Lasso()
model_03 = linear_model.Ridge()
model_03.fit(X_train, y_train)

Ridge()

In [15]:
y_pred_03 = model_03.predict(X_test)
metrics = get_metrics(y_test, y_pred_03)

MAE:  35561
RMSE: 49586
MAPE: 22.33%
R^2:  0.862


Cross validation

In [16]:
results_df["Linear: S+D+C"], X_03_columns = cross_validation(model_03, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  37300
RMSE: 52906
MAPE: 23.78%
R^2:  0.85


### Catboost

In [17]:
model_04 = CatBoostRegressor()
model_04.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x1a6b5b075c8>

In [18]:
y_pred_04 = model_04.predict(X_test)
metrics = get_metrics(y_test, y_pred_04)

MAE:  28098
RMSE: 40853
MAPE: 15.98%
R^2:  0.906


In [19]:
error_df["catboost"] = [test - pred for test, pred in zip(y_test, y_pred_04)]

Cross validation

In [20]:
results_df["Catboost: S+D+C"], X_04_columns = cross_validation(model_04, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  29161
RMSE: 43369
MAPE: 17.15%
R^2:  0.899


## Results

In [21]:
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
# reorder columns
results_df = results_df[["Linear: S+D", "Linear: S+D+C", "Catboost: S+D", "Catboost: S+D+C"]]
results_df.to_csv(
    PATH + f"results/structured_gis_{COUNT_RADIUS}_results.csv", index=False)
print(f"Results for a count radius of {COUNT_RADIUS}m.")
results_df.head()

Results for a count radius of 3500m.


Unnamed: 0,Linear: S+D,Linear: S+D+C,Catboost: S+D,Catboost: S+D+C
MAE,41501.0,37300.0,29852.0,29161.0
RMSE,58637.0,52906.0,44221.0,43369.0
MAPE,26.38,23.78,17.72,17.15
R^2,0.815,0.85,0.895,0.899


## Spatial out-of-sample test

In [22]:
estimator = CatBoostRegressor()
error_df_soos, metrics = soos_validation(estimator, structured_gis)
maes, rmses, mapes, r_squareds = metrics

Predicting district 1/13
Predicting district 2/13
Predicting district 3/13
Predicting district 4/13
Predicting district 5/13
Predicting district 6/13
Predicting district 7/13
Predicting district 8/13
Predicting district 9/13
Predicting district 10/13
Predicting district 11/13
Predicting district 12/13
Predicting district 13/13

Weighted metrics:
MAE:  42310
RMSE: 59284
MAPE: 24.76%
R^2:  0.649


In [23]:
error_df_soos.to_csv(PATH+"results/errors_soos_gis.csv")
error_df_soos

Unnamed: 0,id,lat,long,district,prediction,error
0,508783,40.360665,-79.978464,district_6,141265.469276,14234.530724
1,186641,40.405091,-80.045501,district_12,351360.386893,-54360.386893
2,326572,40.420452,-79.748534,district_8,318994.203150,-8994.203150
3,456854,40.403449,-79.803452,district_8,237371.842271,-47371.842271
4,81010,40.439363,-79.791659,district_8,225006.959789,-5006.959789
...,...,...,...,...,...,...
9551,7078,40.512509,-80.161883,district_1,135310.414911,9689.585089
9552,191240,40.485395,-80.039425,district_13,102112.311613,62887.688387
9553,169806,40.451221,-80.215098,district_1,120994.223657,-24494.223657
9554,11579,40.408977,-80.067776,district_4,257449.393574,-52649.393574


In [24]:
districts = ["district_"+str(i) for i in range(1,14)]
metrics_df = pd.DataFrame(data={"district":districts, "mae":maes, "rmse":rmses, "mapes":mapes, "R^2":r_squareds})
metrics_df = metrics_df.set_index("district")
metrics_df.to_csv(PATH+"results/errors_soos_district_gis.csv")

# add more information about each district to characterize
df_agg = structured_gis.groupby(by="DISTRICT").mean()
df_agg  = df_agg[["SALEPRICE", "LOTAREA", "YEARBLT", "STORIES"]]
metrics_df_agg = pd.concat([metrics_df, df_agg], axis=1)

metrics_df_agg

Unnamed: 0,mae,rmse,mapes,R^2,SALEPRICE,LOTAREA,YEARBLT,STORIES
district_1,33552.269782,44683.610007,16.205092,0.754597,219849.905742,18302.388949,1959.393283,1.557638
district_2,58993.535207,85553.220069,16.565603,0.71765,347195.02963,28928.46455,1968.440212,1.703175
district_3,53144.920288,85450.492971,18.841512,0.73064,261041.727106,22902.147436,1959.855311,1.500916
district_4,28125.165321,39613.942202,17.693504,0.816335,191448.015873,16155.658009,1956.578644,1.48557
district_5,43197.040246,60013.120945,15.321263,0.757863,279481.883882,13725.758232,1956.907279,1.650347
district_6,24141.903399,31344.985142,17.81611,0.730963,159027.268519,10129.287037,1955.130658,1.43107
district_7,50024.965583,59657.862682,50.801379,0.339918,125184.599506,11433.442522,1950.647713,1.464524
district_8,35907.168827,46802.397351,27.112107,0.650421,158730.732,13534.752,1956.425333,1.457333
district_9,29166.177968,36298.86279,38.144304,0.317148,100983.590308,12213.361233,1951.126285,1.278267
district_10,59479.689474,83812.261677,52.277363,0.580605,190825.944681,7570.417021,1936.978723,1.878723


## Exploring solution

In [25]:
print(f"Intercept: {model_03.intercept_}")
feature_coef_df = pd.DataFrame(data={"feature": X_03_columns[53:], "coef": model_03.coef_[53:]})
feature_coef_df["coef"] = feature_coef_df["coef"].apply(lambda x: round(x, 2))

Intercept: -267481.0350929999


Most and least valuable POI based on count in vicinity

In [26]:
feature_coef_count = feature_coef_df[feature_coef_df["feature"].str.contains("count")]

neg_sorted = feature_coef_count.sort_values(by=["coef"], ascending=False).head(10)
pos_sorted = feature_coef_count.sort_values(by=["coef"]).head(10)

df1_styler = pos_sorted.style.set_table_attributes("style='display:inline'").set_caption('Most valuable POI')
df2_styler = neg_sorted.style.set_table_attributes("style='display:inline'").set_caption('Least valuable POI')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,feature,coef
43,libraries_count,-5294.86
37,post_offices_count,-4803.51
7,wic_vendors_count,-3792.97
27,public_buildings_count,-1951.52
57,polling_places_count,-1797.89
47,supermarkets_count,-1668.48
55,child_care_centers_count,-1369.26
61,schools_count,-1200.11
67,restaurants_count,-532.68
15,park_and_rides_count,-517.79

Unnamed: 0,feature,coef
1,universities_count,5472.33
29,farmers_markets_count,4384.13
5,health_centers_count,4196.58
3,senior_centers_count,3639.73
39,banks_count,2318.15
9,barbers_count,1697.32
17,bars_count,1603.72
49,laundromats_count,1226.41
25,nursing_homes_count,948.01
13,coffee_shops_count,821.69


Most and least valuable POI based on distance to home

In [27]:
feature_coef_dist = feature_coef_df[feature_coef_df["feature"].str.contains("dist")]

neg_sorted = feature_coef_dist.sort_values(by=["coef"], ascending=False).head(10)
pos_sorted = feature_coef_dist.sort_values(by=["coef"]).head(10)

df1_styler = pos_sorted.style.set_table_attributes("style='display:inline'").set_caption('Most valuable POI')
df2_styler = neg_sorted.style.set_table_attributes("style='display:inline'").set_caption('Least valuable POI')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,feature,coef
38,banks_dist,-7.25
16,bars_dist,-4.46
44,doctors_offices_dist,-4.46
60,schools_dist,-3.41
36,post_offices_dist,-3.38
62,apartment_buildings_dist,-3.12
42,libraries_dist,-2.72
46,supermarkets_dist,-2.48
18,bike_share_stations_dist,-2.03
8,barbers_dist,-1.96

Unnamed: 0,feature,coef
20,affordable_housing_dist,6.26
28,farmers_markets_dist,6.09
56,polling_places_dist,5.51
2,senior_centers_dist,4.7
22,pharmacies_dist,4.7
6,wic_vendors_dist,3.99
66,restaurants_dist,2.96
58,hair_salons_dist,2.85
52,parks_and_facilities_dist,2.82
0,universities_dist,2.73


Explore feature importance of best model

In [28]:
feature_importance_df = pd.DataFrame(data={"feature": X_04_columns[53:],
                                           "importance": model_04.get_feature_importance()[53:]})
feature_importance_df.sort_values(by=["importance"], ascending=False).head(10)

Unnamed: 0,feature,importance
25,nursing_homes_count,2.204712
2,senior_centers_dist,2.094696
39,banks_count,1.663273
4,health_centers_dist,1.420168
21,affordable_housing_count,1.242234
18,bike_share_stations_dist,1.11066
0,universities_dist,1.06369
17,bars_count,1.037626
69,community_nonprofit_orgs_count,1.03104
20,affordable_housing_dist,0.971309
