# Machine learning with text based housing data

Experimenting with text based housing data.

### Import packages

In [1]:
import json
import math
import os
import random
import warnings
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor, Pool
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn import linear_model

from utils import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import joblib
from tqdm.notebook import tqdm
from IPython.display import display_html

Define constants.

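- ``PATH``: Path to the base data folder
- ``MAX_DIST``: Maximum distance for article weights
- ``K_FOLDS``: Number of folds to perform for cross validation
- ``WEIGHTING``: If ``False``, the ``_NOWEIGHT`` variant of the text feature file is loaded
- ``MEAN``: If ``False`` (and ``WEIGHTING`` is ``True``), the ``_NOMEAN`` variant of the text feature file is loaded
- ``DUMMIES``: Columns passed to ``make_train_test`` as ``dummies``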

In [2]:
# Base data folder. Defaults to the original local path; set the WIKI_REAL_ESTATE_PATH
# environment variable to run the notebook on another machine.
# os.path.join(..., "") guarantees a trailing separator, because the rest of the notebook
# builds file names by plain string concatenation (PATH + "...").
PATH = os.path.join(
    os.environ.get("WIKI_REAL_ESTATE_PATH", "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"),
    "",
)
MAX_DIST = 6500  # maximum distance for article weights; also part of the feature file names
K_FOLDS = 5  # number of folds passed to cross_validation()
WEIGHTING = True  # False -> the "_NOWEIGHT" text feature file is loaded
MEAN = True  # False (with WEIGHTING on) -> the "_NOMEAN" text feature file is loaded
DUMMIES = ["MUNICODE"]  # columns handed to make_train_test(dummies=...), e.g. ["MUNICODE"]

Read structured data with added text features.

In [3]:
# Choose the text-feature file variant: the default needs both flags on, a disabled
# WEIGHTING always wins, and NOMEAN is only used while WEIGHTING is still on.
if WEIGHTING and MEAN:
    text_features_file = f"structured_wiki_text_features_{MAX_DIST}.csv"
elif WEIGHTING:
    text_features_file = f"structured_wiki_text_features_{MAX_DIST}_NOMEAN.csv"
else:
    text_features_file = f"structured_wiki_text_features_{MAX_DIST}_NOWEIGHT.csv"

structured_wiki_text = pd.read_csv(PATH + text_features_file)

print(structured_wiki_text.shape)
structured_wiki_text.head(10)

(9554, 3174)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,yielding student,york,york city,youghiogheny,youghiogheny river,young,youth,zip,zip code,article_count
0,362058,15212,126,47,12603,5329,10-27-2017,113000.0,69200,1.0,...,0.00084,0.38598,0.114879,0.008347,0.007765,0.203187,0.077129,0.107527,0.089066,775
1,544290,15017,946,36,94601,10800,09-27-2016,320000.0,269900,2.0,...,0.0,0.104479,0.036831,0.0,0.0,0.0,0.38737,0.269866,0.253284,31
2,314083,15090,935,3,93503,114476,03-25-2017,545000.0,450000,1.0,...,0.42139,0.0,0.0,0.0,0.0,0.0,0.031478,0.169031,0.17708,20
3,314280,15241,950,42,95003,43197,06-09-2016,315000.0,274000,2.0,...,0.241856,0.088242,0.04439,0.0,0.0,0.02836,0.042316,0.039828,0.041048,53
4,314812,15239,880,31,88006,12390,01-31-2017,174900.0,154100,2.0,...,0.739299,0.032164,0.0,0.024987,0.0,0.02241,0.039523,0.0,0.0,13
5,315579,15143,921,32,92102,10081,03-02-2015,300000.0,244600,2.0,...,0.286786,0.228377,7.7e-05,0.059439,0.061049,0.174641,0.127391,0.070458,0.073813,57
6,315587,15235,934,30,93401,10257,05-15-2017,172500.0,144700,2.0,...,0.380227,0.235346,0.109887,0.048135,0.0,0.140575,0.214427,0.057435,0.03056,49
7,362804,15102,876,5,87603,10920,07-11-2016,250000.0,217800,2.0,...,0.193749,0.057654,0.007831,0.0,0.0,0.036618,0.007465,0.037384,0.039164,68
8,315758,15108,939,24,93903,54189,09-28-2018,199900.0,174700,1.0,...,0.141333,0.086281,0.0,0.024239,0.0,0.042487,0.09675,0.025549,0.026766,56
9,315868,15133,837,35,83702,6569,05-12-2017,143000.0,117900,2.0,...,0.150948,0.153154,0.051771,1.592789,1.425353,0.070506,0.118956,0.160361,0.136056,54


Read structured data with added Wikipedia category features.

In [4]:
# Structured house data extended with the Wikipedia category features.
category_features_file = PATH + "structured_wiki_category_features.csv"
structured_wiki_categories = pd.read_csv(category_features_file)

print(structured_wiki_categories.shape)
structured_wiki_categories.head(10)

(9554, 115)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,high school_dist,high school_count,public_dist,public_count,defunct_dist,defunct_count,golf_dist,golf_count,transportation_dist,transportation_count
0,536102,15219,103,47,10301,2875,01-31-2018,287000.0,159000,2.0,...,646.211908,4,993.947917,10,517.69985,14,3654.869381,0,910.904485,5
1,197251,15241,950,42,95002,17193,09-02-2017,299900.0,259500,2.0,...,1356.822566,1,1356.822566,1,9127.889947,0,6210.239877,0,5248.021671,0
2,25219,15146,879,18,87910,22264,03-22-2017,190000.0,214900,2.0,...,3340.901141,0,3340.901141,0,1133.283586,1,11845.52648,0,6003.343808,0
3,197755,15236,873,44,87302,7800,05-09-2017,225000.0,155000,2.0,...,1918.186873,2,1918.186873,1,1831.468503,2,7576.960841,0,3950.374941,0
4,198593,15015,809,27,80902,43734,06-28-2019,249000.0,208800,2.0,...,4211.043296,0,1455.785244,1,13266.442574,0,21196.01374,0,9357.397493,0
5,532924,15212,127,47,12703,5000,05-25-2016,113000.0,84300,2.0,...,270.510903,1,270.510903,4,270.510903,7,8006.652249,0,2587.035978,0
6,198384,15136,919,24,91903,9664,10-30-2015,189900.0,169200,1.0,...,599.370206,1,599.370206,1,3972.673023,0,9015.869977,0,4286.475014,0
7,198694,15237,927,27,92703,12162,11-06-2019,310000.0,257800,2.0,...,2039.2867,1,2039.2867,2,4648.043348,0,11673.441446,0,8344.857679,0
8,198744,15227,874,4,87402,6695,03-13-2018,165000.0,144200,1.5,...,800.919649,2,800.919649,2,2921.479813,0,3399.838705,0,4095.874034,0
9,199198,15139,845,33,84501,12050,08-24-2018,380000.0,320000,2.0,...,935.704859,1,935.704859,1,9131.859699,0,452.914362,15,2780.160495,0


Create results df

In [5]:
# Empty frame that collects one column of metrics per model; the cross_validation()
# cells below each add their column to it.
results_df = pd.DataFrame()

## Feature selection

### Text features

Create train/test split

In [6]:
# Build the splits from the text-feature data; the dummy-coded columns come from DUMMIES.
X_columns_text, data_sets, error_df = make_train_test(structured_wiki_text, dummies=DUMMIES)
# data_sets bundles the full data, the train/test split and the train/validation split.
(X, y,
 X_train, X_test, y_train, y_test,
 X_train_train, X_train_val, y_train_train, y_train_val) = data_sets


(7165, 3325): (5373, 3326) + (1792, 3326)
(7165,): (5373,) + (1792,)
(2389, 3325)
(2389,)


Use [Recursive Feature Elimination](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html) to eliminate features and find out the optimal number of features.

In [7]:
# RFECV fit on the text features, kept for reference only (commented out): the next cell
# loads a previously fitted selector from disk instead of re-running the fit.
# estimator = CatBoostRegressor(iterations=500, verbose=False)
# selector_text = RFECV(estimator, step=20, cv=5, verbose=1)
# selector_text = selector_text.fit(X_train, y_train)

In [8]:
# The fitted selector is restored from disk; the dump call is kept (disabled) to show
# how the file was produced.
# _ = joblib.dump(selector_text, 'features/rfecv_text_catboost.gz')  # save selector
selector_text = joblib.load('features/rfecv_text_catboost.gz')   # load selector

# Boolean mask over the fitted columns: True for every feature the selector kept.
feature_mask_text = selector_text.support_
print(f"optimal number of features: {selector_text.n_features_}")

optimal number of features: 1365


Create list of features to drop.

In [9]:
# Columns rejected by RFECV. The mask is applied by position, so this assumes X_train has
# the same columns, in the same order, as the data the selector was fitted on.
rejected_text = np.array(X_train.columns)[~feature_mask_text]
# Never drop the dummy-coded spatial membership columns. Every entry of DUMMIES is checked
# (not only DUMMIES[0]), which also keeps the cell working when DUMMIES is empty.
to_drop_text = [col for col in rejected_text if not any(dummy in col for dummy in DUMMIES)]

len(to_drop_text)

1926

### WikiGIS features

Create train/test split

In [None]:
# Build the splits from the category (WikiGIS) data. The column list is stored as
# X_columns_cat so it no longer overwrites the text column list X_columns_text.
X_columns_cat, data_sets, error_df = make_train_test(structured_wiki_categories, dummies=DUMMIES)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

Use [Recursive Feature Elimination](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html) to eliminate features and find out the optimal number of features.

In [None]:
# RFECV fit on the WikiGIS features, kept for reference only (commented out): the next
# cell loads a previously fitted selector from disk instead of re-running the fit.
# estimator = CatBoostRegressor(iterations=500, verbose=False)
# selector_wgis = RFECV(estimator, step=5, cv=5, verbose=1)
# selector_wgis = selector_wgis.fit(X_train, y_train)

In [None]:
# The fitted selector is restored from disk; the dump call is kept (disabled) to show
# how the file was produced.
# _ = joblib.dump(selector_wgis, 'rfecv_text_wgis.gz')  # save selector
selector_wgis = joblib.load('rfecv_text_wgis.gz')   # load selector

# Boolean mask over the fitted columns: True for every feature the selector kept.
feature_mask_wgis = selector_wgis.support_
print(f"optimal number of features: {selector_wgis.n_features_}")

Create list of features to drop.

In [None]:
# Columns rejected by RFECV. The mask is applied by position, so this assumes X_train has
# the same columns, in the same order, as the data the selector was fitted on.
rejected_wgis = np.array(X_train.columns)[~feature_mask_wgis]
# Never drop the dummy-coded spatial membership columns. Every entry of DUMMIES is checked
# (not only DUMMIES[0]), which also keeps the cell working when DUMMIES is empty.
to_drop_wgis = [col for col in rejected_wgis if not any(dummy in col for dummy in DUMMIES)]

len(to_drop_wgis)

## Combining structured and text features

In [19]:
# Rebuild the splits from the text-feature data for the modelling cells that follow.
X_columns_text, data_sets, error_df = make_train_test(structured_wiki_text, dummies=DUMMIES)
# X_columns_text, data_sets, error_df = make_train_test(structured_wiki_text, dummies=[])
(X, y,
 X_train, X_test, y_train, y_test,
 X_train_train, X_train_val, y_train_train, y_train_val) = data_sets


(7165, 3325): (5373, 3326) + (1792, 3326)
(7165,): (5373,) + (1792,)
(2389, 3325)
(2389,)


### Linear regression

In [None]:
# Ridge is the active linear model; the two disabled lines are the alternatives.
# model_03 = linear_model.LinearRegression()
# model_03 = linear_model.Lasso()
# fit() returns the estimator, so the trailing expression still shows it as cell output.
model_03 = linear_model.Ridge().fit(X_train, y_train)
model_03

In [None]:
# Predict on the held-out test split and compute the error metrics with get_metrics.
y_pred_03 = model_03.predict(X_test)
metrics_03 = get_metrics(y_test, y_pred_03)

Cross validation

In [17]:
# K-fold cross validation on the full data; the metrics become the "Linear: S+T" column.
cv_metrics_03, X_03_columns = cross_validation(model_03, X, y, K_FOLDS)
results_df["Linear: S+T"] = cv_metrics_03

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_123, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  32572
RMSE: 47261
MAPE: 20.18%
R^2:  0.88


### Catboost

In [None]:
# Default CatBoost regressor trained on the training split. fit() returns the model,
# so the trailing expression still shows it as the cell output.
model_04 = CatBoostRegressor().fit(X=X_train, y=y_train, verbose=False)
model_04

In [None]:
# Predict on the held-out test split and compute the error metrics with get_metrics.
y_pred_04 = model_04.predict(X_test)
metrics_04 = get_metrics(y_test, y_pred_04)

Cross Validation

In [22]:
# K-fold cross validation on the full data; the metrics become the "Catboost: S+T" column.
cv_metrics_04, X_04_columns = cross_validation(model_04, X, y, K_FOLDS)
results_df["Catboost: S+T"] = cv_metrics_04

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_123, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  28578
RMSE: 43092
MAPE: 16.61%
R^2:  0.9


## Combining structured and category features

Create the train/test split from the structured data with added Wikipedia category features.

In [21]:
# Build the splits from the category-feature data for the modelling cells that follow.
X_columns_cat, data_sets, error_df = make_train_test(structured_wiki_categories, dummies=DUMMIES)
(X, y,
 X_train, X_test, y_train, y_test,
 X_train_train, X_train_val, y_train_train, y_train_val) = data_sets

removed column MUNICODE_123, first occurence in test
removed column MUNICODE_861, first occurence in test
removed column MUNICODE_947, first occurence in test

(7165, 263): (5373, 264) + (1792, 264)
(7165,): (5373,) + (1792,)
(2389, 263)
(2389,)


### Linear regression

In [22]:
# Ridge is the active linear model; the two disabled lines are the alternatives.
# model_05 = linear_model.LinearRegression()
# model_05 = linear_model.Lasso()
# fit() returns the estimator, so the trailing expression still shows it as cell output.
model_05 = linear_model.Ridge().fit(X_train, y_train)
model_05

Ridge()

In [23]:
# Predict on the held-out test split and compute the error metrics with get_metrics
# (the recorded output of this cell shows MAE, RMSE, MAPE and R^2 being printed).
y_pred_05 = model_05.predict(X_test)
metrics_05 = get_metrics(y_test, y_pred_05)

MAE:  32665
RMSE: 46667
MAPE: 19.46%
R^2:  0.881


Cross validation

In [24]:
# K-fold cross validation on the full data; the metrics become the "Linear: S+C" column.
cv_metrics_05, X_05_columns = cross_validation(model_05, X, y, K_FOLDS)
results_df["Linear: S+C"] = cv_metrics_05

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_123, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  32725
RMSE: 47084
MAPE: 20.21%
R^2:  0.881


### Catboost

In [25]:
# Default CatBoost regressor trained on the training split. fit() returns the model,
# so the trailing expression still shows it as the cell output.
model_06 = CatBoostRegressor().fit(X=X_train, y=y_train, verbose=False)
model_06

<catboost.core.CatBoostRegressor at 0x18d2b5eafc8>

In [26]:
# Predict on the held-out test split and compute the error metrics with get_metrics
# (the recorded output of this cell shows MAE, RMSE, MAPE and R^2 being printed).
y_pred_06 = model_06.predict(X_test)
metrics_06 = get_metrics(y_test, y_pred_06)

MAE:  28601
RMSE: 42832
MAPE: 16.05%
R^2:  0.9


Cross validation

In [27]:
# K-fold cross validation on the full data; the metrics become the "Catboost: S+C" column.
# This is the only cross_validation() call that also drops the RFECV-rejected features.
cv_metrics_06, X_06_columns = cross_validation(model_06, X, y, K_FOLDS, additional_drops=to_drop_wgis)
results_df["Catboost: S+C"] = cv_metrics_06

  0%|          | 0/5 [00:00<?, ?it/s]

removed column MUNICODE_123, first occurence in test
removed column MUNICODE_947, first occurence in test
removed column MUNICODE_849, first occurence in test
removed column MUNICODE_862, first occurence in test
removed column MUNICODE_406, first occurence in test
removed column MUNICODE_303, first occurence in test

MAE:  28829
RMSE: 43225
MAPE: 16.87%
R^2:  0.9


## Results

In [29]:
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
# Keep only the structured+text (S+T) models, in this order. This also drops the "S+C"
# columns collected above; they are not part of the saved results.
results_df = results_df[["Linear: S+T", "Catboost: S+T"]]

# Mirror the variant selection used when loading the text features, so that a NOMEAN run
# no longer overwrites the results file of the default run.
if not WEIGHTING:
    results_file = f"results/structured_wiki_{MAX_DIST}_results_NOWEIGHT.csv"
elif not MEAN:
    results_file = f"results/structured_wiki_{MAX_DIST}_results_NOMEAN.csv"
else:
    results_file = f"results/structured_wiki_{MAX_DIST}_results.csv"
# NOTE(review): index=False leaves the metric names (MAE, RMSE, ...) out of the CSV, so rows
# are identified by position only. Kept unchanged so existing readers of the file still work.
results_df.to_csv(PATH + results_file, index=False)
print(f"Results for a max distance of {MAX_DIST}m.")
results_df.head()

Results for a max distance of 6500m.


Unnamed: 0,Linear: S+T,Catboost: S+T
MAE,32572.0,28552.0
RMSE,47261.0,42933.0
MAPE,20.18,16.57
R^2,0.88,0.901


## Spatial out-of-sample validation

### Text features

#### CatBoost

Without RFECV

In [10]:
# Spatial out-of-sample validation with a default CatBoost model on all text features.
estimator = CatBoostRegressor()
soos_text_result = soos_validation(estimator, structured_wiki_text)
error_df_soos, col_names_text, avg_fis_text, metrics = soos_text_result
# metrics holds the four metric collections that the later district table uses.
maes, rmses, mapes, r_squareds = metrics

Predicting district 1/13
Predicting district 2/13
Predicting district 3/13
Predicting district 4/13
Predicting district 5/13
Predicting district 6/13
Predicting district 7/13
Predicting district 8/13
Predicting district 9/13
Predicting district 10/13
Predicting district 11/13
Predicting district 12/13
Predicting district 13/13

Weighted metrics:
MAE:  40273
RMSE: 57549
MAPE: 23.2%
R^2:  0.679


With RFECV

In [32]:
# Same validation as before, but without the features rejected by RFECV.
# The results overwrite error_df_soos, col_names_text, avg_fis_text and the metric lists.
estimator = CatBoostRegressor()
soos_text_result = soos_validation(estimator, structured_wiki_text, additional_drops=to_drop_text)
error_df_soos, col_names_text, avg_fis_text, metrics = soos_text_result
maes, rmses, mapes, r_squareds = metrics

Predicting district 1/13
Predicting district 2/13
Predicting district 3/13
Predicting district 4/13
Predicting district 5/13
Predicting district 6/13
Predicting district 7/13
Predicting district 8/13
Predicting district 9/13
Predicting district 10/13
Predicting district 11/13
Predicting district 12/13
Predicting district 13/13

Weighted metrics:
MAE:  41293
RMSE: 58884
MAPE: 23.54%
R^2:  0.673


In [None]:
# One error file per feature variant: the default needs both flags on, a disabled
# WEIGHTING always wins, and the nomean file is only used while WEIGHTING is still on.
if WEIGHTING and MEAN:
    errors_file = "results/errors_soos_wiki.csv"
elif WEIGHTING:
    errors_file = "results/errors_soos_wiki_nomean.csv"
else:
    errors_file = "results/errors_soos_wiki_noweight.csv"
error_df_soos.to_csv(PATH+errors_file)
error_df_soos

Display metrics for each district along with aggregated information about houses in that district

In [None]:
# One label per evaluated district, derived from the metric list instead of a hard-coded 13
# so the table follows the number of districts that were actually predicted.
districts = ["district_"+str(i) for i in range(1, len(maes) + 1)]
metrics_df = pd.DataFrame(data={"district":districts, "mae":maes, "rmse":rmses, "mapes":mapes, "R^2":r_squareds})
metrics_df = metrics_df.set_index("district")
metrics_df.to_csv(PATH+"results/errors_soos_district_wiki.csv")

# add more information about each district to characterize
# The four columns are selected before averaging: .mean() over the whole frame would also
# try to average text columns such as SALEDATE (a TypeError in pandas >= 2.0) and would
# average thousands of feature columns that are discarded right away.
df_agg = structured_wiki_text.groupby(by="DISTRICT")[["SALEPRICE", "LOTAREA", "YEARBLT", "STORIES"]].mean()
# NOTE(review): concat aligns on the index, so the rows only line up if the DISTRICT values
# equal the "district_<i>" labels used above - verify.
metrics_df_agg = pd.concat([metrics_df, df_agg], axis=1)

metrics_df_agg

#### Linear

In [None]:
# Spatial out-of-sample validation with a Lasso model on standardized text features.
estimator = linear_model.Lasso(alpha=0.2, random_state=42)
soos_linear_result = soos_validation(estimator, structured_wiki_text, standardize=True)
error_df_soos_linear, col_names, avg_coefs, metrics = soos_linear_result
maes, rmses, mapes, r_squareds = metrics

In [None]:
# Coefficients currently stored on `estimator` - presumably from the last fit performed
# inside soos_validation; verify against that helper.
coefs = estimator.coef_
# Lasso shrinks coefficients to exactly zero; keep the ones that survived.
# (The original mask `coefs == 0` selected the zero coefficients instead.)
non_zero_coefs = coefs[coefs != 0]
print(f"train coefs: {len(coefs)}")
print(f"non-zero coefs: {len(non_zero_coefs)}")

In [None]:
# (word, coefficient) pairs for the positive coefficients, scaled by 1/100000 and rounded
# to two decimals; the first 20 pairs are skipped.
combined = [(col, round(coef/100000, 2)) for col, coef in zip(col_names, coefs) if coef > 0][20:]
# The frame is built straight from the pairs: going through np.array turned the
# coefficients into strings and failed with an IndexError when no pairs were left.
combined_df = pd.DataFrame(combined, columns=["word", "coef"])
combined_df

### WikiGIS features

In [None]:
# Spatial out-of-sample validation with a default CatBoost model on the WikiGIS features,
# without the features rejected by RFECV.
estimator = CatBoostRegressor()
soos_wikigis_result = soos_validation(estimator, structured_wiki_categories, additional_drops=to_drop_wgis)
error_df_soos, col_names_wikigis, avg_fis_wikigis, metrics = soos_wikigis_result
maes, rmses, mapes, r_squareds = metrics

In [None]:
# The file name depends on WEIGHTING only.
if not WEIGHTING:
    wikigis_errors_file = "results/errors_soos_wikigis_noweight.csv"
else:
    wikigis_errors_file = "results/errors_soos_wikigis.csv"
error_df_soos.to_csv(PATH+wikigis_errors_file)
error_df_soos

In [None]:
# One label per evaluated district, derived from the metric list instead of a hard-coded 13
# so the table follows the number of districts that were actually predicted.
districts = ["district_"+str(i) for i in range(1, len(maes) + 1)]
metrics_df = pd.DataFrame(data={"district":districts, "mae":maes, "rmse":rmses, "mapes":mapes, "R^2":r_squareds})
metrics_df = metrics_df.set_index("district")
metrics_df.to_csv(PATH+"results/errors_soos_district_wikigis.csv")

# add more information about each district to characterize
# The four columns are selected before averaging: .mean() over the whole frame would also
# try to average text columns such as SALEDATE (a TypeError in pandas >= 2.0) and would
# average thousands of feature columns that are discarded right away.
df_agg = structured_wiki_text.groupby(by="DISTRICT")[["SALEPRICE", "LOTAREA", "YEARBLT", "STORIES"]].mean()
# NOTE(review): concat aligns on the index, so the rows only line up if the DISTRICT values
# equal the "district_<i>" labels used above - verify.
metrics_df_agg = pd.concat([metrics_df, df_agg], axis=1)

metrics_df_agg

## Exploring solution concerning feature importance

### WikiGIS Category features

In [None]:
# Skip the first 54 entries - presumably the structured (non-Wikipedia) columns; TODO confirm.
wikigis_part = slice(54, None)
cols_wikigis_sliced = col_names_wikigis[wikigis_part]
fis_wikigis_sliced = avg_fis_wikigis[wikigis_part]

# Show the 15 features with the highest importance.
wikigis_fi_df = pd.DataFrame(data={"feature": cols_wikigis_sliced, "importance": fis_wikigis_sliced})
wikigis_fi_df.sort_values(by=["importance"], ascending=False).head(15)

### Text features

In [None]:
# Skip the first 54 entries and the last one - presumably the structured columns and
# the trailing article_count column; TODO confirm.
text_part = slice(54, -1)
cols_text_sliced = col_names_text[text_part]
fis_text_sliced = avg_fis_text[text_part]

# Show the 15 words with the highest importance.
word_fi_df = pd.DataFrame(data={"word": cols_text_sliced, "importance": fis_text_sliced})
word_fi_df.sort_values(by=["importance"], ascending=False).head(15)

The next cell prints the feature importance for a specific word.

In [None]:
word_to_test = "hospital"

# Map every remaining word to its importance; the article scoring below reuses this lookup.
word_fi_lookup = dict(zip(cols_text_sliced, fis_text_sliced))
fi = word_fi_lookup[word_to_test]
print(f"Coefficient for \"{word_to_test}\" is {round(float(fi), 2)}")

### Determine most impactful articles

Create an importance score for every article: the sum of its word counts, each multiplied by the importance of that word.

In [None]:
article_word_counts = pd.read_csv(PATH+"wikipedia/wikipedia_article_wordcounts.csv")

# filter cols to account for RFECV: only the words in cols_text_sliced are scored
words = list(cols_text_sliced)
word_importances = np.array([float(word_fi_lookup[word]) for word in words], dtype=float)

# Importance score per article = sum over words of (word count * word importance).
# One matrix-vector product replaces the Python loop over every row and word, and the
# score column is created as float instead of being seeded with integer zeros.
article_scores = article_word_counts[words].to_numpy(dtype=float) @ word_importances

article_fi = article_word_counts[["article_title"] + words].copy()
article_fi.insert(1, "article_importance", article_scores)

article_fi.iloc[:, :2].sort_values(by=["article_importance"], ascending=False).head(20)

Add coordinates back

In [None]:
# Load the selected Wikipedia articles. json.load parses the file as one JSON document.
# NOTE(review): the ".ndjson" extension suggests newline-delimited JSON, which json.load
# cannot parse - confirm the file really holds a single JSON array.
with open(PATH+"wikipedia/wikipedia_selected.ndjson") as fin:
    data_loaded = json.load(fin)

In [None]:
# The second field of every loaded entry holds the coordinate pair. A plain comprehension
# avoids building a NumPy array from nested records of unequal length, which recent NumPy
# versions reject.
coords = [entry[1] for entry in data_loaded]  # filter coord column
# NOTE(review): assumes data_loaded lists the articles in the same order as article_fi - verify.
# assign() overwrites existing columns, so re-running the cell no longer fails the way
# insert() did once the columns were already present.
article_fi = article_fi.assign(
    article_long=[long for lat, long in coords],
    article_lat=[lat for lat, long in coords],
)
article_fi = article_fi[["article_title", "article_importance", "article_long", "article_lat"]]  # remove word counts

Save articles with their value score and coordinates.

In [None]:
# Same variant selection as for the text feature files, so that a NOMEAN run gets its own
# output file instead of overwriting the default one.
if not WEIGHTING:
    importances_file = "wikipedia_article_importances_noweight.csv"
elif not MEAN:
    importances_file = "wikipedia_article_importances_nomean.csv"
else:
    importances_file = "wikipedia_article_importances.csv"
article_fi.to_csv(PATH+importances_file, index=False)
article_fi.head()