# Machine learning with basic housing data

Experimenting with different algorithms on a non-geographical and basic geographical feature set.

### Import packages

In [1]:
import json
import math
import warnings
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display_html

Definde constants.

- ``PATH``: Path to the base data folder
- ``K_FOLDS``: Number of folds to perform for cross validation

In [2]:
PATH = "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
K_FOLDS = 5

Load structured data and print columns.

In [3]:
structured = pd.read_csv(PATH+"structured_preprocessed.csv")
print(structured.shape)
print(structured.columns)
structured.head(10)

(9556, 65)
Index(['_id', 'PROPERTYZIP', 'MUNICODE', 'SCHOOLCODE', 'NEIGHCODE', 'LOTAREA',
       'SALEDATE', 'SALEPRICE', 'FAIRMARKETTOTAL', 'STORIES', 'YEARBLT',
       'GRADE', 'CONDITION', 'CDU', 'TOTALROOMS', 'BEDROOMS', 'FULLBATHS',
       'HALFBATHS', 'FIREPLACES', 'BSMTGARAGE', 'FINISHEDLIVINGAREA',
       'latitude', 'longitude', 'DISTRICT', 'SALEYEAR', 'SALEYEAR_STR_2015',
       'SALEYEAR_STR_2016', 'SALEYEAR_STR_2017', 'SALEYEAR_STR_2018',
       'SALEYEAR_STR_2019', 'SALEYEAR_STR_2020',
       'HEATINGCOOLINGDESC_CentralHeat',
       'HEATINGCOOLINGDESC_CentralHeatwithAC', 'OWNERDESC_REGULAR',
       'OWNERDESC_REGULAR-ETAL', 'OWNERDESC_REGULAR-ETUXORETVIR',
       'STYLEDESC_BI-LEVEL', 'STYLEDESC_BUNGALOW', 'STYLEDESC_CAPECOD',
       'STYLEDESC_COLONIAL', 'STYLEDESC_CONTEMPORARY', 'STYLEDESC_OLDSTYLE',
       'STYLEDESC_RANCH', 'STYLEDESC_SEMIDETACHED', 'STYLEDESC_SPLITLEVEL',
       'STYLEDESC_TUDOR', 'STYLEDESC_VICTORIAN', 'EXTFINISH_DESC_Brick',
       'EXTFINISH_DESC_

Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,ROOFDESC_SLATE,ROOFDESC_TILE,BASEMENTDESC_Crawl,BASEMENTDESC_Full,BASEMENTDESC_None,BASEMENTDESC_Part,ROOFDESC_ROLL.1,ROOFDESC_SHINGLE.1,ROOFDESC_SLATE.1,ROOFDESC_TILE.1
0,161705,15122,870,45,87005,10899,05-01-2018,145000.0,76700,1.0,...,0,0,0,1,0,0,0,1,0,0
1,530852,15146,879,18,87905,10691,05-13-2019,139997.0,106200,1.0,...,0,0,0,1,0,0,0,1,0,0
2,144978,15202,826,2,82601,11813,05-26-2017,170000.0,135300,1.0,...,0,0,0,1,0,0,0,1,0,0
3,436602,15202,803,29,80302,5324,06-06-2017,145000.0,117300,2.0,...,0,0,0,1,0,0,0,1,0,0
4,145066,15218,114,47,11403,3600,04-09-2016,325000.0,250000,2.0,...,0,0,0,1,0,0,0,1,0,0
5,145137,15228,926,26,92607,6406,04-30-2015,172900.0,137300,2.0,...,0,0,0,1,0,0,0,1,0,0
6,145246,15241,950,42,95001,38376,12-17-2015,817000.0,751600,2.0,...,0,0,0,1,0,0,0,1,0,0
7,529513,15132,409,23,40005,3844,01-09-2020,39000.0,45100,1.0,...,0,0,0,1,0,0,0,1,0,0
8,146103,15212,127,47,12703,5284,06-30-2016,65000.0,52800,1.5,...,0,0,0,1,0,0,0,1,0,0
9,146155,15212,127,47,12701,5544,11-10-2018,162000.0,111200,1.0,...,0,0,0,1,0,0,0,1,0,0


### Defining useful functions

In [4]:
def find_coord(x, df):
    """Returns id, latitude and longitude for property with given id"""
    
    _id, lat, long = x[0], x[1], x[2]
    row = df[df["_id"] == _id].iloc[0]
    return row["_id"], row["latitude"], row["longitude"]

In [5]:
def make_train_test(df, spatial=None, verbose=True):
    """Returns train/test sets along with column names and df for saving errors"""
    
    to_drop = ["PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE", "SALEPRICE",
               "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"]
    if spatial:
        to_drop.remove(spatial)  # remove from dropped columns to retain
    
    X = df.drop(to_drop, axis=1)
    
    # save col names for later
    X_columns = list(X.columns)
    X_columns.remove("_id")  # remove id from col list, since it will be filtered out later
    X = X.to_numpy()
    
    y = df["SALEPRICE"].to_numpy()
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    
    # save ids for later
    train_ids = [x[0] for x in X_train]
    test_ids = [x[0] for x in X_test]
    X_train = X_train[:, 1:]  # remove first column (id)
    X_test = X_test[:, 1:]    # remove first column (id)
    
    X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
    
    if verbose:
        print(f"{X_train.shape}: {X_train_train.shape} + {X_train_val.shape}")
        print(f"{y_train.shape}: {y_train_train.shape} + {y_train_val.shape}")
        print(X_test.shape)
        print(y_test.shape)
    
    # create error df
    error_df = pd.DataFrame(data={"id": test_ids, "lat": [0]*len(test_ids), "long": [0]*len(test_ids)})
    error_df = error_df.apply(lambda x: find_coord(x, df), axis=1, result_type='broadcast')
    error_df.head(10)
    
    return X_columns, [X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val], error_df

In [7]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Returns MAPE"""
    
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [8]:
def get_metrics(y_true, y_pred, print_out=True):
    """Returns MAE, RMSE, MAPE and R^2"""
    
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)
    
    if print_out:
        print(f"MAE:  {round(mae)}")
        print(f"RMSE: {round(rmse)}")
        print(f"MAPE: {round(mape, 2)}%")
        print(f"R^2:  {round(r_squared, 3)}")
    else:  
        return mae, rmse, mape, r_squared

In [6]:
def cross_validation(estimator, X, y, **kwargs):
    """Returns and prints cross validated MAE, RMSE, MAPE and R^2"""
    
    maes, rmses, mapes, r_squareds = [], [], [], []
    X_cv = X[:, 1:]  # remove "_id" column

    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    for train_index, test_index in tqdm(kf.split(X_cv), total=5):
        X_train, X_test = X_cv[train_index], X_cv[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        estimator.fit(X=X_train, y=y_train, **kwargs)


        y_pred_cv = estimator.predict(X_test)
        mae, rmse, mape, r_squared = get_metrics(y_test, y_pred_cv, print_out=False)
        maes.append(mae)
        rmses.append(rmse)
        mapes.append(mape)
        r_squareds.append(r_squared)
    
    mae_cv, rmse_cv = round(np.mean(maes)), round(np.mean(rmses))
    mape_cv, r_squared_cv = round(np.mean(mapes), 2), round(np.mean(r_squareds), 3)
    
    print(f"MAE:  {mae_cv}")
    print(f"RMSE: {rmse_cv}")
    print(f"MAPE: {mape_cv}%")
    print(f"R^2:  {r_squared_cv}")
    
    return mae_cv, rmse_cv, mape_cv, r_squared_cv

Make train/test split

In [9]:
X_columns, data_sets, error_df = make_train_test(structured)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 53): (5375, 53) + (1792, 53)
(7167,): (5375,) + (1792,)
(2389, 53)
(2389,)


In [10]:
results_df = pd.DataFrame()

## Only structured data

### Linear model

In [11]:
model_01 = linear_model.LinearRegression()
# model_01 = linear_model.Lasso()
# model_01 = linear_model.Ridge()
model_01.fit(X_train, y_train)

LinearRegression()

In [12]:
# y_pred_01 = [pow(2, pred) for pred in model_01.predict(X_test)] # reverse log transformation
y_pred_01 = model_01.predict(X_test)
get_metrics(y_test, y_pred_01)

MAE:  43695
RMSE: 60649
MAPE: 27.18%
R^2:  0.794


In [13]:
error_df["linear"] = [test - pred for test, pred in zip(y_test, y_pred_01)]

In [14]:
results_df["Linear: S"] = cross_validation(model_01, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  44125
RMSE: 62302
MAPE: 28.15%
R^2:  0.792


### Random forest

In [17]:
kwargs = dict(n_jobs=-1, n_estimators=100, max_features=None, random_state=42)
model_02 = RandomForestClassifier(max_depth=6, **kwargs)
model_02.fit(X_train, y_train)

RandomForestClassifier(max_depth=6, max_features=None, n_jobs=-1,
                       random_state=42)

In [18]:
y_pred_02 = model_02.predict(X_test)
get_metrics(y_test, y_pred_02)

MAE:  50247
RMSE: 71309
MAPE: 30.8%
R^2:  0.715


Perform hyperparameter optimization with 3-Fold cross validation.

In [29]:
model_02 = RandomForestClassifier(verbose=0, n_jobs=-1)

params_rf = {"max_depth": [3, 6, 9, 12, 15],
             "min_samples_leaf": [1, 2, 4],
             "min_samples_split": [2, 5, 10],
             "random_state": [42],
             "n_estimators": [300]}

randomsearch_rf = RandomizedSearchCV(estimator=model_02, param_distributions=params_rf, n_iter=40, random_state=42,
                                  scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=1, cv=3)

In [30]:
randomsearch_rf.fit(X[:, 1:], y)
best_params_rf = randomsearch_rf.best_params_

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  9.7min finished


Create best model

In [31]:
model_02 = RandomForestClassifier(verbose=0, n_jobs=-1, **best_params_rf)

In [32]:
error_df["random_forest"] = [test - pred for test, pred in zip(y_test, y_pred_02)]
results_df["RF: S"] = cross_validation(model_02, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  48545
RMSE: 72628
MAPE: 29.5%
R^2:  0.717


### Gradient boosting

In [44]:
kwargs = dict(n_jobs=-1, n_estimators=200,
              learning_rate=0.02, gamma=0.01,
              random_state=42, verbosity = 0)
model_03 = xgb.XGBClassifier(**kwargs)
model_03.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.01, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=-1, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=0)

In [45]:
y_pred_03 = model_03.predict(X_test)
get_metrics(y_test, y_pred_03)

MAE:  50467
RMSE: 72494
MAPE: 29.33%
R^2:  0.705


Perform hyperparameter optimization with 3-Fold cross validation.

In [35]:
model_03 = xgb.XGBClassifier(verbosity=0, n_jobs=-1)

params_xgb = {"learning_rate": [0.015, 0.02, 0.025, 0.03],
              "max_depth": [3, 5, 8],
              "gamma": [0, 0.1, 0.2, 0.3],
              "max_delta_step": [0, 3, 6, 9],
              "colsample_bytree": [0.6, 0.8, 1],
              "random_state": [42],
              "n_estimators": [100]}

randomsearch_xgb = RandomizedSearchCV(estimator=model_03, param_distributions=params, n_iter=40, random_state=42,
                                      scoring="neg_root_mean_squared_error", n_jobs=-1, verbose=1, cv=3)

In [36]:
randomsearch_xgb.fit(X[:, 1:], y)
best_params_xgb = randomsearch_xgb.best_params_

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 63.0min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed: 204.4min finished


AttributeError: 'RandomizedSearchCV' object has no attribute 'best_params_'

Create best model

In [38]:
model_03 = xgb.XGBClassifier(verbosity=0, n_jobs=-1, **best_params_xgb)

In [46]:
error_df["grad_boost"] = [test - pred for test, pred in zip(y_test, y_pred_03)]
results_df["XGB: S"] = cross_validation(model_03, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  50807
RMSE: 73893
MAPE: 30.36%
R^2:  0.707


### Catboost

In [40]:
kwargs = dict(verbose=False)
model_04 = CatBoostRegressor()
model_04.fit(X=X_train, y=y_train, **kwargs)

<catboost.core.CatBoostRegressor at 0x26b5e28df88>

In [41]:
y_pred_04 = model_04.predict(X_test)
get_metrics(y_test, y_pred_04)

MAE:  40497
RMSE: 58437
MAPE: 24.16%
R^2:  0.809


In [42]:
error_df["catboost"] = [test - pred for test, pred in zip(y_test, y_pred_04)]
results_df["Catboost: S"] = cross_validation(model_04, X, y, **kwargs)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  40850
RMSE: 58818
MAPE: 25.48%
R^2:  0.814


Save error df

In [28]:
error_df.to_csv(PATH+"results/errors_basic.csv", index=None)

In [47]:
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
results_df.head()

Unnamed: 0,Linear: S,RF: S,XGB: S,Catboost: S
MAE,44125.0,48545.0,50807.0,40850.0
RMSE,62302.0,72628.0,73893.0,58818.0
MAPE,28.15,29.5,30.36,25.48
R^2,0.792,0.717,0.707,0.814


## Structured and basic spatial membership

Different types of aggregated membership are tested for their predictive performance.

In [39]:
for spatial in ["PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE"]:
    # make dataset with spatial column
    X_columns, data_sets, error_df = make_train_test(structured, spatial=spatial, verbose=False)
    X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets
    
    model_spatial = CatBoostRegressor()
    
    print(f"Spatial choice: {spatial}")
    kwargs = dict(verbose=False)
    metrics = cross_validation(model_spatial, X, y, **kwargs)
    print("")

Spatial choice: PROPERTYZIP


  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  33207
RMSE: 49561
MAPE: 19.92%
R^2:  0.868

Spatial choice: MUNICODE


  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  31517
RMSE: 46809
MAPE: 18.84%
R^2:  0.882

Spatial choice: SCHOOLCODE


  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  33350
RMSE: 50363
MAPE: 20.12%
R^2:  0.864

Spatial choice: NEIGHCODE


  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  31725
RMSE: 46977
MAPE: 19.18%
R^2:  0.881



## Explore solution

Coefficients for each feature

In [17]:
feature_importance_df = pd.DataFrame(data={"feature": X_columns,
                                           "coef": model_01.coef_})
print(f"Intercept: {model_01.intercept_}")
feature_importance_df

Intercept: 434882.92392270896


Unnamed: 0,feature,coef
0,LOTAREA,0.272668
1,STORIES,-8990.809285
2,YEARBLT,112.4636
3,GRADE,-33549.359629
4,CONDITION,-3046.535201
5,CDU,-28986.429427
6,TOTALROOMS,-1906.179316
7,BEDROOMS,3324.072563
8,FULLBATHS,23708.109631
9,HALFBATHS,13063.188534


Feature importance of best model

In [21]:
feature_importance_df = pd.DataFrame(data={"feature": X_columns,
                                           "importance": model_04.get_feature_importance()})
feature_importance_df["coefficient"] = model_01.coef_
feature_importance_df.sort_values(by=["importance"], ascending=False).head(10)

Unnamed: 0,feature,importance,coefficient
3,GRADE,30.133963,-33549.359629
12,FINISHEDLIVINGAREA,19.688553,66.768001
2,YEARBLT,7.148423,112.4636
0,LOTAREA,6.962367,0.272668
8,FULLBATHS,5.234396,23708.109631
5,CDU,4.560744,-28986.429427
6,TOTALROOMS,3.543801,-1906.179316
11,BSMTGARAGE,2.590334,-2864.409039
4,CONDITION,2.380424,-3046.535201
9,HALFBATHS,2.068662,13063.188534
