# Machine learning with text based housing data

Experimenting with text based housing data.

### Import packages

In [1]:
import json
import math
import warnings
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display_html

Definde constants.

- ``PATH``: Path to the base data folder
- ``MAX_DIST``: Maximum distance for article weights
- ``K_FOLDS``: Number of folds to perform for cross validation

In [2]:
PATH = "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
MAX_DIST = 5000
K_FOLDS = 5

Lead structured data with added text features.

In [3]:
structured_wiki_text = pd.read_csv(
    PATH + f"structured_wiki_doc2vec_features_{MAX_DIST}.csv")
print(structured_wiki_text.shape)
structured_wiki_text.head(10)

(9556, 365)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,vec_300
0,362058,15212,126,47,12603,5329,10-27-2017,113000.0,69200,1.0,...,-9.796203,-18.593355,-50.138167,31.661196,29.778861,-24.893443,-35.822241,-6.260757,13.613537,-42.302236
1,544290,15017,946,36,94601,10800,09-27-2016,320000.0,269900,2.0,...,0.512429,0.532064,-0.4132,0.162363,0.578378,0.487045,0.330141,-1.271629,-0.068568,-0.382046
2,314083,15090,935,3,93503,114476,03-25-2017,545000.0,450000,1.0,...,-0.433479,0.497991,-0.362472,0.362897,1.139053,-0.554338,-0.400123,-1.061791,0.29739,-1.05608
3,314280,15241,950,42,95003,43197,06-09-2016,315000.0,274000,2.0,...,-0.642973,0.570541,-1.14895,0.727212,0.941203,-0.000637,0.250378,-0.570598,0.512188,-0.897177
4,314812,15239,880,31,88006,12390,01-31-2017,174900.0,154100,2.0,...,0.089984,0.249701,-0.246685,-0.150849,0.08841,0.232431,-0.207343,-0.065307,-0.003261,0.030794
5,315579,15143,921,32,92102,10081,03-02-2015,300000.0,244600,2.0,...,-1.16136,0.826251,-1.453523,1.125376,2.642963,0.576404,0.395513,-1.176827,0.12564,-4.000058
6,315587,15235,934,30,93401,10257,05-15-2017,172500.0,144700,2.0,...,-0.799475,0.532983,-0.609679,-0.212419,0.005209,-0.734228,-0.109818,-0.51678,0.701884,-0.817381
7,362804,15102,876,5,87603,10920,07-11-2016,250000.0,217800,2.0,...,-2.07884,1.984523,-2.359517,2.727928,2.246496,0.485931,1.53642,-0.267323,0.816341,-0.087536
8,315758,15108,939,24,93903,54189,09-28-2018,199900.0,174700,1.0,...,-0.242182,-0.002154,-1.022322,0.732458,0.655006,-0.781192,0.347673,-1.210835,0.124744,-1.185205
9,315868,15133,837,35,83702,6569,05-12-2017,143000.0,117900,2.0,...,-1.449996,-0.999158,-0.516546,2.05797,0.63643,-1.716206,0.582574,-1.705143,1.444035,-4.217766


### Defining useful functions

In [4]:
def find_coord(x, df):
    """Returns id, latitude and longitude for property with given id"""
    
    _id, lat, long = x[0], x[1], x[2]
    row = df[df["_id"] == _id].iloc[0]
    return row["_id"], row["latitude"], row["longitude"]

In [5]:
def make_train_test(df):
    """Returns train/test sets along with column names and df for saving errors"""

    X = df.drop(["PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE", "SALEPRICE",
                 "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"], axis=1)

    # save col names for later
    X_columns = list(X.columns)
    # remove id from col list, since it will be filtered out later
    X_columns.remove("_id")
    X = X.to_numpy()

    y = df["SALEPRICE"].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

    # save ids for later
    train_ids = [x[0] for x in X_train]
    test_ids = [x[0] for x in X_test]
    X_train = X_train[:, 1:]  # remove first column (id)
    X_test = X_test[:, 1:]    # remove first column (id)

    X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42)

    print(f"{X_train.shape}: {X_train_train.shape} + {X_train_val.shape}")
    print(f"{y_train.shape}: {y_train_train.shape} + {y_train_val.shape}")
    print(X_test.shape)
    print(y_test.shape)

    # create error df
    error_df = pd.DataFrame(
        data={"id": test_ids, "lat": [0]*len(test_ids), "long": [0]*len(test_ids)})
    error_df = error_df.apply(lambda x: find_coord(
        x, df), axis=1, result_type='broadcast')
    error_df.head(10)

    return X_columns, [X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val], error_df

In [6]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Returns MAPE"""
    
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [7]:
def get_metrics(y_true, y_pred, print_out=True):
    """Returns MAE, RMSE, MAPE and R^2"""
    
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    if print_out:
        print(f"MAE:  {round(mae)}")
        print(f"RMSE: {round(rmse)}")
        print(f"MAPE: {round(mape, 2)}%")
        print(f"R^2:  {round(r_squared, 3)}")

    return mae, rmse, mape, r_squared

In [8]:
def cross_validation(estimator, X, y):
    """Returns and prints cross validated MAE, RMSE, MAPE and R^2"""
    
    maes, rmses, mapes, r_squareds = [], [], [], []
    X_cv = X[:, 1:]  # remove "_id" column

    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    for train_index, test_index in tqdm(kf.split(X_cv), total=5):
        X_train, X_test = X_cv[train_index], X_cv[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        if "linear_model" in str(type(estimator)):
            estimator.fit(X=X_train, y=y_train)
        else:
            estimator.fit(X=X_train, y=y_train, verbose=False)

        y_pred_cv = estimator.predict(X_test)
        mae, rmse, mape, r_squared = get_metrics(y_test, y_pred_cv, print_out=False)
        maes.append(mae)
        rmses.append(rmse)
        mapes.append(mape)
        r_squareds.append(r_squared)
    
    mae_cv, rmse_cv = round(np.mean(maes)), round(np.mean(rmses))
    mape_cv, r_squared_cv = round(np.mean(mapes), 2), round(np.mean(r_squareds), 3)
    
    print(f"MAE:  {mae_cv}")
    print(f"RMSE: {rmse_cv}")
    print(f"MAPE: {mape_cv}%")
    print(f"R^2:  {r_squared_cv}")
    
    return mae_cv, rmse_cv, mape_cv, r_squared_cv

Create the data sets and error dataframe

In [9]:
X_columns_text, data_sets, error_df = make_train_test(structured_wiki_text)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 354): (5375, 354) + (1792, 354)
(7167,): (5375,) + (1792,)
(2389, 354)
(2389,)


Create results df

In [10]:
results_df = pd.DataFrame()

## Only text features

Remove all structured data

In [11]:
X_train_text = X_train[:, 53:]
X_test_text = X_test[:, 53:]

### Linear regression

In [12]:
# model_01 = linear_model.LinearRegression()
# model_01 = linear_model.Lasso()
model_01 = linear_model.Ridge()
model_01.fit(X_train_text, y_train)

Ridge()

In [13]:
y_pred_01 = model_01.predict(X_test_text)
metrics_01 = get_metrics(y_test, y_pred_01)

MAE:  60763
RMSE: 89365
MAPE: 35.71%
R^2:  0.58


Cross validation

In [14]:
results_df["Linear: T"] = cross_validation(model_01, X[:, 52:], y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  59385
RMSE: 87647
MAPE: 34.69%
R^2:  0.588


### Catboost

In [15]:
model_02 = CatBoostRegressor()
model_02.fit(X=X_train_text, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x25dd967f6c8>

In [16]:
y_pred_02 = model_02.predict(X_test_text)
metrics_02 = get_metrics(y_test, y_pred_02)

MAE:  50802
RMSE: 76128
MAPE: 29.89%
R^2:  0.695


Cross validation

In [17]:
results_df["Catboost: T"] = cross_validation(model_02, X[:, 52:], y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  49880
RMSE: 76410
MAPE: 29.08%
R^2:  0.687


## Combining structured and text features

### Linear regression

In [18]:
# model_03 = linear_model.LinearRegression()
# model_03 = linear_model.Lasso()
model_03 = linear_model.Ridge()
model_03.fit(X_train, y_train)

Ridge()

In [19]:
y_pred_03 = model_03.predict(X_test)
metrics_03 = get_metrics(y_test, y_pred_03)

MAE:  33863
RMSE: 47175
MAPE: 21.5%
R^2:  0.883


Cross validation

In [20]:
results_df["Linear: S+T"] = cross_validation(model_03, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  33426
RMSE: 47819
MAPE: 20.99%
R^2:  0.877


### Catboost

In [21]:
model_04 = CatBoostRegressor()
model_04.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x25dd9670a88>

In [22]:
y_pred_04 = model_04.predict(X_test)
metrics_04 = get_metrics(y_test, y_pred_04)

MAE:  28615
RMSE: 42562
MAPE: 16.64%
R^2:  0.905


Cross Validation

In [23]:
results_df["Catboost: S+T"] = cross_validation(model_04, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  28576
RMSE: 42455
MAPE: 16.65%
R^2:  0.903


## Combining structured and category features

Load structured with added wikipedia category data

In [24]:
structured_wiki_categories = pd.read_csv(PATH+"structured_wiki_category_features.csv")
print(structured_wiki_categories.shape)
structured_wiki_categories.head(10)

(9556, 98)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,music venue_dist,music venue_count,librar_dist,librar_count,demolished_dist,demolished_count,theatre_dist,theatre_count,airport_dist,airport_count
0,287372,15202,804,2,80401,5781,05-05-2015,182000.0,148000,2.0,...,6719.221197,0,2390.879906,1,2989.609671,0,8303.907529,0,11114.088534,0
1,287857,15071,929,43,92903,12069,11-08-2017,197000.0,172200,2.0,...,14621.281406,0,10611.214325,0,8984.864561,0,16716.673657,0,5259.131053,0
2,87806,15237,940,28,94002,10117,04-07-2017,212000.0,168500,2.0,...,4595.347535,0,3100.782315,0,5883.52401,0,6814.894097,0,11283.630799,0
3,288474,15145,953,9,95303,10416,04-09-2018,161000.0,183500,2.0,...,4243.496567,0,4018.065776,0,1898.174138,1,6727.948117,0,5499.635608,0
4,288515,15044,938,3,93801,14680,07-24-2020,382500.0,188700,2.0,...,11477.68009,0,10182.0912,0,12385.510059,0,17324.893754,0,9121.425016,0
5,288682,15101,927,27,92703,35611,04-15-2020,337500.0,225400,2.0,...,11519.716678,0,2537.598142,0,11436.30052,0,13941.953218,0,14110.775287,0
6,288780,15228,926,26,92604,8775,08-16-2016,164000.0,152200,2.0,...,8583.318135,0,4693.071613,0,3899.615424,0,9200.925389,0,11183.444237,0
7,297737,15243,941,8,94102,7017,06-17-2019,192400.0,144900,1.0,...,10201.563777,0,4672.413004,0,6032.474406,0,11020.887502,0,13237.895978,0
8,289157,15139,845,33,84503,4020,02-20-2020,225000.0,183000,2.0,...,3601.594796,0,8109.406909,0,1676.325304,1,8961.649598,0,8431.248808,0
9,88679,15122,870,45,87001,46566,04-29-2016,85000.0,60300,1.0,...,7991.484449,0,4556.338961,0,3868.89796,0,5370.828077,0,681.21753,2


In [25]:
X_columns_cat, data_sets, error_df = make_train_test(structured_wiki_categories)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 87): (5375, 87) + (1792, 87)
(7167,): (5375,) + (1792,)
(2389, 87)
(2389,)


### Linear regression

In [26]:
model_05 = linear_model.LinearRegression()
# model_05 = linear_model.Lasso()
# model_05 = linear_model.Ridge()
model_05.fit(X_train, y_train)

LinearRegression()

In [27]:
y_pred_05 = model_05.predict(X_test)
metrics_05 = get_metrics(y_test, y_pred_05)

MAE:  40261
RMSE: 56533
MAPE: 26.11%
R^2:  0.823


Cross validation

In [28]:
results_df["Linear: S+C"] = cross_validation(model_05, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  40523
RMSE: 57078
MAPE: 25.88%
R^2:  0.825


### Catboost

In [29]:
model_06 = CatBoostRegressor()
model_06.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x25dd973c448>

In [30]:
y_pred_06 = model_06.predict(X_test)
metrics_06 = get_metrics(y_test, y_pred_06)

MAE:  29344
RMSE: 44386
MAPE: 17.58%
R^2:  0.891


Cross validation

In [31]:
results_df["Catboost: S+C"] = cross_validation(model_06, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  29554
RMSE: 43963
MAPE: 17.45%
R^2:  0.896


## Combining structured, text and category features

Add category features

In [32]:
merge_cols = list(structured_wiki_text.columns[:64])
structured_wiki_combined = pd.merge(
    structured_wiki_text, structured_wiki_categories, on=merge_cols)
print(structured_wiki_combined.shape)

(9556, 399)


In [33]:
X_columns_text_cat, data_sets, error_df = make_train_test(structured_wiki_combined)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 388): (5375, 388) + (1792, 388)
(7167,): (5375,) + (1792,)
(2389, 388)
(2389,)


### Linear model

In [34]:
# model_07 = linear_model.LinearRegression()
# model_07 = linear_model.Lasso()
model_07 = linear_model.Ridge()
model_07.fit(X_train, y_train)

Ridge()

In [35]:
y_pred_07 = model_07.predict(X_test)
metrics_07 = get_metrics(y_test, y_pred_07)

MAE:  33647
RMSE: 47072
MAPE: 21.37%
R^2:  0.884


Cross validation

In [36]:
results_df["Linear: S+T+C"] = cross_validation(model_07, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  33200
RMSE: 47587
MAPE: 20.86%
R^2:  0.878


### Catboost

In [37]:
model_08 = CatBoostRegressor()
model_08.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x25dd5385a48>

In [38]:
y_pred_08 = model_08.predict(X_test)
metrics_08 = get_metrics(y_test, y_pred_08)

MAE:  28515
RMSE: 42563
MAPE: 16.55%
R^2:  0.905


Cross validation

In [39]:
results_df["Catboost: S+T+C"] = cross_validation(model_08, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  28508
RMSE: 42568
MAPE: 16.57%
R^2:  0.903


## Results

In [40]:
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
# reorder columns
results_df = results_df[["Linear: T", "Linear: S+T", "Linear: S+T+C", "Catboost: T", "Catboost: S+T", "Catboost: S+T+C"]]
results_df.to_csv(
    PATH + f"results/structured_doc2vec_sum_{MAX_DIST}_results.csv", index=False)
print(f"Results for a max distance of {MAX_DIST}m.")
results_df.head()

Results for a max distance of 5000m.


Unnamed: 0,Linear: T,Linear: S+T,Linear: S+T+C,Catboost: T,Catboost: S+T,Catboost: S+T+C
MAE,59385.0,33426.0,33200.0,49880.0,28576.0,28508.0
RMSE,87647.0,47819.0,47587.0,76410.0,42455.0,42568.0
MAPE,34.69,20.99,20.86,29.08,16.65,16.57
R^2,0.588,0.877,0.878,0.687,0.903,0.903


## Spatial out-of-sample test

Calculate median latitude and longitude

In [41]:
soos_df = structured_wiki_text.copy()

coords_median = soos_df.loc[:, "latitude":"longitude"].median()
lat_median = coords_median.loc["latitude"]
long_median = coords_median.loc["longitude"]
coords_median

latitude     40.441981
longitude   -79.987716
dtype: float64

In [42]:
quadrants = []

quadrant_1 = soos_df[(soos_df["latitude"] >= lat_median) & (soos_df["longitude"] >= long_median)]
quadrants.append(quadrant_1)
print(quadrant_1.shape)

quadrant_2 = soos_df[(soos_df["latitude"] >= lat_median) & (soos_df["longitude"] < long_median)]
quadrants.append(quadrant_2)
print(quadrant_2.shape)

quadrant_3 = soos_df[(soos_df["latitude"] < lat_median) & (soos_df["longitude"] < long_median)]
quadrants.append(quadrant_3)
print(quadrant_3.shape)

quadrant_4 = soos_df[(soos_df["latitude"] < lat_median) & (soos_df["longitude"] >= long_median)]
quadrants.append(quadrant_4)
print(quadrant_4.shape, end="\n\n")

row_sum = quadrant_1.shape[0] + quadrant_2.shape[0] + quadrant_3.shape[0] + quadrant_4.shape[0]
print(f"{row_sum, quadrant_1.shape[1]}")

(2487, 365)
(2291, 365)
(2487, 365)
(2291, 365)

(9556, 365)


In [43]:
quadrants_df = pd.concat(quadrants, ignore_index=True)

error_df_soos = pd.DataFrame(
    data={"id": quadrants_df["_id"],
          "lat": quadrants_df["latitude"],
          "long": quadrants_df["longitude"],
          "prediction": 0,
          "error": 0})
error_df_soos.head(10)

Unnamed: 0,id,lat,long,prediction,error
0,314812,40.466737,-79.708578,0,0
1,315587,40.445413,-79.804255,0,0
2,318468,40.452267,-79.801333,0,0
3,457748,40.450123,-79.799833,0,0
4,319473,40.443532,-79.740358,0,0
5,458374,40.520053,-79.784539,0,0
6,319778,40.608447,-79.777244,0,0
7,320222,40.453754,-79.811168,0,0
8,320681,40.471195,-79.784846,0,0
9,363493,40.629263,-79.725169,0,0


In [44]:
y_preds = []
errors = []
maes, rmses, mapes, r_squareds = [], [], [], []

for i, quadrant in enumerate(quadrants):
    train = pd.concat(quadrants[:i] + quadrants[i+1:])
    test = quadrants[i]
    
    train = train.drop(["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
                        "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"], axis=1)
    test = test.drop(["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
                      "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"], axis=1)
    
    X_train = train.drop(["SALEPRICE"], axis=1).to_numpy()
    y_train = train["SALEPRICE"].to_numpy()
    
    X_test = test.drop(["SALEPRICE"], axis=1).to_numpy()
    y_test = test["SALEPRICE"].to_numpy()
    
    model_cv = CatBoostRegressor()
    model_cv.fit(X=X_train, y=y_train, verbose=False)
    
    y_pred_cv = model_cv.predict(X_test)
    y_preds.extend(y_pred_cv)
    errors.extend([test - pred for test, pred in zip(y_test, y_pred_cv)])
    
    print(f"Quadrant: {i+1}")
    mae, rmse, mape, r_squared = get_metrics(y_test, y_pred_cv)
    maes.append(mae)
    rmses.append(rmse)
    mapes.append(mape)
    r_squareds.append(r_squared)
    
    print("")

error_df_soos["prediction"] = y_preds
error_df_soos["error"] = errors
    
print("Average:")
print(f"MAE:  {round(np.mean(maes))}")
print(f"RMSE: {round(np.mean(rmses))}")
print(f"MAPE: {round(np.mean(mapes), 2)}%")
print(f"R^2:  {round(np.mean(r_squareds), 3)}")

Quadrant: 1
MAE:  48974
RMSE: 71012
MAPE: 30.99%
R^2:  0.793

Quadrant: 2
MAE:  45901
RMSE: 67845
MAPE: 21.02%
R^2:  0.79

Quadrant: 3
MAE:  41296
RMSE: 61417
MAPE: 18.29%
R^2:  0.71

Quadrant: 4
MAE:  42476
RMSE: 54048
MAPE: 44.22%
R^2:  0.676

Average:
MAE:  44662
RMSE: 63581
MAPE: 28.63%
R^2:  0.742


In [45]:
error_df_soos.head(10)

Unnamed: 0,id,lat,long,prediction,error
0,314812,40.466737,-79.708578,237465.283091,-62565.283091
1,315587,40.445413,-79.804255,245351.642733,-72851.642733
2,318468,40.452267,-79.801333,180099.378311,-65199.378311
3,457748,40.450123,-79.799833,184030.322599,-37030.322599
4,319473,40.443532,-79.740358,137014.535803,9435.464197
5,458374,40.520053,-79.784539,334372.447972,-66372.447972
6,319778,40.608447,-79.777244,195122.751433,4177.248567
7,320222,40.453754,-79.811168,230534.273167,-74034.273167
8,320681,40.471195,-79.784846,152470.27005,-92470.27005
9,363493,40.629263,-79.725169,149096.303179,-10096.303179


In [46]:
error_df_soos.to_csv(PATH+"results/errors_soos_doc2vec.csv", index=False)

## Exploring solution

In [47]:
category_coef_df = pd.DataFrame(data={"feature": X_columns_cat[53:], "coef": model_05.coef_[53:]})
category_coef_df_dist = category_coef_df[category_coef_df["feature"].str.contains("dist")]
category_coef_df_dist.sort_values(by=["coef"], ascending=True).head(10)

Unnamed: 0,feature,coef
6,tourist attraction_dist,-4.506974
8,skyscraper_dist,-4.500854
4,river_dist,-2.932569
14,museum_dist,-2.735737
26,librar_dist,-1.109421
16,railway station_dist,-1.08884
2,bridge_dist,-0.95037
12,universit_dist,-0.735239
24,music venue_dist,-0.328256
20,sports venue_dist,0.077149


### Text features

Feature importance for best model

In [48]:
sorted(list(zip(X_columns_text, model_04.get_feature_importance())), key=lambda x: x[1], reverse=True)[:15]

[('GRADE', 28.494930688724914),
 ('FINISHEDLIVINGAREA', 16.284128649131475),
 ('FULLBATHS', 3.568756457874683),
 ('LOTAREA', 2.6257652693151528),
 ('YEARBLT', 2.0358528666192734),
 ('vec_183', 1.810209532219415),
 ('CDU', 1.719098492460599),
 ('TOTALROOMS', 1.2651503927077286),
 ('vec_141', 1.0896876426633313),
 ('vec_37', 1.0371471542253918),
 ('HALFBATHS', 0.880254164649816),
 ('vec_134', 0.7768423485011445),
 ('vec_151', 0.7680545885417207),
 ('BSMTGARAGE', 0.7385176599725156),
 ('vec_292', 0.7211524534887265)]

In [49]:
to_drop = ["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
           "SALEPRICE", "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"]
word_df = structured_wiki_text.drop(to_drop, axis=1)

In [50]:
print(f"Intercept: {model_03.intercept_}")
word_coef_df = pd.DataFrame(data={"feature": word_df.columns, "coef": model_03.coef_})
word_coef_lookup = {word:coef for (word, coef) in zip(word_df.columns, model_03.coef_)}
word_coef_df.head(20)

Intercept: -493723.4393115096


Unnamed: 0,feature,coef
0,LOTAREA,0.20056
1,STORIES,-16274.71419
2,YEARBLT,473.163189
3,GRADE,-21692.092363
4,CONDITION,-125.167543
5,CDU,-17846.752033
6,TOTALROOMS,-377.556366
7,BEDROOMS,-150.518045
8,FULLBATHS,19308.637129
9,HALFBATHS,8376.356395
