# Machine learning with text based housing data

Experimenting with text based housing data.

### Import packages

In [1]:
import json
import math
import warnings
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display_html

Definde constants.

- ``PATH``: Path to the base data folder
- ``MAX_DIST``: Maximum distance for article weights
- ``K_FOLDS``: Number of folds to perform for cross validation

In [2]:
PATH = "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
MAX_DIST = 5500
K_FOLDS = 5

Lead structured data with added text features.

In [3]:
structured_wiki_text = pd.read_csv(
    PATH + f"structured_wiki_text_features_{MAX_DIST}.csv")
print(structured_wiki_text.shape)
structured_wiki_text.head(10)

(9556, 3173)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,yielding student,york,york city,youghiogheny,youghiogheny river,young,youth,zip,zip code,article_count
0,362058,15212,126,47,12603,5329,10-27-2017,113000.0,69200,1.0,...,0.0,0.344647,0.1013,0.007816,0.008028,0.186637,0.065533,0.091133,0.076878,681
1,544290,15017,946,36,94601,10800,09-27-2016,320000.0,269900,2.0,...,0.0,0.057171,0.025815,0.0,0.0,0.0,0.576713,0.351555,0.351777,20
2,314083,15090,935,3,93503,114476,03-25-2017,545000.0,450000,1.0,...,0.471146,0.0,0.0,0.0,0.0,0.0,0.0,0.236106,0.247349,12
3,314280,15241,950,42,95003,43197,06-09-2016,315000.0,274000,2.0,...,0.269688,0.094085,0.048684,0.0,0.0,0.026453,0.046409,0.042496,0.04452,40
4,314812,15239,880,31,88006,12390,01-31-2017,174900.0,154100,2.0,...,0.750332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
5,315579,15143,921,32,92102,10081,03-02-2015,300000.0,244600,2.0,...,0.370776,0.290444,0.0,0.086572,0.088918,0.191602,0.167543,0.089567,0.093832,37
6,315587,15235,934,30,93401,10257,05-15-2017,172500.0,144700,2.0,...,0.428896,0.271729,0.110373,0.064085,0.0,0.163348,0.272471,0.045588,0.016574,31
7,362804,15102,876,5,87603,10920,07-11-2016,250000.0,217800,2.0,...,0.215557,0.057949,0.0,0.0,0.0,0.042779,0.0,0.030316,0.03176,52
8,315758,15108,939,24,93903,54189,09-28-2018,199900.0,174700,1.0,...,0.103958,0.065639,0.0,0.020599,0.0,0.042027,0.06237,0.0,0.0,39
9,315868,15133,837,35,83702,6569,05-12-2017,143000.0,117900,2.0,...,0.15253,0.126928,0.053714,1.635255,1.458919,0.068184,0.113242,0.160394,0.139978,46


### Defining useful functions

In [4]:
def find_coord(x, df):
    """Returns id, latitude and longitude for property with given id"""
    
    _id, lat, long = x[0], x[1], x[2]
    row = df[df["_id"] == _id].iloc[0]
    return row["_id"], row["latitude"], row["longitude"]

In [5]:
def make_train_test(df):
    """Returns train/test sets along with column names and df for saving errors"""

    X = df.drop(["PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE", "SALEPRICE",
                 "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"], axis=1)

    # save col names for later
    X_columns = list(X.columns)
    # remove id from col list, since it will be filtered out later
    X_columns.remove("_id")
    X = X.to_numpy()

    y = df["SALEPRICE"].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)

    # save ids for later
    train_ids = [x[0] for x in X_train]
    test_ids = [x[0] for x in X_test]
    X_train = X_train[:, 1:]  # remove first column (id)
    X_test = X_test[:, 1:]    # remove first column (id)

    X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=42)

    print(f"{X_train.shape}: {X_train_train.shape} + {X_train_val.shape}")
    print(f"{y_train.shape}: {y_train_train.shape} + {y_train_val.shape}")
    print(X_test.shape)
    print(y_test.shape)

    # create error df
    error_df = pd.DataFrame(
        data={"id": test_ids, "lat": [0]*len(test_ids), "long": [0]*len(test_ids)})
    error_df = error_df.apply(lambda x: find_coord(
        x, df), axis=1, result_type='broadcast')
    error_df.head(10)

    return X_columns, [X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val], error_df

In [6]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Returns MAPE"""
    
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [7]:
def get_metrics(y_true, y_pred, print_out=True):
    """Returns MAE, RMSE, MAPE and R^2"""
    
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r_squared = r2_score(y_true, y_pred)

    if print_out:
        print(f"MAE:  {round(mae)}")
        print(f"RMSE: {round(rmse)}")
        print(f"MAPE: {round(mape, 2)}%")
        print(f"R^2:  {round(r_squared, 3)}")

    return mae, rmse, mape, r_squared

In [8]:
def cross_validation(estimator, X, y):
    """Returns and prints cross validated MAE, RMSE, MAPE and R^2"""
    
    maes, rmses, mapes, r_squareds = [], [], [], []
    X_cv = X[:, 1:]  # remove "_id" column

    kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
    for train_index, test_index in tqdm(kf.split(X_cv), total=5):
        X_train, X_test = X_cv[train_index], X_cv[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        if "linear_model" in str(type(estimator)):
            estimator.fit(X=X_train, y=y_train)
        else:
            estimator.fit(X=X_train, y=y_train, verbose=False)

        y_pred_cv = estimator.predict(X_test)
        mae, rmse, mape, r_squared = get_metrics(y_test, y_pred_cv, print_out=False)
        maes.append(mae)
        rmses.append(rmse)
        mapes.append(mape)
        r_squareds.append(r_squared)
    
    mae_cv, rmse_cv = round(np.mean(maes)), round(np.mean(rmses))
    mape_cv, r_squared_cv = round(np.mean(mapes), 2), round(np.mean(r_squareds), 3)
    
    print(f"MAE:  {mae_cv}")
    print(f"RMSE: {rmse_cv}")
    print(f"MAPE: {mape_cv}%")
    print(f"R^2:  {r_squared_cv}")
    
    return mae_cv, rmse_cv, mape_cv, r_squared_cv

Create the data sets and error dataframe

In [9]:
X_columns_text, data_sets, error_df = make_train_test(structured_wiki_text)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 3162): (5375, 3162) + (1792, 3162)
(7167,): (5375,) + (1792,)
(2389, 3162)
(2389,)


Create results df

In [10]:
results_df = pd.DataFrame()

## Only text features

Remove all structured data

In [11]:
X_train_text = X_train[:, 53:]
X_test_text = X_test[:, 53:]

### Linear regression

In [12]:
# model_01 = linear_model.LinearRegression()
# model_01 = linear_model.Lasso()
model_01 = linear_model.Ridge()
model_01.fit(X_train_text, y_train)

Ridge()

In [13]:
y_pred_01 = model_01.predict(X_test_text)
metrics_01 = get_metrics(y_test, y_pred_01)

MAE:  59605
RMSE: 88973
MAPE: 34.52%
R^2:  0.544


Cross validation

In [14]:
results_df["Linear: T"] = cross_validation(model_01, X[:, 52:], y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  60740
RMSE: 90602
MAPE: 35.84%
R^2:  0.559


### Catboost

In [15]:
model_02 = CatBoostRegressor()
model_02.fit(X=X_train_text, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x1988bbf39c8>

In [16]:
y_pred_02 = model_02.predict(X_test_text)
metrics_02 = get_metrics(y_test, y_pred_02)

MAE:  49990
RMSE: 75079
MAPE: 28.93%
R^2:  0.675


Cross validation

In [17]:
results_df["Catboost: T"] = cross_validation(model_02, X[:, 52:], y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  49961
RMSE: 76154
MAPE: 29.25%
R^2:  0.688


## Combining structured and text features

### Linear regression

In [18]:
# model_03 = linear_model.LinearRegression()
# model_03 = linear_model.Lasso()
model_03 = linear_model.Ridge()
model_03.fit(X_train, y_train)

Ridge()

In [19]:
y_pred_03 = model_03.predict(X_test)
metrics_03 = get_metrics(y_test, y_pred_03)

MAE:  33268
RMSE: 46995
MAPE: 21.18%
R^2:  0.873


Cross validation

In [20]:
results_df["Linear: S+T"] = cross_validation(model_03, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  34245
RMSE: 49185
MAPE: 21.57%
R^2:  0.87


### Catboost

In [21]:
model_04 = CatBoostRegressor()
model_04.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x1988b900a88>

In [22]:
y_pred_04 = model_04.predict(X_test)
metrics_04 = get_metrics(y_test, y_pred_04)

MAE:  27565
RMSE: 40615
MAPE: 16.12%
R^2:  0.905


Cross Validation

In [23]:
results_df["Catboost: S+T"] = cross_validation(model_04, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  28513
RMSE: 42363
MAPE: 16.66%
R^2:  0.904


## Combining structured and category features

Load structured with added wikipedia category data

In [24]:
structured_wiki_categories = pd.read_csv(PATH+"structured_wiki_category_features.csv")
print(structured_wiki_categories.shape)
structured_wiki_categories.head(10)

(9556, 98)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,music venue_dist,music venue_count,librar_dist,librar_count,demolished_dist,demolished_count,theatre_dist,theatre_count,airport_dist,airport_count
0,230053,15025,878,44,87803,9900,04-21-2015,170000.0,142400,1.0,...,15990.452118,0,2820.945079,0,10696.759563,0,14548.670451,0,8261.217226,0
1,59600,15044,938,3,93801,157687,02-06-2017,715000.0,624900,2.0,...,15252.566458,0,8875.882352,0,16605.101289,0,20061.837733,0,8735.994643,0
2,59872,15106,812,7,81203,5760,12-13-2019,180000.0,137500,2.0,...,7405.027862,0,442.271862,1,4954.189003,0,8489.804061,0,13263.563322,0
3,233732,15215,801,17,80102,4000,08-17-2017,445000.0,388900,2.0,...,3762.277275,0,4014.164339,0,4432.421403,0,3758.563485,0,4484.601475,0
4,245949,15108,817,11,81702,7500,02-10-2017,149500.0,130700,2.0,...,12021.222158,0,3689.992805,0,3573.119729,0,13989.916643,0,5386.712501,0
5,245988,15236,873,44,87302,2800,06-15-2017,132000.0,115400,1.0,...,10193.679408,0,4946.52743,0,5818.055614,0,9464.651731,0,3442.224739,0
6,257252,15228,926,26,92601,31263,07-13-2017,495000.0,329000,2.0,...,8890.074005,0,5877.934506,0,2970.4991,0,9185.477223,0,10112.540384,0
7,246320,15132,409,23,40005,2500,05-25-2017,47000.0,40600,2.0,...,10729.808104,0,2005.180201,1,2712.802271,0,9810.115704,0,5850.338099,0
8,246781,15221,828,9,82801,13620,06-06-2018,170000.0,150400,2.0,...,1260.69152,1,2297.413876,2,2068.531631,1,3859.906307,0,4399.683079,0
9,247147,15068,880,31,88008,4897,11-25-2015,60000.0,67700,1.0,...,11407.277774,0,12971.710218,0,8285.703599,0,17277.653271,0,9003.306122,0


In [25]:
X_columns_cat, data_sets, error_df = make_train_test(structured_wiki_categories)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 87): (5375, 87) + (1792, 87)
(7167,): (5375,) + (1792,)
(2389, 87)
(2389,)


### Linear regression

In [26]:
model_05 = linear_model.LinearRegression()
# model_05 = linear_model.Lasso()
# model_05 = linear_model.Ridge()
model_05.fit(X_train, y_train)

LinearRegression()

In [27]:
y_pred_05 = model_05.predict(X_test)
metrics_05 = get_metrics(y_test, y_pred_05)

MAE:  40346
RMSE: 55821
MAPE: 25.4%
R^2:  0.831


Cross validation

In [28]:
results_df["Linear: S+C"] = cross_validation(model_05, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  40509
RMSE: 57135
MAPE: 25.83%
R^2:  0.825


### Catboost

In [29]:
model_06 = CatBoostRegressor()
model_06.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x1988a0e9448>

In [30]:
y_pred_06 = model_06.predict(X_test)
metrics_06 = get_metrics(y_test, y_pred_06)

MAE:  29622
RMSE: 43273
MAPE: 17.02%
R^2:  0.898


Cross validation

In [31]:
results_df["Catboost: S+C"] = cross_validation(model_06, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  29248
RMSE: 43411
MAPE: 17.38%
R^2:  0.899


## Combining structured, text and category features

Add category features

In [32]:
merge_cols = list(structured_wiki_text.columns[:64])
structured_wiki_combined = pd.merge(
    structured_wiki_text, structured_wiki_categories, on=merge_cols)
print(structured_wiki_combined.shape)

(9556, 3207)


In [33]:
X_columns_text_cat, data_sets, error_df = make_train_test(structured_wiki_combined)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets

(7167, 3196): (5375, 3196) + (1792, 3196)
(7167,): (5375,) + (1792,)
(2389, 3196)
(2389,)


### Linear model

In [34]:
# model_07 = linear_model.LinearRegression()
# model_07 = linear_model.Lasso()
model_07 = linear_model.Ridge()
model_07.fit(X_train, y_train)

Ridge()

In [35]:
y_pred_07 = model_07.predict(X_test)
metrics_07 = get_metrics(y_test, y_pred_07)

MAE:  32676
RMSE: 46416
MAPE: 20.88%
R^2:  0.876


Cross validation

In [36]:
results_df["Linear: S+T+C"] = cross_validation(model_07, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  33768
RMSE: 48630
MAPE: 21.33%
R^2:  0.873


### Catboost

In [37]:
model_08 = CatBoostRegressor()
model_08.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x1988b923288>

In [38]:
y_pred_08 = model_08.predict(X_test)
metrics_08 = get_metrics(y_test, y_pred_08)

MAE:  27931
RMSE: 40749
MAPE: 16.21%
R^2:  0.904


Cross validation

In [39]:
results_df["Catboost: S+T+C"] = cross_validation(model_08, X, y)

  0%|          | 0/5 [00:00<?, ?it/s]

MAE:  28558
RMSE: 42482
MAPE: 16.67%
R^2:  0.903


## Results

In [40]:
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
# reorder columns
results_df = results_df[["Linear: T", "Linear: S+T", "Linear: S+T+C", "Catboost: T", "Catboost: S+T", "Catboost: S+T+C"]]
results_df.to_csv(
    PATH + f"results/structured_wiki_{MAX_DIST}_results.csv", index=False)
print(f"Results for a max distance of {MAX_DIST}m.")
results_df.head()

Results for a max distance of 5500m.


Unnamed: 0,Linear: T,Linear: S+T,Linear: S+T+C,Catboost: T,Catboost: S+T,Catboost: S+T+C
MAE,60740.0,34245.0,33768.0,49961.0,28513.0,28558.0
RMSE,90602.0,49185.0,48630.0,76154.0,42363.0,42482.0
MAPE,35.84,21.57,21.33,29.25,16.66,16.67
R^2,0.559,0.87,0.873,0.688,0.904,0.903


## Spatial out-of-sample test

Calculate median latitude and longitude

In [41]:
soos_df = structured_wiki_text.copy()

coords_median = soos_df.loc[:, "latitude":"longitude"].median()
lat_median = coords_median.loc["latitude"]
long_median = coords_median.loc["longitude"]
coords_median

latitude     40.441981
longitude   -79.987716
dtype: float64

In [42]:
quadrants = []

quadrant_1 = soos_df[(soos_df["latitude"] >= lat_median) & (soos_df["longitude"] >= long_median)]
quadrants.append(quadrant_1)
print(quadrant_1.shape)

quadrant_2 = soos_df[(soos_df["latitude"] >= lat_median) & (soos_df["longitude"] < long_median)]
quadrants.append(quadrant_2)
print(quadrant_2.shape)

quadrant_3 = soos_df[(soos_df["latitude"] < lat_median) & (soos_df["longitude"] < long_median)]
quadrants.append(quadrant_3)
print(quadrant_3.shape)

quadrant_4 = soos_df[(soos_df["latitude"] < lat_median) & (soos_df["longitude"] >= long_median)]
quadrants.append(quadrant_4)
print(quadrant_4.shape, end="\n\n")

row_sum = quadrant_1.shape[0] + quadrant_2.shape[0] + quadrant_3.shape[0] + quadrant_4.shape[0]
print(f"{row_sum, quadrant_1.shape[1]}")

(2487, 3173)
(2291, 3173)
(2487, 3173)
(2291, 3173)

(9556, 3173)


In [43]:
quadrants_df = pd.concat(quadrants, ignore_index=True)

error_df_soos = pd.DataFrame(
    data={"id": quadrants_df["_id"],
          "lat": quadrants_df["latitude"],
          "long": quadrants_df["longitude"],
          "prediction": 0,
          "error": 0})
error_df_soos.head(10)

Unnamed: 0,id,lat,long,prediction,error
0,314812,40.466737,-79.708578,0,0
1,315587,40.445413,-79.804255,0,0
2,318468,40.452267,-79.801333,0,0
3,457748,40.450123,-79.799833,0,0
4,319473,40.443532,-79.740358,0,0
5,458374,40.520053,-79.784539,0,0
6,319778,40.608447,-79.777244,0,0
7,320222,40.453754,-79.811168,0,0
8,320681,40.471195,-79.784846,0,0
9,363493,40.629263,-79.725169,0,0


In [44]:
y_preds = []
errors = []
maes, rmses, mapes, r_squareds = [], [], [], []

for i, quadrant in enumerate(quadrants):
    train = pd.concat(quadrants[:i] + quadrants[i+1:])
    test = quadrants[i]
    
    train = train.drop(["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
                        "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"], axis=1)
    test = test.drop(["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
                      "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"], axis=1)
    
    X_train = train.drop(["SALEPRICE"], axis=1).to_numpy()
    y_train = train["SALEPRICE"].to_numpy()
    
    X_test = test.drop(["SALEPRICE"], axis=1).to_numpy()
    y_test = test["SALEPRICE"].to_numpy()
    
    model_cv = CatBoostRegressor()
    model_cv.fit(X=X_train, y=y_train, verbose=False)
    
    y_pred_cv = model_cv.predict(X_test)
    y_preds.extend(y_pred_cv)
    errors.extend([test - pred for test, pred in zip(y_test, y_pred_cv)])
    
    print(f"Quadrant: {i+1}")
    mae, rmse, mape, r_squared = get_metrics(y_test, y_pred_cv)
    maes.append(mae)
    rmses.append(rmse)
    mapes.append(mape)
    r_squareds.append(r_squared)
    
    print("")

error_df_soos["prediction"] = y_preds
error_df_soos["error"] = errors
    
print("Average:")
print(f"MAE:  {round(np.mean(maes))}")
print(f"RMSE: {round(np.mean(rmses))}")
print(f"MAPE: {round(np.mean(mapes), 2)}%")
print(f"R^2:  {round(np.mean(r_squareds), 3)}")

Quadrant: 1
MAE:  46954
RMSE: 71537
MAPE: 28.22%
R^2:  0.79

Quadrant: 2
MAE:  44706
RMSE: 69199
MAPE: 19.67%
R^2:  0.782

Quadrant: 3
MAE:  44225
RMSE: 63642
MAPE: 19.38%
R^2:  0.688

Quadrant: 4
MAE:  37423
RMSE: 48436
MAPE: 39.12%
R^2:  0.74

Average:
MAE:  43327
RMSE: 63203
MAPE: 26.6%
R^2:  0.75


In [45]:
error_df_soos.head(10)

Unnamed: 0,id,lat,long,prediction,error
0,314812,40.466737,-79.708578,239094.263713,-64194.263713
1,315587,40.445413,-79.804255,205395.39899,-32895.39899
2,318468,40.452267,-79.801333,161310.500492,-46410.500492
3,457748,40.450123,-79.799833,152277.340099,-5277.340099
4,319473,40.443532,-79.740358,127385.929395,19064.070605
5,458374,40.520053,-79.784539,329461.380277,-61461.380277
6,319778,40.608447,-79.777244,174577.749271,24722.250729
7,320222,40.453754,-79.811168,215465.139657,-58965.139657
8,320681,40.471195,-79.784846,119807.779119,-59807.779119
9,363493,40.629263,-79.725169,162364.184781,-23364.184781


In [46]:
error_df_soos.to_csv(PATH+"results/errors_soos_wiki.csv", index=False)

## Exploring solution

### Category features

In [47]:
category_coef_df = pd.DataFrame(data={"feature": X_columns_cat[53:], "coef": model_05.coef_[53:]})
category_coef_df_dist = category_coef_df[category_coef_df["feature"].str.contains("dist")]
category_coef_df_dist.sort_values(by=["coef"], ascending=True).head(10)

Unnamed: 0,feature,coef
8,skyscraper_dist,-4.782086
6,tourist attraction_dist,-4.249049
4,river_dist,-2.755481
14,museum_dist,-2.501638
16,railway station_dist,-1.43601
2,bridge_dist,-0.918393
26,librar_dist,-0.737401
24,music venue_dist,-0.724933
12,universit_dist,-0.722572
20,sports venue_dist,0.279702


### Text features

Feature importance for best model

In [48]:
sorted(list(zip(X_columns_text[53:], model_04.get_feature_importance()[53:])), key=lambda x: x[1], reverse=True)[:15]

[('actor', 0.9074641569875287),
 ('shadyside', 0.7386735849269496),
 ('arena', 0.53078450869351),
 ('importance', 0.5148471064017115),
 ('presbyterian church', 0.508134544603739),
 ('fine', 0.46800831313390423),
 ('farms', 0.4545474597414473),
 ('study released', 0.45004445472381005),
 ('places nris', 0.42662576219796483),
 ('theater', 0.3780158741502531),
 ('auditorium', 0.37285824769017084),
 ('term', 0.3625626456632759),
 ('subdivision type', 0.3608031230516139),
 ('richard', 0.3315322094681271),
 ('released', 0.32653817852574957)]

In [49]:
to_drop = ["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
           "SALEPRICE", "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR"]
word_df = structured_wiki_text.drop(to_drop, axis=1)

In [50]:
print(f"Intercept: {model_03.intercept_}")
word_coef_df = pd.DataFrame(data={"feature": word_df.columns[53:], "coef": model_03.coef_[53:]})
word_coef_lookup = {word:coef for (word, coef) in zip(word_df.columns[53:], model_03.coef_[53:])}
word_coef_df.head(53)

Intercept: -389543.2008813888


Unnamed: 0,feature,coef
0,abandoned,24083.976991
1,ability,-2892.11379
2,able,6163.985314
3,academic,-12292.11074
4,academic achievement,-554.278494
5,academic performance,-1297.113297
6,academics,1084.694866
7,academy,13593.089473
8,accept,1976.021603
9,accept credits,441.099921


The next cell prints the coefficient for a specific word.

In [51]:
word_to_test = "hospital"

coef = word_coef_lookup[word_to_test]
print(f"Coefficient for \"{word_to_test}\" is {round(float(coef), 2)}")

Coefficient for "hospital" is -17415.15


Find out most and least valuable words

In [58]:
best_words = word_coef_df.sort_values(by=["coef"], ascending=False).head(10)
worst_words = word_coef_df.sort_values(by=["coef"], ascending=True).head(10)

df1_styler = best_words.style.set_table_attributes("style='display:inline'").set_caption('Most valuable words')
df2_styler = worst_words.style.set_table_attributes("style='display:inline'").set_caption('Least valuable words')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,feature,coef
2183,public library,54346.230075
245,beaver county,52467.372199
89,allegheny river,49186.659141
157,arena,46716.867155
1278,hot,40884.806543
244,beaver,39181.433845
457,championship,38509.366119
1995,perry,37159.361288
2412,room,36657.254171
3064,woods,36543.611423

Unnamed: 0,feature,coef
486,city,-64829.121676
2947,upmc,-51108.612326
2692,stop,-49196.311741
795,division,-48105.160627
1902,original,-44033.307246
2447,schenley,-42149.473338
2982,village,-41429.307662
680,current,-39701.980966
1890,operation,-38352.610833
650,course,-38132.547414


### Add article with highest word count (highest impact) for top negative/positive words

In [53]:
article_word_counts = pd.read_csv(PATH+"wikipedia/wikipedia_article_wordcounts.csv")
article_word_counts.head()

Unnamed: 0,article_title,abandoned,ability,able,academic,academic achievement,academic performance,academics,academy,accept,...,yielding,yielding student,york,york city,youghiogheny,youghiogheny river,young,youth,zip,zip code
0,Washington County Courthouse (Pennsylvania),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Wild Things Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Thackeray Hall,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Immaculate Heart of Mary Church (Pittsburgh),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,St. Stanislaus Kostka Church (Pittsburgh),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Show example article with all words which appear 40 times or more.

In [54]:
upmc = article_word_counts.loc[article_word_counts["article_title"]=="University of Pittsburgh Medical Center", :]

to_drop = []
for i, col in enumerate(upmc.columns[1:]):
    if upmc.iloc[0, i+1] < 40:
        to_drop.append(col)
        
upmc.drop(to_drop, axis=1)

Unnamed: 0,article_title,care,center,health,hospital,located,medical,pennsylvania,pittsburgh,presbyterian,university,university pittsburgh,upmc
415,University of Pittsburgh Medical Center,53,45,75,133,44,42,60,89,40,71,40,302


Most valuable and least valuable words with article in which word appears the most

In [59]:
for x in [best_words, worst_words]:
    x["article"] = ""
    for index, row in x.iterrows():
        highest_impact = article_word_counts.sort_values(by=[row["feature"]], ascending=False).iloc[0, 0]
        x.loc[index, "article"] = highest_impact

In [60]:
df1_styler = best_words.style.set_table_attributes("style='display:inline'").set_caption('Most valuable articles')
df2_styler = worst_words.style.set_table_attributes("style='display:inline'").set_caption('Least valuable articles')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,feature,coef,article
2183,public library,54346.230075,Allegheny County Library Association
245,beaver county,52467.372199,Western Beaver County School District
89,allegheny river,49186.659141,List of crossings of the Allegheny River
157,arena,46716.867155,Civic Arena (Pittsburgh)
1278,hot,40884.806543,Essie's Original Hot Dog Shop
244,beaver,39181.433845,Western Beaver County School District
457,championship,38509.366119,1978 PGA Championship
1995,perry,37159.361288,Perry Traditional Academy
2412,room,36657.254171,Nationality Rooms
3064,woods,36543.611423,2007 U.S. Open (golf)

Unnamed: 0,feature,coef,article
486,city,-64829.121676,List of Pennsylvania state historical markers in Allegheny County
2947,upmc,-51108.612326,University of Pittsburgh Medical Center
2692,stop,-49196.311741,Pittsburgh International Airport
795,division,-48105.160627,University of Pittsburgh Medical Center
1902,original,-44033.307246,Nationality Rooms
2447,schenley,-42149.473338,William Pitt Union
2982,village,-41429.307662,Chatham Village (Pittsburgh)
680,current,-39701.980966,WPXI
1890,operation,-38352.610833,Kennywood
650,course,-38132.547414,Oakmont Country Club


### Determine best and worst articles

Create article value-score by calculating sum of all words multiplied with their coefficient for every article.

In [61]:
article_values = article_word_counts.copy()
article_values.insert(1, "article_value", [0]*article_word_counts.shape[0])
words = article_values.columns[2:]
for index, row in article_values.iterrows():
    counts = row.iloc[2:]
    article_values.loc[index, "article_value"] = sum([word_coef_lookup[word]*count for count, word in zip(counts, words)])

In [62]:
pos_sorted = article_values.loc[:, "article_title":"article_value"].sort_values(by=["article_value"], ascending=False).head(10)
neg_sorted = article_values.loc[:, "article_title":"article_value"].sort_values(by=["article_value"], ascending=True).head(10)

df1_styler = pos_sorted.style.set_table_attributes("style='display:inline'").set_caption('Most valuable articles')
df2_styler = neg_sorted.style.set_table_attributes("style='display:inline'").set_caption('Least valuable articles')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,article_title,article_value
630,Western Beaver County School District,13928361.463118
2104,Civic Arena (Pittsburgh),12043360.916518
832,List of Pittsburgh History and Landmarks Foundation Historic Landmarks,11482605.281687
178,Nationality Rooms,11132162.812902
2107,Cathedral of Learning,7822076.609121
1527,Carnegie Mellon University,7298247.5797
922,Duquesne Gardens,7014412.808453
2355,PPG Paints Arena,6280269.499917
1099,Squirrel Hill (Pittsburgh),5215240.234522
384,Aliquippa School District,5089312.630853

Unnamed: 0,article_title,article_value
415,University of Pittsburgh Medical Center,-21623258.514051
2376,"Allegheny, Pennsylvania",-6180070.232624
2194,UPMC Presbyterian,-5764340.460425
2090,Heinz Field,-3482842.41747
98,UPMC Children's Hospital of Pittsburgh,-2912339.346542
1520,List of crossings of the Allegheny River,-2823239.381605
2094,Schenley Park,-2762071.83695
620,UPMC Hillman Cancer Center,-2708837.378033
2103,Washington & Jefferson College,-2692827.259114
955,University of Pittsburgh School of Medicine,-2611387.614659


Add coordinates back

In [63]:
with open(PATH+"wikipedia/wikipedia_selected.ndjson") as fin:
    data_loaded = json.load(fin)

In [64]:
coords = np.array(data_loaded)[:, 1]  # filter coord column
article_values.insert(2, "article_lat", [lat for lat, long in coords])
article_values.insert(2, "article_long", [long for lat, long in coords])
article_values = article_values[["article_title", "article_value", "article_long", "article_lat"]]  # remove word counts

Save articles with their value score and coordinates.

In [65]:
article_values.to_csv(PATH+"wikipedia_article_values.csv", index=False)
article_values.head()

Unnamed: 0,article_title,article_value,article_long,article_lat
0,Washington County Courthouse (Pennsylvania),-256486.860752,-80.245803,40.17045
1,Wild Things Park,-739795.442445,-80.283611,40.154167
2,Thackeray Hall,-253649.572529,-79.957264,40.444317
3,Immaculate Heart of Mary Church (Pittsburgh),696692.637054,-79.967778,40.456389
4,St. Stanislaus Kostka Church (Pittsburgh),382697.206604,-79.983611,40.452322
