# Machine learning with text based housing data

Experimenting with text based housing data.

### Import packages

In [1]:
import json
import math
import random
import warnings
warnings.filterwarnings(action="ignore")

from catboost import CatBoostRegressor
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

from utils import make_train_test, get_metrics, cross_validation, soos_validation

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display_html

Definde constants.

- ``PATH``: Path to the base data folder
- ``MAX_DIST``: Maximum distance for article weights
- ``K_FOLDS``: Number of folds to perform for cross validation

In [2]:
PATH = "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
MAX_DIST = 5500
K_FOLDS = 5
WEIGHTING = True
DUMMIES = []  # e.g. ["MUNICODE"]

Lead structured data with added text features.

In [3]:
if WEIGHTING:
    structured_wiki_text = pd.read_csv(PATH + f"structured_wiki_text_features_{MAX_DIST}.csv")
else:
    structured_wiki_text = pd.read_csv(PATH + f"structured_wiki_text_features_{MAX_DIST}_NOWEIGHT.csv")
    
print(structured_wiki_text.shape)
structured_wiki_text.head(10)

(9556, 3174)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,yielding student,york,york city,youghiogheny,youghiogheny river,young,youth,zip,zip code,article_count
0,161705,15122,870,45,87005,10899,05-01-2018,145000.0,76700,1.0,...,0.0,0.130886,0.10104,0.282106,0.249367,0.010241,0.00194,0.027334,0.028636,49
1,530852,15146,879,18,87905,10691,05-13-2019,139997.0,106200,1.0,...,0.015599,0.259048,0.240958,0.28796,0.0,0.271106,0.381369,0.0,0.0,14
2,144978,15202,826,2,82601,11813,05-26-2017,170000.0,135300,1.0,...,0.257677,0.079152,0.0,0.044886,0.0,0.008793,0.066521,0.0,0.0,47
3,436602,15202,803,29,80302,5324,06-06-2017,145000.0,117300,2.0,...,0.160069,0.168517,0.106982,0.030178,0.0,0.03758,0.04692,0.122617,0.101795,83
4,145066,15218,114,47,11403,3600,04-09-2016,325000.0,250000,2.0,...,0.030046,0.225885,0.11365,0.008153,0.0,0.103157,0.089612,0.074424,0.031744,337
5,145137,15228,926,26,92607,6406,04-30-2015,172900.0,137300,2.0,...,0.169756,0.05,0.0,0.0,0.0,0.054306,0.053788,0.041071,0.015175,104
6,145246,15241,950,42,95001,38376,12-17-2015,817000.0,751600,2.0,...,0.032662,0.072342,0.0,0.0,0.0,0.023127,0.007039,0.029061,0.018636,67
7,529513,15132,409,23,40005,3844,01-09-2020,39000.0,45100,1.0,...,0.249204,0.229567,0.073169,1.565165,1.355768,0.074707,0.165631,0.123405,0.124603,46
8,146103,15212,127,47,12703,5284,06-30-2016,65000.0,52800,1.5,...,0.015753,0.22776,0.083524,0.007829,0.007075,0.096008,0.047138,0.140486,0.113373,427
9,146155,15212,127,47,12701,5544,11-10-2018,162000.0,111200,1.0,...,0.037225,0.16682,0.109717,0.009048,0.002554,0.056058,0.043232,0.159225,0.119882,190


Create the data sets and error dataframe

In [4]:
X_columns_text, data_sets, error_df = make_train_test(structured_wiki_text, dummies=DUMMIES)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets


(7167, 3162): (5375, 3162) + (1792, 3162)
(7167,): (5375,) + (1792,)
(2389, 3162)
(2389,)


Create results df

In [5]:
results_df = pd.DataFrame()

## Combining structured and text features

### Linear regression

In [6]:
# model_03 = linear_model.LinearRegression()
# model_03 = linear_model.Lasso()
model_03 = linear_model.Ridge()
model_03.fit(X_train, y_train)

Ridge()

In [7]:
y_pred_03 = model_03.predict(X_test)
metrics_03 = get_metrics(y_test, y_pred_03)

MAE:  33896
RMSE: 48337
MAPE: 20.47%
R^2:  0.87


Cross validation

In [8]:
results_df["Linear: S+T"], X_03_columns = cross_validation(model_03, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  34237
RMSE: 49266
MAPE: 21.51%
R^2:  0.87


### Catboost

In [9]:
model_04 = CatBoostRegressor()
model_04.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x2bb2504aa48>

In [10]:
y_pred_04 = model_04.predict(X_test)
metrics_04 = get_metrics(y_test, y_pred_04)

MAE:  28421
RMSE: 42015
MAPE: 15.79%
R^2:  0.901


Cross Validation

In [11]:
results_df["Catboost: S+T"], X_04_columns = cross_validation(model_04, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  28648
RMSE: 42797
MAPE: 16.73%
R^2:  0.902


## Combining structured and category features

Load structured with added wikipedia category data

In [12]:
structured_wiki_categories = pd.read_csv(PATH+"structured_wiki_category_features.csv")
print(structured_wiki_categories.shape)
structured_wiki_categories.head(10)

(9556, 99)


Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,music venue_dist,music venue_count,librar_dist,librar_count,demolished_dist,demolished_count,theatre_dist,theatre_count,airport_dist,airport_count
0,536102,15219,103,47,10301,2875,01-31-2018,287000.0,159000,2.0,...,517.69985,12,897.917939,4,517.69985,19,362.128604,12,7366.798716,0
1,197251,15241,950,42,95002,17193,09-02-2017,299900.0,259500,2.0,...,12399.579075,0,7345.059951,0,6699.747625,0,12980.769713,0,13247.804765,0
2,25219,15146,879,18,87910,22264,03-22-2017,190000.0,214900,2.0,...,14458.681434,0,4272.542747,0,12028.97002,0,17076.275805,0,6003.343808,0
3,197755,15236,873,44,87302,7800,05-09-2017,225000.0,155000,2.0,...,11009.82667,0,4240.695718,0,6425.354574,0,10122.630117,0,3950.374941,0
4,198593,15015,809,27,80902,43734,06-28-2019,249000.0,208800,2.0,...,19408.880978,0,8103.33775,0,13266.442574,0,18629.773509,0,9357.397493,0
5,532924,15212,127,47,12703,5000,05-25-2016,113000.0,84300,2.0,...,1022.312436,2,2376.258142,1,2442.269063,2,2082.763441,2,11124.871386,0
6,198384,15136,919,24,91903,9664,10-30-2015,189900.0,169200,1.0,...,6847.322917,0,5775.964244,0,5238.956835,0,8985.544284,0,8408.081284,0
7,198694,15237,927,27,92703,12162,11-06-2019,310000.0,257800,2.0,...,9237.89419,0,2248.669106,1,9457.535248,0,11343.201706,0,15420.145174,0
8,198744,15227,874,4,87402,6695,03-13-2018,165000.0,144200,1.5,...,7000.025029,0,6209.42845,0,3218.508916,0,7000.025029,0,4095.874034,0
9,199198,15139,845,33,84501,12050,08-24-2018,380000.0,320000,2.0,...,4285.299694,0,9711.011884,0,1230.228244,1,10480.485342,0,8608.785351,0


In [13]:
X_columns_cat, data_sets, error_df = make_train_test(structured_wiki_categories, dummies=DUMMIES)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets


(7167, 87): (5375, 87) + (1792, 87)
(7167,): (5375,) + (1792,)
(2389, 87)
(2389,)


### Linear regression

In [14]:
# model_05 = linear_model.LinearRegression()
# model_05 = linear_model.Lasso()
model_05 = linear_model.Ridge()
model_05.fit(X_train, y_train)

Ridge()

In [15]:
y_pred_05 = model_05.predict(X_test)
metrics_05 = get_metrics(y_test, y_pred_05)

MAE:  40805
RMSE: 57503
MAPE: 25.64%
R^2:  0.815


Cross validation

In [16]:
results_df["Linear: S+C"], X_05_columns = cross_validation(model_05, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  40628
RMSE: 57265
MAPE: 25.95%
R^2:  0.824


### Catboost

In [17]:
model_06 = CatBoostRegressor()
model_06.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x2bb250ea648>

In [18]:
y_pred_06 = model_06.predict(X_test)
metrics_06 = get_metrics(y_test, y_pred_06)

MAE:  29972
RMSE: 44476
MAPE: 17.44%
R^2:  0.889


Cross validation

In [19]:
results_df["Catboost: S+C"], X_06_columns = cross_validation(model_06, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  29445
RMSE: 43855
MAPE: 17.36%
R^2:  0.897


## Combining structured, text and category features

Add category features

In [20]:
merge_cols = list(structured_wiki_text.columns[:65])
structured_wiki_combined = pd.merge(structured_wiki_text, structured_wiki_categories, on=merge_cols)
print(structured_wiki_combined.shape)

(9556, 3208)


In [21]:
X_columns_text_cat, data_sets, error_df = make_train_test(structured_wiki_combined, dummies=DUMMIES)
X, y, X_train, X_test, y_train, y_test, X_train_train, X_train_val, y_train_train, y_train_val = data_sets


(7167, 3196): (5375, 3196) + (1792, 3196)
(7167,): (5375,) + (1792,)
(2389, 3196)
(2389,)


### Linear model

In [22]:
# model_07 = linear_model.LinearRegression()
# model_07 = linear_model.Lasso()
model_07 = linear_model.Ridge()
model_07.fit(X_train, y_train)

Ridge()

In [23]:
y_pred_07 = model_07.predict(X_test)
metrics_07 = get_metrics(y_test, y_pred_07)

MAE:  33009
RMSE: 47421
MAPE: 19.87%
R^2:  0.874


Cross validation

In [24]:
results_df["Linear: S+T+C"], X_07_columns = cross_validation(model_07, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  33788
RMSE: 48720
MAPE: 21.3%
R^2:  0.873


### Catboost

In [25]:
model_08 = CatBoostRegressor()
model_08.fit(X=X_train, y=y_train, verbose=False)

<catboost.core.CatBoostRegressor at 0x2bb236ea848>

In [26]:
y_pred_08 = model_08.predict(X_test)
metrics_08 = get_metrics(y_test, y_pred_08)

MAE:  28267
RMSE: 41724
MAPE: 15.71%
R^2:  0.903


Cross validation

In [27]:
results_df["Catboost: S+T+C"], X_08_columns = cross_validation(model_08, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  28583
RMSE: 42540
MAPE: 16.67%
R^2:  0.903


## Results

In [28]:
results_df.index = ["MAE", "RMSE", "MAPE", "R^2"]
# reorder columns
results_df = results_df[["Linear: S+T", "Linear: S+T+C", "Catboost: S+T", "Catboost: S+T+C"]]
if WEIGHTING:
    results_df.to_csv(PATH + f"results/structured_wiki_{MAX_DIST}_results.csv", index=False)
else:
    results_df.to_csv(PATH + f"results/structured_wiki_{MAX_DIST}_results_NOWEIGHT.csv", index=False)
print(f"Results for a max distance of {MAX_DIST}m.")
results_df.head()

Results for a max distance of 5500m.


Unnamed: 0,Linear: S+T,Linear: S+T+C,Catboost: S+T,Catboost: S+T+C
MAE,34237.0,33788.0,28648.0,28583.0
RMSE,49266.0,48720.0,42797.0,42540.0
MAPE,21.51,21.3,16.73,16.67
R^2,0.87,0.873,0.902,0.903


## Spatial out-of-sample test

In [29]:
estimator = CatBoostRegressor()
error_df_soos, metrics = soos_validation(estimator, structured_wiki_text)
maes, rmses, mapes, r_squareds = metrics

Predicting district 1/13
Predicting district 2/13
Predicting district 3/13
Predicting district 4/13
Predicting district 5/13
Predicting district 6/13
Predicting district 7/13
Predicting district 8/13
Predicting district 9/13
Predicting district 10/13
Predicting district 11/13
Predicting district 12/13
Predicting district 13/13

Weighted metrics:
MAE:  40685
RMSE: 57676
MAPE: 23.82%
R^2:  0.661


In [30]:
if WEIGHTING:
    error_df_soos.to_csv(PATH+"results/errors_soos_wiki.csv")
else:
    error_df_soos.to_csv(PATH+"results/errors_soos_wiki_noweight.csv")
error_df_soos

Unnamed: 0,id,lat,long,district,prediction,error
0,212373,40.362531,-79.959576,district_6,124405.393297,11694.606703
1,290789,40.564608,-80.207006,district_2,477020.629089,-62020.629089
2,491314,40.577795,-80.051588,district_2,137302.322005,-48302.322005
3,45608,40.373040,-79.954025,district_6,403179.505748,135820.494252
4,489922,40.425462,-79.837627,district_7,127990.927872,259.072128
...,...,...,...,...,...,...
9551,45486,40.523675,-80.037983,district_1,216802.686283,-24802.686283
9552,96528,40.400842,-80.016259,district_12,132591.933818,-59091.933818
9553,150478,40.402549,-80.012689,district_12,220589.410882,207410.589118
9554,505188,40.350998,-80.053068,district_5,146005.921389,-31005.921389


Display metrics for each district along with aggregated information about houses in that district

In [31]:
districts = ["district_"+str(i) for i in range(1,14)]
metrics_df = pd.DataFrame(data={"district":districts, "mae":maes, "rmse":rmses, "mapes":mapes, "R^2":r_squareds})
metrics_df = metrics_df.set_index("district")
metrics_df.to_csv(PATH+"results/errors_soos_district_wiki.csv")

# add more information about each district to characterize
df_agg = structured_wiki_text.groupby(by="DISTRICT").mean()
df_agg  = df_agg[["SALEPRICE", "LOTAREA", "YEARBLT", "STORIES"]]
metrics_df_agg = pd.concat([metrics_df, df_agg], axis=1)

metrics_df_agg

Unnamed: 0,mae,rmse,mapes,R^2,SALEPRICE,LOTAREA,YEARBLT,STORIES
district_1,29106.015078,41286.326232,13.67358,0.790494,219849.905742,18302.388949,1959.393283,1.557638
district_2,54908.129702,82556.482466,15.344813,0.737084,347195.02963,28928.46455,1968.440212,1.703175
district_3,48064.424865,76781.220582,18.520139,0.782522,261041.727106,22902.147436,1959.855311,1.500916
district_4,33650.596736,44361.989831,22.551744,0.76967,191448.015873,16155.658009,1956.578644,1.48557
district_5,52270.336379,73574.665427,17.025866,0.636064,279481.883882,13725.758232,1956.907279,1.650347
district_6,25255.13366,32618.579062,18.072573,0.708656,159027.268519,10129.287037,1955.130658,1.43107
district_7,41445.171566,48836.77623,43.915732,0.557659,125184.599506,11433.442522,1950.647713,1.464524
district_8,26702.526186,37469.897268,20.621169,0.775935,158730.732,13534.752,1956.425333,1.457333
district_9,32859.263907,41264.608939,43.91862,0.117538,100983.590308,12213.361233,1951.126285,1.278267
district_10,58684.20333,85764.019878,47.996093,0.560845,190825.944681,7570.417021,1936.978723,1.878723


## Exploring solution

### Category features

In [32]:
category_coef_df = pd.DataFrame(data={"feature": X_05_columns[54:], "coef": model_05.coef_[54:]})
category_coef_df_dist = category_coef_df[category_coef_df["feature"].str.contains("dist")]
category_coef_df_dist.sort_values(by=["coef"], ascending=True).head(10)

Unnamed: 0,feature,coef
7,skyscraper_dist,-4.577016
5,tourist attraction_dist,-4.2986
3,river_dist,-2.613502
13,museum_dist,-2.358905
15,railway station_dist,-1.399124
1,bridge_dist,-1.144772
25,librar_dist,-0.836893
11,universit_dist,-0.759298
23,music venue_dist,-0.422663
19,sports venue_dist,0.225843


### Text features

Feature importance for best model

In [33]:
sorted(list(zip(X_04_columns[54:], model_04.get_feature_importance()[54:])), key=lambda x: x[1], reverse=True)[:15]

[('actor', 1.7477560310104763),
 ('emergency', 0.8421273567260995),
 ('monongahela river', 0.6485677795126841),
 ('news world', 0.6479561798041462),
 ('chapel', 0.5650843203484709),
 ('farms', 0.5173987247761488),
 ('new year', 0.4933607346793734),
 ('category steel', 0.4411459735018301),
 ('writer', 0.4167552382809443),
 ('leadership', 0.3882450902557),
 ('silver', 0.3823530011225753),
 ('note', 0.37327422130116855),
 ('year cohort', 0.36199871808411593),
 ('marshall', 0.33126660399581526),
 ('produce', 0.3304650277503912)]

In [34]:
to_drop = ["_id", "PROPERTYZIP", "MUNICODE", "SCHOOLCODE", "NEIGHCODE", "SALEDATE",
           "SALEPRICE", "FAIRMARKETTOTAL", "latitude", "longitude", "SALEYEAR", "DISTRICT"]
word_df = structured_wiki_text.drop(to_drop, axis=1)

In [35]:
print(f"Intercept: {model_03.intercept_}")
word_coef_df = pd.DataFrame(data={"feature": X_03_columns[53:], "coef": model_03.coef_[53:]})
word_coef_lookup = {word:coef for (word, coef) in zip(X_03_columns[53:], model_03.coef_[53:])}
word_coef_df.head(53)

Intercept: -305817.46820107056


Unnamed: 0,feature,coef
0,abandoned,18811.552756
1,ability,-2155.963706
2,able,5287.15748
3,academic,-12326.754105
4,academic achievement,1344.579139
5,academic performance,-3712.691886
6,academics,-718.161176
7,academy,11885.293589
8,accept,1177.323985
9,accept credits,284.413718


The next cell prints the coefficient for a specific word.

In [36]:
word_to_test = "hospital"

coef = word_coef_lookup[word_to_test]
print(f"Coefficient for \"{word_to_test}\" is {round(float(coef), 2)}")

Coefficient for "hospital" is -12461.12


Find out most and least valuable words

In [37]:
best_words = word_coef_df.sort_values(by=["coef"], ascending=False).head(10)
worst_words = word_coef_df.sort_values(by=["coef"], ascending=True).head(10)

df1_styler = best_words.style.set_table_attributes("style='display:inline'").set_caption('Most valuable words')
df2_styler = worst_words.style.set_table_attributes("style='display:inline'").set_caption('Least valuable words')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,feature,coef
245,beaver county,57364.360316
89,allegheny river,56513.683694
2553,show,45435.986526
244,beaver,41598.524499
457,championship,40184.904495
2412,room,37962.481172
210,avenue,37899.272778
1224,heights,37656.711097
1837,november,37627.733295
1278,hot,37451.768468

Unnamed: 0,feature,coef
486,city,-58594.790648
2947,upmc,-50493.21095
650,course,-48668.794594
680,current,-45603.26801
795,division,-43425.286607
580,concrete,-41037.953737
1861,officers,-40393.399551
1902,original,-40321.214081
1045,flows,-39828.675367
2982,village,-38132.760617


### Add article with highest word count (highest impact) for top negative/positive words

In [38]:
article_word_counts = pd.read_csv(PATH+"wikipedia/wikipedia_article_wordcounts.csv")
article_word_counts.head()

Unnamed: 0,article_title,abandoned,ability,able,academic,academic achievement,academic performance,academics,academy,accept,...,yielding,yielding student,york,york city,youghiogheny,youghiogheny river,young,youth,zip,zip code
0,Washington County Courthouse (Pennsylvania),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Wild Things Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Thackeray Hall,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Immaculate Heart of Mary Church (Pittsburgh),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,St. Stanislaus Kostka Church (Pittsburgh),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Show example article with all words which appear 40 times or more.

In [39]:
upmc = article_word_counts.loc[article_word_counts["article_title"]=="University of Pittsburgh Medical Center", :]

to_drop = []
for i, col in enumerate(upmc.columns[1:]):
    if upmc.iloc[0, i+1] < 40:
        to_drop.append(col)
        
upmc.drop(to_drop, axis=1)

Unnamed: 0,article_title,care,center,health,hospital,located,medical,pennsylvania,pittsburgh,presbyterian,university,university pittsburgh,upmc
415,University of Pittsburgh Medical Center,53,45,75,133,44,42,60,89,40,71,40,302


Most valuable and least valuable words with article in which word appears the most

In [40]:
for x in [best_words, worst_words]:
    x["article"] = ""
    for index, row in x.iterrows():
        highest_impact = article_word_counts.sort_values(by=[row["feature"]], ascending=False).iloc[0, 0]
        x.loc[index, "article"] = highest_impact

In [41]:
df1_styler = best_words.style.set_table_attributes("style='display:inline'").set_caption('Most valuable articles')
df2_styler = worst_words.style.set_table_attributes("style='display:inline'").set_caption('Least valuable articles')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,feature,coef,article
245,beaver county,57364.360316,Western Beaver County School District
89,allegheny river,56513.683694,List of crossings of the Allegheny River
2553,show,45435.986526,KDKA-TV
244,beaver,41598.524499,Western Beaver County School District
457,championship,40184.904495,1978 PGA Championship
2412,room,37962.481172,Nationality Rooms
210,avenue,37899.272778,List of Pittsburgh History and Landmarks Foundation Historic Landmarks
1224,heights,37656.711097,"Hickory Heights, Pennsylvania"
1837,november,37627.733295,KDKA (AM)
1278,hot,37451.768468,Essie's Original Hot Dog Shop

Unnamed: 0,feature,coef,article
486,city,-58594.790648,List of Pennsylvania state historical markers in Allegheny County
2947,upmc,-50493.21095,University of Pittsburgh Medical Center
650,course,-48668.794594,Oakmont Country Club
680,current,-45603.26801,WPXI
795,division,-43425.286607,University of Pittsburgh Medical Center
580,concrete,-41037.953737,Tribune Review Publishing Company Building
1861,officers,-40393.399551,2009 shooting of Pittsburgh police officers
1902,original,-40321.214081,Nationality Rooms
1045,flows,-39828.675367,Wheeling Creek (West Virginia)
2982,village,-38132.760617,Chatham Village (Pittsburgh)


### Determine best and worst articles

Create article value-score by calculating sum of all words multiplied with their coefficient for every article.

In [42]:
article_values = article_word_counts.copy()
article_values.insert(1, "article_value", [0]*article_word_counts.shape[0])
words = article_values.columns[2:]
for index, row in article_values.iterrows():
    counts = row.iloc[2:]
    article_values.loc[index, "article_value"] = sum([word_coef_lookup[word]*count for count, word in zip(counts, words)])

In [43]:
pos_sorted = article_values.loc[:, "article_title":"article_value"].sort_values(by=["article_value"], ascending=False).head(10)
neg_sorted = article_values.loc[:, "article_title":"article_value"].sort_values(by=["article_value"], ascending=True).head(10)

df1_styler = pos_sorted.style.set_table_attributes("style='display:inline'").set_caption('Most valuable articles')
df2_styler = neg_sorted.style.set_table_attributes("style='display:inline'").set_caption('Least valuable articles')

display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)

Unnamed: 0,article_title,article_value
630,Western Beaver County School District,14854666.699611
832,List of Pittsburgh History and Landmarks Foundation Historic Landmarks,11395009.524894
178,Nationality Rooms,9917410.091076
2104,Civic Arena (Pittsburgh),9355391.149236
2107,Cathedral of Learning,8134609.986609
384,Aliquippa School District,6477810.578063
1527,Carnegie Mellon University,6179414.671569
922,Duquesne Gardens,6061197.660101
1099,Squirrel Hill (Pittsburgh),5156216.597548
292,Beaver Area School District,5082244.809001

Unnamed: 0,article_title,article_value
415,University of Pittsburgh Medical Center,-21282160.891007
2194,UPMC Presbyterian,-5732025.028277
2376,"Allegheny, Pennsylvania",-5356281.201716
620,UPMC Hillman Cancer Center,-2683562.790981
2094,Schenley Park,-2665438.784811
2090,Heinz Field,-2448921.542804
1444,Sewickley Bridge,-2357613.645583
1836,"Washington, Pennsylvania",-2287872.051337
1729,"Butler, Pennsylvania",-2284078.600488
1980,Greenfield Bridge,-2242937.991736


Add coordinates back

In [44]:
with open(PATH+"wikipedia/wikipedia_selected.ndjson") as fin:
    data_loaded = json.load(fin)

In [45]:
coords = np.array(data_loaded)[:, 1]  # filter coord column
article_values.insert(2, "article_lat", [lat for lat, long in coords])
article_values.insert(2, "article_long", [long for lat, long in coords])
article_values = article_values[["article_title", "article_value", "article_long", "article_lat"]]  # remove word counts

Save articles with their value score and coordinates.

In [46]:
if WEIGHTING:
    article_values.to_csv(PATH+"wikipedia_article_values.csv", index=False)
else:
    article_values.to_csv(PATH+"wikipedia_article_values_noweight.csv", index=False)
article_values.head()

Unnamed: 0,article_title,article_value,article_long,article_lat
0,Washington County Courthouse (Pennsylvania),-144785.722639,-80.245803,40.17045
1,Wild Things Park,-777166.465832,-80.283611,40.154167
2,Thackeray Hall,-271053.91208,-79.957264,40.444317
3,Immaculate Heart of Mary Church (Pittsburgh),967259.759938,-79.967778,40.456389
4,St. Stanislaus Kostka Church (Pittsburgh),570096.079523,-79.983611,40.452322
