In [1]:
import os
import gc
import json
import re
from collections import Counter
import warnings
warnings.filterwarnings(action="ignore")

from tqdm.notebook import tqdm

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import PCA

import gensim
assert gensim.models.doc2vec.FAST_VERSION > -1
from gensim.utils import simple_preprocess
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.models.callbacks import CallbackAny2Vec

import pandas as pd
tqdm.pandas()
import matplotlib.pyplot as plt
import numpy as np
import pickle
from geopy import distance

import multiprocessor_wiki

Define constants.

- ``PATH``: Path to the base data folder
- ``CPU_CORES``: How many cores to use to process data, default=all
- ``MAX_DIST``: Maximum radius around each house for Wikipedia articles to be considered for text features

In [2]:
# Base data folder. Defaults to the original author's local directory; set the
# WIKI_REAL_ESTATE_PATH environment variable to point at your own copy of the data.
PATH = os.environ.get("WIKI_REAL_ESTATE_PATH") or "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/"
if not PATH.endswith(("/", "\\")):
    PATH += "/"  # file names are appended with plain string concatenation (PATH + "...")
# Number of cores used for processing; os.cpu_count() may return None, so fall back to 1.
CPU_CORES = os.cpu_count() or 1
# Maximum radius around each house for Wikipedia articles to be considered
# (unit is whatever multiprocessor_wiki expects -- TODO confirm).
MAX_DIST = 5500

Load data.

In [3]:
# The file carries an ".ndjson" extension but is parsed here as one single JSON document.
with open(PATH + "wikipedia/wikipedia_selected_usa.ndjson") as usa_file:
    data_loaded = json.load(usa_file)

In [4]:
# The loaded records come in as positional columns 0..6; give them readable names.
df = pd.DataFrame(data_loaded)
df = df.rename(columns=dict(enumerate(
    ["name", "coords", "templates", "text", "wikilinks", "ext_links", "text_length"]
)))
print(df.shape)
df.head(5)

(319666, 7)


Unnamed: 0,name,coords,templates,text,wikilinks,ext_links,text_length
0,"Barrington Passage, Nova Scotia","[43.5275, -65.609167]","{'name': 'Barrington Passage', 'settlement_typ...",Barrington Passage is a community in the Canad...,"[Country, Province, County, Shelburne County, ...",[http://www.gov.ns.ca/finance/communitycounts/...,1959
1,University of Wisconsin–Milwaukee College of E...,"[43.0758, -87.8859]",{'image': '< !-- Do NOT place a non-free image...,The College of Engineering and Applied Science...,"[Public university, Brett Peters, Milwaukee, W...",[http://www4.uwm.edu/ceas/explore_ceas/about_c...,3362
2,University of Wisconsin–Milwaukee School of Ed...,"[43.075722222222225, -87.87899999999999]",{'image': '< !-- Do NOT place a non-free image...,The University of Wisconsin–Milwaukee School o...,"[Public university, Milwaukee, Wisconsin, Unit...","[http://www4.uwm.edu/soe, http://grad-schools....",2123
3,Standin' on the Corner Park,"[35.02348333333333, -110.6980638888889]","{'name': 'Standin' on the Corner Park', 'photo...",Standin' on the Corner Park is a public park i...,"[Jackson Browne, Glenn Frey, Take It Easy, Win...","[http://www.ronadamson.com, http://standinonth...",3652
4,Whitacre College of Engineering,"[33.587172, -101.876017]",{'name': 'Edward E. Whitacre Jr. < br > Colleg...,thumb|right|228px|Engineering Key \n The Edwar...,"[Albert Sacco, Lubbock, Texas, Texas, Image:TT...",[http://www.irim.ttu.edu/FactBook/Enrollment/E...,15762


## Train Doc2Vec model

In [5]:
# custom_stopwords = stopwords.words("english")
# custom_stopwords.extend(["ref", "ref ref", "ref name", "name", "also", "per", "one", "two",
#                          "three", "four", "five", "six", "seven", "eight", "nine", "ten",
#                          "eleven", "twelve"])
# token = RegexpTokenizer(r'[a-zA-Z][a-zA-Z]+')  # only keep words with minimum length of 2

# def preprocess(text):
#     string = "[[Category:Washington, Pennsylvania]]"
#     re.sub(r"([cC]ategory:)([a-zA-z]+)", r"\1 \2", string)
#     text = re.sub(r"([cC]ategory:)([a-zA-z]+)", r"\1 \2", text)
#     return token.tokenize(text)

In [6]:
# Quick look at what gensim's simple_preprocess does to a toy sentence. In the output below
# the one-letter tokens ("a", "i") are gone, the comma is removed, and the apostrophe splits
# "te'st" into two tokens.
simple_preprocess("this is a test, i want to te'st the simple  preprocessing")

['this',
 'is',
 'test',
 'want',
 'to',
 'te',
 'st',
 'the',
 'simple',
 'preprocessing']

In [7]:
# data = [TaggedDocument(simple_preprocess(row[4]), [row[1]]) for row in tqdm(df.itertuples(), total=df.shape[0])]
# model = Doc2Vec(dm=0, dbow_words=1, vector_size=300, window=8, min_count=15, epochs=10, workers=CPU_CORES)
# model.build_vocab(data)

In [8]:
class MonitorCallback(CallbackAny2Vec):
    """Training callback that prints a progress line after every finished epoch."""

    def __init__(self, epochs):
        """Remember the total number of epochs and start counting at epoch 1.

        Args:
            epochs: Total number of training epochs, used only for the progress message.
        """
        self._epochs = epochs
        self._current_epoch = 1

    def on_epoch_end(self, model):
        """Print "Epoch i/n finished!" and advance the epoch counter.

        Args:
            model: The model being trained (not used by this callback).
        """
        done, total = self._current_epoch, self._epochs
        print(f"Epoch {done}/{total} finished!")
        self._current_epoch = done + 1

In [9]:
# monitor = MonitorCallback(epochs=model.epochs)
# model.train(data, total_examples=model.corpus_count, epochs=model.epochs, report_delay=1, callbacks=[monitor])
# print(f"Finished training in {model.total_train_time // 60} minutes")

In [10]:
# The training cells above are commented out, so this cell depends on a model file that was
# written to "models/doc2vec_usa" (relative to the working directory) by an earlier run of:
# model.save("models/doc2vec_usa")
# NOTE(review): loading presumably unpickles the file -- only load model files you trust.
model = Doc2Vec.load("models/doc2vec_usa")

Test model with example

In [39]:
# Query used by the similarity check in the next cell.
# Alternative query kept for reference (a list of words instead of a single word):
# target_word = ["donald", "trump"]
target_word = "church"

In [40]:
# Print a short header, then the ten words the model ranks as most similar to the query.
print(f'target_word: {target_word!r} \nmodel: {model} \nsimilar words:')
neighbours = model.wv.most_similar(target_word, topn=10)
for rank, (word, sim) in enumerate(neighbours, start=1):
    print(f'    {rank}. {sim:.2f} {word!r}')

target_word: 'church' 
model: Doc2Vec(dbow+w,d300,n5,w8,mc15,s0.001,t12) 
similar words:
    1. 0.75 'christ'
    2. 0.74 'episcopal'
    3. 0.73 'presbyterian'
    4. 0.72 'methodist'
    5. 0.70 'congregation'
    6. 0.70 'baptist'
    7. 0.70 'congregational'
    8. 0.69 'anglican'
    9. 0.65 'reformed'
    10. 0.65 'brethren'


Load Wikipedia articles from Allegheny County.

In [13]:
# Read the second article selection and name its positional columns 0..6.
# Note that `data_loaded` is rebound here and no longer refers to the USA-wide records.
with open(PATH + "wikipedia/wikipedia_selected.ndjson") as allegh_file:
    data_loaded = json.load(allegh_file)
df_allegh = pd.DataFrame(data_loaded)
df_allegh = df_allegh.rename(columns=dict(enumerate(
    ["name", "coords", "templates", "text", "wikilinks", "ext_links", "text_length"]
)))
print(df_allegh.shape)

(2407, 7)


Save vectors for each document.

In [14]:
# Append each article's document vector (looked up by article name) to its row, then label
# the new columns vec_1 .. vec_<vector_size>.
# This cell rebinds df_allegh, so run it only once per load of the frame.
org_cols = df_allegh.columns
df_allegh = df_allegh.apply(
    lambda row: [*row, *model.docvecs[row["name"]]],
    axis=1,
    result_type="expand",
)
df_allegh.columns = [*org_cols, *(f"vec_{dim}" for dim in range(1, model.vector_size + 1))]

In [15]:
# Preview the first rows: the original article columns followed by the vec_* columns.
df_allegh.head(5)

Unnamed: 0,name,coords,templates,text,wikilinks,ext_links,text_length,vec_1,vec_2,vec_3,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,vec_300
0,Washington County Courthouse (Pennsylvania),"[40.170449999999995, -80.24580277777778]","{'name': 'Washington County Courthouse', 'desi...",The Washington County Courthouse is located in...,"[Washington, Pennsylvania, Frederick J. Osterl...",[http://www.washcolandmarks.com/landmark_regis...,1257,0.161066,0.052886,0.203199,...,-0.069492,0.105272,0.491999,-0.096069,0.054004,-0.144695,0.161227,-0.11278,-0.205241,-0.22418
1,Wild Things Park,"[40.15416666666667, -80.28361111111111]","{'stadium_name': 'Wild Things Park', 'nickname...","Wild Things Park is a 3,200-seat multi-purpose...","[File:Falconi Field, now known as CONSOL Energ...",[http://www.consolenergy.com/Powering/Partners...,4461,0.442984,0.196251,0.183512,...,0.02707,0.179715,-0.076711,0.755452,-0.297641,0.21191,-0.295192,-0.125475,0.180635,0.388374
2,Thackeray Hall,"[40.444316666666666, -79.95726388888889]","{'name': 'Thackeray Hall', 'nrhp_type': 'cp', ...",Thackeray Hall is an academic building of the ...,"[University of Pittsburgh, Schenley Farms Hist...",[https://web.archive.org/web/20090628170337/ht...,2357,-0.267731,-0.179268,0.140842,...,0.216866,-0.217405,-0.017986,-0.142208,0.230511,-0.157332,-0.322223,0.292737,-0.265031,-0.256753
3,Immaculate Heart of Mary Church (Pittsburgh),"[40.45638888888889, -79.96777777777778]","{'name': 'Immaculate Heart of Mary Church', 'n...","Immaculate Heart of Mary Church in Pittsburgh,...","[William P. Ginther, Polish language, Roman Ca...",[http://www.phlf.org/wp-content/uploads/2007/0...,2978,-0.44717,-0.121726,0.323693,...,-0.045491,-0.110203,-0.283853,0.073682,0.109183,0.021509,-0.531077,0.104394,0.081341,0.071251
4,St. Stanislaus Kostka Church (Pittsburgh),"[40.45232222222222, -79.98361111111112]",{'name': 'St. Stanislaus Kostka Roman Catholic...,"St. Stanislaus Kostka Church in Pittsburgh, Pe...","[Pittsburgh, Pennsylvania, Frederick C. Sauer,...",[http://www.phlf.org/wp-content/uploads/2007/0...,5254,-0.262681,-0.112132,0.269356,...,-0.34859,-0.536347,0.014992,-0.10873,0.216571,-0.072938,-0.022283,-0.006919,-0.327681,0.134902


## Create feature set

Load structured data.

In [16]:
# Load the structured table from the base data folder and show its (rows, columns) shape.
df_structured = pd.read_csv(PATH + "structured_preprocessed.csv")  # read structured data
df_structured.shape

(9556, 64)

Create text-based features: for every house, select all articles within a MAX_DIST radius, weight their doc2vec vectors by distance, and take the mean of every dimension.

In [17]:
# Hand the worker function frames whose index runs from 0 to n-1. The original cell reset
# the index of `df` (the USA-wide article frame), which is not passed to the call below, so
# that reset never affected the inputs. Reset the two frames that are actually used instead.
# reset_index(drop=True) returns a new frame, so re-running this cell does not mutate state.
df_structured_d2v = multiprocessor_wiki.process_doc2vec_features(
    df_structured.reset_index(drop=True),
    df_allegh.reset_index(drop=True),
    MAX_DIST,
    CPU_CORES,
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [18]:
# Write the feature table to the base data folder without the index column. MAX_DIST is part
# of the file name, so results for different radii end up in different files.
df_structured_d2v.to_csv(PATH + f"structured_wiki_doc2vec_features_{MAX_DIST}.csv", index=False)
# Preview the first ten rows of the result.
df_structured_d2v.head(10)

Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,vec_291,vec_292,vec_293,vec_294,vec_295,vec_296,vec_297,vec_298,vec_299,vec_300
0,161705,15122,870,45,87005,10899,05-01-2018,145000.0,76700,1.0,...,-0.483311,-0.050547,-0.177805,2.347651,-1.14481,1.111483,-1.024357,-1.978401,-0.451744,-1.93748
1,530852,15146,879,18,87905,10691,05-13-2019,139997.0,106200,1.0,...,0.30184,1.596993,-2.395855,-1.785833,-0.184289,-0.122471,-1.649159,0.155087,1.968305,-0.268778
2,144978,15202,826,2,82601,11813,05-26-2017,170000.0,135300,1.0,...,-0.511462,-0.129936,-1.47964,0.385174,2.670413,0.12204,0.353143,-1.40687,-1.019049,-1.444391
3,436602,15202,803,29,80302,5324,06-06-2017,145000.0,117300,2.0,...,-0.690288,-0.870494,-2.281473,0.152488,1.446694,-0.072112,-0.652558,-1.224214,1.813835,-2.564284
4,145066,15218,114,47,11403,3600,04-09-2016,325000.0,250000,2.0,...,-0.240091,-0.576042,-0.966035,1.219164,1.073587,-0.108186,-0.715042,0.260503,0.980314,-1.047018
5,145137,15228,926,26,92607,6406,04-30-2015,172900.0,137300,2.0,...,-1.42896,1.248975,-0.44948,0.52414,-0.208572,0.005876,0.154637,0.175364,1.595531,-0.535725
6,145246,15241,950,42,95001,38376,12-17-2015,817000.0,751600,2.0,...,-0.705347,1.169243,-1.915193,-0.442949,0.845908,-0.747879,0.011046,-0.199865,0.472289,-1.127833
7,529513,15132,409,23,40005,3844,01-09-2020,39000.0,45100,1.0,...,-0.728345,0.048441,-1.024781,0.36833,-0.722843,-1.074531,-2.381515,-0.289282,2.584451,-2.335266
8,146103,15212,127,47,12703,5284,06-30-2016,65000.0,52800,1.5,...,-1.04878,-0.511019,-0.976804,1.873559,0.809905,-1.080543,-1.262615,-1.61362,1.208577,-1.387948
9,146155,15212,127,47,12701,5544,11-10-2018,162000.0,111200,1.0,...,-1.243793,-1.868891,-2.400657,0.4937,2.417276,-0.141308,-1.128099,-0.037223,1.665028,-0.965511
