# Processing GIS database

This notebook processes the downloaded and decompressed GIS database and should therefore be run __after__ `download_wikipedia.ipynb`. It creates the distance to the nearest POI (point of interest) for each category as well as spatially weighted text-based features.

In [1]:
import os
import gc
import json
import re
from collections import Counter
import warnings
warnings.filterwarnings(action="ignore")

from tqdm.notebook import tqdm
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from geopy import distance

import multiprocessor_gis

Define constants.

- ``PATH``: Path to the base data folder
- ``COORD_RANGE_LAT``: latitude coordinate range of Allegheny County
- ``COORD_RANGE_LONG``: longitude coordinate range of Allegheny County
- ``CPU_CORES``: how many cpu cores to use, default = all
- ``MAX_DIST_DUP``: maximum distance (in meters) for duplicate detection
- ``COUNT_RADIUS``: radius (in meters) around each house within which POIs are counted for the ``_count`` features

In [2]:
# Base data folder. Defaults to the original author's local dataset location;
# set the WIKIPEDIA_REAL_ESTATE_PATH environment variable to run the notebook
# on another machine without editing this cell.
# os.path.join(..., "") appends a trailing separator only when one is missing,
# which matters because other cells build file names by plain string
# concatenation (PATH + "file.csv").
PATH = os.path.join(
    os.environ.get(
        "WIKIPEDIA_REAL_ESTATE_PATH",
        "C:/Users/Tim/.keras/datasets/wikipedia_real_estate/",
    ),
    "",
)

# Coordinate range of Allegheny County as (lower bound, upper bound).
COORD_RANGE_LAT = (40.000000, 40.870000)
COORD_RANGE_LONG = (-80.550000, -79.500000)

# Use every available core. os.cpu_count() returns None when the count cannot
# be determined, so fall back to a single core instead of propagating None.
CPU_CORES = os.cpu_count() or 1

MAX_DIST_DUP = 8  # in meters
COUNT_RADIUS = 3500  # in meters

Read GIS and structured data.

In [3]:
# Load both preprocessed tables from the base data folder in one pass:
# first the GIS points of interest, then the structured house records.
gis_categories, structured_df = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ("gis_preprocessed.csv", "structured_preprocessed.csv")
)

Create distance features by calculating, for every house, the distance to the nearest POI of each category and counting how many POIs lie within a radius of ``COUNT_RADIUS`` meters.

In [4]:
# Build the per-category distance/count features for every house record,
# handing the work to the project's multiprocessing helper.
structured_gis_features = multiprocessor_gis.process_dist_features(
    gis_categories,
    structured_df,
    COUNT_RADIUS,
    CPU_CORES,
)

# Preview the first ten rows as the cell output.
structured_gis_features.head(10)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




Unnamed: 0,_id,PROPERTYZIP,MUNICODE,SCHOOLCODE,NEIGHCODE,LOTAREA,SALEDATE,SALEPRICE,FAIRMARKETTOTAL,STORIES,...,apartment_buildings_dist,apartment_buildings_count,faith-based_facilities_dist,faith-based_facilities_count,restaurants_dist,restaurants_count,community_nonprofit_orgs_dist,community_nonprofit_orgs_count,bus_stops_dist,bus_stops_count
0,161705,15122,870,45,87005,10899,05-01-2018,145000,76700,1.0,...,698.578,6,1433.26,16,366.337,63,164.152,61,126.418,150
1,530852,15146,879,18,87905,10691,05-13-2019,139997,106200,1.0,...,643.38,20,560.748,16,196.673,88,182.641,91,294.568,96
2,144978,15202,826,2,82601,11813,05-26-2017,170000,135300,1.0,...,238.974,7,345.638,8,172.83,18,64.5752,65,37.4453,134
3,436602,15202,803,29,80302,5324,06-06-2017,145000,117300,2.0,...,168.986,48,512.919,32,412.677,90,452.318,126,240.476,324
4,145066,15218,114,47,11403,3600,04-09-2016,325000,250000,2.0,...,505.765,126,471.672,89,193.478,189,120.957,431,129.273,810
5,145137,15228,926,26,92607,6406,04-30-2015,172900,137300,2.0,...,256.35,65,543.08,38,162.341,160,181.196,270,635.006,242
6,145246,15241,950,42,95001,38376,12-17-2015,817000,751600,2.0,...,837.21,25,365.915,31,881.556,71,68.2141,177,1151.48,210
7,529513,15132,409,23,40005,3844,01-09-2020,39000,45100,1.0,...,1584.62,21,834.017,31,59.9739,70,296.289,127,917.228,166
8,146103,15212,127,47,12703,5284,06-30-2016,65000,52800,1.5,...,619.8,42,573.588,56,209.715,137,238.605,237,127.744,674
9,146155,15212,127,47,12701,5544,11-10-2018,162000,111200,1.0,...,565.37,45,414.481,39,586.429,89,249.813,152,339.433,520


Save structured + GIS features.

In [5]:
# The radius is embedded in the file name so runs with different
# COUNT_RADIUS values do not overwrite each other.
features_csv_path = PATH + f"structured_gis_category_features_{COUNT_RADIUS}_radius.csv"
structured_gis_features.to_csv(features_csv_path, index=False)