## HLCM 2 - with saved network vars
Arezoo Besharati, UrbanSim, July 2018 


In [1]:
# Move the working directory up two levels; presumably this is the repository
# root, so that later `from scripts import ...` cells resolve -- confirm.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel moves the working directory up two more levels.
import os; os.chdir('../../')
import numpy as np, pandas as pd 
import matplotlib.pyplot as plt
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and attribute-assignment warnings.
import warnings;
warnings.simplefilter('ignore')
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# executing code), so edits to imported modules are picked up automatically.
%load_ext autoreload
%autoreload 2

In [2]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca
import seaborn as sns
%matplotlib notebook

### Load data

In [3]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [4]:
#mm.list_steps()

In [5]:
#mm.initialize()

In [6]:
# Show the names of the tables currently registered with Orca.
orca.list_tables()
# Other registries that can be inspected the same way (left disabled):
#orca.list_broadcasts()
#orca.list_steps()

['parcels',
 'buildings',
 'craigslist',
 'rentals',
 'nodessmall',
 'nodeswalk',
 'units',
 'households',
 'persons',
 'jobs']

### Load accessibility vars

In [7]:
%%capture
# Run the registered Orca step 'initialize_network_walk'.
# The %%capture cell magic above suppresses everything the step prints.
orca.run(["initialize_network_walk"])

In [8]:
%%capture
# Run the registered Orca step 'initialize_network_small'.
# The %%capture cell magic above suppresses everything the step prints.
orca.run(["initialize_network_small"])

### Explore the data

In [9]:
# Materialize the Orca 'buildings' table as a DataFrame and show its columns.
bld = orca.get_table('buildings').to_frame()
list(bld.columns)

['parcel_id',
 'development_type_id',
 'improvement_value',
 'residential_units',
 'residential_sqft',
 'sqft_per_unit',
 'non_residential_sqft',
 'building_sqft',
 'nonres_rent_per_sqft',
 'res_price_per_sqft',
 'stories',
 'year_built',
 'redfin_sale_price',
 'redfin_sale_year',
 'redfin_home_type',
 'costar_property_type',
 'costar_rent',
 'building_type_id',
 'res_sqft_per_unit',
 'node_id_walk',
 'node_id_small']

In [21]:
# Summary statistics for the buildings frame, one row per variable.
bld.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
parcel_id,1824229.0,1012861.0,593806.5,26.0,495850.0,1002691.0,1532117.0,2054501.0
development_type_id,1824229.0,1.835162,3.44108,-1.0,1.0,1.0,1.0,24.0
improvement_value,1824229.0,318687.8,3571671.0,0.0,82675.03,167553.5,285342.3,3355514000.0
residential_units,1824229.0,1.534,6.137263,0.0,1.0,1.0,1.0,1912.0
residential_sqft,1824229.0,2657.113,30124.08,0.0,1230.0,1754.0,2400.0,14580000.0
sqft_per_unit,1824229.0,1669.294,1063.476,0.0,1176.0,1564.0,2029.424,30000.0
non_residential_sqft,1824229.0,1280.864,23922.26,0.0,0.0,0.0,0.0,16573100.0
building_sqft,1824229.0,3784.462,34945.68,0.0,1320.0,1858.0,2524.0,14580000.0
nonres_rent_per_sqft,1824229.0,0.9218572,4.053916,0.0,0.0,0.0,0.0,128.9625
res_price_per_sqft,1824229.0,292.7191,264.7712,0.0,151.8671,254.4594,369.9121,50161.09


In [10]:
# Building-type dummies (0/1) derived from building_type_id.
# Use item assignment: `bld.new_name = ...` does NOT create a DataFrame column.
# pandas only sets a plain Python attribute (and emits a UserWarning, which is
# silenced in this notebook), so the values never show up in `bld` itself.
bld['single_family'] = (bld['building_type_id'] == 1).astype(int)
bld['multi_family'] = (bld['building_type_id'] == 3).astype(int)
bld['mixed_use'] = (bld['building_type_id'] > 3).astype(int)
# Register the dummies as columns of the Orca 'buildings' table so that model
# steps reading the table through Orca can use them.
orca.add_column('buildings', 'single_family', bld['single_family'])
orca.add_column('buildings', 'multi_family', bld['multi_family'])
orca.add_column('buildings', 'mixed_use', bld['mixed_use'])

<orca.orca._SeriesWrapper at 0x1c3367e780>

In [None]:
# Re-read the 'buildings' table from Orca and preview the first rows.
# NOTE(review): presumably meant to confirm that the dummy columns registered
# with orca.add_column now appear in the frame -- this cell has no recorded
# execution, so confirm.
bld = orca.get_table('buildings').to_frame()
bld.head()

In [11]:
# Materialize the Orca 'households' table as a DataFrame and show its columns.
hh = orca.get_table('households').to_frame()
list(hh.columns)

['serialno',
 'persons',
 'building_type',
 'cars',
 'income',
 'race_of_head',
 'hispanic_head',
 'age_of_head',
 'workers',
 'state',
 'county',
 'tract',
 'block_group',
 'children',
 'tenure',
 'recent_mover',
 'block_group_id',
 'single_family',
 'unit_id',
 'building_id',
 'node_id_walk',
 'node_id_small']

In [35]:
# Scale income to thousands and build 0/1 household dummies.
# Use item assignment: `hh.new_name = ...` does NOT create a DataFrame column.
# pandas only sets a plain Python attribute (and emits a UserWarning, which is
# silenced in this notebook), so the values never show up in `hh` itself.
hh['income_k'] = hh['income'] / 1000
hh['white'] = (hh['race_of_head'] == 1).astype(int)
hh['black'] = (hh['race_of_head'] == 2).astype(int)
hh['asian'] = (hh['race_of_head'] == 6).astype(int)
hh['hisp'] = (hh['hispanic_head'] == 'yes').astype(int)
hh['single'] = (hh['persons'] == 1).astype(int)
hh['elderly'] = (hh['age_of_head'] > 65).astype(int)
# Register the derived variables as columns of the Orca 'households' table.
# Note that the local 'hisp' column is registered under the name 'hispanic',
# which is the name the model expression refers to.
orca.add_column('households', 'income_k', hh['income_k'])
orca.add_column('households', 'white', hh['white'])
orca.add_column('households', 'black', hh['black'])
orca.add_column('households', 'asian', hh['asian'])
orca.add_column('households', 'hispanic', hh['hisp'])
orca.add_column('households', 'elderly', hh['elderly'])
orca.add_column('households', 'single', hh['single'])


<orca.orca._SeriesWrapper at 0x1c3294a668>

In [None]:
#bld.building_type_id.value_counts()

In [17]:
# Pull both network-node tables out of Orca and print their column names,
# one list per line (walk network first, then the small network).
nodeswalk = orca.get_table('nodeswalk').to_frame()
nodessmall = orca.get_table('nodessmall').to_frame()
print(nodeswalk.columns.tolist(), nodessmall.columns.tolist(), sep='\n')

['units_500_walk', 'sqft_unit_500_walk', 'singles_500_walk', 'elderly_hh_500_walk', 'children_500_walk', 'units_sf_500_walk', 'units_mf_500_walk', 'pop_500_walk', 'hh_500_walk', 'poor_500_walk', 'rich_500_walk', 'renters_500_walk', 'avg_income_500_walk', 'jobs_500_walk', 'avg_rent_500_walk', 'pop_white_500_walk', 'pop_black_500_walk', 'pop_asian_500_walk', 'pop_hisp_500_walk', 'units_1500_walk', 'sqft_unit_1500_walk', 'singles_1500_walk', 'elderly_hh_1500_walk', 'children_1500_walk', 'units_sf_1500_walk', 'units_mf_1500_walk', 'pop_1500_walk', 'hh_1500_walk', 'poor_1500_walk', 'rich_1500_walk', 'renters_1500_walk', 'avg_income_1500_walk', 'jobs_1500_walk', 'avg_rent_1500_walk', 'pop_white_1500_walk', 'pop_black_1500_walk', 'pop_asian_1500_walk', 'pop_hisp_1500_walk', 'pop_2500_walk', 'pop_white_2500_walk', 'pop_black_2500_walk', 'pop_asian_2500_walk', 'pop_hisp_2500_walk']
['units_10000', 'units_sf_10000', 'units_mf_10000', 'pop_10000', 'hh_10000', 'poor_10000', 'renters_10000', 'med_i

In [20]:
# Summary statistics for the small-network node variables, one row per variable.
nodessmall.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
units_10000,30826.0,111656.076786,87205.925674,0.0,43964.5,96821.5,163121.75,406854.0
units_sf_10000,30826.0,60759.14066,37438.924477,0.0,31221.5,59398.0,90068.5,147995.0
units_mf_10000,30826.0,45984.058035,46023.648561,0.0,10558.0,32530.5,69672.0,205873.0
pop_10000,30826.0,259194.721534,187603.704474,0.0,107475.75,233979.0,387446.0,848002.0
hh_10000,30826.0,98975.977389,77820.151052,0.0,39254.5,85946.5,142956.75,368466.0
poor_10000,30826.0,24028.122072,21383.817215,0.0,8298.5,18126.5,33361.5,99261.0
renters_10000,30826.0,46257.82106,46188.036596,0.0,13271.0,32305.0,64805.0,216731.0
med_income_10000,30826.0,86880.749594,23418.171479,-1.0,75000.0,86100.0,101100.0,457450.0
jobs_10000,30826.0,118229.853954,107967.213133,0.0,28940.25,92305.0,176939.0,428602.0
med_rent_10000,30826.0,2516.381042,713.935921,-1.0,2225.0,2570.0,2854.75,7300.0


In [19]:
# Summary statistics for the walk-network node variables, one row per variable.
nodeswalk.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
units_500_walk,415716.0,356.008992,1097.037701,0.0,9.0,188.0,440.0,184995.0
sqft_unit_500_walk,415716.0,1245.544486,979.879938,0.0,458.24952,1315.6943,1761.837925,30000.0
singles_500_walk,415716.0,89.88732,323.945632,0.0,1.0,28.0,86.0,45986.0
elderly_hh_500_walk,415716.0,60.195042,240.503341,0.0,1.0,29.0,73.0,41032.0
children_500_walk,415716.0,161.779883,448.370273,0.0,4.0,97.0,233.0,81448.0
units_sf_500_walk,415716.0,163.916265,704.597755,0.0,2.0,98.0,247.0,136971.0
units_mf_500_walk,415716.0,165.668045,469.333758,0.0,0.0,2.0,159.0,48024.0
pop_500_walk,415716.0,802.340538,2292.176423,0.0,20.75,458.0,1082.0,397670.0
hh_500_walk,415716.0,315.202367,964.220014,0.0,7.0,168.0,392.0,162582.0
poor_500_walk,415716.0,81.44261,291.422578,0.0,1.0,27.0,83.0,42787.0


In [22]:
# 99th percentile of every walk-network variable (one value per column).
nodeswalk_upper = nodeswalk.quantile(q=0.99)
nodeswalk_upper

units_500_walk            2833.000000
sqft_unit_500_walk        3807.906545
singles_500_walk          1077.000000
elderly_hh_500_walk        492.000000
children_500_walk          861.000000
units_sf_500_walk          826.000000
units_mf_500_walk         2040.850000
pop_500_walk              5262.000000
hh_500_walk               2498.850000
poor_500_walk              795.000000
rich_500_walk              686.850000
renters_500_walk          1795.000000
avg_income_500_walk     240387.438000
jobs_500_walk             6957.000000
avg_rent_500_walk         5966.666500
pop_white_500_walk        2947.000000
pop_black_500_walk         669.000000
pop_asian_500_walk        1928.000000
pop_hisp_500_walk         1259.850000
units_1500_walk          26849.850000
sqft_unit_1500_walk       3280.493005
singles_1500_walk        10326.550000
elderly_hh_1500_walk      4599.850000
children_1500_walk        6648.000000
units_sf_1500_walk        6917.000000
units_mf_1500_walk       17540.550000
pop_1500_wal

In [25]:
# Cap each walk-network variable at its own 99th-percentile value.
# DataFrame.clip(upper=...) is used instead of DataFrame.clip_upper, which was
# deprecated in pandas 0.24 and removed in pandas 1.0. With axis=1 the Series
# of per-column caps is aligned with the frame's columns.
nodeswalk_clipped = nodeswalk.clip(upper=nodeswalk_upper, axis=1)
nodeswalk_clipped.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
units_500_walk,415716.0,331.453042,471.370454,0.0,9.0,188.0,440.0,2833.0
sqft_unit_500_walk,415716.0,1229.401188,880.509079,0.0,458.24952,1315.6943,1761.837925,3807.906545
singles_500_walk,415716.0,80.842835,158.518625,0.0,1.0,28.0,86.0,1077.0
elderly_hh_500_walk,415716.0,55.069781,80.730876,0.0,1.0,29.0,73.0,492.0
children_500_walk,415716.0,154.637466,182.76547,0.0,4.0,97.0,233.0,861.0
units_sf_500_walk,415716.0,154.361879,177.327368,0.0,2.0,98.0,247.0,826.0
units_mf_500_walk,415716.0,154.462641,331.596474,0.0,0.0,2.0,159.0,2040.85
pop_500_walk,415716.0,758.23397,968.1837,0.0,20.75,458.0,1082.0,5262.0
hh_500_walk,415716.0,293.967984,416.374896,0.0,7.0,168.0,392.0,2498.85
poor_500_walk,415716.0,73.68345,128.999045,0.0,1.0,27.0,83.0,795.0


In [31]:
# Register the clipped frame with Orca under the name 'nodeswalk'.
# NOTE(review): this reuses the name of the table the unclipped values were
# read from, so later orca.get_table('nodeswalk') calls presumably return the
# clipped values -- confirm.
orca.add_table('nodeswalk', nodeswalk_clipped)

<orca.orca.DataFrameWrapper at 0x1c3292fe48>

In [None]:
#bld.stories.value_counts().plot(kind='bar')
#bld.stories.value_counts()

## Get the predicted price from REPM

In [None]:
#orca.run(['hedonic_rent_sqft'])

## Model Estimation

In [36]:
%%time
# Configure and estimate a location choice model in which households
# (choosers) pick a building (choice column 'building_id').
m = LargeMultinomialLogitStep()
m.choosers = ['households']
# NOTE(review): the parcel and node tables are presumably merged onto
# 'buildings' to supply alternative attributes -- confirm against the
# template's documentation.
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
# Size of the alternative sample used for estimation.
m.alt_sample_size = 50

# Chooser filter: recent movers with income strictly between 0 and 600000,
# restricted to household_id < 500000.
m.chooser_filters = ['household_id <500000 & recent_mover == 1 & 0 <income < 600000']

# Alternative filters: keep rows where every logged variable is positive
# (residential_units, avg_income_500_walk) and drop zero-size units and
# buildings with year_built of 1800 or earlier.
m.alt_filters = ['residential_units > 0',
                 '0 < avg_income_500_walk < 500000',
                 'sqft_per_unit > 0',
                 'year_built > 1800']

# Utility specification. Terms of the form `a:b` are interactions between a
# household attribute and a neighborhood attribute; the trailing `- 1` removes
# the intercept. The backslashes continue the string literal, so the
# expression is a single line at runtime.
m.model_expression = 'np.log(residential_units) + \
np.log1p(res_price_per_sqft) + \
np.log1p(sqft_per_unit) + \
np.log1p(pop_1500_walk) + \
np.log1p(jobs_1500_walk) + \
np.log1p(jobs_25000) + \
np.log(income):np.log(avg_income_500_walk) + \
single:np.log1p(singles_1500_walk) + \
elderly:np.log1p(elderly_hh_1500_walk) + \
children:np.log1p(children_1500_walk) + \
white:np.log1p(pop_white_1500_walk) + \
black:np.log1p(pop_black_1500_walk) + \
asian:np.log1p(pop_asian_1500_walk) + \
hispanic:np.log1p(pop_hisp_1500_walk) \
- 1'

# Name and tags attached to the model step; fit() runs the estimation and
# prints the results table.
m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:         18,199
Model:         Multinomial Logit   Df Residuals:             18,185
Method:       Maximum Likelihood   Df Model:                     14
Date:                 2018-07-18   Pseudo R-squ.:             0.505
Time:                      22:52   Pseudo R-bar-squ.:         0.505
AIC:                  70,524.930   Log-Likelihood:      -35,248.465
BIC:                  70,634.257   LL-Null:             -71,194.907
                                                coef   std err          z     P>|z|   Conf. Int.
------------------------------------------------------------------------------------------------
np.log(residential_units)                     1.1037     0.009    128.006     0.000             
np.log1p(res_price_per_sqft)                 -0.5350     0.007    -72.748     0.000             
np.log1p(sqft_per_unit)                      -0.0786     0.013     -

In [None]:
%%time
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
m.alt_sample_size = 500

#Filters on choosers
m.chooser_filters = ['household_id <100000 & recent_mover == 1 & 0 <income < 600000']

#Filters on alternatives
m.alt_filters = ['residential_units!=0',
                 '0 < avg_income_500_walk < 300000',
                 'sqft_per_unit > 0',
                 'year_built > 1700',
                 'stories > 0']

m.model_expression = 'np.log1p(res_price_per_sqft)+\
np.log1p(jobs_1500_walk) + np.log(avg_income_500_walk) +np.log1p(jobs_25000)+\
np.log1p(pop_black_1500_walk)+ I(building_type_id==1) + year_built<1950 + year_built>2000+stories + np.log1p(sqft_per_unit) - 1'

m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
%%time
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
m.alt_sample_size = 500

#Filters on choosers
m.chooser_filters = ['household_id <100000 & recent_mover == 1 & 0 <income < 600000']

#Filters on alternatives
m.alt_filters = ['residential_units!=0',
                 '0 < avg_income_500_walk < 200000',
                 'sqft_per_unit > 0',
                 'year_built > 1700',
                 'stories > 0']

m.model_expression = 'np.log1p(res_price_per_sqft)+\
np.log1p(jobs_1500_walk) + np.log(avg_income_500_walk) +np.log1p(jobs_25000)+\
np.log1p(pop_black_1500_walk)+ I(building_type_id==1) + year_built<1950 + year_built>2000+stories  - 1'

m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
## Check for multicollinearity

In [None]:
# Merge the alternative-side tables into a single DataFrame, using
# 'buildings' as the merge target.
df = orca.merge_tables(target = 'buildings', tables = ['buildings','parcels','nodeswalk','nodessmall'])


In [None]:
# Pairwise correlations between candidate explanatory variables, drawn as an
# annotated heatmap on an 8x8-inch figure.
Selected_features = [
    'res_price_per_sqft',
    'pop_500_walk',
    'jobs_1500_walk',
    'avg_income_500_walk',
    'pop_black_1500_walk',
    'sqft_per_unit',
    'jobs_25000',
]
X = df[Selected_features]

plt.subplots(figsize=(8, 8))
sns.heatmap(data=X.corr(), annot=True, cmap="RdYlGn")
plt.show()