## HLCM 2 - with saved network vars
Arezoo Besharati, UrbanSim, July 2018 


In [None]:
# Change the working directory two levels up from the notebook's location
# (presumably to the repository root so that `from scripts import ...` below
# resolves -- TODO confirm).
# NOTE(review): the path is relative, so re-running this cell in a live kernel
# moves the working directory up two MORE levels each time.
import os; os.chdir('../../')
import numpy as np, pandas as pd 
import matplotlib.pyplot as plt
import warnings;
# Silence every Python warning for the rest of the session. This also hides
# pandas/numpy warnings raised by later cells.
warnings.simplefilter('ignore')
# Enable the autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca
import seaborn as sns
%matplotlib notebook

### Load data

In [3]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [None]:
#mm.list_steps()

In [None]:
#mm.initialize()

In [None]:
# Show the names of the tables currently registered with Orca; as the last
# expression of the cell, the result is displayed as the cell output.
orca.list_tables()
#orca.list_broadcasts()
#orca.list_steps()

### Load accessibility vars

In [4]:
%%capture
# Run the registered Orca step "initialize_network_walk". The %%capture cell
# magic swallows whatever the step prints, so this cell shows no output.
orca.run(["initialize_network_walk"])

In [5]:
%%capture
# Run the registered Orca step "initialize_network_small". The %%capture cell
# magic swallows whatever the step prints, so this cell shows no output.
orca.run(["initialize_network_small"])

### Explore the data

In [6]:
# Materialise the Orca 'buildings' table as a DataFrame and display the names
# of its columns.
bld = orca.get_table('buildings').to_frame()
list(bld.columns)

['parcel_id',
 'development_type_id',
 'improvement_value',
 'residential_units',
 'residential_sqft',
 'sqft_per_unit',
 'non_residential_sqft',
 'building_sqft',
 'nonres_rent_per_sqft',
 'res_price_per_sqft',
 'stories',
 'year_built',
 'redfin_sale_price',
 'redfin_sale_year',
 'redfin_home_type',
 'costar_property_type',
 'costar_rent',
 'building_type_id',
 'res_sqft_per_unit',
 'node_id_walk',
 'node_id_small']

In [7]:
# building_type dummies
# Use bracket assignment so the dummies become real DataFrame columns.
# `bld.single_family = ...` on a name that is not yet a column only sets a
# Python attribute on the DataFrame object (pandas emits a UserWarning for
# this, but warnings are silenced at the top of the notebook), so the values
# would not show up in `bld.columns`, `bld.head()`, copies or merges.
bld['single_family'] = bld['building_type_id'] == 1
bld['multi_family'] = bld['building_type_id'] == 3
bld['mixed_use'] = bld['building_type_id'] > 3
# add the columns
# Register each boolean Series as a column of the Orca 'buildings' table so
# later model expressions and filters can reference it by name.
orca.add_column('buildings', 'single_family', bld['single_family'])
orca.add_column('buildings', 'multi_family', bld['multi_family'])
orca.add_column('buildings', 'mixed_use', bld['mixed_use'])

<orca.orca._SeriesWrapper at 0x1c1cc63ac8>

In [None]:
# Re-read the Orca 'buildings' table into a fresh DataFrame and preview its
# first rows (presumably to check that the columns registered above now
# appear -- TODO confirm).
bld = orca.get_table('buildings').to_frame()
bld.head()

In [8]:
# Materialise the Orca 'households' table as a DataFrame and display the names
# of its columns.
hh = orca.get_table('households').to_frame()
list(hh.columns)

['serialno',
 'persons',
 'building_type',
 'cars',
 'income',
 'race_of_head',
 'hispanic_head',
 'age_of_head',
 'workers',
 'state',
 'county',
 'tract',
 'block_group',
 'children',
 'tenure',
 'recent_mover',
 'block_group_id',
 'single_family',
 'unit_id',
 'building_id',
 'node_id_walk',
 'node_id_small']

In [18]:
# scale income and create race dummies
# Use bracket assignment so the derived values become real DataFrame columns.
# `hh.income_k = ...` on a name that is not yet a column only sets a Python
# attribute on the DataFrame object (pandas emits a UserWarning for this, but
# warnings are silenced at the top of the notebook), so the values would not
# show up in `hh.columns`, `hh.head()`, copies or merges.
hh['income_k'] = hh['income'] / 1000
hh['white'] = hh['race_of_head'] == 1
hh['black'] = hh['race_of_head'] == 2
hh['asian'] = hh['race_of_head'] == 6
hh['hisp'] = hh['hispanic_head'] == 'yes'
# add the columns
# Register each Series as a column of the Orca 'households' table. Note that
# the local 'hisp' column is registered under the Orca name 'hispanic'.
orca.add_column('households', 'income_k', hh['income_k'])
orca.add_column('households', 'white', hh['white'])
orca.add_column('households', 'black', hh['black'])
orca.add_column('households', 'asian', hh['asian'])
orca.add_column('households', 'hispanic', hh['hisp'])

<orca.orca._SeriesWrapper at 0x1c21281ac8>

In [None]:
#bld.building_type_id.value_counts()

In [27]:
# Materialise both Orca node tables as DataFrames, then print the column
# names of each one (walk-network table first, small-network table second).
nodeswalk, nodessmall = (
    orca.get_table(table_name).to_frame()
    for table_name in ('nodeswalk', 'nodessmall')
)
for node_frame in (nodeswalk, nodessmall):
    print(node_frame.columns.tolist())

['units_500_walk', 'sqft_unit_500_walk', 'singles_500_walk', 'elderly_hh_500_walk', 'children_500_walk', 'units_sf_500_walk', 'units_mf_500_walk', 'pop_500_walk', 'hh_500_walk', 'poor_500_walk', 'rich_500_walk', 'renters_500_walk', 'avg_income_500_walk', 'jobs_500_walk', 'avg_rent_500_walk', 'pop_white_500_walk', 'pop_black_500_walk', 'pop_asian_500_walk', 'pop_hisp_500_walk', 'units_1500_walk', 'sqft_unit_1500_walk', 'singles_1500_walk', 'elderly_hh_1500_walk', 'children_1500_walk', 'units_sf_1500_walk', 'units_mf_1500_walk', 'pop_1500_walk', 'hh_1500_walk', 'poor_1500_walk', 'rich_1500_walk', 'renters_1500_walk', 'avg_income_1500_walk', 'jobs_1500_walk', 'avg_rent_1500_walk', 'pop_white_1500_walk', 'pop_black_1500_walk', 'pop_asian_1500_walk', 'pop_hisp_1500_walk', 'pop_2500_walk', 'pop_white_2500_walk', 'pop_black_2500_walk', 'pop_asian_2500_walk', 'pop_hisp_2500_walk']
['units_10000', 'units_sf_10000', 'units_mf_10000', 'pop_10000', 'hh_10000', 'poor_10000', 'renters_10000', 'med_i

In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) for every
# walk-network variable.
# NOTE(review): the recorded output shows a minimum of -1.0 for
# avg_rent_1500_walk -- presumably a missing-value sentinel; confirm before
# using that variable in a log transform.
nodeswalk.describe()

Unnamed: 0,units_500_walk,sqft_unit_500_walk,singles_500_walk,elderly_hh_500_walk,children_500_walk,units_sf_500_walk,units_mf_500_walk,pop_500_walk,hh_500_walk,poor_500_walk,...,avg_rent_1500_walk,pop_white_1500_walk,pop_black_1500_walk,pop_asian_1500_walk,pop_hisp_1500_walk,pop_2500_walk,pop_white_2500_walk,pop_black_2500_walk,pop_asian_2500_walk,pop_hisp_2500_walk
count,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,...,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0,415716.0
mean,356.008992,1245.544486,89.88732,60.195042,161.779883,163.916265,165.668045,802.340538,315.202367,81.44261,...,2544.887916,4185.756497,440.785231,2271.882773,1535.478952,22549.051672,11993.463444,1231.324772,6494.531692,4404.455121
std,1097.037701,979.879938,323.945632,240.503341,448.370273,704.597755,469.333758,2292.176423,964.220014,291.422578,...,1191.996469,6079.309178,980.706698,3078.791686,2156.729176,24397.165755,13934.759697,2429.74282,7756.703815,5093.802029
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.0,458.24952,1.0,1.0,4.0,2.0,0.0,20.75,7.0,1.0,...,2200.0,895.0,17.0,296.0,150.0,7446.75,3808.0,113.0,1408.0,768.0
50%,188.0,1315.6943,28.0,29.0,97.0,98.0,2.0,458.0,168.0,27.0,...,2617.0,3023.0,118.0,1302.0,840.0,17378.0,9132.0,430.0,4214.0,2818.0
75%,440.0,1761.837925,86.0,73.0,233.0,247.0,159.0,1082.0,392.0,83.0,...,3200.0,5636.0,390.0,3095.0,2076.0,30321.0,16124.0,1089.0,8783.25,6145.0
max,184995.0,30000.0,45986.0,41032.0,81448.0,136971.0,48024.0,397670.0,162582.0,42787.0,...,9800.0,326129.0,18039.0,34932.0,67438.0,406786.0,334616.0,21836.0,58877.0,67835.0


In [None]:
#bld.stories.value_counts().plot(kind='bar')
#bld.stories.value_counts()

## Get the predicted price from REPM

In [None]:
#orca.run(['hedonic_rent_sqft'])

## Model Estimation

In [26]:
%%time
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
m.alt_sample_size = 50

#Filters on choosers
m.chooser_filters = ['white == 1 & household_id <50000 & recent_mover == 1 & 0 <income < 600000']

#Filters on alternatives
m.alt_filters = ['residential_units > 0',
                 '0 < avg_income_500_walk < 500000',
                 'sqft_per_unit > 0',
                 'year_built > 1800']

m.model_expression = 'np.log(residential_units) + \
res_price_per_sqft + \
np.log(income):res_price_per_sqft + \
np.log1p(sqft_per_unit) + \
np.log1p(jobs_1500_walk) + \
np.log1p(jobs_25000) + \
np.log(income):np.log(avg_income_1500_walk) + \
np.log1p(pop_1500_walk) + \
np.log1p(pop_white_1500_walk) + \
np.log1p(pop_black_1500_walk) + \
np.log1p(pop_asian_1500_walk) + \
np.log1p(pop_hisp_1500_walk) \
- 1'

m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          1,217
Model:         Multinomial Logit   Df Residuals:              1,205
Method:       Maximum Likelihood   Df Model:                     12
Date:                 2018-07-18   Pseudo R-squ.:             0.758
Time:                      17:45   Pseudo R-bar-squ.:         0.755
AIC:                   2,329.923   Log-Likelihood:       -1,152.961
BIC:                   2,391.173   LL-Null:              -4,760.932
                                                  coef   std err         z     P>|z|   Conf. Int.
-------------------------------------------------------------------------------------------------
np.log(residential_units)                       1.0924     0.045    24.169     0.000             
res_price_per_sqft                             -0.0102     0.002    -5.040     0.000             
np.log(income):res_price_per_sqft               0.0008     0.000

In [None]:
%%time
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
m.alt_sample_size = 500

#Filters on choosers
m.chooser_filters = ['household_id <100000 & recent_mover == 1 & 0 <income < 600000']

#Filters on alternatives
m.alt_filters = ['residential_units!=0',
                 '0 < avg_income_500_walk < 300000',
                 'sqft_per_unit > 0',
                 'year_built > 1700',
                 'stories > 0']

m.model_expression = 'np.log1p(res_price_per_sqft)+\
np.log1p(jobs_1500_walk) + np.log(avg_income_500_walk) +np.log1p(jobs_25000)+\
np.log1p(pop_black_1500_walk)+ I(building_type_id==1) + year_built<1950 + year_built>2000+stories + np.log1p(sqft_per_unit) - 1'

m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
%%time
m = LargeMultinomialLogitStep()
m.choosers = ['households']
m.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m.choice_column = 'building_id'
m.alt_sample_size = 500

#Filters on choosers
m.chooser_filters = ['household_id <100000 & recent_mover == 1 & 0 <income < 600000']

#Filters on alternatives
m.alt_filters = ['residential_units!=0',
                 '0 < avg_income_500_walk < 200000',
                 'sqft_per_unit > 0',
                 'year_built > 1700',
                 'stories > 0']

m.model_expression = 'np.log1p(res_price_per_sqft)+\
np.log1p(jobs_1500_walk) + np.log(avg_income_500_walk) +np.log1p(jobs_25000)+\
np.log1p(pop_black_1500_walk)+ I(building_type_id==1) + year_built<1950 + year_built>2000+stories  - 1'

m.name = 'hlcm'
m.tags = ['arezoo', 'test']
m.fit()

In [None]:
## Check for multicollinearity

In [None]:
# Merge the building, parcel and both node tables into a single DataFrame,
# using 'buildings' as the merge target.
df = orca.merge_tables(
    target='buildings',
    tables=['buildings', 'parcels', 'nodeswalk', 'nodessmall'],
)


In [None]:
# Candidate explanatory variables whose pairwise correlations are inspected.
Selected_features = [
    'res_price_per_sqft',
    'pop_500_walk',
    'jobs_1500_walk',
    'avg_income_500_walk',
    'pop_black_1500_walk',
    'sqft_per_unit',
    'jobs_25000',
]
X = df.loc[:, Selected_features]

# Draw the annotated correlation matrix on an explicit 8x8-inch axes.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn", ax=ax)
plt.show()