# Household Location Choice Model (HLCM) for Single and Multi Family Housing  


Arezoo Besharati, Paul Waddell, UrbanSim, July 2018 

This notebook demonstrates the use of the LargeMultinomialLogit model template to construct, estimate, and evaluate a Household Location Choice Model for the San Francisco Bay Area.

In the process of developing the model, we also demonstrate some data checking and transformations to improve the model.

The model structure and specification are informed and limited by the available data, which is based on the data used by the Metropolitan Transportation Commission for their operational model.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preliminaries" data-toc-modified-id="Preliminaries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preliminaries</a></span><ul class="toc-item"><li><span><a href="#Load-data" data-toc-modified-id="Load-data-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Load data</a></span></li><li><span><a href="#Explore-the-data" data-toc-modified-id="Explore-the-data-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Explore the data</a></span></li><li><span><a href="#Create-a-Chooser-Filter-and-Tag-Their-Buildings" data-toc-modified-id="Create-a-Chooser-Filter-and-Tag-Their-Buildings-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Create a Chooser Filter and Tag Their Buildings</a></span></li><li><span><a href="#Add-Flag-to-Buildings-Table-Identifying-Chosen-Buildings" data-toc-modified-id="Add-Flag-to-Buildings-Table-Identifying-Chosen-Buildings-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Add Flag to Buildings Table Identifying Chosen Buildings</a></span></li><li><span><a href="#Data-Cleaning" data-toc-modified-id="Data-Cleaning-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Data Cleaning</a></span></li><li><span><a href="#Perform-desired-variable-creations-and-transformations¶" data-toc-modified-id="Perform-desired-variable-creations-and-transformations¶-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Perform desired variable creations and transformations¶</a></span></li><li><span><a href="#Check-Distributions" data-toc-modified-id="Check-Distributions-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>Check Distributions</a></span></li><li><span><a href="#Check-for-Multicollinearity" data-toc-modified-id="Check-for-Multicollinearity-1.8"><span class="toc-item-num">1.8&nbsp;&nbsp;</span>Check for Multicollinearity</a></span></li></ul></li><li><span><a href="#Large-Choice-Set-Single-Family" data-toc-modified-id="Large-Choice-Set-Single-Family-2"><span 
class="toc-item-num">2&nbsp;&nbsp;</span>Large Choice Set Single-Family</a></span></li><li><span><a href="#Large-Choice-Set-Multi-Family" data-toc-modified-id="Large-Choice-Set-Multi-Family-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Large Choice Set Multi-Family</a></span></li><li><span><a href="#Constrained-Choice-Set-Single_Family" data-toc-modified-id="Constrained-Choice-Set-Single_Family-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Constrained Choice Set Single_Family</a></span></li><li><span><a href="#Constrained-Choice-Set-Multi_Family" data-toc-modified-id="Constrained-Choice-Set-Multi_Family-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Constrained Choice Set Multi_Family</a></span></li><li><span><a href="#Model-Prediction" data-toc-modified-id="Model-Prediction-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Model Prediction</a></span></li></ul></div>

## Preliminaries

In [1]:
import os; os.chdir('../../')
import numpy as np, pandas as pd 
import matplotlib.pyplot as plt
import warnings;
warnings.simplefilter('ignore')
%load_ext autoreload
%autoreload 2

In [2]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca
import seaborn as sns
%matplotlib notebook

In [3]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [4]:
mm.initialize()

Loading model step 'hedonic_price_sqft_single_family'
Loading model step 'hedonic_price_sqft_multi_family'
Loading model step 'hedonic_rent_sqft'


### Load data

In [5]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [None]:
orca.list_tables()

In [None]:
orca.list_broadcasts()

In [None]:
mm.list_steps()

### Explore the data

In [None]:
nodeswalk = orca.get_table('nodeswalk').to_frame()
nodessmall = orca.get_table('nodessmall').to_frame()

In [None]:
nodeswalk.describe().transpose()

In [None]:
nodessmall.describe().transpose()

In [None]:
parcel = orca.get_table('parcels').to_frame()
parcel.columns

### Create a Chooser Filter and Tag Their Buildings

In [10]:
hh = orca.get_table('households').to_frame()
hh.columns.tolist()

['serialno',
 'persons',
 'building_type',
 'cars',
 'income',
 'race_of_head',
 'hispanic_head',
 'age_of_head',
 'workers',
 'state',
 'county',
 'tract',
 'block_group',
 'children',
 'tenure',
 'recent_mover',
 'block_group_id',
 'single_family',
 'unit_id',
 'building_id']

In [11]:
# Seed the global NumPy RNG so the household subsample is reproducible, then
# attach one uniform [0, 1) draw per household for later random filtering.
np.random.seed(12345)
n_households = len(hh)
hh['hh_random'] = np.random.uniform(low=0, high=1, size=n_households)

In [None]:
hh.head()

In [None]:
hh.describe()

In [None]:
hh.dtypes

In [12]:
# Single-family estimation sample: recent movers living in building_type 2
# with a positive income below $1,000,000.
sf_is_type_2 = hh['building_type'] == 2
sf_is_recent_mover = hh['recent_mover'] == 1
sf_income_in_range = (hh['income'] > 0) & (hh['income'] < 1000000)
hh_sf = hh[sf_is_type_2 & sf_is_recent_mover & sf_income_in_range]
len(hh_sf)

23839

In [None]:
len(hh_sf.building_id.unique())

In [13]:
# One row per building occupied by a single-family chooser, indexed by
# building_id and carrying a constant sf_choice_set flag of 1.
sf_building_ids = pd.Index(hh_sf.building_id.unique(), name='building_id')
sf_tmp = pd.DataFrame({'sf_choice_set': 1}, index=sf_building_ids)

In [14]:
# Multi-family estimation sample: a ~50% random subsample (hh_random < .5) of
# recent movers in building_type > 2 with a positive income below $500,000.
mf_is_type_gt_2 = hh['building_type'] > 2
mf_in_subsample = hh['hh_random'] < .5
mf_is_recent_mover = hh['recent_mover'] == 1
mf_income_in_range = (hh['income'] > 0) & (hh['income'] < 500000)
hh_mf = hh[mf_is_type_gt_2 & mf_in_subsample & mf_is_recent_mover & mf_income_in_range]
len(hh_mf)

38342

In [15]:
len(hh_mf.building_id.unique())

22757

In [16]:
# One row per building occupied by a multi-family chooser, indexed by
# building_id and carrying a constant mf_choice_set flag of 1.
mf_building_ids = pd.Index(hh_mf.building_id.unique(), name='building_id')
mf_tmp = pd.DataFrame({'mf_choice_set': 1}, index=mf_building_ids)

### Add Flag to Buildings Table Identifying Chosen Buildings

In [6]:
bld = orca.get_table('buildings').to_frame()
bld.columns

Index(['parcel_id', 'development_type_id', 'improvement_value',
       'residential_units', 'residential_sqft', 'sqft_per_unit',
       'non_residential_sqft', 'building_sqft', 'nonres_rent_per_sqft',
       'res_price_per_sqft', 'stories', 'year_built', 'redfin_sale_price',
       'redfin_sale_year', 'redfin_home_type', 'costar_property_type',
       'costar_rent', 'building_type_id', 'res_sqft_per_unit'],
      dtype='object')

In [7]:
bld.building_type_id.value_counts()

1.00     1522263
3.00      180723
16.00      28239
10.00      27083
0.00       18671
4.00       15802
7.00       10957
12.00       8940
8.00        8312
6.00        1921
5.00        1045
9.00         273
Name: building_type_id, dtype: int64

In [8]:
(bld.residential_units==1).sum()/len(bld)

0.7774243255643891

In [None]:
len(bld)

In [17]:
# Left-join the single-family flag onto buildings by index; buildings with no
# matching row in sf_tmp get NaN in sf_choice_set (filled in the next cell).
bld2 = bld.merge(sf_tmp, how='left', left_index=True, right_index=True)

In [18]:
# Buildings that matched no single-family chooser are flagged 0.
bld2['sf_choice_set'] = bld2['sf_choice_set'].fillna(0)
bld2['sf_choice_set'].value_counts()

0.00    1802831
1.00      21398
Name: sf_choice_set, dtype: int64

In [19]:
# Attach the multi-family flag the same way: left-join on the index, then
# treat unmatched buildings as 0.
bld3 = bld2.merge(mf_tmp, left_index=True, right_index=True, how='left')
bld3['mf_choice_set'] = bld3['mf_choice_set'].fillna(0)
bld3['mf_choice_set'].value_counts()

0.00    1801473
1.00      22756
Name: mf_choice_set, dtype: int64

In [20]:
# Rebind bld to the frame that carries both sf_choice_set and mf_choice_set.
# NOTE(review): after this rebinding, re-running the merge cells above would
# merge the flags onto a frame that already has them — restart the kernel
# before re-running this section.
bld = bld3

In [None]:
#orca.add_table('buildings', bld3)

In [None]:
parcel.describe().transpose()

In [None]:
bld.describe().transpose()

In [None]:
# Build one wide frame of building alternatives by merging the buildings table
# with parcels and both node tables through orca.
df = orca.merge_tables(target = 'buildings', tables = ['buildings','parcels','nodeswalk','nodessmall'])

In [None]:
df.describe().transpose()

In [None]:
#hh_f.dtypes

In [None]:
# hh_f was referenced here without ever being defined, so this cell raised
# NameError on a fresh kernel. Define it explicitly before merging.
# NOTE(review): the filter below is taken from the commented-out definition of
# hh_f in the last cell of this notebook — confirm it is the intended sample.
hh_f = hh[(hh['building_type'] > 2) & (hh['hh_random'] < .2) & (hh['recent_mover'] == 1)
          & (hh['income'] > 0) & (hh['income'] < 500000)]

# Attach the attributes of each household's current building.
hh_f_data = hh_f.merge(df, left_on='building_id', right_index=True)

In [None]:
hh_f_data.describe().transpose()

In [None]:
hh_f.head()

In [None]:
#bld.groupby('building_type_id')['residential_units'].sum()

In [None]:
#bld.describe().transpose()

In [None]:
#bld.quantile(.99)

In [None]:
#bld[['residential_units','residential_sqft', 'non_residential_sqft']].sum()

In [None]:
#bld.building_type_id.value_counts()

In [None]:
# hh_sf=hh[hh['building_type']==2]
# hh_mf=hh[hh['building_type']!=2]

### Data Cleaning

In [21]:
# Pull fresh copies of both node tables out of orca so the derived ratio
# variables in the next cell are computed on the registered data.
nodeswalk = orca.get_table('nodeswalk').to_frame()
nodessmall = orca.get_table('nodessmall').to_frame()
#print(nodeswalk.columns.tolist())
#print(nodessmall.columns.tolist())

In [22]:
# Population-to-jobs ratios; +1 in the denominator avoids division by zero
# for nodes with no jobs in range.
# Fix: the 1500 m ratio previously divided by jobs_500_walk, mixing a 1500 m
# numerator with a 500 m denominator. Both terms now use the 1500 m radius.
# NOTE(review): assumes nodeswalk has a jobs_1500_walk column (it is used in
# the multicollinearity check further down) — confirm.
nodeswalk['pop_jobs_ratio_1500_walk'] = nodeswalk['pop_1500_walk'] / (nodeswalk['jobs_1500_walk'] + 1)
nodessmall['pop_jobs_ratio_10000'] = nodessmall['pop_10000'] / (nodessmall['jobs_10000'] + 1)
nodessmall['pop_jobs_ratio_25000'] = nodessmall['pop_25000'] / (nodessmall['jobs_25000'] + 1)

# Average household size within walking distance; +1 again guards against
# nodes with zero households.
nodeswalk['avg_hhs_500_walk'] = nodeswalk['pop_500_walk'] / (nodeswalk['hh_500_walk'] + 1)
nodeswalk['avg_hhs_1500_walk'] = nodeswalk['pop_1500_walk'] / (nodeswalk['hh_1500_walk'] + 1)


In [23]:
# Re-register both node tables under their existing names so the derived
# columns added above are visible to orca.
orca.add_table('nodeswalk', nodeswalk)
orca.add_table('nodessmall', nodessmall)

<orca.orca.DataFrameWrapper at 0x1c4f9a0b00>

In [None]:
#nodessmall.describe().transpose()

In [None]:
# nodessmall_upper = nodessmall.quantile(.99)
#nodessmall_upper

In [None]:
# nodessmall_clipped = nodessmall.clip_upper(nodessmall_upper, axis=1) 
# nodessmall_clipped.describe().transpose()
# orca.add_table('nodessmall', nodessmall_clipped);

In [None]:
# nodeswalk.describe().transpose()
# nodeswalk_upper = nodeswalk.quantile(.99)
#nodeswalk_upper

In [None]:
# nodeswalk_clipped = nodeswalk.clip_upper(nodeswalk_upper, axis=1) 
# nodeswalk_clipped.describe().transpose()
# orca.add_table('nodeswalk', nodeswalk_clipped);

### Perform desired variable creations and transformations¶

In [24]:
# scale income and create race dummies
# --- Household variables ---------------------------------------------------
# Bracket assignment is used throughout: `hh.income_k = ...` on a name that is
# not yet a column only sets a Python attribute on the DataFrame object, which
# pandas warns about and which does not survive a copy or slice.
hh['income_k'] = hh['income'] / 1000
hh['white'] = (hh['race_of_head'] == 1).astype(int)
hh['black'] = (hh['race_of_head'] == 2).astype(int)
hh['asian'] = (hh['race_of_head'] == 6).astype(int)
hh['hisp'] = (hh['hispanic_head'] == 'yes').astype(int)
hh['single'] = (hh['persons'] == 1).astype(int)
hh['elderly'] = (hh['age_of_head'] > 65).astype(int)
hh['rich'] = (hh['income'] > 150000).astype(int)
hh['poor'] = (hh['income'] < 40000).astype(int)
hh['has_children'] = (hh['children'] > 0).astype(int)

# --- Building variables ----------------------------------------------------
bld['single_family'] = (bld['building_type_id'] == 1).astype(int)
bld['multi_family'] = (bld['building_type_id'] == 3).astype(int)
bld['mixed_use'] = (bld['building_type_id'] > 3).astype(int)
bld['two_four_stories'] = ((bld['stories'] > 1) & (bld['stories'] < 5)).astype(int)
# Fix: the original test `(stories > 5) & (stories < 7)` matched only 6-story
# buildings, so 5-story buildings fell into no height class at all.
bld['five_six_stories'] = ((bld['stories'] >= 5) & (bld['stories'] < 7)).astype(int)
bld['sevenplus_stories'] = (bld['stories'] > 6).astype(int)
bld['yrblt_2000'] = (bld['year_built'] > 2000).astype(int)
# Height class interacted with "built after 2000".
bld['two_four_new'] = (bld['yrblt_2000'] * bld['two_four_stories']).astype(int)
bld['five_six_new'] = (bld['yrblt_2000'] * bld['five_six_stories']).astype(int)
bld['sevenplus_new'] = (bld['yrblt_2000'] * bld['sevenplus_stories']).astype(int)
bld['three_plus_stories'] = (bld['stories'] > 2).astype(int)

# --- Register the columns with orca ----------------------------------------
# Maps orca column name -> column name in hh (they differ only for 'hispanic').
household_columns = {
    'income_k': 'income_k',
    'white': 'white',
    'black': 'black',
    'asian': 'asian',
    'hispanic': 'hisp',
    'elderly': 'elderly',
    'rich': 'rich',
    'poor': 'poor',
    'has_children': 'has_children',
    'single': 'single',
    'hh_random': 'hh_random',
}
for orca_name, frame_name in household_columns.items():
    orca.add_column('households', orca_name, hh[frame_name])

# Same set of building columns the original cell registered
# (sevenplus_stories is only an intermediate and is not registered).
building_columns = [
    'single_family', 'multi_family', 'mixed_use',
    'two_four_stories', 'five_six_stories', 'yrblt_2000',
    'sf_choice_set', 'mf_choice_set',
    'two_four_new', 'five_six_new', 'sevenplus_new',
    'three_plus_stories',
]
for column_name in building_columns:
    orca.add_column('buildings', column_name, bld[column_name])

<orca.orca._SeriesWrapper at 0x1c272774e0>

In [25]:
testhh = orca.get_table('households').to_frame().head()
testhh

Unnamed: 0_level_0,serialno,persons,building_type,cars,income,race_of_head,hispanic_head,age_of_head,workers,state,...,white,black,asian,hispanic,elderly,rich,poor,has_children,single,hh_random
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2010000487191,1,6.0,1.0,85000.0,1,no,47,1.0,6,...,1,0,0,0,0,0,0,0,1,0.93
1,2013000554587,1,9.0,1.0,27000.0,6,no,52,1.0,6,...,0,0,1,0,0,0,1,0,1,0.32
2,2011001140920,2,2.0,1.0,6000.0,6,no,60,1.0,6,...,0,0,1,0,0,0,1,0,0,0.18
3,2012001376432,1,3.0,1.0,28000.0,8,yes,51,1.0,6,...,0,0,0,1,0,0,1,0,1,0.2
4,2013000186929,1,6.0,1.0,10000.0,1,no,64,0.0,6,...,1,0,0,0,0,0,1,0,1,0.57


In [26]:
testbld = orca.get_table('buildings').to_frame().head()
testbld

Unnamed: 0_level_0,parcel_id,development_type_id,improvement_value,residential_units,residential_sqft,sqft_per_unit,non_residential_sqft,building_sqft,nonres_rent_per_sqft,res_price_per_sqft,...,mixed_use,two_four_stories,five_six_stories,yrblt_2000,sf_choice_set,mf_choice_set,two_four_new,five_six_new,sevenplus_new,three_plus_stories
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,742974,1.0,0.0,1.0,2029.0,2029.42,0.0,2029.42,0.0,302.77,...,0,0,0,0,0.0,0.0,0,0,0,0
2,744961,1.0,0.0,1.0,2029.0,2029.42,0.0,2029.42,0.0,254.43,...,0,0,0,0,0.0,0.0,0,0,0,0
3,1442641,1.0,53262.87,1.0,1568.0,1568.0,0.0,1568.0,0.0,183.47,...,0,0,0,0,0.0,0.0,0,0,0,0
7,328712,1.0,0.0,1.0,1393.0,1393.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0.0,0.0,0,0,0,0
9,742661,1.0,116580.0,1.0,1018.0,1018.0,0.0,1018.0,0.0,474.35,...,0,0,0,0,0.0,0.0,0,0,0,0


### Check Distributions

In [None]:
# hh_income_upper = hh_singlefamily.income.quantile(0.99)
# len(hh[hh['income']>hh_income_upper])

In [None]:
# %matplotlib notebook
# hh_sf.income.plot(kind='hist',bins=100)
# print(hh_sf.income.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
# %matplotlib notebook
# hh_mf.income.plot(kind='hist',bins=100)
# print(hh_mf.income.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
# %matplotlib notebook
# ax = sns.boxplot(x = hh_sf.income,palette="Set2", width=0.5)

In [None]:
# %matplotlib notebook
# ax = sns.boxplot(x = hh_mf.income,palette="Set2", width=0.5)

In [None]:
# Re-merge the alternatives (now including the derived columns) and split
# them into single-unit and multi-unit buildings.
alternative_tables = ['buildings', 'parcels', 'nodeswalk', 'nodessmall']
df = orca.merge_tables(target='buildings', tables=alternative_tables)
unit_counts = df['residential_units']
df_sf = df[unit_counts == 1]
df_mf = df[unit_counts > 1]

In [None]:
# %matplotlib notebook
# df_sf.avg_income_1500_walk.plot(kind='hist',bins=100)
# plt.show()
# print(df_sf.avg_income_1500_walk.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
# %matplotlib notebook
# df_mf.avg_income_1500_walk.plot(kind='hist',bins=100)
# plt.show()
# print(df_mf.avg_income_1500_walk.quantile([0.10,0.5,0.9,0.99,1]))

In [None]:
#%matplotlib notebook
#(df_mf.prop_black_500_walk).plot(kind='hist',bins=100)
#(df_mf.prop_asian_500_walk).plot(kind='hist',bins=100)
#(df_mf.prop_white_500_walk).plot(kind='hist',bins=100)
#(df_mf.prop_hisp_500_walk).plot(kind='hist',bins=100)


### Check for Multicollinearity

In [None]:
# %matplotlib inline
# plt.subplots(figsize=(20, 20))


# # Create correlation matrix
# corr_matrix = df_sf.corr().abs()

# # Select upper triangle of correlation matrix
# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# high_corr = [column for column in upper.columns if any(upper[column] > 0.98)]
# X= df_sf[high_corr]
# sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")

In [None]:
# %matplotlib inline
# plt.subplots(figsize=(20, 20))

# corr_matrix = df_mf.corr()

# upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
# high_corr = [column for column in upper.columns if any(upper[column] > 0.95)]
# X= df_mf[high_corr]
# plt.subplots(figsize=(20, 20))
# sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")

In [None]:
%matplotlib inline

selected_features = ['residential_units','res_price_per_sqft', 'pop_1500_walk', 'jobs_1500_walk',
                     'prop_singles_500_walk','prop_elderly_500_walk','prop_children_500_walk',
                     'avg_income_1500_walk', 'sqft_per_unit',
                     'jobs_25000','prop_white_500_walk','prop_black_500_walk','prop_asian_500_walk',
                     'prop_hisp_500_walk','prop_rich_500_walk','prop_poor_500_walk']

X = df_mf[selected_features]

plt.subplots(figsize=(12, 12))
sns.heatmap(X.corr(), annot=True, cmap="RdYlGn")
plt.show()

# Model Estimation

In [31]:
# Fetch the previously registered rent hedonic step and point it at the
# merged alternative tables; its output is written to the 'rent_sqft' column
# for rows passing the out_filters condition.
mrent = mm.get_step('hedonic_rent_sqft')
mrent.tables = ['buildings', 'parcels', 'nodessmall', 'nodeswalk']
mrent.out_filters = ['residential_units > 0']
mrent.out_column = 'rent_sqft'
mrent.tables

['buildings', 'parcels', 'nodessmall', 'nodeswalk']

In [29]:
# Create a zero-filled rent_sqft column on buildings and re-register the
# table, so the hedonic step has a column to write its output into.
bld = orca.get_table('buildings').to_frame().assign(rent_sqft=0)
orca.add_table('buildings', bld)

<orca.orca.DataFrameWrapper at 0x1c2050c400>

In [32]:
mrent.run()

## Large Choice Set Single-Family 

In [None]:
%%time
# Single-family location choice model estimated against a random sample of
# alternatives drawn from all single-unit buildings.
m1 = LargeMultinomialLogitStep()
m1.choosers = ['households']
m1.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m1.choice_column = 'building_id'
# Number of alternatives sampled per chooser
m1.alt_sample_size = 50

#Filters on choosers
m1.chooser_filters = ['building_type == 2 & recent_mover == 1 & 0 <income < 1000000']

#Filters on alternatives
m1.alt_filters = ['residential_units == 1',
                 '0 < avg_income_500_walk < 500000',
                 'sqft_per_unit > 0']


# Utility specification; the trailing "- 1" drops the intercept.
# NOTE(review): np.log(res_sqft_per_unit) is not guarded by an alternative
# filter on res_sqft_per_unit (only sqft_per_unit > 0) — confirm it is always
# positive for the filtered alternatives.
m1.model_expression = ' \
np.log1p(rent_sqft) + \
np.log(income):np.log1p(rent_sqft) + \
persons:np.log(res_sqft_per_unit) + \
np.log1p(acres) + \
pop_jobs_ratio_25000 + \
persons:avg_hhs_500_walk + \
rich:prop_rich_500_walk + \
poor:prop_poor_500_walk + \
single:prop_singles_500_walk + \
elderly:prop_elderly_500_walk + \
white:prop_white_500_walk + \
black:prop_black_500_walk + \
asian:prop_asian_500_walk + \
hispanic:prop_hisp_500_walk\
- 1'

# NOTE(review): the multi-family large-choice-set step below is also named
# 'hlcm'; the names would collide if both were registered.
m1.name = 'hlcm'
m1.tags = ['single_family', 'test']
m1.fit()

In [None]:
# Number of choosers that pass m1's chooser filters. The original referenced
# an undefined name `m` for the tables and filters; use m1 throughout.
len(m1._get_df(tables=m1.choosers, filters=m1.chooser_filters))

## Large Choice Set Multi-Family 

In [None]:
%%time
# Multi-family location choice model estimated against a random sample of
# alternatives drawn from all multi-unit buildings passing the filters below.
m2 = LargeMultinomialLogitStep()
m2.choosers = ['households']
m2.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m2.choice_column = 'building_id'
# Number of alternatives sampled per chooser
m2.alt_sample_size = 50

#Filters on choosers
m2.chooser_filters = ['building_type > 2 &\
                      recent_mover == 1 &\
                      hh_random < .5 & \
                      persons < 8 & \
                      workers < 4 & \
                      0 <income < 500000']

#Filters on alternatives
m2.alt_filters = ['residential_units > 1',
                 '0 < avg_income_500_walk < 500000',
                 '0 < rent_sqft < 1000',
                 'pop_1500_walk > 0',
                 'res_price_per_sqft < 1500',
                 'res_sqft_per_unit < 6000',
                 'residential_units < 1000',
                 'sqft_per_unit > 0']


# Utility specification; the trailing "- 1" drops the intercept.
m2.model_expression = ' np.log(residential_units) + \
yrblt_2000:np.log(residential_units) + \
year_built + \
np.log1p(rent_sqft) + \
np.log(income):np.log1p(rent_sqft) + \
np.log1p(income):np.log1p(res_sqft_per_unit) + \
np.log1p(units_500_walk) + \
np.log1p(jobs_25000) + \
rich:prop_rich_500_walk + \
poor:prop_poor_500_walk + \
single:prop_singles_500_walk + \
elderly:prop_elderly_500_walk + \
white:prop_white_500_walk + \
black:prop_black_500_walk + \
asian:prop_asian_500_walk + \
hispanic:prop_hisp_500_walk\
- 1'

# NOTE(review): the single-family large-choice-set step above is also named
# 'hlcm'; the names would collide if both were registered.
m2.name = 'hlcm'
m2.tags = ['multi_family','test']
m2.fit()

In [None]:
len(m2._get_df(tables=m2.choosers, filters=m2.chooser_filters))

## Constrained Choice Set Single_Family

In [33]:
%%time
# Single-family model with a constrained choice set: alternatives are limited
# to buildings flagged sf_choice_set == 1.
m3 = LargeMultinomialLogitStep()
m3.choosers = ['households']
m3.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m3.choice_column = 'building_id'
# Number of alternatives sampled per chooser
m3.alt_sample_size = 50

#Filters on choosers
m3.chooser_filters = ['building_type == 2 & recent_mover == 1 & 0 <income < 1000000']

# Filters on alternatives
m3.alt_filters = ['sf_choice_set == 1 & sqft_per_unit > 0']

# Terms tried and dropped from the specification:
# np.log(residential_units) +
# np.log(income):np.log(avg_income_500_walk) + \
# np.log1p(income):np.log1p(rich_1500_walk) + \
# np.log1p(income):np.log1p(poor_1500_walk) + \
# np.log1p(persons):np.log1p(sqft_per_unit) + \

#np.log1p(res_price_per_sqft) + \
#np.log1p(income):np.log1p(res_price_per_sqft) + \


# Utility specification; the trailing "- 1" drops the intercept.
# NOTE(review): np.log(acres) and np.log(res_sqft_per_unit) are not guarded by
# an alternative filter on those columns — confirm both are strictly positive
# for the filtered alternatives.
m3.model_expression = ' \
np.log1p(rent_sqft) + \
np.log(income):np.log1p(rent_sqft) + \
np.log1p(income):np.log(res_sqft_per_unit) + \
np.log1p(income):np.log(acres) + \
pop_jobs_ratio_25000 + \
np.log(jobs_25000+1) + \
persons:avg_hhs_500_walk + \
rich:prop_rich_500_walk + \
poor:prop_poor_500_walk + \
single:prop_singles_500_walk + \
elderly:prop_elderly_500_walk + \
white:prop_white_500_walk + \
black:prop_black_500_walk + \
asian:prop_asian_500_walk + \
hispanic:prop_hisp_500_walk\
- 1'

m3.name = 'hlcm_constrained_sf'
m3.tags = ['single_family', 'constrained']
m3.fit()

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:         22,458
Model:         Multinomial Logit   Df Residuals:             22,443
Method:       Maximum Likelihood   Df Model:                     15
Date:                 2018-07-24   Pseudo R-squ.:             0.108
Time:                      14:29   Pseudo R-bar-squ.:         0.108
AIC:                 156,704.699   Log-Likelihood:      -78,337.350
BIC:                 156,824.990   LL-Null:             -87,856.213
                                                coef   std err         z     P>|z|   Conf. Int.
-----------------------------------------------------------------------------------------------
np.log1p(rent_sqft)                          -4.2286     0.176   -24.071     0.000             
np.log(income):np.log1p(rent_sqft)            0.3546     0.014    25.486     0.000             
np.log1p(income):np.log(res_sqft_per_unit)   -0.0290     0.001   -25.673

In [34]:
# register the model
m3.register()

Saving 'hlcm_constrained_sf.yaml': /Users/paul/src/urbansim_parcel_bayarea/configs
Loading model step 'hlcm_constrained_sf'


In [None]:
len(m3._get_df(tables=m3.choosers, filters=m3.chooser_filters))

## Constrained Choice Set Multi_Family

In [None]:
%%time
# Multi-family model with a constrained choice set: alternatives are limited
# to buildings flagged mf_choice_set == 1, plus the range filters below.
m4 = LargeMultinomialLogitStep()
m4.choosers = ['households']
m4.alternatives = ['buildings','parcels','nodeswalk','nodessmall']
m4.choice_column = 'building_id'
# Number of alternatives sampled per chooser
m4.alt_sample_size = 50

#Filters on choosers
m4.chooser_filters = ['building_type > 2 & \
                      hh_random < .5 & \
                      recent_mover == 1 & \
                      persons < 8 & \
                      workers < 4 & \
                      0 <income < 500000']

#Filters on alternatives
m4.alt_filters = ['residential_units > 1',
                 '0 < avg_income_500_walk < 500000',
                 '0 < rent_sqft < 1000',
                 'res_sqft_per_unit < 6000',
                  'sqft_per_unit > 0',
                  'residential_units < 1000',
                 'mf_choice_set == 1']

# Utility specification; the trailing "- 1" drops the intercept.
m4.model_expression = ' np.log(residential_units) + \
yrblt_2000:np.log(residential_units) + \
year_built + \
np.log1p(rent_sqft) + \
np.log(income):np.log1p(rent_sqft) + \
np.log1p(income):np.log1p(res_sqft_per_unit) + \
np.log1p(units_500_walk) + \
np.log1p(jobs_25000) + \
rich:prop_rich_500_walk + \
poor:prop_poor_500_walk + \
single:prop_singles_500_walk + \
elderly:prop_elderly_500_walk + \
white:prop_white_500_walk + \
black:prop_black_500_walk + \
asian:prop_asian_500_walk + \
hispanic:prop_hisp_500_walk\
- 1'

m4.name = 'hlcm_constrained_mf'
m4.tags = ['multi_family','constrained', 'hlcm']
m4.fit()

In [None]:
# register the model
m4.register()

In [None]:
tmp = m4.mergedchoicetable.to_frame()

In [None]:
tmp.head()

In [None]:
tmp.shape

In [None]:
# Rows in the merged choice table divided by alternatives per chooser gives
# the number of observations. Use the step's own alt_sample_size instead of a
# hard-coded 50 so the check stays correct if the sample size is changed.
tmp.shape[0] / m4.alt_sample_size

In [None]:
len(m4._get_df(tables=m4.choosers, filters=m4.chooser_filters))

In [None]:
chosen = tmp[tmp['chosen']==1]

In [None]:
tmp_d = chosen.describe().transpose()

In [None]:
# number of choosers/agents/households/observations
# `m_mf` is not defined anywhere in this notebook; the multi-family step
# estimated in this section is m4.
len(m4._get_df(tables=m4.choosers, filters=m4.chooser_filters))

In [None]:
# Fitted coefficients of the constrained multi-family step (was the undefined
# name `m_mf`).
m4.fitted_parameters
#or, once the step has been registered:
#mm.get_step('hlcm_constrained_mf').fitted_parameters

## Model Prediction

In [None]:
# Prediction filters for the constrained multi-family step (m4).
# The original cell set the chooser filter on m3 (a single-family step) and
# the alternative filter on an undefined name `m`, so it raised NameError and
# would not have configured the multi-family model in any case.
m4.out_chooser_filters = ['building_type > 2 &\
                          hh_random < .2 &\
                          recent_mover == 1 &\
                          0 <income < 500000']

# Multi-family choosers must be offered multi-unit buildings; the original
# 'residential_units == 1' was the single-family alternative filter.
# NOTE(review): confirm 'residential_units > 1' is the intended prediction set.
m4.out_alt_filters = ['residential_units > 1',
                      '0 < avg_income_500_walk < 500000',
                      'sqft_per_unit > 0']

In [None]:
%%time
m_mf.run()

In [None]:
# Shape and first rows of the predicted choice probabilities (was the
# undefined name `m_mf`).
print(m4.probabilities.shape)
m4.probabilities.head()

In [None]:
# All counts are taken from the multi-family step m4 (was the undefined
# name `m_mf`).

### number of observations/choosers
print(len(m4.probabilities.observation_id.unique()))
### or
#len(m4.choices)

### number of unique alternatives
print(len(m4.probabilities.building_id.unique()))

### number of alternatives
print(len(m4.probabilities.building_id))

In [None]:
# summed probability 

# Total predicted probability per building, summed over all choosers that
# sampled it (was the undefined name `m_mf`).
predict_df = m4.probabilities.groupby('building_id')['probability'].sum().to_frame()
predict_df.head()

In [None]:
plt.hist(predict_df['probability'],bins= 100);


In [None]:
# Check that choices are plausible
choices = pd.DataFrame(m_mf.choices)
df = pd.merge(m_mf.probabilities, choices, left_on='observation_id', right_index=True)
df['chosen'] = 0
df.loc[df.building_id == df.choice, 'chosen'] = 1
print(df.head())

In [None]:
print(np.corrcoef(df.probability, df.chosen))

In [None]:
### join predicted df and df 
#hh_f = hh[(hh['building_type'] > 2) & (hh['hh_random'] < .2) & (hh['recent_mover'] == 1)\
#        & (hh['income'] > 0) & (hh['income'] < 500000)]
             
#df = orca.merge_tables(target = 'buildings', tables = ['buildings','parcels','nodeswalk','nodessmall'])
  
#hh_f_data = hh_f.merge(df, left_on='building_id', right_index=True)
#hh_f_data.columns.tolist()

#predict= pd.merge(predict_df,hh_f_data, left_index=True,right_on='building_id',how='left', sort=False)
#predict[['probability','building_id']].head()

#predict_2= pd.merge(predict_df,df, left_index=True,right_index=True,how='left', sort=False)
#predict_2.head()