In [1]:
import pandas as pd
import numpy as np

from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
from urbansim_templates.utils import get_data
from choicemodels import MultinomialLogit
from choicemodels.tools import MergedChoiceTable

import orca
# NOTE: the chdir target is relative, so re-running this cell without restarting
# the kernel moves the working directory up one more level each time.
import os; os.chdir('../')
# Silence every warning for the rest of the session.
import warnings;warnings.simplefilter('ignore')

In [2]:
from urbansim.utils import misc

### Load data

In [3]:
# Storage format and location of the input tables.
data_mode = 'csv'
local_data_dir = '/home/data/spring_2019/base/'

# Logical table name -> CSV file name (joined onto `local_data_dir` when read).
csv_fnames = dict(
    parcels='parcels.csv',
    buildings='buildings.csv',
    jobs='jobs.csv',
    establishments='establishments.csv',
    households='households.csv',
    persons='persons.csv',
    rentals='craigslist.csv',
    units='units.csv',
    skims='mtc_skims.csv',
    drive_nodes='drive_nodes.csv',
    drive_edges='drive_edges.csv',
    drive_access_vars='drive_net_vars.csv',
    walk_nodes='walk_nodes.csv',
    walk_edges='walk_edges.csv',
    walk_access_vars='walk_net_vars.csv',
)
# Register the settings as orca injectables, in a fixed order.
# Presumably these are read by the project's `scripts` modules — TODO confirm.
for injectable_name, injectable_value in [
    ('data_mode', data_mode),
    ('csv_fnames', csv_fnames),
    ('store', None),
    ('s3_input_data_url', None),
    ('local_data_dir', local_data_dir),
]:
    orca.add_injectable(injectable_name, injectable_value)

In [4]:
from scripts import datasources, models, variables

Registering model step 'auto_ownership'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'ELCM_finance'
Registering model step 'WLCM'


In [5]:
# Run the two network-initialisation orca steps, in this order.
# Presumably both steps are registered by the `scripts` import — TODO confirm.
orca.run(['initialize_network_small','initialize_network_walk'])

Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.00 s
Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 0.00 s
Total time to execute iteration 1 with iteration value None: 0.00 s


In [6]:
# Network accessibility variables were computed and saved earlier; load them
# from CSV instead of recomputing.
walk_vars_path = local_data_dir + csv_fnames['walk_access_vars']
drive_vars_path = local_data_dir + csv_fnames['drive_access_vars']

walk_net_vars = pd.read_csv(walk_vars_path, index_col='osmid')
drive_net_vars = pd.read_csv(drive_vars_path, index_col='osmid')

# Expose both frames as orca tables. The second call stays as the cell's final
# expression, so the cell still displays the returned table wrapper.
orca.add_table('nodeswalk', walk_net_vars)
orca.add_table('nodessmall', drive_net_vars)

<orca.orca.DataFrameWrapper at 0x7f6cb6fa0080>

### Get observations

In [7]:
# Load the saved model step from the model manager.
# NOTE(review): the variable is named `elcm_retail`, but the step loaded is
# 'ELCM_finance' — confirm which sector this notebook is meant to cover.
elcm_retail = mm.get_step('ELCM_finance')

In [8]:
elcm_retail.choice_column

'building_id'

In [9]:
elcm_retail.choosers

'establishments'

In [10]:
elcm_retail.chooser_filters

['sector_id == 52']

In [11]:
elcm_retail.model_expression

'np.log1p(avg_income_500_walk) + nonres_rent_per_sqft*np.log1p(jobs_10000) + np.log1p(jobs_500_walk)*np.log1p(jobs_10000_retail) + np.log1p(jobs_2500_walk_finance) + np.log1p(units_mf_1500_walk) + np.log1p(units_sf_10000) + np.log1p(land_value)*nonres_rent_per_sqft + nonres_rent_per_sqft/np.log1p(sales_vol) + np.log1p(nonres_sqft_2500)/np.log1p(sqft_2500) + np.log1p(pop_10000)'

In [109]:
# Build the chooser (observation) table from the step's own settings, carrying
# the choice column along as an extra column.
chooser_query = dict(
    tables=elcm_retail.choosers,
    filters=elcm_retail.chooser_filters,
    model_expression=elcm_retail.model_expression,
    extra_columns=elcm_retail.choice_column,
)
obs = get_data(**chooser_query)

In [110]:
obs.head()

Unnamed: 0_level_0,building_id,sector_id,sales_vol
establishment_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
23,1200840,52,18642.0
54,1200551,52,1434.0
55,1200840,52,1434.0
105,78135,52,0.0
106,1291617,52,0.0


### Get alternatives

In [14]:
elcm_retail.alternatives

['buildings', 'parcels', 'nodessmall', 'nodeswalk']

In [41]:
elcm_retail.alt_filters + ['node_id_small == node_id_small']

['0 < jobs_25000 < 1000000', 'node_id_small == node_id_small']

In [16]:
elcm_retail.model_expression

'np.log1p(avg_income_500_walk) + nonres_rent_per_sqft*np.log1p(jobs_10000) + np.log1p(jobs_500_walk)*np.log1p(jobs_10000_retail) + np.log1p(jobs_2500_walk_finance) + np.log1p(units_mf_1500_walk) + np.log1p(units_sf_10000) + np.log1p(land_value)*nonres_rent_per_sqft + nonres_rent_per_sqft/np.log1p(sales_vol) + np.log1p(nonres_sqft_2500)/np.log1p(sqft_2500) + np.log1p(pop_10000)'

In [42]:
# The self-comparison filter is there to drop rows with a missing node_id_small
# (NaN never equals itself) — presumably evaluated as a pandas query; TODO confirm.
alt_filters = elcm_retail.alt_filters + ['node_id_small == node_id_small']

# Build the alternatives table, keeping node_id_small so that node-level data
# can be merged on afterwards.
alts = get_data(
    tables=elcm_retail.alternatives,
    filters=alt_filters,
    model_expression=elcm_retail.model_expression,
    extra_columns='node_id_small',
)

In [43]:
# Move the named index (building_id) into a regular column so it survives the
# merge. The guard makes the cell safe to re-run: once the index has been reset
# it is an unnamed RangeIndex, and resetting again would insert a stray 'index'
# column into the alternatives.
if alts.index.name is not None:
    alts.reset_index(inplace=True)

In [44]:
alts.head()

Unnamed: 0,building_id,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk
0,1,110407631,146582.0,5706.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0
1,2,407087343,146582.0,429.0,0.0,106636.341155,92884.0,21043.0,15628.0,423541600.0,49805.0,2911.0,111884117.0,7319.0,48024.0
2,3,65545753,1.0,23662.13,0.0,151178.224806,669839.0,231023.0,109907.0,49491670.0,459476.0,32814.0,8552002.0,124.0,2490.0
3,7,1695636890,693.0,0.0,0.0,164983.076923,628975.0,132509.0,34929.0,17982100.0,143267.0,19094.0,8212156.0,61.0,435.0
4,9,110407631,146582.0,413329.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0


In [45]:
alts.shape

(1824150, 15)

### Load node-based location quotients for retail

In [21]:
# Node-level location quotients; only the node id (osmid) and retail_lq are read.
# The path is relative to the working directory set by os.chdir('../') in the
# first cell.
lq = pd.read_csv('data/nodes_w_establishment_lq.csv', usecols=['osmid', 'retail_lq'])

In [22]:
lq.head()

Unnamed: 0,osmid,retail_lq
0,281266,0.312298
1,302878,0.4288
2,302883,0.298295
3,302888,0.842069
4,25457926,2.898093


In [23]:
lq.shape

(29012, 2)

### Merge location quotients to alts

In [24]:
len(alts['node_id_small'].unique())

22265

In [25]:
# Number of alternative rows whose node has no record in `lq`. These rows have
# no match in the default (inner) merge that follows, so they are dropped there.
len(alts[~alts['node_id_small'].isin(lq['osmid'])])

896

In [104]:
# Attach each alternative's node-level retail LQ and index the result by
# building. The join is inner (the pandas default, spelled out here), so
# alternatives whose node has no LQ record are dropped.
lq_by_node = lq[['osmid', 'retail_lq']]
alts2 = (
    alts
    .merge(lq_by_node, how='inner', left_on='node_id_small', right_on='osmid')
    .set_index('building_id')
)

In [105]:
alts2.head()

Unnamed: 0_level_0,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk,osmid,retail_lq
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,110407631,146582.0,5706.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728
9,110407631,146582.0,413329.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728
12,110407631,146582.0,204073.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728
13,110407631,146582.0,32530.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728
14,110407631,146582.0,21096.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728


In [106]:
alts2.shape

(1823254, 16)

In [114]:
# Keep only observations whose chosen building is still present in alts2;
# otherwise the chosen row would have no alternative attributes to join.
chosen_is_available = obs['building_id'].isin(alts2.index.values)
obs = obs[chosen_is_available]

In [123]:
obs.shape

(15579, 3)

### Generate strata

In [117]:
# Cut retail_lq into 5 quantile bins. With retbins=True, qcut returns a tuple
# (per-row bin codes, bin edges); labels=False makes the codes integers 0..4.
# The quantiles are taken over the rows of alts2 (one row per building), not
# over distinct nodes.
strata = pd.qcut(alts2['retail_lq'], 5, retbins=True, labels=False)

In [118]:
# strata[0] holds the per-row bin codes (strata[1] would be the bin edges).
alts2['stratum'] = strata[0]

In [119]:
alts2.head()

Unnamed: 0_level_0,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk,osmid,retail_lq,stratum
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,110407631,146582.0,5706.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728,3
9,110407631,146582.0,413329.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728,3
12,110407631,146582.0,204073.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728,3
13,110407631,146582.0,32530.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728,3
14,110407631,146582.0,21096.0,0.0,106636.341155,108367.0,40439.0,24928.0,423541600.0,83693.0,5624.0,111884117.0,7319.0,48024.0,110407631,1.062728,3


In [120]:
alts2['stratum'].value_counts()

2    364841
0    364732
3    364700
1    364598
4    364383
Name: stratum, dtype: int64

### Make MergedChoiceTable

In [124]:
# Build the merged choice table with stratified sampling of alternatives over
# the 'stratum' column of alts2.
# NOTE(review): the exact meaning of `sampling_regime` / `strata` depends on
# the installed choicemodels version — TODO confirm against its source.
# NOTE(review): no random seed is set anywhere visible, so the sampled
# alternatives (and the fitted coefficients) will differ between runs.
mct = MergedChoiceTable(obs, alts2, chosen_alternatives='building_id',
                        sample_size=100, sampling_regime='stratified', strata='stratum')

In [125]:
mct.to_frame().shape

(1573479, 20)

In [126]:
mct_df = mct.to_frame()

In [127]:
# Sanity check: show any rows of the merged table that contain a null.
# An empty result means every observation/alternative pair joined cleanly.
mct_df[mct_df.isnull().any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,sector_id,sales_vol,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk,osmid,retail_lq,stratum,chosen
establishment_id,building_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Fit the model

In [128]:
# Estimate the step's model on the pre-built merged choice table.
# Presumably passing `mct` makes the step skip its own sampling — TODO confirm.
elcm_retail.fit(mct)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:         15,579
Model:         Multinomial Logit   Df Residuals:             15,562
Method:       Maximum Likelihood   Df Model:                     17
Date:                 2019-08-24   Pseudo R-squ.:             0.498
Time:                      12:28   Pseudo R-bar-squ.:         0.498
AIC:                  72,189.923   Log-Likelihood:      -36,077.962
BIC:                  72,320.036   LL-Null:             -71,898.963
                                                         coef   std err         z     P>|z|   Conf. Int.
--------------------------------------------------------------------------------------------------------
Intercept                                             -0.0000     0.389    -0.000     1.000             
np.log1p(avg_income_500_walk)                         -0.0214     0.004    -5.531     0.000             
nonres_rent_per_sqft                

### TESTING

In [69]:
# TEST MCT METHODS
# Split the chosen-alternative column off `obs` into its own Series, renamed to
# '_<alternative index name>'.
#
# The guard makes the cell safe to re-run: after the first run `obs` no longer
# has the choice column, and an unguarded second run would raise a KeyError.
#
# NOTE: this rebinds `obs` without its choice column. Any cell that needs that
# column in `obs` (e.g. building the MergedChoiceTable with
# chosen_alternatives='building_id') must run before this one, or after `obs`
# has been reloaded.
if elcm_retail.choice_column in obs.columns:
    chosen_alternatives = obs[elcm_retail.choice_column].copy()
    obs = obs.drop(chosen_alternatives.name, axis='columns')
    chosen_alternatives.name = '_' + alts2.index.name

In [70]:
# Hand-rolled stratified sampling of alternatives, for comparison with the
# MergedChoiceTable result.
n_obs = obs.shape[0]

oid_name = obs.index.name
aid_name = alts2.index.name

samp_size = 100

strata_vals = alts2['stratum'].unique()
num_strata = float(len(strata_vals))

# Draw the same number of alternatives from every stratum. Rounding up means
# the effective sample size can exceed the requested one when it is not a
# multiple of the number of strata.
samp_size_per_strata = int(np.ceil(samp_size / num_strata))
new_samp_size = int(num_strata * samp_size_per_strata)
samp_size = new_samp_size

# Observation ids tiled once per sampled alternative. Position i in `obs_ids`
# pairs with position i in `alt_ids`, and each stratum contributes a block of
# n_obs * samp_size_per_strata draws, so every observation receives
# samp_size_per_strata alternatives from each stratum.
obs_ids = list(obs.index.values) * samp_size

alt_ids = []
draws_per_stratum = n_obs * samp_size_per_strata
for stratum in strata_vals:
    in_stratum = alts2['stratum'] == stratum
    stratum_alts = alts2.loc[in_stratum]
    # Sampling is with replacement, so the same building can be drawn twice.
    sampled_alts = np.random.choice(
        stratum_alts.index.values,
        size=draws_per_stratum,
        replace=True,
    ).tolist()
    alt_ids.extend(sampled_alts)

In [71]:
# Append one extra row per observation for its chosen alternative, and build
# the matching 0/1 indicator (0 for sampled rows, 1 for the chosen rows).
# NOTE: `obs_ids` and `alt_ids` are extended from their current values, so
# running this cell twice makes them longer than `chosen`.
obs_ids = np.concatenate([obs_ids, obs.index.values])
alt_ids = np.concatenate([alt_ids, chosen_alternatives.values])
chosen = np.concatenate([
    np.zeros(samp_size * n_obs, dtype=int),
    np.ones(n_obs, dtype=int),
])

In [72]:
# Long-format frame of (observation id, alternative id) pairs, one row per pair.
df = pd.DataFrame({oid_name: obs_ids, aid_name: alt_ids})

In [73]:
# Attach chooser attributes by matching the observation-id column against the
# index of `obs`.
# NOTE: `df` is rebound, so re-running this cell tries to add the same columns
# a second time.
df = df.join(obs, how='left', on=oid_name)

In [74]:
# Attach alternative attributes by matching the alternative-id column against
# the index of `alts2`. This is a left join, so a pair whose alternative id is
# not in `alts2` is kept with NaN in every alternative column.
df = df.join(alts2, how='left', on=aid_name)

In [75]:
# `chosen` is a plain array, so it is assigned by position; it must still be in
# the same row order as `obs_ids` / `alt_ids`.
df['chosen'] = chosen

In [76]:
# Order rows by observation id, then the chosen flag, both descending, so each
# observation's chosen row comes before its sampled rows.
df = df.sort_values(by=[oid_name, 'chosen'], ascending=False)

In [77]:
# Index the table by (observation id, alternative id).
df = df.set_index([oid_name, aid_name])

In [78]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,sector_id,sales_vol,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk,osmid,retail_lq,stratum,chosen
establishment_id,building_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
240445,522059,52,0.0,56031926.0,78468.0,111476.0,14.297285,90938.15031,2946.0,1517.0,3530.0,231751300.0,9663.0,301.0,69130735.0,3260.0,28070.0,56031926.0,1.299392,4.0,1
240445,288437,52,0.0,318285761.0,146582.0,164000.0,0.0,106636.341155,115807.0,52006.0,26586.0,423541600.0,95029.0,7330.0,111884117.0,7319.0,48024.0,318285761.0,1.093184,3.0,0
240445,57167,52,0.0,640504457.0,314.0,287131.0,0.0,184341.484185,327456.0,113888.0,66600.0,20569030.0,232270.0,16305.0,3878323.0,182.0,417.0,640504457.0,1.067301,3.0,0
240445,393391,52,0.0,65403114.0,2868.0,117769.0,32.477049,153802.035519,492406.0,128615.0,65087.0,28572960.0,242102.0,19861.0,13938118.0,961.0,2522.0,65403114.0,1.049234,3.0,0
240445,858360,52,0.0,65589455.0,44.0,67277.9576,0.0,151515.95082,463272.0,42093.0,71031.0,23059860.0,322452.0,8081.0,3162378.0,113.0,302.0,65589455.0,1.160338,3.0,0


In [79]:
# Show any rows with a null. With the left join on `alts2`, a row is null in
# the alternative columns when its alternative id is absent from `alts2`.
# NOTE(review): the execution counts suggest this ran before `obs` was filtered
# to buildings present in `alts2` — re-check after a clean top-to-bottom run.
df[df.isnull().any(axis=1)]

Unnamed: 0_level_0,Unnamed: 1_level_0,sector_id,sales_vol,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk,osmid,retail_lq,stratum,chosen
establishment_id,building_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
239140,1745430,52,0.0,,,,,,,,,,,,,,,,,,1
233826,273834,52,0.0,,,,,,,,,,,,,,,,,,1
216905,523319,52,0.0,,,,,,,,,,,,,,,,,,1
132848,158442,52,2295.0,,,,,,,,,,,,,,,,,,1
132623,158442,52,372.0,,,,,,,,,,,,,,,,,,1
84019,1312512,52,2748.0,,,,,,,,,,,,,,,,,,1
82932,1312512,52,2232.0,,,,,,,,,,,,,,,,,,1
43498,1597782,52,0.0,,,,,,,,,,,,,,,,,,1
38961,862613,52,1644.0,,,,,,,,,,,,,,,,,,1


In [85]:
# Spot check: is building 1745430 (one of the chosen buildings with null
# alternative attributes) present in `alts2`? An empty result means it is not.
alts2[alts2.index.values == 1745430]

Unnamed: 0_level_0,node_id_small,jobs_500_walk,land_value,nonres_rent_per_sqft,avg_income_500_walk,jobs_25000,jobs_10000,units_sf_10000,sqft_2500,pop_10000,jobs_10000_retail,nonres_sqft_2500,jobs_2500_walk_finance,units_mf_1500_walk,osmid,retail_lq,stratum
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
