In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.neighbors import BallTree
import geopandas as gpd
from shapely.geometry import Point, LineString
from pyproj import Proj, transform
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
from urbansim.models.util import apply_filter_query
import orca
import os; os.chdir('../')
import warnings;warnings.simplefilter('ignore')
from activitysynth.scripts import datasources, models, variables
from choicemodels import MultinomialLogit
from choicemodels.tools import MergedChoiceTable

Registering model step 'auto_ownership'
Registering model step 'WLCM_gen_tt_simple'
Registering model step 'WLCM_gen_tt'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM_gen_tt_TWEAK'
Registering model step 'WLCM'


### Set required runtime args

In [3]:
# Runtime configuration: on-disk format and location of the input data.
input_file_format = 'csv'
input_data_dir = '/home/data/spring_2019/base/'

# estimate from base-year data

# Table name -> file-name template. '{0}' is the slot that
# format_fname_dict() fills in; a template without a slot is left unchanged
# by str.format().
formattable_fname_dict = dict(
    parcels='parcels.{0}',
    buildings='buildings.{0}',
    jobs='jobs.{0}',
    establishments='establishments.{0}',
    households='households.{0}',
    persons='persons.{0}',
    rentals='craigslist.{0}',
    units='units.{0}',
    mtc_skims='mtc_skims.{0}',
    beam_skims_raw='15.skims.csv.gz',
    beam_skims_imputed='NOTGONNAFINDTHIS.{0}',  # force re-imputation
    # the following nodes and edges .csv's will be phased out and
    # replaced by travel model skims entirely
    drive_nodes='drive_nodes.{0}',
    drive_edges='drive_edges.{0}',
    drive_access_vars='drive_net_vars.{0}',
    walk_nodes='walk_nodes.{0}',
    walk_edges='walk_edges.{0}',
    walk_access_vars='walk_net_vars.{0}',
    zones='zones.{0}',
    zone_access_vars='zones_w_access_vars.{0}',
)

def format_fname_dict(formattable_fname_dict, format='csv'):
    """Fill the format slot of every file-name template.

    Parameters
    ----------
    formattable_fname_dict : dict
        Maps a table name to a file-name template. A ``{0}`` placeholder in
        the template is replaced by ``format``; templates without a
        placeholder are returned unchanged.
    format : str, optional
        Text substituted for ``{0}`` (default ``'csv'``).

    Returns
    -------
    dict
        New dict with the same keys and the formatted file names. The input
        dict is not modified.
    """
    # Substitute the caller-supplied format. The argument used to be ignored
    # in favour of a hard-coded 'csv'.
    return {
        table_name: template.format(format)
        for table_name, template in formattable_fname_dict.items()}

# Turn the file-name templates into the file names used for this run.
input_fnames = format_fname_dict(
    formattable_fname_dict,
    format=input_file_format,
)

### Set required Orca injectables

In [4]:
# Register the runtime settings as Orca injectables.
for injectable_name, injectable_value in (
        ('input_file_format', input_file_format),
        ('input_data_dir', input_data_dir),
        ('input_fnames', input_fnames),
        ('store', None)):
    orca.add_injectable(injectable_name, injectable_value)

### Initialize Orca tables

In [5]:
# Run the three Orca initialization steps, in this order.
orca.run([
    'initialize_network_walk',
    'initialize_network_small',
    'initialize_imputed_skims',
])

Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 0.00 s
Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.00 s
Running step 'initialize_imputed_skims'
No imputed skims found. Creating them now.
Time to execute step 'initialize_imputed_skims': 82.61 s
Total time to execute iteration 1 with iteration value None: 82.61 s


In [11]:
# Pull the Orca tables used below into plain DataFrames.
buildings, parcels, jobs = (
    orca.get_table(table_name).to_frame()
    for table_name in ('buildings', 'parcels', 'jobs'))

# using beam skims
beam_skims = orca.get_table('beam_skims_imputed').to_frame()
# Name the skim index levels after the home/work zone columns.
# NOTE(review): assumes the skims have a two-level index — confirm.
interaction_terms = beam_skims.rename_axis(
    ['zone_id_home', 'zone_id_work'])

### Get accessibility vars

If already computed:

In [8]:
# walk_net_vars = pd.read_csv(input_data_dir + input_fnames['walk_access_vars'], index_col='osmid')
# drive_net_vars = pd.read_csv(input_data_dir + input_fnames['drive_access_vars'], index_col='osmid')
# zones = pd.read_csv(input_data_dir + input_fnames['zone_access_vars'], index_col='zone_id')

If computing for the first time:

In [6]:
# Run the accessibility aggregation steps, in this order.
orca.run([
    'network_aggregations_small',
    'network_aggregations_walk',
    'skims_aggregations',
])

Running step 'network_aggregations_small'
Computing accessibility variables
Computing units_10000
Computing units_sf_10000
Computing units_mf_10000
Computing pop_10000
Removed 189769 rows because they contain missing values
Computing hh_10000
Removed 189769 rows because they contain missing values
Computing poor_10000
Removed 53114 rows because they contain missing values
Computing renters_10000
Removed 102597 rows because they contain missing values
Computing avg_income_10000
Removed 189769 rows because they contain missing values
Computing jobs_10000
Computing avg_rent_10000
Computing med_rent_10000
Computing pop_white_10000
Removed 107372 rows because they contain missing values
Computing pop_black_10000
Removed 10541 rows because they contain missing values
Computing pop_asian_10000
Removed 51048 rows because they contain missing values
Computing pop_hisp_10000
Removed 31685 rows because they contain missing values
Computing units_25000
Computing units_sf_25000
Computing units_mf_2

Removed 10541 rows because they contain missing values
Computing pop_asian_2500_walk
Removed 51048 rows because they contain missing values
Computing pop_hisp_2500_walk
Removed 31685 rows because they contain missing values
Computing jobs_500_walk_retail
Computing jobs_1500_walk_retail
Computing jobs_2500_walk_retail
Computing jobs_500_walk_fire
Computing jobs_1500_walk_fire
Computing jobs_2500_walk_fire
Computing jobs_500_walk_tech
Computing jobs_1500_walk_tech
Computing jobs_2500_walk_tech
Computing jobs_500_walk_serv
Computing jobs_1500_walk_serv
Computing jobs_2500_walk_serv
       units_500_walk  sqft_unit_500_walk  singles_500_walk  \
count   415716.000000       415716.000000     415716.000000   
mean       356.025760         1245.303477         94.368742   
std       1097.266478          979.057308        344.532606   
min          0.000000            0.000000          0.000000   
25%          8.000000          455.911149          1.000000   
50%        188.000000         1315.6

In [7]:
# store the results so you don't have to do it again
# Each pair is (Orca table to dump, key of its output file name in input_fnames).
for table_name, fname_key in (
        ('nodeswalk', 'walk_access_vars'),
        ('nodessmall', 'drive_access_vars'),
        ('zones', 'zone_access_vars')):
    table_frame = orca.get_table(table_name).to_frame()
    table_frame.to_csv(
        os.path.join(input_data_dir, input_fnames[fname_key]))

### Load CHTS Data

Raw CHTS Households Data

In [8]:
# CHTS household file. The merge that builds `obs` further down reads the
# SAMPN and INCOM columns from it.
# NOTE(review): the absolute path is machine-specific.
chts_households = pd.read_csv('/home/data/fall_2018/CHTS_csv_format/data/Deliv_HH.csv')

Processed CHTS Persons Data (code available [here](https://github.com/ual/ual_model_workspace/blob/master/fall-2018-models/notebooks-max/WLCM_pre-processing.ipynb))

In [9]:
# Pre-processed CHTS person file. Later cells read its EMPLY, WLOC, job_id,
# SAMPN and building_id columns.
# NOTE(review): the absolute path is machine-specific.
chts_persons = pd.read_csv('/home/data/fall_2018/chts_persons_w_jobs_and_res_bldgs.csv')

In [12]:
# 0/1 indicator columns. The previous code assigned a filtered copy of the
# whole DataFrame to each column instead of a flag, although the simulation
# filters ('worker == 1', 'work_at_home == 0') expect indicators.
chts_persons['worker'] = (chts_persons['EMPLY'] == 1).astype(int)
chts_persons['work_at_home'] = (chts_persons['WLOC'] == 2).astype(int)

# Estimation sample: persons with a non-null job_id that exists in the jobs
# table index.
chts_workers = chts_persons[
    chts_persons['job_id'].notnull()
    & chts_persons['job_id'].isin(jobs.index.values)]

### Define required model parameters

In [13]:
# Filters applied to the observations before estimation, and also handed to
# the model step.
# NOTE(review): 98 presumably marks a non-response survey code — confirm
# against the CHTS codebook.
chooser_filters = [
    'age < 115',
    'income < 98',
    'edu < 98',
]

# Passed as the sample size to MergedChoiceTable and to the model step.
alt_sample_size = 10

### Generate the merged choice table

In [14]:
# Zones table as a DataFrame. It is joined onto the job alternatives by
# zone_id when `alts` is built.
zones = orca.get_table('zones').to_frame()

This step must be done manually for now by instantiating `choicemodels.tools.MergedChoiceTable` directly instead of using a template, because `urbansim_templates` does not yet support interaction terms such as home-to-work distances.

In [15]:
# Observations: one row per CHTS worker, enriched with household income and
# with the building and parcel joined via building_id / parcel_id.
obs = (
    chts_workers
    .merge(chts_households[['SAMPN', 'INCOM']], on='SAMPN')
    .merge(buildings, left_on='building_id', right_index=True)
    .merge(parcels, left_on='parcel_id', right_index=True)
    .rename(columns={
        'zone_id': 'zone_id_home',
        'AGE': 'age',
        'EDUCA': 'edu',
        'INCOM': 'income',
    })
)
obs.index.name = 'obs_id'

In [16]:
# 0/1 chooser dummies derived from the coded survey answers.
# NOTE(review): the thresholds are category codes; their mapping to the
# bracket names is presumably per the CHTS codebook — confirm.
obs['no_higher_ed'] = obs['edu'].lt(5).astype(int)
obs['age_under_45'] = obs['age'].lt(45).astype(int)
obs['hh_inc_under_25k'] = obs['income'].lt(3).astype(int)
obs['hh_inc_25_to_75k'] = (
    obs['income'].gt(2) & obs['income'].lt(6)).astype(int)
obs['hh_inc_75_to_200k'] = (
    obs['income'].gt(5) & obs['income'].lt(9)).astype(int)

In [17]:
# Apply the chooser filters to the observations. This runs after the dummy
# columns above have been computed.
obs = apply_filter_query(obs, filters=chooser_filters)

In [18]:
# Keep only the chosen alternative id, the home zone and the chooser
# attributes.
obs = obs[[
    'job_id',
    'zone_id_home',
    'age_under_45',
    'no_higher_ed',
    'age',
    'hh_inc_under_25k',
    'hh_inc_25_to_75k',
    'hh_inc_75_to_200k',
    'income',
]]

In [52]:
# Alternatives: one row per job, joined to its building, parcel and zone.
alts = (
    jobs
    .merge(buildings, left_on='building_id', right_index=True)
    .merge(parcels, left_on='parcel_id', right_index=True)
    .merge(zones, left_on='zone_id', right_index=True)
)

In [53]:
# industry of alternatives
# One 0/1 dummy column per sector: 1 when the job's sector_id is in the
# listed codes. Columns are created in the order given here.
# NOTE(review): the codes look like two-digit NAICS sectors — confirm.
for dummy_col, sector_codes in (
        ('sector_retail', [44, 45]),
        ('sector_healthcare', [62]),
        ('sector_tech', [51, 54]),
        ('sector_food_and_hosp', [72]),
        ('sector_mfg', [31, 32, 33]),
        ('sector_edu_serv', [61]),
        ('sector_oth_serv', [81]),
        ('sector_constr', [23]),
        ('sector_gov', [92]),
        ('sector_fire', [52, 53]),
        ('sector_whlsale', [42]),
        ('sector_admin', [56]),
        ('sector_transport', [48]),
        ('sector_arts', [71]),
        ('sector_util', [22])):
    alts[dummy_col] = alts['sector_id'].isin(sector_codes).astype(int)

# # occupation of alternatives
# alts['occup_mgmt'] = alts['occupation_id'].isin([11]).astype(int)
# alts['occup_sales'] = alts['occupation_id'].isin([41]).astype(int)
# alts['occup_biz'] = alts['occupation_id'].isin([13]).astype(int)
# alts['occup_admin'] = alts['occupation_id'].isin([43]).astype(int)
# alts['occup_edu'] = alts['occupation_id'].isin([25]).astype(int)
# alts['occup_food'] = alts['occupation_id'].isin([35]).astype(int)
# alts['occup_health'] = alts['occupation_id'].isin([29, 31]).astype(int)
# alts['occup_tech'] = alts['occupation_id'].isin([15]).astype(int)
# alts['occup_eng'] = alts['occupation_id'].isin([17]).astype(int)
# alts['occup_transp'] = alts['occupation_id'].isin([53]).astype(int)
# alts['occup_constr'] = alts['occupation_id'].isin([47]).astype(int)

In [54]:
# Keep only the alternative-side columns. Each label appears once: a label
# listed twice yields duplicate column labels, after which alts[label]
# returns a DataFrame instead of a Series.
alts = alts[[
    'total_jobs_gen_tt_WALK_TRANSIT_15', 'total_jobs_gen_tt_WALK_TRANSIT_45',
    'total_jobs_gen_tt_CAR_45', 'total_jobs_gen_tt_CAR_15', 'avg_income_gen_tt_CAR_30',
    'sum_persons_gen_tt_CAR_1', 'sum_income_gen_tt_CAR_15',
    'sum_income_gen_tt_CAR_45', 'sum_residential_units_gen_tt_CAR_1',
    'sum_residential_units_gen_tt_CAR_15',
    'sum_persons_gen_tt_CAR_15',
    'zone_id_work', 'sector_retail', 'sector_healthcare', 'sector_tech', 'sector_food_and_hosp',
    'sector_mfg', 'sector_edu_serv', 'sector_oth_serv', 'sector_constr', 'sector_gov', 'sector_fire',
    'sector_whlsale', 'sector_admin', 'sector_transport', 'sector_arts', 'sector_util',
#     'occup_mgmt', 'occup_sales', 'occup_biz', 'occup_admin', 'occup_edu', 'occup_food', 'occup_health',
#     'occup_tech', 'occup_eng', 'occup_transp', 'occup_constr'
]]

In [55]:
%%time
# Build the merged choice table from the observations and alternatives,
# with the skim-based interaction terms attached.
# NOTE(review): alternatives are presumably sampled at random and no seed is
# set in this notebook, so results may differ between runs — confirm.
mct = MergedChoiceTable(
    obs,
    alts,
    chosen_alternatives='job_id',
    sample_size=alt_sample_size,
    interaction_terms=interaction_terms,
)

CPU times: user 880 ms, sys: 57.1 ms, total: 937 ms
Wall time: 936 ms


### Estimate the WLCM

In [26]:
# Initialize modelmanager. Per the cell output, this registers the existing
# model steps.
mm.initialize()

Registering model step 'auto_ownership'
Registering model step 'WLCM_gen_tt_simple'
Registering model step 'WLCM_gen_tt'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM_gen_tt_TWEAK'
Registering model step 'WLCM'


In [27]:
# Model step for the workplace location choice. It reuses the chooser
# filters and alternative sample size defined earlier.
m = LargeMultinomialLogitStep(
    chooser_filters=chooser_filters,
    constrained_choices=True,
    alt_sample_size=alt_sample_size,
)

In [78]:
# m.model_expression = (
#     'dist_da/tt_da + '
#     'np.log1p(jobs_1500_walk_retail) + '
#     'np.log1p(cost_da_toll):(hh_inc_under_25k + hh_inc_25_to_75k + hh_inc_75_to_200k) + '
#     'no_higher_ed:(sector_retail + sector_fire + sector_healthcare + sector_tech + sector_mfg + '
#     'sector_food_and_hosp + sector_edu_serv + sector_gov + sector_whlsale) + '
#     'sector_retail + sector_tech + sector_mfg + sector_food_and_hosp + sector_edu_serv + sector_oth_serv + '
#     'sector_constr + sector_gov + sector_whlsale + sector_admin + sector_util - 1'
# )

# Utility specification, built by concatenating the string pieces below:
#   - dist and log(1 + gen_tt_CAR),
#   - log(1 + gen_cost_CAR) interacted with each household-income dummy,
#   - no_higher_ed interacted with a subset of the sector dummies,
#   - sector dummies as main effects,
#   - "- 1" removes the intercept.
# Commented-out lines inside the parentheses are disabled terms.
# NOTE(review): dist, gen_tt_CAR and gen_cost_CAR presumably come from the
# skim interaction terms — confirm.
m.model_expression = (
#     'np.log1p(gen_tt_CAR):np.log1p(gen_tt_WALK_TRANSIT) + '
#     'np.log1p(gen_tt_CAR) + '
#     'dist * (np.log1p(gen_tt_CAR)) + np.log1p(gen_tt_WALK_TRANSIT) + '
    'dist + np.log1p(gen_tt_CAR) + '
#     'total_jobs_gen_tt_CAR_45:sum_residential_units_gen_tt_CAR_15 +'
    'np.log1p(gen_cost_CAR):(hh_inc_under_25k + hh_inc_25_to_75k + hh_inc_75_to_200k) + '
    'no_higher_ed:(sector_retail + sector_fire + sector_healthcare + sector_tech + sector_mfg + '
    'sector_food_and_hosp + sector_edu_serv + sector_gov) + '
    'sector_retail + sector_tech + sector_mfg + sector_food_and_hosp + sector_edu_serv + sector_oth_serv + '
    'sector_constr + sector_gov + sector_whlsale + sector_admin + sector_util - 1'
)

In [79]:
# Estimate the model on the merged choice table. The estimation summary is
# printed as cell output.
m.fit(mct)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          7,928
Model:         Multinomial Logit   Df Residuals:              7,904
Method:       Maximum Likelihood   Df Model:                     24
Date:                 2019-09-10   Pseudo R-squ.:             0.503
Time:                      15:16   Pseudo R-bar-squ.:         0.502
AIC:                  18,179.324   Log-Likelihood:       -9,065.662
BIC:                  18,346.800   LL-Null:             -18,254.895
                                              coef   std err         z     P>|z|   Conf. Int.
---------------------------------------------------------------------------------------------
dist                                       -0.0001     0.000   -60.579     0.000             
np.log1p(gen_tt_CAR)                       -0.0764     0.018    -4.233     0.000             
np.log1p(gen_cost_CAR):hh_inc_under_25k    -0.6519     0.046   -14.044     0.000

### Add attributes to the model object that are needed for the simulation step

In [80]:
# Name under which the step is saved and registered (see the mm.register output).
m.name = 'WLCM_gen_tt_simple'
m.tags = ['max']
# Table names for the simulation step; these match the tables merged to
# build `alts` during estimation.
m.alternatives = ['jobs', 'buildings', 'parcels', 'zones']
m.choosers = ['persons', 'households']
# Filters on the choosers at simulation time.
# NOTE(review): 'zone_id_home == zone_id_home' presumably drops rows with a
# missing home zone, since NaN != NaN — confirm.
m.out_chooser_filters = ['worker == 1', 'work_at_home == 0', 'zone_id_home == zone_id_home']
# Column that receives the chosen alternative.
m.out_column = 'job_id'

In [81]:
# Register the step with modelmanager. Per the cell output, this saves
# 'WLCM_gen_tt_simple.yaml' and registers the model step.
mm.register(m)

Saving 'WLCM_gen_tt_simple.yaml': /home/max/projects/activitysynth/activitysynth/configs
Registering model step 'WLCM_gen_tt_simple'
