In [1]:
from collections import OrderedDict
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
from urbansim_templates.models import SmallMultinomialLogitStep
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')

import pandas as pd
# import pandana as pdna
import time
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import reduce

import scipy.stats as st
from scipy.stats import skewnorm

# import matplotlib
# matplotlib.style.use('ggplot')

%matplotlib inline

pd.options.display.max_columns = 80

## Load Data

In [2]:
# Home/work trip records. NOTE(review): absolute path on the original author's
# machine -- point this at your own copy of the file before re-running.
trips_csv = '/home/emma/ual_model_workspace/fall-2018-models/notebooks-emma/HWtrips_031418.csv'
trips = pd.read_csv(trips_csv)

trips.head()

Unnamed: 0.1,Unnamed: 0,HHPER,HHPERTRIP,origin,origin_dwell,origin_ST,origin_ET,trip_ET,TT,MODE
0,0,10351981,1035198000.0,home,14.0,17.333333,7.333333,7.7,0.366667,drive_alone
1,1,10351981,1035198000.0,work,9.383333,7.7,17.083333,17.333333,0.25,drive_alone
2,2,10351982,1035198000.0,home,10.416667,19.583333,6.0,6.25,0.25,drive_alone
3,3,10351982,1035198000.0,work,10.25,6.25,16.5,19.583333,3.083333,drive_alone
4,4,10352742,1035274000.0,home,13.583333,19.166667,8.75,9.166667,0.416667,drive_alone


## Prepare TOD and Dwell columns

In [3]:
# Keep only persons (HHPER) that have exactly two trip records, i.e. the
# people expected to have both a home->work and a work->home trip.
records_per_person = trips.groupby('HHPER')['HHPER'].transform('size')
tripsII = trips[records_per_person == 2]

In [4]:
#make sure all home-work trip rows are listed first
# Rows are ordered by HHPER and then by `origin`; 'home' sorts before 'work',
# so each person's home-origin row precedes the work-origin row.
# reset_index() without drop=True keeps the previous index as an 'index' column.
tripsIII = tripsII.sort_values(['HHPER','origin']).reset_index()

In [5]:
# Pull each person's next-row (work-origin) values up onto the current row, so
# the home-origin row also carries the work->home trip attributes.
# Pairs are (new column, column shifted up by one row within HHPER).
shifted_columns = [
    ('work_dwell', 'origin_dwell'),
    ('work_ST', 'origin_ST'),
    ('WH_trip_ST', 'origin_ET'),
    ('WH_trip_ET', 'trip_ET'),
    ('WH_TT', 'TT'),
    ('WH_mode', 'MODE'),
]

for shifted_name, source_name in shifted_columns:
    tripsIII[shifted_name] = (
        tripsIII.groupby('HHPER', group_keys=False)[source_name].shift(-1)
    )

In [6]:
# Keep exactly one row per person: the first row within each HHPER (the
# home-origin row when the frame has been sorted by HHPER and origin).
# GroupBy.first() is deliberately not used: it returns the first *non-null*
# value of every column independently, so a missing value on the home row
# would be silently replaced by the work row's value.
tripsIII = (
    tripsIII
    .sort_values('HHPER', kind='mergesort')  # stable sort: keeps the within-person row order
    .drop_duplicates(subset='HHPER', keep='first')
)
# Same layout as before: HHPER as the leading column and a fresh 0..n-1 index.
tripsIII = tripsIII[
    ['HHPER'] + [c for c in tripsIII.columns if c != 'HHPER']
].reset_index(drop=True)

In [7]:
# The un-shifted trip columns now describe the home->work leg; name them accordingly.
home_leg_names = {
    'origin_dwell': 'home_dwell',
    'origin_ST': 'home_ST',
    'origin_ET': 'HW_trip_ST',
    'trip_ET': 'HW_trip_ET',
    'TT': 'HW_TT',
    'MODE': 'HW_mode',
}
tripsIII = tripsIII.rename(columns=home_leg_names)

tripsIII.head()

Unnamed: 0.1,HHPER,index,Unnamed: 0,HHPERTRIP,origin,home_dwell,home_ST,HW_trip_ST,HW_trip_ET,HW_TT,HW_mode,work_dwell,work_ST,WH_trip_ST,WH_trip_ET,WH_TT,WH_mode
0,10351981,0,0,1035198000.0,home,14.0,17.333333,7.333333,7.7,0.366667,drive_alone,9.383333,7.7,17.083333,17.333333,0.25,drive_alone
1,10351982,2,2,1035198000.0,home,10.416667,19.583333,6.0,6.25,0.25,drive_alone,10.25,6.25,16.5,19.583333,3.083333,drive_alone
2,10352742,4,4,1035274000.0,home,13.583333,19.166667,8.75,9.166667,0.416667,drive_alone,7.583333,9.166667,16.75,19.166667,2.416667,drive_alone
3,10353643,6,6,1035364000.0,home,11.333333,19.583333,6.916667,7.416667,0.5,drive_alone,8.633333,7.416667,16.05,19.583333,3.533333,drive_alone
4,10372952,8,8,1037295000.0,home,17.166667,21.833333,15.0,15.416667,0.416667,drive_alone,6.0,15.416667,21.416667,21.833333,0.416667,drive_alone


In [8]:
# Time-of-day code for the home->work trip end time (HW_trip_ET).
# Intervals are closed on the left and open on the right:
#   0: [3, 6)   1: [6, 9)   2: [9, 15.5)   3: [15.5, 18.5)   4: [18.5, ...) or [0, 3)
# Plain comparisons are used instead of Series.between(..., inclusive=False):
# the boolean form of `inclusive` is rejected by pandas >= 2.0, whereas the
# comparisons behave the same on every pandas version.
# Rows matching no interval (missing or negative end time) come out as -1,
# exactly as before.
hw_end = tripsIII['HW_trip_ET']

tripsIII['TOD'] = (
    ((hw_end >= 3) & (hw_end < 6)) * 1 +
    ((hw_end >= 6) & (hw_end < 9)) * 2 +
    ((hw_end >= 9) & (hw_end < 15.5)) * 3 +
    ((hw_end >= 15.5) & (hw_end < 18.5)) * 4 +
    ((hw_end >= 18.5) | ((hw_end >= 0) & (hw_end < 3))) * 5
)

# Shift from 1..5 to the 0..4 alternative codes used by the TOD model.
tripsIII['TOD'] = tripsIII['TOD'] - 1

In [9]:
# TOD was built from sums of boolean*int terms, so this conversion is expected
# to be a no-op safeguard rather than a real type change.
tripsIII['TOD'] = pd.to_numeric(tripsIII['TOD'])

In [10]:
# Work dwell-time category from work_dwell (left-closed, right-open intervals):
#   1: [0, 4.5)   2: [4.5, 7.75)   3: [7.75, 9)   4: [9, 10.5)   5: [10.5, ...)
# Plain comparisons replace Series.between(..., inclusive=False), whose boolean
# `inclusive` argument is rejected by pandas >= 2.0.
# Rows with a missing or negative work_dwell match no interval and get code 0.
work_hours = tripsIII['work_dwell']

tripsIII['dwell_work'] = (
    ((work_hours >= 0) & (work_hours < 4.5)) * 1 +
    ((work_hours >= 4.5) & (work_hours < 7.75)) * 2 +
    ((work_hours >= 7.75) & (work_hours < 9.0)) * 3 +
    ((work_hours >= 9.0) & (work_hours < 10.5)) * 4 +
    (work_hours >= 10.5) * 5)

## Add the demographic variables

In [11]:
person = pd.read_csv('/home/data/CHTS_csv_format/data/Deliv_PER.csv')

person.head()

Unnamed: 0,SAMPN,PERNO,RELAT,GEND,AGE,AGEB,HISP,RACE1,RACE2,RACE3,RACE4,O_RACE,NTVTY,CNTRY,LIC,USER,TRANS,TPTYP1,TPTYP2,TPTYP3,TPTYP4,TPTYP5,TPTYP6,TPTYP7,O_TPTYP,CLIP1,CLIP2,CLIP3,COMP,MET,PASSTL,FLEX,EMPLY,WKSTAT,O_WKSTAT,JOBS,WLOC,WNAME,WCITY,WSTAT,...,HVLOG,PTRIPS,TOLLF,TOLLR1,TOLLR2,TOLLR3,TOLLR4,TOLLR5,TOLLR6,TOLLR7,TOLLR8,TOLLR9,TOLLR10,TOLLB1,TOLLB2,TOLLB3,TOLLB4,TOLLB5,TOLLB6,TOLLB7,TOLLB8,TOLLB9,TOLLB10,HOVL,NOGOWHY,NOGOWHY_O,InComplete,Moto_trip,WCTFIP,WTRACT,SCTFIP,STRACT,WPrimaryCity,WSTFIP,W2PrimaryCity,W2STFIP,SPrimaryCity,SSTFIP,PERWGT,EXPPERWGT
0,1031985,1,1,1,74,,2,1.0,,,,,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,2.0,1.0,,,,,,,...,1.0,2.0,3.0,,,,,,,,,,,,,,,,,,,,,2.0,,,,1.0,,,,,,,,,,,0.052086,17.647568
1,1031985,2,2,2,73,,2,1.0,,,,,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,2.0,1.0,,,,,,,...,1.0,2.0,3.0,,,,,,,,,,,,,,,,,,,,,2.0,,,,1.0,,,,,,,,,,,0.052086,17.647568
2,1032036,1,1,1,46,,2,1.0,,,,,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,1.0,,,1.0,1.0,HIDDEN,SAN DIEGO,CA,...,,5.0,3.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,73.0,17032.0,,,SAN DIEGO,6.0,,,,,1.223974,414.701494
3,1032036,2,2,2,47,,2,1.0,97.0,,,MULTI-RACIAL,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,2.0,3.0,,,,,,,...,,18.0,3.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,,,,,,,,,0.863473,292.558373
4,1032036,3,3,1,15,,2,1.0,97.0,,,MULTI-RACIAL,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,4.0,3.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,73.0,17030.0,,,,,SAN DIEGO,6.0,0.941412,318.9651


In [12]:
# Keep only the person attributes used downstream.  .copy() makes `person` an
# independent frame, so adding columns to it later is a plain assignment and
# not a write to a slice of the full survey table (SettingWithCopyWarning).
person_fields = ['SAMPN','PERNO','GEND','AGE','HISP','RACE1','RACE2','RACE3','RACE4',
                 'HOURS','EDUCA','INDUS']
person = person[person_fields].copy()

In [13]:
# Person key: household sample number immediately followed by the person
# number, as a single string.
sampn_text = person['SAMPN'].astype(str)
perno_text = person['PERNO'].astype(str)
person['HHPER'] = sampn_text + perno_text

In [14]:
# Household attributes joined onto persons via SAMPN.
# NOTE(review): absolute path on the original author's machine.
hh_fields = ['SAMPN', 'HHVEH', 'OWN', 'INCOM', 'HHSIZ']
hh_df = pd.read_csv('/home/data/CHTS_csv_format/data/Deliv_HH.csv')[hh_fields]

hh_df.head()

Unnamed: 0,SAMPN,HHVEH,OWN,INCOM,HHSIZ
0,1031985,2,1,3,2
1,1032036,1,1,7,5
2,1032053,2,2,2,6
3,1032425,2,1,7,2
4,1032558,0,2,1,1


In [15]:
demo = person.merge(hh_df,on = 'SAMPN',how = 'left')

demo.head()

Unnamed: 0,SAMPN,PERNO,GEND,AGE,HISP,RACE1,RACE2,RACE3,RACE4,HOURS,EDUCA,INDUS,HHPER,HHVEH,OWN,INCOM,HHSIZ
0,1031985,1,1,74,2,1.0,,,,,6,,10319851,2,1,3,2
1,1031985,2,2,73,2,1.0,,,,,6,,10319852,2,1,3,2
2,1032036,1,1,46,2,1.0,,,,40.0,6,54.0,10320361,1,1,7,5
3,1032036,2,2,47,2,1.0,97.0,,,,6,,10320362,1,1,7,5
4,1032036,3,1,15,2,1.0,97.0,,,,1,,10320363,1,1,7,5


## Recode missing-value codes to NaN and drop incomplete records

In [16]:
# Per-column codes that are treated as missing and recoded to NaN.
na_dict = {
    'GEND': [9],
    'AGE': [998, 999],
    'HOURS': [998, 999],
    'EDUCA': [8, 9],
    'HHVEH': [98, 99],
    'OWN': [7, 8, 9],
    'INCOM': [98, 99],
    'HHSIZ': [98, 99],
    'INDUS': [98, 99],
}

# One replace call per column handles all of that column's codes at once.
for col, missing_codes in na_dict.items():
    demo[col] = demo[col].replace(missing_codes, np.nan)

In [17]:
# Drop every person record that is missing any of the model inputs.
required_fields = ['GEND', 'AGE', 'HOURS', 'EDUCA', 'HHVEH', 'OWN', 'INCOM', 'HHSIZ', 'INDUS']
demo = demo.dropna(subset=required_fields)

In [18]:
# demo.HHPER is a string key, so convert the trip-side key to text before joining.
tripsIII['HHPER'] = tripsIII['HHPER'].astype(str)

# Inner join: persons without a complete demographic record are dropped.
trips1 = tripsIII.merge(demo, on='HHPER')

# Row counts before and after the join show how many persons were lost.
print(len(tripsIII))
print(len(trips1))

21285
17943


## Prepare data for use in MNL estimation (make dummy columns)

In [19]:
# Race is reported in up to four columns; a person matches a code when any of
# the four columns holds it.
race_answers = trips1[['RACE1', 'RACE2', 'RACE3', 'RACE4']]
is_hispanic = trips1['HISP'].isin([1.0])

# minority: HISP == 1 or any race answer among codes 2, 3, 4, 5, 97.
any_listed_race = race_answers.isin([2.0, 3.0, 4.0, 5.0, 97.0]).any(axis=1)
trips1['minority'] = np.where(is_hispanic | any_listed_race, 1, 0)

# Recode HISP itself to a 0/1 indicator, then one indicator per race code.
trips1['HISP'] = np.where(is_hispanic, 1, 0)
for race_name, race_code in [('black', 2.0), ('native', 3.0), ('asian', 4.0), ('PI', 5.0)]:
    trips1[race_name] = np.where(race_answers.isin([race_code]).any(axis=1), 1, 0)


In [20]:
# Household-income indicators: (column name, INCOM codes that switch it on).
# INCOM codes 6 and 7 fall in none of the 150k+ groups; code 7 has no indicator at all.
income_groups = [
    ('hh_inc_less75k', [1.0, 2.0, 3.0, 4.0, 5.0]),
    ('hh_inc_75kless100k', [6.0]),
    ('hh_inc_150kplus', [8.0, 9.0, 10.0]),
    ('hh_inc_150kless250k', [8.0, 9.0]),
    ('hh_inc_250kplus', [10.0]),
]
for income_name, incom_codes in income_groups:
    trips1[income_name] = np.where(trips1['INCOM'].isin(incom_codes), 1, 0)

In [21]:
# Education indicators: (column name, EDUCA codes that switch it on).
education_groups = [
    ('lessGED', [1.0]),
    ('GED', [2.0]),
    ('someBach', [3.0]),
    ('Assoc', [4.0]),
    ('Bach', [5.0]),
    ('lessGED_GED', [1.0, 2.0]),
]
for education_name, educa_codes in education_groups:
    trips1[education_name] = np.where(trips1['EDUCA'].isin(educa_codes), 1, 0)

# Everything below EDUCA code 5 (the code used for 'Bach' above).
trips1['no_higher_ed'] = (trips1['EDUCA'] < 5).astype(int)

In [22]:
# Age-band indicators, each covering [lower, upper): lower bound included,
# upper bound excluded.  Plain comparisons replace
# Series.between(..., inclusive=False) | (AGE == lower); the boolean
# `inclusive` argument is rejected by pandas >= 2.0.
age_bands = [
    ('age_16less25', 16, 25),
    ('age_25less40', 25, 40),
    ('age_40less50', 40, 50),
    ('age_50less60', 50, 60),
]
for age_name, age_lower, age_upper in age_bands:
    trips1[age_name] = np.where(
        (trips1['AGE'] >= age_lower) & (trips1['AGE'] < age_upper), 1, 0)

In [23]:
# NOTE(review): the two subtractions assume GEND and OWN are coded 1/2 once the
# missing-value codes have been removed, giving 0/1 indicators -- confirm
# against the survey codebook.
trips1['female'] = trips1['GEND'].sub(1)
trips1['tenure_2'] = trips1['OWN'].sub(1)

# Zero-vehicle and single-person household indicators.
has_no_vehicle = trips1['HHVEH'].isin([0.0])
lives_alone = trips1['HHSIZ'].isin([1.0])
trips1['noveh'] = np.where(has_no_vehicle, 1, 0)
trips1['hh_size_1per'] = np.where(lives_alone, 1, 0)

In [24]:
# Industry-sector indicators: (column name, INDUS codes that switch it on).
# sector_mfg now matches 31, 32 and 33, the same definition the synthetic
# population uses for sector_id later in this notebook.  Previously only 31
# was matched here, so estimation and simulation defined the variable
# differently.  If INDUS never contains 32 or 33 the result is unchanged.
sector_groups = [
    ('sector_constr', [23]),
    ('sector_mfg', [31, 32, 33]),
    ('sector_retail', [44, 45]),
    ('sector_transport', [48]),
    ('info', [51]),
    ('finance', [52]),
    ('scitech', [54]),
    ('sector_edu_serv', [61]),
    ('sector_healthcare', [62]),
    ('sector_oth_serv', [81]),
    ('sector_gov', [92]),
]
for sector_name, indus_codes in sector_groups:
    trips1[sector_name] = np.where(trips1['INDUS'].isin(indus_codes), 1, 0)

In [25]:
# Arrival-period indicators built from the TOD code.  Code 1 (6-9am) gets no
# indicator here, so it acts as the omitted category.
tod_indicator_codes = [
    ('TOD_3to6', 0),
    ('TOD_9to1530', 2),
    ('TOD_1530to1830', 3),
    ('TOD_1830up', 4),
]
for tod_name, tod_code in tod_indicator_codes:
    trips1[tod_name] = np.where(trips1['TOD'].isin([tod_code]), 1, 0)

## Estimate the model for dwell time at work

In [26]:
@orca.table(cache=True)
def tripsA():
    """Expose the estimation frame `trips1` as the orca table named 'tripsA'.

    The model steps below refer to this table through m.tables = ['tripsA'].
    """
    return trips1

In [27]:
# Multinomial logit for the work dwell-time category (`dwell_work`, codes 1-5),
# estimated on the 'tripsA' table.
# Each model_expression entry pairs a variable with a list of alternative codes.
# NOTE(review): presumably this is the pylogit specification format, where only
# the listed alternatives get a coefficient for that variable and a nested
# list such as [[1,2],5] makes 1 and 2 share one coefficient -- confirm.
# The intercept list leaves out alternative 4.
m = SmallMultinomialLogitStep()
m.name = 'dwell_work'
m.tables = ['tripsA']
m.choice_column = 'dwell_work'
m.model_expression = OrderedDict([
    ('intercept', [1,2,3,5]), 
    
    ('TOD_3to6',[2,5]),
#     ('TOD_6to9'),
    ('TOD_9to1530',[1,2,4,5]),
    ('TOD_1530to1830',[1,2,4]),
    ('TOD_1830up',[1,4]),
    
    ('sector_mfg',[1,2]),
    ('sector_retail',[1,5]),
    ('sector_transport',[4,5]),
    ('info',[1]),
    ('finance',[1,4,5]),
    ('scitech',[1,2]),
    ('sector_edu_serv',[2,4,5]),
    ('sector_healthcare',[1,2,4,5]),
    ('sector_gov',[1,2]),
    
    ('age_16less25',[1,2]),
    ('age_25less40',[1]),
    ('age_40less50',[1]),
    ('age_50less60',[1]),
    
    ('female',[[1,2],5]),
        
    ('minority',[1,2]),
    
    ('hh_inc_less75k',[1,4,5]), 
    ('hh_inc_75kless100k',[5]),
#     ('100kless150k')
    ('hh_inc_150kplus',[1,2,4]),
    
    ('lessGED_GED',[1]),
    ('Assoc',[1,4]),
       
    ('HOURS',[1,2,4,5]),
    
    ('noveh',[4]),
   
    ('hh_size_1per',[4]),
    
    ('tenure_2',[4]),
    
])

In [33]:
m.fit()

Log-likelihood at zero: -28,878.1445
Initial Log-likelihood: -28,878.1445
Estimation Time for Point Estimation: 9.57 seconds.
Final log-likelihood: -24,197.0957
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                     _chosen   No. Observations:               17,943
Model:             Multinomial Logit Model   Df Residuals:                   17,881
Method:                                MLE   Df Model:                           62
Date:                     Thu, 28 Mar 2019   Pseudo R-squ.:                   0.162
Time:                             20:30:47   Pseudo R-bar-squ.:               0.160
AIC:                            48,518.191   Log-Likelihood:            -24,197.096
BIC:                            49,001.479   LL-Null:                   -28,878.144
                           coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------

In [39]:
m.name = 'dwell_work'

In [35]:
mm.initialize()

Registering model step 'auto_ownership'
Registering model step 'dwell_work'
Registering model step 'TOD_choice'
Registering model step 'work_TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM'


In [36]:
m.tags = ['dwell_work','emma']
mm.register(m)

Saving 'dwell_work.yaml': /home/emma/activitysynth/activitysynth/configs
Model saved to configs/dwell_work-model-object.pkl
Registering model step 'dwell_work'


## Estimate the model for Home-to-Work Trip End Times

In [37]:
###model with p-values less than .01 (except hours4)

# Multinomial logit for the home->work arrival period (`TOD`, codes 0-4),
# estimated on the 'tripsA' table.
# Each model_expression entry pairs a variable with a list of alternative codes.
# NOTE(review): presumably only the listed alternatives get a coefficient for
# that variable (pylogit specification format) -- confirm.
# The intercept list leaves out alternative 2.
m = SmallMultinomialLogitStep()
m.name = 'work_TOD_choice'
m.tables = ['tripsA']
m.choice_column = 'TOD'
m.model_expression = OrderedDict([
    ('intercept', [0,1,3,4]), 
    
    ('sector_constr',[2,3]),
    ('sector_mfg',[0,2,3]),
    ('sector_retail',[2]),
    ('sector_transport',[0]),
    ('info',[0,2,3]),
    ('finance',[0,2,3]),
    ('scitech',[0,3]),
    ('sector_edu_serv',[0,2,3]),
    ('sector_healthcare',[0,2,3,4]),
    ('sector_oth_serv',[0,3]),
    ('sector_gov',[2,3]),
    
    ('age_16less25',[2,3,4]),
    ('age_25less40',[0]),
    
    ('female',[0,3,4]),
    
    ('minority',[0,4]),
    ('asian',[2]),
    
    ('hh_inc_less75k',[4]), 
#     ('75kless150k')
    ('hh_inc_150kless250k',[0]),
    ('hh_inc_250kplus',[0,2]),

    ('lessGED',[0,2,3]),
    ('GED',[0,2,3]),
    ('someBach',[0,2]),
    ('Assoc',[0,2]),
    ('no_higher_ed',[4]),
    ('Bach',[0,2,4]),
#     ('Grad')
       
    ('HOURS',[0,2,3,4]),

    ('noveh',[2]),

    ('hh_size_1per',[2]),
    
    ('tenure_2',[2]),
    
])

In [38]:
m.fit()

Log-likelihood at zero: -28,878.1445
Initial Log-likelihood: -28,878.1445
Estimation Time for Point Estimation: 15.67 seconds.
Final log-likelihood: -16,303.6766
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                     _chosen   No. Observations:               17,943
Model:             Multinomial Logit Model   Df Residuals:                   17,878
Method:                                MLE   Df Model:                           65
Date:                     Thu, 28 Mar 2019   Pseudo R-squ.:                   0.435
Time:                             20:34:48   Pseudo R-bar-squ.:               0.433
AIC:                            32,737.353   Log-Likelihood:            -16,303.677
BIC:                            33,244.025   LL-Null:                   -28,878.144
                            coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

In [40]:
m.name = 'work_TOD_choice'

In [41]:
m.tags = ['work_TOD_choice','emma']
mm.register(m)

Saving 'work_TOD_choice.yaml': /home/emma/activitysynth/activitysynth/configs
Model saved to configs/work_TOD_choice-model-object.pkl
Registering model step 'work_TOD_choice'


## Prepare synthetic population data for TOD simulation

In [42]:
obs = pd.read_csv('/home/data/fall_2018/persons_w_jobs.csv')

In [43]:
# Synthetic-population inputs.
# NOTE(review): absolute paths on the original author's machine.
jobs = pd.read_csv('/home/data/fall_2018/jobs_v2.csv')

hh = pd.read_csv('/home/data/fall_2018/households_v2.csv')

buildings = pd.read_csv('/home/data/fall_2018/buildings_v2.csv')
parcels = pd.read_csv('/home/data/fall_2018/parcel_attr.csv')

# Attach parcel attributes to every building (buildings.parcel_id -> parcels.primary_id).
merge = buildings.merge(parcels,how = 'left',left_on='parcel_id', right_on='primary_id')

# Attach building/parcel attributes to jobs; the zone becomes the work zone.
jobs = jobs.merge(merge,on = 'building_id',how = 'left').rename(columns={'zone_id': 'zone_id_work'})

# Same for households; the zone becomes the home zone.
hh = hh.merge(merge,on = 'building_id',how = 'left').rename(columns={'zone_id': 'zone_id_home'})

In [44]:
obs = obs.merge(jobs,on = 'job_id',how = 'left')

obs = obs.merge(hh,on = 'household_id',how = 'left')

obs.head()

Unnamed: 0,person_id,member_id,age,primary_commute_mode,relate,edu,sex,hours,hispanic,earning,race_id,student,work_at_home,worker,household_id,node_id_small,node_id_walk,job_id,building_id_x,sector_id,occupation_id,parcel_id_x,development_type_id_x_x,improvement_value_x,residential_units_x,residential_sqft_x,sqft_per_unit_x,non_residential_sqft_x,building_sqft_x,nonres_rent_per_sqft_x,res_price_per_sqft_x,stories_x,year_built_x,redfin_sale_price_x,redfin_sale_year_x,redfin_home_type_x,costar_property_type_x,costar_rent_x,building_type_id_x,primary_id_x,...,recent_mover,block_group_id,single_family,unit_id,building_id_y,parcel_id_y,development_type_id_x_y,improvement_value_y,residential_units_y,residential_sqft_y,sqft_per_unit_y,non_residential_sqft_y,building_sqft_y,nonres_rent_per_sqft_y,res_price_per_sqft_y,stories_y,year_built_y,redfin_sale_price_y,redfin_sale_year_y,redfin_home_type_y,costar_property_type_y,costar_rent_y,building_type_id_y,primary_id_y,development_type_id_y_y,land_value_y,acres_y,county_id_y,zone_id_home,proportion_undevelopable_y,tax_exempt_status_y,apn_y,parcel_id_local_y,geom_id_y,imputation_flag_y,x_y,y_y,shape_area_y,block_id_y,node_id_y
0,0,1,47,1.0,0,22.0,1,40.0,1,85000.0,1,0,0,1,0,65468920.0,5125306000.0,413751.0,1834898.0,45.0,15.0,1569163.0,19.0,70552440.0,0.0,0.0,0.0,1212289.0,1212289.0,10.905878,0.0,3.0,1971.0,,,,Retail (Super Regional Mall),Negotiable,0.0,1569163.0,...,0,60855009011,False,1711366,409174.0,1546718.0,1.0,284448.87,6.0,15096.0,2516.0,0.0,15096.0,0.0,10.343517,2.0,1965.0,985000.0,2005.0,Single Family Residential,,,1.0,1546718.0,1.0,426851.13,0.142286,85.0,557.0,0.0,0.0,46724101,,15308930000000.0,"_, du_zonetarget",-121.882277,37.339126,575.809742,60855010000000.0,65430040.0
1,1,1,47,1.0,0,22.0,1,40.0,1,85000.0,1,0,0,1,670,286130700.0,5265128000.0,31870.0,177900.0,61.0,25.0,1462294.0,18.0,0.0,0.0,0.0,0.0,426880.0,426880.0,15.349687,0.0,1.0,1969.0,,,,Sports & Entertainment,-,0.0,1462294.0,...,0,60855003001,False,1595570,1292478.0,1392465.0,2.0,103204.535,5.0,5900.0,1180.0,2582.0,8482.0,0.0,193.929673,2.0,1888.0,,,,Office,-,3.0,1392465.0,2.0,516270.465,0.382178,85.0,539.0,0.0,0.0,25929029,,7680105000000.0,"_, cs_nrsqft, du_zonetarget",-121.90115,37.334387,1546.620624,60855000000000.0,286130700.0
2,2,1,47,1.0,0,22.0,1,40.0,1,85000.0,1,0,0,1,740,2279076000.0,286097000.0,1258351.0,1297992.0,61.0,25.0,1312083.0,18.0,42647700.0,0.0,0.0,0.0,147519.0,19200.0,19.564141,0.0,1.0,1931.0,,,,,,0.0,1312083.0,...,0,60855003001,False,1595613,1507170.0,1391986.0,1.0,0.0,1.0,1844.0,1844.367559,0.0,1844.367559,0.0,131.270024,1.0,1918.0,,,,,,1.0,1391986.0,1.0,0.0,4.038478,85.0,539.0,0.0,0.0,25912121,,2103531000000.0,"_, res_zone_yrblt, cnty_sq_du, stories1",-121.908472,37.343071,16343.153974,60855000000000.0,2279076000.0
3,3,1,47,1.0,0,22.0,1,40.0,1,85000.0,1,0,0,1,975,3969451000.0,65466730.0,343693.0,1579647.0,92.0,23.0,1546649.0,7.0,0.0,0.0,0.0,2046.0,9910.0,9910.0,23.095409,0.0,1.0,1930.0,185000.0,1995.0,Multi-Family (5+ Unit),,,10.0,1546649.0,...,0,60855008002,False,1596643,1293789.0,1392924.0,1.0,18997.8525,3.0,2985.0,995.0,0.0,2985.0,0.0,279.202661,1.0,2007.0,,,,,,1.0,1392924.0,1.0,10207.1475,0.072154,85.0,558.0,0.0,0.0,25947047,,5030014000000.0,"_, du_zonetarget",-121.899219,37.326159,291.998316,60855010000000.0,65466730.0
4,4,1,47,1.0,0,22.0,1,40.0,1,85000.0,1,0,0,1,977,65615510.0,65548530.0,447704.0,1382392.0,44.0,43.0,1718419.0,14.0,757614.4,0.0,0.0,0.0,23400.0,23400.0,9.961782,0.0,1.0,1979.0,,,,Industrial,-,7.0,1718419.0,...,0,60855008002,False,1595984,484870.0,1392836.0,2.0,82196.6886,28.0,33572.0,1199.0,0.0,33572.0,0.0,257.134271,1.0,2015.0,165000.0,1997.0,Multi-Family (2-4 Unit),,,3.0,1392836.0,2.0,126477.3114,0.094123,85.0,558.0,0.0,0.0,25945049,,11587670000000.0,"_, du_zonetarget",-121.898518,37.329283,380.903761,60855010000000.0,65473790.0


In [45]:
chooser_filters = ['worker == 1', 'work_at_home == 0']
query = ' and '.join(chooser_filters)

In [46]:
obs.index.name = 'obs_id'

# Keep only the rows passing the chooser filters, and only the raw attributes
# from which the model variables are derived.
raw_inputs = [
    'age', 'edu', 'sex', 'hours', 'hispanic', 'race_id',
    'income', 'persons', 'tenure',
    'sector_id',
]
obs = obs.query(query)[raw_inputs]

In [47]:
# Drop synthetic persons missing any of the raw model inputs.
required_inputs = [
    'age', 'edu', 'sex', 'hours', 'hispanic', 'race_id',
    'income', 'persons', 'tenure',
    'sector_id',
]
obs = obs.dropna(subset=required_inputs)

In [48]:
# Build, on the synthetic population, the explanatory variables that the
# estimated models reference by name.

# Household-income bands (lower bound included, upper bound excluded).
obs_income = obs['income']
obs['hh_inc_less75k'] = (obs_income < 75000).astype(int)
obs['hh_inc_75kless100k'] = ((obs_income >= 75000) & (obs_income < 100000)).astype(int)
obs['hh_inc_150kplus'] = (obs_income >= 150000).astype(int)
obs['hh_inc_150kless250k'] = ((obs_income >= 150000) & (obs_income < 250000)).astype(int)
obs['hh_inc_250kplus'] = (obs_income >= 250000).astype(int)

# Education groups from the `edu` code.
obs_edu = obs['edu']
obs['no_higher_ed'] = (obs_edu < 21).astype(int)
obs['lessGED_GED'] = ((obs_edu <= 16) | (obs_edu == 17)).astype(int)

obs['lessGED'] = (obs_edu < 16).astype(int)
obs['GED'] = np.where(obs_edu.isin([16,17]),1,0)
obs['someBach'] = np.where(obs_edu.isin([18,19]),1,0)
obs['Assoc'] = np.where(obs_edu.isin([20]),1,0)
obs['Bach'] = np.where(obs_edu.isin([21]),1,0)

# NOTE(review): assumes `sex` is coded 1/2 so that female becomes 0/1 -- confirm.
obs['female'] = obs['sex'] - 1

# The models refer to weekly hours as 'HOURS'.
obs = obs.rename(columns = {'hours':'HOURS'})

# minority here means race_id != 1; `hispanic` is not taken into account.
obs['white'] = np.where(obs['race_id'].isin([1.0]),1,0)
obs['asian'] = np.where(obs['race_id'].isin([4.0]),1,0)
obs['minority'] = np.where(obs['white'] == 1, 0, 1)

# Age bands [lower, upper).  Plain comparisons replace
# Series.between(..., inclusive=False) | (age == lower); the boolean
# `inclusive` argument is rejected by pandas >= 2.0.
obs_age_bands = [
    ('age_16less25', 16, 25),
    ('age_25less40', 25, 40),
    ('age_40less50', 40, 50),
    ('age_50less60', 50, 60),
]
for obs_age_name, obs_age_lower, obs_age_upper in obs_age_bands:
    obs[obs_age_name] = np.where(
        (obs['age'] >= obs_age_lower) & (obs['age'] < obs_age_upper), 1, 0)

obs['hh_size_1per'] = np.where(obs.persons.isin([1.0]),1,0)

obs['tenure_2'] = (obs['tenure'] == 2).astype(int)

In [49]:
# Industry-sector indicators from the job's sector_id:
# (column name, sector_id codes that switch it on).
# NOTE(review): sector_transport matches 48 only; if sector_id can also take
# 49, confirm whether it should be included.
obs_sector_groups = [
    ('sector_retail', [44, 45]),
    ('sector_healthcare', [62]),
    ('info', [51]),
    ('scitech', [54]),
    ('sector_mfg', [31, 32, 33]),
    ('sector_edu_serv', [61]),
    ('sector_oth_serv', [81]),
    ('sector_constr', [23]),
    ('sector_gov', [92]),
    ('finance', [52]),
    ('sector_transport', [48]),
]
for obs_sector_name, sector_codes in obs_sector_groups:
    obs[obs_sector_name] = obs['sector_id'].isin(sector_codes).astype(int)

In [50]:
#placeholder for number of vehicles
# range(0, 1) contains only the value 0, so the earlier
# np.random.choice(range(0,1), ...) draw could only ever return 0.  Assign the
# constant directly: the values are identical, the placeholder is explicit, and
# no random-number state is consumed.
# TODO: replace with a real zero-vehicle-household indicator when vehicle
# counts are available for the synthetic households.
obs['noveh'] = np.zeros(len(obs.index), dtype=int)

## Simulate TOD for Home-to-Work Trip End Times

In [51]:
@orca.table(cache=True)
def tripsA():
    """Return the synthetic population `obs` under the orca table name 'tripsA'.

    Same name as the table defined earlier for estimation (which returned
    `trips1`), so model steps configured with m.tables = ['tripsA'] read `obs`.
    NOTE(review): presumably this registration replaces the earlier one and
    clears its cached frame -- confirm against the orca documentation.
    """
    return obs

In [52]:
m = mm.get_step('work_TOD_choice')

In [53]:
m.run()

In [54]:
obs.TOD.value_counts()

1    1783725
2     922614
0     210783
3     107970
4      35904
Name: TOD, dtype: int64

## Simulate dwell time at work

In [55]:
m = mm.get_step('dwell_work')

In [56]:
# One indicator per simulated TOD code (0..4), in code order.
obs_tod_columns = ['TOD_3to6', 'TOD_6to9', 'TOD_9to1530', 'TOD_1530to1830', 'TOD_1830up']
for obs_tod_code, obs_tod_name in enumerate(obs_tod_columns):
    obs[obs_tod_name] = np.where(obs['TOD'].isin([obs_tod_code]), 1, 0)

In [57]:
m.run()

In [58]:
obs.dwell_work.value_counts()

4    911456
3    905473
2    597955
5    339888
1    306224
Name: dwell_work, dtype: int64

## Compare frequencies of H-W trip end times and work dwell times in actual and synthetic populations

In [59]:
trips1.TOD.value_counts(normalize=True)

1    0.609764
2    0.274926
0    0.072173
3    0.031879
4    0.011258
Name: TOD, dtype: float64

In [60]:
obs.TOD.value_counts(normalize=True)

1    0.582727
2    0.301410
0    0.068861
3    0.035273
4    0.011730
Name: TOD, dtype: float64

In [61]:
trips1.dwell_work.value_counts(normalize=True)

3    0.297943
4    0.290643
2    0.195675
5    0.112969
1    0.102770
Name: dwell_work, dtype: float64

In [62]:
obs.dwell_work.value_counts(normalize=True)

4    0.297765
3    0.295810
2    0.195347
5    0.111038
1    0.100041
Name: dwell_work, dtype: float64

In [75]:
# Joint share of arrival period x work dwell category in the survey sample
# (normalized over all cells), with readable row and column labels.
dwell_labels = {1:'0-4.5h', 2:'4.5-7.75h',3:'7.75-9h',4:'9-10.5h',5:'10.5+h'}
tod_labels = {0:'3-6am',1:'6-9am',2:'9am-3:30pm',3:'3:30-6:30pm',4:'6:30pm-3am'}

tod_dwell = (
    pd.crosstab(index=trips1["TOD"], columns=trips1["dwell_work"], normalize=True)
    .rename(columns=dwell_labels)
    .rename(index=tod_labels)
)

tod_dwell

dwell_work,0-4.5h,4.5-7.75h,7.75-9h,9-10.5h,10.5+h
TOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3-6am,0.002118,0.004904,0.018336,0.023798,0.023017
6-9am,0.028981,0.086886,0.196511,0.222872,0.074514
9am-3:30pm,0.057515,0.089227,0.075573,0.040907,0.011704
3:30-6:30pm,0.011537,0.013041,0.003957,0.001505,0.001839
6:30pm-3am,0.002619,0.001616,0.003567,0.00156,0.001895


In [74]:
# Joint share of arrival period x work dwell category in the synthetic
# population (normalized over all cells), labelled like the survey table.
syn_dwell_labels = {1:'0-4.5h', 2:'4.5-7.75h',3:'7.75-9h',4:'9-10.5h',5:'10.5+h'}
syn_tod_labels = {0:'3-6am',1:'6-9am',2:'9am-3:30pm',3:'3:30-6:30pm',4:'6:30pm-3am'}

syn_tod_dwell = (
    pd.crosstab(index=obs["TOD"], columns=obs["dwell_work"], normalize=True)
    .rename(columns=syn_dwell_labels)
    .rename(index=syn_tod_labels)
)

syn_tod_dwell

dwell_work,0-4.5h,4.5-7.75h,7.75-9h,9-10.5h,10.5+h
TOD,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3-6am,0.002309,0.004949,0.018818,0.022057,0.020727
6-9am,0.023975,0.076909,0.182807,0.224346,0.074691
9am-3:30pm,0.058283,0.097557,0.085253,0.047959,0.012358
3:30-6:30pm,0.012866,0.014255,0.00475,0.001804,0.001598
6:30pm-3am,0.002607,0.001677,0.004182,0.001598,0.001665


## Validate models

In [66]:
# Validation process
from scripts import validate

In [67]:
m = mm.get_step('work_TOD_choice')

In [68]:
validate.tp_rates(m)

AttributeError: 'SmallMultinomialLogitStep' object has no attribute 'probabilities'

In [None]:
predicted_choices = validate.get_predicted_choices(m)
pd.crosstab(m.choices.rename('observed'), predicted_choices, margins=True) # unnormalized

In [None]:
validate.model_crosstab(m)

In [None]:
import seaborn as sns; sns.heatmap(validate.model_crosstab(m))