In [1]:
from collections import OrderedDict
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
from urbansim_templates.models import SmallMultinomialLogitStep
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')

import pandas as pd
# import pandana as pdna
import time
import numpy as np
import matplotlib.pyplot as plt
import os
from functools import reduce

import scipy.stats as st
from scipy.stats import skewnorm

# import matplotlib
# matplotlib.style.use('ggplot')

%matplotlib inline

pd.options.display.max_columns = 80

## Load Data

In [4]:
trips = pd.read_csv('/home/emma/ual_model_workspace/spring-2019-models/notebooks-emma/HStrips_031219.csv')

In [6]:
trips.head()

Unnamed: 0.1,Unnamed: 0,HHPER,HHPERTRIP,origin,origin_dwell,origin_ST,origin_ET,trip_ET,TT,MODE
0,0,10320533,1032053000.0,home,13.75,17.75,7.5,7.55,0.05,shared
1,1,10320533,1032053000.0,school,8.616667,7.55,16.166667,17.75,1.583333,shared
2,2,10320534,1032053000.0,home,13.75,17.75,7.5,7.55,0.05,shared
3,3,10320534,1032053000.0,school,8.616667,7.55,16.166667,17.75,1.583333,shared
4,4,10320535,1032054000.0,home,14.833333,16.666667,7.5,7.55,0.05,shared


## Prepare TOD and Dwell columns

In [9]:
#select people who make both home-school and school-home trips:
tripsII = trips.groupby('HHPER').filter(lambda x: len(x) == 2)

In [11]:
#make sure all home-school trip rows are listed first
tripsIII = tripsII.sort_values(['HHPER','origin']).reset_index()

In [12]:
#move school-home trip info up into home-school trip rows

tripsIII['school_dwell'] = tripsIII.groupby('HHPER', group_keys=False).origin_dwell.shift(-1)
tripsIII['school_ST'] = tripsIII.groupby('HHPER', group_keys=False).origin_ST.shift(-1)
tripsIII['SH_trip_ST'] = tripsIII.groupby('HHPER', group_keys=False).origin_ET.shift(-1)
tripsIII['SH_trip_ET'] = tripsIII.groupby('HHPER', group_keys=False).trip_ET.shift(-1)
tripsIII['SH_TT'] = tripsIII.groupby('HHPER', group_keys=False).TT.shift(-1)
tripsIII['SH_mode'] = tripsIII.groupby('HHPER', group_keys=False).MODE.shift(-1)

In [13]:
tripsIII = tripsIII.groupby('HHPER').first().reset_index()

In [14]:
tripsIII.rename(columns = {'origin_dwell':'home_dwell','origin_ST':'home_ST','origin_ET':'HS_trip_ST',
                           'trip_ET':'HS_trip_ET','TT':'HS_TT','MODE':'HS_mode','TOD':'HS_TOD'},inplace = True)

In [20]:
tripsIII['HS_TOD1'] = (
    ((tripsIII.HS_trip_ET.between(3,7.75,inclusive = False)) | (tripsIII.HS_trip_ET==3))*1 +
    ((tripsIII.HS_trip_ET.between(7.75,8.5,inclusive = True)))*2 +
    ((tripsIII.HS_trip_ET.between(8.5,9.5,inclusive = False)) | (tripsIII.HS_trip_ET==9.5))*3 +
    ((tripsIII.HS_trip_ET.between(9.5,15.0,inclusive = False)) | (tripsIII.HS_trip_ET==15.0))*4 +
    ((tripsIII.HS_trip_ET>15.0))*5 +
    ((tripsIII.HS_trip_ET.between(0,3,inclusive = False)) | (tripsIII.HS_trip_ET==0))*5
)

tripsIII['HS_TOD1'] = tripsIII['HS_TOD1'] - 1

In [17]:
tripsIII['Sdwell'] = (
    ((tripsIII.school_dwell.between(0,3.5,inclusive = False)) | (tripsIII.school_dwell==0))*1 +
    ((tripsIII.school_dwell.between(3.5,6,inclusive = False)) | (tripsIII.school_dwell==3.5))*2 +
    ((tripsIII.school_dwell.between(6,8,inclusive = True)))*3 +
    ((tripsIII.school_dwell.between(8,10,inclusive = False)) | (tripsIII.school_dwell==10))*4 +
    ((tripsIII.school_dwell>10))*5
)

In [22]:
tripsIII['Sdwell'] = pd.to_numeric(tripsIII['Sdwell'])
tripsIII['HS_TOD1'] = pd.to_numeric(tripsIII['HS_TOD1'])

## Add the demographic variables

In [23]:
person = pd.read_csv('/home/data/CHTS_csv_format/data/Deliv_PER.csv')

person.head()

Unnamed: 0,SAMPN,PERNO,RELAT,GEND,AGE,AGEB,HISP,RACE1,RACE2,RACE3,RACE4,O_RACE,NTVTY,CNTRY,LIC,USER,TRANS,TPTYP1,TPTYP2,TPTYP3,TPTYP4,TPTYP5,TPTYP6,TPTYP7,O_TPTYP,CLIP1,CLIP2,CLIP3,COMP,MET,PASSTL,FLEX,EMPLY,WKSTAT,O_WKSTAT,JOBS,WLOC,WNAME,WCITY,WSTAT,...,HVLOG,PTRIPS,TOLLF,TOLLR1,TOLLR2,TOLLR3,TOLLR4,TOLLR5,TOLLR6,TOLLR7,TOLLR8,TOLLR9,TOLLR10,TOLLB1,TOLLB2,TOLLB3,TOLLB4,TOLLB5,TOLLB6,TOLLB7,TOLLB8,TOLLB9,TOLLB10,HOVL,NOGOWHY,NOGOWHY_O,InComplete,Moto_trip,WCTFIP,WTRACT,SCTFIP,STRACT,WPrimaryCity,WSTFIP,W2PrimaryCity,W2STFIP,SPrimaryCity,SSTFIP,PERWGT,EXPPERWGT
0,1031985,1,1,1,74,,2,1.0,,,,,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,2.0,1.0,,,,,,,...,1.0,2.0,3.0,,,,,,,,,,,,,,,,,,,,,2.0,,,,1.0,,,,,,,,,,,0.052086,17.647568
1,1031985,2,2,2,73,,2,1.0,,,,,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,2.0,1.0,,,,,,,...,1.0,2.0,3.0,,,,,,,,,,,,,,,,,,,,,2.0,,,,1.0,,,,,,,,,,,0.052086,17.647568
2,1032036,1,1,1,46,,2,1.0,,,,,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,1.0,,,1.0,1.0,HIDDEN,SAN DIEGO,CA,...,,5.0,3.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,73.0,17032.0,,,SAN DIEGO,6.0,,,,,1.223974,414.701494
3,1032036,2,2,2,47,,2,1.0,97.0,,,MULTI-RACIAL,1,,1.0,1.0,2.0,,,,,,,,,,,,,,2.0,2.0,2.0,3.0,,,,,,,...,,18.0,3.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,,,,,,,,,0.863473,292.558373
4,1032036,3,3,1,15,,2,1.0,97.0,,,MULTI-RACIAL,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,4.0,3.0,,,,,,,,,,,,,,,,,,,,,1.0,,,,1.0,,,73.0,17030.0,,,,,SAN DIEGO,6.0,0.941412,318.9651


In [26]:
person = person[['SAMPN','PERNO','GEND','AGE','HISP','RACE1','RACE2','RACE3','RACE4',
                 'HOURS','EDUCA']]

In [27]:
person['HHPER'] = person['SAMPN'].map(str) + person['PERNO'].map(str)

Unnamed: 0,SAMPN,PERNO,GEND,AGE,HISP,RACE1,RACE2,RACE3,RACE4,O_RACE,NTVTY,LIC,JOBS,HOURS,EDUCA,WSCHED,DISAB,INDUS,OCCUP,HHPER,female,white,black,native,asian,PI,immigrant,nolic
0,1031985,1,1,74,0,1.0,,,,,1,1.0,,,6,,0,,,10319851,0,1,0,0,0,0,0,0.0
1,1031985,2,2,73,0,1.0,,,,,1,1.0,,,6,,0,,,10319852,1,1,0,0,0,0,0,0.0
2,1032036,1,1,46,0,1.0,,,,,1,1.0,1.0,40.0,6,2.0,0,54.0,15.0,10320361,0,1,0,0,0,0,0,0.0
3,1032036,2,2,47,0,1.0,97.0,,,MULTI-RACIAL,1,1.0,,,6,,0,,,10320362,1,1,0,0,0,0,0,0.0
4,1032036,3,1,15,0,1.0,97.0,,,MULTI-RACIAL,1,,,,1,,0,,,10320363,0,1,0,0,0,0,0,


In [28]:
hh_df = pd.read_csv('/home/data/CHTS_csv_format/data/Deliv_HH.csv')[
  ['SAMPN','HHVEH','OWN','INCOM','HHSIZ']]

hh_df.head()

Unnamed: 0,SAMPN,HHVEH,HHBIC,OWN,INCOM,HHSIZ,rent
0,1031985,2,2,1,3,2,0
1,1032036,1,4,1,7,5,0
2,1032053,2,2,2,2,6,1
3,1032425,2,3,1,7,2,0
4,1032558,0,0,2,1,1,1


In [29]:
demo = person.merge(hh_df,on = 'SAMPN',how = 'left')

demo.head()

Unnamed: 0,SAMPN,PERNO,GEND,AGE,HISP,RACE1,RACE2,RACE3,RACE4,O_RACE,NTVTY,LIC,JOBS,HOURS,EDUCA,WSCHED,DISAB,INDUS,OCCUP,HHPER,female,white,black,native,asian,PI,immigrant,nolic,HHVEH,HHBIC,OWN,INCOM,HHSIZ,rent
0,1031985,1,1,74,0,1.0,,,,,1,1.0,,,6,,0,,,10319851,0,1,0,0,0,0,0,0.0,2,2,1,3,2,0
1,1031985,2,2,73,0,1.0,,,,,1,1.0,,,6,,0,,,10319852,1,1,0,0,0,0,0,0.0,2,2,1,3,2,0
2,1032036,1,1,46,0,1.0,,,,,1,1.0,1.0,40.0,6,2.0,0,54.0,15.0,10320361,0,1,0,0,0,0,0,0.0,1,4,1,7,5,0
3,1032036,2,2,47,0,1.0,97.0,,,MULTI-RACIAL,1,1.0,,,6,,0,,,10320362,1,1,0,0,0,0,0,0.0,1,4,1,7,5,0
4,1032036,3,1,15,0,1.0,97.0,,,MULTI-RACIAL,1,,,,1,,0,,,10320363,0,1,0,0,0,0,0,,1,4,1,7,5,0


## Get rid of null values

In [32]:
na_dict = {
    'GEND':[9],
    'AGE':[998,999],
    'HOURS':[998,999],
    'EDUCA':[8,9],
    'HHVEH':[98,99],
    'OWN':[7,8,9],
    'INCOM':[98,99],
    'HHSIZ':[98,99]
}

for col in na_dict:
    for vals in na_dict[col]:
        demo[col] = demo[col].replace(vals,np.nan)

In [None]:
demo = demo.dropna(subset = ['GEND', 'AGE', 'HOURS', 'EDUCA','HHVEH','OWN','INCOM','HHSIZ'])

In [36]:
tripsIII['HHPER'] = tripsIII['HHPER'].map(str)

trips1 = pd.merge(tripsIII, demo, on='HHPER')

print (len(tripsIII.index))
print (len(trips1.index))

9857
9857
8979


## Prepare data for use in MNL estimation (make dummy columns)

In [None]:
trips1['minority'] = np.where((trips1['HISP'].isin([1.0]) |
                                trips1['RACE1'].isin([2.0]) | trips1['RACE2'].isin([2.0]) | trips1['RACE3'].isin([2.0]) | trips1['RACE4'].isin([2.0]) |
                              trips1['RACE1'].isin([3.0]) | trips1['RACE2'].isin([3.0]) | trips1['RACE3'].isin([3.0]) | trips1['RACE4'].isin([3.0]) |
                              trips1['RACE1'].isin([4.0]) | trips1['RACE2'].isin([4.0]) | trips1['RACE3'].isin([4.0]) | trips1['RACE4'].isin([4.0]) |
                              trips1['RACE1'].isin([5.0]) | trips1['RACE2'].isin([5.0]) | trips1['RACE3'].isin([5.0]) | trips1['RACE4'].isin([5.0]) |
                              trips1['RACE1'].isin([97.0]) | trips1['RACE2'].isin([97.0]) | trips1['RACE3'].isin([97.0]) | trips1['RACE4'].isin([97.0])),1,0)

trips1['black'] = np.where((trips1['RACE1'].isin([2.0]) | trips1['RACE2'].isin([2.0]) | trips1['RACE3'].isin([2.0]) | trips1['RACE4'].isin([2.0])),1,0)
trips1['native'] = np.where((trips1['RACE1'].isin([3.0]) | trips1['RACE2'].isin([3.0]) | trips1['RACE3'].isin([3.0]) | trips1['RACE4'].isin([3.0])),1,0)
trips1['asian'] = np.where((trips1['RACE1'].isin([4.0]) | trips1['RACE2'].isin([4.0]) | trips1['RACE3'].isin([4.0]) | trips1['RACE4'].isin([4.0])),1,0)

In [40]:
trips1['hh_inc_less35k'] = np.where(trips1['INCOM'].isin([1.0,2.0,3.0]),1,0)
trips1['hh_inc_less50k'] = np.where(trips1['INCOM'].isin([1.0,2.0,3.0,4.0]),1,0)
trips1['hh_inc_150kplus'] = np.where(trips1['INCOM'].isin([8.0,9.0,10.0]),1,0)
trips1['hh_inc_150kless250k'] = np.where(trips1['INCOM'].isin([8.0,9.0]),1,0)
trips1['hh_inc_250kplus'] = np.where(trips1['INCOM'].isin([10.0]),1,0)

In [41]:
trips1['lessGED'] = np.where(trips1['EDUCA'].isin([1.0]),1,0)
trips1['GEDsomeBach'] = np.where(trips1['EDUCA'].isin([2.0,3.0]),1,0)

In [44]:
trips1['age_less5'] = np.where(((trips1.AGE.between(0,5,inclusive = False)) | (trips1.AGE==0)),1,0)
trips1['age_12less16'] = np.where(((trips1.AGE.between(12,16,inclusive = False)) | (trips1.AGE==12)),1,0)
trips1['age_16less19'] = np.where(((trips1.AGE.between(16,19,inclusive = False)) | (trips1.AGE==16)),1,0)
trips1['age_19less27'] = np.where(((trips1.AGE.between(19,27,inclusive = False)) | (trips1.AGE==19)),1,0)
trips1['age_27plus'] = np.where(((trips1.AGE.between(27,100,inclusive = False)) | (trips1.AGE==27)),1,0)

In [45]:
trips1['female'] = trips1['GEND'] - 1

trips1['tenure_2'] = trips1['OWN'] - 1

trips1['noveh'] = np.where(trips1.HHVEH.isin([0.0]),1,0)

trips1['hh_size_4plusper'] = np.where(trips1.HHSIZ.between(4,8,inclusive = True),1,0)

In [106]:
trips1['TOD_3to745'] = np.where(trips1['HS_TOD1'].isin([0]),1,0)
trips1['TOD_830to930'] = np.where(trips1['HS_TOD1'].isin([2]),1,0)
trips1['TOD_930to1500'] = np.where(trips1['HS_TOD1'].isin([3]),1,0)
trips1['TOD_1500up'] = np.where(trips1['HS_TOD1'].isin([4]),1,0)

## Estimate the model for Home-to-School Trip End Times

In [162]:
@orca.table(cache=True)
def tripsA():
    return trips1

In [163]:
m = SmallMultinomialLogitStep()
m.name = 'STOD_choice'
m.tables = ['tripsA']
m.choice_column = 'HS_TOD1'
m.model_expression = OrderedDict([
    ('intercept', [1,2,3,4]),
    
    ('less5',[0,2,3,4]),
#     ('5less12'),
    ('12less16',[0]),
    ('16less19',[0,3,4]),
    ('19less27',[0,2,3,4]),
    ('27plus',[2,3,4]),
    
    ('female',[0]),
    
    ('black',[3]),
    ('native',[2,3]),
    ('asian',[0,2,3]),
 
    ('less35k',[2]),
    ('150kless250k',[0]),
    ('250kplus',[0]),

    ('lessGED',[0,2,3,4]),
        
    ('noveh',[3]),

    ('4plusper',[3])
    
])

In [164]:
m.fit()

Log-likelihood at zero: -14,451.1430
Initial Log-likelihood: -14,451.1430
Estimation Time for Point Estimation: 0.81 seconds.
Final log-likelihood: -9,385.2482
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                     _chosen   No. Observations:                8,979
Model:             Multinomial Logit Model   Df Residuals:                    8,944
Method:                                MLE   Df Model:                           35
Date:                     Mon, 25 Mar 2019   Pseudo R-squ.:                   0.351
Time:                             16:32:43   Pseudo R-bar-squ.:               0.348
AIC:                            18,840.496   Log-Likelihood:             -9,385.248
BIC:                            19,089.089   LL-Null:                   -14,451.143
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
in

In [165]:
m.name = 'school_TOD'

In [166]:
mm.initialize()

Registering model step 'auto_ownership'
Registering model step 'dwell_work'
Registering model step 'TOD_choice'
Registering model step 'work_TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'school_dwell'
Registering model step 'WLCM'


In [167]:
m.tags = ['school_TOD','emma','test']
mm.register(m)

Saving 'school_TOD.yaml': /home/emma/ual_model_workspace/spring-2019-models/configs
Model saved to configs/school_TOD-model-object.pkl
Registering model step 'school_TOD'


## Estimate the model for dwell time at school

In [109]:
@orca.table(cache=True)
def tripsB():
    return trips1

In [157]:
m = SmallMultinomialLogitStep()
m.name = 'Sdwell_choice'
m.tables = ['tripsB']
m.choice_column = 'Sdwell'
m.model_expression = OrderedDict([
    ('intercept', [1,3,4,5]),
 
    ('TOD_3to745',[1,2,4,5]),
    ('TOD_830to930',[1,2]),
    ('TOD_930to1500',[1,2]),
    ('TOD_1500up',[1,2]),

    ('less5',[1,2,4,5]),
#     ('5less12',[1,3,4,5]),
    ('12less16',[1,2]),
    ('16less19',[1,4]),
    ('19less27',[1,2]),
    ('27plus',[1,2]),
    
    ('female',[4]),
    
    ('minority',[1]),

    ('less50k',[2,4]),
    ('150kplus',[2,4,5]),
    
    ('lessGED',[4,5]),
    ('GEDsomeBach',[[1,2]]),

    ('4plusper',[4])
    
])

In [158]:
m.fit()

Log-likelihood at zero: -14,451.1430
Initial Log-likelihood: -14,451.1430
Estimation Time for Point Estimation: 0.82 seconds.
Final log-likelihood: -9,352.9999
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                     _chosen   No. Observations:                8,979
Model:             Multinomial Logit Model   Df Residuals:                    8,942
Method:                                MLE   Df Model:                           37
Date:                     Mon, 25 Mar 2019   Pseudo R-squ.:                   0.353
Time:                             16:23:45   Pseudo R-bar-squ.:               0.350
AIC:                            18,780.000   Log-Likelihood:             -9,353.000
BIC:                            19,042.798   LL-Null:                   -14,451.143
                         coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------

In [159]:
m.name = 'school_dwell'

In [160]:
mm.initialize()

Registering model step 'auto_ownership'
Registering model step 'dwell_work'
Registering model step 'TOD_choice'
Registering model step 'work_TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM'


In [161]:
m.tags = ['school_dwell','emma','test']
mm.register(m)

Saving 'school_dwell.yaml': /home/emma/ual_model_workspace/spring-2019-models/configs
Model saved to configs/school_dwell-model-object.pkl
Registering model step 'school_dwell'


# Validate models

In [None]:
# Validation process
from scripts import validate

In [None]:
validate.tp_rates(m)

In [None]:
predicted_choices = validate.get_predicted_choices(m)
pd.crosstab(m.choices.rename('observed'), predicted_choices, margins=True) # unnormalized

In [None]:
validate.model_crosstab(m)

In [None]:
import seaborn as sns; sns.heatmap(validate.model_crosstab(m))