In [1]:
import pandas as pd
import numpy as np
import pickle
import json
import pylogit as pl
from collections import OrderedDict
from mocho_functions import *
import warnings
import copy
warnings.filterwarnings('ignore')

## Logit Functions

In [2]:
def long_form_data(mode_table, alt_attrs, generic_attrs):
    """
    generate long form data for logit model from mode table
    """
    basic_columns = ['group', 'alt', 'choice']
    keys = basic_columns + list(alt_attrs.keys()) + generic_attrs
    long_data_obj = {key: [] for key in keys}
    for rid, row in mode_table.iterrows():
        long_data_obj['group'] += [rid for i in range(4)]
        long_data_obj['alt'] += [0,1,2,3]
        mode_choice = [0, 0, 0, 0]
        mode_choice[int(row['mode'])] = 1
        long_data_obj['choice'] += mode_choice
        for alt_attr in alt_attrs:
            long_data_obj[alt_attr] += [row.get(row_attr, 0) for row_attr in alt_attrs[alt_attr]]
        for g_attr in generic_attrs:
            long_data_obj[g_attr] += [row[g_attr] for i in range(4)]
    long_data_df = pd.DataFrame.from_dict(long_data_obj)
    return long_data_df

def logit_spec(long_data_df, alt_attr_vars, generic_attrs=[], constant=True):
    """
    generate specification & varnames for pylogit
    """
    specifications = OrderedDict()
    names = OrderedDict()
    for var in alt_attr_vars:
        specifications[var] = [[0, 1, 2, 3]]
        names[var] = [var]
    for var in generic_attrs:
        specifications[var] = [1, 2, 3]
        names[var] = [var + ' for cycling', var + ' for walking', var+' for pt']
    if constant:
        specifications['intercept'] = [1, 2, 3]
        names['intercept'] = ['ASC for cycling', 'ASC for walking', 'ASC for pt']
    model = pl.create_choice_model(data = long_data_df,
                        alt_id_col="alt",
                        obs_id_col="group",
                        choice_col="choice",
                        specification=specifications,
                        model_type = "MNL",
                        names = names
    )
    numCoef = sum([len(specifications[s]) for s in specifications])
    return model, numCoef

def logit_est_disp(model, numCoef):
    """
    estimate a logit model and display results, using just_point=True in case of memory error
    """
    try:
        model.fit_mle(np.zeros(numCoef))
        print(model.get_statsmodels_summary())
        return {'just_point': False, 'model': model}
    except:
        mode_result = model.fit_mle(np.zeros(numCoef), just_point=True)
        ncs = int(long_data_df.shape[0]/4)
        ll0 = np.log(1/4) * ncs
        ll = -mode_result['fun']
        mcr = 1 - ll / ll0
        print('\n\nLogit model summary\n---------------------------')
        print('number of cases: ', ncs)
        print('Initial Log-likelihood: ', ll0)
        print('Final Log-likelihood: ', ll)
        print('McFadden R2: {:4.4}\n'.format(mcr))
        beta = mode_result['x']
        print('\nLogit model parameters:\n---------------------------')
        for varname, para in zip(model.ind_var_names, beta):
            print('{}: {:4.6f}'.format(varname, para))
        params = {varname: param for varname, param in zip(model.ind_var_names, beta)}
        return {'just_point': True, 'model': model, 'params': params, 'var_names': model.ind_var_names}
            
def utility_to_prob(v):
    v = v - v.mean()
    expV = np.exp(v)
    p = expV / expV.sum()
    return p
    
def unique_ele_and_keep_order(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]
    
def pylogit_pred(data_in, modelDict, customIDColumnName, even=True):
    """
    predict probabilities for logit model
    """
    data = data_in.copy()
    # fectch variable names and parameters 
    if modelDict['just_point']:
        params, varnames = modelDict['params'].values(), modelDict['params'].keys()
    else:
        params, varnames = list(modelDict['model'].coefs.values), list(modelDict['model'].coefs.index)
    # calc utilities
    data['utility'] = 0
    for varname, param in zip(varnames, params):
        data['utility'] += data[varname] * param
    # calc probabilities given utilities
    # if every choice situation has the same number of alternatives, use matrix, otherwise use list comprehension
    if even:
        numChoices = len(set(data[customIDColumnName]))
        v = np.array(data['utility']).copy().reshape(numChoices, -1)
        v = v - v.mean(axis=1, keepdims=True)  
        expV = np.exp(v)
        p = expV / expV.sum(axis=1, keepdims=True)
        return p.flatten()
    else:
        uniqueCustomIDs = unique_ele_and_keep_order(data[customIDColumnName])
        vArrayList = [np.array(data.loc[data[customIDColumnName]==id, 'utility']) for id in uniqueCustomIDs]
        pArrayList = [utility_to_prob(v) for v in vArrayList]
        return [pElement for pArray in pArrayList for pElement in pArray ]
    
def quasi_nested_logit_pred(sigma_nest, modelDict_nest, data, n_sample = 30, dummy='driving_like'):
    """
    predict probabilities using a standard logit with (nest dummy + random parameter)
    """
    modelDict_nest = copy.deepcopy(modelDict_asc_with_uber)
    normal01_samples = np.random.randn(n_sample)
    pred = np.zeros(data.shape[0])
    for normal01 in normal01_samples:
        modelDict_nest['params'][dummy] = normal01 * sigma_nest
        this_pred = pylogit_pred(data, modelDict_nest, customIDColumnName='group', even=True)
        pred += np.asarray(this_pred)
    pred /= n_sample
    pred = pred.reshape(-1,5)
    ap = pred.sum(axis=0) / pred.sum()   #aggregate prob
    print('Probabilities:\n-------------------------------')
    print('driving: {:4.4}\ncycling: {:4.4}\nwalk: {:4.4}\npt: {:4.4}\nuber: {:4.4}'.format(ap[0], ap[1], ap[2], ap[3], ap[4]))
    pred_uber_nest, ap_uber_nest = pred, ap

    # IIA: randomly select 3 samples and detect proportional changes on probs
    rows = np.random.choice(range(pred.shape[0]), size=3, replace=False)
    for row in rows:
        print('\nCheck for IIA (groupID = {}):\n------------------------------------------'.format(groupID[row]))
        print('             Base       +Uber       Change')
        print('%-8s %10.4f %10.4f %10.2f%%' % ('driving', pred_base[row,0], pred[row,0], 100*(pred[row,0]-pred_base[row,0])/pred_base[row,0]))
        print('%-8s %10.4f %10.4f %10.2f%%' % ('cycling', pred_base[row,1], pred[row,1], 100*(pred[row,1]-pred_base[row,1])/pred_base[row,1]))
        print('%-8s %10.4f %10.4f %10.2f%%' % ('walk', pred_base[row,2], pred[row,2], 100*(pred[row,2]-pred_base[row,2])/pred_base[row,2]))
        print('%-8s %10.4f %10.4f %10.2f%%' % ('pt', pred_base[row,3], pred[row,3], 100*(pred[row,3]-pred_base[row,3])/pred_base[row,3]))
        print('%-8s %10.4f %10.4f %10.2f%%' % ('uber', 0, pred[row,4], 100*(pred[row,4]-0)/0))

## General Data Preparing 

In [3]:
nhts_per=pd.read_csv(NHTS_PATH) # for mode choice on main mode
nhts_tour=pd.read_csv(NHTS_TOUR_PATH) # for mode speeds
nhts_trip=pd.read_csv(NHTS_TRIP_PATH) # for motifs

# add unique ids and merge some variables across the 3 tables
nhts_trip['uniquePersonId']=nhts_trip.apply(lambda row: str(row['HOUSEID'])+'_'+str(row['PERSONID']), axis=1)
nhts_per['uniquePersonId']=nhts_per.apply(lambda row: str(row['HOUSEID'])+'_'+str(row['PERSONID']), axis=1)
nhts_tour['uniquePersonId']=nhts_tour.apply(lambda row: str(row['HOUSEID'])+'_'+str(row['PERSONID']), axis=1)

# Some lookups
nhts_tour=nhts_tour.merge(nhts_per[['HOUSEID', 'HH_CBSA']], on='HOUSEID', how='left')
nhts_trip=nhts_trip.merge(nhts_per[['uniquePersonId', 'R_RACE']], on='uniquePersonId', how='left')

tables={'trips': nhts_trip, 'persons': nhts_per, 'tours': nhts_tour}
for t in ['trips', 'persons']:
# remove some records
    tables[t]=tables[t].loc[((tables[t]['CDIVMSAR'].isin(region_cdivsmars))&
                             (tables[t]['URBAN']==1))]
    tables[t]=tables[t].loc[tables[t]['R_AGE_IMP']>15]
    tables[t]['income']=tables[t].apply(lambda row: income_cat_nhts(row), axis=1)
    tables[t]['age']=tables[t].apply(lambda row: age_cat_nhts(row), axis=1)
    tables[t]['children']=tables[t].apply(lambda row: children_cat_nhts(row), axis=1)
    tables[t]['workers']=tables[t].apply(lambda row: workers_cat_nhts(row), axis=1)
    tables[t]['tenure']=tables[t].apply(lambda row: tenure_cat_nhts(row), axis=1)
    tables[t]['sex']=tables[t].apply(lambda row: sex_cat_nhts(row), axis=1)
    tables[t]['bach_degree']=tables[t].apply(lambda row: bach_degree_cat_nhts(row), axis=1)
    tables[t]['cars']=tables[t].apply(lambda row: cars_cat_nhts(row), axis=1)
    tables[t]['race']=tables[t].apply(lambda row: race_cat_nhts(row), axis=1)
    tables[t]=tables[t].rename(columns= {'HTPPOPDN': 'pop_per_sqmile_home'})

speeds={area:{} for area in set(tables['persons']['HH_CBSA'])}
tables['tours']['main_mode']=tables['tours'].apply(lambda row: mode_cat(row['MODE_D']), axis=1)

for area in speeds:
    this_cbsa=tables['tours'][tables['tours']['HH_CBSA']==area]
    for m in [0,1,2, 3]:
        all_speeds=this_cbsa.loc[((this_cbsa['main_mode']==m) & 
                                  (this_cbsa['TIME_M']>0))].apply(
                                    lambda row: row['DIST_M']/row['TIME_M'], axis=1)
        if len(all_speeds)>0:
            speeds[area]['km_per_minute_'+str(m)]=1.62* all_speeds.mean()
        else:
            speeds[area]['km_per_minute_'+str(m)]=float('nan')
    speeds[area]['walk_km_'+str(m)]=1.62*this_cbsa.loc[this_cbsa['main_mode']==3,'PMT_WALK'].mean()
    speeds[area]['drive_km_'+str(m)]=1.62*this_cbsa.loc[this_cbsa['main_mode']==3,'PMT_POV'].mean()

# for any region where a mode is not observed at all, assume the speed of that mode is that of the slowest region
for area in speeds:
    for mode_speed in speeds[area]:
        if not float(speeds[area][mode_speed]) == float(speeds[area][mode_speed]):
            speeds[area][mode_speed] = np.nanmin([speeds[other_area][mode_speed] for other_area in speeds])
            
# logit model attributes
alt_attrs = {'time_minutes': ['drive_time_minutes', 'cycle_time_minutes', 'walk_time_minutes', 'PT_time_minutes'], 
                 'walk_time_PT_minutes': ['nan', 'nan', 'nan', 'walk_time_PT_minutes'], 
                 'drive_time_PT_minutes': ['nan', 'nan', 'nan', 'drive_time_PT_minutes']}
generic_attrs = ['income_gt100', 'income_gt35-lt100', 'income_lt35', 'age_19 and under',
        'age_20 to 35', 'age_35 to 60', 'age_above 60', 'children_no', 'children_yes', 'workers_none', 
        'workers_one', 'workers_two or more', 'tenure_other', 'tenure_owned', 'tenure_rented', 
        'sex_female', 'sex_male', 'bach_degree_no', 'bach_degree_yes', 'cars_none', 'cars_one',
        'cars_two or more', 'race_asian', 'race_black', 'race_other', 'race_white', 
        'pop_per_sqmile_home', 'network_dist_km']
# categorial variables: to be normalized as reference level
exclude_generic_attrs = ['income_gt100', 'age_19 and under', 'children_no', 'workers_none',
    'tenure_other', 'sex_female', 'bach_degree_no', 'cars_none', 'race_asian']

## Main mode model 

This is the data we are using in the latest RF model, i.e. only cases concerning main mode are included.  
Three models are displayed here:  
(1) model with all regressors appeared in RF (full model), the result is too messy. Skip it for simplicity.  
(2) model with only time_minutes and ASCs. It is obviously problematic as the sign of time_minutes is unexpectedly positive.  
(3) model with only time_minutes, no ASCs, the sign of time_minutes becomes negative.   

It is unlucky for main mode data to get unexpected result in model(2), so all mode data will be tried in the next part.  
However, if we are going to use RF as the main model, and logit as a supplemental model to further draw people from driving/cycling/walk/pt to a similar new mode, model (3) is already adequate, as we can neither estimate nor make reasonable assumptions for ASCs in model (1)/(2).

### Data Preparing

In [4]:
# Only use weekdays for motifs
nhts_trip=nhts_trip.loc[nhts_trip['TRAVDAY'].isin(range(2,7))]

# with the person table only
tables['persons']['network_dist_km']=tables['persons'].apply(lambda row: get_main_dist_km(row), axis=1)
tables['persons']['mode']=tables['persons'].apply(lambda row: get_main_mode(row), axis=1) 

# For the urpose of main mode choice modelling, remove all records with no work transport mode or a distance of 0 to work
tables['persons']=tables['persons'].loc[((tables['persons']['mode']>=0) & (
        (tables['persons']['network_dist_km']>=0)))]

# create the mode choice table
mode_table=pd.DataFrame()
#    add the trip stats for each potential mode
mode_table['drive_time_minutes']=tables['persons'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(0)], axis=1)
mode_table['cycle_time_minutes']=tables['persons'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(1)], axis=1)
mode_table['walk_time_minutes']=tables['persons'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(2)], axis=1)
mode_table['PT_time_minutes']=tables['persons'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(3)], axis=1)
mode_table['walk_time_PT_minutes']=tables['persons'].apply(lambda row: speeds[row['HH_CBSA']]['walk_km_'+str(3)]/speeds[row['HH_CBSA']]['km_per_minute_'+str(2)], axis=1)
mode_table['drive_time_PT_minutes']=tables['persons'].apply(lambda row: speeds[row['HH_CBSA']]['drive_km_'+str(3)]/speeds[row['HH_CBSA']]['km_per_minute_'+str(0)], axis=1)

for col in ['income', 'age', 'children', 'workers', 'tenure', 'sex', 
            'bach_degree',  'cars', 'race']:
    new_dummys=pd.get_dummies(tables['persons'][col], prefix=col)
    mode_table=pd.concat([mode_table, new_dummys],  axis=1)

for col in [ 'pop_per_sqmile_home', 'network_dist_km', 'mode']:
    mode_table[col]=tables['persons'][col]
    
long_data_df = long_form_data(mode_table, alt_attrs, generic_attrs)

### Full Model 

Coefficient for time_minutes is POSITIVE!

In [5]:
alt_attr_vars = list(alt_attrs.keys())
generic_attrs = [var for var in generic_attrs if var not in exclude_generic_attrs]
model, numCoefs = logit_spec(long_data_df.copy(), alt_attr_vars, generic_attrs, constant=True)
modelDict = logit_est_disp(model, numCoefs)

Log-likelihood at zero: -5,790.5515
Initial Log-likelihood: -5,790.5515
Estimation Time for Point Estimation: 1.53 seconds.
Final log-likelihood: -1,118.7059
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                      choice   No. Observations:                4,177
Model:             Multinomial Logit Model   Df Residuals:                    4,114
Method:                                MLE   Df Model:                           63
Date:                     Wed, 11 Mar 2020   Pseudo R-squ.:                   0.807
Time:                             15:02:20   Pseudo R-bar-squ.:               0.796
AIC:                             2,363.412   Log-Likelihood:             -1,118.706
BIC:                             2,762.665   LL-Null:                    -5,790.552
                                      coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------

### Model with Only Time and ASCs 

Coefficient for time_minutes is still POSITIVE, so this happens to be an inherent problem for main mode dataset.

In [6]:
alt_attr_vars = ['time_minutes']
generic_attrs = []
model, numCoefs = logit_spec(long_data_df.copy(), alt_attr_vars, generic_attrs, constant=True)
modelDict = logit_est_disp(model, numCoefs)

Log-likelihood at zero: -5,790.5515
Initial Log-likelihood: -5,790.5515
Estimation Time for Point Estimation: 0.08 seconds.
Final log-likelihood: -1,528.5265
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                      choice   No. Observations:                4,177
Model:             Multinomial Logit Model   Df Residuals:                    4,173
Method:                                MLE   Df Model:                            4
Date:                     Wed, 11 Mar 2020   Pseudo R-squ.:                   0.736
Time:                             15:02:34   Pseudo R-bar-squ.:               0.735
AIC:                             3,065.053   Log-Likelihood:             -1,528.527
BIC:                             3,090.402   LL-Null:                    -5,790.552
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
ti

### Model with Only Time

Coefficient for time_minutes becomes NEGATIVE!

In [7]:
alt_attr_vars = ['time_minutes']
generic_attrs = []
model, numCoefs = logit_spec(long_data_df.copy(), alt_attr_vars, generic_attrs, constant=False)
modelDict = logit_est_disp(model, numCoefs)

Log-likelihood at zero: -5,790.5515
Initial Log-likelihood: -5,790.5515
Estimation Time for Point Estimation: 0.09 seconds.
Final log-likelihood: -4,686.3927
                     Multinomial Logit Model Regression Results                    
Dep. Variable:                      choice   No. Observations:                4,177
Model:             Multinomial Logit Model   Df Residuals:                    4,176
Method:                                MLE   Df Model:                            1
Date:                     Wed, 11 Mar 2020   Pseudo R-squ.:                   0.191
Time:                             15:02:40   Pseudo R-bar-squ.:               0.191
AIC:                             9,374.785   Log-Likelihood:             -4,686.393
BIC:                             9,381.123   LL-Null:                    -5,790.552
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
time_min

## All mode model

All mode dataset is considered for: (1) main mode only dataset will lead to problematic coefficient for time_minutes when ASCs are included; (2) all kinds of activites are simulated in our simulations.  

As a result, the coefficient of time_minutes is always negative in spite of ASCs. 

### Data Preparing 

In [8]:
tables['trips']['network_dist_km']=tables['trips'].apply(lambda row: row['TRPMILES']*1.62, axis=1)
tables['trips']['mode']=tables['trips'].apply(lambda row: mode_cat(row['TRPTRANS']), axis=1) 
tables['trips']=tables['trips'].loc[tables['trips']['mode']>=0]                                 #get rid of some samples with -99 mode
tables['trips'].loc[tables['trips']['TRPMILES']<0, 'TRPMILES']=0 # -9 for work-from-home   

# create the mode choice table
mode_table=pd.DataFrame()
mode_table['drive_time_minutes']=tables['trips'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(0)], axis=1)
mode_table['cycle_time_minutes']=tables['trips'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(1)], axis=1)
mode_table['walk_time_minutes']=tables['trips'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(2)], axis=1)
mode_table['PT_time_minutes']=tables['trips'].apply(lambda row: row['network_dist_km']/speeds[row['HH_CBSA']]['km_per_minute_'+str(3)], axis=1)
mode_table['walk_time_PT_minutes']=tables['trips'].apply(lambda row: speeds[row['HH_CBSA']]['walk_km_'+str(3)]/speeds[row['HH_CBSA']]['km_per_minute_'+str(2)], axis=1)
mode_table['drive_time_PT_minutes']=tables['trips'].apply(lambda row: speeds[row['HH_CBSA']]['drive_km_'+str(3)]/speeds[row['HH_CBSA']]['km_per_minute_'+str(0)], axis=1)

for col in ['income', 'age', 'children', 'workers', 'tenure', 'sex', 
            'bach_degree',  'cars', 'race']:
    new_dummys=pd.get_dummies(tables['trips'][col], prefix=col)
    mode_table=pd.concat([mode_table, new_dummys],  axis=1)

for col in [ 'pop_per_sqmile_home', 'network_dist_km', 'mode']:
    mode_table[col]=tables['trips'][col]
    
long_data_df = long_form_data(mode_table, alt_attrs, generic_attrs=[])

### Model with Only Time and ASCs

In [9]:
alt_attr_vars = ['time_minutes']
generic_attrs = []
model, numCoefs = logit_spec(long_data_df.copy(), alt_attr_vars, generic_attrs, constant=True)
modelDict = logit_est_disp(model, numCoefs)
modelDict_asc = modelDict

Log-likelihood at zero: -42,226.5262
Initial Log-likelihood: -42,226.5262
Estimation Time for Point Estimation: 1.19 seconds.
Final log-likelihood: -12,460.9377


Logit model summary
---------------------------
number of cases:  30460
Initial Log-likelihood:  -42226.526239711864
Final Log-likelihood:  -12460.93772066386
McFadden R2: 0.7049


Logit model parameters:
---------------------------
time_minutes: -0.092120
ASC for cycling: -3.528735
ASC for walking: -0.167859
ASC for pt: -4.534879


### Model with Only Time

In [10]:
alt_attr_vars = ['time_minutes']
generic_attrs = []
model, numCoefs = logit_spec(long_data_df.copy(), alt_attr_vars, generic_attrs, constant=False)
modelDict = logit_est_disp(model, numCoefs)

Log-likelihood at zero: -42,226.5262
Initial Log-likelihood: -42,226.5262
Estimation Time for Point Estimation: 0.74 seconds.
Final log-likelihood: -31,954.0685


Logit model summary
---------------------------
number of cases:  30460
Initial Log-likelihood:  -42226.526239711864
Final Log-likelihood:  -31954.068530410215
McFadden R2: 0.2433


Logit model parameters:
---------------------------
time_minutes: -0.087122


## Substitution Patterns

If we are going to completely switch to logit from RF, we could use a quasi-nested-logit to avoid red/blue bus problem.  

A quasi-nested-logit is actually using a simple mixed logit to imitate nested logit: set a dummy variable for each nest, alts in this nest have the value 1, and other alts have the value 0. Coefficients for these dummies are set to normally distributed with mean=0. Now we have the utility defined as, $$U=\beta*x+\alpha*d+\epsilon$$where x and $\beta$ are ordinary regressors and their fixed coefficients, d and $\alpha$ are nest dummies and their random coefficients, $\epsilon$ are i.i.d. random errors.  
For any two alts, alt n in nest i, and alt m in nest j, the covariance of their unobserved utilities are defined as:
$$E[(\alpha*d_m+\epsilon_m)(\alpha*d_n+\epsilon_n)] = d_m*d_n*E(\alpha^2) = d_m*d_n*(D(\alpha)-E(\alpha)^2)=d_m*d_n*\Sigma_{ij}$$where $\Sigma$ is the diagonal variance-covariance matrix for $\alpha$.  
If alts m and n are from the same nest (i=j), we will have $E[(\alpha*d_m+\epsilon_m)(\alpha*d_n+\epsilon_n)]=\Sigma_{ii}>0$  
If alts m and n are from different nests, we will have $E[(\alpha*d_m+\epsilon_m)(\alpha*d_n+\epsilon_n)]=0$  
By this way, we can generate the similar correlation pattern as nested logit. When applied to scenarios where brand new alts are introduced, we just need to set values for the diagonal elements on $\Sigma$, which represent our assumptions about the degree of correlation among the alts in the same nest.  

Follwing is an example to elaborate this approach. We randomly sampled 100 choice scenarios from all mode dataset for quick prediction, and the starting model is the standard logit model with time_minutes and ASCs. Supposing the 5th alt, uber, is now introduced, and we believe it is pretty similar as "driving", and thus shall mainly draw people from driving instead of other 3 alts. The standard logit and quasi-nested logit are directly applied to check the substitution pattern. Generally we can check the substitution pattern by calculating elasticity, but it would be weird here, since we are introducing a new alt rather than change some attribute by certain percentage. Therefore, another way is usd, i.e., observing how choice probabilities shift from the base scenario to the new scenario for an individual choice maker.


In both models, it is assumed that $time(uber)=time(driving)+5$, and ASC for uber equals with ASC for driving. As we can see, the standard logit model will yield a proportional substitution pattern, which is expected due to IIA.  

For the quasi-nested logit model, a dummy variable named "driving-like" is added for the {driving, uber}, as if we are specify a nest structure. For driving and uber, this variable is set to 1, and 0 for cycling, walk, and pt. TThe coefficient of "driving-like" is normally distributed, and thus we need an extra parameter, i.e., the standard deviation $\sigma$ for that random coefficient. We can arbitrarily set the value for $\sigma$, and higher $\sigma$ represents we are assuming larger correlation between driving and uber. When predicting, several (50 in the following experiments) random numbers are sampled from standard normal distribution, each of them is multiplied by $\sigma$ to generate one sample of the coefficient of "driving-like", and then we can easily get one version of predicted probabilities through standard logit calculations. The final prediction would be the mean over all versions. As we can see, this approcah is able to yield disproportional substitution patterns, where uber will draw more people from driving than cycling/walk/pt, and higher $\sigma$ will lead to more unbalanced patterns. 

Although the results of quasi-nested logit seem to be more realistic, it cannot reproduce a "perfect substitution pattern", where uber users are 100% drawn from driving.

### Data Preparing 

In [11]:
n = 100
np.random.seed(0)
use_idx = np.random.choice(list(set(long_data_df['group'])), n, replace=False)
data = long_data_df.loc[long_data_df['group'].isin(use_idx)]
const_cycling, const_walk, const_pt = [], [], []
for i in range(100): 
    const_cycling.extend([0, 1, 0, 0])
    const_walk.extend([0, 0, 1, 0])
    const_pt.extend([0, 0, 0, 1])
data['ASC for cycling'] = const_cycling
data['ASC for walking'] = const_walk
data['ASC for pt'] = const_pt
data.head(8)

Unnamed: 0,group,alt,choice,time_minutes,walk_time_PT_minutes,drive_time_PT_minutes,ASC for cycling,ASC for walking,ASC for pt
484,4672,0,1,3.803317,0.0,0.0,0,0,0
485,4672,1,0,12.752649,0.0,0.0,1,0,0
486,4672,2,0,18.205761,0.0,0.0,0,1,0
487,4672,3,0,3.731273,0.301044,1.053405,0,0,1
1892,12472,0,1,12.41042,0.0,0.0,0,0,0
1893,12472,1,0,41.61256,0.0,0.0,1,0,0
1894,12472,2,0,59.406348,0.0,0.0,0,1,0
1895,12472,3,0,12.17534,0.301044,1.053405,0,0,1


### Mode Share: Base

In [12]:
pred = pylogit_pred(data, modelDict_asc, customIDColumnName='group', even=True)
pred = np.asarray(pred).reshape(-1,4)
ap = pred.sum(axis=0) / pred.sum()  #aggregate prob
print('Probabilities:\n-------------------------------')
print('driving: {:4.4}\ncycling: {:4.4}\nwalk: {:4.4}\npt: {:4.4}'.format(ap[0], ap[1], ap[2], ap[3]))
pred_base, ap_base = pred, ap

Probabilities:
-------------------------------
driving: 0.8683
cycling: 0.006099
walk: 0.1147
pt: 0.01097


### Mode Share: Introducing Uber

The time_minutes of uber is set as time_minutes for driving plus 5 mins. 

In [13]:
groupID = list(set(data['group']))
groupID.sort()
uberID = [4 for i in range(n)]
additional_time_minutes = 5
driving_time = np.array(data.sort_values(by=['group', 'alt']).loc[data['alt']==0, 'time_minutes'])
uber = {'group':groupID, 'alt': uberID, 'time_minutes': driving_time+additional_time_minutes, 'walk_time_PT_minutes':0, 
       'drive_time_PT_minutes': 0,'ASC for cycling': 0, 'ASC for walking': 0, 'ASC for pt': 0, 
        'ASC for uber': 1, 'choice': 0}
tmp = data.copy()
tmp['ASC for uber'] = 0
data_uber = pd.concat([tmp, pd.DataFrame(uber)]).sort_values(by=['group', 'alt'])
data_uber = data_uber[['group', 'alt', 'choice', 'time_minutes', 'drive_time_PT_minutes', 'walk_time_PT_minutes',
                      'ASC for cycling', 'ASC for walking', 'ASC for pt', 'ASC for uber']]
data_uber.head(10)

Unnamed: 0,group,alt,choice,time_minutes,drive_time_PT_minutes,walk_time_PT_minutes,ASC for cycling,ASC for walking,ASC for pt,ASC for uber
484,4672,0,1,3.803317,0.0,0.0,0,0,0,0
485,4672,1,0,12.752649,0.0,0.0,1,0,0,0
486,4672,2,0,18.205761,0.0,0.0,0,1,0,0
487,4672,3,0,3.731273,1.053405,0.301044,0,0,1,0
0,4672,4,0,8.803317,0.0,0.0,0,0,0,1
1892,12472,0,1,12.41042,0.0,0.0,0,0,0,0
1893,12472,1,0,41.61256,0.0,0.0,1,0,0,0
1894,12472,2,0,59.406348,0.0,0.0,0,1,0,0
1895,12472,3,0,12.17534,1.053405,0.301044,0,0,1,0
1,12472,4,0,17.41042,0.0,0.0,0,0,0,1


The ASC for uber in the standard logit model is set the same as the ASC for driving, which is 0 as the reference alternative.  
We observed a proportional substitution pattern through the shift of probabilities of 3 randomly selected individuals.

In [14]:
modelDict_asc_with_uber = copy.deepcopy(modelDict_asc)
uber_ASC = 0       #the same as driving ASC
modelDict_asc_with_uber['params']['ASC for uber'] = uber_ASC
pred = pylogit_pred(data_uber, modelDict_asc_with_uber, customIDColumnName='group', even=True)
pred = np.asarray(pred).reshape(-1,5)
ap = pred.sum(axis=0) / pred.sum()   #aggregate prob
print('Probabilities:\n-------------------------------')
print('driving: {:4.4}\ncycling: {:4.4}\nwalk: {:4.4}\npt: {:4.4}\nuber: {:4.4}'.format(ap[0], ap[1], ap[2], ap[3], ap[4]))
pred_uber, ap_uber = pred, ap

# IIA: randomly select 3 samples and detect proportional changes on probs
rows = np.random.choice(range(pred.shape[0]), size=3, replace=False)
for row in rows:
    print('\nCheck for IIA (groupID = {}):\n------------------------------------------'.format(groupID[row]))
    print('             Base       +Uber       Change')
    print('%-8s %10.4f %10.4f %10.2f%%' % ('driving', pred_base[row,0], pred[row,0], 100*(pred[row,0]-pred_base[row,0])/pred_base[row,0]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('cycling', pred_base[row,1], pred[row,1], 100*(pred[row,1]-pred_base[row,1])/pred_base[row,1]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('walk', pred_base[row,2], pred[row,2], 100*(pred[row,2]-pred_base[row,2])/pred_base[row,2]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('pt', pred_base[row,3], pred[row,3], 100*(pred[row,3]-pred_base[row,3])/pred_base[row,3]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('uber', 0, pred[row,4], 100*(pred[row,4]-0)/0))

Probabilities:
-------------------------------
driving: 0.5572
cycling: 0.00416
walk: 0.0801
pt: 0.00703
uber: 0.3515

Check for IIA (groupID = 219726):
------------------------------------------
             Base       +Uber       Change
driving      0.6095     0.4402     -27.77%
cycling      0.0145     0.0105     -27.77%
walk         0.3694     0.2668     -27.77%
pt           0.0066     0.0047     -27.77%
uber         0.0000     0.2777        inf%

Check for IIA (groupID = 836001):
------------------------------------------
             Base       +Uber       Change
driving      0.8954     0.5722     -36.10%
cycling      0.0017     0.0011     -36.10%
walk         0.0933     0.0596     -36.10%
pt           0.0097     0.0062     -36.10%
uber         0.0000     0.3610        inf%

Check for IIA (groupID = 198610):
------------------------------------------
             Base       +Uber       Change
driving      0.9848     0.6074     -38.32%
cycling      0.0050     0.0031     -38.32%
wal

# Mode Share: Introducing Uber + Quasi-Nested Specification

Adding "driving-like" dummy variable to imitate the nest of {driving, uber}.

In [15]:
data_uber_nest = data_uber.copy()
data_uber_nest['driving_like'] = 0
data_uber_nest.loc[data_uber_nest['alt'].isin([0,4]), 'driving_like'] = 1
data_uber_nest.head(10)

Unnamed: 0,group,alt,choice,time_minutes,drive_time_PT_minutes,walk_time_PT_minutes,ASC for cycling,ASC for walking,ASC for pt,ASC for uber,driving_like
484,4672,0,1,3.803317,0.0,0.0,0,0,0,0,1
485,4672,1,0,12.752649,0.0,0.0,1,0,0,0,0
486,4672,2,0,18.205761,0.0,0.0,0,1,0,0,0
487,4672,3,0,3.731273,1.053405,0.301044,0,0,1,0,0
0,4672,4,0,8.803317,0.0,0.0,0,0,0,1,1
1892,12472,0,1,12.41042,0.0,0.0,0,0,0,0,1
1893,12472,1,0,41.61256,0.0,0.0,1,0,0,0,0
1894,12472,2,0,59.406348,0.0,0.0,0,1,0,0,0
1895,12472,3,0,12.17534,1.053405,0.301044,0,0,1,0,0
1,12472,4,0,17.41042,0.0,0.0,0,0,0,1,1


Predicting probabilities using quasi-nested logit model.  
The random coefficient of "driving-like" is normally distribute with mean=0 and std=sigma_nest=0.5. 50 samples are generated from this distribution to get 50 versions of predicted probabilities. The finally prediction is the mean result.  
We can observe an disproportional substitution pattern.

In [16]:
sigma_nest = 0.5
n_sample = 50
np.random.seed(1)

modelDict_nest = copy.deepcopy(modelDict_asc_with_uber)
normal01_samples = np.random.randn(n_sample)
pred = np.zeros(data_uber_nest.shape[0])

for normal01 in normal01_samples:
    modelDict_nest['params']['driving_like'] = normal01 * sigma_nest
    this_pred = pylogit_pred(data_uber_nest, modelDict_nest, customIDColumnName='group', even=True)
    pred += np.asarray(this_pred)
pred /= n_sample
pred = pred.reshape(-1,5)
ap = pred.sum(axis=0) / pred.sum()   #aggregate prob
print('Probabilities:\n-------------------------------')
print('driving: {:4.4}\ncycling: {:4.4}\nwalk: {:4.4}\npt: {:4.4}\nuber: {:4.4}'.format(ap[0], ap[1], ap[2], ap[3], ap[4]))
pred_uber_nest, ap_uber_nest = pred, ap

# IIA: randomly select 3 samples and detect proportional changes on probs
rows = np.random.choice(range(pred.shape[0]), size=3, replace=False)
for row in rows:
    print('\nCheck for IIA (groupID = {}):\n------------------------------------------'.format(groupID[row]))
    print('             Base       +Uber       Change')
    print('%-8s %10.4f %10.4f %10.2f%%' % ('driving', pred_base[row,0], pred[row,0], 100*(pred[row,0]-pred_base[row,0])/pred_base[row,0]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('cycling', pred_base[row,1], pred[row,1], 100*(pred[row,1]-pred_base[row,1])/pred_base[row,1]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('walk', pred_base[row,2], pred[row,2], 100*(pred[row,2]-pred_base[row,2])/pred_base[row,2]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('pt', pred_base[row,3], pred[row,3], 100*(pred[row,3]-pred_base[row,3])/pred_base[row,3]))
    print('%-8s %10.4f %10.4f %10.2f%%' % ('uber', 0, pred[row,4], 100*(pred[row,4]-0)/0))

Probabilities:
-------------------------------
driving: 0.5536
cycling: 0.004461
walk: 0.08491
pt: 0.007778
uber: 0.3493

Check for IIA (groupID = 459850):
------------------------------------------
             Base       +Uber       Change
driving      0.7270     0.4907     -32.51%
cycling      0.0123     0.0090     -26.82%
walk         0.2529     0.1851     -26.82%
pt           0.0078     0.0057     -26.82%
uber         0.0000     0.3096        inf%

Check for IIA (groupID = 352136):
------------------------------------------
             Base       +Uber       Change
driving      0.8501     0.5476     -35.58%
cycling      0.0087     0.0062     -28.70%
walk         0.1320     0.0941     -28.70%
pt           0.0092     0.0066     -28.70%
uber         0.0000     0.3455        inf%

Check for IIA (groupID = 297939):
------------------------------------------
             Base       +Uber       Change
driving      0.9685     0.5997     -38.08%
cycling      0.0027     0.0019     -30.12%


Trying more values of simga_nest. A function is used to replace the cell above to avoid repetition.  
When simga_nest=0, we are assuming that uber and driving have no correlations, which is the assumption of the standard logit. As the result, the probabilities are the same as the standard logit, and we observed a proportional substitution pattern again.

In [17]:
sigma_nest = 0
n_sample = 50
np.random.seed(1)
quasi_nested_logit_pred(sigma_nest, modelDict_nest, data_uber_nest, n_sample)

Probabilities:
-------------------------------
driving: 0.5572
cycling: 0.00416
walk: 0.0801
pt: 0.00703
uber: 0.3515

Check for IIA (groupID = 459850):
------------------------------------------
             Base       +Uber       Change
driving      0.7270     0.4984     -31.44%
cycling      0.0123     0.0084     -31.44%
walk         0.2529     0.1734     -31.44%
pt           0.0078     0.0054     -31.44%
uber         0.0000     0.3144        inf%

Check for IIA (groupID = 352136):
------------------------------------------
             Base       +Uber       Change
driving      0.8501     0.5533     -34.91%
cycling      0.0087     0.0057     -34.91%
walk         0.1320     0.0859     -34.91%
pt           0.0092     0.0060     -34.91%
uber         0.0000     0.3491        inf%

Check for IIA (groupID = 297939):
------------------------------------------
             Base       +Uber       Change
driving      0.9685     0.6012     -37.93%
cycling      0.0027     0.0017     -37.93%
wal

When sigma_nest increases to 0.9, we are assuming larger correlations between uber and driving, as a result, uber will draw even more people from driving and less from people from other modes, compared with sigma_nest=0.5.

In [18]:
sigma_nest = 0.9
n_sample = 50
np.random.seed(1)
quasi_nested_logit_pred(sigma_nest, modelDict_nest, data_uber_nest, n_sample)

Probabilities:
-------------------------------
driving: 0.5469
cycling: 0.005042
walk: 0.09364
pt: 0.009418
uber: 0.345

Check for IIA (groupID = 459850):
------------------------------------------
             Base       +Uber       Change
driving      0.7270     0.4770     -34.38%
cycling      0.0123     0.0100     -18.67%
walk         0.2529     0.2057     -18.67%
pt           0.0078     0.0064     -18.67%
uber         0.0000     0.3010        inf%

Check for IIA (groupID = 352136):
------------------------------------------
             Base       +Uber       Change
driving      0.8501     0.5364     -36.90%
cycling      0.0087     0.0073     -16.48%
walk         0.1320     0.1103     -16.48%
pt           0.0092     0.0077     -16.48%
uber         0.0000     0.3384        inf%

Check for IIA (groupID = 297939):
------------------------------------------
             Base       +Uber       Change
driving      0.9685     0.5962     -38.44%
cycling      0.0027     0.0023     -12.14%
w