In [253]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')
from urbansim.utils import misc
import pandana as pdna
from collections import OrderedDict
    
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import pandas as pd
import orca
import os; os.chdir('/home/juan/activitysynth/activitysynth/')
import warnings; warnings.simplefilter('ignore')
from matplotlib import pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from sklearn.neighbors import BallTree

import pylogit as pl                   # For choice model estimation
from pylogit import nested_logit as nl # For nested logit convenience funcs

# from scripts import datasources, models, variables, utils

In [2]:
os.getcwd()

'/home/juan/activitysynth/activitysynth'

In [10]:
from scripts import datasources, models, variables, utils

In [11]:
import urbansim_templates
urbansim_templates.__version__

'0.1.1'

In [12]:
input_file_format = 'csv'
input_data_dir = '/home/data/spring_2019/base/'

# estimate from base-year data
formattable_fname_dict = {
    'parcels': 'parcels.{0}',
    'buildings': 'buildings.{0}',
    'jobs': 'jobs.{0}',
    'establishments': 'establishments.{0}',
    'households': 'households.{0}',
    'persons': 'persons.{0}',
    'rentals': 'MTC_craigslist_listings_7-10-18.{0}',
    'units': 'units.{0}',
    'skims': 'mtc_skims.{0}',
    'beam_skims': '30.skims-smart-23April2019-baseline.csv.gz',
    'drive_nodes': 'bay_area_tertiary_strongly_nodes.{0}',
    'drive_edges': 'bay_area_tertiary_strongly_edges.{0}',
    'drive_access_vars': 'drive_net_vars.{0}',
    'walk_nodes': 'bayarea_walk_nodes.{0}',
    'walk_edges': 'bayarea_walk_edges.{0}',
    'walk_access_vars': 'walk_net_vars.{0}',
    'zones': 'zones.{0}',
    'zone_access_vars': 'zones_w_access_vars.{0}',
}

def format_fname_dict(formattable_fname_dict, format='csv'):
    formatted_dict = {
        k: v.format('csv')
        for k, v in formattable_fname_dict.items()}
    return formatted_dict

input_fnames = format_fname_dict(
            formattable_fname_dict, input_file_format)

In [13]:
orca.add_injectable('input_file_format', input_file_format)
orca.add_injectable('input_data_dir', input_data_dir)
orca.add_injectable('input_fnames', input_fnames)
orca.add_injectable('store', None)

In [15]:
# os.listdir("/home/data/fall_2018/HWtrips_032319.csv")

# Loading data

In [16]:
orca.run(['initialize_network_small']) 
orca.run([ 'initialize_network_walk']) 
orca.run(['impute_missing_skims']) 

Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.00 s
Total time to execute iteration 1 with iteration value None: 0.00 s
Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 0.00 s
Total time to execute iteration 1 with iteration value None: 0.00 s
Running step 'impute_missing_skims'
Time to execute step 'impute_missing_skims': 98.92 s
Total time to execute iteration 1 with iteration value None: 98.92 s


In [408]:
#Loading Data 
beam_skims = orca.get_table('beam_skims').to_frame()
reset_beam_skims = beam_skims.reset_index()

students = pd.read_csv('/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/students_with_school_id.csv')
schools = pd.read_csv('/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/schools.csv').rename({"parcel_id": "school_parcel_id"}, axis=1)
parcels = orca.get_table('parcels').to_frame()
household_chts = pd.read_csv("/home/data/fall_2018/CHTS_csv_format/data/Deliv_HH.csv")

## Preprocessing 

In [409]:
#Small netwrok 
nodessmall = pd.read_csv('/home/data/fall_2018/bay_area_tertiary_strongly_nodes.csv').set_index('osmid')
edgessmall = pd.read_csv('/home/data/fall_2018/bay_area_tertiary_strongly_edges.csv')
netsmall = pdna.Network(nodessmall.x, nodessmall.y, edgessmall.u,
                                edgessmall.v, edgessmall[['length']],
                                twoway=False)
netsmall.precompute(25000)

#Making sure x and y coordinates are type float
students['x_cor'] = students.HXCORD.astype(float)
students['y_cor'] = students.HYCORD.astype(float)

schools['Latitude'] = schools['Latitude'].astype(float)
schools['Longitude'] = schools['Longitude'].astype(float)

#Adding list of grades offered by each school
list_grades = []
for index, row in schools.loc[:,schools.columns.str.startswith("grade_")].iterrows():
    x = np.array(row)
    list_grades.append(x)
    
schools['list_grades'] = list_grades

#Filtering students by age
students = students[students.AGE <= 18]

In [410]:
#Some spatial functions 

def node_id_small(x, y, netsmall):
    """ Return the node ID given a pair of coordinates"""
    idssmall = netsmall.get_node_ids(x, y)
    return idssmall

def parcel_id(data):
    ''' Assigns a parcel ID given a x and y coordinate
    Input: DataFrame with x and y coordinates
    Output: Array-like Parcel_ID'''
    #Transforming Latitude/Longitude to radians
    parcels_rad = np.deg2rad(parcels[['y', 'x']])
    data_rad = np.deg2rad(data)
    
    tree = BallTree(parcels_rad, metric='haversine')
    
    dists, idxs = tree.query(data_rad, return_distance=True)
    
    return parcels.iloc[idxs[:,0]].index

def get_zone_id(parcel_id):
    '''gets the zone_id (TAZ) of school and home locations by the parcel id'''
    try: 
        parcel_id = list(parcels.loc[parcel_id,'zone_id'])

    except Exception as e:
        parcel_id = np.nan
        
    return parcel_id

In [411]:
# Adding spatial information to schools and students dataframes

#Node_ID
students['home_node_id'] = node_id_small(students.x_cor, students.y_cor, netsmall)
schools['school_node_id'] = node_id_small(schools.Longitude, schools.Latitude, netsmall)

#Parcel_ID
students['home_parcel_id'] = parcel_id(students[['y_cor', 'x_cor']])
schools['school_parcel_id'] = parcel_id(schools[['Latitude', 'Longitude']])

#Zone-ID
students['home_zone_id'] = get_zone_id(students['home_parcel_id'])
schools['school_zone_id'] = get_zone_id(schools['school_parcel_id'])

#Adding household income for each individual 
students = students.merge(household_chts.loc[:,['SAMPN', 'INCOM']], how = 'left', on = 'SAMPN')

#Creates 3 dummies for income

students['hh_inc_under_25k'] = (students['INCOM'] < 3).astype(int)
students['hh_inc_25_to_75k'] = ((students['INCOM'] > 2) & (students['INCOM'] < 6)).astype(int)
students['hh_inc_75_to_200k'] = ((students['INCOM'] > 5) & (students['INCOM'] < 9)).astype(int)

## Defining Choice set constrains

In [412]:
#Merging students and schools 
students_1 = students.merge(schools, how = 'left', on = 'school_id')

# Droping students with no assigned school
students_1 = students_1.dropna(subset=['school_id'])

In [413]:
#Define the node ID for each home and school location in students dataset
students_1['node_id_home'] = node_id_small(students_1.HXCORD, students_1.HYCORD, netsmall)
students_1['node_id_school'] = node_id_small(students_1.Longitude, students_1.Latitude, netsmall)

#Creating a df for public shcools only
df_public = students_1[students_1.school_id <= 2827].loc[:,['SAMPN', 'PERNO', 'school_id',
                                                        'AGE','HCITY','HYCORD','HXCORD',
                                                        'SNAME_lookup','SCITY_lookup', 
                                                        'node_id_home', 'node_id_school']]

#Creating a df for private shcools only
df_private = students_1[students_1.school_id > 2827].loc[:,['SAMPN', 'PERNO', 'school_id',
                                                        'AGE','HCITY','HYCORD','HXCORD',
                                                        'SNAME_lookup','SCITY_lookup', 
                                                        'node_id_home', 'node_id_school']]

In [414]:
#Setting public and private schools as POIs. 
netsmall.set_pois('public_school', 500000, 10000, 
                  schools[schools.type == 'public'].Longitude, 
                  schools[schools.type == 'public'].Latitude, )

netsmall.set_pois('private_school', 500000, 10000, 
                  schools[schools.type == 'private'].Longitude, 
                  schools[schools.type == 'private'].Latitude)

In [415]:
#Public schools. 95% of the time, the school is within the x's closest schools. Finding x
n =50

# n closest public schools per each node in netsmall
distance_matrix_public = netsmall.nearest_pois(200000, 
                                               'public_school', 
                                               num_pois=n,
                                               include_poi_ids=True)

# Selects POIS id's only 
public_nodes = distance_matrix_public.iloc[:,n:]

# Creates a list of n closest POIs for each node
list_values = []
for index, row in public_nodes.iterrows():
    x = np.array(row)
    x = x[~np.isnan(x)]
    list_values.append(x)

#Add created list to public nodes 
public_nodes['list_values'] = list_values


merge = df_public.merge(public_nodes.loc[:,['list_values']], how = 'left', left_on = 'node_id_home', right_index= True)

school_position = []
for index, row in merge.iterrows():
    school_position.append(np.isin(row['school_id']-1000,row['list_values'])==True)
    
merge['school_position'] = school_position

merge.school_position.mean()

0.9131504922644164

In [416]:
#Private schools. 95% of the time, the school is within the x's closest schools. Finding x
n =100

distance_matrix_private = netsmall.nearest_pois(100000, 
                                                'private_school', 
                                                num_pois=n, 
                                                include_poi_ids=True)

private_nodes = distance_matrix_private.iloc[:,n:]

list_values = []
for index, row in private_nodes.iterrows():
    x = np.array(row)
    x = x[~np.isnan(x)]
    list_values.append(x)
    
private_nodes['list_values'] = list_values


merge = df_private.merge(private_nodes.loc[:,['list_values']], how = 'left', left_on = 'node_id_home', right_index= True)

school_position = []
for index, row in merge.iterrows():
    school_position.append(np.isin(row['school_id']-1000,row['list_values'])==True)
    
merge['school_position'] = school_position

merge.school_position.mean()

0.8758465011286681

## Getting data_long table

In [417]:
def school_available(age, grades_offered):
    """ Checks if a school offers a grade according to the age range
    Input: Age: int in range (5-18)
           grades_offered: 13 element array
    Output: True if grade if offered acoording to age, false otherwise. 
     """
    index = age - 6
    
    if index < 0:
        index = 0

    if index < 12:
        result = (grades_offered[index] == 1) | (grades_offered[index + 1] == 1)
        
    elif index == 12:
        result = (grades_offered[index] == 1)
    
    return result

In [418]:
def school_choice_set(house_node_id, kid_age):
    """ Determines the school choice set given home node id and age of the student (4-18)
    Output: Pandas series with available school IDs"""
    
    public_id = distance_matrix_public.iloc[:,50:].loc[house_node_id] + 1000
    private_id = distance_matrix_private.iloc[:,100:].loc[house_node_id] + 1000
    schools_filter = pd.concat([public_id, private_id])
    schools_set = schools[schools.school_id.isin(schools_filter)]
    school_availability = [school_available(kid_age, x) for x in schools_set.list_grades]
    try:
        schools_available = schools_set[school_availability].school_id
        return schools_available
    except Exception as e:
        print(e)
        print("empty dataframe?")
    
    return np.nan

In [421]:
#Creating long data frame
stu = students[["AGE", 
                "home_node_id", 
                "home_zone_id", 
                "school_id", 
                'hh_inc_under_25k',
                'hh_inc_25_to_75k', 
                'hh_inc_75_to_200k',
                'HPrimaryCity']] 

dfs = []
for index, record in stu.iterrows():
    scs = school_choice_set(int(record['home_node_id']), int(record['AGE']))
    
    if pd.Series(record['school_id']).isin(scs)[0]: #Makes sure that the choseen school is in the choice set
        df = pd.DataFrame({'obs_id': index, 
                           'AGE': record['AGE'], 
                           'home_node_id' : record['home_node_id'],
                           'home_zone_id' : record['home_zone_id'], 
                           'chosen_school' : record['school_id'],
                           'school_choice_set': scs,
                           'hh_inc_under_25k': record['hh_inc_under_25k'],
                           'hh_inc_25_to_75k' : record['hh_inc_25_to_75k'],
                           'hh_inc_75_to_200k' : record['hh_inc_75_to_200k'],
                           'home_city': record['HPrimaryCity']})
                           
        dfs.append(df)
    

all_choices = pd.concat(dfs, axis=0)

#Creating the choice column
all_choices['choice'] = 1*(all_choices.chosen_school == all_choices.school_choice_set)
all_choices.head()

Unnamed: 0,obs_id,AGE,home_node_id,home_zone_id,chosen_school,school_choice_set,hh_inc_under_25k,hh_inc_25_to_75k,hh_inc_75_to_200k,home_city,choice
939,1,14,2089641167,311,3234.0,1939,0,0,1,REDWOOD CITY,0
943,1,14,2089641167,311,3234.0,1943,0,0,1,REDWOOD CITY,0
944,1,14,2089641167,311,3234.0,1944,0,0,1,REDWOOD CITY,0
950,1,14,2089641167,311,3234.0,1950,0,0,1,REDWOOD CITY,0
951,1,14,2089641167,311,3234.0,1951,0,0,1,REDWOOD CITY,0


In [402]:
schools.columns

Index(['CDSCode', 'School', 'District', 'County', 'Street', 'City', 'Zip',
       'State', 'Latitude', 'Longitude', 'type', 'grade_0', 'grade_1',
       'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7',
       'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12',
       'Kindergarten Enrollment', 'Grade 1 Enrollment', 'Grade 2 Enrollment',
       'Grade 3 Enrollment', 'Grade 4 Enrollment', 'Grade 5 Enrollment',
       'Grade 6 Enrollment', 'Grade 7 Enrollment', 'Grade 8 Enrollment',
       'Ungraded Elementary Enrollmnet', 'Grade 9 Enrollment',
       'Grade 10 Enrollment', 'Grade 11 Enrollment', 'Grade 12 Enrollment',
       'Ungraded Secondary Enrollment', 'Total Enrollment', 'sw_rank_2108',
       'sw_rank_2017', 'sw_rank_2016', 'ss_rank_2018_', 'ss_rank_2017',
       'ss_rank_2016', 'nodeID', 'school_parcel_id', 'school_id',
       'list_grades', 'school_node_id', 'school_zone_id'],
      dtype='object')

In [422]:
#Getting zone id for schools in the choice set
df = all_choices.merge(schools.loc[:,['school_id','school_zone_id','sw_rank_2108', 'City']], 
                       left_on = 'school_choice_set', 
                       right_on = 'school_id').sort_values('obs_id')

In [431]:
df[df.choice == 1]

df['home_city'] = df['home_city'].apply(lambda x: x.lower())
df['City'] = df['City'].apply(lambda x: x.lower())
df['same_city?'] = 1*(df['home_city'] == df['City'])

In [432]:
df

Unnamed: 0,obs_id,AGE,home_node_id,home_zone_id,chosen_school,school_choice_set,hh_inc_under_25k,hh_inc_25_to_75k,hh_inc_75_to_200k,home_city,choice,school_id,school_zone_id,sw_rank_2108,City,same_city?
0,1,14,2089641167,311,3234.0,1939,0,0,1,redwood city,0,1939,292,,san mateo,0
13181,1,14,2089641167,311,3234.0,3289,0,0,1,redwood city,0,3289,254,,san mateo,0
1524,1,14,2089641167,311,3234.0,2087,0,0,1,redwood city,0,2087,255,10.0,san mateo,0
9655,1,14,2089641167,311,3234.0,3272,0,0,1,redwood city,0,3272,212,,south san francisco,0
14671,1,14,2089641167,311,3234.0,3300,0,0,1,redwood city,0,3300,342,,portola valley,0
25363,1,14,2089641167,311,3234.0,3473,0,0,1,redwood city,0,3473,401,,mountain view,0
22643,1,14,2089641167,311,3234.0,3376,0,0,1,redwood city,0,3376,350,,palo alto,0
11520,1,14,2089641167,311,3234.0,3282,0,0,1,redwood city,0,3282,292,,san mateo,0
1579,1,14,2089641167,311,3234.0,2090,0,0,1,redwood city,0,2090,283,9.0,san mateo,0
8986,1,14,2089641167,311,3234.0,3270,0,0,1,redwood city,0,3270,340,,menlo park,0


In [433]:
#Getting beam generalized time and cost
data_long = df.merge(reset_beam_skims, 
                     how = 'left', 
                     left_on=['home_zone_id','school_zone_id'], 
                     right_on = ['from_zone_id','to_zone_id'])

In [434]:
data_long

#Imputing school grades
avg_grade = data_long.sw_rank_2108.mean()
data_long['rank'] = data_long.sw_rank_2108.fillna(avg_grade)
data_long

Unnamed: 0,obs_id,AGE,home_node_id,home_zone_id,chosen_school,school_choice_set,hh_inc_under_25k,hh_inc_25_to_75k,hh_inc_75_to_200k,home_city,...,gen_cost_WALK_TRANSIT,gen_tt_BIKE,gen_tt_CAR,gen_tt_DRIVE_TRANSIT,gen_tt_RIDE_HAIL,gen_tt_RIDE_HAIL_POOLED,gen_tt_RIDE_HAIL_TRANSIT,gen_tt_WALK,gen_tt_WALK_TRANSIT,rank
0,1,14,2089641167,311,3234.0,1939,0,0,1,redwood city,...,577.103546,42.142388,9.053736,14.992984,16.233625,12.613691,14.317352,177.719282,602.802226,6.72255
1,1,14,2089641167,311,3234.0,3289,0,0,1,redwood city,...,622.132326,45.430568,9.760158,16.162818,17.500262,13.597881,15.434471,191.585914,649.836158,6.72255
2,1,14,2089641167,311,3234.0,2087,0,0,1,redwood city,...,524.723129,38.317362,8.231980,13.632155,14.760191,11.468818,13.017847,161.588710,548.089285,10.00000
3,1,14,2089641167,311,3234.0,3272,0,0,1,redwood city,...,1311.348345,95.759853,20.572741,34.068452,36.887553,28.662002,32.533219,403.830280,1369.743275,6.72255
4,1,14,2089641167,311,3234.0,3300,0,0,1,redwood city,...,764.570303,55.831953,11.994759,19.863316,21.506968,16.711132,18.968212,235.449750,798.616962,6.72255
5,1,14,2089641167,311,3234.0,3473,0,0,1,redwood city,...,1374.756218,100.390147,21.567499,35.715770,38.671184,30.047902,34.106304,423.356761,1435.974730,6.72255
6,1,14,2089641167,311,3234.0,3376,0,0,1,redwood city,...,1132.152180,82.674239,17.761470,29.412987,31.846857,24.745331,28.087545,348.646744,1182.567425,6.72255
7,1,14,2089641167,311,3234.0,3282,0,0,1,redwood city,...,577.103546,42.142388,9.053736,14.992984,16.233625,12.613691,14.317352,177.719282,602.802226,6.72255
8,1,14,2089641167,311,3234.0,2090,0,0,1,redwood city,...,415.367521,30.331782,6.516383,10.791128,11.684074,9.078644,10.304846,127.912604,433.864023,9.00000
9,1,14,2089641167,311,3234.0,3270,0,0,1,redwood city,...,797.652672,58.247759,12.513763,20.722786,22.437558,17.434210,19.788952,245.637479,833.172504,6.72255


In [239]:
# all_choices_costs = pd.merge(all_choices, reset_beam_skims,  how='left', left_on=['school_zone_id','home_zone_id'], right_on = ['to_zone_id','from_zone_id'])
# all_choices_costs = all_choices_costs.set_index("node_id_home")
# all_choices_costs.head()
# all_choices_costs.to_csv("all_choices_costs.csv")

## Model estimation

In [49]:
import pylogit as pl 
import math 
from collections import OrderedDict 

In [88]:
data_long = pd.DataFrame({'household_id': [1,1,1,2,2,2,3,3], 
                          'school_id': [1,2,3,6,3,8,7,8],
                          'choice': [1,0,0,1,0,0,0,1],
                          'travel_distance_car': [2,4,6,4,5,6,8,9]})
data_long

Unnamed: 0,household_id,school_id,choice,travel_distance_car
0,1,1,1,2
1,1,2,0,4
2,1,3,0,6
3,2,6,1,4
4,2,3,0,5
5,2,8,0,6
6,3,7,0,8
7,3,8,1,9


In [89]:
# # Specify the nesting values
# nest_membership = OrderedDict()
# nest_membership["Ride alone"] = [1,2,3]
# nest_membership["Shared option"] = [6,7,8]

# def logit(x):
#     """
#     Parameters
#     ----------
#     x : int, float, or 1D ndarray.
#         If an array, all elements should be ints or floats. All
#         elements should be between zero and one, exclusive of 1.0.

#     Returns
#     -------
#     The logit of x:  `np.log(x / (1.0 - x))`.
#     """
#     return np.log(x/(1.0 - x))

In [147]:
# basic_specification = OrderedDict()
# basic_names = OrderedDict()

# basic_specification["travel_distance_car"] = [[1,2,3,6,7,8]]
# basic_names["travel_distance_car"] = ['Travel Time']

In [148]:
# # Estimate the nested multinomial logit model (MNL)
# SLCM = pl.create_choice_model(data = data_long,
#                                         alt_id_col = 'school_id',
#                                         obs_id_col = 'household_id',
#                                         choice_col = 'choice',
#                                         specification = basic_specification,
#                                         model_type="Nested Logit",
#                                         names=basic_names,
#                                         nest_spec=nest_membership)

# # Specify the initial values and method for the optimization.
# SLCM.fit_mle(np.zeros(3),constrained_pos=[0])

# type(SLCM.get_statsmodels_summary())

In [149]:
# SLCM.get_statsmodels_summary().add_table_params

In [382]:
data_long.columns

Index(['obs_id', 'AGE', 'home_node_id', 'home_zone_id', 'chosen_school',
       'school_choice_set', 'hh_inc_under_25k', 'hh_inc_25_to_75k',
       'hh_inc_75_to_200k', 'choice', 'school_id', 'school_zone_id',
       'sw_rank_2108', 'from_zone_id', 'to_zone_id', 'dist', 'gen_cost_BIKE',
       'gen_cost_CAR', 'gen_cost_DRIVE_TRANSIT', 'gen_cost_RIDE_HAIL',
       'gen_cost_RIDE_HAIL_POOLED', 'gen_cost_RIDE_HAIL_TRANSIT',
       'gen_cost_WALK', 'gen_cost_WALK_TRANSIT', 'gen_tt_BIKE', 'gen_tt_CAR',
       'gen_tt_DRIVE_TRANSIT', 'gen_tt_RIDE_HAIL', 'gen_tt_RIDE_HAIL_POOLED',
       'gen_tt_RIDE_HAIL_TRANSIT', 'gen_tt_WALK', 'gen_tt_WALK_TRANSIT',
       'rank', 'np.log1p(gen_tt_CAR)', 'gen_cost_CAR:hh_inc_under_25k',
       'gen_cost_CAR:hh_inc_25_to_75k', 'gen_cost_CAR:hh_inc_75_to_200k'],
      dtype='object')

In [435]:
data_long.columns

Index(['obs_id', 'AGE', 'home_node_id', 'home_zone_id', 'chosen_school',
       'school_choice_set', 'hh_inc_under_25k', 'hh_inc_25_to_75k',
       'hh_inc_75_to_200k', 'home_city', 'choice', 'school_id',
       'school_zone_id', 'sw_rank_2108', 'City', 'same_city?', 'from_zone_id',
       'to_zone_id', 'dist', 'gen_cost_BIKE', 'gen_cost_CAR',
       'gen_cost_DRIVE_TRANSIT', 'gen_cost_RIDE_HAIL',
       'gen_cost_RIDE_HAIL_POOLED', 'gen_cost_RIDE_HAIL_TRANSIT',
       'gen_cost_WALK', 'gen_cost_WALK_TRANSIT', 'gen_tt_BIKE', 'gen_tt_CAR',
       'gen_tt_DRIVE_TRANSIT', 'gen_tt_RIDE_HAIL', 'gen_tt_RIDE_HAIL_POOLED',
       'gen_tt_RIDE_HAIL_TRANSIT', 'gen_tt_WALK', 'gen_tt_WALK_TRANSIT',
       'rank'],
      dtype='object')

In [450]:
#Orca step
# data_long = pd.DataFrame({'household_id': [1,1,1,2,2,2,3,3], 
#                           'school_id': [1,2,3,6,3,8,7,8],
#                           'choice': [1,0,0,1,0,0,0,1],
#                           'travel_distance_car': [2,4,6,4,5,6,8,9]})

list_1 = [1459, 1685, 1746, 1749, 1750, 2006, 2063, 2327, 2662, 2679, 3378, 3379, 3381, 3432, 3454]


@orca.injectable(autocall=False)
def SLCM(data_long):
    # Specify the nested structure
    nest_membership = OrderedDict()
    nest_membership["Public School"] = list(set(range(1000,2828,1)) -set([1459, 1685, 1746, 1749, 1750, 2006, 2063, 2327, 2662, 2679]))
    nest_membership["Private School"] = list(set(range(2828,3547,1)) -set([3378, 3379, 3381, 3432, 3454]))
    
    #Model especification
    basic_specification = OrderedDict()
    basic_names = OrderedDict()
        
    basic_specification["gen_tt_RIDE_HAIL"] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names["gen_tt_RIDE_HAIL"] = ['gen_tt_RIDE_HAIL']
    
    basic_specification["gen_tt_WALK"] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names["gen_tt_WALK"] = ['gen_tt_WALK']
        
    basic_specification["gen_tt_WALK_TRANSIT"] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names["gen_tt_WALK_TRANSIT"] = ['gen_tt_WALK_TRANSIT']
    
    # Travel time
    data_long["np.log1p(gen_tt_CAR)"] = np.log1p(data_long['gen_tt_CAR'])
    basic_specification["np.log1p(gen_tt_CAR)"] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names["np.log1p(gen_tt_CAR)"] = ['Log Travel Time']
    
     # Travel time
    basic_specification["gen_tt_WALK"] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names["gen_tt_WALK"] = ['Travel Time walk']
    
    #Distance
    basic_specification["dist"] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names["dist"] = ['Distance']
    
    #Rank
    basic_specification["rank"] = [list(set(range(1000,2828,1)) -set([1459, 1685, 1746, 1749, 1750, 2006, 2063, 2327, 2662, 2679]))]
    basic_names["rank"] = ['School Rank'] 
    
    #Income
    data_long['gen_cost_CAR:hh_inc_under_25k'] = data_long['gen_cost_CAR'] * data_long['hh_inc_under_25k']
    data_long['gen_cost_CAR:hh_inc_25_to_75k'] = data_long['gen_cost_CAR'] * data_long['hh_inc_25_to_75k']
    data_long['gen_cost_CAR:hh_inc_75_to_200k'] = data_long['gen_cost_CAR'] * data_long['hh_inc_75_to_200k']
    
    basic_specification['gen_cost_CAR:hh_inc_under_25k'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['gen_cost_CAR:hh_inc_under_25k'] = ['gen_cost_CAR:hh_inc_under_25k'] 
    
    basic_specification['gen_cost_CAR:hh_inc_25_to_75k'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['gen_cost_CAR:hh_inc_25_to_75k'] = ['gen_cost_CAR:hh_inc_25_to_75k'] 
    
    basic_specification['gen_cost_CAR:hh_inc_75_to_200k'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['gen_cost_CAR:hh_inc_75_to_200k'] = ['gen_cost_CAR:hh_inc_75_to_200k'] 
    
    #SameCity
    basic_specification['same_city?'] = [list(set(range(1000,2828,1)) -set([1459, 1685, 1746, 1749, 1750, 2006, 2063, 2327, 2662, 2679]))]
    basic_names['same_city?'] = ['same_city?']  
    
    #Interaction terms
    data_long['distance:np.log1p(gen_tt_CAR)'] = data_long['dist'] * data_long["np.log1p(gen_tt_CAR)"]
    basic_specification['distance:np.log1p(gen_tt_CAR)'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['distance:np.log1p(gen_tt_CAR)'] = ['distance:np.log1p(gen_tt_CAR)']
    
    #School rank and income
    data_long['rank:hh_inc_under_25k'] = data_long['rank'] * data_long['hh_inc_under_25k']
    data_long['rank:hh_inc_25_to_75k'] = data_long['rank'] * data_long['hh_inc_25_to_75k']
    data_long['rank:hh_inc_75_to_200k'] = data_long['rank'] * data_long['hh_inc_75_to_200k']   
    
    basic_specification['rank:hh_inc_under_25k'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['rank:hh_inc_under_25k'] = ['rank:hh_inc_under_25k'] 
    
    basic_specification['rank:hh_inc_25_to_75k'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['rank:hh_inc_25_to_75k'] = ['rank:hh_inc_25_to_75k'] 
    
    basic_specification['rank:hh_inc_75_to_200k'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['rank:hh_inc_75_to_200k'] = ['rank:hh_inc_75_to_200k']   
    
    #Same city and distance
    data_long['dist:sameCity'] = data_long['dist'] * data_long['same_city?']
    basic_specification['dist:sameCity'] = [list(set(range(1000,3547,1)) - set(list_1)) ]
    basic_names['dist:sameCity'] = ['dist:sameCity']  
    
    

    # Estimate the nested multinomial logit model (MNL)
    SLCM = pl.create_choice_model(data = data_long,
                                        alt_id_col = 'school_choice_set',
                                        obs_id_col = 'obs_id',
                                        choice_col = 'choice',
                                        specification = basic_specification,
                                        model_type="Nested Logit",
                                        names=basic_names,
                                        nest_spec=nest_membership)

    # Specify the initial values and method for the optimization.
    SLCM.fit_mle(np.zeros(17),constrained_pos=[0])
        
    #Resutls
    return SLCM

In [451]:
orca.get_injectable('SLCM')(data_long).get_statsmodels_summary().add_table_params

Log-likelihood at zero: -11,789.9798
Initial Log-likelihood: -11,789.9798
Estimation Time for Point Estimation: 5.91 minutes.
Final log-likelihood: -7,261.7009


<bound method Summary.add_table_params of <class 'statsmodels.iolib.summary.Summary'>
"""
                    Nested Logit Model Regression Results                     
Dep. Variable:                 choice   No. Observations:                2,875
Model:             Nested Logit Model   Df Residuals:                    2,858
Method:                           MLE   Df Model:                           17
Date:                Mon, 24 Jun 2019   Pseudo R-squ.:                   0.384
Time:                        21:28:14   Pseudo R-bar-squ.:               0.383
AIC:                       14,557.402   Log-Likelihood:             -7,261.701
BIC:                       14,658.787   LL-Null:                   -11,789.980
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Public School                           0        nan        nan        nan      

In [377]:
# -7,965.2369 Wiht np.log1p
# -8,129.6396 without np.log1p
# -7,947.6792 with np.log1p in car cost
# -7,935.404
# -7,640.791 Adding Same city Same city For all public and private
# -7,513.121 Same city for only public schools
# -7,273.447 Interaction terms (distance:tt_car) (rank:Income)
# -7,261.701 Intercation (dist:SameCity)

In [443]:
household_chts.columns

Index(['SAMPN', 'RECMODE', 'RETMODE', 'INCEN', 'ILANG', 'CTFIP', 'AREA',
       'STRATA', 'STYPE', 'CEC', 'GTYPE', 'GFLAG', 'RIBUS', 'HHVEH', 'HHBIC',
       'VEHNEW', 'BUYER1', 'BUYER2', 'BUYER3', 'BUYER4', 'BUYER5', 'BUYER6',
       'BUYER7', 'BUYER8', 'RESTY', 'O_RESTY', 'OWN', 'O_OWN', 'TEN',
       'PREVCITY', 'PREVSTAT', 'PREVZIP', 'PHLNS', 'INCOM', 'HHSIZ',
       'NONRELAT', 'HHEMP', 'HHSTU', 'HHLIC', 'RECDate', 'ASSN', 'DOW',
       'HTRIPS', 'HCITY', 'HSTAT', 'HZIP', 'HXCORD', 'HYCORD', 'HHNOV1',
       'HHNOV2', 'HHNOV3', 'HHNOV4', 'HHNOV5', 'HHNOV6', 'HHNOV7', 'HHNOV8',
       'VEHOP', 'VEHPR', 'VEDTE', 'FUTUR', 'CMPLD', 'LDPER1', 'LDPER2',
       'LDPER3', 'LDPER4', 'LDPER5', 'LDPER6', 'LDPER7', 'LDPER8', 'LDTRP',
       'LDFlag', 'HPFlag', 'HH_Complete', 'GPS_Complete', 'HCTRACT',
       'HPrimaryCity', 'HSTFIP', 'MTC_FInalFlag', 'HHWGT', 'EXPHHWGT'],
      dtype='object')