# SCHOOL LOCATION CHOICE MODEL (K-12)

## General description:

Nested logit specification. The nests are public and private schools.

Input files: Beam-skim generalized time and cost, 
             School data set, 
             Students K-12 with school ID.

In [92]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')
from urbansim.utils import misc
import pandana as pdna
from collections import OrderedDict
    
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import pandas as pd
import orca
import os; os.chdir('/home/juan/activitysynth/activitysynth/')
import warnings; warnings.simplefilter('ignore')
from matplotlib import pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from sklearn.neighbors import BallTree

import pylogit as pl                   # For choice model estimation
from pylogit import nested_logit as nl # For nested logit convenience funcs
import math 
from collections import OrderedDict 

import pickle
import dill

import urbansim_templates

# from scripts import datasources, models, variables, utils

In [93]:
from scripts import datasources, models, variables, utils

## Loading input files

In [12]:
# File format (extension) and directory of the base-year input tables.
input_file_format = 'csv'
input_data_dir = '/home/data/spring_2019/base/'

# estimate from base-year data
# Maps each table name to a file-name template; the `{0}` placeholder is
# filled with the file extension by `format_fname_dict`.
formattable_fname_dict = {
    'parcels': 'parcels.{0}',
    'buildings': 'buildings.{0}',
    'jobs': 'jobs.{0}',
    'establishments': 'establishments.{0}',
    'households': 'households.{0}',
    'persons': 'persons.{0}',
    'rentals': 'MTC_craigslist_listings_7-10-18.{0}',
    'units': 'units.{0}',
    'skims': 'mtc_skims.{0}',
    # No placeholder here: this file name is used as-is for every format.
    'beam_skims': '30.skims-smart-23April2019-baseline.csv.gz',
    'drive_nodes': 'drive_nodes.{0}',
    'drive_edges': 'drive_edges.{0}',
    'drive_access_vars': 'drive_net_vars.{0}',
    'walk_nodes': 'walk_nodes.{0}',
    'walk_edges': 'walk_edges.{0}',
    'walk_access_vars': 'walk_net_vars.{0}',
    'zones': 'zones.{0}',
    'zone_access_vars': 'zones_w_access_vars.{0}',
}

def format_fname_dict(formattable_fname_dict, format='csv'):
    """Fill the file-extension placeholder of every file-name template.

    Input: formattable_fname_dict: dict mapping a table name to a file-name
               template; ``{0}`` in a template is replaced by ``format``.
               Templates without a placeholder are returned unchanged.
           format: extension to substitute (default 'csv'). The parameter
               name is kept for backward compatibility even though it
               shadows the builtin.
    Output: new dict with the same keys and the formatted file names.
    """
    # Substitute the requested extension. The previous version hard-coded
    # 'csv' here, so the `format` argument was silently ignored.
    return {
        table: template.format(format)
        for table, template in formattable_fname_dict.items()
    }

# Resolve every file-name template for the configured input format.
input_fnames = format_fname_dict(formattable_fname_dict,
                                 format=input_file_format)

In [13]:
# Register the input configuration as orca injectables so that code resolving
# these names through orca can find them.
orca.add_injectable('input_file_format', input_file_format)
orca.add_injectable('input_data_dir', input_data_dir)
orca.add_injectable('input_fnames', input_fnames)
# No data store is used here; the 'store' injectable is registered as None.
orca.add_injectable('store', None)

In [14]:
# os.listdir("/home/data/fall_2018/HWtrips_032319.csv")

# Loading data

In [15]:
# Run the three orca steps one at a time, each in its own `orca.run` call.
# NOTE(review): the steps are presumably registered by the `scripts` imports
# above -- confirm.
orca.run(['initialize_network_small']) 
orca.run([ 'initialize_network_walk']) 
orca.run(['impute_missing_skims']) 

Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.00 s
Total time to execute iteration 1 with iteration value None: 0.00 s
Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 0.00 s
Total time to execute iteration 1 with iteration value None: 0.00 s
Running step 'impute_missing_skims'
Time to execute step 'impute_missing_skims': 141.29 s
Total time to execute iteration 1 with iteration value None: 141.30 s


In [16]:
#Loading Data 
# BEAM skims come from the orca table; `reset_beam_skims` moves its index
# into regular columns so it can be merged on later.
beam_skims = orca.get_table('beam_skims').to_frame()
reset_beam_skims = beam_skims.reset_index()

# Students (with their school id) and schools are read from CSV; the schools'
# `parcel_id` column is renamed to `school_parcel_id`.
students = pd.read_csv('/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/students_with_school_id.csv')
schools = pd.read_csv('/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/schools.csv').rename({"parcel_id": "school_parcel_id"}, axis=1)
parcels = orca.get_table('parcels').to_frame()
# CHTS household table; its SAMPN and INCOM columns are used later.
household_chts = pd.read_csv("/home/data/fall_2018/CHTS_csv_format/data/Deliv_HH.csv")

## Preprocessing 

In [17]:
# Small network: build the pandana network from the tertiary-road node and
# edge files and precompute it with a 25000 horizon.
nodessmall = pd.read_csv('/home/data/fall_2018/bay_area_tertiary_strongly_nodes.csv').set_index('osmid')
edgessmall = pd.read_csv('/home/data/fall_2018/bay_area_tertiary_strongly_edges.csv')
netsmall = pdna.Network(nodessmall.x, nodessmall.y, edgessmall.u,
                        edgessmall.v, edgessmall[['length']],
                        twoway=False)
netsmall.precompute(25000)

# Make sure x and y coordinates are of type float
students['x_cor'] = students.HXCORD.astype(float)
students['y_cor'] = students.HYCORD.astype(float)

schools['Latitude'] = schools['Latitude'].astype(float)
schools['Longitude'] = schools['Longitude'].astype(float)

# One array of grade flags per school, taken from the grade_* columns in
# column order.
grade_columns = schools.loc[:, schools.columns.str.startswith("grade_")]
list_grades = [np.array(flags) for flags in grade_columns.values]

schools['list_grades'] = list_grades

# Keep students aged 18 or younger. Take an explicit copy: columns are added
# to `students` afterwards, and writing to a filtered slice triggers pandas'
# SettingWithCopyWarning (hidden here because warnings are silenced).
students = students[students.AGE <= 18].copy()

In [18]:
#Some spatial functions 

def node_id_small(x, y, netsmall):
    """Look up the network node ids for the given coordinates.

    Delegates to ``netsmall.get_node_ids(x, y)`` and returns its result
    unchanged.
    """
    return netsmall.get_node_ids(x, y)

def parcel_id(data):
    ''' Assigns the nearest parcel to each coordinate pair.
    Input: DataFrame with two columns in degrees, ordered like
           parcels[['y', 'x']] (the callers pass latitude first)
    Output: Index of `parcels` holding the id of the closest parcel for
            each input row'''
    # Both point sets are converted to radians for the haversine metric.
    tree = BallTree(np.deg2rad(parcels[['y', 'x']]), metric='haversine')

    # Only the first (nearest) neighbour is used; the distances are ignored.
    _, nearest = tree.query(np.deg2rad(data), return_distance=True)

    return parcels.iloc[nearest[:, 0]].index

def get_zone_id(parcel_id):
    '''Looks up the zone_id (TAZ) of school and home locations by parcel id.
    Input: parcel id labels present in the index of `parcels`
    Output: list of zone ids, or np.nan if the lookup fails for any reason'''
    # NOTE: the parameter name shadows the `parcel_id` function inside this
    # body; it is kept so callers can keep passing it by keyword.
    try:
        return list(parcels.loc[parcel_id, 'zone_id'])
    except Exception:
        # Best effort: any failure in the lookup yields a single NaN.
        return np.nan

In [19]:
# Attach network node, parcel and zone ids to students (home) and schools.

# Nearest network node
students['home_node_id'] = node_id_small(students['x_cor'], students['y_cor'], netsmall)
schools['school_node_id'] = node_id_small(schools['Longitude'], schools['Latitude'], netsmall)

# Nearest parcel (coordinates passed latitude first)
students['home_parcel_id'] = parcel_id(students[['y_cor', 'x_cor']])
schools['school_parcel_id'] = parcel_id(schools[['Latitude', 'Longitude']])

# Zone of that parcel
students['home_zone_id'] = get_zone_id(students['home_parcel_id'])
schools['school_zone_id'] = get_zone_id(schools['school_parcel_id'])

# Household income code (INCOM) for each student, joined on SAMPN
hh_income = household_chts[['SAMPN', 'INCOM']]
students = students.merge(hh_income, how='left', on='SAMPN')

# Three income dummies from the INCOM code: < 3, 3 to 5, and 6 to 8.
# Missing codes and codes of 9 or more get 0 in all three.
income_code = students['INCOM']
students['hh_inc_under_25k'] = income_code.lt(3).astype(int)
students['hh_inc_25_to_75k'] = (income_code.gt(2) & income_code.lt(6)).astype(int)
students['hh_inc_75_to_200k'] = (income_code.gt(5) & income_code.lt(9)).astype(int)

## Defining choice set constraints

In [20]:
# Attach the school attributes to every student (left join on school_id),
# then drop the students that have no assigned school.
students_1 = (
    students
    .merge(schools, how='left', on='school_id')
    .dropna(subset=['school_id'])
)

In [21]:
# Network node of each student's home and of the attended school
students_1['node_id_home'] = node_id_small(students_1['HXCORD'], students_1['HYCORD'], netsmall)
students_1['node_id_school'] = node_id_small(students_1['Longitude'], students_1['Latitude'], netsmall)

# Columns kept in the public and private student tables
keep_cols = ['SAMPN', 'PERNO', 'school_id',
             'AGE', 'HCITY', 'HYCORD', 'HXCORD',
             'SNAME_lookup', 'SCITY_lookup',
             'node_id_home', 'node_id_school']

# Students attending public schools (school_id up to 2827)
df_public = students_1.loc[students_1.school_id <= 2827, keep_cols]

# Students attending private schools (school_id above 2827)
df_private = students_1.loc[students_1.school_id > 2827, keep_cols]

In [22]:
# Register public and private schools as two POI categories on the network,
# both with the same limits (500000 and 10000).
for poi_category, school_type in (('public_school', 'public'),
                                  ('private_school', 'private')):
    schools_of_type = schools[schools.type == school_type]
    netsmall.set_pois(poi_category, 500000, 10000,
                      schools_of_type.Longitude,
                      schools_of_type.Latitude)

In [23]:
# Public schools. Goal: find the number n of nearest public schools such that
# the attended school is among them about 95% of the time.
n = 50

# n closest public schools for each node in netsmall (distances and POI ids)
distance_matrix_public = netsmall.nearest_pois(200000,
                                               'public_school',
                                               num_pois=n,
                                               include_poi_ids=True)

# Keep only the POI-id columns (everything after the first n columns).
# Explicit copy: a column is added below, and assigning to a slice of
# `distance_matrix_public` is a chained assignment (SettingWithCopy) that is
# only hidden because warnings are silenced.
public_nodes = distance_matrix_public.iloc[:, n:].copy()

# For each node, the array of POI ids with the missing entries removed
list_values = [ids[~np.isnan(ids)] for ids in public_nodes.values]

public_nodes['list_values'] = list_values

# Attach the candidate list of the home node to every public-school student
merge = df_public.merge(public_nodes.loc[:, ['list_values']], how='left',
                        left_on='node_id_home', right_index=True)

# True when the attended school is among the candidates.
# NOTE(review): assumes POI ids equal school_id - 1000 -- confirm.
school_position = [
    bool(np.isin(row['school_id'] - 1000, row['list_values']))
    for _, row in merge.iterrows()
]

merge['school_position'] = school_position

# Share of students whose school is within the n nearest public schools
merge.school_position.mean()

0.9131504922644164

In [24]:
# Private schools. Goal: find the number n of nearest private schools such
# that the attended school is among them about 95% of the time.
n = 100

# n closest private schools for each node in netsmall (distances and POI ids)
distance_matrix_private = netsmall.nearest_pois(100000,
                                                'private_school',
                                                num_pois=n,
                                                include_poi_ids=True)

# Keep only the POI-id columns (everything after the first n columns).
# Explicit copy: a column is added below, and assigning to a slice of
# `distance_matrix_private` is a chained assignment (SettingWithCopy) that is
# only hidden because warnings are silenced.
private_nodes = distance_matrix_private.iloc[:, n:].copy()

# For each node, the array of POI ids with the missing entries removed
list_values = [ids[~np.isnan(ids)] for ids in private_nodes.values]

private_nodes['list_values'] = list_values

# Attach the candidate list of the home node to every private-school student
merge = df_private.merge(private_nodes.loc[:, ['list_values']], how='left',
                         left_on='node_id_home', right_index=True)

# True when the attended school is among the candidates.
# NOTE(review): assumes POI ids equal school_id - 1000 -- confirm.
school_position = [
    bool(np.isin(row['school_id'] - 1000, row['list_values']))
    for _, row in merge.iterrows()
]

merge['school_position'] = school_position

# Share of students whose school is within the n nearest private schools
merge.school_position.mean()

0.8758465011286681

## Getting data_long table

In [25]:
def school_available(age, grades_offered):
    """ Checks if a school offers a grade compatible with a student's age.

    Age ``a`` maps to position ``a - 6`` of ``grades_offered``; ages below 6
    are clamped to position 0. The school is available when it offers the
    grade at that position or at the next one. At the last position (12)
    only that position is checked.

    Input: age: int
           grades_offered: 13 element array of 0/1 flags
    Output: True if a matching grade is offered, False otherwise.
            Ages above 18 (position > 12) have no matching grade and
            return False.
     """
    index = max(age - 6, 0)

    if index > 12:
        # Previously this case matched neither branch and raised
        # UnboundLocalError on the return statement.
        return False

    if index == 12:
        return grades_offered[index] == 1

    return (grades_offered[index] == 1) | (grades_offered[index + 1] == 1)

In [26]:
def school_choice_set(house_node_id, kid_age):
    """ Determines the school choice set given home node id and age of the student (4-18)

    Candidates are the POI ids stored for the home node in
    `distance_matrix_public` (columns from position 50 on) and
    `distance_matrix_private` (columns from position 100 on), shifted by
    1000 to obtain school ids, and then filtered with `school_available`.

    Output: Pandas series with available school IDs. The series is empty
            when no candidate school matches; previously that case printed
            a message and returned np.nan, which made the caller's
            `.isin(...)` check fail.
    """
    # NOTE: the offsets 50 and 100 must match the `n` used to build the two
    # distance matrices.
    public_id = distance_matrix_public.iloc[:, 50:].loc[house_node_id] + 1000
    private_id = distance_matrix_private.iloc[:, 100:].loc[house_node_id] + 1000
    candidate_ids = pd.concat([public_id, private_id])
    schools_set = schools[schools.school_id.isin(candidate_ids)]

    # Explicit boolean array: an empty Python list used as an indexer selects
    # zero columns instead of zero rows.
    offers_grade = np.array(
        [school_available(kid_age, grades) for grades in schools_set.list_grades],
        dtype=bool)

    return schools_set.loc[offers_grade, 'school_id']

In [27]:
# Build the long table: one row per (student, school in the choice set)
stu = students[["AGE",
                "home_node_id",
                "home_zone_id",
                "school_id",
                'hh_inc_under_25k',
                'hh_inc_25_to_75k',
                'hh_inc_75_to_200k',
                'HPrimaryCity']]

dfs = []
for index, record in stu.iterrows():
    scs = school_choice_set(int(record['home_node_id']), int(record['AGE']))

    # Skip students whose chosen school is not in their choice set
    chosen_in_set = pd.Series(record['school_id']).isin(scs)[0]
    if not chosen_in_set:
        continue

    # The scalar student attributes are repeated for every school in `scs`
    df = pd.DataFrame({'obs_id': index,
                       'AGE': record['AGE'],
                       'home_node_id': record['home_node_id'],
                       'home_zone_id': record['home_zone_id'],
                       'chosen_school': record['school_id'],
                       'school_choice_set': scs,
                       'hh_inc_under_25k': record['hh_inc_under_25k'],
                       'hh_inc_25_to_75k': record['hh_inc_25_to_75k'],
                       'hh_inc_75_to_200k': record['hh_inc_75_to_200k'],
                       'home_city': record['HPrimaryCity']})
    dfs.append(df)


all_choices = pd.concat(dfs, axis=0)

# Choice indicator: 1 on the row of the chosen school, 0 elsewhere
all_choices['choice'] = 1*(all_choices.chosen_school == all_choices.school_choice_set)
all_choices.head()

Unnamed: 0,obs_id,AGE,home_node_id,home_zone_id,chosen_school,school_choice_set,hh_inc_under_25k,hh_inc_25_to_75k,hh_inc_75_to_200k,home_city,choice
939,1,14,2089641167,311,3234.0,1939,0,0,1,REDWOOD CITY,0
943,1,14,2089641167,311,3234.0,1943,0,0,1,REDWOOD CITY,0
944,1,14,2089641167,311,3234.0,1944,0,0,1,REDWOOD CITY,0
950,1,14,2089641167,311,3234.0,1950,0,0,1,REDWOOD CITY,0
951,1,14,2089641167,311,3234.0,1951,0,0,1,REDWOOD CITY,0


In [28]:
schools.columns

Index(['CDSCode', 'School', 'District', 'County', 'Street', 'City', 'Zip',
       'State', 'Latitude', 'Longitude', 'type', 'grade_0', 'grade_1',
       'grade_2', 'grade_3', 'grade_4', 'grade_5', 'grade_6', 'grade_7',
       'grade_8', 'grade_9', 'grade_10', 'grade_11', 'grade_12',
       'Kindergarten Enrollment', 'Grade 1 Enrollment', 'Grade 2 Enrollment',
       'Grade 3 Enrollment', 'Grade 4 Enrollment', 'Grade 5 Enrollment',
       'Grade 6 Enrollment', 'Grade 7 Enrollment', 'Grade 8 Enrollment',
       'Ungraded Elementary Enrollmnet', 'Grade 9 Enrollment',
       'Grade 10 Enrollment', 'Grade 11 Enrollment', 'Grade 12 Enrollment',
       'Ungraded Secondary Enrollment', 'Total Enrollment', 'sw_rank_2108',
       'sw_rank_2017', 'sw_rank_2016', 'ss_rank_2018_', 'ss_rank_2017',
       'ss_rank_2016', 'nodeID', 'school_parcel_id', 'school_id',
       'list_grades', 'school_node_id', 'school_zone_id'],
      dtype='object')

In [29]:
# Attach zone id, rank and city of every school in the choice set
school_attrs = schools.loc[:, ['school_id', 'school_zone_id', 'sw_rank_2108', 'City']]
df = all_choices.merge(school_attrs,
                       left_on='school_choice_set',
                       right_on='school_id')
df = df.sort_values('obs_id')

In [30]:
# Lower-case both city names so the comparison ignores case. The vectorised
# `.str.lower()` keeps missing names as NaN, whereas calling `x.lower()` on a
# missing (float NaN) value raises AttributeError.
df['home_city'] = df['home_city'].str.lower()
df['City'] = df['City'].str.lower()
# 1 when home and school are in the same city; missing names never match.
df['same_city?'] = 1*(df['home_city'] == df['City'])

In [32]:
# Attach the BEAM generalized time and cost of each home-zone / school-zone
# pair; rows without a matching skim are kept (left join).
data_long = df.merge(reset_beam_skims,
                     how='left',
                     left_on=['home_zone_id', 'school_zone_id'],
                     right_on=['from_zone_id', 'to_zone_id'])

In [94]:
# Impute missing school ranks with the mean rank over the long table
avg_grade = data_long['sw_rank_2108'].mean()
data_long['rank'] = data_long['sw_rank_2108'].fillna(avg_grade)

## Model estimation

In [84]:
# List of schools that are not in any choice set; these ids are removed from
# the alternatives used in the model specification.
list_1 = [1459, 1685, 1746, 1749, 1750, 2006, 2063, 2327, 2662, 2679, 3378, 3379, 3381, 3432, 3454]


@orca.injectable(autocall=False)
def SLCM(data_long):
    """Estimate the nested logit school location choice model with pylogit.

    Adds the transformed and interaction columns used by the specification
    to ``data_long`` (in place), builds the specification and the nest
    structure, and fits the model by maximum likelihood.

    Input: data_long: long-format choice table with one row per
               (obs_id, school_choice_set) pair. Columns read here:
               gen_tt_CAR, dist, rank, same_city?, hh_inc_under_25k,
               hh_inc_25_to_75k, hh_inc_75_to_200k, plus obs_id,
               school_choice_set and choice, which are passed to pylogit.
    Output: the fitted pylogit model object.
    """
    # Alternative ids: 1000-2827 are public schools, 2828-3546 private ones.
    # Schools that are in no choice set (see `list_1`) are removed.
    def public_alternatives():
        return list(set(range(1000, 2828, 1))
                    - set([1459, 1685, 1746, 1749, 1750, 2006, 2063, 2327, 2662, 2679]))

    def private_alternatives():
        return list(set(range(2828, 3547, 1))
                    - set([3378, 3379, 3381, 3432, 3454]))

    def all_alternatives():
        return list(set(range(1000, 3547, 1)) - set(list_1))

    # Nest structure: one nest per school type
    nest_membership = OrderedDict([
        ("Public School", public_alternatives()),
        ("Private School", private_alternatives()),
    ])

    # ---- Derived columns (written into data_long) ----
    log_tt = "np.log1p(gen_tt_CAR)"
    log_dist = 'np.log1p(dist)'

    # Log of car generalized travel time, and log of distance divided by 1000
    data_long[log_tt] = np.log1p(data_long['gen_tt_CAR'])
    data_long[log_dist] = np.log1p(data_long['dist'] / 1000)
    data_long['np.log1p(dist):np.log1p(gen_tt_CAR)'] = data_long[log_dist] * data_long[log_tt]

    # Rank bands: up to 4, above 4 up to 7, above 7
    data_long['rank_1_4'] = data_long['rank'].apply(lambda r: int(r <= 4))
    data_long['rank_5_7'] = data_long['rank'].apply(lambda r: int(4 < r <= 7))
    data_long['rank_8_9'] = data_long['rank'].apply(lambda r: int(r > 7))

    # School rank interacted with each household income dummy
    for income in ('hh_inc_under_25k', 'hh_inc_25_to_75k', 'hh_inc_75_to_200k'):
        data_long['rank:' + income] = data_long['rank'] * data_long[income]

    # Distance interacted with the same-city flag and with each rank band
    data_long['np.log1p(dist):sameCity'] = data_long[log_dist] * data_long['same_city?']
    for band in ('rank_1_4', 'rank_5_7', 'rank_8_9'):
        data_long['np.log1p(dist):' + band] = data_long[log_dist] * data_long[band]

    # ---- Model specification ----
    # (column, display name, alternatives sharing the single coefficient).
    # The order of the entries fixes the order of the coefficients.
    # NOTE: the other generalized time and cost terms, the income
    # interactions with cost and distance, and np.log1p(dist):rank_1_4 (the
    # base band) were tried and are not part of this specification.
    terms = [
        (log_tt, "np.log1p(gen_tt_CAR)", all_alternatives),
        (log_dist, 'np.log1p(dist)', all_alternatives),
        ("rank", 'School Rank', public_alternatives),
        ('same_city?', 'same_city?', public_alternatives),
        ('np.log1p(dist):np.log1p(gen_tt_CAR)',
         'np.log1p(dist):np.log1p(gen_tt_CAR)', all_alternatives),
        ('rank:hh_inc_under_25k', 'rank:hh_inc_under_25k', all_alternatives),
        ('rank:hh_inc_25_to_75k', 'rank:hh_inc_25_to_75k', all_alternatives),
        ('rank:hh_inc_75_to_200k', 'rank:hh_inc_75_to_200k', all_alternatives),
        ('np.log1p(dist):sameCity', 'np.log1p(dist):sameCity', all_alternatives),
        ('np.log1p(dist):rank_5_7', 'np.log1p(dist):rank_5_7', all_alternatives),
        ('np.log1p(dist):rank_8_9', 'np.log1p(dist):rank_8_9', all_alternatives),
    ]

    basic_specification = OrderedDict()
    basic_names = OrderedDict()
    for column, label, alternatives in terms:
        # A single inner list means one coefficient shared by those alternatives
        basic_specification[column] = [alternatives()]
        basic_names[column] = [label]

    # ---- Estimation ----
    model = pl.create_choice_model(data=data_long,
                                   alt_id_col='school_choice_set',
                                   obs_id_col='obs_id',
                                   choice_col='choice',
                                   specification=basic_specification,
                                   model_type="Nested Logit",
                                   names=basic_names,
                                   nest_spec=nest_membership)

    # 13 starting values: 2 nest parameters plus the 11 coefficients above.
    # Position 0 is held fixed during the optimization.
    model.fit_mle(np.zeros(13), constrained_pos=[0])

    # Results
    return model

In [85]:
# Estimate the model and display its summary table. The summary object is
# shown directly; the previous `.add_table_params` suffix was an uncalled
# bound method whose repr only happened to embed the table.
orca.get_injectable('SLCM')(data_long).get_statsmodels_summary()

Log-likelihood at zero: -11,789.9798
Initial Log-likelihood: -11,789.9798
Estimation Time for Point Estimation: 3.43 minutes.
Final log-likelihood: -7,201.8418


<bound method Summary.add_table_params of <class 'statsmodels.iolib.summary.Summary'>
"""
                    Nested Logit Model Regression Results                     
Dep. Variable:                 choice   No. Observations:                2,875
Model:             Nested Logit Model   Df Residuals:                    2,862
Method:                           MLE   Df Model:                           13
Date:                Wed, 03 Jul 2019   Pseudo R-squ.:                   0.389
Time:                        15:32:42   Pseudo R-bar-squ.:               0.388
AIC:                       14,429.684   Log-Likelihood:             -7,201.842
BIC:                       14,507.213   LL-Null:                   -11,789.980
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Public School                                0        nan        nan  

In [86]:
## Final-ll record 

# -7,965.2369 With np.log1p
# -8,129.6396 without np.log1p
# -7,947.6792 with np.log1p in car cost
# -7,935.404
# -7,640.791 Adding Same city Same city For all public and private
# -7,513.121 Same city for only public schools
# -7,273.447 Interaction terms (distance:tt_car) (rank:Income)
# -7,261.701 Interaction (dist:SameCity)
# -7,264.1362 Deleting p-values > 0.05
# -7,233.017 Log1p distance transformation
# -7,227.987 Distance and family income (Excluded)
# -7,215.6650 Distance and school rank
# -7,214.3486 scale distances from meters to km
# -7,220.9058 Excluding distance and rank
# -7,215.3930 Adding gen cost variables [Do not include any]
# -7,222.265 Deleting tt_WALK_TRANSIT
# -7,201.830 Adding log distance and school rank (1-4 rank as base)
# -7,201.842 Deleting tt_WALK

In [87]:
# Call the SLCM injectable on the long table and keep the fitted model.
# Note that this runs the estimation again.
model_object = orca.get_injectable('SLCM')(data_long)

Log-likelihood at zero: -11,789.9798
Initial Log-likelihood: -11,789.9798
Estimation Time for Point Estimation: 3.34 minutes.
Final log-likelihood: -7,201.8418


In [90]:
# Serialize the fitted model to a pkl file with dill. The `with` block closes
# the file even if the dump fails.
file_Name = "/home/juan/activitysynth/activitysynth/configs/SLCM_gen_tt.pkl"
with open(file_Name, 'wb') as fileObject:
    dill.dump(model_object, fileObject)

In [91]:
# Testing the pkl file: load the model back with plain pickle and show its
# summary. The `with` block closes the file, which was previously left open.
with open(file_Name, 'rb') as fileObject:
    # load the object from the file into var b
    b = pickle.load(fileObject)
# Display the summary object itself rather than the repr of an uncalled
# bound method.
b.get_statsmodels_summary()

<bound method Summary.add_table_params of <class 'statsmodels.iolib.summary.Summary'>
"""
                    Nested Logit Model Regression Results                     
Dep. Variable:                 choice   No. Observations:                2,875
Model:             Nested Logit Model   Df Residuals:                    2,862
Method:                           MLE   Df Model:                           13
Date:                Wed, 03 Jul 2019   Pseudo R-squ.:                   0.389
Time:                        15:46:11   Pseudo R-bar-squ.:               0.388
AIC:                       14,429.684   Log-Likelihood:             -7,201.842
BIC:                       14,507.213   LL-Null:                   -11,789.980
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Public School                                0        nan        nan  

In [97]:
# #Coding for simulation (stand by)
# a = model_object.predict(data_long)
# final = pd.DataFrame({'chosen_school': data_long['school_choice_set'], 'obs_id': data_long['obs_id'], 'probability' : a})

# # Assigning the school with the greatest probability
# b =final.sort_values('probability', ascending= False).drop_duplicates(['obs_id'])