In [1]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')
from urbansim.utils import misc
import pandana as pdna
    
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import pandas as pd
import orca
# import os; os.chdir('/home/juan/ual_model_workspace/spring-2019-models/')
import warnings; warnings.simplefilter('ignore')
from matplotlib import pyplot as plt
import matplotlib.animation as animation
import seaborn as sns

# from scripts import datasources, models, variables, utils

In [2]:
from scripts import datasources, models, variables, utils

Registering model step 'auto_ownership'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM'


In [3]:
import urbansim_templates
urbansim_templates.__version__

'0.1.1'

# Loading data

In [4]:
orca.run(['initialize_network_small', 'initialize_network_walk','impute_missing_skims']) 

Running step 'initialize_network_small'
Time to execute step 'initialize_network_small': 0.00 s
Running step 'initialize_network_walk'
Time to execute step 'initialize_network_walk': 0.00 s
Running step 'impute_missing_skims'
Time to execute step 'impute_missing_skims': 95.98 s
Total time to execute iteration 1 with iteration value None: 95.98 s


In [52]:
#Loading Data 
beam_skims = orca.get_table('beam_skims').to_frame()
reset_beam_skims = beam_skims.reset_index()
students = pd.read_csv('/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/students_with_school_id.csv')
schools = pd.read_csv('/home/juan/ual_model_workspace/spring-2019-models/notebooks-juan/schools.csv').rename({"parcel_id": "school_parcel_id"}, axis=1)
parcels = orca.get_table('parcels').to_frame()

#Preprocessing
students = students[students.AGE <= 18]

In [33]:
#Adding list of grades offered by each school
list_grades = []
for index, row in schools.loc[:,schools.columns.str.startswith("grade_")].iterrows():
    x = np.array(row)
    list_grades.append(x)
    
schools['list_grades'] = list_grades

## Defining set choice contrains

In [34]:
#Small netwrok 
nodessmall = pd.read_csv('/home/data/fall_2018/bay_area_tertiary_strongly_nodes.csv').set_index('osmid')
edgessmall = pd.read_csv('/home/data/fall_2018/bay_area_tertiary_strongly_edges.csv')
netsmall = pdna.Network(nodessmall.x, nodessmall.y, edgessmall.u,
                                edgessmall.v, edgessmall[['length']],
                                twoway=False)
netsmall.precompute(25000)

def node_id_small(x, y, netsmall):
    """ Return the node ID given a pair of coordinates"""
    idssmall = netsmall.get_node_ids(x, y)
    return idssmall

# schools['nodeID'] = node_id_small(schools, netsmall)

In [35]:
#Merging students and schools 
students_1 = students.merge(schools, how = 'left', on = 'school_id')

# Droping students with no assigned school
students_1 = students_1.dropna(subset=['school_id'])

In [36]:
#Define the node ID for each home and school location in students dataset
students_1['node_id_home'] = node_id_small(students_1.HXCORD, students_1.HYCORD, netsmall)
students_1['node_id_school'] = node_id_small(students_1.Longitude, students_1.Latitude, netsmall)

#Creating a df for public shcools only
df_public = students_1[students_1.school_id <= 2827].loc[:,['SAMPN', 'PERNO', 'school_id',
                                                        'AGE','HCITY','HYCORD','HXCORD',
                                                        'SNAME_lookup','SCITY_lookup', 
                                                        'node_id_home', 'node_id_school']]

#Creating a df for private shcools only
df_private = students_1[students_1.school_id > 2827].loc[:,['SAMPN', 'PERNO', 'school_id',
                                                        'AGE','HCITY','HYCORD','HXCORD',
                                                        'SNAME_lookup','SCITY_lookup', 
                                                        'node_id_home', 'node_id_school']]

In [37]:
#Setting public and private schools as POIs. 
netsmall.set_pois('public_school', 500000, 10000, 
                  schools[schools.type == 'public'].Longitude, 
                  schools[schools.type == 'public'].Latitude, )

netsmall.set_pois('private_school', 500000, 10000, 
                  schools[schools.type == 'private'].Longitude, 
                  schools[schools.type == 'private'].Latitude)

In [38]:
#Public schools. 95% of the time, the school is within the x's closest schools. Finding x
n =50

# n closest public schools per each node in netsmall
distance_matrix_public = netsmall.nearest_pois(200000, 
                                               'public_school', 
                                               num_pois=n,
                                               include_poi_ids=True)

# Selects POIS id's only 
public_nodes = distance_matrix_public.iloc[:,n:]

# Creates a list of n closest POIs for each node
list_values = []
for index, row in public_nodes.iterrows():
    x = np.array(row)
    x = x[~np.isnan(x)]
    list_values.append(x)

#Add created list to public nodes 
public_nodes['list_values'] = list_values


merge = df_public.merge(public_nodes.loc[:,['list_values']], how = 'left', left_on = 'node_id_home', right_index= True)

school_position = []
for index, row in merge.iterrows():
    school_position.append(np.isin(row['school_id']-1000,row['list_values'])==True)
    
merge['school_position'] = school_position

merge.school_position.mean()

0.9131504922644164

In [39]:
#Private schools. 95% of the time, the school is within the x's closest schools. Finding x
n =100

distance_matrix_private = netsmall.nearest_pois(100000, 
                                                'private_school', 
                                                num_pois=n, 
                                                include_poi_ids=True)

private_nodes = distance_matrix_private.iloc[:,n:]

list_values = []
for index, row in private_nodes.iterrows():
    x = np.array(row)
    x = x[~np.isnan(x)]
    list_values.append(x)
    
private_nodes['list_values'] = list_values


merge = df_private.merge(private_nodes.loc[:,['list_values']], how = 'left', left_on = 'node_id_home', right_index= True)

school_position = []
for index, row in merge.iterrows():
    school_position.append(np.isin(row['school_id']-1000,row['list_values'])==True)
    
merge['school_position'] = school_position

merge.school_position.mean()

0.8758465011286681

In [40]:
# students = students[students.AGE <= 18]

In [41]:
def school_available(age, grades_offered):
    """ Checks if a school offers a grade according to the age range
    Input: Age: int in range (5-18)
           grades_offered: 13 element array
    Output: True if grade if offered acoording to age, false otherwise. 
     """
    index = age - 6
    
    if index < 0:
        index = 0

    if index < 12:
        result = (grades_offered[index] == 1) | (grades_offered[index + 1] == 1)
        
    elif index == 12:
        result = (grades_offered[index] == 1)
    
    return result

In [42]:
def school_choice_set(house_node_id, kid_age):
    """ Determines the school choice set given home node id and age of the student (4-18)
    Output: Pandas series with available school IDs"""
    
    public_id = distance_matrix_public.iloc[:,50:].loc[house_node_id] + 1000
    private_id = distance_matrix_private.iloc[:,100:].loc[house_node_id] + 1000
    schools_filter = pd.concat([public_id, private_id])
    schools_set = schools[schools.school_id.isin(schools_filter)]
    school_availability = [school_available(kid_age, x) for x in schools_set.list_grades]
    schools_available = schools_set[school_availability].school_id
    try:
        schools_available = schools_set[school_availability].school_id
        return schools_available
    except Exception as e:
        print(e)
        print("empty dataframe?")
    
    return np.nan

In [43]:
def get_zone_id(parcel_id):
    '''gets the zone_id (TAZ) of school and home locations by the parcel id'''
    try: 
        parcel_id = (parcels.iloc[parcel_id])["zone_id"]

    except Exception as e:
#         print(e)
        parcel_id = np.nan
    return parcel_id

In [44]:
students_1.columns

Index(['level_0', 'SAMPN', 'PERNO', 'Unnamed: 0', 'index', 'RELAT', 'GEND',
       'AGE', 'AGEB', 'HISP',
       ...
       'sw_rank_2017', 'sw_rank_2016', 'ss_rank_2018_', 'ss_rank_2017',
       'ss_rank_2016', 'nodeID', 'school_parcel_id', 'list_grades',
       'node_id_home', 'node_id_school'],
      dtype='object', length=246)

## Create parameters table with distances and generalized cost

In [49]:
stu = students_1[["node_id_home", "AGE", "school_parcel_id", "parcel_id_home"]] 
dfs = []
for index, record in stu.iterrows():
    age = int(record["AGE"])
    parcel_id_home = int(record["parcel_id_home"])
    node_id_home = int(record["node_id_home"])
    school_choices = school_choice_set(node_id_home, age)
    school_parcel_id = [schools[schools["school_id"] == choice]["school_parcel_id"].values[0] for choice in school_choices]
    df = pd.DataFrame({"node_id_home": node_id_home, "parcel_id_home": parcel_id_home, "age": age,
                       "school_choice": school_choices, "parcel_id_school": school_parcel_id})
    df["school_zone_id"] = df["parcel_id_school"].apply(get_zone_id)
    df["home_zone_id"] = df["parcel_id_home"].apply(get_zone_id)

    
    dfs.append(df)
    
all_choices = pd.concat(dfs, axis=0)

In [53]:
all_choices_costs = pd.merge(all_choices, reset_beam_skims,  how='left', left_on=['school_zone_id','home_zone_id'], right_on = ['to_zone_id','from_zone_id'])
all_choices_costs = all_choices_costs.set_index("node_id_home")
all_choices_costs.head()
all_choices_costs.to_csv("all_choices_costs.csv")

In [54]:
print(len(all_choices))
print(len(all_choices_costs))

291109
291109


In [58]:
all_choices_costs.head()

Unnamed: 0_level_0,parcel_id_home,age,school_choice,parcel_id_school,school_zone_id,home_zone_id,from_zone_id,to_zone_id,dist,gen_cost_BIKE,...,gen_cost_WALK,gen_cost_WALK_TRANSIT,gen_tt_BIKE,gen_tt_CAR,gen_tt_DRIVE_TRANSIT,gen_tt_RIDE_HAIL,gen_tt_RIDE_HAIL_POOLED,gen_tt_RIDE_HAIL_TRANSIT,gen_tt_WALK,gen_tt_WALK_TRANSIT
node_id_home,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
65497078,1195607,17,2021,1195135,901.0,742.0,742.0,901.0,36161.8698,46.519338,...,5338.670369,2064.891192,150.786537,32.394499,53.645277,58.084325,45.132109,51.227852,635.884119,2156.841723
65497078,1195607,17,2093,1171860,1273.0,742.0,742.0,1273.0,89382.7436,114.983713,...,13195.805621,5103.874356,372.705131,80.070782,132.597183,143.569354,111.554843,126.621935,1571.740275,5331.152172
65497078,1195607,17,2094,1186767,1085.0,742.0,742.0,1085.0,65387.4842,84.115741,...,9653.323413,3733.712911,272.650512,58.575367,97.000784,105.027419,81.607369,92.629622,1149.798476,3899.976823
65497078,1195607,17,2095,1195560,883.0,742.0,742.0,883.0,32782.2558,42.171736,...,4839.729213,1871.910706,136.694338,29.366976,48.6317,52.655883,40.914155,46.440202,576.455697,1955.26773
65497078,1195607,17,2097,1192891,761.0,742.0,742.0,761.0,26087.4014,33.559344,...,3851.350542,1489.625555,108.778361,23.369596,38.700042,41.902399,32.558589,36.956096,458.730822,1555.959249


In [29]:
school_choice_set(5418694673, 5).head()

944    1944
946    1946
947    1947
948    1948
949    1949
Name: school_id, dtype: int64

## Model estimation

In [49]:
import pylogit as pl 
import math 
from collections import OrderedDict 

In [88]:
data_long = pd.DataFrame({'household_id': [1,1,1,2,2,2,3,3], 
                          'school_id': [1,2,3,6,3,8,7,8],
                          'choice': [1,0,0,1,0,0,0,1],
                          'travel_distance_car': [2,4,6,4,5,6,8,9]})
data_long

Unnamed: 0,household_id,school_id,choice,travel_distance_car
0,1,1,1,2
1,1,2,0,4
2,1,3,0,6
3,2,6,1,4
4,2,3,0,5
5,2,8,0,6
6,3,7,0,8
7,3,8,1,9


In [89]:
# # Specify the nesting values
# nest_membership = OrderedDict()
# nest_membership["Ride alone"] = [1,2,3]
# nest_membership["Shared option"] = [6,7,8]

# def logit(x):
#     """
#     Parameters
#     ----------
#     x : int, float, or 1D ndarray.
#         If an array, all elements should be ints or floats. All
#         elements should be between zero and one, exclusive of 1.0.

#     Returns
#     -------
#     The logit of x:  `np.log(x / (1.0 - x))`.
#     """
#     return np.log(x/(1.0 - x))

In [147]:
# basic_specification = OrderedDict()
# basic_names = OrderedDict()

# basic_specification["travel_distance_car"] = [[1,2,3,6,7,8]]
# basic_names["travel_distance_car"] = ['Travel Time']

In [148]:
# # Estimate the nested multinomial logit model (MNL)
# SLCM = pl.create_choice_model(data = data_long,
#                                         alt_id_col = 'school_id',
#                                         obs_id_col = 'household_id',
#                                         choice_col = 'choice',
#                                         specification = basic_specification,
#                                         model_type="Nested Logit",
#                                         names=basic_names,
#                                         nest_spec=nest_membership)

# # Specify the initial values and method for the optimization.
# SLCM.fit_mle(np.zeros(3),constrained_pos=[0])

# type(SLCM.get_statsmodels_summary())

In [149]:
# SLCM.get_statsmodels_summary().add_table_params

In [152]:
#Orca step
data_long = pd.DataFrame({'household_id': [1,1,1,2,2,2,3,3], 
                          'school_id': [1,2,3,6,3,8,7,8],
                          'choice': [1,0,0,1,0,0,0,1],
                          'travel_distance_car': [2,4,6,4,5,6,8,9]})

@orca.injectable(autocall=False)
def SLCM(data_long):
    # Specify the nested structure
    nest_membership = OrderedDict()
    nest_membership["Public School"] = [1,2,3]
    nest_membership["Private School"] = [6,7,8]
    
    #Model especification
    basic_specification = OrderedDict()
    basic_names = OrderedDict()
    
    # Travel time
    basic_specification["travel_distance_car"] = [[1,2,3,6,7,8]]
    basic_names["travel_distance_car"] = ['Travel Time']

    # Estimate the nested multinomial logit model (MNL)
    SLCM = pl.create_choice_model(data = data_long,
                                        alt_id_col = 'school_id',
                                        obs_id_col = 'household_id',
                                        choice_col = 'choice',
                                        specification = basic_specification,
                                        model_type="Nested Logit",
                                        names=basic_names,
                                        nest_spec=nest_membership)

    # Specify the initial values and method for the optimization.
    SLCM.fit_mle(np.zeros(3),constrained_pos=[0])
        
    #Resutls
    return SLCM

In [153]:
orca.get_injectable('SLCM')(data_long).get_statsmodels_summary().add_table_params

Log-likelihood at zero: -3.0197
Initial Log-likelihood: -3.0197
Estimation Time for Point Estimation: 0.22 seconds.
Final log-likelihood: -1.7142


<bound method Summary.add_table_params of <class 'statsmodels.iolib.summary.Summary'>
"""
                    Nested Logit Model Regression Results                     
Dep. Variable:                 choice   No. Observations:                    3
Model:             Nested Logit Model   Df Residuals:                        0
Method:                           MLE   Df Model:                            3
Date:                Mon, 17 Jun 2019   Pseudo R-squ.:                   0.432
Time:                        20:10:57   Pseudo R-bar-squ.:              -0.561
AIC:                            9.428   Log-Likelihood:                 -1.714
BIC:                            6.724   LL-Null:                        -3.020
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Public School           0        nan        nan        nan         nan         nan
Private School    15.5584   5

# Additional code

In [154]:
#'network_aggregations_small'])
# orca.run(['initialize_network_walk', 'network_aggregations_walk'])

# orca.get_table('nodeswalk').to_frame().to_csv('./data/walk_net_vars.csv')
# orca.get_table('nodessmall').to_frame().to_csv('./data/drive_net_vars.csv')
# orca.get_table('zones').to_frame().to_csv('./data/zones_w_access_vars-2025-b-lt.csv')

In [155]:
# os.listdir("/home/data/spring_2019/2025")

In [156]:
# a = students.loc[:,['SAMPN', 'PERNO', 'AGE', 'HPrimaryCity','school_id']].merge(schools.loc[:,['school_id','City','District']], how = 'inner', on = 'school_id')
# students[students.SAMPN == 1342402].loc[:,['SNAME_lookup','school_id', 'HPrimaryCity']]
# schools[schools.school_id == 1710]