# Specify tables and columns

In [1]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')

# Set data directory

d = '/home/data/fall_2018/'

if 'data_directory' in orca.list_injectables():
    d = orca.get_injectable('data_directory')
    
#from scripts import datasources, models, variables

In [3]:
@orca.table(cache=False)
def persons():
    df = pd.read_csv(
        d + '/CHTS_csv_format/data/Deliv_PER.csv',
        dtype={'SAMPN': 'S', 'PERNO': 'S'},
        index_col = ["SAMPN", "PERNO"]
    )
    return df

#persons_df = pd.read_csv(
#        d + '/CHTS_csv_format/data/Deliv_PER.csv',
#        dtype={'HHPER': 'S'},
#        index_col = ["SAMPN", "PERNO"]
#    )
persons_df = orca.get_table("persons").to_frame()

persons_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,RELAT,GEND,AGE,AGEB,HISP,RACE1,RACE2,RACE3,RACE4,O_RACE,...,SCTFIP,STRACT,WPrimaryCity,WSTFIP,W2PrimaryCity,W2STFIP,SPrimaryCity,SSTFIP,PERWGT,EXPPERWGT
SAMPN,PERNO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1031985,1,1,1,74,,2,1.0,,,,,...,,,,,,,,,0.052086,17.647568
1031985,2,2,2,73,,2,1.0,,,,,...,,,,,,,,,0.052086,17.647568
1032036,1,1,1,46,,2,1.0,,,,,...,,,SAN DIEGO,6.0,,,,,1.223974,414.701494
1032036,2,2,2,47,,2,1.0,97.0,,,MULTI-RACIAL,...,,,,,,,,,0.863473,292.558373
1032036,3,3,1,15,,2,1.0,97.0,,,MULTI-RACIAL,...,73.0,17030.0,,,,,SAN DIEGO,6.0,0.941412,318.9651


In [2]:
# override orca persons and students tables for estimation
# 
@orca.table(cache=True)
def persons():
    df = pd.read_csv(
        d + 'chts_persons_w_zone_ids.csv',
        index_col = ["SAMPN", "PERNO"]
    )
    return df

#persons_chts = persons_df1.join(persons_df2, how="left")

persons = orca.get_table('persons').to_frame()
students = persons.loc[persons['STUDE'].isin([1, 2])# full time & part time students
                 & persons['SCHOL'].isin([3,  # Kindergarten to grade 8
                                          4,  # Grades 9 to 12 
                                          6,  # 2-year college (community college) 
                                          7,  # 4-year college or university 
                                          8]) # Graduate school / Professional 
                 & (~persons['SNAME_lookup'].isna()) 
                 & (persons['SNAME_lookup'] != "DK/RF")]
len(students)

schools_raw = students.groupby(by=["SCHOL", "SNAME_lookup", "SZIP_lookup"]) \
                      .size().reset_index(name='enrollment')
# There are 3505 unique "schools", most of them have 1 student in the CHTS sample
# For now, keep only schools with 3+ students in the CHTS sample
schools = schools_raw.loc[schools_raw['enrollment'] >= 3].reset_index(drop=True) #.drop(columns=["enrollment"])
schools.index.name = "school_id"
schools.reset_index(inplace=True)

students = pd.merge(students, schools.drop(columns=["enrollment"]), 
                    how="inner", on=["SCHOL", "SNAME_lookup", "SZIP_lookup"])
schools = schools[['school_id', 'enrollment']]
#students = students.loc[~students['school_id'].isna()]

orca.add_table('students', students)
orca.add_table('schools', schools)
#len(students)
#len(schools)

<orca.orca.DataFrameWrapper at 0x7fa92f817b00>

In [3]:
## DIAGNOSTICS
#schools
#students[["index", "school_id"]]

In [4]:
# Moved to scripts/variables.py

#@orca.column('students', 'is_college_student', cache=True)
#def is_college_student(students):
#    is_college_map = {3: 0,
#                      4: 0,
#                      6: 1,
#                      7: 1,
#                      8: 1}
#    return students.SCHOL.map(is_college_map)

#@orca.column('students', 'school_id', cache=True)
#def school_id(students, schools):
#    misc.reindex()
#    return pd.merge(students, schools, how="left", on=["SCHOL", "SNAME_lookup", "SZIP_lookup"])["school_id"]

@orca.column("students")
def zone_id_school(students, persons):
    fake_zone_id_school = np.random.choice(np.unique(persons.zone_id_home), len(students.school_id))
    return fake_zone_id_school


## load skims for interaction terms

In [5]:
# Travel Time
skims = pd.read_csv(d + '/mtc_skims/TimeSkimsDatabaseAM.csv')
interaction_terms_tt = skims[['orig', 'dest', 'da', 'wTrnW']].rename(
    columns={'orig': 'zone_id_home', 'dest': 'zone_id_school', 'da': 'tt_da', 'wTrnW': 'tt_wTrnW'})
interaction_terms_tt.set_index(['zone_id_home', 'zone_id_school'], inplace=True)
#interaction_terms.to_csv('./data/WLCM_interaction_terms_tt.csv')

# Distance
skims = pd.read_csv(d + '/mtc_skims/DistanceSkimsDatabaseAM.csv')
interaction_terms_dist = skims[['orig', 'dest', 'da', 'walk']].rename(
    columns={'orig': 'zone_id_home', 'dest': 'zone_id_school', 'da': 'dist_da', 'walk': 'dist_walk'})
interaction_terms_dist.set_index(['zone_id_home', 'zone_id_school'], inplace=True)
#interaction_terms_dist.to_csv('./data/WLCM_interaction_terms_dist.csv')

# Cost
skims = pd.read_csv(d + '/mtc_skims/CostSkimsDatabaseAM.csv')
interaction_terms_cost = skims[['orig', 'dest', 'daToll', 'wTrnW']].rename(
    columns={'orig': 'zone_id_home', 'dest': 'zone_id_school', 'daToll': 'cost_da_toll', 'wTrnW': 'cost_wTrnW'})
interaction_terms_cost.set_index(['zone_id_home', 'zone_id_school'], inplace=True)
#interaction_terms_cost.to_csv('./data/WLCM_interaction_terms_cost.csv')


In [7]:
from choicemodels.tools import MergedChoiceTable

students = orca.get_table("students")
students.zone_id_home
students.zone_id_school
students = students.to_frame()

schools = orca.get_table("schools").to_frame()

In [8]:
#%%time
#%memit
mct = MergedChoiceTable(students, schools, chosen_alternatives='school_id',
                        sample_size=10, interaction_terms=[
                            interaction_terms_tt, interaction_terms_dist, interaction_terms_cost])

# Configure models

In [9]:
from urbansim_templates import modelmanager
from urbansim_templates.models import SmallMultinomialLogitStep, LargeMultinomialLogitStep, SegmentedLargeMultinomialLogitStep

modelmanager.initialize()

m0 = LargeMultinomialLogitStep(
    constrained_choices=True,
    alt_sample_size=10
)

Registering model step 'auto_ownership'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM'


In [10]:
m0.model_expression = (
    'tt_da'
)

m0.fit(mct)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          1,595
Model:         Multinomial Logit   Df Residuals:              1,593
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-03-19   Pseudo R-squ.:             0.000
Time:                      22:01   Pseudo R-bar-squ.:        -0.001
AIC:                   7,349.246   Log-Likelihood:       -3,672.623
BIC:                   7,359.996   LL-Null:              -3,672.623
               coef   std err         z     P>|z|   Conf. Int.
--------------------------------------------------------------
Intercept    0.0000     0.058     0.000     1.000             
tt_da        0.0000     0.001     0.000     1.000             


In [11]:
from urbansim_templates.models import SegmentedLargeMultinomialLogitStep

m = SegmentedLargeMultinomialLogitStep(
      defaults = m0,
      name = "school-choice-model",
      segmentation_column = "is_college_student",
      )

In [12]:
m.model_expression = (
    'tt_da'
)
# this is not working yet, as m.fit_all() does not accept mct argument
m.fit_all(mct)

TypeError: fit_all() takes 1 positional argument but 2 were given

In [None]:
m.name = 'School-Choice-Model'
mm.register(m)