# Specify tables and columns

In [1]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')
from urbansim.utils import misc

# Set data directory

# Directory holding the input CSVs. An orca 'data_directory' injectable,
# when one has been registered, takes precedence over the built-in default.
d = (
    orca.get_injectable('data_directory')
    if 'data_directory' in orca.list_injectables()
    else '/home/data/fall_2018/'
)

#orca.add_injectable('data_mode', "csv")
#orca.add_injectable('local_data_dir', d)
#orca.add_injectable('store', None)

#from scripts import datasources, models, variables

In [2]:
import urbansim_templates
urbansim_templates.__version__

'0.2.dev3'

In [3]:
# from scripts/datasources.py
@orca.table(cache=True)
def parcels():
    """Read ``parcel_attr.csv`` from the data directory ``d``.

    Returns
    -------
    pandas.DataFrame
        Parcel attributes indexed by ``primary_id`` (read as int), with
        ``block_id`` read as a string.
    """
    column_types = {'primary_id': int, 'block_id': str}
    return pd.read_csv(
        d + 'parcel_attr.csv',
        index_col='primary_id',
        dtype=column_types,
    )

# override orca persons and students tables for estimation
# 
#orca.clear_all()
@orca.table(cache=True)
def persons():
    """Read ``chts_persons_w_zone_ids.csv`` from the data directory ``d``.

    Returns
    -------
    pandas.DataFrame
        Persons table with a two-level index (``SAMPN``, ``PERNO``).
    """
    person_key = ["SAMPN", "PERNO"]
    return pd.read_csv(d + 'chts_persons_w_zone_ids.csv', index_col=person_key)

#orca.
#orca.add_table("persons", persons_df)

@orca.table(cache=False)
def students(persons):
    """Build the students table from ``chts_students_k-12.csv``.

    Parameters
    ----------
    persons : orca table wrapper
        The ``persons`` table injected by orca; it is converted with
        ``to_frame()`` and joined onto the student rows.

    Returns
    -------
    pandas.DataFrame
        One row per student with a non-missing ``CDSCode``, left-joined with
        the person attributes on (``SAMPN``, ``PERNO``), without the
        ``Unnamed: 0`` column.

    Raises
    ------
    KeyError
        If the merged frame has no ``Unnamed: 0`` column.
    """
    stu_df = pd.read_csv(
        d + 'chts_students_k-12.csv',
        dtype={'CDSCode': str},
    )
    # Students without a school code cannot be matched to a school.
    stu_df = stu_df.loc[stu_df["CDSCode"].notna()]
    pp_df = persons.to_frame()
    stu_df = pd.merge(stu_df, pp_df, how="left", on=["SAMPN", "PERNO"])
    # DataFrame.drop returns a new frame; the result must be kept, otherwise
    # the 'Unnamed: 0' column silently stays in the table.
    stu_df = stu_df.drop(columns=['Unnamed: 0'])
    return stu_df

@orca.table(cache=False)
def schools():
    """Read ``schools.csv`` and keep only schools with a unique ``CDSCode``.

    Returns
    -------
    pandas.DataFrame
        Schools indexed by ``CDSCode`` (read as a string). Rows whose code
        occurs more than once in the file are excluded, ``Unnamed: 0`` is
        dropped, and the per-code count column ``n`` (always 1) is retained.
    """
    schools_raw = pd.read_csv(
        d + 'schools.csv',
        index_col="CDSCode",
        dtype={'CDSCode': str},
    )
    # Exclude schools with a duplicated CDSCode for now.
    # TODO schools.csv needs to be fixed
    code_counts = schools_raw.groupby(by=["CDSCode"]).size().reset_index(name='n')
    unique_codes = code_counts.query('n==1')
    # The inner join keeps only the rows whose code appears exactly once.
    merged = pd.merge(schools_raw, unique_codes, how="inner", on="CDSCode")
    return merged.set_index("CDSCode").drop(columns=['Unnamed: 0'])


#orca.get_table("schools").to_frame().dtypes

#persons = orca.get_table('persons').to_frame()
#students = persons.loc[persons['STUDE'].isin([1, 2])# full time & part time students
#                 & persons['SCHOL'].isin([3,  # Kindergarten to grade 8
#                                          4,  # Grades 9 to 12 
#                                          6,  # 2-year college (community college) 
#                                          7,  # 4-year college or university 
#                                          8]) # Graduate school / Professional 
#                 & (~persons['SNAME_lookup'].isna()) 
#                 & (persons['SNAME_lookup'] != "DK/RF")]
#

#schools_raw = students.groupby(by=["SCHOL", "SNAME_lookup", "SZIP_lookup"]) \
#                      .size().reset_index(name='enrollment')
# There are 3505 unique "schools", most of them have 1 student in the CHTS sample
# For now, keep only schools with 3+ students in the CHTS sample
#schools = schools_raw.loc[schools_raw['enrollment'] >= 3].reset_index(drop=True) #.drop(columns=["enrollment"])
#schools.index.name = "school_id"
#schools.reset_index(inplace=True)

#students = pd.merge(students, schools.drop(columns=["enrollment"]), 
#                    how="inner", on=["SCHOL", "SNAME_lookup", "SZIP_lookup"])
#schools = schools[['school_id', 'enrollment']]
#students = students.loc[~students['school_id'].isna()]

#orca.add_table('students', students)

## Verification
#persons_df = orca.get_table("persons").to_frame()
#persons_df.head()

#students_df = orca.get_table("students").to_frame()
#students_df.head()
#len(students_df)

#schools = orca.get_table('schools').to_frame()
#schools.head()


In [4]:
## DIAGNOSTICS
#schools
#students[["index", "school_id"]]
#sch = orca.get_table("schoolx").to_frame()
#sch.groupby(by=["CDSCode"]).size().reset_index(name='n').sort_values(by="n", ascending=False)
#orca.get_table("students").to_frame().dtypes

In [5]:
# To be moved to scripts/variables.py

@orca.column('students', 'is_college_student', cache=True)
def is_college_student(students):
    """Flag each student as college (1) or K-12 (0) from the ``SCHOL`` code.

    ``SCHOL`` values outside the mapping below come out as NaN, because the
    lookup is done with ``Series.map``.
    """
    k12_levels = (
        3,  # Kindergarten to grade 8
        4,  # Grades 9 to 12
    )
    college_levels = (
        6,  # 2-year college (community college)
        7,  # 4-year college or university
        8,  # Graduate school / Professional
    )
    flag_by_level = {
        **{level: 0 for level in k12_levels},
        **{level: 1 for level in college_levels},
    }
    return students.SCHOL.map(flag_by_level)

#@orca.column('students', 'school_id', cache=True)
#def school_id(students, schools):
#    misc.reindex()
#    return pd.merge(students, schools, how="left", on=["SCHOL", "SNAME_lookup", "SZIP_lookup"])["school_id"]

@orca.column("students")
def parcel_id_school(students, schools):
    """Parcel id of each student's school.

    Passes the schools' ``parcel_id`` column and the students' ``CDSCode``
    column to ``misc.reindex`` and returns the result.
    """
    parcel_by_school = schools.parcel_id
    school_of_student = students.CDSCode
    return misc.reindex(parcel_by_school, school_of_student)

@orca.column("students")
def zone_id_school(students, parcels, schools):
    """Zone id of each student's school.

    Passes the parcels' ``zone_id`` column and the students'
    ``parcel_id_school`` column to ``misc.reindex`` and returns the result.
    ``schools`` is accepted but not read in this function.
    """
    zone_by_parcel = parcels.zone_id
    school_parcel = students.parcel_id_school
    return misc.reindex(zone_by_parcel, school_parcel)

#orca.broadcast(
#    'schools', 'students', cast_index=True, onto_on='CDSCode')

## Load skims for interaction terms

In [6]:
def _load_skim_terms(filename, column_names):
    """Read one skim CSV and shape it as an interaction-terms table.

    Parameters
    ----------
    filename : str
        File name inside the ``mtc_skims`` sub-directory of the data
        directory ``d``.
    column_names : dict
        Maps each skim column to keep onto the name it should carry in the
        result. The ``orig``/``dest`` columns are always kept and renamed to
        ``zone_id_home``/``zone_id_school``.

    Returns
    -------
    pandas.DataFrame
        The selected, renamed columns indexed by
        (``zone_id_home``, ``zone_id_school``).
    """
    renames = {'orig': 'zone_id_home', 'dest': 'zone_id_school'}
    renames.update(column_names)
    # os.path.join avoids the doubled separator that `d + '/mtc_skims/...'`
    # produces when `d` already ends with a slash.
    raw = pd.read_csv(os.path.join(d, 'mtc_skims', filename))
    # Chained calls return a new frame each time, so nothing is mutated in
    # place and re-running the cell always starts from the file contents.
    return (raw[list(renames)]
            .rename(columns=renames)
            .set_index(['zone_id_home', 'zone_id_school']))


# Travel time
interaction_terms_tt = _load_skim_terms(
    'TimeSkimsDatabaseAM.csv', {'da': 'tt_da', 'wTrnW': 'tt_wTrnW'})

# Distance
interaction_terms_dist = _load_skim_terms(
    'DistanceSkimsDatabaseAM.csv', {'da': 'dist_da', 'walk': 'dist_walk'})

# Cost
interaction_terms_cost = _load_skim_terms(
    'CostSkimsDatabaseAM.csv', {'daToll': 'cost_da_toll', 'wTrnW': 'cost_wTrnW'})


In [7]:
from choicemodels.tools import MergedChoiceTable

# Materialise the orca tables as plain DataFrames.
pps_df = orca.get_table("persons").to_frame()
students_df = orca.get_table("students").to_frame()
schools_df = orca.get_table("schools").to_frame()

# schools() removes every school whose CDSCode is duplicated, so some
# students no longer have a matching school. Keep only the students whose
# CDSCode is still present in the schools index.
students_df = students_df[students_df["CDSCode"].isin(schools_df.index)]

In [8]:
#students_df.shape
#pd.merge(students_df, schools_df, left_on="CDSCode", right_index=True, how="inner").shape

In [9]:
#schools_df["CDSCode"]

In [10]:
#%%time
#%memit
# Merge students (choosers) with schools (alternatives): the chosen school is
# identified by CDSCode, 20 alternatives are requested per student, and the
# three skim tables are supplied as interaction terms.
# NOTE(review): no random seed is set anywhere in this notebook, so the
# sampled alternatives presumably differ between runs - confirm.
mct = MergedChoiceTable(
    students_df,
    schools_df,
    chosen_alternatives='CDSCode',
    sample_size=20,
    interaction_terms=[
        interaction_terms_tt,
        interaction_terms_dist,
        interaction_terms_cost,
    ],
)

# Configure models

In [11]:
from urbansim_templates import modelmanager
from urbansim_templates.models import LargeMultinomialLogitStep, SegmentedLargeMultinomialLogitStep

# The recorded output of this call lists the model steps it registers.
modelmanager.initialize()

# Baseline school-choice step: students choose among schools, with the chosen
# school recorded in CDSCode.
# NOTE(review): constrained_choices=True is set without any capacity
# settings - confirm this is intended.
m0 = LargeMultinomialLogitStep(
    choosers="students",
    alternatives="schools",
    choice_column="CDSCode",
    alt_sample_size=20,
    constrained_choices=True,
)

Registering model step 'auto_ownership'
Registering model step 'school-choice-model'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM'


In [12]:
# Utility specification: drive-alone travel time only.
# NOTE(review): the recorded results list an Intercept term, every
# coefficient is 0.0000 and Log-Likelihood equals LL-Null. An intercept is
# constant across alternatives; consider 'tt_da - 1' - confirm.
m0.model_expression = 'tt_da'

# Estimate on the pre-built merged choice table.
m0.fit(mct)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          3,359
Model:         Multinomial Logit   Df Residuals:              3,357
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-04-21   Pseudo R-squ.:             0.000
Time:                      03:09   Pseudo R-bar-squ.:        -0.000
AIC:                  20,129.329   Log-Likelihood:      -10,062.665
BIC:                  20,141.568   LL-Null:             -10,062.665
               coef   std err         z     P>|z|   Conf. Int.
--------------------------------------------------------------
Intercept    0.0000     0.026     0.000     1.000             
tt_da        0.0000     0.002     0.000     1.000             


In [13]:
# Requires a recent urbansim_templates release.
# The schools table contains no colleges or universities, so only the K-12
# segment is estimated.
m = SegmentedLargeMultinomialLogitStep(
    name="school-choice-model",
    defaults=m0,
    segmentation_column="is_college_student",
)

In [14]:
m.model_expression = 'tt_da'

# fit_all() is handed the pre-built merged choice table.
# NOTE(review): whether fit_all() accepts this argument depends on the
# installed urbansim_templates version - confirm.
m.fit_all(mct)

Building submodels for 1 categories: [0]
################## SEGMENT: is_college_student = 0 ###################
                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          3,359
Model:         Multinomial Logit   Df Residuals:              3,357
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-04-21   Pseudo R-squ.:             0.000
Time:                      03:09   Pseudo R-bar-squ.:        -0.000
AIC:                  20,129.329   Log-Likelihood:      -10,062.665
BIC:                  20,141.568   LL-Null:             -10,062.665
               coef   std err         z     P>|z|   Conf. Int.
--------------------------------------------------------------
Intercept    0.0000     0.026     0.000     1.000             
tt_da        0.0000     0.002     0.000     1.000             


In [15]:
# Register the segmented step; the recorded output shows this saving
# 'school-choice-model.yaml' and registering the step under that name.
modelmanager.register(m)

Saving 'school-choice-model.yaml': /home/lmwang/workspace/activitysynth/activitysynth/configs
Registering model step 'school-choice-model'


## Manually segment data for model estimation (no longer needed)

In [16]:
# K-12 step: the chooser filter selects rows with is_college_student == 0.
m_k12 = LargeMultinomialLogitStep(
    name="school-choice-model:k12",
    chooser_filters=['is_college_student == 0'],
)

In [17]:
m_k12.model_expression = 'tt_da'

# NOTE(review): fit() is given the shared, pre-built mct. The recorded output
# reports the same 3,359 observations as the unfiltered m0 fit, so
# chooser_filters is presumably not applied on this path - confirm.
m_k12.fit(mct)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          3,359
Model:         Multinomial Logit   Df Residuals:              3,357
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-04-21   Pseudo R-squ.:             0.000
Time:                      03:09   Pseudo R-bar-squ.:        -0.000
AIC:                  20,129.329   Log-Likelihood:      -10,062.665
BIC:                  20,141.568   LL-Null:             -10,062.665
               coef   std err         z     P>|z|   Conf. Int.
--------------------------------------------------------------
Intercept    0.0000     0.026     0.000     1.000             
tt_da        0.0000     0.002     0.000     1.000             


In [18]:
# College step: the chooser filter selects rows with is_college_student == 1.
m_college = LargeMultinomialLogitStep(
    name="school-choice-model:college",
    chooser_filters=['is_college_student == 1'],
)

In [19]:
m_college.model_expression = 'tt_da'

# NOTE(review): fit() is given the shared, pre-built mct. The recorded output
# is identical to the K-12 and unfiltered fits (3,359 observations), so the
# college filter is presumably not applied on this path - confirm.
m_college.fit(mct)

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          3,359
Model:         Multinomial Logit   Df Residuals:              3,357
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-04-21   Pseudo R-squ.:             0.000
Time:                      03:09   Pseudo R-bar-squ.:        -0.000
AIC:                  20,129.329   Log-Likelihood:      -10,062.665
BIC:                  20,141.568   LL-Null:             -10,062.665
               coef   std err         z     P>|z|   Conf. Int.
--------------------------------------------------------------
Intercept    0.0000     0.026     0.000     1.000             
tt_da        0.0000     0.002     0.000     1.000             
