# Specify tables and columns

In [1]:
import numpy as np
import pandas as pd
import orca
import os; os.chdir('../')
import warnings; warnings.simplefilter('ignore')

# Set data directory

# Prefer the 'data_directory' injectable when one has been registered with
# orca; otherwise fall back to the default location given here.
d = (
    orca.get_injectable('data_directory')
    if 'data_directory' in orca.list_injectables()
    else '/home/data/fall_2018/'
)
    
#from scripts import datasources, models, variables

In [2]:
@orca.table(cache=True)
def persons():
    """Load the CHTS persons file from the data directory `d`.

    Returns
    -------
    pandas.DataFrame
        Contents of 'chts_persons_w_zone_ids.csv', indexed by the
        (SAMPN, PERNO) column pair.
    """
    # os.path.join inserts a separator only when `d` does not already end
    # with one, so a 'data_directory' injectable without a trailing slash
    # still resolves to a file inside that directory.
    persons_path = os.path.join(d, 'chts_persons_w_zone_ids.csv')
    return pd.read_csv(persons_path, index_col=["SAMPN", "PERNO"])

#persons_chts = persons_df1.join(persons_df2, how="left")

# Materialize the registered 'persons' table as a DataFrame. Note that this
# rebinds the module-level name `persons` from the table function to the frame.
persons = orca.get_table('persons').to_frame()

# Row filters for the student sample.
is_enrolled = persons['STUDE'].isin([1, 2])  # full time & part time students
in_modeled_level = persons['SCHOL'].isin([
    3,  # Kindergarten to grade 8
    4,  # Grades 9 to 12
    6,  # 2-year college (community college)
    7,  # 4-year college or university
    8,  # Graduate school / Professional
])
has_school_name = (
    persons['SNAME_lookup'].notna()
    & (persons['SNAME_lookup'] != "DK/RF")
)
students = persons.loc[is_enrolled & in_modeled_level & has_school_name]

# A "school" is a distinct (level, name, zip) combination; its enrollment is
# the number of sampled students reporting that combination.
school_key_cols = ["SCHOL", "SNAME_lookup", "SZIP_lookup"]
schools_raw = (
    students
    .groupby(by=school_key_cols)
    .size()
    .reset_index(name='enrollment')
)

# There are 3505 unique "schools", most of them have 1 student in the CHTS sample
# For now, keep only schools with 3+ students in the CHTS sample
min_sample_enrollment = 3
schools = (
    schools_raw[schools_raw['enrollment'] >= min_sample_enrollment]
    .reset_index(drop=True)   # renumber the surviving schools 0..n-1
    .rename_axis("school_id")
    .reset_index()            # expose that numbering as a 'school_id' column
)

# Attach school_id to each student. The inner join drops students whose
# school was not retained above.
# NOTE(review): merging on columns returns a fresh integer index, so the
# (SAMPN, PERNO) index of `persons` is not carried into `students` -- confirm
# this is intended.
students = students.merge(
    schools.drop(columns=["enrollment"]),
    how="inner",
    on=school_key_cols,
)

orca.add_table('students', students)
orca.add_table('schools', schools)
#len(students)
#len(schools)

<orca.orca.DataFrameWrapper at 0x7f2415fee630>

In [3]:
## DIAGNOSTICS
#schools
#students[["index", "school_id"]]

In [4]:
@orca.column('students', 'is_college_student', cache=True)
def is_college_student(students):
    """Map each student's SCHOL code to a 0/1 college indicator.

    SCHOL codes 3 and 4 map to 0; codes 6, 7 and 8 map to 1. A code with no
    entry in the lookup maps to NaN.
    """
    non_college_codes = (3, 4)
    college_codes = (6, 7, 8)

    flag_by_code = {code: 0 for code in non_college_codes}
    flag_by_code.update({code: 1 for code in college_codes})

    return students.SCHOL.map(flag_by_code)

#@orca.column('students', 'school_id', cache=True)
#def school_id(students, schools):
#    misc.reindex()
#    return pd.merge(students, schools, how="left", on=["SCHOL", "SNAME_lookup", "SZIP_lookup"])["school_id"]

# Configure models

In [5]:
from urbansim_templates import modelmanager
from urbansim_templates.models import SmallMultinomialLogitStep, LargeMultinomialLogitStep, SegmentedLargeMultinomialLogitStep

modelmanager.initialize()

# Baseline school-choice specification: students choose among schools, with
# sampled enrollment as the only explanatory variable and 20 sampled
# alternatives per chooser.
# NOTE(review): the recorded fit output reports an Intercept term with
# coefficient -0.0000 and p = 1.000; consider whether the expression should
# exclude it (e.g. "enrollment - 1") -- confirm against the choicemodels docs.
m0 = LargeMultinomialLogitStep(
    choosers="students",
    alternatives="schools",
    model_expression="enrollment",
    choice_column="school_id",
    alt_sample_size=20,
)

m0.fit()

Registering model step 'auto_ownership'
Registering model step 'TOD_choice'
Registering model step 'primary_mode_choice'
Registering model step 'WLCM'
                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          1,595
Model:         Multinomial Logit   Df Residuals:              1,593
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-03-11   Pseudo R-squ.:             0.040
Time:                      18:39   Pseudo R-bar-squ.:         0.039
AIC:                   9,181.624   Log-Likelihood:       -4,588.812
BIC:                   9,192.373   LL-Null:              -4,778.193
                coef   std err         z     P>|z|   Conf. Int.
---------------------------------------------------------------
Intercept    -0.0000     0.035    -0.000     1.000             
enrollment    0.0746     0.003    22.400     0.000             


In [6]:
from urbansim_templates.models import SegmentedLargeMultinomialLogitStep

# Segmented version of the baseline model: `m0` supplies the default
# settings and the 'is_college_student' column defines the segments.
m = SegmentedLargeMultinomialLogitStep(
    defaults=m0,
    name="school-choice-model",
    segmentation_column="is_college_student",
)

m.fit_all()

Building submodels for 2 categories: [0 1]
################## SEGMENT: is_college_student = 0 ###################
                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:          1,073
Model:         Multinomial Logit   Df Residuals:              1,071
Method:       Maximum Likelihood   Df Model:                      2
Date:                 2019-03-11   Pseudo R-squ.:             0.000
Time:                      18:39   Pseudo R-bar-squ.:        -0.001
AIC:                   6,432.834   Log-Likelihood:       -3,214.417
BIC:                   6,442.791   LL-Null:              -3,214.421
                coef   std err         z     P>|z|   Conf. Int.
---------------------------------------------------------------
Intercept    -0.0000     0.075    -0.000     1.000             
enrollment    0.0007     0.014     0.050     0.960             
################## SEGMENT: is_college_student = 1 ###################
               