# Large MNL work

Sam Maurer, April 2018 (last updated June 2018) - Python 3.6

This notebook is for development, testing, and demonstration of the template for MNL with large numbers of alternatives.

In [1]:
import os

# Work from the parent directory of the notebook (presumably so the `scripts`
# package imported further down resolves -- confirm against the repo layout).
# The starting directory is remembered the first time this cell runs, so that
# re-running the cell does not climb one more level on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
import numpy as np
import pandas as pd

In [3]:
from urbansim_templates import modelmanager as mm
from urbansim_templates.models import LargeMultinomialLogitStep
import orca

  from pandas.core import datetools


In [4]:
# Load any script-based Orca registrations
from scripts import datasources
from scripts import models

In [5]:
# Summarise every registered Orca table: its name in capitals, then the
# list of its column names, then a blank separator line.
for name in orca.list_tables():
    print(name.upper())
    column_names = orca.get_table(name).to_frame().columns.tolist()
    print(column_names, end='\n\n')

PARCELS
['development_type_id', 'land_value', 'acres', 'county_id', 'zone_id', 'proportion_undevelopable', 'tax_exempt_status', 'apn', 'parcel_id_local', 'geom_id', 'imputation_flag', 'x', 'y', 'shape_area', 'block_id', 'node_id']

BUILDINGS
['parcel_id', 'development_type_id', 'improvement_value', 'residential_units', 'residential_sqft', 'sqft_per_unit', 'non_residential_sqft', 'building_sqft', 'nonres_rent_per_sqft', 'res_price_per_sqft', 'stories', 'year_built', 'redfin_sale_price', 'redfin_sale_year', 'redfin_home_type', 'costar_property_type', 'costar_rent', 'building_type_id']

UNITS
['Unnamed: 0', 'building_id', 'num_units', 'tenure', 'unit_num', 'unit_residential_price', 'unit_residential_rent']

HOUSEHOLDS
['household_id', 'serialno', 'persons', 'building_type', 'cars', 'income', 'race_of_head', 'hispanic_head', 'age_of_head', 'workers', 'state', 'county', 'tract', 'block group', 'children', 'tenure', 'recent_mover', 'block_group_id', 'single_family', 'unit_id']

PERSONS
['Unn

# Large MNL

In [6]:
# Configure a small test model: households choosing a housing unit
m = LargeMultinomialLogitStep()
m.choosers = ['households']
# NOTE(review): two alternatives tables are listed; 'res_price_per_sqft' is a
# 'buildings' column, so presumably the step joins units to buildings -- confirm
m.alternatives = ['units', 'buildings']
m.choice_column = 'unit_id'
# NOTE(review): presumably the number of alternatives sampled per chooser; the
# ~0.10 probabilities printed later in the notebook are consistent with that
m.alt_sample_size = 10
# Keep only rows where household_id % 1000 is below 1 (multiples of 1000 for
# integer ids), which keeps the estimation sample small
m.chooser_filters = ['household_id % 1000 < 1']

# Single explanatory variable. NOTE(review): '- 1' presumably suppresses the
# intercept (patsy-style formula) -- confirm
m.model_expression = 'res_price_per_sqft - 1'

# Name and tags for the step; the name is used later to look the step up
m.name = 'large-mnl-test'
m.tags = ['sam', 'testing']

In [7]:
# Count the chooser rows that pass the estimation filter
# (calls the step's private _get_df helper directly)
len(m._get_df(tables=m.choosers, filters=m.chooser_filters))

2680

In [8]:
%%time
# Estimate the model; the results summary is printed below
m.fit()

                  CHOICEMODELS ESTIMATION RESULTS                  
Dep. Var.:                chosen   No. Observations:               
Model:         Multinomial Logit   Df Residuals:                   
Method:       Maximum Likelihood   Df Model:                       
Date:                              Pseudo R-squ.:                  
Time:                              Pseudo R-bar-squ.:              
AIC:                               Log-Likelihood:       -5,717.265
BIC:                               LL-Null:              -5,717.319
                        coef   std err         z     P>|z|   Conf. Int.
-----------------------------------------------------------------------
res_price_per_sqft    0.0000     0.000     0.432                       
CPU times: user 2.64 s, sys: 1.53 s, total: 4.17 s
Wall time: 4.14 s


In [9]:
# Inspect the fitted parameter values stored on the step
m.fitted_parameters

[1.4474041623995042e-05]

In [10]:
# Register the fitted step (the next cell retrieves it by name through modelmanager)
m.register()

In [11]:
# Look the step up by name and confirm it carries the same fitted parameters
mm.get_step('large-mnl-test').fitted_parameters

[1.4474041623995042e-05]

In [12]:
# For simulation, restrict choosers to households with household_id % 1000 == 17
m.out_chooser_filters = ['household_id % 1000 == 17']

In [13]:
# Inspect the index of the 'units' table before and after naming it 'unit_id'.
# NOTE(review): the captured output already shows name='unit_id' on the first
# print, so the name was set before this execution (possibly by an earlier run
# of this cell sharing the index object) -- verify on a fresh kernel.
df = orca.get_table('units').to_frame()
print(df.index)
df.index.name = 'unit_id'
print(df.index)

RangeIndex(start=0, stop=2784008, step=1, name='unit_id')
RangeIndex(start=0, stop=2784008, step=1, name='unit_id')


In [14]:
%%time
# Run the simulation step; the following cells inspect m.probabilities and m.choices
m.run()

CPU times: user 2.62 s, sys: 1.56 s, total: 4.18 s
Wall time: 4.15 s


In [15]:
# Preview the probabilities table (observation_id, unit_id, probability)
print(m.probabilities.head())

   observation_id  unit_id  probability
0              17  1729708     0.102294
1              17  1554523     0.099769
2              17   462748     0.099666
3              17   245026     0.099663
4              17   342047     0.099565


In [16]:
# Preview the simulated choices (a Series named 'choice', indexed by chooser id)
print(m.choices.head())

17       342047
1017    2236440
2017    2658856
3017    2778429
4017    1881991
Name: choice, dtype: int64


In [25]:
# Check that choices are plausible: attach each chooser's simulated choice to
# its sampled alternatives, flag the row that was chosen, and correlate the
# flag with the predicted probability.
choices = m.choices.to_frame()
df = m.probabilities.merge(choices, left_on='observation_id', right_index=True)
df['chosen'] = (df['unit_id'] == df['choice']).astype('int64')
print(df.head())
print(np.corrcoef(df['probability'], df['chosen']))

   observation_id  unit_id  probability  choice  chosen
0              17  1729708     0.102294  342047       0
1              17  1554523     0.099769  342047       0
2              17   462748     0.099666  342047       0
3              17   245026     0.099663  342047       0
4              17   342047     0.099565  342047       1
[[ 1.00000000e+00 -5.76017762e-04]
 [-5.76017762e-04  1.00000000e+00]]


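That seems wrong (the correlation should be positive), but I don't know why. One likely explanation: the fitted model is almost uninformative (coefficient ≈ 1.4e-05, log-likelihood barely above LL-null), so every sampled alternative gets a probability of about 0.10 and a correlation this close to zero is effectively noise. Better to test with a more informative model.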

In [21]:
# Check that choices are saved correctly: print the unit_id now stored in the
# households table for two of the simulated choosers, for comparison with the
# m.choices values printed earlier.
# The table is materialised once and reused, instead of rebuilding the full
# households frame for every lookup.
households = orca.get_table('households').to_frame()
for household_id in (17, 1017):
    print(households.loc[household_id, 'unit_id'])

342047
2236440
