# Binary logit testing

Sam Maurer, Mar 2018 | Python 3.6

In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd

In [2]:
# UrbanSim convention: run from the root level of the project directory.
# NOTE: the path is relative, so re-running this cell moves up one more level.

import os
os.chdir('../')

In [3]:
import modelmanager as mm
from modelmanager.models import BinaryLogitStep
import orca

  from pandas.core import datetools


In [4]:
import legacy_datasources
import legacy_models

In [5]:
# Throwaway step built from placeholder arguments; the next cells only inspect its type name.
b = BinaryLogitStep(['table'], 'expression')

In [6]:
# Type name as reported by the step's own `type` attribute.
b.type

'BinaryLogitStep'

In [7]:
# Parsing the class repr is fragile: the result keeps the trailing "'>" of the repr.
str(type(b)).split('.')[-1]

"BinaryLogitStep'>"

In [8]:
# Cleaner way to get the bare class name.
type(b).__name__

'BinaryLogitStep'

In [9]:
# Scratch check: a None value survives list concatenation as an element.
c = None
['a', 'b'] + [c]

['a', 'b', None]

In [10]:
# Scratch check of the None test; `c` is None here, so nothing is printed.
c = None
if c is not None:
    print('yes')

In [11]:
# Scratch check: isinstance() recognizes a list literal as a list.
isinstance([1, 2, 3], list)

True

In [12]:
# Print each registered orca table's name (upper-cased) followed by its
# column names, with a blank line between tables.
for table_name in orca.list_tables():
    print(table_name.upper())
    table_columns = orca.get_table(table_name).to_frame().columns.tolist()
    print(table_columns)
    print()

HOUSEHOLDS
['building_id', 'tenure', 'persons', 'workers', 'age_of_head', 'income', 'children', 'race_id', 'cars', 'base_luz', 'segmentation_col', 'node_id']

BUILDINGS
['parcel_id', 'development_type_id', 'improvement_value', 'residential_units', 'non_residential_sqft', 'stories', 'year_built', 'residential_sqft', 'note', 'res_price_per_sqft', 'node_id']

PARCELS
['development_type_id', 'land_value', 'parcel_acres', 'county_id', 'mgra_id', 'zoning_id', 'luz_id', 'msa_id', 'proportion_undevelopable', 'tax_exempt', 'distance_to_coast', 'distance_to_freeway', 'distance_to_onramp', 'distance_to_park', 'distance_to_school', 'distance_to_transit', 'x', 'y', 'taz_id', 'node_id', 'acres']

JOBS
['sector_id', 'building_id', 'node_id']



In [13]:
# Pull the `children` column out of the households table and summarize it.
households_table = orca.get_table('households')
ch = households_table.to_frame('children')['children']
ch.describe()

count    58671.000000
mean         0.434099
std          0.876846
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          6.000000
Name: children, dtype: float64

In [14]:
# Collapse the child count into a binary indicator: any positive count becomes 1.
# This modifies `ch` in place.
has_children = ch > 0
ch.loc[has_children] = 1
ch.describe()

count    58671.000000
mean         0.242471
std          0.428581
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: children, dtype: float64

In [15]:
# Write the modified `ch` series back into the `children` column of the households table
orca.get_table('households').update_col_from_series('children', ch)

# Check that it worked: fetch the table from orca again (instead of reusing `ch`)
# so the summary reflects what is actually stored
orca.get_table('households').to_frame('children').children.describe()

count    58671.000000
mean         0.242471
std          0.428581
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: children, dtype: float64

In [16]:
# Real model specification: three source tables and a formula with `children`
# as the outcome.
# NOTE(review): the third argument looks like a row filter (heads under 40) -
# confirm against the BinaryLogitStep signature.
b = BinaryLogitStep(
    ['households', 'buildings', 'parcels'],
    'children ~ age_of_head + income + cars + residential_sqft',
    ['age_of_head < 40'],
)

In [17]:
# Inspect the data the step assembles for estimation (private helper, called here for debugging only).
b._get_data()

Unnamed: 0_level_0,children,cars,building_id,age_of_head,income,residential_sqft,parcel_id
b'household_id',Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
177384,1,0,377345,36,17000,1444,46433
151392,0,1,378667,26,32000,4482,46101
177399,1,0,378667,36,17000,4482,46101
378703,0,2,378667,31,69000,4482,46101
151338,0,1,376292,26,32000,34756,44669
151369,0,1,376292,26,32000,34756,44669
151397,0,1,376292,26,32000,34756,44669
177252,1,0,376292,36,17000,34756,44669
177277,1,0,376292,36,17000,34756,44669
177297,1,0,376292,36,17000,34756,44669


In [18]:
# Estimate the model; a Logit regression summary is shown as the cell output.
b.fit()

Optimization terminated successfully.
         Current function value: 0.501164
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               children   No. Observations:                 4776
Model:                          Logit   Df Residuals:                     4771
Method:                           MLE   Df Model:                            4
Date:                Wed, 14 Mar 2018   Pseudo R-squ.:                 0.08767
Time:                        21:12:01   Log-Likelihood:                -2393.6
converged:                       True   LL-Null:                       -2623.6
                                        LLR p-value:                 2.927e-98
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -5.1120      0.250    -20.478      0.000      -5.601      -4.623
age_of_head

In [19]:
# Check how the fitted coefficients are stored (a plain list).
type(b.fitted_parameters)

list

In [20]:
# Coefficient values; the first entry matches the Intercept row of the summary table.
b.fitted_parameters

[-5.111984721917692,
 0.12937080499546547,
 -7.380928627907898e-06,
 0.386911210479455,
 -5.0005270183615935e-06]

In [21]:
# TODO: test with transformations

# First run of the fitted step, before the out_value_* attributes are overridden.
b.run()

In [22]:
# Override the output values used by the step; after the next run, -1 shows up in the `children` column.
# NOTE(review): 'nothing' presumably means "leave the existing value untouched" for
# false cases - confirm against BinaryLogitStep.
b.out_value_true = -1
b.out_value_false = 'nothing'

In [23]:
# Run again with the overridden output values.
b.run()

In [24]:
# Summarize the stored `children` column again to see the effect of the run (min is now -1).
orca.get_table('households').to_frame('children').children.describe()

count    58671.000000
mean         0.140666
std          0.551677
min         -1.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: children, dtype: float64

In [26]:
# Scratch check: adding a key to an existing dict.
d = {'d': 5}
d['e'] = 10
print(d)

{'d': 5, 'e': 10}


In [27]:
# Register the step with modelmanager so it appears in mm.list_steps().
b.register()

In [28]:
# List all registered steps.
# NOTE(review): the listing includes a BinaryLogitStep entry whose name is None - verify where it comes from.
mm.list_steps()

[{'name': None, 'tags': None, 'type': 'BinaryLogitStep'},
 {'name': 'test-2',
  'tags': ['residential-price-hedonic', 'sam', '201802'],
  'type': 'RegressionStep'},
 {'name': 'test-1',
  'tags': ['residential-price-hedonic', 'sam', '201802'],
  'type': 'RegressionStep'},
 {'name': 'RegressionStep-20180214-210159',
  'tags': ['residential-price-hedonic', 'sam', '201802'],
  'type': 'RegressionStep'},
 {'name': 'BinaryLogitStep-20180314-211201',
  'tags': None,
  'type': 'BinaryLogitStep'}]

In [29]:
# Print the stored summary; print() keeps the fixed-width table layout intact.
print(b.summary_table)

                           Logit Regression Results                           
Dep. Variable:               children   No. Observations:                 4776
Model:                          Logit   Df Residuals:                     4771
Method:                           MLE   Df Model:                            4
Date:                Wed, 14 Mar 2018   Pseudo R-squ.:                 0.08767
Time:                        21:12:01   Log-Likelihood:                -2393.6
converged:                       True   LL-Null:                       -2623.6
                                        LLR p-value:                 2.927e-98
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -5.1120      0.250    -20.478      0.000      -5.601      -4.623
age_of_head          0.1294      0.008     16.962      0.000       0.114       0.144
income           -7.381e-06 